Exemple #1
0
def _read_best_acr_hits(diamond_file, record_dict, isProdigalUsed):
    """Parse the tab-separated hit file and keep the lowest-evalue hit per protein.

    Columns read from each line: 0 (Acr name), 1 (subject protein id),
    2 (percent identity, kept as a string) and 10 (e-value, parsed as float).
    Blank lines are skipped; a line with fewer than 11 columns still raises
    ``IndexError`` so malformed input is not silently ignored.

    Returns a dict keyed by protein id. When ``isProdigalUsed`` is truthy the
    key is ``"<nc_id>-Protein(<start>-<end>)"`` rebuilt from the FASTA record
    description (fields separated by ``#``); otherwise it is the subject id
    itself.
    """
    best_hits = {}
    with open(diamond_file, 'r', 512) as handle:
        for line in handle:
            stripped = line.rstrip()
            if not stripped:
                # A trailing/empty line carries no hit; skipping it avoids an
                # IndexError on cols[1].
                continue
            cols = stripped.split('\t')
            acr, wp, pident = cols[0], cols[1], cols[2]
            evalue = float(cols[10])
            record = record_dict[wp]

            if isProdigalUsed:
                # Description fields 1 and 2 (split on '#') are used as the
                # protein start and end coordinates.
                info = record.description.split('#')
                protein = 'Protein({0}-{1})'.format(info[1].strip(),
                                                    info[2].strip())
                # Everything before the last '_' of the subject id.
                nc_id = wp[0:wp.rfind('_')]
                key = '-'.join([nc_id, protein])
                entry = {
                    'record': record,
                    'evalue': evalue,
                    'nc_id': nc_id,
                    'protein_id': protein,
                    'pident': pident,
                    'acr': acr,
                }
            else:
                key = wp
                entry = {
                    'record': record,
                    'evalue': evalue,
                    'protein_id': wp,
                    'pident': pident,
                    'acr': acr,
                }

            # Only a strictly smaller e-value replaces an existing hit.
            if key not in best_hits or evalue < best_hits[key]['evalue']:
                best_hits[key] = entry
    return best_hits


def _format_protein_columns(GCF, protein):
    """Return the seven leading, tab-terminated report columns for *protein*."""
    aa_length = int(((protein.end - protein.start + 1) / 3))
    return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
        GCF, protein.nc, protein.start, protein.end, protein.strand,
        protein.wp, str(aa_length))


def _format_locus_rows(GCF, locus, acr_hit_record, used_wp):
    """Return report chunks for every protein of *locus*.

    Each protein id of the locus is added to *used_wp* so the caller does not
    emit the same locus again for another hit inside it.
    """
    wp_names = []
    starts = []
    ends = []
    for member in locus:
        wp_names.append(str(member.wp))
        starts.append(member.start)
        ends.append(member.end)
    # Same for every row of the locus, so build it once.
    locus_column = "{}|{}|{}\t".format('-'.join(wp_names), min(starts),
                                       max(ends))

    chunks = []
    for member in locus:
        chunks.append(_format_protein_columns(GCF, member))
        chunks.append(locus_column)
        if member.id in acr_hit_record:
            hit = acr_hit_record[member.id]
            chunks.append("{}|{}\t".format(hit['acr'], hit['pident']))
        else:
            chunks.append("---\t")
        chunks.append("{}\n".format(member.sequence))
        used_wp.add(member.id)
    # Blank line separates loci in the report.
    chunks.append("\n")
    return chunks


def acr_homolog(FAA_FILE, GFF_FILE, FNA_FILE, MIN_PROTEINS_IN_LOCUS,
                AA_THRESHOLD, DISTANCE_THRESHOLD, DIAMOND_ACRHOMOLOG_FILE,
                OUTPUT_DIR, isProdigalUsed):
    """Write a homology-based Acr report for the best hit of each protein.

    Args:
        FAA_FILE: protein FASTA file; parsed with ``SeqIO`` and also handed to
            ``Organism`` together with ``GFF_FILE``.
        GFF_FILE: annotation file handed to ``Organism``.
        FNA_FILE: passed to ``Organism`` as its second argument.
        MIN_PROTEINS_IN_LOCUS: forwarded to ``first_filter`` and
            ``second_and_third_filter``.
        AA_THRESHOLD: forwarded to ``second_and_third_filter``.
        DISTANCE_THRESHOLD: forwarded to ``second_and_third_filter``.
        DIAMOND_ACRHOMOLOG_FILE: tab-separated hit table (presumably
            DIAMOND/BLAST tabular output -- confirm with the caller); columns
            0, 1, 2 and 10 are read.
        OUTPUT_DIR: string prefix for the result file. It is concatenated
            directly with the GCF id, so it must already end with a path
            separator if a directory is intended.
        isProdigalUsed: selects how hit keys are derived (see
            ``_read_best_acr_hits``) and is forwarded to ``Organism``.

    Returns:
        ``(acr_hit_record, result_path)`` where ``acr_hit_record`` maps a
        protein id to its best hit. ``result_path`` is ``None`` and no file is
        written when there are no hits.
    """
    ORGANISM_SUBJECT = Organism(
        [GFF_FILE, FAA_FILE],
        FNA_FILE,
        isProdigalUsed,
        bufferSize=30720,
        twoFileParse=True)  # creates Organism object used to parse gff file
    GCF = ORGANISM_SUBJECT.GCF  # obtains GCF ID that corresponds to subject

    record_dict = SeqIO.to_dict(SeqIO.parse(FAA_FILE, 'fasta'))
    HOMOLOGY_FINAL_RESULT_FILE = OUTPUT_DIR + GCF + '_homology_based.out'

    candidate_loci = second_and_third_filter(
        first_filter(ORGANISM_SUBJECT, MIN_PROTEINS_IN_LOCUS), GCF,
        AA_THRESHOLD, DISTANCE_THRESHOLD, MIN_PROTEINS_IN_LOCUS)
    # protein id -> protein object, over every protein of the organism.
    WP_Maps_Protein = {
        protein.id: protein
        for proteinList in ORGANISM_SUBJECT.get_ncid_contents().values()
        for protein in proteinList
    }
    # protein id -> the candidate locus that contains it.
    Protein_Maps_Loci = {
        protein.id: loci
        for loci in candidate_loci for protein in loci
    }

    acr_hit_record = _read_best_acr_hits(DIAMOND_ACRHOMOLOG_FILE, record_dict,
                                         isProdigalUsed)
    if not acr_hit_record:
        return acr_hit_record, None

    chunks = [
        '#GCF\tNC ID\tStart\tEnd\tStrand\tProtein ID\taa Length\tGenome_Loci|start|end\tAcr_Hit|pident\tSequence\n'
    ]
    used_wp = set()
    for wp in acr_hit_record:
        if wp in Protein_Maps_Loci:
            # A locus is reported once, even if several of its proteins hit.
            if wp not in used_wp:
                chunks.extend(
                    _format_locus_rows(GCF, Protein_Maps_Loci[wp],
                                       acr_hit_record, used_wp))
        else:
            # Hit outside every candidate locus: single row, no locus column.
            protein = WP_Maps_Protein[wp]
            hit = acr_hit_record[wp]
            chunks.append(_format_protein_columns(GCF, protein))
            chunks.append("---\t{}|{}\t".format(hit['acr'], hit['pident']))
            chunks.append("{}\n\n".format(protein.sequence))

    with open(HOMOLOGY_FINAL_RESULT_FILE, 'w') as out_handle:
        out_handle.write(''.join(chunks))
    return acr_hit_record, HOMOLOGY_FINAL_RESULT_FILE