def _read_best_acr_hits(diamond_file, record_dict, isProdigalUsed):
    """Parse a tab-separated hit table and keep the lowest e-value hit per protein.

    Columns 0, 1, 2 and 10 of each row are read as the Acr identifier, the
    subject protein identifier, the percent identity (kept as a string) and
    the e-value (converted to float). Blank lines are skipped.

    When *isProdigalUsed* is true, the key is rebuilt from the subject's FASTA
    description: the description is split on '#', fields 1 and 2 are taken as
    start and end, and the key becomes '<nc_id>-Protein(<start>-<end>)', where
    nc_id is the subject identifier up to its last underscore. Otherwise the
    subject identifier itself is the key.

    Raises KeyError if a subject identifier is missing from *record_dict*, and
    IndexError/ValueError on rows with fewer than 11 columns or a non-numeric
    e-value.
    """
    best_hits = dict()
    with open(diamond_file, 'r', 512) as handle:
        for line in handle:
            line = line.rstrip()
            if not line:
                # A blank line has no columns to read; skip it instead of
                # failing with an IndexError.
                continue
            cols = line.split('\t')
            acr, wp, pident, evalue = cols[0], cols[1], cols[2], float(cols[10])
            record = record_dict[wp]

            if isProdigalUsed:
                description_fields = record.description.split('#')
                protein_id = 'Protein({0}-{1})'.format(
                    description_fields[1].strip(),
                    description_fields[2].strip())
                nc_id = wp[0:wp.rfind('_')]
                key = '-'.join([nc_id, protein_id])
                hit = {
                    'record': record,
                    'evalue': evalue,
                    'nc_id': nc_id,
                    'protein_id': protein_id,
                    'pident': pident,
                    'acr': acr
                }
            else:
                key = wp
                hit = {
                    'record': record,
                    'evalue': evalue,
                    'protein_id': wp,
                    'pident': pident,
                    'acr': acr
                }

            # Only a strictly lower e-value replaces an existing hit, so the
            # first of several equally good hits wins.
            previous = best_hits.get(key)
            if previous is None or evalue < previous['evalue']:
                best_hits[key] = hit
    return best_hits


def _acr_protein_columns(GCF, protein):
    """Return the first seven tab-terminated report columns for *protein*."""
    aa_length = int((protein.end - protein.start + 1) / 3)
    return "{}\t{}\t{}\t{}\t{}\t{}\t{}\t".format(
        GCF, protein.nc, protein.start, protein.end, protein.strand,
        protein.wp, str(aa_length))


def acr_homolog(FAA_FILE, GFF_FILE, FNA_FILE, MIN_PROTEINS_IN_LOCUS, AA_THRESHOLD, DISTANCE_THRESHOLD, DIAMOND_ACRHOMOLOG_FILE, OUTPUT_DIR, isProdigalUsed):
    """Report proteins that have an Acr homolog hit, together with their loci.

    Builds an Organism from the GFF/FAA/FNA files, derives candidate loci with
    first_filter and second_and_third_filter, reads the best hit per protein
    from DIAMOND_ACRHOMOLOG_FILE and, if there is at least one hit, writes a
    tab-separated report to OUTPUT_DIR + GCF + '_homology_based.out'.

    A hit that belongs to a candidate locus is reported with every protein of
    that locus (each locus is written once); a hit outside every candidate
    locus is reported on its own with '---' in the locus column.

    Parameters:
        FAA_FILE, GFF_FILE, FNA_FILE: input files passed to Organism; FAA_FILE
            is also parsed as FASTA to look up hit records.
        MIN_PROTEINS_IN_LOCUS, AA_THRESHOLD, DISTANCE_THRESHOLD: passed through
            to the locus filters.
        DIAMOND_ACRHOMOLOG_FILE: tab-separated hit table (presumably DIAMOND
            tabular output; verify against the caller).
        OUTPUT_DIR: prefix concatenated directly with the output file name, so
            it needs its own trailing path separator.
        isProdigalUsed: selects how hit keys are derived (see
            _read_best_acr_hits) and is forwarded to Organism.

    Returns:
        (acr_hit_record, path): the dict of best hits keyed by protein id and
        the path of the written report, or None in place of the path when
        there were no hits and nothing was written.
    """
    # Organism object used to parse the gff file.
    ORGANISM_SUBJECT = Organism(
        [GFF_FILE, FAA_FILE], FNA_FILE, isProdigalUsed,
        bufferSize=30720, twoFileParse=True)
    GCF = ORGANISM_SUBJECT.GCF  # GCF ID that corresponds to the subject
    record_dict = SeqIO.to_dict(SeqIO.parse(FAA_FILE, 'fasta'))
    HOMOLOGY_FINAL_RESULT_FILE = OUTPUT_DIR + GCF + '_homology_based.out'

    candidate_loci = second_and_third_filter(
        first_filter(ORGANISM_SUBJECT, MIN_PROTEINS_IN_LOCUS),
        GCF, AA_THRESHOLD, DISTANCE_THRESHOLD, MIN_PROTEINS_IN_LOCUS)

    # Lookup tables: protein id -> protein, and protein id -> containing locus.
    WP_Maps_Protein = {
        protein.id: protein
        for proteinList in ORGANISM_SUBJECT.get_ncid_contents().values()
        for protein in proteinList
    }
    Protein_Maps_Loci = {
        protein.id: loci
        for loci in candidate_loci
        for protein in loci
    }

    acr_hit_record = _read_best_acr_hits(
        DIAMOND_ACRHOMOLOG_FILE, record_dict, isProdigalUsed)
    if not acr_hit_record:
        return acr_hit_record, None

    # Collect the report in pieces and join once at the end.
    report = ['#GCF\tNC ID\tStart\tEnd\tStrand\tProtein ID\taa Length\tGenome_Loci|start|end\tAcr_Hit|pident\tSequence\n']
    used_wp = set()  # ids of proteins whose locus has already been written

    for wp, hit in acr_hit_record.items():
        # NOTE(review): hits outside every candidate locus take this branch
        # (locus column '---'); hits whose locus was already written are
        # skipped. Confirm this pairing matches the intended report layout.
        if wp not in Protein_Maps_Loci:
            protein = WP_Maps_Protein[wp]
            report.append(_acr_protein_columns(GCF, protein))
            report.append("---\t{}|{}\t".format(hit['acr'], hit['pident']))
            report.append("{}\n\n".format(protein.sequence))
            continue

        if wp in used_wp:
            continue

        locus = Protein_Maps_Loci[wp]
        # The locus column is identical for every protein of the locus.
        locus_column = "{}|{}|{}\t".format(
            '-'.join(str(member.wp) for member in locus),
            min(member.start for member in locus),
            max(member.end for member in locus))

        for protein in locus:
            report.append(_acr_protein_columns(GCF, protein))
            report.append(locus_column)
            if protein.id in acr_hit_record:
                report.append("{}|{}\t".format(
                    acr_hit_record[protein.id]['acr'],
                    acr_hit_record[protein.id]['pident']))
            else:
                report.append("---\t")
            report.append("{}\n".format(protein.sequence))
            used_wp.add(protein.id)
        report.append("\n")  # blank line separates loci

    with open(HOMOLOGY_FINAL_RESULT_FILE, 'w') as out_handle:
        out_handle.write(''.join(report))

    return acr_hit_record, HOMOLOGY_FINAL_RESULT_FILE