print('Multiple closest') problem_dic[i] = multiple_dic[i] GI_to_pick = [ multiple_dic[i][m] for m, x in enumerate(identities) if x == max(identities) ] print(GI_to_pick) multfinalseqs.append(GI_to_pick) for i in two_dic: print(i) #align the two seqs list_of_GIs = two_dic[i] print(list_of_GIs) alignment = alignment_reg(list_of_GIs) iden = identity_calc(alignment) if iden < 95: print("Low Aligned Identity: " + str(iden)) alignment = alignment_rev_comp(list_of_GIs) iden = identity_calc(alignment) if iden < 95: #get taxonomy for query(main species) print("Low Reverse Complement Aligned Identity: " + str(iden)) alignment = alignment_comp(list_of_GIs) iden = identity_calc(alignment) if iden < 95: print("Low Complement Aligned Identity: " + str(iden)) #add tiling thing gene_name = '_'.join(i.split('_')[1:]) idens, start_stop = tiling(list_of_GIs, gene_name) current_start = -1
def test_resolved_seqs(infile, blastdb, taxdb):
    """Self-BLAST the chosen sequences online and sort them into resolved/unresolved.

    Steps, as performed by the code below:

    1. Submit the FASTA file *infile* to NCBI ``blastn`` and save the XML
       result to ``infile + ".xml"``.
    2. For every query, check whether the best hit(s) (then the five best
       identities) belong to the same ``tc_id`` as the query. Queries that
       pass are added to the resolved set; the others are recorded per
       ``tc_id``.
    3. For each recorded ``tc_id``, align every sequence of the same
       gene/``tc_id`` against the first one (regular, reverse complement,
       complement; threshold 90). If any pair fails, fall back to ``tiling``
       and, if that fails too, queue the group for ``blast_all``.
    4. Use the ``blast_all`` result (lowest value wins) and ``resolve_seqs``
       to pick a sequence for each queued group.

    Side effects: appends to ``final_GIs.txt``, ``multiple_gene_choices.txt``,
    ``these_seqs_overlap.txt`` and ``seqs_to_be_blasted.txt`` in the current
    directory.

    Args:
        infile: path of the FASTA file to BLAST.
        blastdb: path of the SQLite database holding the ``blast`` table.
        taxdb: path of the SQLite database attached as ``tax``.

    NOTE(review): SQL is still assembled by string concatenation, as in the
    original; the values come from the database and the BLAST output.
    """
    import sqlite3
    import time
    from Bio.Blast import NCBIWWW, NCBIXML
    from blastlib.clean_seq_funcs import (resolve_seqs, alignment_comp,
                                          alignment_reg, alignment_rev_comp,
                                          blast, blast_all, identity_calc,
                                          tiling)

    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    c.execute("ATTACH '" + taxdb + "' as 'tax'")

    def species_of(gis):
        """Return the set of tc_ids stored in the blast table for *gis*."""
        found = set()
        for gi in gis:
            for row in c.execute("SELECT tc_id FROM blast WHERE GI='" + gi +
                                 "'"):
                found.add(str(row[0]))
        return found

    def pair_aligns(GI_pair):
        """Return True if the pair reaches identity 90 in any orientation."""
        for align_func in (alignment_reg, alignment_rev_comp, alignment_comp):
            if identity_calc(align_func(GI_pair)) >= 90:
                return True
        return False

    error_dic = {}
    blast_dic_nums = {}
    blast_dic_tcids = {}
    seqs_to_blast = []
    finalseqs = set()
    multseqs = []

    # do self blast and print to file
    with open(infile) as fasta_file:
        print("Blasting " + infile)
        records = fasta_file.read()
    numqueries = records.count('>')

    # Retry until the request succeeds (as before), but let KeyboardInterrupt
    # through, report the failure and pause between attempts.
    result_handle = None
    while result_handle is None:
        try:
            result_handle = NCBIWWW.qblast(
                "blastn",
                "nt",
                records,
                entrez_query=
                '((Papilionoidea[Organism]) OR Hedylidae[Organism]) OR Hesperiidae[Organism]',
                word_size=28,
                hitlist_size=100)
        except Exception as err:
            print("qblast failed (" + str(err) + "), trying again")
            time.sleep(5)

    # get rid of this extra step of printing xml using NCBIXML
    with open(infile + ".xml", "w") as save_file:
        save_file.write(result_handle.read())
    result_handle.close()

    # open self blast file for parsing
    # make dictionary of query species: query GI of those that don't have the
    # top hit as the same species
    with open(infile + ".xml") as p:
        print("Parsing blast output")
        count = 0
        for rec in NCBIXML.parse(p):
            count += 1
            print(
                str(round(float(count) / float(numqueries) * 100, 2)) + "%")
            # figure out a new way to do this
            queryAcc = str(rec.query.split()[0])
            queryGI = None
            for row in c.execute("SELECT GI FROM blast WHERE accession='" +
                                 queryAcc + "'"):
                queryGI = str(row[0])
            if queryGI is None:
                # Previously the GI of the preceding record was silently
                # reused (or a NameError was raised on the first record).
                print("No GI found for accession " + queryAcc + ", skipping")
                continue

            # hit GI -> identity of its last HSP, self hit excluded
            hitdic = {}
            for alignment in rec.alignments:
                for hsp in alignment.hsps:
                    hitGI = str(alignment.title.split("|")[1])
                    if hitGI != queryGI:
                        hitdic[hitGI] = (float(hsp.identities) /
                                         float(hsp.align_length))

            for row in c.execute("SELECT tc_id FROM blast WHERE GI='" +
                                 queryGI + "'"):
                querySp = str(row[0])

            # first look only at the hit(s) with the best identity; a query
            # without any non-self hit simply has no hit species
            if hitdic:
                maxiden = max(hitdic.values())
                hitGIs = [GI for GI, iden in hitdic.items() if iden == maxiden]
            else:
                hitGIs = []
            hitSp = species_of(hitGIs)

            # only look at top 5 if it doesnt hit the top hit - faster
            if querySp not in hitSp:
                if len(hitdic) > 5:
                    top_idens = sorted(hitdic.values(), reverse=True)[0:5]
                else:
                    top_idens = list(hitdic.values())
                hitGIs = [
                    GI for GI, iden in hitdic.items() if iden in top_idens
                ]
                hitSp = species_of(hitGIs)

            if querySp in hitSp:
                finalseqs.add(queryGI)
            else:
                # NOTE(review): keyed by tc_id only, so a second failing
                # query of the same tc_id replaces the first - confirm that
                # this is intended.
                error_dic[querySp] = queryGI

    # go through error dictionary and align the 'same' gene/species to see if
    # they're weird looking
    count = 0
    newseqs = set()
    print("Checking nonmatching sequences")
    for tc_id in error_dic:
        count += 1
        print(str(round(float(count) / float(len(error_dic)) * 100, 2)) + "%")
        list_of_GIs = []
        for row in c.execute("SELECT Gene_name FROM blast WHERE GI ='" +
                             error_dic[tc_id] + "'"):
            gene_name = str(row[0])
        for row in c.execute("SELECT GI FROM blast WHERE tc_id = '" + tc_id +
                             "' and Gene_name = '" + gene_name + "'"):
            list_of_GIs.append(str(row[0]))

        # align each seq to the first one - first one acts as the 'default'
        # direction; every pair is evaluated, as in the original
        first_GI = list_of_GIs[0]
        pair_check = [pair_aligns([first_GI, GI]) for GI in list_of_GIs[1:]]

        if all(pair_check):
            finalseqs.add(error_dic[tc_id])
            newseqs.add(error_dic[tc_id])
            continue

        idens, start_stop = tiling(list_of_GIs, gene_name)
        if all(i > 70 for i in idens):
            # merge overlapping (start, stop) ranges
            current_start = -1
            current_stop = -1
            result = []
            for start, stop in sorted(start_stop):
                if start > current_stop:
                    result.append((start, stop))
                    current_start, current_stop = start, stop
                else:
                    current_stop = max(current_stop, stop)
                    result[-1] = (current_start, current_stop)
            if len(result) == len(start_stop):
                # Seqs align to different regions of probe, choosing all
                for GI in list_of_GIs:
                    finalseqs.add(GI)
                    newseqs.add(GI)
            else:
                # Seqs overlap: printing to file for hand checking
                with open('these_seqs_overlap.txt', 'a') as a:
                    a.write(str(list_of_GIs) + '\n')
        else:
            # Somethings up with a sequence - printing to check - will blast
            blast_dic_nums[list_of_GIs[0]] = len(list_of_GIs)
            blast_dic_tcids[list_of_GIs[0]] = tc_id
            seqs_to_blast.append(list_of_GIs)
            with open('seqs_to_be_blasted.txt', 'a') as a:
                a.write(str(list_of_GIs) + '\n')

    if len(seqs_to_blast) > 0:
        print(
            "Blasting error seqeuences (seqs don't align together and one doesn't align to whole)"
        )
        seqs_to_blast_flat = [
            item for sublist in seqs_to_blast for item in sublist
        ]
        try:
            hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums,
                                 blast_dic_tcids, c)
        except Exception:
            # one more attempt after a short pause
            time.sleep(5)
            hits_all = blast_all(seqs_to_blast_flat, blast_dic_nums,
                                 blast_dic_tcids, c)
        print("Parsing taxonomy for error sequences")
        for x, list_of_GIs in enumerate(seqs_to_blast):
            hits = hits_all[x]
            tc_id = blast_dic_tcids[list_of_GIs[0]]
            lowest = min(hits)
            if hits.count(lowest) == 1:
                # Exactly one lowest taxonomy hit: pick it, whether or not
                # it is the sequence that was chosen before (both original
                # branches added this same GI).
                finalseqs.add(list_of_GIs[hits.index(lowest)])
            else:
                # there are multiple lowest taxonomy hits
                mult_GIs = [
                    list_of_GIs[pos] for pos, hit in enumerate(hits)
                    if hit == lowest
                ]
                GI_to_pick = resolve_seqs(mult_GIs)
                if len(GI_to_pick) == 1:
                    finalseqs.add(GI_to_pick[0])
                else:
                    # this only happens if the originally picked one is
                    # crappy and the rest are the same
                    multseqs.append(error_dic[tc_id])  # go to cluster analysis

    print('length of resolved=' + str(len(finalseqs)))
    print('length of not resolved = ' + str(len(multseqs)))
    with open("final_GIs.txt", "a") as o:
        for m in finalseqs:
            o.write(str(m) + "\n")
    # to cluster
    with open("multiple_gene_choices.txt", "a") as o:
        for m in multseqs:
            o.write(str(m) + "\n")
    conn.close()
def test_resolved_seqs(infile, blastdb, taxdb):
    """Check chosen sequences against a reciprocal BLAST result and record decisions.

    Steps, as performed by the code below:

    1. Parse the BLAST XML file *infile*. For each query, compare its
       ``tc_id`` with the ``tc_id`` of its best hit(s), then with the hits
       among the next best identities. Matching queries get the decision
       ``'Longest or most info, good top hit/chosen'``; the others are
       collected per ``tc_id``.
    2. For each collected GI, align it against the not-chosen sequences of
       the same tile/gene/``tc_id`` (regular, reverse complement, complement;
       threshold 90) and update the ``Decision`` column accordingly.
    3. Groups that do not align are passed to ``blast_all``; the GI(s) with
       the highest returned value are marked as chosen or sent to cluster
       analysis.

    All results are written to the ``blast`` table of *blastdb* and committed
    at the end.

    Args:
        infile: path of the BLAST XML file; the text before ``".xml"`` is
            taken as the path of the FASTA file that was blasted.
        blastdb: path of the SQLite database holding the ``blast`` table.
        taxdb: path of the SQLite database attached as ``tax``.

    NOTE(review): SQL is still assembled by string concatenation, as in the
    original; the values come from the database and the BLAST output.
    """
    # Call kept from the original; its result is not used in this function.
    ent_query = get_blast_query(taxdb)
    import sqlite3
    from Bio.Blast import NCBIWWW, NCBIXML
    from blastlib.clean_seq_funcs import (resolve_seqs, alignment_comp,
                                          alignment_reg, alignment_rev_comp,
                                          blast, blast_all, identity_calc,
                                          tiling)
    from cleanlib.databasing import (get_seqs_from_sqldb, export_fasta,
                                     create_blast_db, local_blast)

    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    c.execute("ATTACH '" + taxdb + "' as 'tax'")

    def species_of(gis):
        """Return the set of tc_ids stored in the blast table for *gis*."""
        found = set()
        for gi in gis:
            for row in c.execute("SELECT tc_id FROM blast WHERE GI='" + gi +
                                 "'"):
                found.add(str(row[0]))
        return found

    def pair_aligns(GI_pair, gene):
        """Return True if the pair reaches identity 90 in any orientation."""
        for align_func in (alignment_reg, alignment_rev_comp, alignment_comp):
            if identity_calc(align_func(GI_pair, blastdb, False, gene,
                                        c)) >= 90:
                return True
        return False

    # tc_id -> list of chosen GIs whose hits do not include their own tc_id
    error_dic = {}
    seqs_to_blast = []

    # Count the FASTA headers in Python instead of shelling out to grep with
    # a command string built from the file name.
    with open(infile.split(".xml")[0]) as fasta_file:
        numqueries = sum(1 for line in fasta_file if '>' in line)

    # open self blast file for parsing
    with open(infile) as p:
        print("Parsing reciprocal blast output from " + infile)
        count = 0
        for rec in NCBIXML.parse(p):
            count += 1
            if numqueries:
                print(
                    str(round(float(count) / float(numqueries) * 100, 2)) +
                    "%")
            queryAcc = str(rec.query.split()[0])
            queryGI = None
            for row in c.execute("SELECT GI FROM blast WHERE accession='" +
                                 queryAcc + "';"):
                queryGI = str(row[0])
            if queryGI is None:
                # Previously the GI of the preceding record was silently
                # reused, so the wrong row could be updated.
                print("No GI found for accession " + queryAcc + ", skipping")
                continue

            # hitdic is GIs and idens of 20 in .xml for each rec in blast_rec
            # NOTE(review): the source indentation was lost; the limit is
            # applied per alignment here so that it actually ends the scan -
            # confirm against the intended behaviour.
            hitdic = {}
            count1 = 0
            for alignment in rec.alignments:
                for hsp in alignment.hsps:
                    identity = float(hsp.identities) / float(hsp.align_length)
                    if alignment.hit_def == queryAcc:
                        continue
                    hitGI = None
                    for row in c.execute(
                            "SELECT GI FROM blast WHERE accession='" +
                            alignment.hit_def + "'"):
                        hitGI = str(row[0])
                    if hitGI is None:
                        # hit is not in the table; do not credit its
                        # identity to the previously seen GI
                        continue
                    count1 += 1
                    hitdic[hitGI] = identity
                if count1 >= 20:
                    break

            if len(hitdic) == 0:
                print('No matching hits from blast')
                c.execute(
                    "UPDATE blast SET Decision='Chosen, but then did not blast to anything/Not chosen' WHERE GI = '"
                    + queryGI + "';")
                continue

            # first looks at just the top iden value
            maxiden = max(hitdic.values())
            hitGIs = [GI for GI, iden in hitdic.items() if iden == maxiden]
            for row in c.execute("SELECT tc_id FROM blast WHERE GI='" +
                                 queryGI + "'"):
                querySp = str(row[0])
            hitSp = species_of(hitGIs)

            # if it doesn't work, look at top 5 idens - ignore top hit b/c
            # already looked at
            if querySp not in hitSp:
                if len(hitdic) > 5:
                    next_idens = sorted(set(hitdic.values()),
                                        reverse=True)[1:5]
                else:
                    next_idens = list(hitdic.values())
                hitGIs = [
                    GI for GI, iden in hitdic.items() if iden in next_idens
                ]
                # get hit species (actually tc_ids)
                hitSp = species_of(hitGIs)

            if querySp in hitSp:
                c.execute(
                    "UPDATE blast SET decision='Longest or most info, good top hit/chosen' WHERE GI='"
                    + queryGI + "';")
            else:
                # The decision is left untouched here because it still holds
                # the tiling info read below. Store the GI (the original
                # appended the tc_id itself for every GI after the first).
                error_dic.setdefault(querySp, []).append(queryGI)

    # go through error dictionary and align the 'same' gene/species to see if
    # they're weird looking
    print(
        "Aligning sequences where chosen does not reciprocal blast to another"
    )
    count = 0
    total_tiles = sum(len(tiles) for tiles in error_dic.values())
    for tc_id in error_dic:
        for tile in error_dic[tc_id]:
            count += 1
            print(str(round(float(count) / float(total_tiles) * 100, 2)) + "%")
            # first is always querygi
            list_of_GIs = [tile]
            for row in c.execute(
                    "SELECT Gene_name, Decision FROM blast WHERE GI ='" +
                    tile + "'"):
                gene = str(row[0])
                # the 8th whitespace-separated token of Decision is used as
                # the tile number
                tilenum = str(row[1].split()[7])
            for row in c.execute(
                    "SELECT GI FROM blast WHERE Decision = 'Short or less info, tile "
                    + tilenum + "/Not chosen' and tc_id = '" + tc_id +
                    "' and Gene_name = '" + gene + "'"):
                list_of_GIs.append(str(row[0]))

            # align each seq to the first one - first one acts as the
            # 'default' direction; every pair is evaluated
            first_GI = list_of_GIs[0]
            pair_check = [
                pair_aligns([first_GI, GI], gene) for GI in list_of_GIs[1:]
            ]

            if all(pair_check):
                c.execute(
                    "UPDATE blast SET decision='Sequence did not have same top blast species, but all aligned correctly, tile "
                    + tilenum + "/Chosen' WHERE GI='" + tile + "';")
            else:
                # either all pairs fail (first one is wrong) or one fails
                seqs_to_blast.append(list_of_GIs)
                list_of_GIs_str = str(list_of_GIs).replace("[", "(").replace(
                    "]", ")")
                c.execute(
                    "UPDATE blast SET Decision='Chosen, but does not reciprocal blast and all do not align' WHERE GI IN "
                    + list_of_GIs_str + ";")

    # sequences don't align together and chosen hit doesn't blast to same
    # species (one area of tiling)
    if len(seqs_to_blast) > 0:
        for row in c.execute("SELECT Gene_name FROM blast WHERE GI ='" +
                             seqs_to_blast[0][0] + "'"):
            gene = str(row[0])
        seqs_to_blast_flat = [
            item for sublist in seqs_to_blast for item in sublist
        ]
        # queryGI: nearest tax rank of hit
        nearest_hit_taxo_dic = blast_all(seqs_to_blast_flat, c, gene, taxdb,
                                         blastdb)
        print("Parsing taxonomy for error sequences")
        for group in seqs_to_blast:
            hit_taxonomies = [nearest_hit_taxo_dic[GI] for GI in group]
            # want max tcid - that is closest to query
            maxtcid = max(hit_taxonomies)
            # GIs of the group whose value equals the maximum (the original
            # comprehension produced enumerate tuples and filtered on an
            # unrelated variable)
            GI_to_pick = [
                GI for GI, taxo in zip(group, hit_taxonomies)
                if taxo == maxtcid
            ]
            if len(GI_to_pick) == 1:
                c.execute(
                    "UPDATE blast SET decision='Blast hits have closest taxonomy/Chosen' WHERE GI='"
                    + GI_to_pick[0] + "';")
            else:
                GI_to_pick_str = str(GI_to_pick).replace("[", "(").replace(
                    "]", ")")
                c.execute(
                    "UPDATE blast SET Decision='To cluster analysis/Chosen' WHERE GI IN "
                    + GI_to_pick_str + ";")
    conn.commit()
    conn.close()
def cluster(blastdb, taxdb):
    """Resolve the gene choices listed in ``multiple_gene_choices.txt``.

    Each input line is ``<key>\\t<list of GIs>``. Keys with more than two
    GIs are resolved by aligning every sequence to the MUSCLE consensus of
    the group and keeping the most similar one(s). Keys with exactly two GIs
    are aligned to each other (regular, reverse complement, complement;
    threshold 95); pairs that do not align are checked with ``tiling`` and,
    failing that, with ``blast``.

    Side effects: fetches sequences through ``Entrez.efetch``, appends to
    ``final_GIs.txt``, ``choose_mult.txt`` and
    ``these_seqs_overlap_cluster.txt`` in the current directory.

    Args:
        blastdb: path of the SQLite database handed to ``blast`` via a cursor.
        taxdb: path of the SQLite database attached as ``tax``.
    """
    Entrez.email = "*****@*****.**"
    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    c.execute("ATTACH '" + taxdb + "' as 'tax'")
    muscle_cline = MuscleCommandline(clwstrict=True)
    input_dic = {}
    multiple_dic = {}
    two_dic = {}
    problem_dic = {}
    finalseqs = set()
    multfinalseqs = []
    unresolved = []

    with open("multiple_gene_choices.txt") as o:
        for line in o:
            fields = line.strip().split("\t")
            if len(fields) < 2:
                # blank line or line without a GI list; these used to raise
                # an IndexError
                continue
            input_dic[line.split("\t")[0]] = fields[1].replace(
                "[", "").replace("]", "").replace("'", "")

    for i in input_dic:
        GIs_list = input_dic[i].split(", ")
        if len(GIs_list) > 2:
            multiple_dic[i] = GIs_list
        elif len(GIs_list) == 2:
            two_dic[i] = GIs_list

    for i in multiple_dic:
        identities = []
        # build a consensus of all candidate sequences
        joined_GIs = ",".join(multiple_dic[i])
        handle = Entrez.efetch(db="nucleotide",
                               rettype="fasta",
                               retmode="text",
                               id=joined_GIs)
        seqs = SeqIO.parse(handle, "fasta")
        handle_string = StringIO()
        SeqIO.write(seqs, handle_string, "fasta")
        data = handle_string.getvalue()
        stdout, stderr = muscle_cline(stdin=data)
        align = AlignIO.read(StringIO(stdout), "clustal")
        summary_align = AlignInfo.SummaryInfo(align)
        consensus = summary_align.gap_consensus(threshold=.5, ambiguous='N')
        consensus_record = SeqRecord(consensus, id="Consensus_all")

        # identity of every candidate against the consensus
        for m in multiple_dic[i]:
            handle = None
            while handle is None:
                try:
                    handle = Entrez.efetch(db="nucleotide",
                                           rettype="fasta",
                                           retmode="text",
                                           id=m)
                except Exception:
                    print('Error, trying again')
                    time.sleep(10)
            seqs = SeqIO.read(handle, "fasta")
            handle_string = StringIO()
            SeqIO.write(seqs, handle_string, "fasta")
            SeqIO.write(consensus_record, handle_string, "fasta")
            data = handle_string.getvalue()
            stdout, stderr = muscle_cline(stdin=data)
            align = AlignIO.read(StringIO(stdout), "clustal")
            count = 0
            gaps = 0
            # NOTE(review): indentation was lost in the source; gapped
            # columns are counted as gaps and excluded from the denominator.
            for col in range(0, len(align[0])):
                column = align[:, col]
                if "-" not in column:
                    if column[1:] == column[:-1]:
                        count = count + 1
                else:
                    gaps = gaps + 1
            ungapped = len(align[0]) - gaps
            # an alignment with no ungapped column has no identity at all
            iden = 100 * (count / float(ungapped)) if ungapped else 0.0
            identities.append(iden)

        best = max(identities)
        if identities.count(best) == 1:
            finalseqs.add(multiple_dic[i][identities.index(best)])
        else:
            problem_dic[i] = multiple_dic[i]
            GI_to_pick = [
                multiple_dic[i][m] for m, x in enumerate(identities)
                if x == best
            ]
            multfinalseqs.append(GI_to_pick)

    for i in two_dic:
        # align the two seqs, trying the other orientations only when the
        # previous one stays below 95
        list_of_GIs = two_dic[i]
        if any(
                identity_calc(align_func(list_of_GIs)) >= 95
                for align_func in (alignment_reg, alignment_rev_comp,
                                   alignment_comp)):
            multfinalseqs.append(list_of_GIs)
            continue

        # add tiling thing
        gene_name = '_'.join(i.split('_')[1:])
        idens, start_stop = tiling(list_of_GIs, gene_name)
        if all(m > 70 for m in idens):
            # merge overlapping (start, stop) ranges
            current_start = -1
            current_stop = -1
            result = []
            for start, stop in sorted(start_stop):
                if start > current_stop:
                    result.append((start, stop))
                    current_start, current_stop = start, stop
                else:
                    current_stop = max(current_stop, stop)
                    result[-1] = (current_start, current_stop)
            if len(result) == len(start_stop):
                # Seqs align to different regions of probe, choosing all
                multfinalseqs.append(list_of_GIs)
            else:
                # Seqs overlap: printing to file for hand checking
                unresolved.append(list_of_GIs)
                with open('these_seqs_overlap_cluster.txt', 'a') as a:
                    a.write(str(list_of_GIs) + '\n')
        else:
            # get taxonomy for query(main species)
            print("Parsing taxonomy for error sequences")
            hits = blast(i, list_of_GIs, c)
            lowest = min(hits)
            if hits.count(lowest) == 1:
                # if theres only one lowest taxonomy hit, change
                # (the original indexed an undefined name, `hit_levels`)
                finalseqs.add(str(list_of_GIs[hits.index(lowest)]))
            else:
                # there are multiple lowest taxonomy hits
                multfinalseqs.append(list_of_GIs)
                problem_dic[i] = list_of_GIs

    print("length of resolved = " + str(len(finalseqs)))
    print("length of choose multiple = " + str(len(multfinalseqs)))
    print("length of unresolved = " + str(len(unresolved)))
    with open("final_GIs.txt", "a") as o:
        for m in finalseqs:
            o.write(str(m) + "\n")
    with open("choose_mult.txt", "a") as o:
        for m in [num for pair in multfinalseqs for num in pair]:
            o.write(str(m) + "\n")
    conn.close()
def resolve_seqs(blastdb):
    """Pick one sequence per taxon/gene from the ``blast`` table where possible.

    What the code below does:

    * Collects every ``tc_id``/``Gene_name``/``GI``/``hit_length`` row,
      separately for ``COI_trnL_COII`` and for all other genes, and splits
      them into taxon/gene combinations with a single row and combinations
      with several rows.
    * For ``COI_trnL_COII``, sorts the candidates by length into individual
      (< 2000), whole (2000-3000) and mitogenome (> 3000) sequences. A
      mitogenome is aligned to ``GU365907`` and the aligned region is fetched
      through ``Entrez.efetch``; several individual fragments are tiled and
      the smallest set of ranges covering more than 95% of the merged range
      is kept.
    * Passes the remaining multi-choice groups to
      ``blastlib.clean_seq_funcs.resolve_seqs`` and writes the results.

    Side effects: writes ``mito_COI.fa``, ``<gene>_accession_nums_resolved.txt``
    and ``multiple_gene_choices.txt``; appends to ``final_GIs.txt`` and
    ``COI_hand_check.txt``.

    Args:
        blastdb: path of the SQLite database holding the ``blast`` table.

    NOTE(review): the source indentation was lost. The handling of
    individual COI fragments is placed at loop level (it runs whether or not
    a mitogenome/whole sequence was found) because it starts with ``if``
    rather than ``elif`` - confirm against the intended behaviour.
    """
    import itertools
    import re
    import sqlite3
    from Bio import Seq, Entrez, SeqIO
    # The imported helper deliberately shadows this function's own name
    # inside its body, as in the original.
    from blastlib.clean_seq_funcs import (resolve_seqs, alignment_reg,
                                          alignment_comp, alignment_rev_comp,
                                          identity_calc, tiling)

    conn = sqlite3.connect(blastdb)
    c = conn.cursor()
    Entrez.email = "*****@*****.**"

    # Raw string: the previous '_|\|' literal relied on an invalid escape.
    splitter = re.compile(r'_|\|')

    def tagged_rows(query):
        """Run *query* and return its rows as 'tcid_gene|GI_length' strings."""
        return set(
            str(row[0]) + "_" + str(row[1]) + "|" + str(row[2]) + "_" +
            str(row[3]) for row in c.execute(query))

    def group_by_key(tags, target):
        """Append the 'GI_length' part of each tag to target['tcid_gene']."""
        for tag in tags:
            key, value = tag.split("|")[0], tag.split("|")[1]
            target.setdefault(key, []).append(value)
            genes.add(splitter.split(tag)[1])

    def merge_ranges(spans):
        """Merge overlapping (start, stop) tuples into sorted, disjoint ranges."""
        merged = []
        current_start = -1
        current_stop = -1
        for start, stop in sorted(spans):
            if start > current_stop:
                merged.append((start, stop))
                current_start, current_stop = start, stop
            else:
                current_stop = max(current_stop, stop)
                merged[-1] = (current_start, current_stop)
        return merged

    def covered_length(spans):
        """Total number of positions covered by disjoint (start, stop) ranges."""
        return sum(stop - start + 1 for start, stop in spans)

    def solid_run_index(alignment, columns):
        """Walk *columns* and return the index where a 10-column gap-free run completes.

        Returns the last index visited if no such run exists.
        """
        span = 0
        for l in columns:
            if '-' not in alignment[:, l]:
                span += 1
                if span == 10:
                    break
            else:
                span = 0
        return l

    genes = set()
    dic = {}
    dic_COI = {}
    dic_single = {}
    dic_mult = {}
    records = []

    # this gets list of all taxa/genes regardless if they have multiple gene
    # choices or not
    GI_nums_all = tagged_rows(
        "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
    )
    # this gets a list of all taxa/genes if they only have one gene choice
    GI_nums_single = tagged_rows(
        "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name != 'COI_trnL_COII' GROUP BY tc_id, Gene_name HAVING COUNT(*) =1;"
    )
    # this give me all the GIs that have multiple gene choices
    GI_nums = GI_nums_all - GI_nums_single

    # do the same with COI
    GI_nums_all_COI = tagged_rows(
        "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name == 'COI_trnL_COII' GROUP BY tc_id, Gene_name, GI;"
    )
    GI_nums_single_COI = tagged_rows(
        "SELECT tc_id, Gene_name, GI, hit_length FROM blast WHERE tc_id NOT NULL AND Gene_name = 'COI_trnL_COII' GROUP BY tc_id HAVING COUNT(*) =1;"
    )
    GI_nums_mult_COI = GI_nums_all_COI - GI_nums_single_COI
    # single COI entries longer than 3000 go through the mitogenome handling
    for tag in GI_nums_single_COI:
        if int(tag.split("_")[-1]) > 3000:
            GI_nums_mult_COI.add(tag)

    # makes a dictionary of lists for each taxa/gene choice
    group_by_key(GI_nums, dic)
    # same for COI
    group_by_key(GI_nums_mult_COI, dic_COI)

    # deal with lengths of COI and add to dic to try and resolve
    count = 0
    print("Trying to resolve COI/COII sequences")
    for i in dic_COI:
        count += 1
        print(str(round(float(count) / float(len(dic_COI)) * 100, 2)) + '%')
        lengths = [int(m.split('_')[1]) for m in dic_COI[i]]
        individual = [dic_COI[i][x] for x, l in enumerate(lengths) if l < 2000]
        whole = [
            dic_COI[i][x] for x, l in enumerate(lengths)
            if l > 2000 and l < 3000
        ]
        mito = [dic_COI[i][x] for x, l in enumerate(lengths) if l > 3000]

        if len(mito) > 0:
            GIs_to_align = [mito[0].split("_")[0], 'GU365907']
            alignment = alignment_reg(GIs_to_align)
            iden = identity_calc(alignment)
            if iden > 80:
                # have to do span to account for random small blocks that
                # dont align
                start = solid_run_index(alignment,
                                        range(len(alignment[0]))) - 8
                end = solid_run_index(
                    alignment, reversed(range(len(alignment[0])))) + 10
                handle = Entrez.efetch(db="nucleotide",
                                       rettype="fasta",
                                       retmode="text",
                                       id=mito[0].split("_")[0],
                                       seq_start=start - 1,
                                       seq_stop=end - 2)
                records.append(SeqIO.read(handle, "fasta"))
            else:
                print('Low iden when matching COI to whole mito')
                with open("COI_hand_check.txt", "a") as a:
                    a.write(i + '\n')
        elif len(whole) > 0:
            # pick whole
            if len(whole) == 1:
                GI_nums_single.add(i + "|" + whole[0])
            else:
                # to pipeline
                dic[i] = whole

        if len(individual) == 0:
            continue
        # pick individual
        if len(individual) == 1:
            GI_nums_single.add(i + "|" + individual[0])
            continue

        # do tiling: (start, end) on the reference -> GIs covering that range
        ranges = {}
        for m in [x.split('_')[0] for x in individual]:
            # NOTE(review): elsewhere tiling()'s first return value is
            # treated as a list of identities; here it is compared to a
            # number directly - confirm what tiling returns for one GI.
            iden, start_stop = tiling([m], 'COI_trnL_COII')
            start, end = start_stop[0]
            # uses gi for danaus chrysippus COI_trnL_COII
            if iden > 70:
                ranges.setdefault((start, end), []).append(m)
            else:
                print('Alignment below 70')
                # report the GI that failed (the original printed a variable
                # that only exists when a mitogenome was processed)
                print(m)
        if len(ranges) == 0:
            print('All alignments below 70, printing to file')
            with open("COI_hand_check.txt", "a") as a:
                a.write(i + '\n')
            # nothing left to tile; the original fell through and used
            # variables that were never assigned for this taxon
            continue

        # get merged range
        whole_length = covered_length(merge_ranges(ranges.keys()))

        # merge ranges - want the least number of ranges that covers 95% of
        # the whole range: try each combination size from low to high and
        # stop at the first size that exceeds 95%, keeping the combination(s)
        # with the best coverage at that size
        for L in range(1, len(ranges) + 1):
            max_perc = 0
            for subset in itertools.combinations(ranges.keys(), L):
                comb_length = covered_length(merge_ranges(subset))
                if whole_length >= comb_length:
                    perc = float(comb_length) / float(whole_length)
                    if perc > max_perc:
                        max_perc = perc
                        max_subset = set()
                        max_subset.add(subset)
                    elif perc == max_perc:
                        max_subset.add(subset)
            # goes through all combinations in a level before breaking
            if max_perc > .95:
                break

        # for every position in the tiling keep the longest fragment among
        # the equally good combinations
        final_tiling = [(0, 0)] * L
        for combination in max_subset:
            for x, comb_frag in enumerate(sorted(combination)):
                if comb_frag[1] - comb_frag[0] + 1 > final_tiling[x][
                        1] - final_tiling[x][0] + 1:
                    final_tiling[x] = comb_frag
        possible_GIs = [ranges[x] for x in final_tiling]

        # Separate counter: the original reused `count` here and thereby
        # reset the progress counter of the enclosing loop.
        tile_index = 0
        for m in possible_GIs:
            if len(m) == 1:
                GI_nums_single.add(i + "|" + m[0] + "_0")
            else:
                if tile_index == 0:
                    dic[i] = m
                else:
                    dic[i + "_" + str(tile_index)] = m
                tile_index += 1

    print(dic)
    SeqIO.write(records, "mito_COI.fa", "fasta")

    # pulls out the GIs with first, the longest number of ATCGs and second,
    # the longest length and makes dictionary
    print("Trying to resolve all other sequences")
    count = 0
    for i in dic:
        GIlist = [n.split("_")[0] for n in dic[i]]
        dic[i] = resolve_seqs(GIlist)
        print(str(round((float(count) / float(len(dic))) * 100, 2)) + "%")
        count += 1

    # splits the ones that still have multiple (so the longest had multiple
    # choices) and the ones that are resolved
    for i in dic:
        if len(dic[i]) > 1:
            dic_mult[i] = dic[i]
        else:
            dic_single[i] = dic[i]

    for gene in genes:
        finalGInums_only1 = set()
        finalGInums_longest = set()
        for n in dic_single:
            if gene == splitter.split(n)[1]:
                finalGInums_longest.add(''.join(dic_single[n]))
        for n in GI_nums_single:
            if gene == splitter.split(n)[1]:
                finalGInums_only1.add(splitter.split(n)[-2])
        with open("final_GIs.txt", "a") as o:
            for m in finalGInums_only1:
                o.write(str(m) + "\n")
        # this needs to go to blast_sp_parse.py
        with open(gene + "_accession_nums_resolved.txt", "w") as o:
            for m in finalGInums_longest:
                o.write(str(m) + "\n")

    # this needs to go to cluster.py
    with open("multiple_gene_choices.txt", "w") as w:
        for i in dic_mult:
            w.write(i + "\t" + str(dic_mult[i]) + "\n")
    conn.close()