def alignment_reg(align_GIs, blastdb, qseqbool, gene, c):
    """Fetch the sequences for ``align_GIs`` and align them via ``muscle_cline``.

    Args:
        align_GIs: GIs of the sequences to align.
        blastdb: forwarded unchanged to ``get_seqs_from_sqldb_GI``.
        qseqbool: truthy when the first GI must be fetched as a query
            sequence ("qseq"); all remaining GIs are fetched as hit
            sequences ("hseq"). Falsy means every GI is an "hseq".
        gene: forwarded unchanged to ``get_seqs_from_sqldb_GI``.
        c: forwarded unchanged to ``get_seqs_from_sqldb_GI``.

    Returns:
        The alignment read from the clustal-formatted stdout of
        ``muscle_cline``.
    """
    seqs = []
    # Truthiness (rather than ``== True`` / ``== False``) guarantees that
    # exactly one branch runs, so the function can no longer fall through
    # with nothing fetched and no output buffer defined.
    if qseqbool:
        seqs.extend(
            get_seqs_from_sqldb_GI(align_GIs[:1], "qseq", blastdb, gene, c))
        seqs.extend(
            get_seqs_from_sqldb_GI(align_GIs[1:], "hseq", blastdb, gene, c))
    else:
        seqs.extend(
            get_seqs_from_sqldb_GI(align_GIs, "hseq", blastdb, gene, c))

    # Serialise to FASTA in memory; the text is piped to the aligner's stdin.
    fasta_buffer = StringIO()
    SeqIO.write(seqs, fasta_buffer, "fasta")
    stdout, _stderr = muscle_cline(stdin=fasta_buffer.getvalue())
    return AlignIO.read(StringIO(stdout), "clustal")
def resolve_seqs(list_of_GIs, blastdb, gene, c):
    """Return the GI(s) of the most informative sequence(s) in ``list_of_GIs``.

    Selection rules, applied in order:
      1. the sequence with the uniquely highest count of A/T/C/G characters;
      2. otherwise the one with the uniquely highest ``hit_length`` in the
         ``blast`` table;
      3. otherwise every sequence whose (base count + hit_length) equals the
         maximum of that sum.

    Args:
        list_of_GIs: GIs to choose between.
        blastdb: forwarded unchanged to ``get_seqs_from_sqldb_GI``.
        gene: forwarded unchanged to ``get_seqs_from_sqldb_GI``.
        c: database cursor; also forwarded to ``get_seqs_from_sqldb_GI``.
            Queries use qmark (``?``) placeholders.
            # NOTE(review): assumes a sqlite3 cursor - confirm against callers.

    Returns:
        list of GI strings for the selected accession(s); empty when no
        sequences were retrieved.
    """
    accessions = []
    amount_DNA = []
    for seq_record in get_seqs_from_sqldb_GI(list_of_GIs, "hseq", blastdb,
                                             gene, c):
        accessions.append(seq_record.id)
        # Only upper-case unambiguous bases count as "information".
        amount_DNA.append(
            sum(seq_record.seq.count(base) for base in ("A", "T", "C", "G")))

    if not accessions:
        # Nothing to choose from; max() below would raise on empty lists.
        return []

    # NOTE(review): one entry is appended per matching row, so this list only
    # stays parallel to ``accessions`` when each accession has exactly one
    # row in ``blast`` - confirm that invariant.
    length_DNA = []
    for acc in accessions:
        for row in c.execute(
                "SELECT hit_length FROM blast WHERE accession = ?", (acc, )):
            length_DNA.append(int(row[0]))

    max_amount = max(amount_DNA)
    if amount_DNA.count(max_amount) == 1:
        acc_to_pick = [accessions[amount_DNA.index(max_amount)]]
    else:
        max_length = max(length_DNA)
        if length_DNA.count(max_length) == 1:
            acc_to_pick = [accessions[length_DNA.index(max_length)]]
        else:
            # Indexing (not zip) is deliberate: a length mismatch between the
            # two lists keeps raising IndexError instead of being silently
            # truncated.
            sums = [
                amount_DNA[i] + length_DNA[i] for i in range(len(amount_DNA))
            ]
            best_sum = max(sums)
            acc_to_pick = [
                accessions[i] for i, total in enumerate(sums)
                if total == best_sum
            ]

    GI_to_pick = []
    for acc in acc_to_pick:
        for row in c.execute("SELECT GI FROM blast WHERE accession = ?",
                             (acc, )):
            GI_to_pick.append(str(row[0]))
    return GI_to_pick
def pullseqs(blastdb, email):
    """Write one ``<gene>_cleaned.fa`` FASTA file per gene with chosen seqs.

    For every gene that has at least one "chosen" or mito/chloro row in the
    ``blast`` table this:
      * collects the GIs whose Decision marks them as chosen;
      * adds one randomly selected GI per ``tc_id`` among rows marked
        'Pick one randomly/Chosen';
      * downloads those GIs from the Entrez ``nucleotide`` database in
        batches of 200, retrying every 5 seconds until a request succeeds;
      * appends mito/chloro sequences read through
        ``get_seqs_from_sqldb_GI``;
      * writes everything to ``<gene>_cleaned.fa`` in the working directory.

    Args:
        blastdb: path of the SQLite database to open.
        email: address assigned to ``Entrez.email``.
    """
    from Bio import Entrez, SeqIO
    import sqlite3
    import time
    from random import randint
    from cleanlib.databasing import get_seqs_from_sqldb_GI

    batch_size = 200  # number of ids sent per efetch request
    retry_wait = 5  # seconds between failed efetch attempts

    Entrez.email = email
    conn = sqlite3.connect(blastdb)
    try:
        c = conn.cursor()
        GI_gene_dic = {}
        mitochloro_gene_dic = {}
        genes = set()

        # Decision values present in the blast table:
        # Ambiguous species/not chosen
        # Better tiling/Not chosen
        # Closest to consensus in cluster analysis/Chosen
        # Further from consensus in cluster analysis/Not chosen
        # Longest or most info, good top hit/chosen
        # Mito or chloro sequence/Chosen
        # Only choice/chosen
        # Only or best choice in tiling analysis/chosen
        # Pick one randomly/Chosen
        # Sequence did not have same top blast species, but all aligned correctly, tile 1/Chosen
        # Short or less info, tile 1/Not chosen
        # Short or less info/Not chosen
        # Species not in taxonomy/not chosen
        for row in c.execute(
                "SELECT GI, Gene_name FROM blast WHERE Decision IN ("
                "'Closest to consensus in cluster analysis/Chosen', "
                "'Longest or most info, good top hit/chosen', "
                "'Only choice/chosen', "
                "'Only or best choice in tiling analysis/chosen') "
                "OR Decision LIKE 'Sequence did not have same top blast "
                "species, but all aligned correctly%'"):
            GI_gene_dic[str(row[0])] = row[1]
            genes.add(row[1])
        for row in c.execute(
                "SELECT GI, Gene_name FROM blast WHERE Decision IN "
                "('Mito or chloro sequence/Chosen')"):
            mitochloro_gene_dic[str(row[0])] = row[1]
            genes.add(row[1])

        # NOTE(review): ``genes`` is filled only by the two queries above, so
        # a gene whose rows are all 'Pick one randomly/Chosen' is never
        # processed - confirm this is intended.
        for gene in genes:
            records = []
            # regular (directly chosen) sequences
            seqids = [gi for gi in GI_gene_dic if GI_gene_dic[gi] == gene]

            # one random pick per tc_id
            pick_random_dic = {}
            for row in c.execute(
                    "SELECT tc_id, GI FROM blast WHERE Decision IN "
                    "('Pick one randomly/Chosen') AND Gene_name = ? "
                    "ORDER BY tc_id", (gene, )):
                pick_random_dic.setdefault(row[0], []).append(row[1])
            for tc_id in pick_random_dic:
                candidates = pick_random_dic[tc_id]
                choice = randint(0, len(candidates) - 1)
                seqids.append(str(candidates[choice]))

            # pull the seqs from Entrez, batch by batch
            for start in range(0, len(seqids), batch_size):
                seqids_sub = ",".join(seqids[start:start + batch_size])
                # Retry until the request succeeds. ``except Exception``
                # (not a bare except) keeps Ctrl-C / SystemExit able to
                # break out of the loop.
                while True:
                    try:
                        handle = Entrez.efetch(db="nucleotide",
                                               rettype="fasta",
                                               retmode="text",
                                               id=seqids_sub)
                        break
                    except Exception:
                        print("Error, trying again")
                        time.sleep(retry_wait)
                try:
                    records.extend(SeqIO.parse(handle, "fasta"))
                finally:
                    handle.close()

            # mito/chloro sequences are read via the local database helper
            GI_mito_GI = [
                gi for gi in mitochloro_gene_dic
                if mitochloro_gene_dic[gi] == gene
            ]
            if GI_mito_GI:
                records.extend(
                    get_seqs_from_sqldb_GI(GI_mito_GI, "hseq", blastdb, gene,
                                           c))

            SeqIO.write(records, gene + "_cleaned.fa", "fasta")
    finally:
        # Always release the database connection, even when a step fails.
        conn.close()
def _lineage_to_root(c, tc_id):
    """Return ``[int(tc_id), parent, grandparent, ...]`` following parent_id.

    Walking stops once 0 appears in the lineage, or when a lookup returns no
    row (a dangling id would otherwise loop forever).
    """
    lineage = [int(tc_id)]
    current = tc_id
    while 0 not in lineage:
        # str() keeps the comparison identical to the quoted literal that
        # was previously concatenated into the statement.
        rows = list(
            c.execute(
                "SELECT tc.parent_id FROM taxon_concepts tc, ranks r "
                "WHERE tc_id = ? AND tc.rank_id = r.rank_id",
                (str(current), )))
        if not rows:
            break
        for row in rows:
            current = row[0]
            lineage.append(current)
    return lineage


def blast_all(blast_list, c, gene, taxdb, blastdb):
    """BLAST the given sequences once and find each query's closest shared taxon.

    The sequences for ``blast_list`` are exported to ``error_seqs.fa``,
    blasted against ``<gene>_db.fa`` via ``local_blast``, and the result is
    read from ``error_seqs.fa.xml``. For every query record, the taxonomic
    lineage of the query is compared with the ancestors of its top hits
    (hits are collected until at least 5 non-self HSPs were seen); the first
    entry of the query lineage that is also an ancestor of a hit is recorded.

    Args:
        blast_list: GIs of the sequences to blast.
        c: database cursor used for the ``blast`` / ``taxon_concepts``
            lookups (qmark placeholders) and forwarded to
            ``get_seqs_from_sqldb_GI``.
            # NOTE(review): assumes a sqlite3 cursor - confirm against callers.
        gene: gene name; ``<gene>_db.fa`` is the BLAST database file.
        taxdb: unused; kept for interface compatibility.
        blastdb: forwarded unchanged to ``get_seqs_from_sqldb_GI``.

    Returns:
        dict mapping query GI (str) to the matched taxon id. Queries whose
        accession is not in ``blast``, or that share no lineage entry with
        their hits' ancestors, are absent.
    """
    hit_levels_return = {}
    print("Blasting error sequences (seqs do not align together and the "
          "one picked does not blast to other)")
    iterator = get_seqs_from_sqldb_GI(blast_list, "hseq", blastdb, gene, c)
    export_fasta(iterator, "error_seqs.fa")
    local_blast(gene + "_db.fa", "error_seqs.fa")

    with open("error_seqs.fa.xml") as p:
        print("Parsing blast output")
        for count, rec in enumerate(NCBIXML.parse(p), 1):
            print(
                str(round(float(count) / float(len(blast_list)) * 100, 2)) +
                "%")

            # get GI - blast doesnt pull down GI nums anymore
            queryacc = str(rec.query.split()[0])
            rec_GI = None
            sp_tc_id = None
            for row in c.execute(
                    "SELECT GI, tc_id FROM blast WHERE accession = ?",
                (queryacc, )):
                rec_GI = str(row[0])
                sp_tc_id = str(row[1])
            if rec_GI is None:
                # Unknown accession: skip rather than reuse the previous
                # record's values (or fail on the very first record).
                continue

            query_taxonomy = _lineage_to_root(c, sp_tc_id)

            # GIs of the top hits for THIS query only. Built fresh for every
            # record so hits of earlier queries cannot leak into later ones.
            hit_GIs = set()
            hsp_count = 0
            for alignment in rec.alignments:
                for _hsp in alignment.hsps:
                    if alignment.hit_def == queryacc:
                        continue  # self hit
                    hsp_count += 1
                    hit_GI = None
                    for row in c.execute(
                            "SELECT GI FROM blast WHERE accession = ?",
                        (alignment.hit_def, )):
                        hit_GI = str(row[0])
                    if hit_GI is not None:
                        hit_GIs.add(hit_GI)
                if hsp_count >= 5:
                    break

            # Collect the ancestors of every hit. The hit's own tc_id is
            # intentionally left out (lineage[1:]), as only parent ids were
            # ever added to this set.
            all_hits_taxonomy = set()
            for hit_GI in hit_GIs:
                hit_tc_id = None
                for row in c.execute("SELECT tc_id FROM blast WHERE GI = ?",
                                     (hit_GI, )):
                    hit_tc_id = row[0]
                if hit_tc_id is None:
                    continue
                all_hits_taxonomy.update(_lineage_to_root(c, hit_tc_id)[1:])

            # The query lineage runs from the query's own taxon upwards, so
            # the first match is the closest shared taxon.
            for rank in query_taxonomy:
                if rank in all_hits_taxonomy:
                    hit_levels_return[rec_GI] = rank
                    break
    return hit_levels_return