) i2 on i.taxid = i2.taxid WHERE qseqid in (%s, %s) GROUP BY qseqid, genus ORDER BY i.taxid, pident, coverage""") #GROUP BY qseqid, i.taxid results = dbconnection.cursor() results.execute(query, (novelseq1, novelseq2, novelseq1, novelseq2)) results = list(results) # get homolog data homologs = () for homolog in results: taxid = homolog[1] sgi = homolog[4] print(taxid) lineage = ncbi.get_lineage(taxid) sequence = ncbi.get_gene_seq(sgi) seqdef = ncbi.get_gene_data(sgi)[0]['GBSeq_definition'] homologs = homologs + (homolog + (seqdef, lineage, sequence),) # Build a fasta file from homolog seq homologsRec1=[] homologsRec2=[] # Add novel sequences handle = open("notes/transcriptome/BothNyAd.fa", "rU") record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta")) handle.close() sid = 'Diaphorina citri' name = 'Diaphorina citri'
def _normalize_taxon_name(name):
    """Return *name* with '+' and '_' replaced by spaces and 'sp. ' removed."""
    return name.replace('+', ' ').replace('_', ' ').replace('sp. ', '')


def _find_exact_hit(sp_norm, hits, sm_score):
    """Return the first hit whose normalized names contain *sp_norm*.

    For every hit inspected before a match is found, the best similarity
    ratio between *sp_norm* and that hit's normalized names is appended to
    *sm_score* (mutated in place). Returns ``None`` when no hit matches.
    """
    for hit in hits:
        hit_norm = [_normalize_taxon_name(name) for name in hit['name']]
        if sp_norm in hit_norm:
            return hit
        # ``or [0.0]`` keeps max() from raising on a hit that has no names,
        # while still recording one score per hit so indices stay aligned.
        sm_score.append(
            max([SM(None, sp_norm, name).ratio() for name in hit_norm]
                or [0.0]))
    return None


def search(i, q, l, f_l):
    """Resolve every species name waiting in *q* to a taxon record.

    Worker loop: each name taken from the queue is first looked up with
    ``taxon`` (ENA); if no hit has an exactly matching normalized name, it
    is looked up with ``get_lineage`` (NCBI). An exact match is stored in
    *l*. When neither source yields an exact match, the hit with the
    highest name-similarity ratio is stored in *f_l* (or ``[]`` when there
    were no hits at all).

    Args:
        i: Worker index, used only in progress messages.
        q: Queue of species names; must support ``get(False)``,
            ``qsize()`` and ``task_done()``.
        l: Mapping filled with ``species -> exactly matching hit``.
        f_l: Mapping filled with ``species -> closest hit`` (or ``[]``)
            for species without an exact match.

    The function returns ``None`` once the queue is empty. ``task_done()``
    is called exactly once per item, even when a lookup raises, so that
    ``q.join()`` in the caller cannot hang on a failed item.

    NOTE(review): hits are assumed to be dicts whose ``'name'`` entry is a
    list of strings, and ``SM`` is presumably ``difflib.SequenceMatcher``
    -- confirm against the rest of the file.
    NOTE(review): an earlier comment here mentioned sleeping 0.5 s between
    requests, but no delay is performed in this function.
    """
    try:
        from queue import Empty
    except ImportError:  # Python 2 spelling of the module
        from Queue import Empty

    while True:
        # Non-blocking get: checking qsize() and then calling a blocking
        # get() can dead-lock when another worker takes the last item.
        try:
            sp = q.get(False)
        except Empty:
            break

        try:
            print(
                "[Thread:%d Queue:%d]Searching taxon information for %s on ENA database"
                % (i, q.qsize(), sp))
            # Normalize once, up front, so the NCBI pass can use it even
            # when the ENA search returns no hits.
            sp_norm = _normalize_taxon_name(sp)
            sm_score = []

            ena_hits = taxon(sp)
            match = _find_exact_hit(sp_norm, ena_hits, sm_score)
            if match is not None:
                l[sp] = match
                print(
                    "[Thread:%d Queue:%d]Taxon information hasbee found for %s on ENA database"
                    % (i, q.qsize(), sp))
                continue

            print(
                "[Thread:%d Queue:%d]Exact match is not found in ENA database, searching %s on NCBI database"
                % (i, q.qsize(), sp))
            ncbi_hits = get_lineage(sp)
            match = _find_exact_hit(sp_norm, ncbi_hits, sm_score)
            if match is not None:
                l[sp] = match
                print(
                    "[Thread:%d Queue:%d]Taxon information hasbee found for %s on NCBI database"
                    % (i, q.qsize(), sp))
                continue

            print(
                "[Thread:%d Queue:%d]%s can't be found in both database, added into fail list."
                % (i, q.qsize(), sp))
            # No exact match anywhere: sm_score holds one entry per hit, in
            # the same order as this concatenation.
            hits = list(ena_hits) + list(ncbi_hits)
            if not hits:
                f_l[sp] = []
            else:
                print(sm_score)
                print(len(hits))
                max_idx = sm_score.index(max(sm_score))
                f_l[sp] = hits[max_idx]
        finally:
            # Runs on match (continue), on failure and on exceptions alike.
            q.task_done()
taxainfo['blast_record'] = get_BLAST(taxainfo['taxid'], qseqid) try: taxainfo['sbjctseq'] = taxainfo['blast_record'].alignments[0].hsps[0].sbjct[:50] print(taxainfo['sbjctseq']) try: taxainfo['GI'] = int(taxainfo['blast_record'].alignments[0].title.split('|')[1]) print("GI is: " + str(taxainfo['GI'])) except: print("GI not valid for " + taxa + " : " + taxainfo['blast_record'].alignments[0].title.split('|')[1]) try: taxainfo['seqdef'] = ncbi.get_gene_data(taxainfo['GI'])[0]['GBSeq_definition'] print('GBSeq_definition is ' + taxainfo['seqdef']) except: print("Unable to get GBSeq_definition") try: taxainfo['lineage'] = ncbi.get_lineage(taxainfo['taxid']) except: print("Unable to get Lineage") try: taxainfo['sequence'] = ncbi.get_gene_seq(taxainfo['GI']) except: print("unable to get sequence") except: print("No match found for " + taxa) taxarecords.append(taxainfo) handle = open('temp/' + record + '.pickle', 'w') pickle.dump(taxarecords, handle) handle.close() homologrec = [] for trecord in taxarecords: try:
record_dict[novelseq1].description = novelseq1 record_dict[novelseq2].description = novelseq2 homologs = () for ataxa in list(alltaxa)[:4]: print("lookup " + ataxa) taxid = ncbi.get_taxid(ataxa) print(ataxa + " tax id is " + str(taxid)) qseq1 = str(record_dict[novelseq2].seq) blast_record = get_BLAST(taxid, qseq1) try: sbjctseq1 = blast_record.alignments[0].hsps[0].sbjct[:50] sgi1 = blast_record.alignments[0].title.split('|')[1] print("1", sbjctseq1) seqdef1 = ncbi.get_gene_data(sgi1)[0]['GBSeq_definition'] lineage1 = ncbi.get_lineage(taxid) sequence1 = ncbi.get_gene_seq(sgi1) except: print("No match") sgi1 = 0 qseq2 = str(record_dict[novelseq1].seq) blast_record = get_BLAST(taxid, qseq2) try: sbjctseq2 = blast_record.alignments[0].hsps[0].sbjct[:50] sgi2 = blast_record.alignments[0].title.split('|')[1] print("2", sbjctseq2) seqdef2 = ncbi.get_gene_data(sgi2)[0]['GBSeq_definition'] lineage2 = ncbi.get_lineage(taxid) sequence2 = ncbi.get_gene_seq(sgi2)