def _filter_homologues(get_all_homologues, seqs, min_identity, keep_ids=None, nucleotide=False): print 'Filtering out close homologues. This will take a wile:' command = 'blastn' if nucleotide else 'blastp' dbname = '' try: with user_message('Formatting blast DB', '\n'): dbname = BlastCLI.format_tmp_db(seqs, nucleotide) if not dbname: print 'Unable to make temporary BLAST database.' return None with ProgressCounter('Searching for homologues using local blastp...', len(seqs)) as prg: homologues = get_all_homologues(seqs, min_identity, dbname, command, prg) except Exception as e: print '%s\n' % str(e) return None finally: if dbname: shutil.rmtree(os.path.dirname(dbname), ignore_errors=True) if not homologues: return seqs with user_message('Removing all homologs from each group except the first one...'): remove = set() if keep_ids: keep_ids = set(keep_ids) for seq in seqs: if seq.id in remove: continue h = homologues.pop(seq.id, set()) if h: if keep_ids: nhoms = len(h) h -= keep_ids if nhoms != len(h) and seq.id not in keep_ids: h.add(seq.id) remove.update(h) return [seq for seq in seqs if seq.id not in remove]
def hmmsearch_genes(self, hmms, genome, table='Standard', decorate=False, **kwargs):
    '''Run one or more HMM profiles against the translated genes/CDS of a genome.
    @param hmms: a single path to an .hmm file, or a list of such paths
    @param genome: annotated SeqRecord whose gene features are indexed,
        translated and searched
    @param table: translation table passed to the Translator
    @param decorate: if True, append each HSP to genome.features as an annotation
    @return: dict {feature_index: (hit, translation_record)}, or None if any
        stage produced nothing
    '''
    #get _genes
    genes = get_indexes_of_all_genes(genome)
    if not genes: return None
    #tag every gene feature with its feature index and a sequential gene id
    #so translated records can be mapped back to genome.features
    for gene_id, gi in enumerate(genes):
        genome.features[gi].qualifiers['feature_id'] = gi
        genome.features[gi].qualifiers['gene_id'] = gene_id
    #translate _genes
    with user_message('Translating _genes/CDS of %s' % genome.description, '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate_features(genome, genes, table)
        if not translation: return None
    if isinstance(hmms, str): hmms = [hmms]
    results = dict()
    for hmm in hmms:
        with user_message('Performing hmm search.'):
            hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
            if not hmm_results: return None
        with user_message('Parsing search results...'):
            #get hit_ids of hmm matches
            hits = dict()
            for result in hmm_results:
                for hit in result.iterhits():
                    hits[hit.id] = hit
            #get indexes of features where hmm hit
            hit_features = dict()
            for t in translation:
                if t.id in hits:
                    fid = t.features[0].qualifiers.get('feature_id')
                    if fid is None: continue
                    hit_features[fid] = hits[t.id], t
            if hit_features: results.update(hit_features)
        #decorate genome
        if decorate:
            with user_message('Adding results as annotations...'):
                hmm_name = os.path.basename(hmm)
                for f in hit_features:
                    feature = genome.features[f]
                    #HSP coordinates are in amino acids, hence the *3; offsets
                    #are taken from the feature start (+) or end (-) strand
                    for hsp in hit_features[f][0]:
                        if feature.strand == 1:
                            hmm_location = FeatureLocation(
                                feature.location.start + hsp.hit_start * 3,
                                feature.location.start + hsp.hit_end * 3,
                                feature.strand)
                        else:
                            hmm_location = FeatureLocation(
                                feature.location.end - hsp.hit_end * 3,
                                feature.location.end - hsp.hit_start * 3,
                                feature.strand)
                        hmm_feature = self.hsp2feature(
                            hmm_name, 'HMM_annotations', hmm_location, hsp)
                        genome.features.append(hmm_feature)
    return results if results else None
def hmmsearch_genome(self, hmm, genome, table='Standard', decorate=False, **kwargs):
    '''Search a single HMM profile against the translated genes/CDS of a genome.
    @param hmm: path to a .hmm profile file
    @param genome: annotated SeqRecord to search
    @param table: translation table passed to the Translator
    @param decorate: if True, append each HSP to genome.features as a misc_feature
    @return: dict {feature_index: (hit, translation_record)}, or None if any
        stage produced nothing
    '''
    #get genes
    genes = get_indexes_of_genes(genome)
    if not genes: return None
    #remember original feature index and a sequential gene id on each feature
    for gene_id, gi in enumerate(genes):
        genome.features[gi].qualifiers['feature_id'] = gi
        genome.features[gi].qualifiers['gene_id'] = gene_id
    #translate genes
    with user_message('Translating genes/CDS of %s' % genome.description, '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate(genome, genes, table)
        if not translation: return None
    with user_message('Performing hmm search.'):
        results = self.hmmsearch_recs(hmm, translation)
        if not results: return None
    with user_message('Parsing search results...'):
        #get hit_ids of hmm matches
        hits = dict()
        for result in results:
            for hit in result.iterhits():
                hits[hit.id] = hit
        #get indexes of features where hmm hit
        hit_features = dict()
        for t in translation:
            if t.id in hits:
                fid = t.features[0].qualifiers.get('feature_id')
                if fid is None: continue
                hit_features[fid] = hits[t.id], t
    #decorate genome
    if decorate:
        with user_message('Adding results as annotations...'):
            hmm_name = os.path.basename(hmm)
            for f in hit_features:
                feature = genome.features[f]
                #HSP coordinates are in amino acids (hence *3), measured from
                #the feature's start on the + strand and from its end on the - strand
                for hsp in hit_features[f][0]:
                    if feature.strand == 1:
                        hmm_location = FeatureLocation(feature.location.start+hsp.hit_start*3,
                                                       feature.location.start+hsp.hit_end*3,
                                                       feature.strand)
                    else:
                        hmm_location = FeatureLocation(feature.location.end-hsp.hit_end*3,
                                                       feature.location.end-hsp.hit_start*3,
                                                       feature.strand)
                    hmm_feature = SeqFeature(hmm_location, type='misc_feature')
                    hmm_feature.qualifiers['hmm_model'] = hmm_name
                    hmm_feature.qualifiers['bitscore'] = hsp.bitscore
                    hmm_feature.qualifiers['psi_evalue'] = hsp.psi_evalue
                    hmm_feature.qualifiers['evalue_cond'] = hsp.evalue_cond
                    hmm_feature.qualifiers['acc_average'] = hsp.acc_avg
                    hmm_feature.qualifiers['bias'] = hsp.bias
                    genome.features.append(hmm_feature)
    print 'Done.\n'
    return hit_features
def _main(self):
    '''Scan a 16S alignment for columns where the main group (Fervidicoccales)
    differs from the other groups, printing per-group letter statistics for
    each such "signature" column plus a set of predefined E.coli positions.'''
    alifile = '/home/allis/Documents/INMI/Fervidicoccales-signature/arb-silva.de_2016-04-06_id331139.fasta'
    group_names = ['Fervidicoccales', 'Acidilobales', 'Desulfurococcales', 'Thermoproteales:Sulfolobales', 'Other']
    #E.coli reference positions that are always reported
    predefined_positions = [34, 501, 544, 1244, 1293]
    ref_name = 'Escherichia'
    reference = None
    groups = ListDB()
    with user_message('Loadding initial alignment...', '\n'):
        ali = AlignmentUtils.load_first(alifile)
        if not ali: return 1
    with user_message('Sorting alignment into subgroups...', '\n'):
        for rec in ali:
            if ref_name in rec.description:
                reference = rec
                continue
            found = False
            for g in group_names:
                #group names may list several taxa separated by ':'
                for k in g.split(':'):
                    if k in rec.description:
                        groups[g] = rec
                        found = True
                        break
                #NOTE(review): this break only exits the inner loop over keys,
                #so a record matching several groups is added to each of them —
                #confirm this is intended
            if not found: groups['Other'] = rec
        groups = dict((n, AlignmentExt(groups[n])) for n in groups)
    ali_len = ali.get_alignment_length()
    #convert reference positions to alignment column indexes
    predefined_positions = [self._col_index(i, reference) for i in predefined_positions]
    print ('\nReference sequence:\n>%s\n%s' % (reference.description,
                                               str(reference.seq).replace('.', '').replace('-', '')))
    print '\nAlignment: %d seqs, %d columns' % (len(ali), ali_len)
    print print_table([(g, '%d sequences' % len(groups[g])) for g in group_names])
    print
    main_group = group_names[0]
    main_ali = groups[main_group]
    others = group_names[1:]
    for ci in xrange(ali_len):
        main_letter = self.LetterStats(main_ali[:,ci])
        predef = ci in predefined_positions
        #report a column if it is predefined, or if it is well conserved in the
        #main group and differs from at least one of the other groups
        if predef or main_letter.freq_no_gaps >= 0.95 and main_letter.freq > 0.5:
            other_letters = [self.LetterStats(groups[g][:,ci]) for g in others]
            if predef or any(l.letter != main_letter.letter for l in other_letters):
                print ('------------------ E.coli position: %d ---------------------'
                       % (self._ref_index(ci, reference)+1))
                print print_table([(main_group, str(main_letter))]+
                                  [(g, str(l)) for g, l in zip(others, other_letters)])
                print
    print 'Done'
def blastn_annotate(self, tag_sequences, subject_record, min_identity, evalue=0.001, **kwargs):
    '''Blast every tag sequence against subject_record with blastn and attach
    each sufficiently identical HSP to subject_record.features.
    @param tag_sequences: nucleotide records used as blastn queries
    @param subject_record: SeqRecord that receives the annotations
    @param min_identity: minimal identities/align_length ratio [0..1] of an HSP
    @return: True if at least one annotation was added, False otherwise
        (including the case when the blast batch itself failed)
    '''
    batch = self.s2s_blast_batch(tag_sequences, [subject_record],
                                 evalue=evalue, command='blastn', **kwargs)
    if batch is None: return False
    annotated = False
    with user_message('Adding results as annotations...'):
        for tag, tag_results in zip(tag_sequences, batch):
            if not tag_results: continue
            record = tag_results[0]
            if not record: continue
            tag_name = pretty_rec_name(tag)
            if tag_name != tag.id:
                tag_name += ' (%s)' % tag.id
            for hit in record:
                for alignment in hit.alignments:
                    for hsp in alignment.hsps:
                        identity = hsp.identities / float(hsp.align_length)
                        if identity < min_identity: continue
                        #blast lists subject coordinates in reverse order for minus-strand hits
                        if hsp.sbjct_start < hsp.sbjct_end:
                            location = FeatureLocation(hsp.sbjct_start-1, hsp.sbjct_end, 1)
                        else:
                            location = FeatureLocation(hsp.sbjct_end-1, hsp.sbjct_start, -1)
                        feature = self.hsp2feature(tag_name, 'blastn_annotations', location, hsp)
                        self.add_program(feature, 'blastn')
                        subject_record.features.append(feature)
                        annotated = True
    return annotated
def hmmsearch_genome(self, hmms, genome, table='Standard', decorate=False, **kwargs):
    '''Search HMM profile(s) against the six-frame translation of a whole genome.
    @param hmms: a single .hmm file path or a list of paths
    @param genome: SeqRecord to translate and search
    @param table: translation table passed to the Translator
    @param decorate: if True, append each HSP to genome.features as an annotation
    @return: list of per-frame hmmer results, or None if nothing was found
    '''
    #translate _genes
    with user_message('Translating whole genome in 6 reading frames', '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate_six_frames(genome, table)
        if not translation: return None
    if isinstance(hmms, str): hmms = [hmms]
    results = []
    for hmm in hmms:
        with user_message('Performing hmm search.'):
            hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
            if not any(len(r) for r in hmm_results): continue
            results += hmm_results
        #decorate genome
        if decorate:
            #NOTE(review): this rebinds `translation` (so far a list of frame
            #records) to a dict keyed by record id; on the next iteration of
            #the hmm loop hmmsearch_recs would receive that dict instead of
            #the record list — looks like a bug when several hmms are searched
            #with decorate=True; confirm against hmmsearch_recs
            translation = dict((t.id, t) for t in translation)
            with user_message('Adding results as annotations...'):
                hmm_name = os.path.basename(hmm)
                glen = len(genome)
                for frame in hmm_results:
                    for hit in frame:
                        frec = translation[hit.id]
                        #each frame record carries its genome offset and strand
                        start = frec.annotations['start']
                        strand = frec.annotations['strand']
                        #convert amino-acid HSP coordinates back to genome
                        #coordinates: *3, offset by the frame start; minus-strand
                        #frames are measured from the genome's end
                        for hsp in hit:
                            if strand == 1:
                                hmm_location = FeatureLocation(
                                    start + hsp.hit_start * 3,
                                    start + hsp.hit_end * 3,
                                    strand)
                            else:
                                hmm_location = FeatureLocation(
                                    glen - start - hsp.hit_end * 3,
                                    glen - start - hsp.hit_start * 3,
                                    strand)
                            hmm_feature = self.hsp2feature(
                                hmm_name, 'HMM_annotations', hmm_location, hsp)
                            genome.features.append(hmm_feature)
    return results if results else None
def replace_node_labels(cls, treefile, labels, schema=None, outfile=None): '''Reads a tree from file and replaces node labels according to provided mapping. The modified tree is returned as DendroPy.Tree object or is written to the provided output file. @param labels: dict, replacement table @param outfile: the name of the file to write the modified tree ''' with user_message('Loading tree file...', '\n'): tree = cls.load(treefile, schema) if not tree: print 'No tree loaded.' return None with user_message('Processing tree...', '\n'): for leaf in tree.leaf_node_iter(): label = leaf.taxon.label.replace(' ', '_') if label in labels: leaf.taxon.label = labels[label] if outfile: tree.write(path=outfile, schema=schema) return tree
def safe_load(cls, files): with user_message("Loading sequences..."): try: view = cls() view.load(files) except Exception as e: print str(e) return None if len(view) == 0: print 'No sequences were loaded from:\n%s' % files view.close() return None return view
def blastp_annotate(self, tag_sequences, subject_record, min_identity, evalue=0.001, table=11, **kwargs):
    '''Annotate subject_record by blasting protein tag sequences against its
    six-frame translation.
    @param tag_sequences: protein records used as blastp queries
    @param subject_record: nucleotide SeqRecord that receives the annotations
    @param min_identity: minimal identities/align_length ratio [0..1] of an HSP
    @param table: translation table used for the six-frame translation
    @return: True if at least one annotation was added, False otherwise
    '''
    # translate subject in six frames
    with user_message('Translating whole genome in 6 reading frames', '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate_six_frames(subject_record, table)
        if not translation: return False
    results = self.s2s_blast_batch(tag_sequences, translation,
                                   evalue=evalue, command='blastp', **kwargs)
    if results is None: return False
    with user_message('Adding results as annotations...'):
        annotated = False
        subj_len = len(subject_record)
        for i, tag in enumerate(tag_sequences):
            if not results[i]: continue
            tag_name = pretty_rec_name(tag)
            if tag_name != tag.id:
                tag_name += ' (%s)' % tag.id
            #results[i] holds one blast result per reading frame
            for frame, record in enumerate(results[i]):
                if not record: continue
                frec = translation[frame]
                #each frame record carries its offset and strand in the subject
                start = frec.annotations['start']
                strand = frec.annotations['strand']
                for hit in record:
                    for ali in hit.alignments:
                        for hsp in ali.hsps:
                            if hsp.identities / float(hsp.align_length) < min_identity: continue
                            #protein HSP coordinates are converted back to nucleotide
                            #positions: *3 and offset by the frame start; minus-strand
                            #frames are measured from the subject's end
                            if strand == 1:
                                location = FeatureLocation(start+(hsp.sbjct_start-1)*3,
                                                           start+hsp.sbjct_end*3,
                                                           strand)
                            else:
                                location = FeatureLocation(subj_len-start-hsp.sbjct_end*3,
                                                           subj_len-start-hsp.sbjct_start*3,
                                                           strand)
                            feature = self.hsp2feature(tag_name, 'blastp_annotations', location, hsp)
                            self.add_program(feature, 'blastp')
                            subject_record.features.append(feature)
                            annotated = True
    return annotated
def process_sequences(seqs, _depth):
    '''Recursive step of the ring-blast search (a closure: relies on
    blast_filter_fetch, check_sequences, from_shelf, extended_set, depth and
    self from the enclosing ring_blast scope — presumably this chunk belongs
    inside ring_blast; confirm against the full file).
    Blasts seqs, unshelves the hits, merges already-known sequences into
    extended_set and recurses on the genuinely new ones until _depth reaches 0.'''
    if _depth == 0: return
    with user_message('RingBlast: processing %d sequences of the %d ring.'
                      % (len(seqs), depth-_depth+1), '\n'):
        next_ring = blast_filter_fetch(seqs)
        if not next_ring: return
    to_check = []
    next_to_process = []
    for n in next_ring:
        next_seqs = from_shelf(n)
        if not next_seqs: continue
        for ns in next_seqs:
            sid = self.base_sid(ns)
            if sid in extended_set:
                #FIXME: need to merge sequences properly, instead of replacing
                if len(extended_set[sid]) < len(ns):
                    extended_set[sid] = ns
            else: to_check.append(ns)
    #only candidates confirmed as reciprocal by check_sequences go deeper
    if not to_check or not check_sequences(to_check, next_to_process): return
    if next_to_process:
        process_sequences(next_to_process, _depth-1)
def annotate_tree(cls, treefile, organisms, outfile=None, schema=None, **kwargs): ''' Annotate input tree with taxonomy information using edge labels and colors. @param treefile : a file containing the tree to be annotated @param organisms: organisms database @param outfile: optional : basename for output file; Note: the last extension will be stripped @param schema: data format of the treefile Accepted kwargs: @param beautify leafs: bool (True) : replaces IDs in leafs' labels with organism names @param mark_leafs: list(str) : mark nodes with the specified labels in bold @param collapse_taxa : list(str) : collapses all subtrees belonging to given taxa @param collapse_last : bool (False) : changes display method of genus subtrees in Dendroscope to trapezium nodes @param collapse_hard: bool (False) : removes collapsed subtrees, leaving a single node @param collapse_min_nodes : int (3) : only collapse subtrees with number of leafs greater or equal than this @param min_support : float (0: disabled) : nodes with support less that this will be removed from the tree, children being relinked to parents @param reroot_at : string ('') : reroot the tree at specified leaf; special value 'midpoint' reroots the at midpoint; special value 'unroot' unroots the tree @param lineage_colors : dict : a dictionary of colors as (r, g, b) tuples with lowercase taxons as keys; special value 'auto' causes to automatically assign colors @param top_lineage: a Lineage object to be subtracted from lineages of organisms on the tree; if not provided, it is computed automatically ''' with user_message('Processing tree file...', '\n'): tree = cls.load(treefile, schema) if not tree: print 'No tree loaded.' 
return False #need to get the root before beautifying new_root = None root_name = kwargs.pop('reroot_at', '') min_support = kwargs.pop('min_support', False) beautify_leafs = kwargs.pop('beautify_leafs', False) mark_leafs = kwargs.pop('mark_leafs', set()) for leaf in tree.leaf_node_iter(): label = leaf.taxon.label.replace(' ', '_') if label in mark_leafs: cls._add_format(leaf, " x=0.0 y=0.0 ft='Ubuntu-BOLDITALIC-14' ll=7;") if not new_root and label == root_name: new_root = leaf org = organisms.get(label) if not org: continue leaf.edge.lineage = org.lineage if beautify_leafs: leaf.taxon.label = '%s' % (org.description or org.id) if org.id != leaf.taxon.label: leaf.taxon.label += ' (%s)' % org.id leaf.taxon.label = leaf.taxon.label.replace('_', ' ') leaf.label = leaf.taxon.label if min_support: for node in tree.postorder_internal_node_iter(exclude_seed_node=True): try: support = float(node.label) except ValueError: pass if support < min_support and node.edge: node.edge.collapse(adjust_collapsed_head_children_edge_lengths=True) #reroot the tree before traversing if root_name == 'unroot': with user_message('Unrooting tree...'): tree.deroot() if root_name == 'midpoint': with user_message('Rerooting tree at midpoint...'): tree.reroot_at_midpoint(update_bipartitions=True) elif new_root: with user_message('Rerooting tree at %s...' 
% root_name): tree.to_outgroup_position(new_root, update_bipartitions=True) else: print 'Node for rerooting not found: %s' % root_name #annotate the tree with user_message('Adding taxonomy information to the tree...', '\n'): top_lineage = kwargs.pop('top_lineage', None) if not isinstance(top_lineage, Lineage): top_lineage = organisms.common_lineage colors = kwargs.pop('lineage_colors', None) if colors == 'auto': pass#TODO cls._set_node_taxonomy(tree.seed_node, top_lineage, None, kwargs.pop('collapse_taxa', []), kwargs.pop('collapse_last', False), kwargs.pop('collapse_min_nodes', 3), kwargs.pop('collapse_hard', False), colors) with user_message('Saving resulting tree...'): if not outfile: outfile = cls.strip_ext(treefile)+'.out' xtreefile = outfile+'.nexml' tree.write(path=outfile+'.tre', schema='newick') tree.write(path=xtreefile, schema='nexml') with open(outfile+'.dot', 'w') as out: tree.write_as_dot(out, edge_formatter=lambda e: e.label or '') with user_message('Tuning nexml file for Dendroscope...'): cls._postprocess_nexml(xtreefile) return True
def _main(self):
    '''Design discriminating primer pairs for a 16S alignment: optionally add
    extra SILVA sequences and realign, trim, sort outgroups to the bottom,
    search for forward/reverse discriminating primers and save the results.'''
    min_prod = 400
    silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva.fasta'
    alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
    add_filename = FilenameParser.strip_ext(alifile) + '.with_additions.fasta'
    outgroups = ['Thermococcus_chitonophagus', 'SMTZ1-55', 'contig72135_1581_sunspring_meta']
    add = ['KF836721.1.1270', 'EU635905.1.1323']
    exclude = [] #['Thermococcus_chitonophagus', 'SMTZ1-55', 'BA1-16S', 'contig72135_1581_sunspring_meta']
    #load alignment; if the augmented file already exists, use it and skip additions
    if os.path.isfile(add_filename):
        alifile = add_filename
        add_filename = ''
    with user_message('Loadding initial alignment...', '\n'):
        orig_ali = AlignmentUtils.load_first(alifile)
        if not orig_ali: return 1
    #load homologs
    if add_filename:
        with user_message('Loadding additional sequences...', '\n'):
            add_seqs = []
            db = SeqView()
            if db.load(silva_db):
                for sid in add:
                    seq = db.get(sid)
                    if seq: add_seqs.append(seq)
                    else: print '%s not found in %s' % (sid, silva_db)
        #realign data if needed
        #NOTE(review): placed inside "if add_filename" since add_seqs is only
        #defined there — confirm against the original formatting
        if add_seqs:
            with user_message('Realigning data...', '\n'):
                add_filename = FilenameParser.strip_ext(alifile) + '.with_additions.fasta'
                AlignmentUtils.align(list(orig_ali) + add_seqs, add_filename)
                orig_ali = AlignmentUtils.load_first(add_filename)
                if not orig_ali: return 2
    #process the alignment
    ali = orig_ali.remove(*exclude).trim()
    for out in outgroups:
        if not ali.index(out):
            print '%s not found in the alignment' % out
            return 3
    #push outgroup records to the bottom of the alignment
    ali.sort(key=lambda r: 'zzzzzzzz' if r.id in outgroups else r.id)
    AlignmentUtils.save(ali, '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.trimmed.fasta')
    #search for discriminating primers in both directions
    args = dict(plen=(20, 40),
                max_mismatches=8,
                min_match_mismatches=1,
                first_match_mismatches=1,
                first_may_match=1,
                AT_first=True,
                outgroup=len(outgroups))
    fprimers = PrimerFinder.find_discriminating_primers(ali, **args)
    rprimers = PrimerFinder.find_discriminating_primers(ali, reverse=True, **args)
    pairs = PrimerFinder.compile_pairs(fprimers, rprimers, min_prod, 'SSBa')
    if not pairs:
        print '\nNo suitable primer pairs found'
        return 3
    PrimerFinder.print_pairs(pairs)
    orig_ali = PrimerFinder.add_pairs_to_alignment(pairs, orig_ali)
    AlignmentUtils.save(orig_ali, '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.with_primers.aln.fasta')
    print 'Done'
#NOTE(review): the next three lines are the tail of a signal-handler function
#whose header is above this chunk — indentation reconstructed, confirm
    abort_event.set(); sleep(0.1)
    clean_tmp_files()
#end def

if __name__ == '__main__':
    from multiprocessing import Event
    from BioUtils.Tools.Output import user_message
    from BioUtils.SeqUtils import load_files, load_dir
    _pid = os.getpid()
    #setup signal handler
    signal.signal(signal.SIGINT, sig_handler)
    signal.signal(signal.SIGTERM, sig_handler)
    signal.signal(signal.SIGQUIT, sig_handler)
    abort_event = Event()
    #load all *.gb genomes from the directory
    with user_message('Loading genomes...', '\n'):
        genomes_dir = u'/home/allis/Documents/INMI/Aerobic-CODH/genomes/'
        genome_names = ['Thermococcus_barophilus_Ch5-complete.gb',
                        'Thermococcus_onnurineus_NA1-complete-genome.gb',
                        'Thermococcus_sp._ES1.gb',
                        'Thermococcus-DS1-preliminary.gb']
        genomes = load_dir(abort_event, genomes_dir, 'gb', r'.*\.gb')
        if not genomes: sys.exit(1)
#        load_files(abort_event, [os.path.join(genomes_dir, f) for f in genome_names], 'gb')
    #search each genome with the CoxL HMM, decorating the records with the hits
    hmm = u'/home/allis/Documents/INMI/Aerobic-CODH/COX-EC/COX-EC_1.2.99.2_CoxL.hmm'
    hmmer = Hmmer(abort_event)
    for g in genomes:
        results = hmmer.hmmsearch_genome(hmm, g, table=11, decorate=True)
def g2g_blastp(self, reference, subjects, table='Standard', evalue=0.001, max_rlen=0, features_of_interest=None): ''' Perform blastp of each coding sequence of the reference against each subject, which is first translated gene-by-gene. Parameters @param reference: SeqRecord object of the reference genome @param subjects: a list of SeqRecord objects of subject genomes @param table: translation table number (see NCBI site for description) @param evalue: filter out blastp results with E-value grater than this @param max_rlen: filter out blastp results which are shorter than this fraction of target gene length @param features_of_interest: list of dictionaries of the form {qualifier_name : qualifier_value} to mark features denoting known clusters that should be analyzed one against the other @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) where CDS is a gene/CDS feature from the reference.features list and blast_resultN is a list of results for the N-th subject, containing following information: (hit_feature, align_length, percent_identity, evalue) where hit_feature is a SeqFeature object of the gene/CDS of the subject where top blast hit is located, align_length is the length of the hit, percent_identity is the ratio of number of identities and align_length [0; 1] and evalue is the E-value of the top hit. 
''' if not reference or not subjects: print 'No reference or subject sequences provided' return None #get list of features to query with user_message('Searching for gene/CDS features in provided sequences...'): all_records = [reference]+subjects num_records = len(all_records) features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), range(num_records), all_records) if self.aborted(): print '\nAborted' return None if not features or not features[0]: print ('\nReference sequence does not contain annotated genes:\n%s %s' % (reference.id, reference.description)) return None if len([f for f in features if f]) < 2: print '\nSubject sequences do not contain annotated genes' return None #add gene ids for ri, genes in enumerate(features): if not genes: continue r = all_records[ri] for gene_id, gi in enumerate(genes): r.features[gi].qualifiers['feature_id'] = gi r.features[gi].qualifiers['gene_id'] = gene_id #get features of interest if requested fois = None if features_of_interest: with user_message('Searching for features of interest...'): fois = [] for foi in features_of_interest: foi = self._get_fois(all_records, foi) if foi and foi[0]: fois.append(foi) if self.aborted(): print '\nAborted' return None #translate features to proteins with Progress('Translating genes found in the reference and subjects...', num_records) as prg: translator = Translator(self._abort_event) translations = [None]*num_records foi_translations = [[None]*num_records for _f in fois] for i, (f, rec) in enumerate(zip(features, all_records)): if not f: prg.step(i) continue translation = translator.translate(rec, f, table) if not translation: return None if i > 0: translations[i] = cat_records(translation) if fois: for ifoi, foi in enumerate(fois): foi_loc = [0, 0] for foi_var in foi[i]: if not foi_var: continue for gid in foi_var: l = translations[i].features[gid].location foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1 foi_loc[1] = 
max(int(l.end), foi_loc[1]) if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc else: translations[i] = translation if fois: for ifoi, foi in enumerate(fois): foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]] prg.step(i) #blast features against subjects with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'): stranslations = translations[1:] blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, command='blastp', task='blastp') if self.aborted(): print '\nAborted' return None if not blast_results: print '\nBlast have not returned any results.' return None if fois: #redo blast for fois and replace the results with user_message('Rerunning blast for FOIs...', '\n'): for ifoi, foi in enumerate(foi_translations): sfoi_locs = foi[1:] for i, foi_var in enumerate(foi[0]): foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, command='blastp', task='blastp') if self.aborted(): print '\nAborted' return None if not foi_blast: continue for gi, gid in enumerate(fois[ifoi][0][i]): if foi_blast[gi]: blast_results[gid] = foi_blast[gi] #process blast results pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations)))) with ProgressCounter('Searching for genes in subjects that overlap with top blast hits...', len(pairs)) as prg: work = self.Work() work.start_work(self._find_features_by_hsps, pairs, None, stranslations, blast_results) @MultiprocessingBase.results_assembler def assembler(index, result, blast_results, pairs, prg): qs = pairs[index] blast_results[qs[0]][qs[1]] = result prg.count() work.assemble(assembler, blast_results, pairs, prg) if not work.wait(): return None return zip((reference.features[f] for f in features[0]), blast_results)
def ring_blast(self, query, db='nr', evalue=0.001, blast_filter=None, depth=1, command='blastn', **kwargs):
    '''Perform a blast search with the given query to obtain the core set of hits.
    Make another search with each hit as a query. If results of the second
    search contain new hits, check if these are reciprocal by yet another
    search with them and checking that results contain hits from the core set
    and if they are, add them to the final set.
    @param query: a SeqRecord or a list of SeqRecords to start from
    @param db: blast database to search
    @param blast_filter: optional callable applied in-place to each blast result
    @param depth: number of rings to expand beyond the core set
    @return: list of unique sequences gathered, or None on failure
    '''
    if isinstance(query, SeqRecord): query = [query]
    def blast_filter_fetch(seqs):
        #blast each sequence in parallel, filter, fetch the alignments and
        #shelve each non-empty result; returns the list of shelf keys
        @MultiprocessingBase.data_mapper
        @shelf_result
        def worker(s):
            r = self.blast_seq(s, db, evalue, command)
            if r and blast_filter: blast_filter(r)
            if r: return self.fetch_results(r, db, what='alignment')
            return None
        results = []
        total = len(seqs)
        prg = ProgressCounter('Performing blast search for %d sequences:' % total, total)
        @MultiprocessingBase.results_assembler
        def assembler(i, res):
            if res: results.append(res)
            prg.count()
        with prg:
            if not self.parallelize2(1, worker, assembler, seqs): return None
        return results
    with user_message('RingBlast: building a core set of sequences.', '\n'):
        core_seqs = blast_filter_fetch(query)
        if not core_seqs: return None
        core_seqs = self.unique_seqs(chain.from_iterable(from_shelf(r) for r in core_seqs))
    #extended_set maps base sequence ids to the best record seen so far
    extended_set = dict((self.base_sid(s), s) for s in core_seqs)
    if depth <= 0: return core_seqs
    #the core set becomes a local DB used for reciprocity checks
    core_db = self.format_tmp_db(core_seqs, command.endswith('n'))
    def check_sequences(seqs, next_to_process):
        #reciprocity check: a candidate is accepted only if it hits the core DB;
        #accepted candidates are added to extended_set and to next_to_process
        total = len(seqs)
        prg = ProgressCounter('RingBlast: checking %d new sequences:' % total, total)
        @MultiprocessingBase.data_mapper
        def worker(seq):
            res = self.blast_seq(seq, core_db, 100, command)
            if res and blast_filter: blast_filter(res)
            return bool(res), seq
        @MultiprocessingBase.results_assembler
        def assembler(i, res):
            prg.count()
            if not res[0]: return
            seq = res[1]
            extended_set[self.base_sid(seq)] = seq
            next_to_process.append(seq)
        with prg:
            return self.parallelize2(1, worker, assembler, seqs)
    def process_sequences(seqs, _depth):
        #expand one ring: blast seqs, merge known hits, recurse on new ones
        if _depth == 0: return
        with user_message('RingBlast: processing %d sequences of the %d ring.'
                          % (len(seqs), depth-_depth+1), '\n'):
            next_ring = blast_filter_fetch(seqs)
            if not next_ring: return
        to_check = []
        next_to_process = []
        for n in next_ring:
            next_seqs = from_shelf(n)
            if not next_seqs: continue
            for ns in next_seqs:
                sid = self.base_sid(ns)
                if sid in extended_set:
                    #FIXME: need to merge sequences properly, instead of replacing
                    if len(extended_set[sid]) < len(ns):
                        extended_set[sid] = ns
                else: to_check.append(ns)
        if not to_check or not check_sequences(to_check, next_to_process): return
        if next_to_process:
            process_sequences(next_to_process, _depth-1)
    process_sequences(core_seqs, depth)
    return extended_set.values()
def _main(self):
    # One-off driver script: ring-blast a 16S rRNA query against a local
    # metagenome DB, align the hits, extend with SILVA, build and annotate a tree.
    # Return codes: 1 no blast hits, 3 alignment failed, 4 tree build failed,
    # 0 tree annotated, 2 annotation failed (or skipped).
    query = simple_rec('AAACTGGGGCTAATACCCGATGGGTGAGGAGGCCTGGAATGGTTCTTCACCGAAAAGACGTTGAGACCATGCTTTTCAACGTTGCCTAAGGATGGGGCCGCGTCCGATCAGGTTGTTGGTGGGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGGGAGCGGAAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGTCGCGAAAACTCCGCAATGCGCGAAAGCGTGACGGGGCTACCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAATGAGCCTGGGGAATAAGGAGAGGGCAAGCCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTAGGGATGATTATTGGGCTTAAAGCGTCCGTAGCCAGCCCGGCAAGTCTCCCGTTAAATCCAGCGACCTAATCGTTGGGCTGCGGAAGATACTGTTGGGCTAGGGGGCGGGAGAGGCCGACGGTATTCCCGGGGTAGGGGTGAAATCCTATAATCCTGGGAGGACCACCAGTGGCGAAGGCTGTCGGCTAGAACGCGCTCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACTGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCTCAGTGCCGCAGGGAAGCCATTAAGCCCGCCGCCTGGGAAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGCGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGGCGACAGCAGGATGAGGGCCAGATTGAAGGTCTTGCTTGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGCAACGATCGAGACCCGCACCCTTAGTTGCAACCCCTGCGGAACCCGCAGGGGGCACACTACGGGAACTGCCGCCGATAAGGCGGAGGAAGGAGCGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAGAGACAATGGGTTCCAACCTTGAAAGAGGGAGGTAATCCCTAAACCCTGCCTCAGTTGGGATCGAGGGCTGCAACCCGCCCTCGTGAACATGGAATGCCTAGTAATCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTGGGTGAGGCGTGGTCTGTTGGCCGCGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT', 'BA2-16S')
    suns_db = '/home/allis/Documents/INMI/SunS-metagenome/BlastDB-big/sunspring_meta'
    silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva'
    # Reference sequences appended to every alignment; gapped literals are
    # degapped with .replace('-', '').
    additions = [simple_rec('AAACTGGGGCTAATCCCCCATAGGCCTGGGGTACTGGAAGGTCCCCAGGCCGAAAGGG------GACCGTA-----AGGTCCCGCCCGAGGATGGGCCGGCGGCCGATTAGGTAGTTGGTGGGGTAACGGCCCACCAAG--CCGAAGATCGGTACGGGCC-GTGAGAGCGGGAGCCCGGAGATGGACA---CTGAGACACGGGTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCGCAATGCGGGAAACCGCGACGGGGGGACCCCCAGTGCCGTGCCTCTGGC-----ACGGCTTTTCCGGAGTG-TAAAAAGCTCCGGGAATAAGGGCTGGGCAAGGCCGGTGGC-AGCCGCCGCGGTAATACCGGCGGCCCGAGTGGTGGCCACTATTATTGGGCCTAAAGCGGCCGTAGCCGGGCCCGTAAGTCCCTGGCG-AAATCCCACGGCTCAACCGTGGGGCTCGCTGGGGATACTGCGG-GCCTTGGGACCGGGAGAGGCCGGGGGTACC-CCCGGGGTAGGGGTGAAATCCTATAATCCCGGGGGGACCGCCAGT-GGCGAAGGCGCCC--GGCTGGAACGGGTCCGACGGTGAGGGCCGAAGGCC-AGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCTGGCTGTAAAGGATGCGGGCTAGGTGTCGGGCGAG-CTTCGAGCTCGC-CCGGTGCCGTAGGGAAGCCGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACTACAAGGGGTGGAGCGTGCGGTTTAATTGGATTCAACGCCGGGAACCTCACCGGGGGCGACGGCAGGATGAA-GGCCAGGCTGAAGGTCTTGCCGGACGCGCCGAGAGGAG-----------------------------------GTGCATGGCCGCCGTCAGCTCGTACCGTGAGGCGTCCA-CTTAAGTGTGGTAACGAGCGAGACCCGC--GCCCCCAGTTGCCAGTCCCTCCCGCTGGGA---GGGAGGC-ACTCTGGGGGG-ACTGCCGGCGAT-AAGCCGGAGGAAGGGGCGGGCGACGGTAGGTCAGTATG-CCCCGAAACCC-CCGGGCT-ACACGCGCGCTACAATGGGCGGGACAATGGGA-CCCGACCCCGAAAGGGGAAGGGAATCCCCTAAACCCGCCCTCAGTTCGGATCGCGGGCTG-CAACTCGCCCGCGTGAAGC-TGGAAT-CCCTAGTACCCGCGCGTCATCATCGCGCGGCGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCACTCCACCCGAG-CGGGGCCC-GGGTGAGGCCCGATCTCCTTCGGGAGGTCGGGTCGAGCCTGGGCTC-CGTGAGGGGGG-AGAAGTCGTAACAAGGTAGCC------------------------------'.replace('-', ''), 'Thermococcus_chitonophagus'),
                 simple_rec('AAACTGGGATTAATACCCACTAAATGATAATACCTGGAATGGCTTATCATTGAAAGAC-TCTGGAAACATGCTTC-CAGCGTCGCCCAAGG-------------------------------------------------------------------------------GGAGCCCGGAGATGGAAA---CTGAGACAAGGTTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCACAATGCGCGAAAGCGTGATGGGGTTATCCCGAGTGCCGTCCGATGAGG-----ATGGCTTTTCCTCGGTG-TAAGGATCCGAGGGAATAAAGGGGGGGCAAGACTGGTGTC-AGCCGCCGCGGTAATACCAGCTCCCTGAGTGGTAAGGACGATTATTTGGCCTAAAGCGTCCGTAGCCGGCTTATCAAGTCTCTTGTT-AAACCCAGTGATTCAATCATTGACCT-GCAAGAGATACTGTTA-TGCTAGAGGACGGGAGAGGTCGACGG---------GGGTAGGGGTGAAATCCTATAATCCTTGGAGGACCACCAGT-GGCGAAGGCGGTC--GACTAGAACGTGCCTGACGGTGAGGGACGAAAGCT-GGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTAG-CTACGAGCTACT-CCAGTGCCGCAGAGAAGTTGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACCACAAGGGGTGAAGGCTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGCTGACAGCAGAGTGAA-GGCCAGACTGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCT-GTTAAGTCAGGCAACGAACGAGACCCCC--ACTGTTAGTTGCCAGCGAATTCCAACGGAAT--GTCGGGC-ACACTAACAGG-ACTGCCACCGAT-AAGGTGGAGGAAGGAGGGGGCAACGGCAGGTCAGTATG-CCCC--------------------------------------------------------------------------------------------------------------GAACTCGCCCTCATGAACA-TGGAAT-CCCTAGTAACCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCAAG-TCGGGTCT-AGATGAGGCGCAGTCTTCT-----TGGCTACGTCGAATCTGGGTTC-GGTGAGGGGGG-AGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT'.replace('-', ''), 'SMTZ1-55'),
                 simple_rec('ACTCCGGTTGATCCTGCCGGACCCCACTGCTATCGGGGTAGGACTTAACCATGCGAGTTGTGCGTCCCCAAGCCATGGTGGGGGCGCGGCATACGGCTCAGTAACACGTGGCTAACCTAGCCTTTGGACGGGGACAACCCCGGGAAACTGGGGCTAATCCCCGATGGGTGGGAAGGCCTGGAATGGTTTCCCACCGAAAGGGCGTCTGAACCATGCTTCAGGCGTTGCCGAAGGATGGGGCCGCGGCCGATCAGGTTGTTGGTGAGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGAGAGCGGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGGTGCGAAAACTCCGCGATGCGCGAAAGCGTGACGGGGCTATCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAGGGAGCCGGGGGAATAAGGAGAGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTGGGGACAATTATTGGGCTTAAAGCGTCCGTAGCCGGCCCATCAAGTCTCTTGTTAAATCCAGCGATCCAATCGCTGGACTGCGGGAGATACTGCTGGGCTAGGGGGCGGGAGAAGCCGATGGTATTCTCGGGGTAGGGGTGAAATCCTATAATCCCGGGAGGACCACCAGTGGCGTAGGCGGTCGGCTAGAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCCGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCCCAGTGCCGCATGGAAGCAATTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAAAGGAACAGCGTTTTGTTGTTCCTCTGGATACCTTACCGGGGGCGACAGCAGGATGAAGGCCAGATTGAAGGTCTTGCTGGACGAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGTAACGATCGAGACCCACACCCCCAGTTGCTACCTCTTCGGAGGGCACTCTAGGGGTACTGCCGCCGATAAGGCGGAGGAAGGAGTGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAAGGACAATGGGTTCTGACCCCGAGAGGGGAAGGTAATCCCGAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACCCGCCCTCGTGAACATGGAATCCCTAGTAATCGCGGGTCACCAGCCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTAGGTGAGGCGTGGTCCTTGTGGCTGTGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTC', 'BA1-16S')
                 ]
    #prepare filter
    # NOTE(review): these BlastFilter lambdas take a single argument, but the
    # other _main variant in this file passes two-argument lambdas (a, r) —
    # confirm which arity the current BlastFilter API expects.
    filt = BlastFilter(lambda a: a.hsps[0].align_length > 1100)
    filt.AND = BlastFilter(lambda a: all(hsp.score > 500 for hsp in a.hsps))
    filt.AND.AND = BlastFilter(lambda a: all(hsp.identities/float(hsp.align_length) > 0.8 for hsp in a.hsps))
    #make ring-blast
    blast = BlastCLI(self.abort_event)
    orig_seqs = blast.ring_blast(query, suns_db, 100, filt, 3)
    if not orig_seqs:
        print 'No blast results.'
        return 1
    nseqs = len(orig_seqs)
    print 'RingBlast to:\n%s\nreturned %d sequences.\n' % (suns_db, nseqs)
    #save an initial alignment
    self.fix_ids(orig_seqs)
    alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
    with user_message('Aligning retrieved sequences...', '\n'):
        if not AlignmentUtils.align(orig_seqs+[query]+additions, outfile=alifile): return 3
    #search for additional homologs (depth 0: only the core hits of silva_db)
    add_seqs = blast.ring_blast(orig_seqs, silva_db, 100, filt, 0)
    if add_seqs:
        self.fix_ids(add_seqs)
        print 'RingBlast to:\n%s\nreturned %d additional sequences.\n' % (silva_db, len(add_seqs))
        #build an alignment
        seqs = orig_seqs+add_seqs+[query]+additions
        alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.fasta'
        with user_message('Aligning retrieved sequences...', '\n'):
            if not AlignmentUtils.align(seqs, outfile=alifile): return 3
        #build a tree
        treefile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.tre'
        if not PhyloUtils.build_fast_tree(alifile, treefile): return 4
        #annotate the tree: parse an iPCR report and collect IDs of hits whose
        #product length is within 920+-60 bp; those leafs are marked on the tree
        with open('/home/allis/Documents/INMI/16S/SSBaF4-SSBaR4-1_243072232-iPCR-report.txt') as inp: # SSBaF4-SSBaR4_65397396-iPCR-report.txt
            sids = set()
            # NOTE(review): r'\\s*' in this raw string matches a literal
            # backslash followed by 's*' — probably r'\s*' was intended; confirm.
            len_re = re.compile(r'(\s|^)(\d+)(\sbp|\\s*:)?', re.MULTILINE)
            entry = False      # inside a hit entry of the report
            cur_sid = None     # ID of the hit being parsed
            cur_len = -1       # PCR product length of the hit being parsed
            for l in inp:
                # NOTE(review): lines from file iteration keep their trailing
                # '\n', so this equality can never be True — probably should
                # be l.startswith(...); confirm against the report format.
                if l == '========= histograms and electrophorograms of PCR products of each hit =========': break
                if l.startswith('---'):
                    #entry separator: flush the current hit if its product
                    #length is within the expected range
                    entry = False
                    if cur_sid and cur_len > 0 and abs(cur_len-920) < 60:
                        sids.add(cur_sid)
                    cur_sid = None
                    cur_len = -1
                    continue
                if entry or '#' in l:
                    entry = True
                    plen = len_re.search(l)
                    if plen: cur_len = int(plen.group(2))
                    sid = BlastID.extract(l)[0]
                    if sid: cur_sid = sid
        organisms = Organisms.from_records(seqs)
        if PhyloUtils.annotate_tree(treefile, organisms,
                                    reroot_at='Thermococcus_chitonophagus',
                                    # beautify_leafs=True,
                                    # collapse_taxa=['miscellaneous crenarchaeotic group', 'thaumarchaeota'],
                                    # collapse_last=True,
                                    # collapse_hard=True,
                                    mark_leafs=sids, # [r.id for r in orig_seqs+[query]+additions],
                                    lineage_colors={'miscellaneous crenarchaeotic group':(0, 0, 255),
                                                    'thaumarchaeta':(255,0,0)},
                                    top_lineage=Lineage('archaea')): return 0
    return 2
from multiprocessing import Event from BioUtils.Tools.Output import user_message from BioUtils.SeqUtils import load_files _pid = os.getpid() #setup signal handler signal.signal(signal.SIGINT, sig_handler) signal.signal(signal.SIGTERM, sig_handler) signal.signal(signal.SIGQUIT, sig_handler) if True: # from DegenPrimer import MultiprocessingBase # MultiprocessingBase.cpu_count = 1 abort_event = Event() lb = BlastCLI(abort_event) with user_message('Loading genomes...', '\n'): genomes_dir = u'/home/allis/Dropbox/Science/Микра/Thermococcus/sequence/GenBank/Thermococcus' genome_names = [ 'Thermococcus_barophilus_Ch5-complete.gb', 'Thermococcus_onnurineus_NA1-complete-genome.gb', 'Thermococcus_sp._ES1.gb', 'Thermococcus-DS1-preliminary.gb' ] genomes = load_files( abort_event, [os.path.join(genomes_dir, f) for f in genome_names], 'gb') ref = genomes[0] subj = genomes[1:] @shelf_result def g2g2shelf():
def _main(self):
    # One-off driver script: load a 16S alignment (optionally extended with
    # extra SILVA records), trim it, search for degenerate primer pairs with
    # self._find_primers, and save an alignment with the primers appended.
    # Return codes: 1/2 alignment load failed, 3 outgroup missing or no pairs.
    min_prod = 400   # minimal acceptable PCR product length (bp)
    silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva.fasta'
    alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
    add_filename = FilenameParser.strip_ext(alifile)+'.with_additions.fasta'
    outgroups = ['Thermococcus_chitonophagus', 'SMTZ1-55', 'contig72135_1581_sunspring_meta']
    add = ['KF836721.1.1270','EU635905.1.1323']   # SILVA IDs to append
    exclude = []#['Thermococcus_chitonophagus', 'SMTZ1-55', 'BA1-16S', 'contig72135_1581_sunspring_meta']
    #load alignment: prefer the already-extended file if it exists
    if os.path.isfile(add_filename):
        alifile = add_filename
        add_filename = ''
    with user_message('Loadding initial alignment...', '\n'):
        orig_ali = AlignmentUtils.load_first(alifile)
        if not orig_ali: return 1
    #load homologs (only when the extended alignment was not found)
    if add_filename:
        with user_message('Loadding additional sequences...', '\n'):
            add_seqs = []
            db = SeqView()
            if db.load(silva_db):
                for sid in add:
                    seq = db.get(sid)
                    if seq: add_seqs.append(seq)
                    else: print '%s not found in %s' % (sid, silva_db)
        #realign data if needed
        if add_seqs:
            with user_message('Realigning data...', '\n'):
                add_filename = FilenameParser.strip_ext(alifile)+'.with_additions.fasta'
                AlignmentUtils.align(list(orig_ali)+add_seqs, add_filename)
                orig_ali = AlignmentUtils.load_first(add_filename)
                if not orig_ali: return 2
    #process the alignment
    ali = orig_ali.remove(*exclude).trim()
    for out in outgroups:
        # NOTE(review): if ali.index returns a positional index, index 0 is
        # falsy and would be treated as "not found" — confirm the API.
        if not ali.index(out):
            print '%s not found in the alignment' % out
            return 3
    #sort outgroups to the bottom of the alignment
    ali.sort(key=lambda r: 'zzzzzzzz' if r.id in outgroups else r.id)
    ali_len = ali.get_alignment_length()
    AlignmentUtils.save(ali, '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.trimmed.fasta')
    #primer search parameters (see self._find_primers)
    args = dict(plen = (20,40),
                max_mismatches = 8,
                min_match_mismatches = 1,
                first_match_mismatches = 1,
                first_may_match = 1,
                AT_first=True,
                outgroup=len(outgroups))
    fprimers = self._find_primers(ali, **args)
    #reverse primers are searched on the reverse-complemented alignment
    rprimers = self._find_primers(ali.reverse_complement(), **args)
    #combine forward/reverse primers into pairs with a long-enough product
    pairs = []
    for i, (fs, fp) in enumerate(fprimers):
        start = fs
        fprimer = Primer.from_sequences(fp[:-1], 1, 'SSBaF%d' % fs)
        for _j, (rs, rp) in enumerate(rprimers):
            end = ali_len-rs
            if end-start <= min_prod: continue
            pairs.append((fprimer, Primer.from_sequences(rp[:-1], 1, 'SSBaR%d' % (ali_len-rs+1))))
    if not pairs:
        print '\nNo suitable primer pairs found'
        return 3
    #report the pairs and append each distinct primer to the alignment,
    #padded with gaps to the alignment length
    added = set()
    for i, (fp, rp) in enumerate(pairs):
        print '\npair %d' % (i+1)
        print '%s: %s' % (fp.id, fp)
        print '%s: %s' % (rp.id, rp)
        if fp.id not in added:
            orig_ali.append(fp.master_sequence+'-'*(orig_ali.get_alignment_length()-len(fp)))
            added.add(fp.id)
        if rp.id not in added:
            #reverse primer is stored reverse-complemented to match the alignment strand
            orig_ali.append(copy_attrs(rp.master_sequence,
                                       rp.master_sequence.reverse_complement())+
                            '-'*(orig_ali.get_alignment_length()-len(rp)))
            added.add(rp.id)
    print
    orig_ali = AlignmentUtils.align(orig_ali)
    AlignmentUtils.save(orig_ali, '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.with_primers.aln.fasta')
    print 'Done'
def _main(self):
    # One-off driver script (second variant): ring-blast a 16S rRNA query
    # against a local metagenome DB, align the hits, extend with SILVA,
    # build a tree and annotate it marking the retrieved leafs.
    # Return codes: 1 no blast hits, 3 alignment failed, 4 tree build failed,
    # 0 tree annotated, 2 annotation failed (or skipped).
    query = simple_rec(
        'AAACTGGGGCTAATACCCGATGGGTGAGGAGGCCTGGAATGGTTCTTCACCGAAAAGACGTTGAGACCATGCTTTTCAACGTTGCCTAAGGATGGGGCCGCGTCCGATCAGGTTGTTGGTGGGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGGGAGCGGAAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGTCGCGAAAACTCCGCAATGCGCGAAAGCGTGACGGGGCTACCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAATGAGCCTGGGGAATAAGGAGAGGGCAAGCCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTAGGGATGATTATTGGGCTTAAAGCGTCCGTAGCCAGCCCGGCAAGTCTCCCGTTAAATCCAGCGACCTAATCGTTGGGCTGCGGAAGATACTGTTGGGCTAGGGGGCGGGAGAGGCCGACGGTATTCCCGGGGTAGGGGTGAAATCCTATAATCCTGGGAGGACCACCAGTGGCGAAGGCTGTCGGCTAGAACGCGCTCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACTGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCTCAGTGCCGCAGGGAAGCCATTAAGCCCGCCGCCTGGGAAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGCGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGGCGACAGCAGGATGAGGGCCAGATTGAAGGTCTTGCTTGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGCAACGATCGAGACCCGCACCCTTAGTTGCAACCCCTGCGGAACCCGCAGGGGGCACACTACGGGAACTGCCGCCGATAAGGCGGAGGAAGGAGCGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAGAGACAATGGGTTCCAACCTTGAAAGAGGGAGGTAATCCCTAAACCCTGCCTCAGTTGGGATCGAGGGCTGCAACCCGCCCTCGTGAACATGGAATGCCTAGTAATCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTGGGTGAGGCGTGGTCTGTTGGCCGCGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT',
        'BA2-16S')
    suns_db = '/home/allis/Documents/INMI/SunS-metagenome/BlastDB-big/sunspring_meta'
    silva_db = '/home/allis/Documents/INMI/SILVA-DB/SILVA_123_SSURef_Nr99_tax_silva'
    # Reference sequences appended to every alignment; gapped literals are
    # degapped with .replace('-', '').
    additions = [
        simple_rec(
            'AAACTGGGGCTAATCCCCCATAGGCCTGGGGTACTGGAAGGTCCCCAGGCCGAAAGGG------GACCGTA-----AGGTCCCGCCCGAGGATGGGCCGGCGGCCGATTAGGTAGTTGGTGGGGTAACGGCCCACCAAG--CCGAAGATCGGTACGGGCC-GTGAGAGCGGGAGCCCGGAGATGGACA---CTGAGACACGGGTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCGCAATGCGGGAAACCGCGACGGGGGGACCCCCAGTGCCGTGCCTCTGGC-----ACGGCTTTTCCGGAGTG-TAAAAAGCTCCGGGAATAAGGGCTGGGCAAGGCCGGTGGC-AGCCGCCGCGGTAATACCGGCGGCCCGAGTGGTGGCCACTATTATTGGGCCTAAAGCGGCCGTAGCCGGGCCCGTAAGTCCCTGGCG-AAATCCCACGGCTCAACCGTGGGGCTCGCTGGGGATACTGCGG-GCCTTGGGACCGGGAGAGGCCGGGGGTACC-CCCGGGGTAGGGGTGAAATCCTATAATCCCGGGGGGACCGCCAGT-GGCGAAGGCGCCC--GGCTGGAACGGGTCCGACGGTGAGGGCCGAAGGCC-AGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCTGGCTGTAAAGGATGCGGGCTAGGTGTCGGGCGAG-CTTCGAGCTCGC-CCGGTGCCGTAGGGAAGCCGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACTACAAGGGGTGGAGCGTGCGGTTTAATTGGATTCAACGCCGGGAACCTCACCGGGGGCGACGGCAGGATGAA-GGCCAGGCTGAAGGTCTTGCCGGACGCGCCGAGAGGAG-----------------------------------GTGCATGGCCGCCGTCAGCTCGTACCGTGAGGCGTCCA-CTTAAGTGTGGTAACGAGCGAGACCCGC--GCCCCCAGTTGCCAGTCCCTCCCGCTGGGA---GGGAGGC-ACTCTGGGGGG-ACTGCCGGCGAT-AAGCCGGAGGAAGGGGCGGGCGACGGTAGGTCAGTATG-CCCCGAAACCC-CCGGGCT-ACACGCGCGCTACAATGGGCGGGACAATGGGA-CCCGACCCCGAAAGGGGAAGGGAATCCCCTAAACCCGCCCTCAGTTCGGATCGCGGGCTG-CAACTCGCCCGCGTGAAGC-TGGAAT-CCCTAGTACCCGCGCGTCATCATCGCGCGGCGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCACTCCACCCGAG-CGGGGCCC-GGGTGAGGCCCGATCTCCTTCGGGAGGTCGGGTCGAGCCTGGGCTC-CGTGAGGGGGG-AGAAGTCGTAACAAGGTAGCC------------------------------'
            .replace('-', ''), 'Thermococcus_chitonophagus'),
        simple_rec(
            'AAACTGGGATTAATACCCACTAAATGATAATACCTGGAATGGCTTATCATTGAAAGAC-TCTGGAAACATGCTTC-CAGCGTCGCCCAAGG-------------------------------------------------------------------------------GGAGCCCGGAGATGGAAA---CTGAGACAAGGTTCCAGGCCCTACGGGGCGCAGCAGGCGCGAAACC-TCCACAATGCGCGAAAGCGTGATGGGGTTATCCCGAGTGCCGTCCGATGAGG-----ATGGCTTTTCCTCGGTG-TAAGGATCCGAGGGAATAAAGGGGGGGCAAGACTGGTGTC-AGCCGCCGCGGTAATACCAGCTCCCTGAGTGGTAAGGACGATTATTTGGCCTAAAGCGTCCGTAGCCGGCTTATCAAGTCTCTTGTT-AAACCCAGTGATTCAATCATTGACCT-GCAAGAGATACTGTTA-TGCTAGAGGACGGGAGAGGTCGACGG---------GGGTAGGGGTGAAATCCTATAATCCTTGGAGGACCACCAGT-GGCGAAGGCGGTC--GACTAGAACGTGCCTGACGGTGAGGGACGAAAGCT-GGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCTGTAAACGATGCGGGCTAGGTGTTGGGGTAG-CTACGAGCTACT-CCAGTGCCGCAGAGAAGTTGTTAAGCCCGCCGCC-TGGGGAGTACGGCCGCAAGGCT-GAAACTTAAAGGAATT-GGCGGGGGAGC-ACCACAAGGGGTGAAGGCTGCGGTTTAATTGGAGTCAACGCCGGGAACCTTACCGGGGCTGACAGCAGAGTGAA-GGCCAGACTGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGAAGATCTTGCCAGACAAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCT-GTTAAGTCAGGCAACGAACGAGACCCCC--ACTGTTAGTTGCCAGCGAATTCCAACGGAAT--GTCGGGC-ACACTAACAGG-ACTGCCACCGAT-AAGGTGGAGGAAGGAGGGGGCAACGGCAGGTCAGTATG-CCCC--------------------------------------------------------------------------------------------------------------GAACTCGCCCTCATGAACA-TGGAAT-CCCTAGTAACCGCGTGTCATCATCGCGCGGTGAATACGTCCCCGCTCCTTGCACACACCGCCCGTCGCTCCATCCAAG-TCGGGTCT-AGATGAGGCGCAGTCTTCT-----TGGCTACGTCGAATCTGGGTTC-GGTGAGGGGGG-AGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTCCT'
            .replace('-', ''), 'SMTZ1-55'),
        simple_rec(
            'ACTCCGGTTGATCCTGCCGGACCCCACTGCTATCGGGGTAGGACTTAACCATGCGAGTTGTGCGTCCCCAAGCCATGGTGGGGGCGCGGCATACGGCTCAGTAACACGTGGCTAACCTAGCCTTTGGACGGGGACAACCCCGGGAAACTGGGGCTAATCCCCGATGGGTGGGAAGGCCTGGAATGGTTTCCCACCGAAAGGGCGTCTGAACCATGCTTCAGGCGTTGCCGAAGGATGGGGCCGCGGCCGATCAGGTTGTTGGTGAGGTAACGGCTCACCAAGCCTATAACCGGTACGGGCCGTGAGAGCGGGAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGGTGCGAAAACTCCGCGATGCGCGAAAGCGTGACGGGGCTATCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCGGTGTAGGGAGCCGGGGGAATAAGGAGAGGGCAAGTCTGGTGTCAGCCGCCGCGGTAATACCAGCTCTCCGAGTGGTGGGGACAATTATTGGGCTTAAAGCGTCCGTAGCCGGCCCATCAAGTCTCTTGTTAAATCCAGCGATCCAATCGCTGGACTGCGGGAGATACTGCTGGGCTAGGGGGCGGGAGAAGCCGATGGTATTCTCGGGGTAGGGGTGAAATCCTATAATCCCGGGAGGACCACCAGTGGCGTAGGCGGTCGGCTAGAACGCGCCCGACGGTGAGGGACGAAAGCTGGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCCGTAAACGATGCGGGCTAGGTGTTGGGGTGGCTACGAGCCACCCCAGTGCCGCATGGAAGCAATTAAGCCCGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGGAAAGGAACAGCGTTTTGTTGTTCCTCTGGATACCTTACCGGGGGCGACAGCAGGATGAAGGCCAGATTGAAGGTCTTGCTGGACGAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAGGTAACGATCGAGACCCACACCCCCAGTTGCTACCTCTTCGGAGGGCACTCTAGGGGTACTGCCGCCGATAAGGCGGAGGAAGGAGTGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGGGCCACACGCGAGCTGCAATGGCAAGGACAATGGGTTCTGACCCCGAGAGGGGAAGGTAATCCCGAAACCCTGCCTCAGTTGGGATCGAGGGCTGAAACCCGCCCTCGTGAACATGGAATCCCTAGTAATCGCGGGTCACCAGCCCGCGGTGAATACGTCCCTGCTCCTTGCACACACCGCCCGTCGCTCCATCCGAGTGGGGTTTAGGTGAGGCGTGGTCCTTGTGGCTGTGTCGAATCTAGGCTTCGCGAGGAGGGAGAAGTCGTAACAAGGTGGCCGTAGGGGAACCTGCGGCCGGATCACCTC',
            'BA1-16S'),
        simple_rec(
            'CTGGTGGAAATATAGAAGAGGCCAAATCCGGGGTTCAGGCCGCCCGGGGTAATTACCCGTTGTCGGAGTGGGGGGGGGACGCTATTGGGGCTTAAGCCATCGTTAGCCCGTTTGACCAGGTCTCTTGTTAAATCAGGCGGATTTATTGGTCGATTGCAGGAGATTATGTTCGTCTTAGGGGCCGGAGGAGTCAACAGTATTCCCGGGGTAGGAGTGAATGCCTATATTCCCGGAGGTACCACCAGTGGGGACGCCGTTGGTATAGAACGCGCCGGCCGGTGATGGAATGAAAGTGAGGGAACCGACCCGAATTAGATACCGGGGTATTGCTACCGTTAACCGATGCAGCTTAGGTGTTCGGGTGGTTACTAGCCATTCGAGTGCGCCAGGGAAGCTGTCAGGCTTACCGCTTGGGAAGTGCGGCTGCAGGGCCAAAACTTAAGGAAATCGCCGGGGAAGCACCCCAGGGGGTGAAGCTTGCGCTTTAATGGAATTCACCGCGGTAATTCTCACCGGGGGAGCCACCAGGAGGAAAGCCAGATTAAAGTTCTTGTTGGCGGAGTGGAGAGGAGGTGCATGCCGTTCGCCAGTTCTTCCCGGGAGGTTCTTGTTAGTTCAGCCACCGATGAGGACCGCCATCCCCTGTTGTTATTGGCCTTGCGCCAGGCACACTGGGGAGACCGCCGCCGATAAGGCGGAGGAAGGAGCGGGCCACGGCAGGTCAGTATGCCCCGAATCCCCCGTCCACACGCGAGGGGCAATG',
            '155a'),
        simple_rec(
            'CAAGTCCTATAACCGGTACGGGCCGTGGGAGCGGTAGCCCGGAGATGGGCACTGAGACAAGGGCCCAGGCCCTACGGGGCGCAGCAGTCGCGAAACCTCCGCAATACGCGAAAGCGTGACGGGGTCATCCCGAGTGCCGTCCGCTGAGGATGGCTTTTCCCCAGTGTAGACAGCTGGGGGAATAAGGAGAGGGCAAGTCGGGTGTCAGCCGCCGCGGTAATACCCGCTCTCCGAGTGGTGGGGACGCTTATTGGGCCTAAAGCATCCGTAGCCGGCTGGACAAGTCCCCTGTTAAATCCAGCGATTTAATCGTTGGACTGCGGGGGATACTGTCCGGCTAGGGGGCGGGAGAGGCCGACGGTATTTCCGGGGTAGGGGTGAAATCCTATAATCCCGGGAGGACCACCAGTGGCGAAGGCTGTCGGCTAGAACGCGCCCGACGGTGAGGGATGAAAGCTGGGGGAGCGAACCGGATTAGATACCCGGGTAGTCCCAGCCGTAAACGATGCAGGCTAGGTGTTCGGGTGGCTACGTGCCACTCGAGTGCCGCAGGGAAGCTGTTAAGCCTGCCGCCTGGGGAGTACGGCCGCAAGGCTGAAACTTAAAGGAATTGGCGGGGGAGCACCACAAGGGGTGAAGCTTGCGGTTTAATTGGAGTCAACGCCGGAAATCTCACCGGGGGAGACAGCAGGATGAAAGCCAGATTAAAGGTCTTGCTAGACGAGCTGAGAGGAGGTGCATGGCCGTCGCCAGTTCGTGCCGTGAGGTGTCCTGTTAAGTCAAGGCAACGATCGAGACTCGCATCCTCTGTTGCTACTACCCTTGCGCCAAGGCACACTGGGGGAGACCGCCGCTCGATAAGGCGGAAGGAAGGAGCGGCCCACGGCAGTCAGTATGCCCCGAATTCCCTCGGCCACACGCAAGCTGCAATG',
            '156a'),
        simple_rec(
            'GGGGATCGGGGCATACTGACCTGCCGTGGCCCGCTCCTTCCTCCGCCTTATCGGCGGCGGTCTCCCCAGTGTGCCTGGCGCAAGGGCAGTAACAACAGGGGATGGGGGTCTCGATCGGTGGCTGGCTTAACAGGAAACCTCACGGGACGAACTGGCGAACGGCATGGACCTTCTCTCAACTTGGCTAAGAAGAACTTTAATCTGGCTTTCATTCTGGTGGCTTCCCCGGTGAGAATTCCGGCGGTGACTCCCAATAAAACGCAAGCTTCACCCCTTGGGGTGGTTCCCCGGCCATTTCTTTAAGGTTCAAGCTTTGCGGCGGTATTCCCAAGCGGCAAGGTTAACAGCTTCCCTGCCGCACTCGAGTGGCACGTAACCACCCGAACAACTAACCTGCATCCGTTACCGGTTGGACTAACCCGGTATCTAATCCGGGTCGCTCCCCCAGCCTTCATTCCTTCACCGTCCGGCGCGGTTCTAAGCGACCGGCTTTCGCACTTGTGGTTCCTCCCGGGGATTATAAGAATTCACCCCTACCCCGGAAATTACGGTCCGGCTCCTCCGGCCCCTAACCCGACACGTAATCCCCCGCCAGTTCAACCGATTAAATCCGCTTGAATTTAACAAGGGGGACCTTGTCCAGCCGGCCTACGGATGCTTTAAGGCCCAATAAGCCGTCCCCACCACTCCGAGAGCGGGTAATAACCGCGGCCGGCCTGACAACCGACCTGGCCTCTCCTAAATCCCCCAGCTGTTCACACTTGGGAAAGGGCATTCCTCAGCGAACGGCACTTCGGGATGAACCCGTCACGCTTTCGCGTAATTGCGGGAAGGTTTCGCGAACTGCTGCGCCCCGTAAAGGCCTGGGTCCTTGTGTCTCAAATTGCCCCATCTCCGGGCTATACGCTCTCCACGGGCCCGTACC',
            '157a')
    ]
    #prepare filter
    # NOTE(review): here the BlastFilter lambdas take two arguments (a, r),
    # unlike the other _main variant — confirm which arity is current.
    filt = BlastFilter(lambda a, r: a.hsps[0].align_length > 1100)
    filt.AND = BlastFilter(
        lambda a, r: all(hsp.score > 500 for hsp in a.hsps))
    filt.AND.AND = BlastFilter(lambda a, r: all(
        hsp.identities / float(hsp.align_length) > 0.8 for hsp in a.hsps))
    #make ring-blast
    blast = BlastCLI(self.abort_event)
    orig_seqs = blast.ring_blast(query, suns_db, 100, filt, 3)
    if not orig_seqs:
        print 'No blast results.'
        return 1
    nseqs = len(orig_seqs)
    print 'RingBlast to:\n%s\nreturned %d sequences.\n' % (suns_db, nseqs)
    #save an initial alignment
    self.fix_ids(orig_seqs)
    alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.aln.fasta'
    with user_message('Aligning retrieved sequences...', '\n'):
        if not AlignmentUtils.align(orig_seqs + [query] + additions,
                                    outfile=alifile):
            return 3
    #search for additional homologs (depth 0: only the core hits of silva_db)
    add_seqs = blast.ring_blast(orig_seqs, silva_db, 100, filt, 0)
    if add_seqs:
        self.fix_ids(add_seqs)
        print 'RingBlast to:\n%s\nreturned %d additional sequences.\n' % (
            silva_db, len(add_seqs))
        #build an alignment
        seqs = orig_seqs + add_seqs + [query] + additions
        alifile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.fasta'
        with user_message('Aligning retrieved sequences...', '\n'):
            if not AlignmentUtils.align(seqs, outfile=alifile): return 3
        #build a tree
        treefile = '/home/allis/Documents/INMI/SunS-metagenome/Bathy/BA2_SunS_16S.big.aln.tre'
        if not PhyloUtils.build_fast_tree(alifile, treefile): return 4
        #annotate the tree
        #disabled: iPCR report parsing that collected IDs of hits with a
        #product length of 920+-60 bp (used by the commented mark_leafs=sids)
        if False:
            with open(
                '/home/allis/Documents/INMI/16S/SSBaF4-SSBaR4-1_243072232-iPCR-report.txt'
            ) as inp:  # SSBaF4-SSBaR4_65397396-iPCR-report.txt
                sids = set()
                # NOTE(review): r'\\s*' matches a literal backslash followed
                # by 's*' — probably r'\s*' was intended; confirm.
                len_re = re.compile(r'(\s|^)(\d+)(\sbp|\\s*:)?', re.MULTILINE)
                entry = False
                cur_sid = None
                cur_len = -1
                for l in inp:
                    # NOTE(review): file lines keep their trailing '\n', so
                    # this equality can never be True — probably startswith.
                    if l == '========= histograms and electrophorograms of PCR products of each hit =========': break
                    if l.startswith('---'):
                        entry = False
                        if cur_sid and cur_len > 0 and abs(cur_len - 920) < 60:
                            sids.add(cur_sid)
                        cur_sid = None
                        cur_len = -1
                        continue
                    if entry or '#' in l:
                        entry = True
                        plen = len_re.search(l)
                        if plen: cur_len = int(plen.group(2))
                        sid = BlastID.extract(l)[0]
                        if sid: cur_sid = sid
        organisms = Organisms.from_records(seqs)
        if PhyloUtils.annotate_tree(
                treefile, organisms,
                reroot_at='Thermococcus_chitonophagus',
                # beautify_leafs=True,
                # collapse_taxa=['miscellaneous crenarchaeotic group', 'thaumarchaeota'],
                # collapse_last=True,
                # collapse_hard=True,
                # mark_leafs=sids,
                mark_leafs=[r.id for r in orig_seqs + [query] + additions],
                lineage_colors={
                    'miscellaneous crenarchaeotic group': (0, 0, 255),
                    'thaumarchaeta': (255, 0, 0)
                },
                top_lineage=Lineage('archaea')):
            return 0
    return 2
def ring_blast(self, query, db='nr', evalue=0.001, blast_filter=None, depth=1, command='blastn', **kwargs):
    '''Perform a blast search with the given query to obtain the core set of hits.
    Make another search with each hit as a query. If results of the second search
    contain new hits, check if these are reciprocal by yet another search with them
    and checking that results contain hits from the core set; if they are,
    add them to the final set.

    @param query: a SeqRecord or a list of SeqRecords to start from
    @param db: blast database name/path for the ring searches
    @param evalue: E-value cutoff for the ring searches
    @param blast_filter: optional callable applied in place to each blast record;
        if it empties the record, the record is discarded
    @param depth: number of rings to expand; depth <= 0 returns only the core set
    @param command: blast executable to run (e.g. 'blastn', 'blastp')
    @param kwargs: extra keyword options forwarded to blast_seq
    @return: list of unique SeqRecords collected over all rings, or None on failure
    '''
    if isinstance(query, SeqRecord): query = [query]
    def blast_filter_fetch(seqs):
        #blast each sequence, filter the results, fetch matching alignments
        #into a shelf file (one shelf per input sequence)
        @MultiprocessingBase.data_mapper
        @shelf_result
        def worker(s):
            r = self.blast_seq(s, db, evalue, command, **kwargs)
            if r and blast_filter: blast_filter(r)
            if r: return self.fetch_results(r, db, what='alignment')
            return None
        results = []
        total = len(seqs)
        prg = ProgressCounter('Performing blast search for %d sequences:' % total, total)
        @MultiprocessingBase.results_assembler
        def assembler(i, res):
            if res: results.append(res)
            prg.count()
        with prg:
            if not self.parallelize2(1, worker, assembler, seqs): return None
        return results
    with user_message('RingBlast: building a core set of sequences.', '\n'):
        core_seqs = blast_filter_fetch(query)
        if not core_seqs: return None
        core_seqs = self.unique_seqs(chain.from_iterable(from_shelf(r) for r in core_seqs))
    #extended_set maps base sequence IDs to the records collected so far
    extended_set = dict((self.base_sid(s), s) for s in core_seqs)
    if depth <= 0: return core_seqs
    core_db = self.format_tmp_db(core_seqs, command.endswith('n'))
    def check_sequences(seqs, next_to_process):
        #reciprocity check: a candidate is accepted only if it hits the core DB
        total = len(seqs)
        prg = ProgressCounter('RingBlast: checking %d new sequences:' % total, total)
        @MultiprocessingBase.data_mapper
        def worker(seq):
            res = self.blast_seq(seq, core_db, 100, command)
            if res and blast_filter: blast_filter(res)
            return bool(res), seq
        @MultiprocessingBase.results_assembler
        def assembler(i, res):
            prg.count()
            if not res[0]: return
            seq = res[1]
            extended_set[self.base_sid(seq)] = seq
            next_to_process.append(seq)
        with prg:
            return self.parallelize2(1, worker, assembler, seqs)
    def process_sequences(seqs, _depth):
        #one ring: blast seqs, merge known hits, reciprocity-check the new ones
        if _depth == 0: return
        with user_message('RingBlast: processing %d sequences of the %d ring.' % (len(seqs), depth-_depth+1), '\n'):
            next_ring = blast_filter_fetch(seqs)
            if not next_ring: return
            to_check = []
            next_to_process = []
            for n in next_ring:
                next_seqs = from_shelf(n)
                if not next_seqs: continue
                for ns in next_seqs:
                    sid = self.base_sid(ns)
                    if sid in extended_set:
                        #FIXME: need to merge sequences properly, instead of replacing
                        if len(extended_set[sid]) < len(ns):
                            extended_set[sid] = ns
                    else: to_check.append(ns)
            if not to_check or not check_sequences(to_check, next_to_process): return
            if next_to_process: process_sequences(next_to_process, _depth-1)
    process_sequences(core_seqs, depth)
    return extended_set.values()
def g2g_blastp(self, reference, subjects, table='Standard', evalue=0.001, max_rlen=0, features_of_interest=None): ''' Perform blastp of each coding sequence of the reference against each subject, which is first translated gene-by-gene. Parameters @param reference: SeqRecord object of the reference genome @param subjects: a list of SeqRecord objects of subject genomes @param table: translation table number (see NCBI site for description) @param evalue: filter out blastp results with E-value grater than this @param max_rlen: filter out blastp results which are shorter than this fraction of target gene length @param features_of_interest: list of dictionaries of the form {qualifier_name : qualifier_value} to mark features denoting known clusters that should be analyzed one against the other @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) where CDS is a gene/CDS feature from the reference.features list and blast_resultN is a list of results for the N-th subject, containing following information: (hit_feature, align_length, percent_identity, evalue) where hit_feature is a SeqFeature object of the gene/CDS of the subject where top blast hit is located, align_length is the length of the hit, percent_identity is the ratio of number of identities and align_length [0; 1] and evalue is the E-value of the top hit. 
''' if not reference or not subjects: print 'No reference or subject sequences provided' return None #get list of features to query with user_message('Searching for gene/CDS features in provided sequences...'): all_records = [reference]+subjects num_records = len(all_records) features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), range(num_records), all_records) if self.aborted(): print '\nAborted' return None if not features or not features[0]: print ('\nReference sequence does not contain annotated _genes:\n%s %s' % (reference.id, reference.description)) return None if len([f for f in features if f]) < 2: print '\nSubject sequences do not contain annotated _genes' return None #add gene ids for ri, genes in enumerate(features): if not genes: continue r = all_records[ri] for gene_id, gi in enumerate(genes): r.features[gi].qualifiers['feature_id'] = gi r.features[gi].qualifiers['gene_id'] = gene_id #get features of interest if requested fois = None if features_of_interest: with user_message('Searching for features of interest...'): fois = [] for foi in features_of_interest: foi = self._get_fois(all_records, foi) if foi and foi[0]: fois.append(foi) if self.aborted(): print '\nAborted' return None #translate features to proteins with Progress('Translating _genes found in the reference and subjects...', num_records) as prg: translator = Translator(self._abort_event) translations = [None]*num_records foi_translations = [[None]*num_records for _f in fois] for i, (f, rec) in enumerate(zip(features, all_records)): if not f: prg.step(i) continue translation = translator.translate_features(rec, f, table) if not translation: return None if i > 0: translations[i] = cat_records(translation) if fois: for ifoi, foi in enumerate(fois): foi_loc = [0, 0] for foi_var in foi[i]: if not foi_var: continue for gid in foi_var: l = translations[i].features[gid].location foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1 
foi_loc[1] = max(int(l.end), foi_loc[1]) if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc else: translations[i] = translation if fois: for ifoi, foi in enumerate(fois): foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]] prg.step(i) #blast features against subjects with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'): stranslations = translations[1:] blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, command='blastp', task='blastp') if self.aborted(): print '\nAborted' return None if not blast_results: print '\nBlast have not returned any results.' return None if fois: #redo blast for fois and replace the results with user_message('Rerunning blast for FOIs...', '\n'): for ifoi, foi in enumerate(foi_translations): sfoi_locs = foi[1:] for i, foi_var in enumerate(foi[0]): foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, command='blastp', task='blastp') if self.aborted(): print '\nAborted' return None if not foi_blast: continue for gi, gid in enumerate(fois[ifoi][0][i]): if foi_blast[gi]: blast_results[gid] = foi_blast[gi] #process blast results pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations)))) with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg: work = self.Work() work.start_work(self._find_features_by_hsps, pairs, None, stranslations, blast_results) @MultiprocessingBase.results_assembler def assembler(index, result, blast_results, pairs, prg): qs = pairs[index] blast_results[qs[0]][qs[1]] = result prg.count() work.assemble(assembler, blast_results, pairs, prg) if not work.wait(): return None return zip((reference.features[f] for f in features[0]), blast_results)