def _blast_feature(self, f, c1, c2):
    '''
    Blast the translation of feature f of record c1 against the six-frame
    translation of record c2 and convert the accepted HSPs into cross-links.

    @param f: feature of c1; it is extracted from c1 and translated
    with table 11
    @param c1: record the feature belongs to
    @param c2: record that is translated in six frames (table 11)
    @return: list of (feature_on_c1, feature_on_c2, color) tuples, one per
    HSP that passes the self.min_length and self.min_identity filters;
    [(None, None, None)] if there are no frames or no HSPs at all.
    '''
    translator = Translator(self._abort_event)
    protein = translator.translate(f.extract(c1), 11)
    frames = translator.translate_six_frames_single(c2, 11)
    if not frames:
        return [(None, None, None)]
    #blast the protein against every frame and pool the records
    blast_records = []
    for frame in frames:
        found = BlastCLI.s2s_blast(protein, frame, self.evalue,
                                   command='blastp', task='blastp')
        if found:
            blast_records.extend(found)
    hsps = BlastCLI.all_hsps(blast_records)
    if not hsps:
        return [(None, None, None)]
    feature_name = self._feature_name(f, default='CDS')
    protein_len = len(protein)
    shortest = protein_len * self.min_length
    low_color = colors.Color(0, 0, 1, 0.2)
    high_color = colors.Color(0, 1, 0, 0.2)
    links = []
    for hsp in hsps:
        #drop alignments that are too short or too dissimilar
        if hsp.align_length < shortest:
            continue
        identity = hsp.identities / float(hsp.align_length)
        if identity < self.min_identity:
            continue
        print('%s %s: %5.1f%% (%5.1f%%)' % (c1.description, feature_name,
                                            identity * 100,
                                            float(hsp.identities) / protein_len * 100))
        color = colors.linearlyInterpolatedColor(low_color, high_color,
                                                 0.2, 1, identity)
        #protein positions are scaled by 3 to get nucleotide offsets
        span = hsp.align_length * 3
        query_offset = (hsp.query_start - 1) * 3
        subject_offset = (hsp.sbjct_start - 1) * 3
        query_location = FeatureLocation(f.location.start + query_offset,
                                         f.location.start + query_offset + span,
                                         strand=hsp.strand[0])
        subject_location = FeatureLocation(subject_offset,
                                           subject_offset + span,
                                           strand=hsp.strand[1])
        links.append((SeqFeature(query_location),
                      SeqFeature(subject_location),
                      color))
    return links
def hmmsearch_genes(self, hmms, genome, table='Standard', decorate=False, **kwargs):
    '''
    Search the translated genes of a genome with one or several HMMs.

    @param hmms: a single HMM (str) or an iterable of them
    @param genome: record whose gene features are tagged with 'feature_id'
    and 'gene_id' qualifiers, translated and searched
    @param table: translation table passed to the translator
    @param decorate: if True, a feature is appended to genome.features for
    every HSP of every hit
    @param kwargs: passed through to self.hmmsearch_recs
    @return: dict {feature_id: (hit, translated_record)} merged over all
    HMMs, or None if there are no genes, no translation, no hits, or if any
    single search returns a falsy result (in which case the results of the
    previous HMMs are not returned).
    '''
    #find the genes and tag them with their indexes
    gene_indexes = get_indexes_of_all_genes(genome)
    if not gene_indexes:
        return None
    for gene_id, feature_id in enumerate(gene_indexes):
        qualifiers = genome.features[feature_id].qualifiers
        qualifiers['feature_id'] = feature_id
        qualifiers['gene_id'] = gene_id
    #translate the genes
    with user_message('Translating _genes/CDS of %s' % genome.description, '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate_features(genome, gene_indexes, table)
    if not translation:
        return None
    if isinstance(hmms, str):
        hmms = [hmms]
    results = dict()
    for hmm in hmms:
        with user_message('Performing hmm search.'):
            hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
        if not hmm_results:
            return None
        with user_message('Parsing search results...'):
            #index the hits by the id of the sequence they were found in
            hits = dict()
            for query_result in hmm_results:
                for hit in query_result.iterhits():
                    hits[hit.id] = hit
            #map feature indexes to (hit, translated record)
            hit_features = dict()
            for protein in translation:
                if protein.id not in hits:
                    continue
                feature_id = protein.features[0].qualifiers.get('feature_id')
                if feature_id is not None:
                    hit_features[feature_id] = hits[protein.id], protein
            if hit_features:
                results.update(hit_features)
        if not decorate:
            continue
        #decorate genome
        with user_message('Adding results as annotations...'):
            hmm_name = os.path.basename(hmm)
            for feature_id in hit_features:
                feature = genome.features[feature_id]
                for hsp in hit_features[feature_id][0]:
                    #protein coordinates are scaled by 3 and counted from
                    #the start of the gene on strand 1, from its end otherwise
                    if feature.strand == 1:
                        left = feature.location.start + hsp.hit_start * 3
                        right = feature.location.start + hsp.hit_end * 3
                    else:
                        left = feature.location.end - hsp.hit_end * 3
                        right = feature.location.end - hsp.hit_start * 3
                    hmm_location = FeatureLocation(left, right, feature.strand)
                    genome.features.append(
                        self.hsp2feature(hmm_name, 'HMM_annotations',
                                         hmm_location, hsp))
    return results if results else None
def hmmsearch_genome(self, hmm, genome, table='Standard', decorate=False, **kwargs):
    '''
    Search the translated genes of a genome with a single HMM.

    @param hmm: the HMM passed to self.hmmsearch_recs; its basename is used
    as the 'hmm_model' qualifier of added annotations
    @param genome: record whose gene features are tagged with 'feature_id'
    and 'gene_id' qualifiers, translated and searched
    @param table: translation table passed to the translator
    @param decorate: if True, a 'misc_feature' is appended to
    genome.features for every HSP of every hit
    @param kwargs: accepted but not used by this method
    @return: dict {feature_id: (hit, translated_record)} (possibly empty),
    or None if there are no genes, no translation or no search results.
    '''
    #find the genes and tag them with their indexes
    gene_indexes = get_indexes_of_genes(genome)
    if not gene_indexes:
        return None
    for gene_id, feature_id in enumerate(gene_indexes):
        qualifiers = genome.features[feature_id].qualifiers
        qualifiers['feature_id'] = feature_id
        qualifiers['gene_id'] = gene_id
    #translate the genes
    with user_message('Translating genes/CDS of %s' % genome.description, '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate(genome, gene_indexes, table)
    if not translation:
        return None
    with user_message('Performing hmm search.'):
        results = self.hmmsearch_recs(hmm, translation)
    if not results:
        return None
    with user_message('Parsing search results...'):
        #index the hits by the id of the sequence they were found in
        hits = dict()
        for query_result in results:
            for hit in query_result.iterhits():
                hits[hit.id] = hit
        #map feature indexes to (hit, translated record)
        hit_features = dict()
        for protein in translation:
            if protein.id not in hits:
                continue
            feature_id = protein.features[0].qualifiers.get('feature_id')
            if feature_id is not None:
                hit_features[feature_id] = hits[protein.id], protein
    #decorate genome
    if decorate:
        #qualifier name -> HSP attribute it is copied from, in storage order
        hsp_qualifiers = (('bitscore', 'bitscore'),
                          ('psi_evalue', 'psi_evalue'),
                          ('evalue_cond', 'evalue_cond'),
                          ('acc_average', 'acc_avg'),
                          ('bias', 'bias'))
        with user_message('Adding results as annotations...'):
            hmm_name = os.path.basename(hmm)
            for feature_id in hit_features:
                feature = genome.features[feature_id]
                for hsp in hit_features[feature_id][0]:
                    #protein coordinates are scaled by 3 and counted from
                    #the start of the gene on strand 1, from its end otherwise
                    if feature.strand == 1:
                        left = feature.location.start + hsp.hit_start * 3
                        right = feature.location.start + hsp.hit_end * 3
                    else:
                        left = feature.location.end - hsp.hit_end * 3
                        right = feature.location.end - hsp.hit_start * 3
                    hmm_feature = SeqFeature(
                        FeatureLocation(left, right, feature.strand),
                        type='misc_feature')
                    hmm_feature.qualifiers['hmm_model'] = hmm_name
                    for qualifier, attribute in hsp_qualifiers:
                        hmm_feature.qualifiers[qualifier] = getattr(hsp, attribute)
                    genome.features.append(hmm_feature)
        # NOTE(review): the nesting level of this print could not be
        # recovered from the collapsed source; it is assumed to belong to
        # the decoration branch -- confirm against the original file.
        print('Done.\n')
    return hit_features
def _blast_feature(self, f, c1, c2, features1, features2, evalue, max_rlen):
    '''
    Blast the translation of feature f of record c1 against the six-frame
    translation of record c2 and convert the HSPs into cross-links.

    @param f: feature of c1; it is extracted from c1 and translated
    with table 11
    @param c1: record the feature belongs to
    @param c2: record that is translated in six frames (table 11)
    @param features1: not used by this method
    @param features2: not used by this method
    @param evalue: E-value passed to BlastCLI.s2s_blast
    @param max_rlen: passed to BlastCLI.all_hsps as its second argument
    @return: list of (feature_on_c1, feature_on_c2, color) tuples, one per
    HSP; [(None, None, None)] if there are no frames or no HSPs.
    '''
    translator = Translator(self._abort_event)
    protein = translator.translate(f.extract(c1), 11)
    frames = translator.translate_six_frames_single(c2, 11)
    if not frames:
        return [(None, None, None)]
    #blast the protein against every frame and pool the records
    blast_records = []
    for frame in frames:
        found = BlastCLI.s2s_blast(protein, frame, evalue,
                                   command='blastp', task='blastp')
        if found:
            blast_records.extend(found)
    hsps = BlastCLI.all_hsps(blast_records, max_rlen)
    if not hsps:
        return [(None, None, None)]
    record_name = pretty_rec_name(c1)
    feature_name = f.qualifiers['locus_tag'][0] if 'locus_tag' in f.qualifiers else 'CDS'
    protein_len = len(protein)
    low_color = colors.Color(0, 0, 1, 0.2)
    high_color = colors.Color(0, 1, 0, 0.2)
    links = []
    for hsp in hsps:
        identity = float(hsp.identities) / hsp.align_length
        print('%s %s: %5.1f%% (%5.1f%%)' % (record_name, feature_name,
                                            identity * 100,
                                            float(hsp.identities) / protein_len * 100))
        color = colors.linearlyInterpolatedColor(low_color, high_color,
                                                 0.2, 1, identity)
        #protein positions are scaled by 3 to get nucleotide offsets
        span = hsp.align_length * 3
        query_offset = (hsp.query_start - 1) * 3
        subject_offset = (hsp.sbjct_start - 1) * 3
        query_location = FeatureLocation(f.location.start + query_offset,
                                         f.location.start + query_offset + span,
                                         strand=hsp.strand[0])
        subject_location = FeatureLocation(subject_offset,
                                           subject_offset + span,
                                           strand=hsp.strand[1])
        links.append((SeqFeature(query_location),
                      SeqFeature(subject_location),
                      color))
    return links
def hmmsearch_genome(self, hmms, genome, table='Standard', decorate=False, **kwargs):
    '''
    Search the six-frame translation of a whole genome with one or
    several HMMs.

    @param hmms: a single HMM (str) or an iterable of them
    @param genome: record that is translated in six reading frames
    @param table: translation table passed to the translator
    @param decorate: if True, a feature is appended to genome.features for
    every HSP of every hit
    @param kwargs: passed through to self.hmmsearch_recs
    @return: concatenated list of the per-HMM search results that contain
    at least one hit, or None if there is no translation or no hits.
    '''
    #translate the genome
    with user_message('Translating whole genome in 6 reading frames', '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate_six_frames(genome, table)
    if not translation:
        return None
    if isinstance(hmms, str):
        hmms = [hmms]
    #lookup of translated frames by id; built once, on first use, and kept
    #under its own name so that `translation` stays the sequence of
    #records that every subsequent search needs
    frames_by_id = None
    glen = len(genome)
    results = []
    for hmm in hmms:
        with user_message('Performing hmm search.'):
            hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
        #skip HMMs whose search failed (falsy result) or produced no hits
        if not hmm_results or not any(len(r) for r in hmm_results):
            continue
        results += hmm_results
        if not decorate:
            continue
        #decorate genome
        if frames_by_id is None:
            frames_by_id = dict((t.id, t) for t in translation)
        with user_message('Adding results as annotations...'):
            hmm_name = os.path.basename(hmm)
            for frame in hmm_results:
                for hit in frame:
                    frec = frames_by_id[hit.id]
                    start = frec.annotations['start']
                    strand = frec.annotations['strand']
                    for hsp in hit:
                        #protein coordinates are scaled by 3; on strand 1
                        #they are counted from the frame start, otherwise
                        #they are mirrored from the end of the genome
                        if strand == 1:
                            hmm_location = FeatureLocation(
                                start + hsp.hit_start * 3,
                                start + hsp.hit_end * 3,
                                strand)
                        else:
                            hmm_location = FeatureLocation(
                                glen - start - hsp.hit_end * 3,
                                glen - start - hsp.hit_start * 3,
                                strand)
                        hmm_feature = self.hsp2feature(
                            hmm_name, 'HMM_annotations', hmm_location, hsp)
                        genome.features.append(hmm_feature)
    return results if results else None
def blastp_annotate(self, tag_sequences, subject_record, min_identity, evalue=0.001, table=11, **kwargs):
    '''
    Annotate a record with blastp hits of tag protein sequences against
    its six-frame translation.

    @param tag_sequences: sequence records that are blasted against the
    translated subject
    @param subject_record: record that is translated in six frames and
    receives the annotations (its features list is appended to)
    @param min_identity: HSPs with identities/align_length below this
    value are skipped
    @param evalue: E-value passed to self.s2s_blast_batch
    @param table: translation table passed to the translator
    @param kwargs: passed through to self.s2s_blast_batch
    @return: True if at least one feature was added, False otherwise
    (including the cases of empty translation and of blast returning None).
    '''
    #translate subject in six frames
    with user_message('Translating whole genome in 6 reading frames', '\n'):
        translator = Translator(self._abort_event)
        translation = translator.translate_six_frames(subject_record, table)
    if not translation:
        return False
    results = self.s2s_blast_batch(tag_sequences, translation, evalue=evalue,
                                   command='blastp', **kwargs)
    if results is None:
        return False
    annotated = False
    with user_message('Adding results as annotations...'):
        subj_len = len(subject_record)
        for i, tag in enumerate(tag_sequences):
            if not results[i]:
                continue
            tag_name = pretty_rec_name(tag)
            if tag_name != tag.id:
                tag_name += ' (%s)' % tag.id
            for frame, record in enumerate(results[i]):
                if not record:
                    continue
                frec = translation[frame]
                start = frec.annotations['start']
                strand = frec.annotations['strand']
                for hit in record:
                    for ali in hit.alignments:
                        for hsp in ali.hsps:
                            if hsp.identities / float(hsp.align_length) < min_identity:
                                continue
                            #nucleotide offsets of the aligned part of the
                            #frame: sbjct_start is converted to a zero-based
                            #boundary in BOTH branches, so that forward and
                            #reverse features cover the same number of codons
                            first = (hsp.sbjct_start - 1) * 3
                            last = hsp.sbjct_end * 3
                            if strand == 1:
                                location = FeatureLocation(start + first,
                                                           start + last,
                                                           strand)
                            else:
                                #mirrored from the end of the subject
                                location = FeatureLocation(subj_len - start - last,
                                                           subj_len - start - first,
                                                           strand)
                            feature = self.hsp2feature(tag_name, 'blastp_annotations',
                                                       location, hsp)
                            self.add_program(feature, 'blastp')
                            subject_record.features.append(feature)
                            annotated = True
    return annotated
def g2g_blastp(self, reference, subjects, table='Standard', evalue=0.001, max_rlen=0, features_of_interest=None):
    '''
    Perform blastp of each coding sequence of the reference against each
    subject, which is first translated gene-by-gene.

    Parameters
    @param reference: SeqRecord object of the reference genome
    @param subjects: a list of SeqRecord objects of subject genomes
    @param table: translation table number (see NCBI site for description)
    @param evalue: filter out blastp results with E-value greater than this
    @param max_rlen: filter out blastp results which are shorter than this
    fraction of target gene length (NOTE: this argument is not referenced
    in the body of this method)
    @param features_of_interest: list of dictionaries of the form
    {qualifier_name : qualifier_value} to mark features denoting known
    clusters that should be analyzed one against the other
    @return: list of pairs (CDS, (blast_result1, blast_result2, ...))
    where CDS is a gene/CDS feature from the reference.features list
    and blast_resultN is a list of results for the N-th subject,
    containing following information:
    (hit_feature, align_length, percent_identity, evalue)
    where hit_feature is a SeqFeature object of the gene/CDS of the subject
    where top blast hit is located, align_length is the length of the hit,
    percent_identity is the ratio of number of identities and align_length
    [0; 1] and evalue is the E-value of the top hit.
    None is returned if the input is empty, if no genes are found, if
    translation or blast yields nothing, or if the work is aborted.
    '''
    if not reference or not subjects:
        print('No reference or subject sequences provided')
        return None
    #get list of features to query
    with user_message('Searching for gene/CDS features in provided sequences...'):
        #list() lets subjects be any sequence, not only a list
        all_records = [reference] + list(subjects)
        num_records = len(all_records)
        features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]),
                                         range(num_records),
                                         all_records)
        if self.aborted():
            print('\nAborted')
            return None
        if not features or not features[0]:
            print('\nReference sequence does not contain annotated genes:\n%s %s'
                  % (reference.id, reference.description))
            return None
        if sum(1 for f in features if f) < 2:
            print('\nSubject sequences do not contain annotated genes')
            return None
    #add gene ids
    for ri, genes in enumerate(features):
        if not genes:
            continue
        r = all_records[ri]
        for gene_id, gi in enumerate(genes):
            r.features[gi].qualifiers['feature_id'] = gi
            r.features[gi].qualifiers['gene_id'] = gene_id
    #get features of interest if requested;
    #an empty list (not None) keeps the iterations below valid when
    #no features of interest were requested
    fois = []
    if features_of_interest:
        with user_message('Searching for features of interest...'):
            for foi in features_of_interest:
                foi = self._get_fois(all_records, foi)
                if foi and foi[0]:
                    fois.append(foi)
            if self.aborted():
                print('\nAborted')
                return None
    #translate features to proteins
    with Progress('Translating genes found in the reference and subjects...', num_records) as prg:
        translator = Translator(self._abort_event)
        translations = [None]*num_records
        foi_translations = [[None]*num_records for _f in fois]
        for i, (f, rec) in enumerate(zip(features, all_records)):
            if not f:
                prg.step(i)
                continue
            translation = translator.translate(rec, f, table)
            if not translation:
                return None
            if i > 0:
                #subjects: the translated genes are concatenated and each
                #FOI is reduced to the [start+1, end] span of its genes
                translations[i] = cat_records(translation)
                if fois:
                    for ifoi, foi in enumerate(fois):
                        foi_loc = [0, 0]
                        for foi_var in foi[i]:
                            if not foi_var:
                                continue
                            for gid in foi_var:
                                l = translations[i].features[gid].location
                                foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                foi_loc[1] = max(int(l.end), foi_loc[1])
                        if foi_loc[0] > 0:
                            foi_translations[ifoi][i] = foi_loc
            else:
                #reference: the translated genes are kept separately and
                #each FOI variant is a list of its translated genes
                translations[i] = translation
                if fois:
                    for ifoi, foi in enumerate(fois):
                        foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var]
                                                     for foi_var in foi[i]]
            prg.step(i)
    #blast features against subjects
    with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
        stranslations = translations[1:]
        blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue,
                                              command='blastp', task='blastp')
        if self.aborted():
            print('\nAborted')
            return None
        if not blast_results:
            print('\nBlast have not returned any results.')
            return None
    if fois:
        #redo blast for fois and replace the results
        with user_message('Rerunning blast for FOIs...', '\n'):
            for ifoi, foi in enumerate(foi_translations):
                sfoi_locs = foi[1:]
                for i, foi_var in enumerate(foi[0]):
                    foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue,
                                                      command='blastp', task='blastp')
                    if self.aborted():
                        print('\nAborted')
                        return None
                    if not foi_blast:
                        continue
                    for gi, gid in enumerate(fois[ifoi][0][i]):
                        if foi_blast[gi]:
                            blast_results[gid] = foi_blast[gi]
    #process blast results
    pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
    with ProgressCounter('Searching for genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
        work = self.Work()
        work.start_work(self._find_features_by_hsps, pairs, None, stranslations, blast_results)
        @MultiprocessingBase.results_assembler
        def assembler(index, result, blast_results, pairs, prg):
            #store the result in place of the raw blast result of the
            #(reference gene, subject) pair it was computed for
            qs = pairs[index]
            blast_results[qs[0]][qs[1]] = result
            prg.count()
        work.assemble(assembler, blast_results, pairs, prg)
        if not work.wait():
            return None
    return zip((reference.features[f] for f in features[0]), blast_results)
def g2g_blastp(self, reference, subjects, table='Standard', evalue=0.001, max_rlen=0, features_of_interest=None):
    '''
    Perform blastp of each coding sequence of the reference against each
    subject, which is first translated gene-by-gene.

    Parameters
    @param reference: SeqRecord object of the reference genome
    @param subjects: a list of SeqRecord objects of subject genomes
    @param table: translation table number (see NCBI site for description)
    @param evalue: filter out blastp results with E-value greater than this
    @param max_rlen: filter out blastp results which are shorter than this
    fraction of target gene length (NOTE: this argument is not referenced
    in the body of this method)
    @param features_of_interest: list of dictionaries of the form
    {qualifier_name : qualifier_value} to mark features denoting known
    clusters that should be analyzed one against the other
    @return: list of pairs (CDS, (blast_result1, blast_result2, ...))
    where CDS is a gene/CDS feature from the reference.features list
    and blast_resultN is a list of results for the N-th subject,
    containing following information:
    (hit_feature, align_length, percent_identity, evalue)
    where hit_feature is a SeqFeature object of the gene/CDS of the subject
    where top blast hit is located, align_length is the length of the hit,
    percent_identity is the ratio of number of identities and align_length
    [0; 1] and evalue is the E-value of the top hit.
    None is returned if the input is empty, if no genes are found, if
    translation or blast yields nothing, or if the work is aborted.
    '''
    if not reference or not subjects:
        print('No reference or subject sequences provided')
        return None
    #get list of features to query
    with user_message('Searching for gene/CDS features in provided sequences...'):
        #list() lets subjects be any sequence, not only a list
        all_records = [reference] + list(subjects)
        num_records = len(all_records)
        features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]),
                                         range(num_records),
                                         all_records)
        if self.aborted():
            print('\nAborted')
            return None
        if not features or not features[0]:
            print('\nReference sequence does not contain annotated _genes:\n%s %s'
                  % (reference.id, reference.description))
            return None
        if sum(1 for f in features if f) < 2:
            print('\nSubject sequences do not contain annotated _genes')
            return None
    #add gene ids
    for ri, genes in enumerate(features):
        if not genes:
            continue
        r = all_records[ri]
        for gene_id, gi in enumerate(genes):
            r.features[gi].qualifiers['feature_id'] = gi
            r.features[gi].qualifiers['gene_id'] = gene_id
    #get features of interest if requested;
    #an empty list (not None) keeps the iterations below valid when
    #no features of interest were requested
    fois = []
    if features_of_interest:
        with user_message('Searching for features of interest...'):
            for foi in features_of_interest:
                foi = self._get_fois(all_records, foi)
                if foi and foi[0]:
                    fois.append(foi)
            if self.aborted():
                print('\nAborted')
                return None
    #translate features to proteins
    with Progress('Translating _genes found in the reference and subjects...', num_records) as prg:
        translator = Translator(self._abort_event)
        translations = [None]*num_records
        foi_translations = [[None]*num_records for _f in fois]
        for i, (f, rec) in enumerate(zip(features, all_records)):
            if not f:
                prg.step(i)
                continue
            translation = translator.translate_features(rec, f, table)
            if not translation:
                return None
            if i > 0:
                #subjects: the translated genes are concatenated and each
                #FOI is reduced to the [start+1, end] span of its genes
                translations[i] = cat_records(translation)
                if fois:
                    for ifoi, foi in enumerate(fois):
                        foi_loc = [0, 0]
                        for foi_var in foi[i]:
                            if not foi_var:
                                continue
                            for gid in foi_var:
                                l = translations[i].features[gid].location
                                foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                foi_loc[1] = max(int(l.end), foi_loc[1])
                        if foi_loc[0] > 0:
                            foi_translations[ifoi][i] = foi_loc
            else:
                #reference: the translated genes are kept separately and
                #each FOI variant is a list of its translated genes
                translations[i] = translation
                if fois:
                    for ifoi, foi in enumerate(fois):
                        foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var]
                                                     for foi_var in foi[i]]
            prg.step(i)
    #blast features against subjects
    with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
        stranslations = translations[1:]
        blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue,
                                              command='blastp', task='blastp')
        if self.aborted():
            print('\nAborted')
            return None
        if not blast_results:
            print('\nBlast have not returned any results.')
            return None
    if fois:
        #redo blast for fois and replace the results
        with user_message('Rerunning blast for FOIs...', '\n'):
            for ifoi, foi in enumerate(foi_translations):
                sfoi_locs = foi[1:]
                for i, foi_var in enumerate(foi[0]):
                    foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue,
                                                      command='blastp', task='blastp')
                    if self.aborted():
                        print('\nAborted')
                        return None
                    if not foi_blast:
                        continue
                    for gi, gid in enumerate(fois[ifoi][0][i]):
                        if foi_blast[gi]:
                            blast_results[gid] = foi_blast[gi]
    #process blast results
    pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
    with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
        work = self.Work()
        work.start_work(self._find_features_by_hsps, pairs, None, stranslations, blast_results)
        @MultiprocessingBase.results_assembler
        def assembler(index, result, blast_results, pairs, prg):
            #store the result in place of the raw blast result of the
            #(reference gene, subject) pair it was computed for
            qs = pairs[index]
            blast_results[qs[0]][qs[1]] = result
            prg.count()
        work.assemble(assembler, blast_results, pairs, prg)
        if not work.wait():
            return None
    return zip((reference.features[f] for f in features[0]), blast_results)