Beispiel #1
0
 def _blast_feature(self, f, c1, c2):
     '''
     Blastp the translation of a single feature against another sequence.

     The feature f is extracted from c1 and translated, c2 is translated
     in six frames (11 is passed to the translator in both cases), and the
     protein is blasted against every frame. HSPs shorter than
     len(protein)*self.min_length or with an identity ratio below
     self.min_identity are skipped; a summary line is printed for every
     HSP that is kept.

     @param f: feature of c1 to use as the query
     @param c1: sequence record containing f
     @param c2: sequence record to search in
     @return: result of zip() over (query feature, subject feature, color)
     for every kept HSP; [(None, None, None)] if c2 yields no frames or
     blast yields no HSPs
     '''
     translator = Translator(self._abort_event)
     protein = translator.translate(f.extract(c1), 11)
     frames = translator.translate_six_frames_single(c2, 11)
     if not frames:
         return [(None, None, None)]
     #blast the protein against each frame and pool whatever comes back
     blast_records = []
     for frame in frames:
         found = BlastCLI.s2s_blast(protein, frame, self.evalue,
                                    command='blastp', task='blastp')
         if found:
             blast_records.extend(found)
     hsps = BlastCLI.all_hsps(blast_records)
     if not hsps:
         return [(None, None, None)]
     query_features = []
     subject_features = []
     hsp_colors = []
     fname = self._feature_name(f, default='CDS')
     protein_len = len(protein)
     min_len = protein_len * self.min_length
     for hsp in hsps:
         #drop alignments that are too short or too dissimilar
         if hsp.align_length < min_len:
             continue
         identity = hsp.identities / float(hsp.align_length)
         if identity < self.min_identity:
             continue
         print('%s %s: %5.1f%% (%5.1f%%)'
               % (c1.description, fname, identity * 100,
                  float(hsp.identities) / protein_len * 100))
         #the identity ratio selects a color between the two endpoints
         low = colors.Color(0, 0, 1, 0.2)
         high = colors.Color(0, 1, 0, 0.2)
         hsp_colors.append(
             colors.linearlyInterpolatedColor(low, high, 0.2, 1, identity))
         #residue positions are multiplied by 3 to get nucleotide offsets
         span = hsp.align_length * 3
         q_from = (hsp.query_start - 1) * 3
         q_to = q_from + span
         s_from = (hsp.sbjct_start - 1) * 3
         s_to = s_from + span
         query_loc = FeatureLocation(f.location.start + q_from,
                                     f.location.start + q_to,
                                     strand=hsp.strand[0])
         query_features.append(SeqFeature(query_loc))
         subject_loc = FeatureLocation(s_from, s_to, strand=hsp.strand[1])
         subject_features.append(SeqFeature(subject_loc))
     return zip(query_features, subject_features, hsp_colors)
Beispiel #2
0
 def hmmsearch_genes(self, hmms, genome, table='Standard', decorate=False,
                     **kwargs):
     '''
     Search the translated genes/CDS of a genome with one or several HMMs.

     @param hmms: a single hmm given as str, or an iterable of them; each
     one is passed to self.hmmsearch_recs and its basename names the
     annotations
     @param genome: record whose gene features are translated and
     searched; 'feature_id' and 'gene_id' qualifiers are written to
     those features
     @param table: translation table handed to the translator
     @param decorate: if true, a feature is appended to genome.features
     for every HSP of every hit
     @param kwargs: forwarded to self.hmmsearch_recs
     @return: dict {feature index: (hit, translated record)} merged over
     all hmms; None if there are no genes, the translation is empty, any
     single search returns nothing, or no hits were collected
     '''
     gene_indexes = get_indexes_of_all_genes(genome)
     if not gene_indexes:
         return None
     #tag every gene so that hits can be mapped back to genome.features
     for gene_id, feature_index in enumerate(gene_indexes):
         qualifiers = genome.features[feature_index].qualifiers
         qualifiers['feature_id'] = feature_index
         qualifiers['gene_id'] = gene_id
     with user_message('Translating _genes/CDS of %s' % genome.description,
                       '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate_features(genome, gene_indexes,
                                                     table)
     if not translation:
         return None
     if isinstance(hmms, str):
         hmms = [hmms]
     results = {}
     for hmm in hmms:
         with user_message('Performing hmm search.'):
             hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
         if not hmm_results:
             return None
         with user_message('Parsing search results...'):
             #index hits by id; a later hit with the same id wins
             hits = dict((hit.id, hit)
                         for result in hmm_results
                         for hit in result.iterhits())
             #map hits to the indexes of the features they belong to
             hit_features = {}
             for rec in translation:
                 if rec.id not in hits:
                     continue
                 fid = rec.features[0].qualifiers.get('feature_id')
                 if fid is not None:
                     hit_features[fid] = (hits[rec.id], rec)
             if hit_features:
                 results.update(hit_features)
         if not decorate:
             continue
         with user_message('Adding results as annotations...'):
             hmm_name = os.path.basename(hmm)
             for fid in hit_features:
                 feature = genome.features[fid]
                 hit = hit_features[fid][0]
                 for hsp in hit:
                     #hit coordinates are scaled by 3 (residues -> bases)
                     nt_from = hsp.hit_start * 3
                     nt_to = hsp.hit_end * 3
                     if feature.strand == 1:
                         origin = feature.location.start
                         hmm_location = FeatureLocation(origin + nt_from,
                                                        origin + nt_to,
                                                        feature.strand)
                     else:
                         #any other strand is measured back from the end
                         origin = feature.location.end
                         hmm_location = FeatureLocation(origin - nt_to,
                                                        origin - nt_from,
                                                        feature.strand)
                     genome.features.append(
                         self.hsp2feature(hmm_name, 'HMM_annotations',
                                          hmm_location, hsp))
     return results or None
Beispiel #3
0
 def hmmsearch_genome(self, hmm, genome, table='Standard', decorate=False, **kwargs):
     '''
     Search the translated genes/CDS of a genome with a single HMM.

     @param hmm: hmm passed to self.hmmsearch_recs; its basename is stored
     in the 'hmm_model' qualifier of added annotations
     @param genome: record whose gene features are translated and
     searched; 'feature_id' and 'gene_id' qualifiers are written to
     those features
     @param table: translation table handed to the translator
     @param decorate: if true, a 'misc_feature' is appended to
     genome.features for every HSP of every hit
     @param kwargs: accepted but not used in this method body
     NOTE(review): other variants forward kwargs to hmmsearch_recs --
     confirm whether that was intended here as well
     @return: dict {feature index: (hit, translated record)} (possibly
     empty); None if there are no genes, the translation is empty or the
     search returns nothing
     '''
     gene_indexes = get_indexes_of_genes(genome)
     if not gene_indexes:
         return None
     #tag every gene so that hits can be mapped back to genome.features
     for gene_id, feature_index in enumerate(gene_indexes):
         qualifiers = genome.features[feature_index].qualifiers
         qualifiers['feature_id'] = feature_index
         qualifiers['gene_id'] = gene_id
     with user_message('Translating genes/CDS of %s' % genome.description, '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate(genome, gene_indexes, table)
     if not translation:
         return None
     with user_message('Performing hmm search.'):
         search_results = self.hmmsearch_recs(hmm, translation)
     if not search_results:
         return None
     with user_message('Parsing search results...'):
         #index hits by id; a later hit with the same id wins
         hits = dict((hit.id, hit)
                     for result in search_results
                     for hit in result.iterhits())
         #map hits to the indexes of the features they belong to
         hit_features = {}
         for rec in translation:
             if rec.id not in hits:
                 continue
             fid = rec.features[0].qualifiers.get('feature_id')
             if fid is not None:
                 hit_features[fid] = (hits[rec.id], rec)
     if decorate:
         with user_message('Adding results as annotations...'):
             hmm_name = os.path.basename(hmm)
             for fid in hit_features:
                 feature = genome.features[fid]
                 hit = hit_features[fid][0]
                 for hsp in hit:
                     #hit coordinates are scaled by 3 (residues -> bases)
                     nt_from = hsp.hit_start * 3
                     nt_to = hsp.hit_end * 3
                     if feature.strand == 1:
                         origin = feature.location.start
                         hmm_location = FeatureLocation(origin + nt_from,
                                                        origin + nt_to,
                                                        feature.strand)
                     else:
                         #any other strand is measured back from the end
                         origin = feature.location.end
                         hmm_location = FeatureLocation(origin - nt_to,
                                                        origin - nt_from,
                                                        feature.strand)
                     hmm_feature = SeqFeature(hmm_location, type='misc_feature')
                     annotations = (('hmm_model', hmm_name),
                                    ('bitscore', hsp.bitscore),
                                    ('psi_evalue', hsp.psi_evalue),
                                    ('evalue_cond', hsp.evalue_cond),
                                    ('acc_average', hsp.acc_avg),
                                    ('bias', hsp.bias))
                     for key, value in annotations:
                         hmm_feature.qualifiers[key] = value
                     genome.features.append(hmm_feature)
     print('Done.\n')
     return hit_features
 def _blast_feature(self, f, c1, c2, features1, features2, evalue, max_rlen):
     '''
     Blastp the translation of a single feature against another sequence.

     The feature f is extracted from c1 and translated, c2 is translated
     in six frames (11 is passed to the translator in both cases), and the
     protein is blasted against every frame. A summary line is printed
     for every HSP returned by BlastCLI.all_hsps.

     @param f: feature of c1 to use as the query
     @param c1: sequence record containing f
     @param c2: sequence record to search in
     @param features1: not used in this method body
     @param features2: not used in this method body
     @param evalue: passed to BlastCLI.s2s_blast
     @param max_rlen: passed to BlastCLI.all_hsps
     @return: result of zip() over (query feature, subject feature, color)
     for every HSP; [(None, None, None)] if c2 yields no frames or blast
     yields no HSPs
     '''
     translator = Translator(self._abort_event)
     protein = translator.translate(f.extract(c1), 11)
     frames = translator.translate_six_frames_single(c2, 11)
     if not frames:
         return [(None, None, None)]
     #blast the protein against each frame and pool whatever comes back
     blast_records = []
     for frame in frames:
         found = BlastCLI.s2s_blast(protein, frame, evalue,
                                    command='blastp', task='blastp')
         if found:
             blast_records.extend(found)
     hsps = BlastCLI.all_hsps(blast_records, max_rlen)
     if not hsps:
         return [(None, None, None)]
     query_features = []
     subject_features = []
     hsp_colors = []
     c1_name = pretty_rec_name(c1)
     #prefer the locus tag as the printed name of the feature
     fname = f.qualifiers['locus_tag'][0] if 'locus_tag' in f.qualifiers else 'CDS'
     protein_len = len(protein)
     for hsp in hsps:
         identity = float(hsp.identities) / hsp.align_length
         print('%s %s: %5.1f%% (%5.1f%%)'
               % (c1_name, fname, identity * 100,
                  float(hsp.identities) / protein_len * 100))
         #the identity ratio selects a color between the two endpoints
         low = colors.Color(0, 0, 1, 0.2)
         high = colors.Color(0, 1, 0, 0.2)
         hsp_colors.append(
             colors.linearlyInterpolatedColor(low, high, 0.2, 1, identity))
         #residue positions are multiplied by 3 to get nucleotide offsets
         span = hsp.align_length * 3
         q_from = (hsp.query_start - 1) * 3
         q_to = q_from + span
         s_from = (hsp.sbjct_start - 1) * 3
         s_to = s_from + span
         query_loc = FeatureLocation(f.location.start + q_from,
                                     f.location.start + q_to,
                                     strand=hsp.strand[0])
         query_features.append(SeqFeature(query_loc))
         subject_loc = FeatureLocation(s_from, s_to, strand=hsp.strand[1])
         subject_features.append(SeqFeature(subject_loc))
     return zip(query_features, subject_features, hsp_colors)
Beispiel #5
0
 def hmmsearch_genome(self,
                      hmms,
                      genome,
                      table='Standard',
                      decorate=False,
                      **kwargs):
     '''
     Search the six-frame translation of a whole genome with one or
     several HMMs.

     @param hmms: a single hmm given as str, or an iterable of them; each
     one is passed to self.hmmsearch_recs and its basename names the
     annotations
     @param genome: record that is translated in six frames and searched
     @param table: translation table handed to the translator
     @param decorate: if true, a feature is appended to genome.features
     for every HSP of every hit
     @param kwargs: forwarded to self.hmmsearch_recs
     @return: concatenated list of per-hmm search results for the hmms
     that produced hits; None if the translation is empty or nothing was
     found
     '''
     #translate the genome
     with user_message('Translating whole genome in 6 reading frames',
                       '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate_six_frames(genome, table)
     if not translation: return None
     if isinstance(hmms, str): hmms = [hmms]
     #lookup of translated frames by id; built lazily and kept separate
     #from `translation`, which must stay a list of records because it is
     #passed to hmmsearch_recs again for every following hmm
     frames_by_id = None
     results = []
     for hmm in hmms:
         with user_message('Performing hmm search.'):
             hmm_results = self.hmmsearch_recs(hmm, translation, **kwargs)
         #skip hmms whose search failed or produced no hits at all
         if not hmm_results: continue
         if not any(len(r) for r in hmm_results): continue
         results += hmm_results
         #decorate genome
         if not decorate: continue
         if frames_by_id is None:
             frames_by_id = dict((t.id, t) for t in translation)
         with user_message('Adding results as annotations...'):
             hmm_name = os.path.basename(hmm)
             glen = len(genome)
             for frame in hmm_results:
                 for hit in frame:
                     frec = frames_by_id[hit.id]
                     start = frec.annotations['start']
                     strand = frec.annotations['strand']
                     for hsp in hit:
                         #hit coordinates are scaled by 3 (residues -> bases)
                         if strand == 1:
                             hmm_location = FeatureLocation(
                                 start + hsp.hit_start * 3,
                                 start + hsp.hit_end * 3, strand)
                         else:
                             #other strands are measured back from the
                             #end of the genome
                             hmm_location = FeatureLocation(
                                 glen - start - hsp.hit_end * 3,
                                 glen - start - hsp.hit_start * 3,
                                 strand)
                         hmm_feature = self.hsp2feature(
                             hmm_name, 'HMM_annotations', hmm_location,
                             hsp)
                         genome.features.append(hmm_feature)
     return results if results else None
Beispiel #6
0
 def blastp_annotate(self, tag_sequences, subject_record, min_identity, evalue=0.001, table=11, **kwargs):
     '''
     Annotate a record with blastp hits of tag proteins against its
     six-frame translation.

     @param tag_sequences: query records; each one is blasted against
     every translated frame of the subject
     @param subject_record: record to translate and annotate; features
     are appended to subject_record.features
     @param min_identity: HSPs whose identities/align_length ratio is
     below this value are skipped
     @param evalue: passed to self.s2s_blast_batch
     @param table: translation table handed to the translator
     @param kwargs: forwarded to self.s2s_blast_batch
     @return: True if at least one feature was added, False otherwise
     (including an empty translation or a None blast result)
     '''
     # translate subject in six frames
     with user_message('Translating whole genome in 6 reading frames', '\n'):
         translator = Translator(self._abort_event)
         translation = translator.translate_six_frames(subject_record, table)
     if not translation: return False
     results = self.s2s_blast_batch(tag_sequences, translation, evalue=evalue, command='blastp', **kwargs)
     if results is None: return False
     with user_message('Adding results as annotations...'):
         annotated = False
         subj_len = len(subject_record)
         for i, tag in enumerate(tag_sequences):
             if not results[i]: continue
             tag_name = pretty_rec_name(tag)
             if tag_name != tag.id:
                 tag_name += ' (%s)' % tag.id
             for frame, record in enumerate(results[i]):
                 if not record: continue
                 frec = translation[frame]
                 start = frec.annotations['start']
                 strand = frec.annotations['strand']
                 for hit in record:
                     for ali in hit.alignments:
                         for hsp in ali.hsps:
                             if hsp.identities / float(hsp.align_length) < min_identity: continue
                             # sbjct_start is treated as 1-based and
                             # sbjct_end as inclusive, so the match spans
                             # bases [nt_from, nt_to) of the frame
                             nt_from = (hsp.sbjct_start-1)*3
                             nt_to = hsp.sbjct_end*3
                             if strand == 1:
                                 location = FeatureLocation(start+nt_from,
                                                            start+nt_to,
                                                            strand)
                             else:
                                 # mirror the same interval from the end
                                 # of the subject; using sbjct_start*3
                                 # here would drop the first codon
                                 location = FeatureLocation(subj_len-start-nt_to,
                                                            subj_len-start-nt_from,
                                                            strand)
                             feature = self.hsp2feature(tag_name, 'blastp_annotations', location, hsp)
                             self.add_program(feature, 'blastp')
                             subject_record.features.append(feature)
                             annotated = True
     return annotated
Beispiel #7
0
 def g2g_blastp(self, reference, subjects, table='Standard', 
                evalue=0.001, max_rlen=0, features_of_interest=None):
     '''
     Perform blastp of each coding sequence of the reference against each 
     subject, which is first translated gene-by-gene.
     Parameters
     @param reference: SeqRecord object of the reference genome
     @param subjects: a list of SeqRecord objects of subject genomes
     @param table: translation table number (see NCBI site for description)
     @param evalue: filter out blastp results with E-value greater than this
     @param max_rlen: filter out blastp results which are shorter than this 
     fraction of target gene length
     (NOTE: this parameter is not referenced anywhere in this method body)
     @param features_of_interest: list of dictionaries of the form 
     {qualifier_name : qualifier_value}
     to mark features denoting known clusters that should be analyzed one 
     against the other
     @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) 
     where CDS is a gene/CDS feature from the reference.features list 
     and blast_resultN is a list of results for the N-th  
     subject, containing following information:
     (hit_feature, align_length, percent_identity, evalue)
     where hit_feature is a SeqFeature object of the gene/CDS of the subject
     where top blast hit is located, align_length is the length of the hit,
     percent_identity is the ratio of number of identities and align_length [0; 1]
     and evalue is the E-value of the top hit.
     None is returned if the input is empty, no genes are found, the work
     is aborted, a translation fails or blast returns nothing.
     '''
     if not reference or not subjects:
         print('No reference or subject sequences provided')
         return None
     #get list of features to query
     with user_message('Searching for gene/CDS features in provided sequences...'):
         all_records = [reference]+subjects
         num_records = len(all_records)
         features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), 
                                          range(num_records), 
                                          all_records)
         if self.aborted():
             print('\nAborted')
             return None
         if not features or not features[0]:
             print('\nReference sequence does not contain annotated genes:\n%s %s' 
                   % (reference.id, reference.description))
             return None
         #the reference has genes, so at least one subject must have them too
         if len([f for f in features if f]) < 2:
             print('\nSubject sequences do not contain annotated genes')
             return None
         #add gene ids
         for ri, genes in enumerate(features):
             if not genes: continue
             r = all_records[ri]
             for gene_id, gi in enumerate(genes):
                 r.features[gi].qualifiers['feature_id'] = gi
                 r.features[gi].qualifiers['gene_id'] = gene_id
     #get features of interest if requested
     fois = None
     if features_of_interest:
         with user_message('Searching for features of interest...'):
             fois = []
             for foi in features_of_interest:
                 foi = self._get_fois(all_records, foi)
                 if foi and foi[0]: fois.append(foi)
                 if self.aborted():
                     print('\nAborted')
                     return None
     #translate features to proteins
     with Progress('Translating genes found in the reference and subjects...', num_records) as prg:
         translator = Translator(self._abort_event)
         translations = [None]*num_records
         #fois stays None when no features of interest were requested;
         #iterating over None would raise TypeError, so fall back to
         #an empty list in that case
         foi_translations = [[None]*num_records for _f in (fois or [])]
         for i, (f, rec) in enumerate(zip(features, all_records)):
             if not f:
                 prg.step(i) 
                 continue
             translation = translator.translate(rec, f, table)
             if not translation: return None 
             if i > 0: 
                 #subjects: translated genes are concatenated into one record
                 translations[i] = cat_records(translation)
                 if fois:
                     for ifoi, foi in enumerate(fois):
                         #1-based [start, end] span that covers all FOI genes
                         #in the concatenated record; [0, 0] means "none"
                         foi_loc = [0, 0]
                         for foi_var in foi[i]: 
                             if not foi_var: continue
                             for gid in foi_var:
                                 l = translations[i].features[gid].location
                                 foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                 foi_loc[1] = max(int(l.end), foi_loc[1])
                         if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc 
             else: 
                 #reference: translated genes are kept as separate records
                 translations[i] = translation
                 if fois: 
                     for ifoi, foi in enumerate(fois):
                         foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]]
             prg.step(i)
     #blast features against subjects
     with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
         stranslations = translations[1:]
         blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, 
                                               command='blastp', task='blastp')
         if self.aborted():
             print('\nAborted')
             return None
         if not blast_results:
             print('\nBlast have not returned any results.')
             return None
     if fois: #redo blast for fois and replace the results
         with user_message('Rerunning blast for FOIs...', '\n'):
             for ifoi, foi in enumerate(foi_translations):
                 sfoi_locs = foi[1:]
                 for i, foi_var in enumerate(foi[0]):
                     foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, 
                                                       command='blastp', task='blastp')
                     if self.aborted():
                         print('\nAborted')
                         return None
                     if not foi_blast: continue
                     for gi, gid in enumerate(fois[ifoi][0][i]):
                         if foi_blast[gi]:
                             blast_results[gid] = foi_blast[gi]
     #process blast results
     pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
     with ProgressCounter('Searching for genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
         work = self.Work()
         work.start_work(self._find_features_by_hsps, pairs,
                         None, stranslations, blast_results)
         @MultiprocessingBase.results_assembler
         def assembler(index, result, blast_results, pairs, prg):
             #store the result at its (query, subject) position
             qs = pairs[index]
             blast_results[qs[0]][qs[1]] = result
             prg.count()
         work.assemble(assembler, blast_results, pairs, prg)
         if not work.wait(): return None
     return zip((reference.features[f] for f in features[0]), blast_results)
Beispiel #8
0
 def g2g_blastp(self, reference, subjects, table='Standard', 
                evalue=0.001, max_rlen=0, features_of_interest=None):
     '''
     Perform blastp of each coding sequence of the reference against each 
     subject, which is first translated gene-by-gene.
     Parameters
     @param reference: SeqRecord object of the reference genome
     @param subjects: a list of SeqRecord objects of subject genomes
     @param table: translation table number (see NCBI site for description)
     @param evalue: filter out blastp results with E-value greater than this
     @param max_rlen: filter out blastp results which are shorter than this 
     fraction of target gene length
     (NOTE: this parameter is not referenced anywhere in this method body)
     @param features_of_interest: list of dictionaries of the form 
     {qualifier_name : qualifier_value}
     to mark features denoting known clusters that should be analyzed one 
     against the other
     @return: list of pairs (CDS, (blast_result1, blast_result2, ...)) 
     where CDS is a gene/CDS feature from the reference.features list 
     and blast_resultN is a list of results for the N-th  
     subject, containing following information:
     (hit_feature, align_length, percent_identity, evalue)
     where hit_feature is a SeqFeature object of the gene/CDS of the subject
     where top blast hit is located, align_length is the length of the hit,
     percent_identity is the ratio of number of identities and align_length [0; 1]
     and evalue is the E-value of the top hit.
     None is returned if the input is empty, no genes are found, the work
     is aborted, a translation fails or blast returns nothing.
     '''
     if not reference or not subjects:
         print('No reference or subject sequences provided')
         return None
     #get list of features to query
     with user_message('Searching for gene/CDS features in provided sequences...'):
         all_records = [reference]+subjects
         num_records = len(all_records)
         features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), 
                                          range(num_records), 
                                          all_records)
         if self.aborted():
             print('\nAborted')
             return None
         if not features or not features[0]:
             print('\nReference sequence does not contain annotated _genes:\n%s %s'
                   % (reference.id, reference.description))
             return None
         #the reference has genes, so at least one subject must have them too
         if len([f for f in features if f]) < 2:
             print('\nSubject sequences do not contain annotated _genes')
             return None
         #add gene ids
         for ri, genes in enumerate(features):
             if not genes: continue
             r = all_records[ri]
             for gene_id, gi in enumerate(genes):
                 r.features[gi].qualifiers['feature_id'] = gi
                 r.features[gi].qualifiers['gene_id'] = gene_id
     #get features of interest if requested
     fois = None
     if features_of_interest:
         with user_message('Searching for features of interest...'):
             fois = []
             for foi in features_of_interest:
                 foi = self._get_fois(all_records, foi)
                 if foi and foi[0]: fois.append(foi)
                 if self.aborted():
                     print('\nAborted')
                     return None
     #translate features to proteins
     with Progress('Translating _genes found in the reference and subjects...', num_records) as prg:
         translator = Translator(self._abort_event)
         translations = [None]*num_records
         #fois stays None when no features of interest were requested;
         #iterating over None would raise TypeError, so fall back to
         #an empty list in that case
         foi_translations = [[None]*num_records for _f in (fois or [])]
         for i, (f, rec) in enumerate(zip(features, all_records)):
             if not f:
                 prg.step(i) 
                 continue
             translation = translator.translate_features(rec, f, table)
             if not translation: return None 
             if i > 0: 
                 #subjects: translated genes are concatenated into one record
                 translations[i] = cat_records(translation)
                 if fois:
                     for ifoi, foi in enumerate(fois):
                         #1-based [start, end] span that covers all FOI genes
                         #in the concatenated record; [0, 0] means "none"
                         foi_loc = [0, 0]
                         for foi_var in foi[i]: 
                             if not foi_var: continue
                             for gid in foi_var:
                                 l = translations[i].features[gid].location
                                 foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                 foi_loc[1] = max(int(l.end), foi_loc[1])
                         if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc 
             else: 
                 #reference: translated genes are kept as separate records
                 translations[i] = translation
                 if fois: 
                     for ifoi, foi in enumerate(fois):
                         foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]]
             prg.step(i)
     #blast features against subjects
     with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
         stranslations = translations[1:]
         blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, 
                                               command='blastp', task='blastp')
         if self.aborted():
             print('\nAborted')
             return None
         if not blast_results:
             print('\nBlast have not returned any results.')
             return None
     if fois: #redo blast for fois and replace the results
         with user_message('Rerunning blast for FOIs...', '\n'):
             for ifoi, foi in enumerate(foi_translations):
                 sfoi_locs = foi[1:]
                 for i, foi_var in enumerate(foi[0]):
                     foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, 
                                                       command='blastp', task='blastp')
                     if self.aborted():
                         print('\nAborted')
                         return None
                     if not foi_blast: continue
                     for gi, gid in enumerate(fois[ifoi][0][i]):
                         if foi_blast[gi]:
                             blast_results[gid] = foi_blast[gi]
     #process blast results
     pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
     with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
         work = self.Work()
         work.start_work(self._find_features_by_hsps, pairs,
                         None, stranslations, blast_results)
         @MultiprocessingBase.results_assembler
         def assembler(index, result, blast_results, pairs, prg):
             #store the result at its (query, subject) position
             qs = pairs[index]
             blast_results[qs[0]][qs[1]] = result
             prg.count()
         work.assemble(assembler, blast_results, pairs, prg)
         if not work.wait(): return None
     return zip((reference.features[f] for f in features[0]), blast_results)