def hmmsearch_genes(self, hmms, genome, table='Standard', decorate=False, **kwargs):
    '''
    Translate the genes of a genome and search the translations with one
    or several HMM profiles.

    @param hmms: a single profile (a str) or an iterable of profiles; each
    one is handed to self.hmmsearch_recs and, when decorating, to
    os.path.basename, so these are presumably file paths
    @param genome: record whose .features list holds the genes; the
    features are tagged in place with 'feature_id' and 'gene_id'
    qualifiers and, if decorate is set, new features are appended
    @param table: translation table passed to Translator.translate_features
    @param decorate: if True, every HSP of every hit is appended to
    genome.features as a feature built by self.hsp2feature
    @param kwargs: passed through to self.hmmsearch_recs unchanged
    @return: dict {feature index in genome.features: (hit, translated
    record)}, or None if there are no genes, the translation is empty,
    any single search returns nothing, or no gene was hit at all.
    If several profiles hit the same gene, the entry of the profile
    searched last is the one that is kept.
    '''
    gene_indexes = get_indexes_of_all_genes(genome)
    if not gene_indexes:
        return None
    # tag each gene with its position in genome.features and its ordinal
    # number among the genes, so that hits can be mapped back later
    for ordinal, feature_index in enumerate(gene_indexes):
        qualifiers = genome.features[feature_index].qualifiers
        qualifiers['feature_id'] = feature_index
        qualifiers['gene_id'] = ordinal
    with user_message('Translating _genes/CDS of %s' % genome.description, '\n'):
        translation = Translator(self._abort_event).translate_features(
            genome, gene_indexes, table)
    if not translation:
        return None
    profiles = [hmms] if isinstance(hmms, str) else hmms
    results = {}
    for hmm in profiles:
        with user_message('Performing hmm search.'):
            search_output = self.hmmsearch_recs(hmm, translation, **kwargs)
        if not search_output:
            # an empty search result cancels the whole run,
            # including results gathered for previous profiles
            return None
        with user_message('Parsing search results...'):
            # index all hits by the id of the sequence they were found in
            hits_by_id = {hit.id: hit
                          for query_result in search_output
                          for hit in query_result.iterhits()}
            # map the hits back to indexes of the genome features
            matched = {}
            for protein in translation:
                if protein.id not in hits_by_id:
                    continue
                feature_index = protein.features[0].qualifiers.get('feature_id')
                if feature_index is not None:
                    matched[feature_index] = (hits_by_id[protein.id], protein)
        results.update(matched)
        if not decorate:
            continue
        with user_message('Adding results as annotations...'):
            hmm_name = os.path.basename(hmm)
            for feature_index, (hit, _protein) in matched.items():
                gene = genome.features[feature_index]
                for hsp in hit:
                    # HSP coordinates are multiplied by 3 to convert residues
                    # of the translation into nucleotides; for anything but
                    # the forward strand they are counted from the gene's end
                    if gene.strand == 1:
                        start = gene.location.start + hsp.hit_start * 3
                        end = gene.location.start + hsp.hit_end * 3
                    else:
                        start = gene.location.end - hsp.hit_end * 3
                        end = gene.location.end - hsp.hit_start * 3
                    hmm_location = FeatureLocation(start, end, gene.strand)
                    genome.features.append(
                        self.hsp2feature(hmm_name, 'HMM_annotations',
                                         hmm_location, hsp))
    return results or None
def g2g_blastp(self, reference, subjects, table='Standard',
               evalue=0.001, max_rlen=0, features_of_interest=None):
    '''
    Perform blastp of each coding sequence of the reference against each
    subject, which is first translated gene-by-gene.

    Every gene/CDS feature found in the reference and the subjects is
    tagged in place with 'feature_id' and 'gene_id' qualifiers.

    Parameters
    @param reference: SeqRecord object of the reference genome
    @param subjects: a list of SeqRecord objects of subject genomes
    @param table: translation table number (see NCBI site for description)
    @param evalue: filter out blastp results with E-value greater than this
    @param max_rlen: filter out blastp results which are shorter than this
    fraction of target gene length.
    NOTE: this parameter is accepted but not used anywhere in this method.
    @param features_of_interest: list of dictionaries of the form
    {qualifier_name : qualifier_value} to mark features denoting known
    clusters that should be analyzed one against the other
    @return: list of pairs (CDS, (blast_result1, blast_result2, ...))
    where CDS is a gene/CDS feature from the reference.features list and
    blast_resultN is a list of results for the N-th subject,
    containing following information:
    (hit_feature, align_length, percent_identity, evalue)
    where hit_feature is a SeqFeature object of the gene/CDS of the subject
    where top blast hit is located, align_length is the length of the hit,
    percent_identity is the ratio of number of identities and align_length [0; 1]
    and evalue is the E-value of the top hit.
    None is returned if no reference or subjects are given, if the
    reference or all of the subjects lack annotated genes, if a translation
    or the blast search yields nothing, or if the work was aborted.
    '''
    if not reference or not subjects:
        print('No reference or subject sequences provided')
        return None
    #get list of features to query
    with user_message('Searching for gene/CDS features in provided sequences...'):
        all_records = [reference]+subjects
        num_records = len(all_records)
        features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]),
                                         range(num_records),
                                         all_records)
        if self.aborted():
            print('\nAborted')
            return None
        if not features or not features[0]:
            print('\nReference sequence does not contain annotated _genes:\n%s %s' %
                  (reference.id, reference.description))
            return None
        # features[0] is non-empty here, so fewer than two non-empty
        # entries means that no subject has annotated genes
        if len([f for f in features if f]) < 2:
            print('\nSubject sequences do not contain annotated _genes')
            return None
        #add gene ids
        for ri, genes in enumerate(features):
            if not genes:
                continue
            record = all_records[ri]
            for gene_id, gi in enumerate(genes):
                qualifiers = record.features[gi].qualifiers
                qualifiers['feature_id'] = gi
                qualifiers['gene_id'] = gene_id
    #get features of interest if requested
    # An empty list (rather than None) keeps the comprehension over fois
    # below valid when no features of interest were requested; all the
    # 'if fois' checks treat an empty list the same way as None.
    fois = []
    if features_of_interest:
        with user_message('Searching for features of interest...'):
            for foi in features_of_interest:
                foi = self._get_fois(all_records, foi)
                # only FOIs that are present in the reference are kept
                if foi and foi[0]:
                    fois.append(foi)
                if self.aborted():
                    print('\nAborted')
                    return None
    #translate features to proteins
    with Progress('Translating _genes found in the reference and subjects...', num_records) as prg:
        translator = Translator(self._abort_event)
        translations = [None]*num_records
        foi_translations = [[None]*num_records for _f in fois]
        for i, (f, rec) in enumerate(zip(features, all_records)):
            if not f:
                prg.step(i)
                continue
            translation = translator.translate_features(rec, f, table)
            if not translation:
                return None
            if i > 0:
                # subjects: translations are concatenated into a single record
                # and each FOI is reduced to a [start, end] range within it
                translations[i] = cat_records(translation)
                for ifoi, foi in enumerate(fois):
                    foi_loc = [0, 0]
                    for foi_var in foi[i]:
                        if not foi_var:
                            continue
                        for gid in foi_var:
                            loc = translations[i].features[gid].location
                            # start is stored 1-based, so 0 means "not set yet"
                            start = int(loc.start)+1
                            foi_loc[0] = min(start, foi_loc[0]) if foi_loc[0] > 0 else start
                            foi_loc[1] = max(int(loc.end), foi_loc[1])
                    if foi_loc[0] > 0:
                        foi_translations[ifoi][i] = foi_loc
            else:
                # reference: translations are kept gene-by-gene and each FOI
                # variant becomes a list of the translations of its genes
                translations[i] = translation
                for ifoi, foi in enumerate(fois):
                    foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var]
                                                 for foi_var in foi[i]]
            prg.step(i)
    #blast features against subjects
    with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
        stranslations = translations[1:]
        blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue,
                                              command='blastp', task='blastp')
        if self.aborted():
            print('\nAborted')
            return None
        if not blast_results:
            print('\nBlast have not returned any results.')
            return None
    if fois:
        #redo blast for fois and replace the results
        with user_message('Rerunning blast for FOIs...', '\n'):
            for ifoi, foi in enumerate(foi_translations):
                sfoi_locs = foi[1:]
                for i, foi_var in enumerate(foi[0]):
                    foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue,
                                                      command='blastp', task='blastp')
                    if self.aborted():
                        print('\nAborted')
                        return None
                    if not foi_blast:
                        continue
                    # results restricted to the FOI replace the genome-wide
                    # results of the corresponding reference genes
                    for gi, gid in enumerate(fois[ifoi][0][i]):
                        if foi_blast[gi]:
                            blast_results[gid] = foi_blast[gi]
    #process blast results
    # one unit of work per (reference gene, subject) pair
    pairs = list(itertools.product(range(len(translations[0])), range(len(stranslations))))
    with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
        work = self.Work()
        work.start_work(self._find_features_by_hsps, pairs,
                        None, stranslations, blast_results)
        @MultiprocessingBase.results_assembler
        def assembler(index, result, blast_results, pairs, prg):
            # put the processed result in place of the raw blast result
            qs = pairs[index]
            blast_results[qs[0]][qs[1]] = result
            prg.count()
        work.assemble(assembler, blast_results, pairs, prg)
        if not work.wait():
            return None
    return list(zip((reference.features[f] for f in features[0]), blast_results))