Beispiel #1
0
 def find_clusters(self, files, output_dir=''):
     '''
     Search the sequences loaded from the given files for the clusters
     stored in self.clusters and print what was found for each sequence.
     @param files: passed to SeqView.safe_load to obtain the sequences
     @param output_dir: forwarded to self._process_genome; if it is non-empty
     and does not exist yet it is created together with any missing parents
     @return: False if self.clusters is empty, if nothing could be loaded or
     if the parallel work did not complete; True otherwise, including the case
     when no clusters were found
     '''
     if not self.clusters: return False
     # load genomes
     genomes = SeqView.safe_load(files)
     if not genomes: return False
     # create dest dir if needed; makedirs also handles nested paths,
     # where a plain mkdir would raise OSError
     if output_dir and not os.path.isdir(output_dir):
         os.makedirs(output_dir)
     # process files
     results = {}
     glen = len(genomes)
     title = ('Searching for %d clusters in %d sequence%s:' %
              (len(self.clusters), glen, 's' if glen > 1 else ''))
     with ProgressCounter(title, glen) as prg:
         work = self.Work()
         work.start_work(self._process_genome, genomes.keys(), None, genomes, output_dir)
         work.assemble(ordered_results_assembler, results, prg)
         if not work.wait(): return False
     # check results: each value is unpacked as a (res, gname) pair
     found = False
     for gi in results:
         res, gname = results[gi]
         if not res: continue
         found = True
         print('%s:\n\t%s\n' % (gname, '\n\t'.join(res)))
     if not found:
         # single-argument parenthesised form prints the same under Python 2
         print('No putative clusters were found.')
     return True
Beispiel #2
0
 def _s2s_blast_batch(self, queries, subjects, subject_locs=None, evalue=0.001, command='blastn', **kwargs):
     '''
     Run BlastCLI.s2s_blast for every (query, subject) pair in parallel.
     @param queries: indexable collection of queries; None entries are skipped
     @param subjects: indexable collection of subjects; None entries are skipped
     @param subject_locs: optional collection parallel to subjects; a truthy
     entry is converted to a tuple and formatted as '%d-%d' to become the
     subject_loc option for that subject
     @param evalue: forwarded to BlastCLI.s2s_blast
     @param command: forwarded to BlastCLI.s2s_blast and shown in the progress title
     @param kwargs: extra options forwarded to BlastCLI.s2s_blast; the special
     key ignore_none_locs (default False) is consumed here and makes pairs
     whose subject has no location be skipped when subject_locs is given.
     A caller-supplied subject_loc is never forwarded as is.
     @return: None if the parallel work did not complete, otherwise a
     len(queries) x len(subjects) table holding whatever the decorated worker
     produced for each pair (None for skipped pairs)
     '''
     results = [[None for _s in subjects] for _q in queries]
     pairs = list(itertools.product(xrange(len(queries)), xrange(len(subjects))))
     ignore_none_locs = kwargs.pop('ignore_none_locs', False)
     shuffle(pairs)
     @MultiprocessingBase.data_mapper
     @shelf_result
     def worker(qs, queries, subjects, subject_locs):
         query = queries[qs[0]]
         subject = subjects[qs[1]]
         if query is None or subject is None: return None
         # work on a private copy: mutating the closure-shared kwargs leaks
         # one pair's subject_loc into another call and is unsafe if workers
         # ever run concurrently within one process
         options = dict(kwargs)
         options.pop('subject_loc', None)
         if subject_locs:
             loc = subject_locs[qs[1]]
             if loc:
                 options['subject_loc'] = '%d-%d' % tuple(loc)
             elif ignore_none_locs:
                 return None
         return BlastCLI.s2s_blast(query, subject, evalue, command, **options)
     @MultiprocessingBase.results_assembler
     def assembler(index, hsps, results, pairs, prg):
         # pairs were shuffled, so map the work index back to (query, subject)
         qs = pairs[index]
         results[qs[0]][qs[1]] = hsps
         prg.count()
     with ProgressCounter('Performing multiple %s searches:'%command, len(pairs)) as prg:
         work = self.Work()
         work.start_work(worker, pairs, None, queries, subjects, subject_locs)
         work.assemble(assembler, results, pairs, prg)
         if not work.wait(): return None
     return results
Beispiel #3
0
 def _filter_homologues(get_all_homologues, seqs, min_identity, keep_ids=None, nucleotide=False):
     '''
     Remove close homologues from seqs so that only the first sequence of
     each group of homologues remains.
     @param get_all_homologues: callable invoked as
     get_all_homologues(seqs, min_identity, dbname, command, prg); its result
     is used as a mapping from a sequence id to the set of ids of its homologues
     @param seqs: sequences to filter; each one must have an id attribute
     @param min_identity: forwarded unchanged to get_all_homologues
     @param keep_ids: optional ids that are never removed; if a sequence has a
     homologue among them, the sequence itself is removed instead
     @param nucleotide: use blastn and a nucleotide DB if true, blastp otherwise
     @return: None if the temporary BLAST DB could not be made or any exception
     was raised during the search; seqs itself if no homologues were reported;
     otherwise a new list of the sequences that were kept
     '''
     print('Filtering out close homologues. This will take a while:')
     command = 'blastn' if nucleotide else 'blastp'
     dbname = ''
     try:
         with user_message('Formatting blast DB', '\n'):
             dbname = BlastCLI.format_tmp_db(seqs, nucleotide)
             if not dbname:
                 print('Unable to make temporary BLAST database.')
                 return None
         # report the program that is actually used (blastn or blastp)
         with ProgressCounter('Searching for homologues using local %s...' % command, len(seqs)) as prg:
             homologues = get_all_homologues(seqs, min_identity, dbname, command, prg)
     except Exception as e:
         print('%s\n' % str(e))
         return None
     finally:
         # the DB lives in its own directory, which is removed as a whole
         if dbname:
             shutil.rmtree(os.path.dirname(dbname), ignore_errors=True)
     if not homologues: return seqs
     with user_message('Removing all homologs from each group except the first one...'):
         remove = set()
         if keep_ids: keep_ids = set(keep_ids)
         for seq in seqs:
             if seq.id in remove: continue
             h = homologues.pop(seq.id, set())
             if h:
                 if keep_ids:
                     nhoms = len(h)
                     h -= keep_ids
                     # a protected homologue was in the group: let it
                     # represent the group and drop the current sequence
                     if nhoms != len(h) and seq.id not in keep_ids:
                         h.add(seq.id)
                 remove.update(h)
         return [seq for seq in seqs if seq.id not in remove]
Beispiel #4
0
 def fetch_queries(self, email, queries, from_dbs=None, **kwargs):
     '''
     Fetch records for the given queries through BatchEntrez.
     Queries with a truthy region are fetched one by one with
     self._fetch_query; the rest are fetched per database in a single
     entrez.get_records_for_terms call using their term attributes.
     @param email: e-mail handed to BatchEntrez; must be truthy
     @param queries: iterable of objects with region, db and term attributes
     @param from_dbs: optional container of database names; when truthy,
     queries for other databases are ignored
     @param kwargs: accepted but not used in this method
     @return: list of everything that was fetched, or an empty list if
     self.aborted() became true along the way
     @raise ValueError: if email is empty
     '''
     if not email:
         raise ValueError('You should always provide a valid e-mail '
                          'to NCBI when performing an Entrez query.')
     # ListDB is used as a per-database collection of queries
     # (presumably item assignment appends to the list stored under the key)
     by_term = ListDB()
     by_region = ListDB()
     records = []
     entrez = BatchEntrez(self._abort_event, email)
     with ProgressCounter('', 0, replace=False) as prg:
         # sort the queries into the two groups
         for query in queries:
             if self.aborted():
                 return []
             target = by_region if query.region else by_term
             target[query.db] = query
         # region queries: one request per query
         for db in by_region:
             if from_dbs and db not in from_dbs:
                 continue
             for query in by_region[db]:
                 if self.aborted():
                     return []
                 records.extend(self._fetch_query(query, entrez))
                 prg.count()
         # plain queries: one request per database
         for db in by_term:
             if self.aborted():
                 return []
             if from_dbs and db not in from_dbs:
                 continue
             terms = [query.term for query in by_term[db]]
             records.extend(entrez.get_records_for_terms(terms, db))
             prg.count()
     return records
Beispiel #5
0
 def s2s_blast_batch(self, queries, subjects, subject_locs=None, evalue=0.001, command='blastn', **kwargs):
     '''
     Run self._s2s_blast_batch and replace every truthy cell of the table it
     returns by the 'result' entry read from roDict opened on that cell's value.
     All arguments are forwarded unchanged to self._s2s_blast_batch.
     @return: None if self._s2s_blast_batch returned nothing, otherwise the
     same table (modified in place) with the loaded results
     '''
     table = self._s2s_blast_batch(queries, subjects, subject_locs, evalue, command, **kwargs)
     if not table:
         return None
     with ProgressCounter('Parsing results...', len(table)) as prg:
         for row in table:
             for si, stored_name in enumerate(row):
                 # empty cells (skipped pairs) are left untouched
                 if stored_name:
                     with roDict(stored_name) as db:
                         row[si] = db['result']
             # progress is counted once per query row
             prg.count()
     return table
Beispiel #6
0
 def check_sequences(seqs, next_to_process):
     '''
     Blast every sequence of seqs against core_db in parallel and register
     those that produced results (after the optional blast_filter).
     self, core_db, command, blast_filter and extended_set are taken from
     the enclosing scope.
     @param seqs: sequences to check
     @param next_to_process: list that receives every sequence with results
     @return: whatever self.parallelize2 returns
     '''
     n_seqs = len(seqs)
     prg = ProgressCounter('RingBlast: checking %d new sequences:' % n_seqs, n_seqs)

     @MultiprocessingBase.data_mapper
     def worker(seq):
         hits = self.blast_seq(seq, core_db, 100, command)
         if hits and blast_filter:
             blast_filter(hits)
         # truthiness is taken after filtering
         return bool(hits), seq

     @MultiprocessingBase.results_assembler
     def assembler(i, res):
         prg.count()
         if res[0]:
             seq = res[1]
             extended_set[self.base_sid(seq)] = seq
             next_to_process.append(seq)

     with prg:
         return self.parallelize2(1, worker, assembler, seqs)
Beispiel #7
0
 def extract_clusters(self):
     '''
     Load the genomes from self.files, run self._process_genome on them in
     parallel and store the flattened results in self.clusters.
     If self.order is set, it is first reduced to the ids present among the
     loaded genomes and then defines which genomes are processed and in what
     order; if it is empty, all loaded genomes are processed.
     Gene colors are generated afterwards unless self.no_color or
     self.colors is set.
     @return: False if nothing was loaded or the parallel work did not
     complete, otherwise whether any clusters were extracted
     '''
     self.clusters = []
     genomes = SeqView.safe_load(self.files)
     if not genomes:
         return False
     n_genomes = len(genomes)
     # one slot per loaded genome; slots left as None are dropped below
     self.clusters = [None]*n_genomes
     if self.order:
         self.order = [oid for oid in self.order if oid in genomes.keys()]
     with ProgressCounter('Extracting clusters from provided genomes:', n_genomes) as prg:
         work = self.Work()
         genome_ids = self.order or genomes.keys()
         work.start_work(self._process_genome, genome_ids, None, genomes)
         work.assemble(ordered_shelved_results_assembler, self.clusters, prg)
         if not work.wait():
             return False
     # flatten the per-genome lists into a single list of clusters
     per_genome = (found for found in self.clusters if found)
     self.clusters = list(chain.from_iterable(per_genome))
     # generate gene colors if needed
     if not (self.no_color or self.colors):
         self._generate_gene_colors()
     return bool(self.clusters)
Beispiel #8
0
 def blast_filter_fetch(seqs):
     '''
     For every sequence of seqs run self.blast_seq against db, apply the
     optional blast_filter and fetch the alignments of what is left.
     self, db, evalue, command, kwargs and blast_filter are taken from the
     enclosing scope.
     @param seqs: sequences to use as queries
     @return: None if self.parallelize2 reported failure, otherwise the list
     of non-empty worker results in the order they were assembled
     '''
     @MultiprocessingBase.data_mapper
     @shelf_result
     def worker(s):
         hits = self.blast_seq(s, db, evalue, command, **kwargs)
         if not hits:
             return None
         if blast_filter:
             blast_filter(hits)
         # re-check: the original tests the result again after filtering
         if not hits:
             return None
         return self.fetch_results(hits, db, what='alignment')

     collected = []
     n_seqs = len(seqs)
     prg = ProgressCounter('Performing blast search for %d sequences:' % n_seqs, n_seqs)

     @MultiprocessingBase.results_assembler
     def assembler(i, res):
         if res:
             collected.append(res)
         prg.count()

     with prg:
         finished = self.parallelize2(1, worker, assembler, seqs)
         return collected if finished else None
Beispiel #9
0
 def g2g_blastp(self, reference, subjects, table='Standard', 
                evalue=0.001, max_rlen=0, features_of_interest=None):
     '''
     Perform blastp of each coding sequence of the reference against each 
     subject, which is first translated gene-by-gene.
     Parameters
     @param reference: SeqRecord object of the reference genome
     @param subjects: a list of SeqRecord objects of subject genomes
     @param table: translation table number (see NCBI site for description)
     @param evalue: filter out blastp results with E-value greater than this
     @param max_rlen: filter out blastp results which are shorter than this 
     fraction of target gene length
     (NOTE: this argument is not referenced anywhere in the body of this method)
     @param features_of_interest: list of dictionaries of the form 
     {qualifier_name : qualifier_value}
     to mark features denoting known clusters that should be analyzed one 
     against the other
     @return: None if the input is empty, no annotated genes were found, 
     translation failed, blast returned nothing or the work was aborted;
     otherwise a list of pairs (CDS, (blast_result1, blast_result2, ...)) 
     where CDS is a gene/CDS feature from the reference.features list 
     and blast_resultN is a list of results for the N-th  
     subject, containing following information:
     (hit_feature, align_length, percent_identity, evalue)
     where hit_feature is a SeqFeature object of the gene/CDS of the subject
     where top blast hit is located, align_length is the length of the hit,
     percent_identity is the ratio of number of identities and align_length [0; 1]
     and evalue is the E-value of the top hit.
     '''
     if not reference or not subjects:
         print('No reference or subject sequences provided')
         return None
     #get list of features to query
     with user_message('Searching for gene/CDS features in provided sequences...'):
         all_records = [reference]+subjects
         num_records = len(all_records)
         features = self.parallelize_work(1, lambda ri, records: self._get_genes(records[ri]), 
                                          range(num_records), 
                                          all_records)
         if self.aborted():
             print('\nAborted')
             return None
         if not features or not features[0]:
             print ('\nReference sequence does not contain annotated _genes:\n%s %s'
                    % (reference.id, reference.description))
             return None
         if len([f for f in features if f]) < 2:
             print('\nSubject sequences do not contain annotated _genes')
             return None
         #add gene ids: feature_id is the index in record.features,
         #gene_id is the index in the list of genes of that record
         for ri, genes in enumerate(features):
             if not genes: continue
             r = all_records[ri]
             for gene_id, gi in enumerate(genes):
                 r.features[gi].qualifiers['feature_id'] = gi
                 r.features[gi].qualifiers['gene_id'] = gene_id
     #get features of interest if requested
     fois = None
     if features_of_interest:
         with user_message('Searching for features of interest...'):
             fois = []
             for foi in features_of_interest:
                 foi = self._get_fois(all_records, foi)
                 if foi and foi[0]: fois.append(foi)
                 if self.aborted():
                     print('\nAborted')
                     return None
     #translate features to proteins
     with Progress('Translating _genes found in the reference and subjects...', num_records) as prg:
         translator = Translator(self._abort_event)
         translations = [None]*num_records
         #fois stays None when no features of interest were requested;
         #iterating it directly would raise TypeError
         foi_translations = [[None]*num_records for _f in (fois or [])]
         for i, (f, rec) in enumerate(zip(features, all_records)):
             if not f:
                 prg.step(i) 
                 continue
             translation = translator.translate_features(rec, f, table)
             if not translation: return None 
             if i > 0: 
                 #subjects: translations are concatenated into one record and
                 #each FOI is reduced to a single 1-based [start, end] span
                 translations[i] = cat_records(translation)
                 if fois:
                     for ifoi, foi in enumerate(fois):
                         foi_loc = [0, 0]
                         for foi_var in foi[i]: 
                             if not foi_var: continue
                             for gid in foi_var:
                                 l = translations[i].features[gid].location
                                 foi_loc[0] = min(int(l.start)+1, foi_loc[0]) if foi_loc[0] > 0 else int(l.start)+1
                                 foi_loc[1] = max(int(l.end), foi_loc[1])
                         if foi_loc[0] > 0: foi_translations[ifoi][i] = foi_loc 
             else: 
                 #reference: translations are kept gene-by-gene and each FOI 
                 #variant becomes a list of the translated genes
                 translations[i] = translation
                 if fois: 
                     for ifoi, foi in enumerate(fois):
                         foi_translations[ifoi][i] = [[translation[gid] for gid in foi_var] for foi_var in foi[i]]
             prg.step(i)
     #blast features against subjects
     with user_message('Performing local blast of every translated gene in the reference against every translated subject...', '\n'):
         stranslations = translations[1:]
         blast_results = self._s2s_blast_batch(translations[0], stranslations, None, evalue, 
                                               command='blastp', task='blastp')
         if self.aborted():
             print('\nAborted')
             return None
         if not blast_results:
             print('\nBlast have not returned any results.')
             return None
     if fois: #redo blast for fois and replace the results
         with user_message('Rerunning blast for FOIs...', '\n'):
             for ifoi, foi in enumerate(foi_translations):
                 sfoi_locs = foi[1:]
                 for i, foi_var in enumerate(foi[0]):
                     foi_blast = self._s2s_blast_batch(foi_var, stranslations, sfoi_locs, evalue, 
                                                       command='blastp', task='blastp')
                     if self.aborted():
                         print('\nAborted')
                         return None
                     if not foi_blast: continue
                     for gi, gid in enumerate(fois[ifoi][0][i]):
                         if foi_blast[gi]:
                             blast_results[gid] = foi_blast[gi]
     #process blast results
     pairs = list(itertools.product(xrange(len(translations[0])), xrange(len(stranslations))))
     with ProgressCounter('Searching for _genes in subjects that overlap with top blast hits...', len(pairs)) as prg:
         work = self.Work()
         work.start_work(self._find_features_by_hsps, pairs,
                         None, stranslations, blast_results)
         @MultiprocessingBase.results_assembler
         def assembler(index, result, blast_results, pairs, prg):
             #each table cell is overwritten with the processed result
             qs = pairs[index]
             blast_results[qs[0]][qs[1]] = result
             prg.count()
         work.assemble(assembler, blast_results, pairs, prg)
         if not work.wait(): return None
     return zip((reference.features[f] for f in features[0]), blast_results)