Exemple #1
0
 def findClusters(gene_sequences):
     """Try to extract one cluster from gene_sequences with a single BLAST.

     A randomly chosen sequence is BLASTed against every sequence and the
     records flagged by the BLAST form the candidate cluster. The candidate
     is accepted only when its distinct species make up more than 50% of
     ``tot_nspp`` and number more than 5.

     Args:
         gene_sequences: list of indexable records; item 0 is counted as
             the species identifier and item 1 is the sequence passed to
             ``atools.blast``.

     Returns:
         ``(cluster_sequences, remaining)`` when a cluster is accepted,
         where ``remaining`` holds the records that were not flagged;
         otherwise ``(None, gene_sequences)`` with the input untouched.
         An empty input also yields ``(None, gene_sequences)`` and no
         BLAST is run.
     """
     sequences = [e[1] for e in gene_sequences]
     # nothing to sample from: random.randint(0, -1) would raise ValueError
     if not sequences:
         return None, gene_sequences
     # blast all against 1
     randi = random.randint(0, len(sequences)-1)
     # NOTE(review): the first value returned by atools.blast is treated as
     # one truthy/falsy flag per input sequence, in input order -- confirm
     bools, _ = atools.blast(sequences, sequences[randi], minoverlap,
                             logger, wd, threads)
     # how many species had sequences in the cluster?
     cluster_sequences = [record for record, hit
                          in zip(gene_sequences, bools) if hit]
     nspp = len({record[0] for record in cluster_sequences})
     # float() keeps this a true division under Python 2 as well
     pspp = float(nspp)/tot_nspp
     # if more than 50% and 5 species ...
     if pspp > 0.5 and nspp > 5:
         # return cluster, remove those sequences from gene_sequences
         remaining = [record for record, hit
                      in zip(gene_sequences, bools) if not hit]
         return cluster_sequences, remaining
     return None, gene_sequences
Exemple #2
0
 def _filter(self, sequences):
     """Filter sequences by BLASTing"""
     # choose random species for query
     randn = random.randint(0, len(sequences)-1)
     query = sequences
     subj = [sequences[randn]]
     # blast rand seq against all other seqs
     blast_bool, _ = atools.blast(query, subj, self.minoverlap, self.logger,
                                  wd=self.wd, threads=self.threads)
     # filtered are all sequences that are true
     filtered = [sequences[i] for i, e in enumerate(blast_bool) if e]
     # sequence pool are all sequences that are false
     seqpool = [sequences[i] for i, e in enumerate(blast_bool) if not e]
     # return filtered if there are more than votesize sequences in
     #  filtered
     if len(filtered) > self.votesize:
         return filtered, seqpool
     # else return empty list of filtered and the sequences
     else:
         return [], sequences