def findClusters(gene_sequences):
    """Try to pull one cluster out of gene_sequences using a random BLAST query.

    One entry is picked at random and every sequence is BLASTed against it.
    The entries flagged by the BLAST result form the candidate cluster, which
    is accepted only if it covers more than half of ``tot_nspp`` and more
    than 5 distinct species.

    Args:
        gene_sequences: list of indexable records; element ``[0]`` is used
            as the species identifier and element ``[1]`` as the sequence
            handed to ``atools.blast``.

    Returns:
        ``(cluster_sequences, remaining)`` when a cluster is accepted, where
        ``remaining`` is ``gene_sequences`` without the clustered records;
        otherwise ``(None, gene_sequences)`` with the input unchanged.

    Note:
        ``minoverlap``, ``logger``, ``wd``, ``threads`` and ``tot_nspp`` are
        read from the enclosing scope.
    """
    # Without any records there is nothing to sample a query from;
    # random.randint(0, -1) would raise ValueError, so report "no cluster".
    if not gene_sequences:
        return None, gene_sequences
    # blast all against 1
    sequences = [e[1] for e in gene_sequences]
    randi = random.randint(0, len(sequences) - 1)
    bools, _ = atools.blast(sequences, sequences[randi], minoverlap, logger,
                            wd, threads)
    # bools is walked in step with gene_sequences, one flag per record
    # (presumably True where the record matched the query -- confirm against
    # atools.blast). Materialise it so it can safely be traversed twice.
    hits = list(bools)
    # how many species had sequences in the cluster?
    cluster_sequences = [gene_sequences[i] for i, e in enumerate(hits) if e]
    nspp = len({e[0] for e in cluster_sequences})
    pspp = float(nspp) / tot_nspp
    # accept only if strictly more than 50% of all species and strictly more
    # than 5 species are represented
    if pspp > 0.5 and nspp > 5:
        # return cluster, remove those sequences from gene_sequences
        gene_sequences = [gene_sequences[i] for i, e in enumerate(hits)
                          if not e]
        return cluster_sequences, gene_sequences
    return None, gene_sequences
def _filter(self, sequences):
    """Filter sequences by BLASTing them against one randomly chosen member.

    Args:
        sequences: list of sequences to split.

    Returns:
        ``(filtered, seqpool)`` when more than ``self.votesize`` sequences
        are flagged by the BLAST result, where ``filtered`` holds the flagged
        sequences and ``seqpool`` the rest; otherwise ``([], sequences)``
        with the input returned untouched.
    """
    # No sequences means no query can be drawn; random.randint(0, -1) would
    # raise ValueError, so return the same "nothing filtered" result.
    if not sequences:
        return [], sequences
    # choose random species for query
    randn = random.randint(0, len(sequences) - 1)
    query = sequences
    subj = [sequences[randn]]
    # blast rand seq against all other seqs
    blast_bool, _ = atools.blast(query, subj, self.minoverlap, self.logger,
                                 wd=self.wd, threads=self.threads)
    # blast_bool is walked in step with sequences, one flag per sequence;
    # materialise it so it can safely be traversed twice.
    flags = list(blast_bool)
    # filtered are all sequences that are true
    filtered = [sequences[i] for i, e in enumerate(flags) if e]
    # return filtered only if there are more than votesize sequences in it
    if len(filtered) > self.votesize:
        # sequence pool are all sequences that are false
        seqpool = [sequences[i] for i, e in enumerate(flags) if not e]
        return filtered, seqpool
    # else return empty list of filtered and the sequences
    return [], sequences