def test_cdhit_clusters_from_seqs(self):
    """cdhit_clusters_from_seqs groups the DNA test sequences as expected"""
    # Expected clustering: sequences 6 and 8 share a cluster, every
    # other test sequence sits in a cluster of its own.
    expected_clusters = [
        ['cdhit_test_seqs_0'],
        ['cdhit_test_seqs_1'],
        ['cdhit_test_seqs_2'],
        ['cdhit_test_seqs_3'],
        ['cdhit_test_seqs_4'],
        ['cdhit_test_seqs_5'],
        ['cdhit_test_seqs_6', 'cdhit_test_seqs_8'],
        ['cdhit_test_seqs_7'],
        ['cdhit_test_seqs_9'],
    ]
    observed_clusters = cdhit_clusters_from_seqs(dna_seqs, DNA)
    self.assertEqual(observed_clusters, expected_clusters)
def __call__(self, seq_path, result_path=None, log_path=None,
             id_len=0, prefix_prefilter_length=None, trie_prefilter=False):
    """Returns dict mapping {otu_id:[seq_ids]} for each otu.

    Parameters:
    seq_path: path to file of sequences
    result_path: path to file of results. If specified, dumps the
        result to the desired path instead of returning it.
    log_path: path to log, which includes dump of params.
    id_len: if set, truncates ids to n chars (you don't want this!)
    prefix_prefilter_length: prefilters the sequence collection so
        sequences whose first prefix_prefilter_length characters are
        identical will automatically be grouped into the same OTU [off by
        default, 100 is typically a good value if this filtering is
        desired] -- useful for large sequence collections, when cdhit
        doesn't scale well
    trie_prefilter: prefilter the sequence collection such that all
        sequences which are a prefix of another sequence are clustered
        with the other sequence. Together with cd-hit this is a
        non-heuristic filter that reduces run time a lot. Still a bit
        slower than the prefix_prefilter toggled with
        prefix_prefilter_length.

    If both prefilters are requested, the prefix-based one is used and
    trie_prefilter is deactivated (this is recorded in the log).

    Returns:
    dict of {otu_id:[seq_ids]} with arbitrary integer otu_ids, or None
    if the clusters were written to result_path.
    """
    moltype = DNA
    log_lines = []

    # create the params dict to pass to cd-hit-est -- IS THERE A
    # BETTER WAY TO MAKE self.Params INTO THE params DICT TO PASS
    # TO THE APPLICATION CONTROLLERS?
    cd_hit_params = copy(self.Params)
    del cd_hit_params['Application']
    del cd_hit_params['Algorithm']
    cd_hit_params['-d'] = id_len  # turn off id truncation
    cd_hit_params['-g'] = "1"

    if prefix_prefilter_length is not None and trie_prefilter:
        # Only one prefilter can be applied; the prefix-based one wins.
        # The message is a single literal so that no source indentation
        # leaks into the log text.
        log_lines.append(
            'Both prefilters selected. Deactivate trie_prefilter')
        trie_prefilter = False

    if prefix_prefilter_length is not None:
        log_lines.append(
            'Prefix-based prefiltering, prefix length: %d'
            % prefix_prefilter_length)
        # The helper hands back a sized collection (len() is taken
        # below), so the parser is consumed inside this block and the
        # file handle can be released right away.
        with open(seq_path) as seq_file:
            seqs, filter_map = self._prefilter_exact_prefixes(
                MinimalFastaParser(seq_file), prefix_prefilter_length)
        log_lines.append(
            'Prefix-based prefiltering, post-filter num seqs: %d'
            % len(seqs))
    elif trie_prefilter:
        log_lines.append('Trie-based prefiltering')
        seqs, filter_map = self._prefilter_with_trie(seq_path)
        log_lines.append(
            'Trie-based prefiltering, post-filter num seqs: %d'
            % len(seqs))
    else:
        log_lines.append('No prefix-based prefiltering.')
        # Load the seq path. Right now, cdhit_clusters_from_seqs
        # doesn't support being passed a file path even though the
        # seqs do get written to a fasta file before being passed
        # to cd-hit-est. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        seqs = LoadSeqs(seq_path,
                        moltype=moltype,
                        aligned=False,
                        label_to_name=lambda x: x.split()[0])

    # Get the clusters by running cd-hit-est against the
    # sequence collection
    clusters = cdhit_clusters_from_seqs(
        seqs=seqs, moltype=moltype, params=cd_hit_params)

    if prefix_prefilter_length is not None or trie_prefilter:
        # Expand the clusters of representative sequences back to the
        # full set of sequence ids that were collapsed by the prefilter.
        clusters = self._map_filtered_clusters_to_full_clusters(
            clusters, filter_map)

    if result_path:
        # if the user provided a result_path, write the
        # results to file with one tab-separated line per
        # cluster; the context manager closes the file even if a
        # write fails
        with open(result_path, 'w') as of:
            for i, cluster in enumerate(clusters):
                of.write('%s\t%s\n' % (i, '\t'.join(cluster)))
        result = None
        log_lines.append('Result path: %s' % result_path)
    else:
        # if the user did not provide a result_path, store
        # the clusters in a dict of {otu_id:[seq_ids]}, where
        # otu_id is arbitrary
        result = dict(enumerate(clusters))
        log_lines.append('Result path: None, returned as dict.')

    if log_path:
        # if the user provided a log file path, log the run; the
        # file is closed (and therefore flushed) on leaving the block
        with open(log_path, 'w') as log_file:
            log_file.write('\n'.join([str(self)] + log_lines))

    # return the result (note this is None if the data was
    # written to file)
    return result
def __call__(
    self, seq_path, result_path=None, log_path=None, id_len=0, prefix_prefilter_length=None, trie_prefilter=False
):
    """Returns dict mapping {otu_id:[seq_ids]} for each otu.

    Parameters:
    seq_path: path to file of sequences
    result_path: path to file of results. If specified, dumps the
        result to the desired path instead of returning it.
    log_path: path to log, which includes dump of params.
    id_len: if set, truncates ids to n chars (you don't want this!)
    prefix_prefilter_length: prefilters the sequence collection so
        sequences whose first prefix_prefilter_length characters are
        identical will automatically be grouped into the same OTU [off by
        default, 100 is typically a good value if this filtering is
        desired] -- useful for large sequence collections, when cdhit
        doesn't scale well
    trie_prefilter: prefilter the sequence collection such that all
        sequences which are a prefix of another sequence are clustered
        with the other sequence. Together with cd-hit this is a
        non-heuristic filter that reduces run time a lot. Still a bit
        slower than the prefix_prefilter toggled with
        prefix_prefilter_length.

    If both prefilters are requested, the prefix-based one is used and
    trie_prefilter is deactivated (this is recorded in the log).

    Returns:
    dict of {otu_id:[seq_ids]} with arbitrary integer otu_ids, or None
    if the clusters were written to result_path.
    """
    moltype = DNA
    log_lines = []

    # create the params dict to pass to cd-hit-est -- IS THERE A
    # BETTER WAY TO MAKE self.Params INTO THE params DICT TO PASS
    # TO THE APPLICATION CONTROLLERS?
    cd_hit_params = copy(self.Params)
    del cd_hit_params["Application"]
    del cd_hit_params["Algorithm"]
    cd_hit_params["-d"] = id_len  # turn off id truncation
    cd_hit_params["-g"] = "1"

    if prefix_prefilter_length is not None and trie_prefilter:
        # Only one prefilter can be applied; the prefix-based one wins.
        # The message is a single literal so that no source indentation
        # leaks into the log text.
        log_lines.append("Both prefilters selected. Deactivate trie_prefilter")
        trie_prefilter = False

    if prefix_prefilter_length is not None:
        log_lines.append("Prefix-based prefiltering, prefix length: %d" % prefix_prefilter_length)
        # The helper hands back a sized collection (len() is taken
        # below), so the parser is consumed inside this block and the
        # file handle can be released right away.
        with open(seq_path) as seq_file:
            seqs, filter_map = self._prefilter_exact_prefixes(
                MinimalFastaParser(seq_file), prefix_prefilter_length
            )
        log_lines.append("Prefix-based prefiltering, post-filter num seqs: %d" % len(seqs))
    elif trie_prefilter:
        log_lines.append("Trie-based prefiltering")
        seqs, filter_map = self._prefilter_with_trie(seq_path)
        log_lines.append("Trie-based prefiltering, post-filter num seqs: %d" % len(seqs))
    else:
        log_lines.append("No prefix-based prefiltering.")
        # Load the seq path. Right now, cdhit_clusters_from_seqs
        # doesn't support being passed a file path even though the
        # seqs do get written to a fasta file before being passed
        # to cd-hit-est. We may want to change that in the future
        # to avoid the overhead of loading large sequence collections
        # during this step.
        seqs = LoadSeqs(seq_path, moltype=moltype, aligned=False, label_to_name=lambda x: x.split()[0])

    # Get the clusters by running cd-hit-est against the
    # sequence collection
    clusters = cdhit_clusters_from_seqs(seqs=seqs, moltype=moltype, params=cd_hit_params)

    if prefix_prefilter_length is not None or trie_prefilter:
        # Expand the clusters of representative sequences back to the
        # full set of sequence ids that were collapsed by the prefilter.
        clusters = self._map_filtered_clusters_to_full_clusters(clusters, filter_map)

    if result_path:
        # if the user provided a result_path, write the
        # results to file with one tab-separated line per
        # cluster; the context manager closes the file even if a
        # write fails
        with open(result_path, "w") as of:
            for i, cluster in enumerate(clusters):
                of.write("%s\t%s\n" % (i, "\t".join(cluster)))
        result = None
        log_lines.append("Result path: %s" % result_path)
    else:
        # if the user did not provide a result_path, store
        # the clusters in a dict of {otu_id:[seq_ids]}, where
        # otu_id is arbitrary
        result = dict(enumerate(clusters))
        log_lines.append("Result path: None, returned as dict.")

    if log_path:
        # if the user provided a log file path, log the run; the
        # file is closed (and therefore flushed) on leaving the block
        with open(log_path, "w") as log_file:
            log_file.write("\n".join([str(self)] + log_lines))

    # return the result (note this is None if the data was
    # written to file)
    return result