def test_get_clusters_from_fasta_filepath_extending_reference_db(self): """ Correct clusters when clustering against db and adding new clusters """ clusters_res = get_clusters_from_fasta_filepath( self.tmp_unsorted_fasta_filepath, original_fasta_path=None, max_accepts=7, max_rejects=12, percent_ID=0.90, subject_fasta_filepath=self.ref_dna_seqs_fp, suppress_new_clusters=False, enable_rev_strand_matching=True, HALT_EXEC=False, save_uc_files=False) self.ref_test_clusters2.sort() self.ref_test_failures2.sort() self.ref_test_new_seeds2.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (self.ref_test_clusters2, self.ref_test_failures2, self.ref_test_new_seeds2))
def test_get_clusters_from_fasta_filepath_rev_strand_match(self): """ Test OTUs from filepath functions with rev strand match """ # seq and its rc don't cluster when enable_rev_strand_matching = False expected_cluster_list = [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']] expected_failure_list = [] expected_new_seed_list = [ 'uclust_test_seqs_0', 'uclust_test_seqs_0_rc' ] clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath, original_fasta_path=None, save_uc_files=False, percent_ID=0.90, enable_rev_strand_matching=False) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list)) # seq and its rc cluster when enable_rev_strand_matching = False expected_cluster_list = [[ 'uclust_test_seqs_0', 'uclust_test_seqs_0_rc' ]] expected_failure_list = [] expected_new_seed_list = ['uclust_test_seqs_0'] clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath, original_fasta_path=None, save_uc_files=False, percent_ID=0.90, enable_rev_strand_matching=True) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_rev_strand_match(self): """ Test OTUs from filepath functions with rev strand match """ # seq and its rc don't cluster when enable_rev_strand_matching = False expected_cluster_list = [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']] expected_failure_list = [] expected_new_seed_list = [ 'uclust_test_seqs_0', 'uclust_test_seqs_0_rc'] clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath, original_fasta_path=None, save_uc_files=False, percent_ID=0.90, enable_rev_strand_matching=False) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list)) # seq and its rc cluster when enable_rev_strand_matching = False expected_cluster_list = [ ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']] expected_failure_list = [] expected_new_seed_list = ['uclust_test_seqs_0'] clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_raw_dna_seqs_rc_filepath, original_fasta_path=None, save_uc_files=False, percent_ID=0.90, enable_rev_strand_matching=True) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list))
    def test_get_clusters_from_fasta_filepath(self):
        """ Tests for return of lists of OTUs from given fasta filepath
        """
        clusters_res = \
            get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
                                             original_fasta_path=None,
                                             percent_ID=0.90,
                                             save_uc_files=False)
        # NOTE(review): expected_cluster_list, expected_failure_list and
        # expected_new_seed_list are never defined in this method (and are
        # not self attributes here), so this test raises NameError as
        # written. The expected-value definitions appear to have been lost;
        # restore them before relying on this test.
        expected_cluster_list.sort()
        expected_failure_list.sort()
        expected_new_seed_list.sort()
        # sort the observed lists too, so the comparison is
        # order-independent
        clusters_res[0].sort()
        clusters_res[1].sort()
        clusters_res[2].sort()
        self.assertEqual(clusters_res, (expected_cluster_list,
                                        expected_failure_list,
                                        expected_new_seed_list))
    def test_get_clusters_from_fasta_filepath(self):
        """ Tests for return of lists of OTUs from given fasta filepath
        """
        clusters_res = \
            get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath,
                                             original_fasta_path = None,
                                             percent_ID = 0.90,
                                             save_uc_files=False)
        # NOTE(review): expected_cluster_list, expected_failure_list and
        # expected_new_seed_list are never defined in this method (and are
        # not self attributes here), so this test raises NameError as
        # written. The expected-value definitions appear to have been lost;
        # restore them before relying on this test.
        expected_cluster_list.sort()
        expected_failure_list.sort()
        expected_new_seed_list.sort()
        # sort the observed lists too, so the comparison is
        # order-independent
        clusters_res[0].sort()
        clusters_res[1].sort()
        clusters_res[2].sort()
        self.assertEqual(clusters_res,(expected_cluster_list,
                                       expected_failure_list,
                                       expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_optimal(self): """ Test OTUs from filepath functions with optimal """ # need to compile a small test where optimal has an affect -- # this currently is only testing that we don't get a failure with # optimal clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath, percent_ID = 0.90, optimal = True) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_optimal(self): """ Test OTUs from filepath functions with optimal """ # need to compile a small test where optimal has an affect -- # this currently is only testing that we don't get a failure with # optimal clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath, original_fasta_path = None, save_uc_files=False, percent_ID = 0.90, optimal = True) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res,(expected_cluster_list, expected_failure_list, expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_suppress_sort(self): """ Test OTUs from filepath functions with suppress sort """ expected = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'], ['uclust_test_seqs_2'], ['uclust_test_seqs_3'], ['uclust_test_seqs_4'], ['uclust_test_seqs_5'], ['uclust_test_seqs_6', 'uclust_test_seqs_8'], ['uclust_test_seqs_7'], ['uclust_test_seqs_9']] clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath, percent_ID = 0.90, suppress_sort = True) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (expected_cluster_list, expected_failure_list, expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_suppress_sort(self): """ Test OTUs from filepath functions with suppress sort """ expected = [['uclust_test_seqs_0'], ['uclust_test_seqs_1'], ['uclust_test_seqs_2'], ['uclust_test_seqs_3'], ['uclust_test_seqs_4'], ['uclust_test_seqs_5'], ['uclust_test_seqs_6', 'uclust_test_seqs_8'], ['uclust_test_seqs_7'], ['uclust_test_seqs_9']] clusters_res = \ get_clusters_from_fasta_filepath(self.tmp_unsorted_fasta_filepath, original_fasta_path = None, percent_ID = 0.90, suppress_sort = True, save_uc_files=False) expected_cluster_list.sort() expected_failure_list.sort() expected_new_seed_list.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res,(expected_cluster_list, expected_failure_list, expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_reference_db_only(self): """ Correct clusters returned when clustering against a database only """ clusters_res = get_clusters_from_fasta_filepath( self.tmp_unsorted_fasta_filepath, max_accepts=7, max_rejects=12, percent_ID=0.90, subject_fasta_filepath=self.ref_dna_seqs_fp, suppress_new_clusters=True, HALT_EXEC=False) self.ref_test_clusters1.sort() self.ref_test_failures1.sort() self.ref_test_new_seeds1.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res, (self.ref_test_clusters1, self.ref_test_failures1, self.ref_test_new_seeds1))
def test_get_clusters_from_fasta_filepath_extending_reference_db(self): """ Correct clusters when clustering against db and adding new clusters """ clusters_res = get_clusters_from_fasta_filepath( self.tmp_unsorted_fasta_filepath, original_fasta_path = None, max_accepts=7,max_rejects=12, percent_ID = 0.90, subject_fasta_filepath=self.ref_dna_seqs_fp, suppress_new_clusters=False,enable_rev_strand_matching=True, HALT_EXEC=False, save_uc_files=False) self.ref_test_clusters2.sort() self.ref_test_failures2.sort() self.ref_test_new_seeds2.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res,(self.ref_test_clusters2, self.ref_test_failures2, self.ref_test_new_seeds2))
def test_get_clusters_from_fasta_filepath_reference_db_only(self): """ Correct clusters returned when clustering against a database only """ clusters_res = get_clusters_from_fasta_filepath( self.tmp_unsorted_fasta_filepath, original_fasta_path = None, save_uc_files=False, max_accepts=7,max_rejects=12, percent_ID = 0.90, subject_fasta_filepath=self.ref_dna_seqs_fp, suppress_new_clusters=True, HALT_EXEC=False) self.ref_test_clusters1.sort() self.ref_test_failures1.sort() self.ref_test_new_seeds1.sort() clusters_res[0].sort() clusters_res[1].sort() clusters_res[2].sort() self.assertEqual(clusters_res,(self.ref_test_clusters1, self.ref_test_failures1, self.ref_test_new_seeds1))
def __call__( self, seq_fp, refseqs_fp, next_new_cluster_number=None, new_cluster_identifier=None, result_path=None, log_path=None, failure_path=None, HALT_EXEC=False, ): original_fasta_path = seq_fp prefilter_identical_sequences = self.Params["prefilter_identical_sequences"] if new_cluster_identifier: self.Params["new_cluster_identifier"] = new_cluster_identifier if next_new_cluster_number != None: self.Params["next_new_cluster_number"] = next_new_cluster_number self.files_to_remove = [] if self.Params["presort_by_abundance"]: # seq path will become the temporary sorted sequences # filepath, to be cleaned up after the run seq_fp = self._presort_by_abundance(seq_fp) self.files_to_remove.append(seq_fp) # Collapse idetical sequences to a new file if prefilter_identical_sequences: exact_match_id_map, seq_fp = self._apply_identical_sequences_prefilter(seq_fp) # perform the clustering cluster_map, failures, new_seeds = get_clusters_from_fasta_filepath( seq_fp, original_fasta_path, subject_fasta_filepath=refseqs_fp, percent_ID=self.Params["Similarity"], enable_rev_strand_matching=self.Params["enable_rev_strand_matching"], max_accepts=self.Params["max_accepts"], max_rejects=self.Params["max_rejects"], stepwords=self.Params["stepwords"], word_length=self.Params["word_length"], suppress_new_clusters=self.Params["suppress_new_clusters"], optimal=self.Params["optimal"], exact=self.Params["exact"], suppress_sort=self.Params["suppress_sort"], return_cluster_maps=True, stable_sort=self.Params["stable_sort"], save_uc_files=self.Params["save_uc_files"], output_dir=self.Params["output_dir"], HALT_EXEC=HALT_EXEC, ) # expand identical sequences to create full OTU map if prefilter_identical_sequences: # expand the clusters (while retaining the names of # the clusters so we know which are new OTUs and # which are reference OTUs) cluster_names = cluster_map.keys() clusters = [cluster_map[c] for c in cluster_names] clusters = self._map_filtered_clusters_to_full_clusters(clusters, 
exact_match_id_map) cluster_map = dict(zip(cluster_names, clusters)) # expand failures temp_failures = [] for fa in failures: temp_failures.extend(exact_match_id_map[fa]) failures = temp_failures self._rename_clusters(cluster_map, new_seeds) # clean up any temp files that were created remove_files(self.files_to_remove) log_lines = [] log_lines.append("Reference seqs:%s" % refseqs_fp) log_lines.append("Num OTUs:%d" % len(cluster_map)) log_lines.append("Num new OTUs:%d" % len(new_seeds)) log_lines.append("Num failures:%d" % len(failures)) cluster_map = cluster_map.items() result = self._prepare_results(result_path, cluster_map, log_lines) if log_path: self._write_log(log_path, log_lines) if failure_path: self._write_failures(failure_path, failures) # return the result (note this is None if the data was # written to file) return result
def __call__(self, seq_path, result_path=None, log_path=None, HALT_EXEC=False): """Returns dict mapping {otu_id:[seq_ids]} for each otu. Parameters: seq_path: path to file of sequences result_path: path to file of results. If specified, dumps the result to the desired path instead of returning it. log_path: path to log, which includes dump of params. """ prefilter_identical_sequences = self.Params["prefilter_identical_sequences"] original_fasta_path = seq_path self.files_to_remove = [] if self.Params["presort_by_abundance"]: # seq path will become the temporary sorted sequences # filepath, to be cleaned up after the run seq_path = self._presort_by_abundance(seq_path) self.files_to_remove.append(seq_path) # Collapse idetical sequences to a new file if prefilter_identical_sequences: exact_match_id_map, seq_path = self._apply_identical_sequences_prefilter(seq_path) # perform the clustering clusters, failures, seeds = get_clusters_from_fasta_filepath( seq_path, original_fasta_path, percent_ID=self.Params["Similarity"], optimal=self.Params["optimal"], exact=self.Params["exact"], suppress_sort=self.Params["suppress_sort"], enable_rev_strand_matching=self.Params["enable_rev_strand_matching"], max_accepts=self.Params["max_accepts"], max_rejects=self.Params["max_rejects"], stepwords=self.Params["stepwords"], word_length=self.Params["word_length"], stable_sort=self.Params["stable_sort"], save_uc_files=self.Params["save_uc_files"], output_dir=self.Params["output_dir"], HALT_EXEC=HALT_EXEC, ) # clean up any temp files that were created remove_files(self.files_to_remove) log_lines = [] log_lines.append("Num OTUs:%d" % len(clusters)) # expand identical sequences to create full OTU map if prefilter_identical_sequences: clusters = self._map_filtered_clusters_to_full_clusters(clusters, exact_match_id_map) otu_id_prefix = self.Params["new_cluster_identifier"] if otu_id_prefix == None: clusters = enumerate(clusters) else: clusters = [("%s%d" % (otu_id_prefix, i), c) for i, c in 
enumerate(clusters)] result = self._prepare_results(result_path, clusters, log_lines) if log_path: self._write_log(log_path, log_lines) # return the result (note this is None if the data was # written to file) return result