def test_get_clusters_from_fasta_filepath_suppress_sort(self):
    """Test OTUs from filepath functions with suppress sort.

    Clusters ``self.tmp_unsorted_fasta_filepath`` with ``suppress_sort=True``
    and compares the three returned lists, after sorting each of them, with
    the ``expected_cluster_list``, ``expected_failure_list`` and
    ``expected_new_seed_list`` fixtures defined outside this method.
    """
    # NOTE(review): a local ``expected`` list was previously built here but
    # never compared against anything, so it has been dropped. Confirm
    # whether suppress_sort should get its own dedicated expectation.
    clusters_res = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        percent_ID=0.90,
        suppress_sort=True,
        save_uc_files=False,
    )

    # Sort both sides in place so the comparison does not depend on order.
    expected_cluster_list.sort()
    expected_failure_list.sort()
    expected_new_seed_list.sort()
    clusters_res[0].sort()
    clusters_res[1].sort()
    clusters_res[2].sort()

    self.assertEqual(
        clusters_res,
        (expected_cluster_list, expected_failure_list, expected_new_seed_list),
    )
def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
    """Test OTUs from filepath functions with rev strand match.

    The same input file is clustered twice, first with reverse strand
    matching disabled and then with it enabled; each sorted result is
    compared with the expected clusters, failures and new seeds.
    """
    scenarios = [
        # Matching disabled: the seq and its rc are expected in two clusters.
        (
            False,
            [["uclust_test_seqs_0"], ["uclust_test_seqs_0_rc"]],
            [],
            ["uclust_test_seqs_0", "uclust_test_seqs_0_rc"],
        ),
        # Matching enabled: the seq and its rc are expected in one cluster.
        (
            True,
            [["uclust_test_seqs_0", "uclust_test_seqs_0_rc"]],
            [],
            ["uclust_test_seqs_0"],
        ),
    ]

    for rev_match, want_clusters, want_failures, want_seeds in scenarios:
        observed = get_clusters_from_fasta_filepath(
            self.tmp_raw_dna_seqs_rc_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            percent_ID=0.90,
            enable_rev_strand_matching=rev_match,
        )

        # Sort expectations and results in place before comparing.
        for wanted in (want_clusters, want_failures, want_seeds):
            wanted.sort()
        for position in range(3):
            observed[position].sort()

        self.assertEqual(observed, (want_clusters, want_failures, want_seeds))
def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
    """ Test OTUs from filepath functions with rev strand match

    Clusters the same file with reverse strand matching off and then on,
    checking the sorted clusters, failures and new seeds each time.
    """

    def cluster_and_check(rev_strand, want_clusters, want_failures,
                          want_seeds):
        # Run one clustering pass and compare its sorted output with the
        # sorted expectations.
        observed = get_clusters_from_fasta_filepath(
            self.tmp_raw_dna_seqs_rc_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            percent_ID=0.90,
            enable_rev_strand_matching=rev_strand)
        want_clusters.sort()
        want_failures.sort()
        want_seeds.sort()
        for index in (0, 1, 2):
            observed[index].sort()
        self.assertEqual(observed,
                         (want_clusters, want_failures, want_seeds))

    # seq and its rc don't cluster when enable_rev_strand_matching = False
    cluster_and_check(
        False,
        [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']],
        [],
        ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc'])

    # seq and its rc cluster when enable_rev_strand_matching = True
    cluster_and_check(
        True,
        [['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']],
        [],
        ['uclust_test_seqs_0'])
def test_get_clusters_from_fasta_filepath(self):
    """Tests for return of lists of OTUs from given fasta filepath.

    The three returned lists and the ``expected_*`` fixtures defined outside
    this method are sorted in place before being compared.
    """
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        percent_ID=0.90,
        save_uc_files=False,
    )

    wanted = (
        expected_cluster_list,
        expected_failure_list,
        expected_new_seed_list,
    )
    for fixture in wanted:
        fixture.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath(self):
    """ Tests for return of lists of OTUs from given fasta filepath

    Both the result lists and the expected_* fixtures (defined outside
    this method) are sorted in place before the comparison.
    """
    clustering_options = dict(
        original_fasta_path=None,
        percent_ID=0.90,
        save_uc_files=False)
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath, **clustering_options)

    # expectations first, then the observed lists
    for fixture in (expected_cluster_list,
                    expected_failure_list,
                    expected_new_seed_list):
        fixture.sort()
    for index in (0, 1, 2):
        observed[index].sort()

    self.assertEqual(observed, (expected_cluster_list,
                                expected_failure_list,
                                expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_optimal(self):
    """ Test OTUs from filepath functions with optimal

    Runs the clustering with optimal=True and compares the sorted result
    with the expected_* fixtures defined outside this method.
    """
    # need to compile a small test where optimal has an effect --
    # this currently is only testing that we don't get a failure with
    # optimal
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        save_uc_files=False,
        percent_ID=0.90,
        optimal=True)

    wanted = (expected_cluster_list,
              expected_failure_list,
              expected_new_seed_list)
    # sort both sides in place so ordering does not matter
    for fixture in wanted:
        fixture.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath_suppress_sort(self):
    """ Test OTUs from filepath functions with suppress sort

    Clusters self.tmp_unsorted_fasta_filepath with suppress_sort=True and
    compares the three returned lists, once sorted, with the
    expected_cluster_list, expected_failure_list and
    expected_new_seed_list fixtures defined outside this method.
    """
    # NOTE(review): an unused local ``expected`` list used to be declared
    # here; it was never asserted against and has been removed. Confirm
    # whether this test needs a suppress_sort-specific expectation.
    clusters_res = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        percent_ID=0.90,
        suppress_sort=True,
        save_uc_files=False)

    # in-place sorts make the comparison independent of ordering
    expected_cluster_list.sort()
    expected_failure_list.sort()
    expected_new_seed_list.sort()
    clusters_res[0].sort()
    clusters_res[1].sort()
    clusters_res[2].sort()

    self.assertEqual(clusters_res, (expected_cluster_list,
                                    expected_failure_list,
                                    expected_new_seed_list))
def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
    """ Correct clusters when clustering against db and adding new clusters

    Clusters against self.ref_dna_seqs_fp with suppress_new_clusters=False
    and compares the sorted result with the self.ref_test_*2 fixtures.
    """
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        max_accepts=7,
        max_rejects=12,
        percent_ID=0.90,
        subject_fasta_filepath=self.ref_dna_seqs_fp,
        suppress_new_clusters=False,
        enable_rev_strand_matching=True,
        HALT_EXEC=False,
        save_uc_files=False)

    wanted = (self.ref_test_clusters2,
              self.ref_test_failures2,
              self.ref_test_new_seeds2)
    # sort fixtures and results in place before comparing
    for fixture in wanted:
        fixture.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath_reference_db_only(self):
    """ Correct clusters returned when clustering against a database only

    Clusters against self.ref_dna_seqs_fp with suppress_new_clusters=True
    and compares the sorted result with the self.ref_test_*1 fixtures.
    """
    clustering_options = dict(
        original_fasta_path=None,
        save_uc_files=False,
        max_accepts=7,
        max_rejects=12,
        percent_ID=0.90,
        subject_fasta_filepath=self.ref_dna_seqs_fp,
        suppress_new_clusters=True,
        HALT_EXEC=False)
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath, **clustering_options)

    wanted = (self.ref_test_clusters1,
              self.ref_test_failures1,
              self.ref_test_new_seeds1)
    # fixtures first, then the observed lists, all sorted in place
    for fixture in wanted:
        fixture.sort()
    for index in (0, 1, 2):
        observed[index].sort()

    self.assertEqual(observed, wanted)
def __call__(self, seq_fp, refseqs_fp, next_new_cluster_number=None,
             new_cluster_identifier=None, result_path=None, log_path=None,
             failure_path=None, HALT_EXEC=False):
    """Cluster the sequences in seq_fp against the seqs in refseqs_fp.

    Parameters:
     seq_fp: path to file of sequences to cluster
     refseqs_fp: path passed to the clustering call as
      subject_fasta_filepath
     next_new_cluster_number: if not None, stored in
      self.Params['next_new_cluster_number'] before clustering
     new_cluster_identifier: if truthy, stored in
      self.Params['new_cluster_identifier'] before clustering
     result_path: forwarded to self._prepare_results
     log_path: if truthy, the log lines are written there with
      self._write_log
     failure_path: if truthy, the failures are written there with
      self._write_failures
     HALT_EXEC: forwarded to get_clusters_from_fasta_filepath

    Returns the value of self._prepare_results (note this is None if
     the data was written to file).
    """
    original_fasta_path = seq_fp

    if new_cluster_identifier:
        self.Params['new_cluster_identifier'] = new_cluster_identifier
    # identity test so that only an omitted value (None) is skipped
    if next_new_cluster_number is not None:
        self.Params['next_new_cluster_number'] = next_new_cluster_number

    if self.Params['presort_by_abundance']:
        # seq path will become the temporary sorted sequences
        # filepath, to be cleaned up after the run
        seq_fp = self._presort_by_abundance(seq_fp)
        files_to_remove = [seq_fp]
    else:
        # create a dummy list of files to clean up
        files_to_remove = []

    # perform the clustering
    cluster_map, failures, new_seeds = get_clusters_from_fasta_filepath(
        seq_fp,
        original_fasta_path,
        subject_fasta_filepath=refseqs_fp,
        percent_ID=self.Params['Similarity'],
        enable_rev_strand_matching=self.Params['enable_rev_strand_matching'],
        max_accepts=self.Params['max_accepts'],
        max_rejects=self.Params['max_rejects'],
        suppress_new_clusters=self.Params['suppress_new_clusters'],
        optimal=self.Params['optimal'],
        exact=self.Params['exact'],
        suppress_sort=self.Params['suppress_sort'],
        return_cluster_maps=True,
        stable_sort=self.Params['stable_sort'],
        save_uc_files=self.Params['save_uc_files'],
        output_dir=self.Params['output_dir'],
        HALT_EXEC=HALT_EXEC)

    self._rename_clusters(cluster_map, new_seeds)

    # clean up any temp files that were created
    remove_files(files_to_remove)

    log_lines = []
    log_lines.append('Reference seqs:%s' % refseqs_fp)
    log_lines.append('Num OTUs:%d' % len(cluster_map))
    log_lines.append('Num new OTUs:%d' % len(new_seeds))
    log_lines.append('Num failures:%d' % len(failures))

    # _prepare_results is handed (key, value) pairs rather than the mapping
    cluster_map = cluster_map.items()
    result = self._prepare_results(result_path, cluster_map, log_lines)

    if log_path:
        self._write_log(log_path, log_lines)

    if failure_path:
        self._write_failures(failure_path, failures)

    # return the result (note this is None if the data was
    # written to file)
    return result
def __call__(self, seq_path, result_path=None, log_path=None,
             HALT_EXEC=False):
    """Returns dict mapping {otu_id:[seq_ids]} for each otu.

    Parameters:
     seq_path: path to file of sequences
     result_path: path to file of results. If specified,
      dumps the result to the desired path instead of returning it.
     log_path: path to log, which includes dump of params.
     HALT_EXEC: forwarded to get_clusters_from_fasta_filepath

    OTU ids are the cluster indices when
     self.Params['new_cluster_identifier'] is None; otherwise each id is
     that identifier followed by the cluster index.
    """
    original_fasta_path = seq_path

    if self.Params['presort_by_abundance']:
        # seq path will become the temporary sorted sequences
        # filepath, to be cleaned up after the run
        seq_path = self._presort_by_abundance(seq_path)
        files_to_remove = [seq_path]
    else:
        # create a dummy list of files to clean up
        files_to_remove = []

    # perform the clustering
    clusters, failures, seeds = get_clusters_from_fasta_filepath(
        seq_path,
        original_fasta_path,
        percent_ID=self.Params['Similarity'],
        optimal=self.Params['optimal'],
        exact=self.Params['exact'],
        suppress_sort=self.Params['suppress_sort'],
        enable_rev_strand_matching=self.Params['enable_rev_strand_matching'],
        max_accepts=self.Params['max_accepts'],
        max_rejects=self.Params['max_rejects'],
        stable_sort=self.Params['stable_sort'],
        save_uc_files=self.Params['save_uc_files'],
        output_dir=self.Params['output_dir'],
        HALT_EXEC=HALT_EXEC)

    # clean up any temp files that were created
    remove_files(files_to_remove)

    log_lines = []
    log_lines.append('Num OTUs:%d' % len(clusters))

    # identity test: only a missing prefix (None) selects bare indices
    otu_id_prefix = self.Params['new_cluster_identifier']
    if otu_id_prefix is None:
        clusters = enumerate(clusters)
    else:
        clusters = [('%s%d' % (otu_id_prefix, i), c)
                    for i, c in enumerate(clusters)]
    result = self._prepare_results(result_path, clusters, log_lines)

    if log_path:
        self._write_log(log_path, log_lines)

    # return the result (note this is None if the data was
    # written to file)
    return result