def pick_ref_contaminants(queries, ref_db_fp, input_fasta_fp,
                          contaminant_similarity, output_dir):
    """Return the query ids that uclust does not report as failures.

    The input FASTA is searched against the contaminant reference database
    with new-cluster creation suppressed; every id in ``queries`` that is
    absent from the reported failures is returned.

    Parameters
    ----------
    queries : iterable
        Candidate ids (must be hashable).
    ref_db_fp : str
        Contaminant reference FASTA, passed as ``subject_fasta_filepath``.
    input_fasta_fp : str
        FASTA file to search; passed as both the first and the second
        positional argument of get_clusters_from_fasta_filepath.
    contaminant_similarity : float
        Passed to uclust as ``percent_ID``.
    output_dir : str
        Passed through as ``output_dir`` (``save_uc_files`` is enabled).

    Returns
    -------
    set
        ``set(queries)`` minus the failures reported by the search.
    """
    # Search settings for the reference-only run against the contaminant DB.
    uclust_options = dict(
        percent_ID=contaminant_similarity,
        max_accepts=1,
        max_rejects=8,
        stepwords=8,
        word_length=8,
        optimal=False,
        exact=False,
        suppress_sort=False,
        output_dir=output_dir,
        enable_rev_strand_matching=False,
        subject_fasta_filepath=ref_db_fp,
        suppress_new_clusters=True,
        return_cluster_maps=True,
        stable_sort=False,
        save_uc_files=True,
        HALT_EXEC=False,
    )
    _clusters, unmatched, _seeds = get_clusters_from_fasta_filepath(
        input_fasta_fp, input_fasta_fp, **uclust_options)

    # Whatever is not listed as a failure is treated as a contaminant.
    return set(queries) - set(unmatched)
def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
    """A sequence and its reverse complement cluster only with rev matching."""
    scenarios = (
        # (enable_rev_strand_matching, clusters, failures, new seeds)
        # Disabled: the sequence and its reverse complement stay apart.
        (False,
         [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']],
         [],
         ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']),
        # Enabled: both records fall into a single cluster.
        (True,
         [['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']],
         [],
         ['uclust_test_seqs_0']),
    )

    for rev_matching, want_clusters, want_failures, want_seeds in scenarios:
        observed = get_clusters_from_fasta_filepath(
            self.tmp_raw_dna_seqs_rc_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            percent_ID=0.90,
            enable_rev_strand_matching=rev_matching)

        # Sort both sides in place so the comparison ignores ordering.
        wanted = (want_clusters, want_failures, want_seeds)
        for part in wanted:
            part.sort()
        for position in range(3):
            observed[position].sort()

        self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath(self):
    """Clustering the unsorted FASTA file gives the expected OTU lists."""
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        percent_ID=0.90,
        save_uc_files=False)

    # The expected_* lists are not defined in this function; presumably
    # they are module-level fixtures -- confirm against the full file.
    wanted = (expected_cluster_list,
              expected_failure_list,
              expected_new_seed_list)

    # Sort both sides in place so the comparison ignores ordering.
    for part in wanted:
        part.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath_optimal(self):
    """Clustering with ``optimal=True`` runs and gives the expected lists."""
    # TODO: build a small data set on which ``optimal`` actually changes the
    # outcome; for now this only checks that the call succeeds with the
    # option switched on.
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        save_uc_files=False,
        percent_ID=0.90,
        optimal=True)

    # The expected_* lists are not defined in this function; presumably
    # they are module-level fixtures -- confirm against the full file.
    wanted = (expected_cluster_list,
              expected_failure_list,
              expected_new_seed_list)

    # Sort both sides in place so the comparison ignores ordering.
    for part in wanted:
        part.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath_suppress_sort(self):
    """Clustering with ``suppress_sort=True`` gives the expected lists."""
    # NOTE(review): this list is built but never compared against; the
    # assertion below uses the expected_* names instead. Confirm which
    # expectation is intended for the suppress_sort case.
    expected = [
        ['uclust_test_seqs_0'],
        ['uclust_test_seqs_1'],
        ['uclust_test_seqs_2'],
        ['uclust_test_seqs_3'],
        ['uclust_test_seqs_4'],
        ['uclust_test_seqs_5'],
        ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
        ['uclust_test_seqs_7'],
        ['uclust_test_seqs_9'],
    ]

    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        percent_ID=0.90,
        suppress_sort=True,
        save_uc_files=False)

    # The expected_* lists are not defined in this function; presumably
    # they are module-level fixtures -- confirm against the full file.
    wanted = (expected_cluster_list,
              expected_failure_list,
              expected_new_seed_list)

    # Sort both sides in place so the comparison ignores ordering.
    for part in wanted:
        part.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
    """Reference clustering that may also open new clusters is correct."""
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        max_accepts=7,
        max_rejects=12,
        percent_ID=0.90,
        subject_fasta_filepath=self.ref_dna_seqs_fp,
        suppress_new_clusters=False,
        enable_rev_strand_matching=True,
        HALT_EXEC=False,
        save_uc_files=False)

    wanted = (self.ref_test_clusters2,
              self.ref_test_failures2,
              self.ref_test_new_seeds2)

    # Sort both sides in place so the comparison ignores ordering.
    for part in wanted:
        part.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def test_get_clusters_from_fasta_filepath_reference_db_only(self):
    """Clustering against the reference database only is correct."""
    observed = get_clusters_from_fasta_filepath(
        self.tmp_unsorted_fasta_filepath,
        original_fasta_path=None,
        save_uc_files=False,
        max_accepts=7,
        max_rejects=12,
        percent_ID=0.90,
        subject_fasta_filepath=self.ref_dna_seqs_fp,
        suppress_new_clusters=True,
        HALT_EXEC=False)

    wanted = (self.ref_test_clusters1,
              self.ref_test_failures1,
              self.ref_test_new_seeds1)

    # Sort both sides in place so the comparison ignores ordering.
    for part in wanted:
        part.sort()
    for position in range(3):
        observed[position].sort()

    self.assertEqual(observed, wanted)
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """Select likely-true random barcodes by clustering them with uclust.

    Barcodes that are probably artifacts of sequencing errors are removed:
    every distinct barcode is written to a temporary FASTA file (the
    barcode itself is used as the record id), the file is clustered at
    ``unique_threshold`` and the third value returned by
    get_clusters_from_fasta_filepath is kept.

    Parameters
    ----------
    rand_bcs : list
        Random barcode sequences; duplicates are collapsed before
        clustering.
    unique_threshold : float
        Similarity threshold, passed to uclust as ``percent_ID``.

    Returns
    -------
    unique_rand_bcs : set
        Set of unique random barcodes (presumably the cluster seed ids --
        confirm against get_clusters_from_fasta_filepath).
    """
    # Local import: the file's import block is not visible from this block.
    import os

    rand_bcs = set(rand_bcs)
    if not rand_bcs:
        # Nothing to cluster, so skip the temp file and the uclust call.
        return set()

    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(
        dir=temp_dir, prefix='tmp', suffix='.fas')
    try:
        # Wrap the descriptor returned by mkstemp so that it is closed
        # together with the file object instead of being leaked.
        with os.fdopen(fasta_fd, 'w') as fasta_tempfile:
            for rand_bc in rand_bcs:
                fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

        _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
            fasta_tempfile_name,
            original_fasta_path=None,
            percent_ID=unique_threshold,
            save_uc_files=False,
            output_dir=temp_dir)
    finally:
        # Remove the temporary FASTA file even when clustering fails.
        remove_files([fasta_tempfile_name])

    return set(unique_rand_bcs)
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """Select likely-true random barcodes by clustering them with uclust.

    Barcodes that are probably artifacts of sequencing errors are removed:
    every distinct barcode is written to a temporary FASTA file (the
    barcode itself is used as the record id), the file is clustered at
    ``unique_threshold`` and the third value returned by
    get_clusters_from_fasta_filepath is kept.

    Parameters
    ----------
    rand_bcs : list
        Random barcode sequences; duplicates are collapsed before
        clustering.
    unique_threshold : float
        Similarity threshold, passed to uclust as ``percent_ID``.

    Returns
    -------
    unique_rand_bcs : set
        Set of unique random barcodes (presumably the cluster seed ids --
        confirm against get_clusters_from_fasta_filepath).
    """
    # Local import: the file's import block is not visible from this block.
    import os

    rand_bcs = set(rand_bcs)
    if not rand_bcs:
        # Nothing to cluster, so skip the temp file and the uclust call.
        return set()

    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(
        dir=temp_dir, prefix='tmp', suffix='.fas')
    try:
        # Wrap the descriptor returned by mkstemp so that it is closed
        # together with the file object instead of being leaked.
        with os.fdopen(fasta_fd, 'w') as fasta_tempfile:
            for rand_bc in rand_bcs:
                fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

        _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
            fasta_tempfile_name,
            original_fasta_path=None,
            percent_ID=unique_threshold,
            save_uc_files=False,
            output_dir=temp_dir)
    finally:
        # Remove the temporary FASTA file even when clustering fails.
        remove_files([fasta_tempfile_name])

    return set(unique_rand_bcs)