def pick_ref_contaminants(queries, ref_db_fp, input_fasta_fp, contaminant_similarity, output_dir):
    """Return the query ids that were matched to the contaminant reference.

    The sequences in ``input_fasta_fp`` are searched against ``ref_db_fp``
    through ``get_clusters_from_fasta_filepath`` with new-cluster creation
    suppressed. Every id in ``queries`` that is not reported among the
    failures of that search is returned.

    Parameters
    ----------
    queries : iterable
        Hashable sequence ids to screen.
    ref_db_fp : str
        Passed on as ``subject_fasta_filepath`` (the contaminant reference).
    input_fasta_fp : str
        Passed on as both of the first two positional arguments.
    contaminant_similarity : float
        Passed on as ``percent_ID``.
    output_dir : str
        Passed on as ``output_dir``.

    Returns
    -------
    set
        Members of ``queries`` that are absent from the failures list.
    """
    search_options = dict(
        percent_ID=contaminant_similarity,
        max_accepts=1,
        max_rejects=8,
        stepwords=8,
        word_length=8,
        optimal=False,
        exact=False,
        suppress_sort=False,
        output_dir=output_dir,
        enable_rev_strand_matching=False,
        subject_fasta_filepath=ref_db_fp,
        suppress_new_clusters=True,
        return_cluster_maps=True,
        stable_sort=False,
        save_uc_files=True,
        HALT_EXEC=False)

    # Only the middle element (the failures) is needed here.
    # NOTE(review): failures presumably holds the ids that did NOT match any
    # reference sequence -- confirm against get_clusters_from_fasta_filepath.
    _, failures, _ = get_clusters_from_fasta_filepath(
        input_fasta_fp, input_fasta_fp, **search_options)

    # Whatever did not fail the reference search is treated as a contaminant.
    return set(queries).difference(set(failures))
    def test_get_clusters_from_fasta_filepath_rev_strand_match(self):
        """Check clustering of a sequence and its reverse complement.

        The same fixture file is clustered twice with percent_ID=0.90, first
        with ``enable_rev_strand_matching`` off and then with it on, and the
        returned (clusters, failures, new seeds) tuple is compared with the
        expectation for each setting.
        """
        scenarios = [
            # strand matching off: seq and its rc end up in separate clusters
            (False,
             [['uclust_test_seqs_0'], ['uclust_test_seqs_0_rc']],
             [],
             ['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']),
            # strand matching on: seq and its rc share a single cluster
            (True,
             [['uclust_test_seqs_0', 'uclust_test_seqs_0_rc']],
             [],
             ['uclust_test_seqs_0']),
        ]

        for match_rev, want_clusters, want_failures, want_seeds in scenarios:
            observed = get_clusters_from_fasta_filepath(
                self.tmp_raw_dna_seqs_rc_filepath,
                original_fasta_path=None,
                save_uc_files=False,
                percent_ID=0.90,
                enable_rev_strand_matching=match_rev)

            # Ordering is not part of the contract being tested, so both the
            # expectations and the three result lists are sorted in place.
            for wanted in (want_clusters, want_failures, want_seeds):
                wanted.sort()
            for position in range(3):
                observed[position].sort()

            self.assertEqual(
                observed, (want_clusters, want_failures, want_seeds))
    def test_get_clusters_from_fasta_filepath(self):
        """Check the OTU lists returned for the unsorted fasta fixture.

        Clusters ``self.tmp_unsorted_fasta_filepath`` with percent_ID=0.90
        and compares the returned three-part result with the expected
        cluster, failure and new-seed lists.
        """
        observed = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            percent_ID=0.90,
            save_uc_files=False)

        # NOTE(review): the expected_* names are not bound inside this
        # method; presumably they are module-level fixtures -- confirm.
        wanted = (expected_cluster_list,
                  expected_failure_list,
                  expected_new_seed_list)

        # Sort both sides in place so the comparison ignores ordering.
        for fixture in wanted:
            fixture.sort()
        for position in range(3):
            observed[position].sort()

        self.assertEqual(observed, wanted)
    def test_get_clusters_from_fasta_filepath_optimal(self):
        """Check the OTU lists returned when ``optimal=True`` is passed.

        As the original author noted, there is no fixture yet on which the
        optimal flag changes the outcome, so this only confirms that the
        call succeeds and yields the same expected lists.
        """
        observed = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            percent_ID=0.90,
            optimal=True)

        # NOTE(review): the expected_* names are not bound inside this
        # method; presumably they are module-level fixtures -- confirm.
        wanted = (expected_cluster_list,
                  expected_failure_list,
                  expected_new_seed_list)

        # Sort both sides in place so the comparison ignores ordering.
        for fixture in wanted:
            fixture.sort()
        for position in range(3):
            observed[position].sort()

        self.assertEqual(observed, wanted)
    def test_get_clusters_from_fasta_filepath_suppress_sort(self):
        """Check the OTU lists returned when ``suppress_sort=True`` is passed.

        Clusters ``self.tmp_unsorted_fasta_filepath`` with percent_ID=0.90
        and sorting suppressed, then compares the returned three-part result
        with the expected cluster, failure and new-seed lists.
        """
        # NOTE(review): this local list is never read below; the assertion
        # uses the expected_* names instead -- confirm which was intended.
        expected = [
            ['uclust_test_seqs_0'],
            ['uclust_test_seqs_1'],
            ['uclust_test_seqs_2'],
            ['uclust_test_seqs_3'],
            ['uclust_test_seqs_4'],
            ['uclust_test_seqs_5'],
            ['uclust_test_seqs_6', 'uclust_test_seqs_8'],
            ['uclust_test_seqs_7'],
            ['uclust_test_seqs_9'],
        ]

        observed = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            percent_ID=0.90,
            suppress_sort=True,
            save_uc_files=False)

        # NOTE(review): the expected_* names are not bound inside this
        # method; presumably they are module-level fixtures -- confirm.
        wanted = (expected_cluster_list,
                  expected_failure_list,
                  expected_new_seed_list)

        # Sort both sides in place so the comparison ignores ordering.
        for fixture in wanted:
            fixture.sort()
        for position in range(3):
            observed[position].sort()

        self.assertEqual(observed, wanted)
    def test_get_clusters_from_fasta_filepath_extending_reference_db(self):
        """Check clustering against a reference db with new clusters allowed.

        The unsorted fasta fixture is clustered against
        ``self.ref_dna_seqs_fp`` with ``suppress_new_clusters=False`` and
        reverse strand matching enabled; the result is compared with the
        ``ref_test_*2`` fixtures held on the test case.
        """
        observed = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            max_accepts=7,
            max_rejects=12,
            percent_ID=0.90,
            subject_fasta_filepath=self.ref_dna_seqs_fp,
            suppress_new_clusters=False,
            enable_rev_strand_matching=True,
            HALT_EXEC=False,
            save_uc_files=False)

        # Sort the fixtures and the three result lists in place so that the
        # comparison does not depend on ordering.
        for attr in ('ref_test_clusters2',
                     'ref_test_failures2',
                     'ref_test_new_seeds2'):
            getattr(self, attr).sort()
        for position in range(3):
            observed[position].sort()

        wanted = (self.ref_test_clusters2,
                  self.ref_test_failures2,
                  self.ref_test_new_seeds2)
        self.assertEqual(observed, wanted)
    def test_get_clusters_from_fasta_filepath_reference_db_only(self):
        """Check clustering against a reference db with new clusters suppressed.

        The unsorted fasta fixture is clustered against
        ``self.ref_dna_seqs_fp`` with ``suppress_new_clusters=True``; the
        result is compared with the ``ref_test_*1`` fixtures held on the
        test case.
        """
        observed = get_clusters_from_fasta_filepath(
            self.tmp_unsorted_fasta_filepath,
            original_fasta_path=None,
            save_uc_files=False,
            max_accepts=7,
            max_rejects=12,
            percent_ID=0.90,
            subject_fasta_filepath=self.ref_dna_seqs_fp,
            suppress_new_clusters=True,
            HALT_EXEC=False)

        # Sort the fixtures and the three result lists in place so that the
        # comparison does not depend on ordering.
        for attr in ('ref_test_clusters1',
                     'ref_test_failures1',
                     'ref_test_new_seeds1'):
            getattr(self, attr).sort()
        for position in range(3):
            observed[position].sort()

        wanted = (self.ref_test_clusters1,
                  self.ref_test_failures1,
                  self.ref_test_new_seeds1)
        self.assertEqual(observed, wanted)
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """
    Attempts to select true barcodes from set of barcodes
    i.e. removes barcodes that might be artifacts
    due to sequencing errors.
    Uses uclust to cluster the barcodes at the given similarity
    threshold and keeps the third element of the clustering result
    (presumably the cluster seeds -- confirm against
    get_clusters_from_fasta_filepath).
    Parameters
    ----------
    rand_bcs: list
        random barcodes; duplicates are dropped before clustering.
    unique_threshold: float
        passed to the clustering call as ``percent_ID``.
    Returns
    ----------
    unique_rand_bcs: set
        set of unique random barcodes.
    """
    # NOTE(review): a second definition of this function name appears later
    # in this file and would shadow this one -- confirm which should remain.
    import os

    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(dir=temp_dir,
                                            prefix='tmp',
                                            suffix='.fas')
    try:
        # mkstemp hands back an already-open OS-level descriptor; wrapping it
        # with fdopen (instead of re-opening the path) ensures it is closed
        # rather than leaked.
        with os.fdopen(fasta_fd, 'w') as fasta_tempfile:
            # Each barcode serves as both the record id and the sequence.
            for rand_bc in set(rand_bcs):
                fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

        _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
            fasta_tempfile_name,
            original_fasta_path=None,
            percent_ID=unique_threshold,
            save_uc_files=False,
            output_dir=temp_dir)
    finally:
        # Remove the temporary fasta file even when writing or clustering
        # raises, so failed runs do not litter the temp directory.
        remove_files([fasta_tempfile_name])

    return set(unique_rand_bcs)
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """
    Attempts to select true barcodes from set of barcodes
    i.e. removes barcodes that might be artifacts
    due to sequencing errors.
    Uses uclust to cluster the barcodes at the given similarity
    threshold and keeps the third element of the clustering result
    (presumably the cluster seeds -- confirm against
    get_clusters_from_fasta_filepath).
    Parameters
    ----------
    rand_bcs: list
        random barcodes; duplicates are dropped before clustering.
    unique_threshold: float
        passed to the clustering call as ``percent_ID``.
    Returns
    ----------
    unique_rand_bcs: set
        set of unique random barcodes.
    """
    # NOTE(review): another definition of this function name appears earlier
    # in this file; this later one would shadow it -- confirm which to keep.
    import os

    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(
        dir=temp_dir, prefix='tmp', suffix='.fas')
    try:
        # mkstemp hands back an already-open OS-level descriptor; wrapping it
        # with fdopen (instead of re-opening the path) ensures it is closed
        # rather than leaked.
        with os.fdopen(fasta_fd, 'w') as fasta_tempfile:
            # Each barcode serves as both the record id and the sequence.
            for rand_bc in set(rand_bcs):
                fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

        _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
            fasta_tempfile_name,
            original_fasta_path=None,
            percent_ID=unique_threshold,
            save_uc_files=False,
            output_dir=temp_dir)
    finally:
        # Remove the temporary fasta file even when writing or clustering
        # raises, so failed runs do not litter the temp directory.
        remove_files([fasta_tempfile_name])

    return set(unique_rand_bcs)