def tearDown(self):
    """Delete the tracked temporary files, then the tracked directories."""
    # Files go first: some of them may live inside the directories that
    # are deleted afterwards.
    remove_files(set(self.files_to_remove))
    for dir_path in self.dirs_to_remove:
        if not exists(dir_path):
            continue
        rmtree(dir_path)
Example #2
0
    def test_mothur_supported_version(self):
        """mothur is in path and version is supported

        Fails (rather than errors) when the version line printed by mothur
        is missing or cannot be parsed.
        """
        acceptable_version = (1, 25, 0)
        self.assertTrue(
            which("mothur"),
            "mothur not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.",
        )
        # mothur creates a log file; point it at the QIIME temp dir so that
        # it can be removed once the command has run.
        log_file = join(get_qiime_temp_dir(), "mothur.log")
        command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
        stdout, _, _ = qiime_system_call(command)

        # remove log file
        remove_files([log_file], error_on_missing=False)

        try:
            # Expected line shape: "mothur v.X.Y.Z". An empty or unexpected
            # line raises IndexError/ValueError, which is reported below as
            # an unsupported version instead of crashing the test.
            version_string = stdout.strip().split(" ")[1].strip("v.")
            version = tuple(map(int, version_string.split(".")))
            pass_test = version == acceptable_version
        except (ValueError, IndexError):
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported mothur version. %s is required, but running %s."
            % (".".join(map(str, acceptable_version)), version_string),
        )
Example #3
0
 def tearDown(self):
     """Remove the tracked files and the output/input directories."""
     tracked_files = self._files_to_remove
     if tracked_files:
         remove_files(tracked_files)
     # Output directory first, then input directory; a directory that no
     # longer exists is skipped.
     for attr_name in ('output_dir', 'input_dir'):
         dir_path = getattr(self, attr_name)
         if exists(dir_path):
             rmtree(dir_path)
 def test_plot_heatmap(self):
     """plot_heatmap creates a file at the requested path"""
     table = self.otu_table
     observation_ids = table.ids(axis='observation')
     default_axis_ids = table.ids()
     heatmap_fp = self.tmp_heatmap_fpath
     plot_heatmap(table, observation_ids, default_axis_ids,
                  filename=heatmap_fp)
     self.assertEqual(exists(heatmap_fp), True)
     # the plot is only needed for the existence check above
     remove_files(set([heatmap_fp]))
 def tearDown(self):
     """Delete the tracked temporary files, then the tracked directories."""
     unique_files = set(self.files_to_remove)
     remove_files(unique_files)
     # Directories are handled after the files because some of the files
     # may be located inside them.
     for dir_path in self.dirs_to_remove:
         if not exists(dir_path):
             continue
         rmtree(dir_path)
    def test_mothur_supported_version(self):
        """mothur is in path and version is supported

        Fails (rather than errors) when the version line printed by mothur
        is missing or cannot be parsed.
        """
        acceptable_version = (1, 25, 0)
        self.assertTrue(
            which('mothur'),
            "mothur not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.")
        # mothur creates a log file; point it at the QIIME temp dir so that
        # it can be removed once the command has run.
        log_file = join(get_qiime_temp_dir(), 'mothur.log')
        command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
        stdout, _, _ = qiime_system_call(command)

        # remove log file
        remove_files([log_file], error_on_missing=False)

        try:
            # Expected line shape: "mothur v.X.Y.Z". An empty or unexpected
            # line raises IndexError/ValueError, which is reported below as
            # an unsupported version instead of crashing the test.
            version_string = stdout.strip().split(' ')[1].strip('v.')
            version = tuple(map(int, version_string.split('.')))
            pass_test = version == acceptable_version
        except (ValueError, IndexError):
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported mothur version. %s is required, but running %s." %
            ('.'.join(map(str, acceptable_version)), version_string))
 def tearDown(self):
     """Remove the tracked files and the output/input directories."""
     tracked_files = self._files_to_remove
     if tracked_files:
         remove_files(tracked_files)
     # Same order as the attributes are listed: output first, input second.
     for attr_name in ('output_dir', 'input_dir'):
         dir_path = getattr(self, attr_name)
         if exists(dir_path):
             rmtree(dir_path)
    def tearDown(self):
        """Delete temporary files first, then temporary directories."""
        remove_files(self.files_to_remove)

        # Files are deleted before directories because some of the files
        # may be located inside those directories.
        for tmp_dir in self.dirs_to_remove:
            if not exists(tmp_dir):
                continue
            rmtree(tmp_dir)
    def tearDown(self):
        """Delete temporary files first, then temporary directories."""
        remove_files(self.files_to_remove)

        # Directory removal comes last so that files living inside the
        # directories have already been dealt with.
        for tmp_dir in self.dirs_to_remove:
            if not exists(tmp_dir):
                continue
            rmtree(tmp_dir)
Example #10
0
 def tearDown(self):
     """Call disable_timeout(), then delete temporary files and dirs."""
     disable_timeout()
     # Files that are already gone are tolerated.
     remove_files(self.files_to_remove, error_on_missing=False)
     # Directories go last: some of the files may live inside them.
     for tmp_dir in self.dirs_to_remove:
         if not exists(tmp_dir):
             continue
         rmtree(tmp_dir)
Example #11
0
    def tearDown(self):
        """Delete temporary files, the temp directory and the flowgram file."""
        remove_files(self.files_to_remove, False)

        tmp_dir = self.tmpdir
        if tmp_dir:
            rmtree(tmp_dir)

        # The file from init_flowgram_file is only tracked when the
        # tmp_filename attribute has been set on this instance.
        if hasattr(self, "tmp_filename"):
            flowgram_fp = self.tmp_filename
            if exists(flowgram_fp):
                remove(flowgram_fp)
Example #12
0
    def tearDown(self):
        """Delete temporary files, the temp directory and the flowgram file."""
        remove_files(self.files_to_remove, False)

        tmp_dir = self.tmpdir
        if tmp_dir:
            rmtree(tmp_dir)

        # Clean up the file from init_flowgram_file, which exists only if
        # the tmp_filename attribute was set on this instance.
        if hasattr(self, "tmp_filename"):
            flowgram_fp = self.tmp_filename
            if exists(flowgram_fp):
                remove(flowgram_fp)
 def tearDown(self):
     """Call disable_timeout(), then delete temporary files and dirs."""
     disable_timeout()
     # Missing files are not treated as an error here.
     remove_files(self.files_to_remove, error_on_missing=False)
     # Directories are removed after the files they may contain.
     for tmp_dir in self.dirs_to_remove:
         if not exists(tmp_dir):
             continue
         rmtree(tmp_dir)
 def tearDown(self):
     """Close the open test files and delete every file used by the tests"""
     # Close the handles one after another, in the order listed.
     for handle_attr in ('mapping_fp',
                         'fasta_file_no_consensus',
                         'fasta_file_for_consensus_tie_G_C',
                         'fasta_file_for_consensus_unequal_length'):
         getattr(self, handle_attr).close()

     paths_to_delete = [
         self.mapping_fp_name,
         self.fasta_file_no_consensus_name,
         self.fasta_file_for_consensus_tie_G_C_name,
         self.fasta_file_for_consensus_unequal_length_name,
         self.fwd_read_fh_name,
         self.rev_read_fh_name,
     ]
     remove_files(paths_to_delete)
 def tearDown(self):
     """Close the open test files and delete every file used by the tests"""
     # Handles are closed before their paths are handed to remove_files.
     for handle_attr in ('mapping_fp',
                         'fasta_file_no_consensus',
                         'fasta_file_for_consensus_tie_G_C',
                         'fasta_file_for_consensus_unequal_length'):
         getattr(self, handle_attr).close()

     paths_to_delete = [
         self.mapping_fp_name,
         self.fasta_file_no_consensus_name,
         self.fasta_file_for_consensus_tie_G_C_name,
         self.fasta_file_for_consensus_unequal_length_name,
         self.fwd_read_fh_name,
         self.rev_read_fh_name,
     ]
     remove_files(paths_to_delete)
Example #16
0
def swarm_denovo_cluster(seq_path,
                         d=1,
                         threads=1,
                         HALT_EXEC=False):
    """ Function  : launch the Swarm de novo OTU picker

        Parameters: seq_path, filepath to reads
                    d, resolution (must be > 0)
                    threads, number of threads to use (must be > 0)
                    HALT_EXEC, forwarded unchanged to the Swarm
                               application controller

        Return    : clusters, the value returned by calling the Swarm
                    controller on seq_path (documented upstream as a
                    list of lists)

        Raises    : ValueError if seq_path does not exist, or if d or
                    threads is not positive
    """

    # Check sequence file exists
    if not exists(seq_path):
        raise ValueError("%s does not exist" % seq_path)

    # Instantiate the object
    swarm = Swarm(HALT_EXEC=HALT_EXEC)

    # Set the resolution
    if d > 0:
        swarm.Parameters['-d'].on(d)
    else:
        raise ValueError("Resolution -d must be a positive integer.")

    # Set the number of threads
    if threads > 0:
        swarm.Parameters['-t'].on(threads)
    else:
        raise ValueError("Number of threads must be a positive integer.")

    # create temporary file for Swarm OTU-map
    f, tmp_swarm_otumap = mkstemp(prefix='temp_otumap_',
                                  suffix='.swarm')
    close(f)

    # Register the file for removal right away: the final OTU-map is
    # returned as `clusters`, so this temporary copy is never needed
    # once the call has finished.
    swarm.files_to_remove.append(tmp_swarm_otumap)

    try:
        swarm.Parameters['-o'].on(tmp_swarm_otumap)

        # Launch Swarm; the read filepath is passed as the data argument
        clusters = swarm(seq_path)
    finally:
        # Runs on failure too, so the temporary OTU-map is not leaked
        # when the Swarm call raises.
        remove_files(swarm.files_to_remove, error_on_missing=False)

    # Return clusters
    return clusters
Example #17
0
    def tearDown(self):
        """Call disable_timeout(), restore stderr and delete temp data."""
        disable_timeout()

        # put back the stderr object that was saved on the instance
        sys.stderr = self.saved_stderr

        remove_files(self.files_to_remove)
        # Directories are removed last so that files located inside them
        # have already been deleted.
        for tmp_dir in self.dirs_to_remove:
            if not exists(tmp_dir):
                continue
            rmtree(tmp_dir)
Example #18
0
    def remove_intermediate_files(self):
        """Delete the intermediate files derived from the query and db paths.

        Removes `<exec_dir>/<query_NAST><suffix>` for each temporary suffix
        and, when --db_FASTA is on, the files derived from that FASTA path.
        Files that do not exist are ignored.
        """

        def _unquoted(param_value):
            # Parameter values may be wrapped in double quotes.
            return str(param_value).strip('"')

        # tmp files are written in the current dir; the app controller
        # always jumps into the dir specified via exec_dir.
        exec_dir = _unquoted(self.Parameters['--exec_dir'].Value)
        inp_file_name = _unquoted(self.Parameters['--query_NAST'].Value)

        query_prefix = exec_dir + '/' + inp_file_name
        cs_tmp_files = []
        for suffix in (".CPS", ".CPS.CPC", ".CPS_RENAST",
                       ".CPS_RENAST.cidx", ".CPS.CPC.wTaxons", ".cidx"):
            cs_tmp_files.append(query_prefix + suffix)
        remove_files(cs_tmp_files, error_on_missing=False)

        db_param = self.Parameters['--db_NAST']
        if db_param.isOn():
            # The "<db_NAST>.cidx" file is deliberately left in place: other
            # ChimeraSlayer instances running on the same reference set
            # might use it, so the caller should delete it instead.
            nast_db_name = _unquoted(db_param.Value)

        fasta_param = self.Parameters['--db_FASTA']
        if fasta_param.isOn():
            fasta_name = _unquoted(fasta_param.Value)
            blast_db_files = []
            for ext in (".nsq", ".nin", ".nhr", ".cidx"):
                blast_db_files.append(fasta_name + ext)
            remove_files(blast_db_files, error_on_missing=False)
    def remove_intermediate_files(self):
        """Delete the intermediate files derived from the query and db paths.

        Removes `<exec_dir>/<query_NAST><suffix>` for each temporary suffix
        and, when --db_FASTA is on, the files derived from that FASTA path.
        Files that do not exist are ignored.
        """

        def _unquoted(param_value):
            # Parameter values may be wrapped in double quotes.
            return str(param_value).strip('"')

        # tmp files are written in the current dir; the app controller
        # always jumps into the dir specified via exec_dir.
        exec_dir = _unquoted(self.Parameters['--exec_dir'].Value)
        inp_file_name = _unquoted(self.Parameters['--query_NAST'].Value)

        query_prefix = exec_dir + '/' + inp_file_name
        cs_tmp_files = []
        for suffix in (".CPS", ".CPS.CPC", ".CPS_RENAST",
                       ".CPS_RENAST.cidx", ".CPS.CPC.wTaxons", ".cidx"):
            cs_tmp_files.append(query_prefix + suffix)
        remove_files(cs_tmp_files, error_on_missing=False)

        db_param = self.Parameters['--db_NAST']
        if db_param.isOn():
            # The "<db_NAST>.cidx" file is deliberately left in place: other
            # ChimeraSlayer instances running on the same reference set
            # might use it, so the caller should delete it instead.
            nast_db_name = _unquoted(db_param.Value)

        fasta_param = self.Parameters['--db_FASTA']
        if fasta_param.isOn():
            fasta_name = _unquoted(fasta_param.Value)
            blast_db_files = []
            for ext in (".nsq", ".nin", ".nhr", ".cidx"):
                blast_db_files.append(fasta_name + ext)
            remove_files(blast_db_files, error_on_missing=False)
    def test_seq_path(self):
        """ Swarm should raise a ValueError when the sequences
            filepath points to a file that does not exist
        """
        # Reserve a unique path, then delete the file so that the path is
        # known to be missing.
        handle, missing_fp = mkstemp(prefix='temp_reads_', suffix='.fasta')
        close(handle)
        remove_files([missing_fp])

        with self.assertRaises(ValueError):
            swarm_denovo_cluster(seq_path=missing_fp, d=1, threads=1)
Example #21
0
    def tearDown(self):
        """Cancel the alarm, then clean up files, socket and temp dir."""
        signal.alarm(0)  # turn off the alarm

        remove_files(self.files_to_remove, False)
        server = self.server_socket
        if server:
            server.close()

        # give clients time to clean up
        sleep(1)
        if not exists(self.tmp_dir):
            return
        try:
            rmdir(self.tmp_dir)
        except OSError:
            # wait a little longer for the clients; a second failure is
            # allowed to propagate
            sleep(5)
            rmdir(self.tmp_dir)
Example #22
0
    def tearDown(self):
        """Cancel the alarm, then clean up files, socket and temp dir."""
        signal.alarm(0)  # turn off the alarm

        remove_files(self.files_to_remove, False)
        server = self.server_socket
        if server:
            server.close()

        # give clients time to clean up
        sleep(1)
        if not exists(self.tmp_dir):
            return
        try:
            rmdir(self.tmp_dir)
        except OSError:
            # give the clients some more time; if the directory still
            # cannot be removed the error propagates
            sleep(5)
            rmdir(self.tmp_dir)
Example #23
0
    def test_remove_files(self):
        """remove_files deletes files and honours error_on_missing"""
        # Create five temp files. Each handle is closed straight away so
        # that no open descriptor outlives the test (delete=False keeps the
        # file on disk after closing).
        test_filepaths = []
        for _ in range(5):
            tmp_file = NamedTemporaryFile(delete=False)
            tmp_file.close()
            test_filepaths.append(tmp_file.name)

        # should work just fine
        remove_files(test_filepaths)

        # check that an error is raised on trying to remove the files again
        self.assertRaises(OSError, remove_files, test_filepaths)

        # create one more file so that exactly one listed path exists
        extra_tmp = NamedTemporaryFile(delete=False)
        extra_tmp.close()
        extra_file = extra_tmp.name
        test_filepaths.append(extra_file)

        # no error is raised on trying to remove the files
        # (although 5 don't exist)...
        remove_files(test_filepaths, error_on_missing=False)
        # ... and the existing file was removed
        self.assertFalse(exists(extra_file))

        # all six paths are now missing, so an OSError is raised
        self.assertRaises(OSError, remove_files, test_filepaths)

        # now get no error when error_on_missing=False
        remove_files(test_filepaths, error_on_missing=False)
Example #24
0
    def test_remove_files(self):
        """remove_files deletes files and honours error_on_missing"""
        # Create five temp files. Each handle is closed straight away so
        # that no open descriptor outlives the test (delete=False keeps the
        # file on disk after closing).
        test_filepaths = []
        for _ in range(5):
            tmp_file = NamedTemporaryFile(delete=False)
            tmp_file.close()
            test_filepaths.append(tmp_file.name)

        # should work just fine
        remove_files(test_filepaths)

        # check that an error is raised on trying to remove the files again
        self.assertRaises(OSError, remove_files, test_filepaths)

        # create one more file so that exactly one listed path exists
        extra_tmp = NamedTemporaryFile(delete=False)
        extra_tmp.close()
        extra_file = extra_tmp.name
        test_filepaths.append(extra_file)

        # no error is raised on trying to remove the files
        # (although 5 don't exist)...
        remove_files(test_filepaths, error_on_missing=False)
        # ... and the existing file was removed
        self.assertFalse(exists(extra_file))

        # all six paths are now missing, so an OSError is raised
        self.assertRaises(OSError, remove_files, test_filepaths)

        # now get no error when error_on_missing=False
        remove_files(test_filepaths, error_on_missing=False)
Example #25
0
    def test_compute_seqs_per_file(self):
        """compute_seqs_per_file functions as expected
        """
        fd, temp_fasta_fp = mkstemp(prefix='QiimeScriptUtilTests',
                                   suffix='.fasta')
        close(fd)
        try:
            # 25 identical records, written and closed explicitly so the
            # content is on disk before it is read back.
            temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
            with open(temp_fasta_fp, 'w') as fasta_file:
                fasta_file.write('\n'.join(temp_fasta))

            actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
            actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
            actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
            actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
            actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)
        finally:
            # remove the temp file even if one of the calls above raised
            remove_files([temp_fasta_fp])

        self.assertEqual(actual_25, 1)
        self.assertEqual(actual_2, 13)
        self.assertEqual(actual_10, 3)
        self.assertEqual(actual_5, 5)
        self.assertEqual(actual_40, 1)
Example #26
0
    def test_build_blast_db_from_fasta_path_aln(self):
        """build_blast_db_from_fasta_path accepts an alignment as input
        """
        blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
        self.assertEqual(blast_db, self.in_aln1_fp)

        db_extensions = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_db_files = set(blast_db + ext for ext in db_extensions)
        self.assertEqual(set(db_files), expected_db_files)

        # blasting against the new db yields a result of length 1
        blast_result = blastn(self.test_seq, blast_db=blast_db, e_value=0.0)
        self.assertEqual(len(blast_result), 1)

        # every db file has to exist before the clean-up ...
        for db_fp in db_files:
            self.assertTrue(exists(db_fp))

        remove_files(db_files)

        # ... and none of them may be left afterwards
        for db_fp in db_files:
            self.assertFalse(exists(db_fp))
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """
    Attempts to select true barcodes from set of barcodes
    i.e. removes barcodes that might be artifacts
    due to sequencing errors.
    The de-duplicated barcodes are written to a temporary FASTA file
    (each barcode is both the record id and the sequence) and clustered
    with get_clusters_from_fasta_filepath at identity `unique_threshold`.
    The temporary file is removed before returning, also on error.
    Parameters
    ----------
    rand_bcs: list
        random barcodes; duplicates are collapsed before clustering
    unique_threshold: float
        passed as percent_ID to get_clusters_from_fasta_filepath
    Returns
    ----------
    unique_rand_bcs: set
        set built from the third value returned by
        get_clusters_from_fasta_filepath.
    """
    import os

    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(dir=temp_dir,
                                            prefix='tmp',
                                            suffix='.fas')
    try:
        # Write through the descriptor returned by mkstemp so that it is
        # closed together with the file object instead of being leaked.
        with os.fdopen(fasta_fd, 'w') as fasta_tempfile:
            for rand_bc in set(rand_bcs):
                fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

        _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
            fasta_tempfile_name,
            original_fasta_path=None,
            percent_ID=unique_threshold,
            save_uc_files=False,
            output_dir=temp_dir)
    finally:
        remove_files([fasta_tempfile_name])

    return set(unique_rand_bcs)
Example #28
0
    def test_build_blast_db_from_fasta_file(self):
        """build_blast_db_from_fasta_file works with open files as input
        """
        # The input handle is closed as soon as the db has been built.
        with open(self.in_aln1_fp) as aln_file:
            blast_db, db_files = build_blast_db_from_fasta_file(
                aln_file, output_dir='/tmp/')
        self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
        self.assertTrue(blast_db.endswith('.fasta'))
        # here the fasta file itself is part of the returned db files
        db_extensions = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_db_files = set(
            [blast_db] + [blast_db + ext for ext in db_extensions])
        self.assertEqual(set(db_files), expected_db_files)
        # blasting against the new db yields a result of length 1
        self.assertEqual(
            len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
def select_unique_rand_bcs(rand_bcs, unique_threshold):
    """
    Attempts to select true barcodes from set of barcodes
    i.e. removes barcodes that might be artifacts
    due to sequencing errors.
    The de-duplicated barcodes are written to a temporary FASTA file
    (each barcode is both the record id and the sequence) and clustered
    with get_clusters_from_fasta_filepath at identity `unique_threshold`.
    The temporary file is removed before returning, also on error.
    Parameters
    ----------
    rand_bcs: list
        random barcodes; duplicates are collapsed before clustering
    unique_threshold: float
        passed as percent_ID to get_clusters_from_fasta_filepath
    Returns
    ----------
    unique_rand_bcs: set
        set built from the third value returned by
        get_clusters_from_fasta_filepath.
    """
    import os

    temp_dir = get_qiime_temp_dir()
    fasta_fd, fasta_tempfile_name = mkstemp(
        dir=temp_dir, prefix='tmp', suffix='.fas')
    try:
        # Write through the descriptor returned by mkstemp so that it is
        # closed together with the file object instead of being leaked.
        with os.fdopen(fasta_fd, 'w') as fasta_tempfile:
            for rand_bc in set(rand_bcs):
                fasta_tempfile.write(">{}\n{}\n".format(rand_bc, rand_bc))

        _, _, unique_rand_bcs = get_clusters_from_fasta_filepath(
            fasta_tempfile_name,
            original_fasta_path=None,
            percent_ID=unique_threshold,
            save_uc_files=False,
            output_dir=temp_dir)
    finally:
        remove_files([fasta_tempfile_name])

    return set(unique_rand_bcs)
Example #30
0
    def test_build_blast_db_from_fasta_path(self):
        """build_blast_db_from_fasta_path builds a usable db from a fasta path
        """
        blast_db, db_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)
        self.assertEqual(blast_db, self.in_seqs1_fp)

        db_extensions = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_db_files = set(
            self.in_seqs1_fp + ext for ext in db_extensions)
        self.assertEqual(set(db_files), expected_db_files)

        # blasting against the new db yields a result of length 1
        blast_result = blastn(self.test_seq, blast_db=blast_db)
        self.assertEqual(len(blast_result), 1)

        # every db file has to exist before the clean-up ...
        for db_fp in db_files:
            self.assertTrue(exists(db_fp))

        remove_files(db_files)

        # ... and none of them may be left afterwards
        for db_fp in db_files:
            self.assertFalse(exists(db_fp))
Example #31
0
    def test_build_blast_db_from_seqs(self):
        """build_blast_db_from_seqs builds a usable db from sequences
        """
        blast_db, db_files = build_blast_db_from_seqs(
            self.in_seqs1, output_dir='/tmp')
        self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
        self.assertTrue(blast_db.endswith('.fasta'))

        db_extensions = ['.nhr', '.nin', '.nsq', '.nsd', '.nsi', '.log']
        expected_db_files = set(blast_db + ext for ext in db_extensions)
        self.assertEqual(set(db_files), expected_db_files)

        # blasting against the new db yields a result of length 1
        blast_result = blastn(self.test_seq, blast_db=blast_db)
        self.assertEqual(len(blast_result), 1)

        # every db file has to exist before the clean-up ...
        for db_fp in db_files:
            self.assertTrue(exists(db_fp))

        remove_files(db_files)

        # ... and none of them may be left afterwards
        for db_fp in db_files:
            self.assertFalse(exists(db_fp))
def get_chimeras_from_Nast_aligned(seqs_fp, ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False, min_div_ratio=None,
                                   keep_intermediates=False):
    """Run ChimeraSlayer on seqs_fp and return the parsed CPS output.

    seqs_fp:  a filepath with the seqs to check in the file
    ref_db_aligned_fp: fp to (pynast) aligned reference sequences
    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed
        on the fly (and removed afterwards) if not provided
    HALT_EXEC: forwarded unchanged to the ChimeraSlayer app controller
    min_div_ratio: passed to the ChimeraSlayer app as -R when not None
    keep_intermediates: when true, neither the app's intermediate files
        nor the on-the-fly degapped reference file are removed

    Returns the value produced by parse_CPS_file for the app's 'CPS' result.
    """

    files_to_remove = []
    # might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp).strip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    # if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # Chimera Slayer puts some temp files in current dir and some in dir of
    # input file; use exec_dir to change to dir of input file, so to have
    # all tmp files in one place
    params = {'--query_NAST': new_seqs_fp,
              '--exec_dir': seqs_dir}

    # With neither reference given, the default db is used, whose position
    # relative to the ChimeraSlayer binary is hardcoded.
    if ref_db_aligned_fp is not None or ref_db_fasta_fp is not None:
        if not ref_db_fasta_fp:
            # make degapped reference file; the alignment handle is closed
            # as soon as the degapped copy has been written
            with open(ref_db_aligned_fp) as aligned_ref_file:
                ref_db_fasta_fp = write_degapped_fasta_to_file(
                    parse_fasta(aligned_ref_file))
            files_to_remove.append(ref_db_fasta_fp)
        # use user db
        params.update({'--db_NAST': abspath(ref_db_aligned_fp),
                       '--db_FASTA': abspath(ref_db_fasta_fp)})

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    # NOTE(review): app_results['CPS'] is a FilePath object in case of
    # success; there is currently no check for a failed run here.

    chimeras = parse_CPS_file((app_results['CPS']))
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
Example #33
0
 def tearDown(self):
     """Delete the tracked files, then every tracked folder."""
     remove_files(self.files_to_remove)
     # Each listed folder is removed unconditionally.
     for folder_path in self.folders_to_remove:
         shutil.rmtree(folder_path)
def get_consensus_seqs_lookup(random_bc_lookup,
                              random_bc_reads,
                              random_bcs,
                              min_difference_in_bcs,
                              min_reads_per_random_bc,
                              output_dir,
                              min_difference_in_clusters,
                              max_cluster_ratio,
                              min_consensus):
    """
    Generates LEA-seq consensus sequence
    For each sample id, for each random barcode that is kept, a consensus
    sequence is stored. A barcode is kept when select_unique_rand_bcs
    returns it for the sample and its read count is at least
    min_reads_per_random_bc.
    Parameters
    ----------
    random_bc_lookup: defaultdict
        contains sample ID -> random barcode -> mapping whose keys are
        (fwd_seq, rev_seq) pairs and whose values are compared as counts
    random_bc_reads: defaultdict
        contains sample ID -> random barcode -> number of reads
    random_bcs: mapping
        indexed by sample ID; each value is passed to
        select_unique_rand_bcs as the barcodes of that sample
    min_difference_in_bcs: float
        threshold for selecting unique barcodes
    min_reads_per_random_bc:
        minimum number of reads per random bc, for it not to be discarded
    output_dir: dirpath
        directory in which the temporary fwd/rev fasta files are created
    min_difference_in_clusters: float
        percent identity threshold for cluster formation
    max_cluster_ratio: float
        when both the fwd and the rev cluster ratio are above this value
        the majority sequence is used; otherwise get_consensus is called
    min_consensus:
        passed unchanged to get_consensus
    Returns
    ----------
    consensus_seq_lookup: defaultdict
        contains sample ID -> random barcode -> consensus_seq, where
        consensus_seq is "No consensus", or a forward and a reverse
        sequence joined by "^"
    """

    consensus_seq_lookup = defaultdict(lambda:
                                       defaultdict(str))
    # defaultdict that stores LEA-seq consensus sequence
    # For each sample id, for each random barcode,
    # consensus sequence is stored

    random_bc_keep = {}
    # per sample id: the random bcs returned by the pruning step
    # (select_unique_rand_bcs); only these are processed below

    for sample_id in random_bc_lookup:
        random_bc_keep[sample_id] = select_unique_rand_bcs(
            random_bcs[sample_id],
            min_difference_in_bcs)
        # removes barcodes that might be artifacts
        # due to sequencing error
        for random_bc in random_bc_lookup[sample_id]:
            if random_bc in random_bc_keep[sample_id] and random_bc_reads[
                    sample_id][random_bc] >= min_reads_per_random_bc:
                fwd_fd, fwd_fasta_tempfile_name = mkstemp(
                    dir=output_dir, prefix='fwd', suffix='.fas')
                rev_fd, rev_fasta_tempfile_name = mkstemp(
                    dir=output_dir, prefix='rev', suffix='.fas')
                # the descriptors from mkstemp are not used; the files are
                # reopened by name below
                close(fwd_fd)
                close(rev_fd)

                # create fasta files for all fwd and rev seqs
                # for that sample id and random bc.
                fwd_fasta_tempfile = open(fwd_fasta_tempfile_name, 'w')
                rev_fasta_tempfile = open(rev_fasta_tempfile_name, 'w')
                max_freq = 0
                for seq_index, fwd_rev in enumerate(
                        random_bc_lookup[sample_id][random_bc]):
                    fwd_seq, rev_seq = fwd_rev
                    # header: <index><barcode>|<count of this fwd/rev pair>
                    fwd_line = ">{}{}|{}\n{}\n".format(
                        seq_index, random_bc,
                        random_bc_lookup[sample_id][random_bc][fwd_rev],
                        fwd_seq)
                    rev_line = ">{}{}|{}\n{}\n".format(
                        seq_index, random_bc,
                        random_bc_lookup[sample_id][random_bc][fwd_rev],
                        rev_seq)
                    fwd_fasta_tempfile.write(fwd_line)
                    rev_fasta_tempfile.write(rev_line)
                    # track the pair with the highest count seen so far
                    # NOTE(review): majority_seq is only assigned when a
                    # count exceeds 0, so it looks like it could be unset
                    # or carried over from a previous barcode - confirm
                    if random_bc_lookup[sample_id][
                            random_bc][fwd_rev] > max_freq:
                        max_freq = random_bc_lookup[
                            sample_id][random_bc][fwd_rev]
                        majority_seq = fwd_seq + "^" + rev_seq
                # select majority sequence for the sample_id,
                # and for that particular random_bc

                fwd_fasta_tempfile.close()
                rev_fasta_tempfile.close()
                fwd_cluster_ratio = get_cluster_ratio(
                    fwd_fasta_tempfile_name,
                    min_difference_in_clusters)
                rev_cluster_ratio = get_cluster_ratio(
                    rev_fasta_tempfile_name,
                    min_difference_in_clusters)

                # If either cluster ratio is 0, there is no consensus.
                # If both ratios are above the threshold
                # (max_cluster_ratio), the consensus seq is the
                # majority seq; otherwise get_consensus is called
                if fwd_cluster_ratio == 0 or rev_cluster_ratio == 0:
                    consensus_seq = "No consensus"
                elif (fwd_cluster_ratio > max_cluster_ratio
                        and rev_cluster_ratio > max_cluster_ratio):
                    consensus_seq = majority_seq
                else:
                    fwd_fasta_tempfile = open(fwd_fasta_tempfile_name, 'r')
                    rev_fasta_tempfile = open(rev_fasta_tempfile_name, 'r')
                    fwd_consensus = get_consensus(
                        fwd_fasta_tempfile,
                        min_consensus)
                    rev_consensus = get_consensus(
                        rev_fasta_tempfile,
                        min_consensus)
                    fwd_fasta_tempfile.close()
                    rev_fasta_tempfile.close()
                    consensus_seq = fwd_consensus + "^" + rev_consensus

                consensus_seq_lookup[sample_id][random_bc] = consensus_seq
                # the per-barcode temp fasta files are no longer needed
                files_to_be_removed = list()
                files_to_be_removed.append(fwd_fasta_tempfile_name)
                files_to_be_removed.append(rev_fasta_tempfile_name)
                remove_files(files_to_be_removed)

    # return the entire defaultdict 'consensus_seq_lookup'
    # which has consensus sequence for each sample id,
    # and for each random barcode.
    return consensus_seq_lookup
def get_cluster_ratio(fasta_seqs, min_difference_in_clusters):
    """
    Uses uclust to calculate cluster ratio
    cluster_ratio =
    num_of_seq_in_cluster_with_max_seq
    divided by
    num_of_seq_in cluster_with_second_higest_seq

    Parameters
    ----------
    fasta_seqs: str
        text that is written verbatim to a temporary file, which is then
        handed to uclust as its ``--input``
    min_difference_in_clusters: float
        percent identity threshold for cluster formation.
        NOTE(review): this value is currently not used -- the uclust call
        below hard-codes ``--id 0.98``. Confirm whether it was meant to be
        passed through before changing it.

    Returns
    ----------
    cluster_ratio: float
        size of the largest cluster divided by the size of the second
        largest cluster. ``1`` is returned when uclust reports fewer than
        two clusters.
    """
    temp_dir = get_qiime_temp_dir()
    fd_uc, uclust_tempfile_name = mkstemp(dir=temp_dir, suffix='.uc')
    close(fd_uc)
    fd_fas, fasta_tempfile_name = mkstemp(dir=temp_dir, suffix='.fasta')
    close(fd_fas)

    count_lookup = defaultdict(int)
    try:
        with open(fasta_tempfile_name, 'w') as fasta_tempfile:
            fasta_tempfile.write(fasta_seqs)

        command = "uclust --usersort --input {} --uc {} --id 0.98".format(
            fasta_tempfile_name, uclust_tempfile_name)
        # In the function, I am calling uclust a large number of times.
        # Initially I was using from bfillings.get_clusters_from_fasta_filepath
        # but due to issue (biocore/bfillingss#31), I have temporarily
        # reverted to qiime_system_call.
        qiime_system_call(command)

        with open(uclust_tempfile_name, 'r') as uclust_tempfile:
            for line in uclust_tempfile:
                # Only lines starting with 'C' are tallied; the second
                # tab-separated field is used as the cluster key and the
                # third as the integer count added to it (presumably the
                # per-cluster summary records of the .uc format -- see the
                # uclust documentation).
                if line.startswith('C'):
                    pieces = line.split('\t')
                    count_lookup[pieces[1]] += int(pieces[2])
    finally:
        # Both temporary files are removed here, including the fasta input
        # (which previously was left behind), even if uclust or the parsing
        # above fails.
        remove_files([uclust_tempfile_name, fasta_tempfile_name],
                     error_on_missing=False)

    cluster_sizes = sorted(count_lookup.values(), reverse=True)
    if len(cluster_sizes) < 2:
        # fewer than two clusters: no second cluster to compare against
        return 1
    return float(cluster_sizes[0]) / float(cluster_sizes[1])
Example #36
0
 def tearDown(self):
     """Remove every file this test case registered in files_to_remove."""
     registered_paths = self.files_to_remove
     remove_files(registered_paths)
Example #37
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Step 1: Pick reference OTUs against refseqs_fp (optionally
             preceded by a prefilter against prefilter_refseqs_fp when
             prefilter_percent_id is given). Skipped when both
             step1_otu_map_fp and step1_failures_fasta_fp are provided.
          - Step 2: Subsample the step 1 failures (percent_subsample),
             pick OTUs de novo on the subsample and pick representative
             sequences for the resulting OTUs.
          - Step 3: Pick reference OTUs on all step 1 failures using the
             representative set from step 2 as the reference set.
          - Step 4 (unless suppress_step4): Pick OTUs de novo on the
             sequences that are still failures.

        Steps 2 and 3 only run when the number of step 1 failure sequences
        is greater than minimum_failure_threshold.

        Afterwards the OTU maps are concatenated into final_otu_map.txt,
        OTUs with fewer than min_otu_size sequences are filtered out,
        rep_set.fna and new_refseqs.fna are written, the OTU table is
        built and, depending on run_assign_tax / run_align_and_tree,
        taxonomy is assigned and an alignment/tree is built. Unless
        suppress_index_page is set, an index.html listing the outputs is
        generated in output_dir.

        If no logger is passed, a WorkflowLogger is created in output_dir
        and closed at the end of this function.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = [
        'uclust_ref', 'usearch61_ref', 'sortmerna'
    ]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    # index_links collects (description, path, section header) tuples that
    # are handed to generate_index_page at the very end.
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    # commands is filled and then flushed through command_handler several
    # times below; it is reset to [] after every flush.
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp, _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # From here on the prefiltered file is treated as the input.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp, step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp, parallel, params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
                               '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                        percent_subsample)

        # NOTE(review): in the message below the closing double quote of the
        # logged python command comes after the trailing newlines, unlike the
        # equivalent "Filter singletons" log entry further down.
        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')\n\n"' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp), percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                     new_ref_set_id, denovo_otu_picking_method,
                                     params, logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        # NOTE(review): this deletes the key from the caller's params dict in
        # place, and params['pick_otus'] raises KeyError if that section is
        # absent.
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp, parallel,
                                        params, logger)

        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

        index_links.append((
            'Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
            merged_otu_map_fp, _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            # steps 2 and 3 did not run, so there is no step 3 map to merge;
            # the empty string leaves a gap in the cat command below.
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            # NOTE(review): step1_failures_list_fp is only assigned when
            # step 1 is actually run above; if pre-existing step 1 outputs
            # were passed in, this name is undefined here -- confirm.
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([
            ('Move final failures file to top-level directory',
             'mv %s %s/final_failures.txt' % (failures_fp, output_dir))
        ])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    # otus_to_keep is used below for membership tests on OTU ids when
    # writing the representative / reference sequence files.
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp, _index_headers['otu_maps']))

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(('OTU representative sequences', final_repset_fp,
                        _index_headers['sequences']))
    # NOTE(review): final_repset_f and new_refseqs_f are closed explicitly
    # further down, but not if an exception is raised in between; the
    # handles passed to parse_fasta below are never explicitly closed.
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append((
        'New reference sequences (i.e., OTU representative sequences plus input '
        'reference sequences)', new_refseqs_fp, _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    # a newline is appended before any new records are written after the
    # copied reference sequences
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 2 and step 4 to the final representative set and the new reference'
            + ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 4 to the final representative set and the new reference' +
            ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp, _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        # when both run, the PyNAST-failure filtering below starts from the
        # table that already carries taxonomy
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments'
            % min_otu_size, pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST' % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    if run_assign_tax:
        # A non-empty existing output is taken as "already done" and is not
        # rebuilt.
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(('OTU taxonomic assignments', taxonomy_fp,
                                _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(('OTU phylogenetic tree', rep_set_tree_fp,
                            _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            # (the ids read from the PyNAST failures file are the ones
            # dropped, via negate_ids_to_keep=True)
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            # NOTE(review): no commands are appended in this branch, so
            # commands looks to be empty when handed over here -- confirm.
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # Only close the logger if it was created by this function.
    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
Example #38
0
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    """Align a representative set, filter the alignment and build a tree.

    Three commands are assembled (sequence alignment, alignment filtering,
    tree building) and passed to command_handler in a single call. An
    existing pynast output directory and an existing tree file under
    output_dir are removed before the commands run. If no logger is
    supplied, a WorkflowLogger is created in output_dir and command_handler
    is asked to close it on success.

    Returns the path of the alignment failures fasta file inside the
    pynast output directory.
    """
    def options_for(section):
        # Command-line options for one params section; a KeyError (e.g. the
        # section is missing) yields no options.
        try:
            return get_params_str(params[section])
        except KeyError:
            return ''

    repset_basename = splitext(split(repset_fasta_fp)[1])[0]

    # Create (and later close) our own logger only when none was given.
    close_logger_on_success = logger is None
    if close_logger_on_success:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

    # Output locations of the pynast alignment step
    aligner = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, aligner)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, repset_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, repset_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)

    # Alignment command (parallel or serial flavour)
    if parallel:
        align_opts_str = options_for('parallel')
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            align_section = params['align_seqs'].copy()
            align_section.pop('alignment_method', None)
            align_opts_str += ' %s' % get_params_str(align_section)
        except KeyError:
            pass
        align_seqs_cmd = (
            'parallel_align_seqs_pynast.py -i %s -o %s -T %s'
            % (repset_fasta_fp, pynast_dir, align_opts_str))
    else:
        align_seqs_cmd = (
            'align_seqs.py -i %s -o %s %s'
            % (repset_fasta_fp, pynast_dir, options_for('align_seqs')))

    # Alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' % (pynast_dir,
                                                         repset_basename)
    filter_alignment_cmd = (
        'filter_alignment.py -o %s -i %s %s'
        % (pynast_dir, aln_fp, options_for('filter_alignment')))

    # Tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    make_phylogeny_cmd = (
        'make_phylogeny.py -i %s -o %s %s'
        % (filtered_aln_fp, tree_fp, options_for('make_phylogeny')))
    if exists(tree_fp):
        remove_files([tree_fp])

    # Hand all three commands, in execution order, to the command handler
    commands = [
        [('Align sequences', align_seqs_cmd)],
        [('Filter alignment', filter_alignment_cmd)],
        [('Build phylogenetic tree', make_phylogeny_cmd)],
    ]
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return failures_fp
Example #39
0
def get_chimeras_from_Nast_aligned(seqs_fp, ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False, min_div_ratio=None,
                                   keep_intermediates=False):
    """remove chimeras from seqs_fp using chimeraSlayer.

    seqs_fp:  a filepath with the seqs to check in the file
    ref_db_aligned_fp: fp to (pynast) aligned reference sequences
    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed on
        the fly (and removed afterwards, unless keep_intermediates is set)
        if not provided
    HALT_EXEC: stop execution if true
    min_div_ratio: passed to ChimeraSlayer App
    keep_intermediates: if true, neither the application's intermediate
        files nor the on-the-fly degapped reference file are removed

    Returns the result of parse_CPS_file on the application's 'CPS' output.
    """

    files_to_remove = []
    # might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp).strip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    # if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    # Chimera Slayer puts some temp files in current dir and some in dir of input file
    # use exe_dir to change to dir of input file, so to have all tmp files in
    # one place
    params = {'--query_NAST': new_seqs_fp,
              '--exec_dir': seqs_dir}

    # When neither reference file is given, the default db is used, whose
    # relative position to the ChimeraSlayer binary is hardcoded.
    if ref_db_aligned_fp is not None or ref_db_fasta_fp is not None:
        # NOTE(review): if only ref_db_fasta_fp is supplied, abspath() below
        # is called on a None ref_db_aligned_fp and fails -- confirm whether
        # that combination is meant to be supported.
        if not ref_db_fasta_fp:
            # make degapped reference file; the aligned reference is closed
            # as soon as the degapped copy has been written
            # (assumes write_degapped_fasta_to_file consumes the parser
            # fully before returning its output path)
            with open(ref_db_aligned_fp) as ref_db_aligned_f:
                ref_db_fasta_fp = write_degapped_fasta_to_file(
                    parse_fasta(ref_db_aligned_f))
            files_to_remove.append(ref_db_fasta_fp)
        # use user db
        params.update({'--db_NAST': abspath(ref_db_aligned_fp),
                       '--db_FASTA': abspath(ref_db_fasta_fp)})

    if min_div_ratio is not None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    # app_results['CPS'] is a FilePath object in case of success.
    # TODO: it is unclear how to test for failure here (e.g. a missing
    # output file).

    chimeras = parse_CPS_file((app_results['CPS']))
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
 def cleanUp(self):
     """Delete the temporary blast database files, if there are any.

     Files that are missing are not treated as an error.
     """
     db_paths = self._db_files_to_remove
     remove_files(db_paths, error_on_missing=False)
Example #41
0
 def tearDown(self):
     """Remove the files registered in files_to_remove.

     Files that are missing are not treated as an error.
     """
     registered_paths = self.files_to_remove
     remove_files(registered_paths, error_on_missing=False)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the subsampled open-reference OTU picking workflow on one input

        The steps performed by this function are:
          1. Pick reference OTUs on input_fp against refseqs_fp (optionally
             after a prefilter), or reuse step1_otu_map_fp and
             step1_failures_fasta_fp when both are provided.
          2. If the number of step 1 failures is greater than
             minimum_failure_threshold: subsample the failures, pick OTUs
             de novo on the subsample and pick a representative set.
          3. (Same condition as step 2.) Pick reference OTUs on all step 1
             failures using the step 2 representative set as the reference.
          4. Unless suppress_step4: pick OTUs de novo on the remaining
             failures (step 3 failures, or step 1 failures when steps 2 and
             3 were skipped).
        The OTU maps are then merged, OTUs with fewer than min_otu_size
        sequences are filtered out, and rep_set.fna, new_refseqs.fna and the
        OTU table are written to output_dir. Taxonomy assignment and
        alignment/tree building are run on request.

        input_fp: path of the input sequences file
        refseqs_fp: path of the reference sequences used in step 1; also
         copied into new_refseqs.fna
        output_dir: directory (created via create_dir) receiving all output
        percent_subsample: passed to subsample_fasta in step 2
        new_ref_set_id: passed to pick_denovo_otus (step 4 uses
         new_ref_set_id + '.CleanUp')
        command_handler: called as command_handler(commands,
         status_update_callback, logger=..., close_logger_on_success=False)
        params: parameters passed on to the helper functions. NOTE: when
         reference_otu_picking_method is 'sortmerna' and steps 2/3 run,
         params['pick_otus']['sortmerna_db'] is deleted from this object.
        qiime_config: passed to WorkflowLogger, assign_tax and align_and_tree
        prefilter_refseqs_fp: reference for the prefilter; refseqs_fp is used
         when None
        run_assign_tax: if True, run assign_tax and add taxonomy to the table
        run_align_and_tree: if True, run align_and_tree and write an OTU
         table without the PyNAST failures
        prefilter_percent_id: if not None, run the prefilter with this value
        min_otu_size: passed to filter_otus_from_otu_map and used in output
         file names
        step1_otu_map_fp, step1_failures_fasta_fp: pre-existing step 1
         results; step 1 is skipped only when both are provided
        parallel: passed to pick_reference_otus, assign_tax, align_and_tree
        suppress_step4: if True, skip step 4 and move the failures list to
         output_dir/final_failures.txt instead
        logger: if None, a WorkflowLogger is created here and closed at the
         end; a caller-supplied logger is left open
        suppress_md5: if True, do not log md5s of the input files
        suppress_index_page: if True, do not write output_dir/index.html
        denovo_otu_picking_method: 'uclust', 'usearch61' or 'sumaclust'
        reference_otu_picking_method: 'uclust_ref', 'usearch61_ref' or
         'sortmerna'
        status_update_callback: passed to command_handler, assign_tax and
         align_and_tree
        minimum_failure_threshold: steps 2 and 3 run only when the number of
         step 1 failure sequences is greater than this value

        Raises AssertionError for an unknown OTU picking method and
        ValueError when the prefilter discards every input sequence.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        # we created the logger, so we are responsible for closing it
        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # from here on the prefiltered sequences are the workflow input
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
            '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        # the closing double quote belongs directly after the python
        # expression (before the blank line), as in the "Filter singletons"
        # log entry further down
        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')"\n\n' % (abspath(step1_failures_fasta_fp),
                                                        abspath(
                                                            step2_input_fasta_fp),
                                                        percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)

        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

        # NOTE(review): this link is only registered when steps 2 and 3 run,
        # although the merged map is written in every case - confirm whether
        # that is intended.
        index_links.append(
            ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
             merged_otu_map_fp,
             _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            # no step 3 map exists; an empty string keeps the cat command
            # below well-formed
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            # NOTE(review): step1_failures_list_fp is only assigned when
            # step 1 is run by this function; with pre-existing step 1
            # results this line would raise NameError - confirm how callers
            # combine suppress_step4 with the step1_* inputs.
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(
                                                        otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # every file opened below is managed by a with-block so that it is
    # closed even if one of the reads or writes raises
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to the
        # final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in parse_fasta(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
        logger.write('# Write non-singleton otus representative sequences ' +
                     'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
        # copy the full input refseqs file to the new refseqs_fp
        copyfile(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # separate the copied reference sequences from the appended ones
            new_refseqs_f.write('\n')
            logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                         'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
            # iterate over all representative sequences from step2 and step4
            # and write those corresponding to non-singleton otus to the final
            # representative set file and the new reference sequences file.
            if run_step_2_and_3:
                with open(step2_repset_fasta_fp, 'U') as step2_repset_f:
                    for otu_id, seq in parse_fasta(step2_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
            if not suppress_step4:
                with open(step4_repset_fasta_fp, 'U') as step4_repset_f:
                    for otu_id, seq in parse_fasta(step4_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            # keep the failures file open only while its ids are consumed
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

        Each entry of input_fps is processed in order into
        output_dir/<iteration index>/. The new_refseqs.fna written by one
        iteration is used as refseqs_fp for the next one, and an iteration
        whose output already exists (per iteration_output_exists) is skipped.
        Afterwards the per-iteration OTU tables are merged, a master
        representative set is built, and taxonomy assignment and
        alignment/tree building are run once on the merged results when
        requested.

        input_fps: iterable of input sequence file paths, one per iteration
        step1_otu_map_fp, step1_failures_fasta_fp: only passed to the first
         iteration; they are reset to None afterwards
        logger: if None, a WorkflowLogger is created here and closed at the
         end; a caller-supplied logger is left open for the caller to close
        All remaining parameters are forwarded to
        pick_subsampled_open_reference_otus (with taxonomy assignment,
        alignment/tree building and the index page disabled per iteration).
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # we created the logger, so we are responsible for closing it
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            # NOTE(review): md5s are logged here even when suppress_md5 is
            # True - confirm whether this branch should honor the flag.
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                                refseqs_fp=refseqs_fp,
                                                output_dir=iteration_output_dir,
                                                percent_subsample=percent_subsample,
                                                new_ref_set_id='.'.join(
                                                    [new_ref_set_id, str(i)]),
                                                command_handler=command_handler,
                                                params=params,
                                                qiime_config=qiime_config,
                                                run_assign_tax=False,
                                                run_align_and_tree=False,
                                                prefilter_refseqs_fp=prefilter_refseqs_fp,
                                                prefilter_percent_id=prefilter_percent_id,
                                                min_otu_size=min_otu_size,
                                                step1_otu_map_fp=step1_otu_map_fp,
                                                step1_failures_fasta_fp=step1_failures_fasta_fp,
                                                parallel=parallel,
                                                suppress_step4=suppress_step4,
                                                logger=logger,
                                                suppress_md5=suppress_md5,
                                                suppress_index_page=True,
                                                denovo_otu_picking_method=denovo_otu_picking_method,
                                                reference_otu_picking_method=reference_otu_picking_method,
                                                status_update_callback=status_update_callback,
                                                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' %
            (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            # keep the failures file open only while its ids are consumed
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # only close a logger that was created by this function, matching the
    # behavior of pick_subsampled_open_reference_otus
    if close_logger_on_success:
        logger.close()
Example #44
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

    Each entry of input_fps is processed in order in its own sub-directory
    (<output_dir>/<iteration index>/). Iterations whose output already exists
    (per iteration_output_exists) are skipped, which allows a failed run to be
    resumed. After every iteration the reference set for the next iteration is
    switched to that iteration's new_refseqs.fna. Once all iterations are
    done the per-iteration OTU tables are merged, a master representative set
    is built, and taxonomy assignment and/or alignment and tree building are
    optionally run on the merged results.

    input_fps: input file paths; one iteration is run per path.
    refseqs_fp: reference sequences used for the first iteration.
    output_dir: top-level output directory (created with create_dir).
    new_ref_set_id: identifier passed to each iteration with '.<iteration
     index>' appended.
    command_handler: callable invoked with the lists of commands to run.
    prefilter_refseqs_fp: reference collection used for the pre-filter in
     every iteration; defaults to the refseqs_fp that was passed in.
    min_otu_size: forwarded to each iteration and used to build the
     'otu_table_mc<min_otu_size>' output file names.
    run_assign_tax: if True, assign taxonomy to the master rep set and add
     it to the merged OTU table (individual iterations never do this).
    run_align_and_tree: if True, run align_and_tree on the master rep set
     and write an OTU table without the PyNAST failures (individual
     iterations never do this).
    step1_otu_map_fp, step1_failures_fasta_fp: only passed to the first
     iteration; later iterations receive None.
    logger: logger to write to; a WorkflowLogger is created when None.
    status_update_callback: passed to every command_handler and workflow
     call.
    percent_subsample, params, qiime_config, prefilter_percent_id, parallel,
     suppress_step4, suppress_md5, denovo_otu_picking_method,
     reference_otu_picking_method, minimum_failure_threshold: forwarded
     unchanged to pick_subsampled_open_reference_otus (params, qiime_config
     and parallel are also forwarded to assign_tax / align_and_tree).

    Returns None; results are written beneath output_dir.
    """
    create_dir(output_dir)
    commands = []

    # NOTE(review): the logger is closed unconditionally at the end of this
    # function, including when it was supplied by the caller. Confirm that
    # callers do not expect to keep writing to a logger they pass in.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback,
                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has
    # historically been a frequent failure, so is sometimes run manually in
    # failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    if run_align_and_tree:
        if run_assign_tax:
            # alignment filtering operates on the taxonomy-annotated table
            align_and_tree_input_otu_table = otu_table_w_tax_fp
            pynast_failure_filtered_otu_table_fp = \
                '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (
                    output_dir, min_otu_size)
        else:
            align_and_tree_input_otu_table = otu_table_fp
            pynast_failure_filtered_otu_table_fp = \
                '%s/otu_table_mc%d_no_pynast_failures.biom' % (
                    output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures. The failures file is
            # opened in a with-block so its handle is always released; the
            # filtering happens inside the block in case the ids are read
            # lazily from the open file.
            table = load_table(align_and_tree_input_otu_table)
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                pynast_failure_ids = \
                    get_seq_ids_from_fasta_file(pynast_failures_f)
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    pynast_failure_ids,
                    0,
                    inf,
                    0,
                    inf,
                    negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            # commands is empty at this point (nothing is queued in this
            # branch); the call is retained so the handler is still invoked
            # exactly as before.
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    Runs identify_chimeras_usearch61 on the input (or on each per-sample
    fasta when split_by_sampleid is set), writes a log plus chimeras.txt and
    non_chimeras.txt (one ID per line) into output_dir, and finally removes
    the files collected in files_to_remove. Returns None.

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
     When splitting by sample ID, the per-sample fasta files are also queued
     for removal.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    verbose: if True, print a progress message before splitting by sample
     ID; also passed on to identify_chimeras_usearch61.
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """
    # NOTE(review): the minh/xn/dn descriptions originally read "tends to the
    # number of false positives"; "reduce" was restored as the presumed
    # missing verb -- confirm against the usearch documentation.

    # Need to cluster sequences de novo first to get 1. abundance information
    # and 2 consensus sequence for each cluster.  Using dereplication followed
    # by clustering does not appear to automatically update complete cluster
    # size, will directly cluster raw seqs with the small_mem clustering
    # option.
    #
    # This means without additional parsing steps to recalculate
    # actual cluster sizes, the sizeorder option can't be used for de novo
    # clustering and downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            # parenthesised single argument prints identically on Python 2
            # and is valid syntax on Python 3
            print("Splitting fasta according to SampleID...")

        with open(input_seqs_fp, 'U') as full_seqs:
            sep_fastas = split_sequence_file_on_sample_ids_to_files(
                full_seqs,
                'fasta',
                output_dir)

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        # files_to_remove and log_lines are threaded through every call so
        # that they accumulate across the per-sample runs.
        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
                identify_chimeras_usearch61(curr_fasta, output_dir,
                                            reference_seqs_fp, suppress_usearch61_intermediates,
                                            suppress_usearch61_ref, suppress_usearch61_denovo,
                                            non_chimeras_retention, usearch61_minh, usearch61_xn,
                                            usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                            usearch61_abundance_skew, percent_id_usearch61, minlen,
                                            word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                            log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
            identify_chimeras_usearch61(input_seqs_fp, output_dir,
                                        reference_seqs_fp, suppress_usearch61_intermediates,
                                        suppress_usearch61_ref, suppress_usearch61_denovo,
                                        non_chimeras_retention, usearch61_minh, usearch61_xn,
                                        usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                                        word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                        log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
                        reference_seqs_fp, suppress_usearch61_intermediates,
                        suppress_usearch61_ref, suppress_usearch61_denovo,
                        split_by_sampleid, non_chimeras_retention, usearch61_minh,
                        usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                        word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    # Both output files are opened up front (as before) but inside
    # with-blocks, so the handles are closed even if a write fails.
    with open(chimeras_fp, "w") as chimeras_f:
        with open(non_chimeras_fp, "w") as non_chimeras_f:
            for curr_chimera in chimeras:
                chimeras_f.write("%s\n" % curr_chimera)
            for curr_non_chimera in non_chimeras:
                non_chimeras_f.write("%s\n" % curr_non_chimera)

    remove_files(files_to_remove)
Example #46
0
 def tearDown(self):
     """Remove tracked temporary files and the fixed scratch directory."""
     tracked_files = self._files_to_remove
     if tracked_files:
         remove_files(tracked_files)
     # hard-coded scratch location; only removed when it exists as a directory
     scratch_dir = '/tmp/truncate_fasta_qual_test/'
     if isdir(scratch_dir):
         rmtree(scratch_dir)
    def tearDown(self):
        """Remove the tracked files, then every tracked directory tree."""
        remove_files(self.files_to_remove)
        # a falsy _dirs_to_remove (e.g. None or empty) means nothing to delete
        for tmp_dir in self._dirs_to_remove or ():
            rmtree(tmp_dir)
Example #48
0
 def tearDown(self):
     """Remove the tracked files, then the whole working directory tree."""
     stale_files = self.files_to_remove
     remove_files(stale_files)
     scratch_root = self.working_dir
     rmtree(scratch_root)
Example #49
0
 def cleanUp(self):
     """Remove temporary blast database files, if applicable.

     The recorded paths are handed to remove_files with
     error_on_missing=False.
     """
     db_files = self._db_files_to_remove
     remove_files(db_files, error_on_missing=False)
Example #50
0
 def tearDown(self):
     """Remove the temporary files recorded for this test.

     The paths are de-duplicated through a set before being handed to
     remove_files with error_on_missing=False.
     """
     unique_paths = set(self.files_to_remove)
     remove_files(unique_paths, error_on_missing=False)
Example #51
0
    def tearDown(self):
        """Delete tracked files first, then each tracked directory tree."""
        remove_files(self.files_to_remove)
        # nothing is iterated when _dirs_to_remove is falsy (None or empty)
        for doomed_dir in self._dirs_to_remove or ():
            rmtree(doomed_dir)
Example #52
0
 def tearDown(self):
     """Remove the tracked temporary files, then the tracked directories.

     Files are removed first: rmdir only succeeds on empty directories, so
     removing directories before the files they may contain would raise.
     Directories that no longer exist are skipped.
     """
     remove_files(self.files_to_remove)
     for dir_path in self.dirs_to_remove:
         if exists(dir_path):
             rmdir(dir_path)
Example #53
0
def remove_artifacts_seqs(seqs_fp,
                          ref_fp,
                          output_fp,
                          ref_db_fp=None,
                          negate=False,
                          threads=1):
    """Remove artifacts from FASTA file using SortMeRNA.

    Every reference database is searched in its own sub-directory of
    "<dirname(output_fp)>/working_dir". The IDs of input sequences that
    SortMeRNA reports with a subject other than '*' are pooled over all
    databases, and the input sequences are then written to output_fp
    according to that pool and `negate`.

    Parameters
    ----------
    seqs_fp: string
        file path to FASTA input sequence file
    ref_fp: tuple
        file path(s) to FASTA database file
    output_fp: string
        file path to store output results
    ref_db_fp: string or tuple, optional
        file path(s) to indexed FASTA database; indexed positionally in
        step with ref_fp. When not given, an index is built for each
        database and removed again once that database has been processed
    negate: boolean, optional
        if True, discard all input sequences aligning
        to reference database
    threads: integer, optional
        number of threads to use for SortMeRNA

    Raises
    ------
    ValueError
        if SortMeRNA wrote anything to its standard error file (the
        contents are printed first)
    """
    working_dir = join(dirname(output_fp), "working_dir")
    if not exists(working_dir):
        makedirs(working_dir)

    aligned_seq_ids = set()

    for i, db in enumerate(ref_fp):
        # create working directory for each
        # reference database
        db_dir = join(working_dir, splitext(basename(db))[0])
        if not exists(db_dir):
            makedirs(db_dir)

        # index files created for this database (none when a pre-built
        # index was supplied)
        files_to_remove = []
        if ref_db_fp:
            sortmerna_db = ref_db_fp[i]
        else:
            # build index
            sortmerna_db, files_to_remove = \
                build_database_sortmerna(
                    fasta_path=db,
                    max_pos=10000,
                    output_dir=db_dir)

        try:
            # run SortMeRNA
            app_result = sortmerna_map(seq_path=seqs_fp,
                                       output_dir=db_dir,
                                       refseqs_fp=db,
                                       sortmerna_db=sortmerna_db,
                                       threads=threads,
                                       best=1)

            # Print SortMeRNA errors
            stderr_fp = app_result['StdErr'].name
            if stat(stderr_fp).st_size != 0:
                with open(stderr_fp, 'U') as stderr_f:
                    for line in stderr_f:
                        print(line)
                raise ValueError("Could not run SortMeRNA.")

            for line in app_result['BlastAlignments']:
                fields = line.strip().split('\t')
                # rows whose second column is '*' are skipped
                # (presumably reads without an alignment -- confirm
                # against the SortMeRNA output format)
                if fields[1] != '*':
                    aligned_seq_ids.add(fields[0])
        finally:
            # remove indexed database files, also when SortMeRNA failed,
            # so a failed run does not leave the index behind
            remove_files(files_to_remove, error_on_missing=False)

    # if negate = False, only output sequences
    # matching to at least one of the databases
    keep_aligned = not negate
    with open(seqs_fp, 'U') as seqs_f:
        with open(output_fp, 'w') as out_f:
            for label, seq in parse_fasta(seqs_f):
                label = label.split()[0]
                if (label in aligned_seq_ids) == keep_aligned:
                    out_f.write(">%s\n%s\n" % (label, seq))
Example #54
0
 def tearDown(self):
     """Remove the tracked files, then the root directory tree."""
     leftover_files = self.files_to_remove
     remove_files(leftover_files)
     root = self.root_dir
     rmtree(root)
Example #55
0
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking

    Runs identify_chimeras_usearch61 on the input (or on each per-sample
    fasta when split_by_sampleid is set), writes a log plus chimeras.txt and
    non_chimeras.txt (one ID per line) into output_dir, and finally removes
    the files collected in files_to_remove. Returns None.

    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
     When splitting by sample ID, the per-sample fasta files are also queued
     for removal.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives
     (and also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to
     reduce the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also
     sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    verbose: if True, print a progress message before splitting by sample
     ID; also passed on to identify_chimeras_usearch61.
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """
    # NOTE(review): the minh/xn/dn descriptions originally read "tends to the
    # number of false positives"; "reduce" was restored as the presumed
    # missing verb -- confirm against the usearch documentation.

    # Need to cluster sequences de novo first to get 1. abundance information
    # and 2 consensus sequence for each cluster.  Using dereplication followed
    # by clustering does not appear to automatically update complete cluster
    # size, will directly cluster raw seqs with the small_mem clustering
    # option.
    #
    # This means without additional parsing steps to recalculate
    # actual cluster sizes, the sizeorder option can't be used for de novo
    # clustering and downstream chimera detection.

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {'denovo_chimeras': 0,
                 'denovo_non_chimeras': 0,
                 'ref_chimeras': 0,
                 'ref_non_chimeras': 0}

    if split_by_sampleid:
        if verbose:
            # parenthesised single argument prints identically on Python 2
            # and is valid syntax on Python 3
            print("Splitting fasta according to SampleID...")

        with open(input_seqs_fp, 'U') as full_seqs:
            sep_fastas = split_sequence_file_on_sample_ids_to_files(
                full_seqs,
                'fasta',
                output_dir)

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        # files_to_remove and log_lines are threaded through every call so
        # that they accumulate across the per-sample runs.
        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
                identify_chimeras_usearch61(curr_fasta, output_dir,
                                            reference_seqs_fp, suppress_usearch61_intermediates,
                                            suppress_usearch61_ref, suppress_usearch61_denovo,
                                            non_chimeras_retention, usearch61_minh, usearch61_xn,
                                            usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                            usearch61_abundance_skew, percent_id_usearch61, minlen,
                                            word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                            log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
            identify_chimeras_usearch61(input_seqs_fp, output_dir,
                                        reference_seqs_fp, suppress_usearch61_intermediates,
                                        suppress_usearch61_ref, suppress_usearch61_denovo,
                                        non_chimeras_retention, usearch61_minh, usearch61_xn,
                                        usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                                        word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
                                        log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(log_fp, input_seqs_fp, output_dir,
                        reference_seqs_fp, suppress_usearch61_intermediates,
                        suppress_usearch61_ref, suppress_usearch61_denovo,
                        split_by_sampleid, non_chimeras_retention, usearch61_minh,
                        usearch61_xn, usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
                        usearch61_abundance_skew, percent_id_usearch61, minlen,
                        word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    # Both output files are opened up front (as before) but inside
    # with-blocks, so the handles are closed even if a write fails.
    with open(chimeras_fp, "w") as chimeras_f:
        with open(non_chimeras_fp, "w") as non_chimeras_f:
            for curr_chimera in chimeras:
                chimeras_f.write("%s\n" % curr_chimera)
            for curr_non_chimera in non_chimeras:
                non_chimeras_f.write("%s\n" % curr_non_chimera)

    remove_files(files_to_remove)