Example #1
    def test_build_blast_db_from_seqs(self):
        """build_blast_db_from_seqs convenience function works as expected
        """
        blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1,
                                                      output_dir='/tmp')
        self.assertTrue(blast_db.startswith('/tmp/Blast_tmp_db'))
        self.assertTrue(blast_db.endswith('.fasta'))
        expected_db_files = set([blast_db + ext\
         for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
        self.assertEqual(set(db_files), expected_db_files)

        # result returned when blasting against new db
        self.assertEqual(\
            len(blastn(self.test_seq,blast_db=blast_db)),1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
    def test_generate_new_otus_stats(self):
        """Test generating new OTU stats on valid input data."""
        exp = [('New.CleanUp.ReferenceOTU972',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC', 'foo;bar;baz', 2, 60.0,
            4.349999999999994, {'Env2': 25.0, 'Env1': 35.0}),
            ('New.CleanUp.ReferenceOTU969',
            'ATACGTAGGTCCCGAGCGTTGTCCGGATTTACTGGGTGTAAAGGGAGCGTAGACGGCATGGCAAG'
            'TCTGAAGTGAAAACCCAGGGCTCAACCCTGGGACTGC', 'foo;bar;baz', 2, 14.0,
            12.5, {'Env2': 8.0, 'Env1': 6.0}), ('New.CleanUp.ReferenceOTU964',
            'ATACGGAGGATGCGAGCGTTATCCGGATTTATTGGGTTTAAAGGGTGCGTAGACGGCGAAGCAAG'
            'TCTGAAGTGAAAGCCCGGGGCTCAACCGCGGGACTGC', 'foo;bar;baz', 2, 5.0,
            14.769999999999996, {'Env2': 2.0, 'Env1': 3.0}),
            ('New.CleanUp.ReferenceOTU999',
            'ATACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGGGTGCGTAGGCGGATGTTTAAG'
            'TGGGATGTGAAATCCCCGGGCTTAACCTGGGGGCTGC', 'foo;bar;bazz', 1, 99.0,
            4.349999999999994, {'Env2': 0.0, 'Env1': 99.0})]

        ref_seqs_db, ref_seqs_db_files_to_remove = \
            build_blast_db_from_fasta_path(self.ref_seqs_f.name)
        obs = _generate_new_otus_stats(self.otu_table_f, self.rep_set_f,
                self.ref_seqs_f, ref_seqs_db, self.mapping_f,
                self.grouping_category, self.top_n)
        remove_files(ref_seqs_db_files_to_remove)

        self.assertFloatEqual(obs, exp)
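
Every snippet on this page exercises the same helper. As a rough mental model (a minimal sketch assuming it mirrors the PyCogent utility; the shipped implementation may differ), remove_files deletes each path and either raises on missing files or, with error_on_missing=False, skips them silently:

from os import remove
from os.path import exists

def remove_files(paths, error_on_missing=True):
    # delete what exists, remember what doesn't
    missing = []
    for fp in paths:
        if exists(fp):
            remove(fp)
        else:
            missing.append(fp)
    # missing files are an error unless the caller opted out
    if error_on_missing and missing:
        raise OSError('Some filepaths were not accessible: %s'
                      % ', '.join(missing))
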
Example #5
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files
            # result in equivalent seq collections
            self.assertEqual(
                LoadSeqs(data=infile, aligned=False),
                LoadSeqs(data=actual_seqs, aligned=False))
Example #6
    def tearDown(self):
        if self._files_to_remove:
            remove_files(self._files_to_remove)
        if exists(self.output_dir):
            rmtree(self.output_dir)
        if exists(self.input_dir):
            rmtree(self.input_dir)

    def tearDown(self):
        remove_files(set(self.files_to_remove))
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
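
The tearDown variants above all rely on a setUp that registers whatever it creates. A minimal sketch of that convention (the files_to_remove/dirs_to_remove attribute names match the examples; the created paths are purely illustrative):

from os import close
from tempfile import mkdtemp, mkstemp
from unittest import TestCase

class ExampleTests(TestCase):
    def setUp(self):
        # every path created here is registered for cleanup in tearDown
        self.files_to_remove = []
        self.dirs_to_remove = []

        self.output_dir = mkdtemp()
        self.dirs_to_remove.append(self.output_dir)

        fd, self.input_fp = mkstemp(suffix='.fasta')
        close(fd)
        self.files_to_remove.append(self.input_fp)
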
Example #8
    def test_plot_heatmap(self):
        plot_heatmap(self.otu_table,
                     self.otu_table.ObservationIds,
                     self.otu_table.SampleIds,
                     filename=self.tmp_heatmap_fpath)
        self.assertTrue(exists(self.tmp_heatmap_fpath))
        remove_files(set([self.tmp_heatmap_fpath]))
Example #9
    def test_mothur_supported_version(self):
        """mothur is in path and version is supported """
        acceptable_version = (1, 25, 0)
        self.assertTrue(
            which('mothur'),
            "mothur not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.")
        # mothur writes a log file to the cwd by default, so direct it to the
        # qiime temp dir via set.logfile instead
        log_file = join(get_qiime_temp_dir(), 'mothur.log')
        command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
        stdout, stderr, exit_status = qiime_system_call(command)

        # remove log file
        remove_files([log_file], error_on_missing=False)

        version_string = stdout.strip().split(' ')[1].strip('v.')
        try:
            version = tuple(map(int, version_string.split('.')))
            pass_test = version == acceptable_version
        except ValueError:
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported mothur version. %s is required, but running %s." %
            ('.'.join(map(str, acceptable_version)), version_string))
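
For reference, the parsing step above reduces mothur's banner line to a comparable tuple; assuming a banner such as 'mothur v.1.25.0':

stdout = 'mothur v.1.25.0'
version_string = stdout.strip().split(' ')[1].strip('v.')  # '1.25.0'
version = tuple(map(int, version_string.split('.')))       # (1, 25, 0)
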
Example #10
    def test_ParsInsert_supported_version(self):
        """ParsInsert is in path and version is supported """
        acceptable_version = ["1.04"]
        self.assertTrue(
            which('ParsInsert'),
            "ParsInsert not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.")
        command = "ParsInsert -v | grep App | awk '{print $3}'"
        proc = Popen(command,
                     shell=True,
                     universal_newlines=True,
                     stdout=PIPE,
                     stderr=STDOUT)
        stdout = proc.stdout.read()

        # remove log file generated
        remove_files(['ParsInsert.log'], error_on_missing=False)

        version_string = stdout.strip()
        try:
            pass_test = version_string in acceptable_version
        except ValueError:
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported ParsInsert version. %s is required, but running %s." %
            ('.'.join(map(str, acceptable_version)), version_string))
Example #11
    def test_split_fasta_diff_num_seqs_per_file_alt(self):
        """split_fasta funcs always catches all seqs
        """
        # start with 59 seqs (b/c it's prime, so should make more
        # confusing splits)
        in_seqs = LoadSeqs(data=[('seq%s' % k, 'AACCTTAA') for k in range(59)])
        infile = in_seqs.toFasta().split('\n')

        # test seqs_per_file from 1 to 1000
        for i in range(1, 1000):
            filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')

            actual = split_fasta(infile, i, filename_prefix)

            actual_seqs = []
            for fp in actual:
                actual_seqs += list(open(fp))
            # remove the files now, so if the test fails they still get
            # cleaned up
            remove_files(actual)

            # building seq collections from infile and the split files result in
            # equivalent seq collections
            self.assertEqual(\
             LoadSeqs(data=infile,aligned=False),\
             LoadSeqs(data=actual_seqs,aligned=False))
Example #12
def check_options(parser, options):
    """Check to insure required options have been supplied"""
    if options.percent_aligned > 1.0:
        parser.error(\
            "Please check -p option: should be between 0.0(0%) and 1.0(100%)")

    if options.querydb is None:
        parser.error(\
                "Please check -i option: must specify path to a FASTA file")
    try:
        f = open(options.querydb, 'r')
        f.close()
    except IOError:
        parser.error(\
                "Please check -i option: cannot read from query FASTA filepath")
    if options.subjectdb is None:
        parser.error(\
                "Please check -d option: must specify path to a FASTA file")
    try:
        f = open(options.subjectdb, 'r')
        f.close()
    except IOError:
        parser.error(\
              "Please check -d option: cannot read from subject FASTA filepath")
    if options.outputfilename is None:
        parser.error(\
                "Please check -o option: must specify base output path")
    try:
        f = open(options.outputfilename, 'w')
        f.close()
        remove_files([FilePath(options.outputfilename)])
    except IOError:
        parser.error(\
              "Please check -o option: cannot write to output file")
Example #13
    def test_ParsInsert_supported_version(self):
        """ParsInsert is in path and version is supported """
        acceptable_version = ["1.04"]
        self.assertTrue(
            which("ParsInsert"),
            "ParsInsert not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.",
        )
        command = "ParsInsert -v | grep App | awk '{print $3}'"
        proc = Popen(command, shell=True, universal_newlines=True, stdout=PIPE, stderr=STDOUT)
        stdout = proc.stdout.read()

        # remove log file generated
        remove_files(["ParsInsert.log"], error_on_missing=False)

        version_string = stdout.strip()
        try:
            pass_test = version_string in acceptable_version
        except ValueError:
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported ParsInsert version. %s is required, but running %s."
            % (".".join(map(str, acceptable_version)), version_string),
        )
Example #14
    def test_mothur_supported_version(self):
        """mothur is in path and version is supported """
        acceptable_version = (1, 25, 0)
        self.assertTrue(
            which("mothur"),
            "mothur not found. This may or may not be a problem depending on "
            + "which components of QIIME you plan to use.",
        )
        # mothur writes a log file to the cwd by default, so direct it to the
        # qiime temp dir via set.logfile instead
        log_file = join(get_qiime_temp_dir(), "mothur.log")
        command = "mothur \"#set.logfile(name=%s)\" | grep '^mothur v'" % log_file
        stdout, stderr, exit_status = qiime_system_call(command)

        # remove log file
        remove_files([log_file], error_on_missing=False)

        version_string = stdout.strip().split(" ")[1].strip("v.")
        try:
            version = tuple(map(int, version_string.split(".")))
            pass_test = version == acceptable_version
        except ValueError:
            pass_test = False
            version_string = stdout
        self.assertTrue(
            pass_test,
            "Unsupported mothur version. %s is required, but running %s."
            % (".".join(map(str, acceptable_version)), version_string),
        )
Example #15
def check_options(parser, options):
    """Check to ensure required options have been supplied"""
    if options.percent_aligned > 1.0:
        parser.error(
            "Please check -p option: should be between 0.0(0%) and 1.0(100%)")

    if options.querydb is None:
        parser.error(
            "Please check -i option: must specify path to a FASTA file")
    try:
        f = open(options.querydb, 'r')
        f.close()
    except IOError:
        parser.error(
            "Please check -i option: cannot read from query FASTA filepath")
    if options.subjectdb is None:
        parser.error(
            "Please check -d option: must specify path to a FASTA file")
    try:
        f = open(options.subjectdb, 'r')
        f.close()
    except IOError:
        parser.error(
            "Please check -d option: cannot read from subject FASTA filepath")
    if options.outputfilename is None:
        parser.error(
            "Please check -o option: must specify base output path")
    try:
        f = open(options.outputfilename, 'w')
        f.close()
        remove_files([FilePath(options.outputfilename)])
    except IOError:
        parser.error(
            "Please check -o option: cannot write to output file")
Example #16
def clean_up_raw_data_files(raw_data_files, raw_data_dirs):
    for raw_data_fp_glob in raw_data_files:
        remove_files(glob(raw_data_fp_glob))

    for raw_data_dir_glob in raw_data_dirs:
        for dir_to_remove in glob(raw_data_dir_glob):
            rmtree(dir_to_remove)
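
Both arguments are lists of glob patterns rather than literal paths, so one call can sweep up whole families of run outputs. A hypothetical call (the patterns are illustrative only):

clean_up_raw_data_files(
    raw_data_files=['/tmp/run1/*.sff', '/tmp/run1/*.qual'],
    raw_data_dirs=['/tmp/run1/sff_dir_*'])
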
Example #17
def qiime_blast_seqs(seqs,
                     blast_constructor=Blastall,
                     blast_program='blastn',
                     blast_db=None,
                     refseqs=None,
                     refseqs_fp=None,
                     blast_mat_root=None,
                     params={},
                     WorkingDir=None,
                     seqs_per_blast_run=1000,
                     HALT_EXEC=False):
    """Blast list of sequences.

    seqs: a list (or object with list-like interface) of (seq_id, seq)
     tuples (e.g., the output of MinimalFastaParser)
    
    """
    assert blast_db or refseqs_fp or refseqs, \
     'Must provide either a blast_db or a fasta '+\
     'filepath containing sequences to build one.'

    if refseqs_fp:
        blast_db, db_files_to_remove =\
         build_blast_db_from_fasta_path(refseqs_fp,output_dir=WorkingDir)
    elif refseqs:
        blast_db, db_files_to_remove =\
         build_blast_db_from_fasta_file(refseqs,output_dir=WorkingDir)
    else:
        db_files_to_remove = []

    params["-d"] = blast_db
    params["-p"] = blast_program

    blast_app = blast_constructor(params=params,
                                  blast_mat_root=blast_mat_root,
                                  InputHandler='_input_as_seq_id_seq_pairs',
                                  WorkingDir=WorkingDir,
                                  SuppressStderr=True,
                                  HALT_EXEC=HALT_EXEC)

    current_seqs = []
    blast_results = BlastResult([])
    for seq in seqs:
        current_seqs.append(seq)
        if len(current_seqs) % seqs_per_blast_run == 0:
            if blast_results:
                blast_results.update(\
                 BlastResult(blast_app(current_seqs)['StdOut']))
            else:
                blast_results = BlastResult(blast_app(current_seqs)['StdOut'])
            current_seqs = []

    # clean-up run: blast the remaining sequences
    blast_results.update(\
     BlastResult(blast_app(current_seqs)['StdOut']))

    remove_files(db_files_to_remove)

    return blast_results
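
qiime_blast_seqs batches its input seqs_per_blast_run at a time, merges the per-batch BlastResult objects, and removes any database it built on the fly. A hypothetical call (path and sequences are illustrative):

seqs = [('q1', 'ACCTTGGAACC'), ('q2', 'TTGGACCTTGG')]  # (seq_id, seq) pairs
blast_results = qiime_blast_seqs(seqs, refseqs_fp='/path/to/refseqs.fasta')
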
Example #18
    def check_write_load(self, orig_data):
        orig = RegionCollection(**orig_data)
        orig.writeToFile('test_data')
        recovered = RegionCollection(filename='test_data')
        for key in orig_data:
            self.assertEqual(getattr(recovered, key), orig_data[key])

        remove_files(['test_data'], error_on_missing=False)
Example #19
    def tearDown(self):
        """Clean up tmp files."""
        remove_files(self.files_to_remove, False)
        if self.tmpdir:
            rmtree(self.tmpdir)

        # clean up the file from init_flowgram_file
        if hasattr(self, "tmp_filename") and exists(self.tmp_filename):
            remove(self.tmp_filename)
Example #20
    def tearDown(self):
        """ """
        disable_timeout()
        remove_files(self.files_to_remove, error_on_missing=False)
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #21
    def tearDown(self):
        """Removes temporary directories and files."""
        remove_files(self.files_to_remove)

        # Remove directories last, so we don't get errors trying to remove
        # files which may be in the directories.
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #22
    def tearDown(self):
        """Clean up tmp files."""
        remove_files(self.files_to_remove, False)
        if self.tmpdir:
            rmtree(self.tmpdir)

        # clean up the file from init_flowgram_file
        if hasattr(self, "tmp_filename") and exists(self.tmp_filename):
            remove(self.tmp_filename)
    def tearDown(self):
        """ """
        # turn off the alarm
        signal.alarm(0)
        remove_files(self.files_to_remove)
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #25
    def tearDown(self):
        """Remove temporary files/dirs created by tests."""
        # Change back to the start dir - some workflows change directory.
        chdir(self.start_dir)

        remove_files(self.files_to_remove)
        # Remove directories last, so we don't get errors trying to remove
        # files which may be in the directories.
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #26
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 blast_db=None,
                 refseqs_fp=None):

        self.log_lines = []

        if not blast_db:
            self.blast_db, self.db_files_to_remove = \
                build_blast_db_from_fasta_path(refseqs_fp)
            self.log_lines.append('Reference seqs fp (to build blast db): %s'%\
             refseqs_fp)
        else:
            self.blast_db = blast_db
            self.db_files_to_remove = []

        self.log_lines.append('Blast database: %s' % self.blast_db)

        clusters, failures = self._cluster_seqs(\
         MinimalFastaParser(open(seq_path)))
        self.log_lines.append('Num OTUs: %d' % len(clusters))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            of = open(result_path, 'w')
            for cluster_id, cluster in clusters.items():
                of.write('%s\t%s\n' % (cluster_id, '\t'.join(cluster)))
            of.close()
            result = None
            self.log_lines.append('Result path: %s\n' % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = clusters
            self.log_lines.append('Result path: None, returned as dict.')

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, 'w')
            self.log_lines = [str(self)] + self.log_lines
            log_file.write('\n'.join(self.log_lines))
            failures.sort()
            log_file.write('Num failures: %d\n' % len(failures))
            log_file.write('Failures: %s\n' % '\t'.join(failures))

        remove_files(self.db_files_to_remove, error_on_missing=False)
        # return the result (note this is None if the data was
        # written to file)
        return result
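
This __call__ belongs to an OTU-picker class whose definition is not shown; assuming QIIME's blast-based picker (the class name and constructor parameter below are assumptions), omitting result_path returns the clusters as a dict:

picker = BlastOtuPicker(params={'max_e_value': 1e-10})  # assumed constructor
clusters = picker('/tmp/seqs.fasta', refseqs_fp='/tmp/refs.fasta')
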
Example #27
    def test_export_table(self):
        """correctly generates table file"""
        orig_data = dict(counts=[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]],
                         ranks=[0, 1, 2, 3, 4],
                         labels=['a', 'b', 'c', 'd', 'e'])
        coll = RegionCollection(**orig_data)

        expect = coll.toTable().getRawData()
        coll.writeToFile('testdata', as_table=True)
        got = LoadTable('testdata', sep='\t')
        self.assertEqual(got.getRawData(), expect)
        remove_files(['testdata'], error_on_missing=False)
Example #28
    def tearDown(self):

        disable_timeout()

        # reset sys.stderr
        sys.stderr = self.saved_stderr

        remove_files(self.files_to_remove)
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #29
    def test_tree_collection_read_write_file(self):
        """should correctly read / write a collection from a file"""
        def eval_klass(coll):
            coll.writeToFile('sample.trees')
            read = LoadTrees('sample.trees')
            self.assertTrue(type(read) == type(coll))

        eval_klass(LogLikelihoodScoredTreeCollection(self.scored_trees))

        # convert lnL into p
        eval_klass(WeightedTreeCollection([(exp(s), t)
                                           for s, t in self.scored_trees]))
        remove_files(['sample.trees'], error_on_missing=False)
Example #30
    def tearDown(self):
        """ """
        disable_timeout()

        # change back to the start dir - some workflows change directory
        chdir(self.start_dir)

        remove_files(self.files_to_remove)
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #31
    def test_tree_collection_read_write_file(self):
        """should correctly read / write a collection from a file"""
        def eval_klass(coll):
            coll.writeToFile('sample.trees')
            read = LoadTrees('sample.trees')
            self.assertTrue(type(read) == type(coll))

        eval_klass(LogLikelihoodScoredTreeCollection(self.scored_trees))

        # convert lnL into p
        eval_klass(WeightedTreeCollection([(exp(s), t)
                                           for s, t in self.scored_trees]))
        remove_files(['sample.trees'], error_on_missing=False)
    def tearDown(self):
        """ """
        disable_timeout()

        # change back to the start dir - some workflows change directory
        chdir(self.start_dir)

        remove_files(self.files_to_remove)
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #33
    def tearDown(self):
        """ """
        disable_timeout()

        # reset sys.stderr
        sys.stderr = self.saved_stderr

        remove_files(self.files_to_remove)
        # remove directories last, so we don't get errors
        # trying to remove files which may be in the directories
        for d in self.dirs_to_remove:
            if exists(d):
                rmtree(d)
Example #34
def remove_pyronoise_intermediates(basename):
    """Removes all intermediate pyronoise files.

    basename: the prefix for all files.
    """

    remove_files(map(lambda a: basename+a, 
                     [ ".otu", ".tau", "_cd.fa", ".fdist", ".pout", 
                       ".tree", "_cd2.fa", ".cen", ".list", ".qual", ".z",
                       "_cf.fa"]), True)
    for filename in listdir(basename):
        remove(basename+"/"+filename)
    rmdir(basename)
Example #36
    def remove_intermediate_files(self):
        """Remove all intermediate files."""

        # tmp files are written in the current dir,
        # app controller always jumps into dir specified via exec_dir
        # Note: blast intermediates are not removed
        exec_dir = str(self.Parameters['--exec_dir'].Value)
        inp_file_name = str(self.Parameters['--query_NAST'].Value)

        exec_dir = exec_dir.rstrip('"')
        exec_dir = exec_dir.lstrip('"')

        inp_file_name = inp_file_name.rstrip('"')
        inp_file_name = inp_file_name.lstrip('"')

        tmp_suffixes = [".CPS", ".CPS.CPC", ".CPS_RENAST", ".CPS_RENAST.cidx",
                        ".CPS.CPC.wTaxons", ".cidx"]
        cs_tmp_files = [
            exec_dir +
            '/' +
            inp_file_name +
            x for x in tmp_suffixes]
        remove_files(cs_tmp_files, error_on_missing=False)

        db_param = self.Parameters['--db_NAST']
        if db_param.isOn():
            nast_db_name = str(db_param.Value)
            nast_db_name = nast_db_name.rstrip('"')
            nast_db_name = nast_db_name.lstrip('"')

            # Better do not remove this file since other ChimeraSlayer
            # instances running on the same ref set might use this file
            # Should be rather deleted in the calling function
#            remove_files([nast_db_name + ".cidx"],
#                         error_on_missing=False)

        fasta_param = self.Parameters['--db_FASTA']
        if fasta_param.isOn():
            fasta_name = str(fasta_param.Value)
            fasta_name = fasta_name.rstrip('"')
            fasta_name = fasta_name.lstrip('"')

            blast_db_files = [
                fasta_name +
                x for x in [
                    ".nsq",
                    ".nin",
                    ".nhr",
                    ".cidx"]]
            remove_files(blast_db_files, error_on_missing=False)
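
A design note: the chained concatenation above spells out exec_dir + '/' + name one operand per line; os.path.join expresses the same paths more directly (a behavior-equivalent sketch for the quote-stripped values):

from os.path import join

cs_tmp_files = [join(exec_dir, inp_file_name + suffix)
                for suffix in tmp_suffixes]
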
Example #37
    def __call__(self, seq_path, result_path=None, log_path=None, blast_db=None, refseqs_fp=None):

        self.log_lines = []

        if not blast_db:
            self.blast_db, self.db_files_to_remove = build_blast_db_from_fasta_path(refseqs_fp)
            self.log_lines.append("Reference seqs fp (to build blast db): %s" % refseqs_fp)
        else:
            self.blast_db = blast_db
            self.db_files_to_remove = []

        self.log_lines.append("Blast database: %s" % self.blast_db)

        clusters, failures = self._cluster_seqs(MinimalFastaParser(open(seq_path)))
        self.log_lines.append("Num OTUs: %d" % len(clusters))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            of = open(result_path, "w")
            for cluster_id, cluster in clusters.items():
                of.write("%s\t%s\n" % (cluster_id, "\t".join(cluster)))
            of.close()
            result = None
            self.log_lines.append("Result path: %s\n" % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary
            result = clusters
            self.log_lines.append("Result path: None, returned as dict.")

        if log_path:
            # if the user provided a log file path, log the run
            log_file = open(log_path, "w")
            self.log_lines = [str(self)] + self.log_lines
            log_file.write("\n".join(self.log_lines))
            failures.sort()
            log_file.write("Num failures: %d\n" % len(failures))
            log_file.write("Failures: %s\n" % "\t".join(failures))

        remove_files(self.db_files_to_remove, error_on_missing=False)
        # return the result (note this is None if the data was
        # written to file)
        return result
Example #38
    def tearDown(self):
        """Clean up tmp files."""

        # turn off the alarm
        signal.alarm(0)

        remove_files(self.files_to_remove, False)
        if self.server_socket:
            self.server_socket.close()
        # give clients time to clean up
        sleep(1)
        if exists(self.tmp_dir):
            try:
                rmdir(self.tmp_dir)
            except OSError:
                # give clients some more time, fail if still error
                sleep(5)
                rmdir(self.tmp_dir)
Example #40
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
Example #41
    def test_split_fasta_equal_num_seqs_per_file(self):
        """split_fasta funcs as expected when equal num seqs go to each file
        """
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        infile = ['>seq1','AACCTTAA','>seq2','TTAACC','AATTAA',\
         '>seq3','CCTT--AA']

        actual = split_fasta(infile, 1, filename_prefix)
        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(3)]

        self.assertEqual(actual, expected)
        self.assertEqual(\
         LoadSeqs(data=infile,aligned=False),\
         LoadSeqs(data=actual_seqs,aligned=False))
Example #42
    def test_build_blast_db_from_fasta_path_aln(self):
        """build_blast_db_from_fasta_path works with alignment as input
        """
        blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
        self.assertEqual(blast_db, self.in_aln1_fp)
        expected_db_files = set([blast_db + ext for ext in [".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log"]])
        self.assertEqual(set(db_files), expected_db_files)
        # result returned when blasting against new db
        self.assertEqual(len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #43
    def test_compute_seqs_per_file(self):
        """compute_seqs_per_file functions as expected
        """
        temp_fasta_fp = get_tmp_filename(prefix="QiimeScriptUtilTests", suffix=".fasta")
        temp_fasta = [">seq", "AAACCCCAAATTGG"] * 25
        open(temp_fasta_fp, "w").write("\n".join(temp_fasta))

        actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
        actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
        actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
        actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
        actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)

        remove_files([temp_fasta_fp])

        self.assertEqual(actual_25, 1)
        self.assertEqual(actual_2, 13)
        self.assertEqual(actual_10, 3)
        self.assertEqual(actual_5, 5)
        self.assertEqual(actual_40, 1)
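
The expected values are consistent with ceiling division: 25 sequences over 2 jobs gives 13 per file, over 10 jobs gives 3, and 25 or more jobs gives 1 each. A sketch of _compute_seqs_per_file under that assumption (not the original implementation):

from math import ceil

def compute_seqs_per_file(fasta_fp, num_files):
    # count fasta records, then spread them as evenly as possible
    num_seqs = sum(1 for line in open(fasta_fp) if line.startswith('>'))
    return int(ceil(num_seqs / float(num_files)))
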
Example #44
    def test_compute_seqs_per_file(self):
        """compute_seqs_per_file functions as expected
        """
        temp_fasta_fp = get_tmp_filename(
            prefix='QiimeScriptUtilTests', suffix='.fasta')
        temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
        open(temp_fasta_fp, 'w').write('\n'.join(temp_fasta))

        actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
        actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
        actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
        actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
        actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)

        remove_files([temp_fasta_fp])

        self.assertEqual(actual_25, 1)
        self.assertEqual(actual_2, 13)
        self.assertEqual(actual_10, 3)
        self.assertEqual(actual_5, 5)
        self.assertEqual(actual_40, 1)
Example #45
    def test_build_blast_db_from_fasta_path(self):
        """build_blast_db_from_fasta_path convenience function works as expected
        """
        blast_db, db_files = build_blast_db_from_fasta_path(self.in_seqs1_fp)
        self.assertEqual(blast_db, self.in_seqs1_fp)
        expected_db_files = set([self.in_seqs1_fp + ext for ext in [".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log"]])
        self.assertEqual(set(db_files), expected_db_files)

        # result returned when blasting against new db
        self.assertEqual(len(blastn(self.test_seq, blast_db=blast_db)), 1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #46
    def test_compute_seqs_per_file(self):
        """compute_seqs_per_file functions as expected
        """
        temp_fasta_fp = get_tmp_filename(\
         prefix='QiimeScriptUtilTests',suffix='.fasta')
        temp_fasta = ['>seq', 'AAACCCCAAATTGG'] * 25
        open(temp_fasta_fp, 'w').write('\n'.join(temp_fasta))

        actual_25 = self.pw._compute_seqs_per_file(temp_fasta_fp, 25)
        actual_2 = self.pw._compute_seqs_per_file(temp_fasta_fp, 2)
        actual_10 = self.pw._compute_seqs_per_file(temp_fasta_fp, 10)
        actual_5 = self.pw._compute_seqs_per_file(temp_fasta_fp, 5)
        actual_40 = self.pw._compute_seqs_per_file(temp_fasta_fp, 40)

        remove_files([temp_fasta_fp])

        self.assertEqual(actual_25, 1)
        self.assertEqual(actual_2, 13)
        self.assertEqual(actual_10, 3)
        self.assertEqual(actual_5, 5)
        self.assertEqual(actual_40, 1)
Example #47
    def test_build_blast_db_from_fasta_path_aln(self):
        """build_blast_db_from_fasta_path works with alignment as input
        """
        blast_db, db_files = build_blast_db_from_fasta_path(self.in_aln1_fp)
        self.assertEqual(blast_db, self.in_aln1_fp)
        expected_db_files = set([blast_db + ext\
         for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
        self.assertEqual(set(db_files), expected_db_files)
        # result returned when blasting against new db
        self.assertEqual(\
            len(blastn(self.test_seq,blast_db=blast_db,e_value=0.0)),1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #48
    def test_build_blast_db_from_seqs(self):
        """build_blast_db_from_seqs convenience function works as expected
        """
        blast_db, db_files = build_blast_db_from_seqs(self.in_seqs1, output_dir="/tmp")
        self.assertTrue(blast_db.startswith("/tmp/Blast_tmp_db"))
        self.assertTrue(blast_db.endswith(".fasta"))
        expected_db_files = set([blast_db + ext for ext in [".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log"]])
        self.assertEqual(set(db_files), expected_db_files)

        # result returned when blasting against new db
        self.assertEqual(len(blastn(self.test_seq, blast_db=blast_db)), 1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #49
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        infile = ['>seq1','AACCTTAA','>seq2','TTAACC','AATTAA',\
         '>seq3','CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(\
         LoadSeqs(data=infile,aligned=False),\
         LoadSeqs(data=actual_seqs,aligned=False))
Example #50
    def test_split_fasta_diff_num_seqs_per_file(self):
        """split_fasta funcs as expected when diff num seqs go to each file
        """
        filename_prefix = get_random_job_prefix(fixed_prefix='/tmp/')
        infile = ['>seq1', 'AACCTTAA', '>seq2', 'TTAACC', 'AATTAA',
                  '>seq3', 'CCTT--AA']

        actual = split_fasta(infile, 2, filename_prefix)

        actual_seqs = []
        for fp in actual:
            actual_seqs += list(open(fp))
        remove_files(actual)

        expected = ['%s.%d.fasta' % (filename_prefix, i) for i in range(2)]
        # list of file paths is as expected
        self.assertEqual(actual, expected)
        # building seq collections from infile and the split files result in
        # equivalent seq collections
        self.assertEqual(
            LoadSeqs(data=infile, aligned=False),
            LoadSeqs(data=actual_seqs, aligned=False))
Example #51
    def test_build_blast_db_from_fasta_file(self):
        """build_blast_db_from_fasta_file works with open files as input
        """
        blast_db, db_files = build_blast_db_from_fasta_file(open(self.in_aln1_fp), output_dir="/tmp/")
        self.assertTrue(blast_db.startswith("/tmp/BLAST_temp_db"))
        self.assertTrue(blast_db.endswith(".fasta"))
        expected_db_files = set(
            [blast_db] + [blast_db + ext for ext in [".nhr", ".nin", ".nsq", ".nsd", ".nsi", ".log"]]
        )
        self.assertEqual(set(db_files), expected_db_files)
        # result returned when blasting against new db
        self.assertEqual(len(blastn(self.test_seq, blast_db=blast_db, e_value=0.0)), 1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #52
    def test_build_blast_db_from_fasta_file(self):
        """build_blast_db_from_fasta_file works with open files as input
        """
        blast_db, db_files = \
         build_blast_db_from_fasta_file(open(self.in_aln1_fp),output_dir='/tmp/')
        self.assertTrue(blast_db.startswith('/tmp/BLAST_temp_db'))
        self.assertTrue(blast_db.endswith('.fasta'))
        expected_db_files = set([blast_db] + [blast_db + ext\
         for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
        self.assertEqual(set(db_files), expected_db_files)
        # result returned when blasting against new db
        self.assertEqual(\
            len(blastn(self.test_seq,blast_db=blast_db,e_value=0.0)),1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #53
    def test_build_blast_db_from_fasta_path(self):
        """build_blast_db_from_fasta_path convenience function works as expected
        """
        blast_db, db_files = \
         build_blast_db_from_fasta_path(self.in_seqs1_fp)
        self.assertEqual(blast_db, self.in_seqs1_fp)
        expected_db_files = set([self.in_seqs1_fp + ext\
         for ext in ['.nhr','.nin','.nsq','.nsd','.nsi','.log']])
        self.assertEqual(set(db_files), expected_db_files)

        # result returned when blasting against new db
        self.assertEqual(\
            len(blastn(self.test_seq,blast_db=blast_db)),1)

        # Make sure all db_files exist
        for fp in db_files:
            self.assertTrue(exists(fp))

        # Remove all db_files
        remove_files(db_files)

        # Make sure nothing weird happened in the remove
        for fp in db_files:
            self.assertFalse(exists(fp))
Example #54
    def tearDown(self):
        for dir in self.dirs_to_remove:
            if exists(dir):
                rmdir(dir)
        remove_files(self.files_to_remove)
Example #55
    def tearDown(self):
        remove_files(self.files_to_remove)
Example #56
def get_clusters_from_fasta_filepath(fasta_filepath,
                                     original_fasta_path,
                                     percent_ID=0.97,
                                     max_accepts=1,
                                     max_rejects=8,
                                     stepwords=8,
                                     word_length=8,
                                     optimal=False,
                                     exact=False,
                                     suppress_sort=False,
                                     output_dir=None,
                                     enable_rev_strand_matching=False,
                                     subject_fasta_filepath=None,
                                     suppress_new_clusters=False,
                                     return_cluster_maps=False,
                                     stable_sort=False,
                                     save_uc_files=True,
                                     HALT_EXEC=False):
    """ Main convenience wrapper for using uclust to generate cluster files
    
    A source fasta file is required for the fasta_filepath.  This will be 
    sorted to be in order of longest to shortest length sequences.  Following
    this, the sorted fasta file is used to generate a cluster file in the
    uclust (.uc) format.  Next the .uc file is converted to cd-hit format
    (.clstr).  Finally this file is parsed and returned as a list of lists,
    where each sublist is a cluster of sequences.  If an output_dir is
    specified, the intermediate files will be preserved; otherwise all
    files created are temporary and will be deleted at the end of this
    function.

    The percent_ID parameter specifies the percent identity for a cluster,
    i.e., if 99% were the parameter, all sequences that were 99% identical
    would be grouped as a cluster.
    """

    # Create readable intermediate filenames if they are to be kept

    fasta_output_filepath = None
    uc_output_filepath = None
    cd_hit_filepath = None

    if output_dir and not output_dir.endswith('/'):
        output_dir += '/'

    if save_uc_files:
        uc_save_filepath = get_output_filepaths(output_dir,
                                                original_fasta_path)
    else:
        uc_save_filepath = None

    sorted_fasta_filepath = ""
    uc_filepath = ""
    clstr_filepath = ""

    # Error check in case any app controller fails
    files_to_remove = []
    try:
        if not suppress_sort:
            # Sort fasta input file from largest to smallest sequence
            sort_fasta = uclust_fasta_sort_from_filepath(
                fasta_filepath, output_filepath=fasta_output_filepath)

            # Get sorted fasta name from application wrapper
            sorted_fasta_filepath = sort_fasta['Output'].name
            files_to_remove.append(sorted_fasta_filepath)

        else:
            sort_fasta = None
            sorted_fasta_filepath = fasta_filepath

        # Generate uclust cluster file (.uc format)
        uclust_cluster = uclust_cluster_from_sorted_fasta_filepath(
            sorted_fasta_filepath,
            uc_save_filepath,
            percent_ID=percent_ID,
            max_accepts=max_accepts,
            max_rejects=max_rejects,
            stepwords=stepwords,
            word_length=word_length,
            optimal=optimal,
            exact=exact,
            suppress_sort=suppress_sort,
            enable_rev_strand_matching=enable_rev_strand_matching,
            subject_fasta_filepath=subject_fasta_filepath,
            suppress_new_clusters=suppress_new_clusters,
            stable_sort=stable_sort,
            HALT_EXEC=HALT_EXEC)
        # clustering succeeded; remove the sorted fasta intermediate
        remove_files(files_to_remove)
    except ApplicationError:
        remove_files(files_to_remove)
        raise ApplicationError, (
            'Error running uclust. Possible causes: an unsupported version '
            '(the currently supported version is v1.2.22) is installed, or '
            'an improperly formatted input file was provided.')
    except ApplicationNotFoundError:
        remove_files(files_to_remove)
        raise ApplicationNotFoundError('uclust not found, is it properly '+\
         'installed?')

    # Get list of lists for each cluster
    clusters, failures, seeds = \
     clusters_from_uc_file(uclust_cluster['ClusterFile'])

    # Remove temp files unless user specifies output filepath
    if not save_uc_files:
        uclust_cluster.cleanUp()

    if return_cluster_maps:
        return clusters, failures, seeds
    else:
        return clusters.values(), failures, seeds
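
A hypothetical end-to-end call (paths illustrative): sort the input, cluster at 97% identity, keep the .uc file under output_dir, and get the cluster maps back along with failures and seeds:

clusters, failures, seeds = get_clusters_from_fasta_filepath(
    '/tmp/seqs.fasta',
    original_fasta_path='/tmp/seqs.fasta',
    percent_ID=0.97,
    output_dir='/tmp/uclust_out',
    return_cluster_maps=True)
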
def pick_subsampled_open_reference_otus(input_fp, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])
    
    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp
    
    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])
            
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])
            
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            
        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])
        
        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
    
    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])
    
    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])
    
    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])
    
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures', 
                          step3_filter_fasta_cmd)])
        
        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
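        # Concatenation is sufficient here because the three maps partition
        # the reads: step1 reference hits, step3 recruits to the subsampled
        # de novo OTUs, and step4 de novo OTUs for the remaining failures.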
        # (use '>' rather than '>>' so a rerun overwrites rather than appends)
        cat_otu_maps_cmd = 'cat %s %s %s > %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_maps_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for step4 de novo OTUs',
                          step4_rep_set_cmd)])
        
    else:
        # Merge the otu maps
        # (use '>' rather than '>>' so a rerun overwrites rather than appends)
        cat_otu_maps_cmd = 'cat %s %s > %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_maps_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])
    
    command_handler(commands,
        status_update_callback,
        logger=logger,
        close_logger_on_success=False)
    commands = []
    
    otu_fp = merged_otu_map_fp
    # Filter OTUs with fewer than min_otu_size sequences from the OTU map
    # (when min_otu_size is 2, this removes singletons, hence the name below)
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)
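    # filter_otus_from_otu_map writes the filtered map to otu_no_singletons_fp
    # and returns the ids of the retained OTUs, which are used below to filter
    # the rep set and new refseqs files (a hypothetical sketch of this helper
    # appears at the end of this example).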
    
    ## Make the final representative sequences file and a new refseqs file
    ## that can be used in subsequent OTU picking runs.
    ## Two things make this clunky. First, both files must exclude OTUs that
    ## fall below min_otu_size, to match the filtered OTU map. Second, the
    ## two files have different requirements: the new reference set must be
    ## a superset of the reference set that was input to this run, while the
    ## rep set must contain only sequences that were observed in this data
    ## set. We also want the representatives for the step1 reference OTUs to
    ## be reads from this run, so we don't hit issues building a tree from
    ## sequences of very different lengths. So...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
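    # The newline guards against the copied refseqs file lacking a trailing
    # newline, which would otherwise fuse its last sequence with the first
    # appended header.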
    # iterate over all representative sequences from step2 and step4 and write 
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()
    
    # Prep the make_otu_table.py command
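    # make_otu_table.py builds a BIOM-format OTU table from the
    # min_otu_size-filtered OTU map.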
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
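            # --sc-separated tells biom that the taxonomy values are
            # semicolon-separated and should be split into lists.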
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
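            # The PyNAST failure ids are passed with negate_ids_to_keep=True,
            # so those OTUs are removed; the 0/inf bounds leave the count-
            # and sample-based filters effectively disabled.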
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
            
    if close_logger_on_success:
        logger.close()
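
# The sketch below is NOT part of the workflow above. It is a minimal,
# hypothetical re-implementation of what filter_otus_from_otu_map is assumed
# to do, based only on how it is used here: read a tab-delimited OTU map
# (otu_id<TAB>seq_id_1<TAB>seq_id_2...), write out only the OTUs with at
# least min_otu_size member sequences, and return the set of retained OTU
# ids. The real helper may differ in details.

def filter_otus_from_otu_map_sketch(input_map_fp, output_map_fp, min_otu_size):
    """Hypothetical sketch: drop OTUs with fewer than min_otu_size sequences."""
    otus_to_keep = set()
    out_f = open(output_map_fp, 'w')
    for line in open(input_map_fp, 'U'):
        fields = line.strip().split('\t')
        # fields[0] is the OTU id; the remainder are member sequence ids
        if len(fields) - 1 >= min_otu_size:
            otus_to_keep.add(fields[0])
            out_f.write(line)
    out_f.close()
    return otus_to_keep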