Ejemplo n.º 1
0
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
                ('Run summary data',
                log_fp,
                _index_headers['run_summary']))
    else:
        close_logger_on_success = False


    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
            ('Pre-filtered sequence identifiers '
             '(failed to hit reference at %1.1f%% identity)' % (float(prefilter_percent_id)*100),
                        prefilter_failures_list_fp,
                        _index_headers['sequences']))


            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')\n\n"' % (abspath(step1_failures_fasta_fp),
                                                    abspath(
                                                        step2_input_fasta_fp),
                                                    percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append(
        [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    index_links.append(
        ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))


    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp, step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(
                                                        otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write('# Write non-singleton otus representative sequences from ' +
                 'step 2 and step 4 to the final representative set and the new reference' +
                 ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []


    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
Ejemplo n.º 2
0
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = [
        'uclust_ref', 'usearch61_ref', 'sortmerna'
    ]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # used the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp, _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp, step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp, parallel, params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    logger.write('# Subsample the failures fasta file using API \n' +
                 'python -c "import qiime; qiime.util.subsample_fasta' +
                 '(\'%s\', \'%s\', \'%f\')\n\n"' %
                 (abspath(step1_failures_fasta_fp),
                  abspath(step2_input_fasta_fp), percent_subsample))

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

    # remove the indexed reference database from the dictionary of
    # parameters as it must be forced to build a new database
    # using the step2_repset_fasta_fp
    if reference_otu_picking_method == 'sortmerna':
        if 'sortmerna_db' in params['pick_otus']:
            del params['pick_otus']['sortmerna_db']

    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    index_links.append((
        'Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
        merged_otu_map_fp, _index_headers['otu_maps']))

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp,
             step3_failures_list_fp, step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp, _index_headers['otu_maps']))

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(('OTU representative sequences', final_repset_fp,
                        _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append((
        'New reference sequences (i.e., OTU representative sequences plus input '
        'reference sequences)', new_refseqs_fp, _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write(
        '# Write non-singleton otus representative sequences from ' +
        'step 2 and step 4 to the final representative set and the new reference'
        + ' set (%s and %s respectively)\n\n' %
        (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp, _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments'
            % min_otu_size, pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST' % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(('OTU phylogenetic tree', rep_set_tree_fp,
                            _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
Ejemplo n.º 3
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                                refseqs_fp=refseqs_fp,
                                                output_dir=iteration_output_dir,
                                                percent_subsample=percent_subsample,
                                                new_ref_set_id='.'.join(
                                                    [new_ref_set_id, str(i)]),
                                                command_handler=command_handler,
                                                params=params,
                                                qiime_config=qiime_config,
                                                run_assign_tax=False,
                                                run_align_and_tree=False,
                                                prefilter_refseqs_fp=prefilter_refseqs_fp,
                                                prefilter_percent_id=prefilter_percent_id,
                                                min_otu_size=min_otu_size,
                                                step1_otu_map_fp=step1_otu_map_fp,
                                                step1_failures_fasta_fp=step1_failures_fasta_fp,
                                                parallel=parallel,
                                                suppress_step4=suppress_step4,
                                                logger=logger,
                                                suppress_md5=suppress_md5,
                                                suppress_index_page=True,
                                                denovo_otu_picking_method=denovo_otu_picking_method,
                                                reference_otu_picking_method=reference_otu_picking_method,
                                                status_update_callback=status_update_callback)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' %
            (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
Ejemplo n.º 4
0
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with biom_open(align_and_tree_input_otu_table) as biom_file:
                table = Table.from_hdf5(biom_file)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()