def main():
    option_parser, opts, args = \
        parse_command_line_parameters(suppress_verbose=True, **script_info)
        
    input_dir = opts.input_dir
    parameter_fp = opts.parameter_fp
    read1_indicator = opts.read1_indicator
    read2_indicator = opts.read2_indicator
    match_barcodes = opts.match_barcodes
    barcode_indicator = opts.barcode_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only
    
    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name is enabled, "
            "--include_input_dir_path must also be enabled.")

    if parameter_fp:
        with open(parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['join_paired_ends'])
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)
    
    all_files = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']
    
    for root, dirs, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_files.append(abspath(join(root, fp)))
                    break
        
    pairs, bc_pairs = get_pairs(all_files, read1_indicator, read2_indicator,
        match_barcodes, barcode_indicator)

    commands = create_commands_jpe(pairs, output_dir,
        params_str, leading_text, trailing_text, include_input_dir_path,
        remove_filepath_in_name, match_barcodes, bc_pairs)
        
    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
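
# A minimal sketch (not QIIME's implementation) of the read pairing that
# get_pairs() above is assumed to perform: read 1 filepaths are matched to
# read 2 filepaths by swapping the read indicator substring. The function
# name, defaults, and behavior here are illustrative assumptions.
def pair_read_files(files, read1_indicator='_R1_', read2_indicator='_R2_'):
    """Map each read 1 filepath to its read 2 counterpart, if present."""
    file_set = set(files)
    pairs = {}
    for fp in files:
        if read1_indicator in fp:
            mate = fp.replace(read1_indicator, read2_indicator)
            if mate in file_set:
                pairs[fp] = mate
    return pairs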
def main():
    option_parser, opts, args = \
        parse_command_line_parameters(suppress_verbose=True, **script_info)
        
    input_dir = opts.input_dir
    paired_data = opts.paired_data
    parameter_fp = opts.parameter_fp
    read1_indicator = opts.read1_indicator
    read2_indicator = opts.read2_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only
    
    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name is enabled, "
            "--include_input_dir_path must also be enabled.")
            
    if parameter_fp:
        with open(parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['extract_barcodes'])
    else:
        params_dict = {}
        params_str = ""
    
    create_dir(output_dir)
                
    all_files = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']
    
    for root, dirs, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_files.append(abspath(join(root, fp)))
                    break

    if paired_data:
        all_files, bc_pairs = get_pairs(all_files, read1_indicator,
                                        read2_indicator)

    commands = create_commands_eb(all_files, paired_data, output_dir,
        params_str, leading_text, trailing_text, include_input_dir_path,
        remove_filepath_in_name)
        
    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
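
# Hedged sketch of the QIIME parameters-file convention the two wrappers
# above rely on: each line reads 'script_name:option value', which
# parse_qiime_parameters() turns into a two-level dict, and
# get_params_str() flattens one script's options into CLI flags. This
# helper is illustrative only, not QIIME's implementation.
def flatten_params(script_params):
    """e.g. {'pe_join_method': 'SeqPrep'} -> '--pe_join_method SeqPrep'"""
    return ' '.join('--%s %s' % (k, v)
                    for k, v in sorted(script_params.items()))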
def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = [
        'uclust_ref', 'usearch61_ref', 'sortmerna'
    ]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # prefilter, use the main refseqs_fp. this is useful if the user wants to
    # provide a smaller reference collection, or to use the input reference
    # collection when running in iterative mode (rather than an iteration's
    # new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp, _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(input_fp, step1_dir,
                                                 reference_otu_picking_method,
                                                 refseqs_fp, parallel, params,
                                                 logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # if the number of failure sequences is greater than the threshold,
    # continue to steps 2, 3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API\n'
                     'python -c "import qiime; qiime.util.subsample_fasta'
                     '(\'%s\', \'%s\', %f)"\n\n' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp), percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                     new_ref_set_id, denovo_otu_picking_method,
                                     params, logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                        reference_otu_picking_method,
                                        step2_repset_fasta_fp, parallel,
                                        params, logger)

        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

        index_links.append((
            'Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
            merged_otu_map_fp, _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>' operator
        # otherwise passing the --force flag on the script interface would
        # append the newly created maps to the map that was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the step 3 failures file to the top-level directory
        commands.append([
            ('Move final failures file to top-level directory',
             'mv %s %s/final_failures.txt' % (failures_fp, output_dir))
        ])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifiers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp, _index_headers['otu_maps']))

    logger.write(
        '# Filter singletons from the otu map using API\n'
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map'
        '(\'%s\', \'%s\', %d)"\n\n' %
        (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference between what
    # we need the reference set to be and what we need the repseqs to be:
    # the reference set needs to be a superset of the reference set that was
    # input to this run, while the repset needs to contain only the sequences
    # that were observed in this data set. we also want reps for the step1
    # reference otus to be reads from this run, so we don't hit issues
    # building a tree using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(('OTU representative sequences', final_repset_fp,
                        _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append((
        'New reference sequences (i.e., OTU representative sequences plus input '
        'reference sequences)', new_refseqs_fp, _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' %
                 final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write(
        '# Copy the full input refseqs file to the new refseq file\n' +
        'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 2 and step 4 to the final representative set and the new reference'
            + ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 4 to the final representative set and the new reference' +
            ' set (%s and %s respectively)\n\n' %
            (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp, _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments'
            % min_otu_size, pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size, otu_table_w_tax_fp,
            _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append((
            'OTU table excluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST' % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(('OTU taxonomic assignments', taxonomy_fp,
                                _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(('OTU phylogenetic tree', rep_set_tree_fp,
                            _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
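
# Hypothetical invocation of the workflow above; every path and value here
# is a placeholder for illustration, not a default from the source:
#
# pick_subsampled_open_reference_otus(
#     input_fp='seqs.fna',
#     refseqs_fp='reference_otus.fasta',
#     output_dir='open_ref_otus/',
#     percent_subsample=0.001,
#     new_ref_set_id='NewRefOTUs',
#     command_handler=call_commands_serially,
#     params=parse_qiime_parameters([]),
#     qiime_config=load_qiime_config())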
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):

    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the alignment (align_seqs) parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
        (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
        (filtered_aln_fp, tree_fp, params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return failures_fp
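
# The try/except KeyError lookups above (and throughout these workflows)
# fetch per-script options that may be absent from the parameters dict.
# A small helper, sketched here for illustration only and wrapping the
# module's get_params_str(), would make that intent explicit:
def optional_params_str(params, section):
    """Return CLI flags for a params section, or '' if the section is absent."""
    try:
        return get_params_str(params[section])
    except KeyError:
        return ''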
def run_ampliconnoise(
    mapping_fp,
    output_dir,
    command_handler,
    params,
    qiime_config,
    logger=None,
    status_update_callback=print_to_stdout,
    chimera_alpha=-3.8228,
    chimera_beta=0.6200,
    sff_txt_fp=None,
    numnodes=2,
    suppress_perseus=True,
    output_filepath=None,
    platform="flx",
    seqnoise_resolution=None,
    truncate_len=None,
):
    """ Run the ampliconnoise pipeline

        The steps performed by this function are:
1. Split input sff.txt file into one file per sample

2. Run scripts required for PyroNoise

3. Run scripts required for SeqNoise

4. Run scripts requred for Perseus (chimera removal)

5. Merge output files into one file similar to the output of split_libraries.py

    output_filepath should be absolute
    seqnoise_resolution should be string
    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting
    """
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, "U"))
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == "flx":
            seqnoise_resolution = "30.0"
        elif platform == "titanium":
            seqnoise_resolution = "25.0"
        else:
            raise RuntimeError("seqnoise_resolution not set, and no default "
                               "for platform " + platform)

    if truncate_len is None:
        if platform == "flx":
            truncate_len = "220"
        elif platform == "titanium":
            truncate_len = "400"
        else:
            raise RuntimeError("truncate_len not set, and no default "
                               "for platform " + platform)

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index("SampleID")])
        bc_seqs.append(map_data[i][headers.index("BarcodeSequence")])
        primer = map_data[i][headers.index("LinkerPrimerSequence")]
        for char, bases in DNASequence.iupac_degeneracies().iteritems():
            primer = primer.replace(char, "[" + "".join(bases) + "]")
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError("Error: only one primer per mapping file supported.")
    one_primer = primer_seqs[0]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir, "map.csv"), "w")
    for i in range(len(sample_names)):
        fh.write(sample_names[i] + "," + bc_seqs[i] + "\n")
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = "_" + truncate_len
    if suppress_perseus:
        fasta_result_names = [sample_name + post_pyro_tail + "_seqnoise_cd.fa" for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + "_Good.fa" for sample_name in sample_names]

    cmd = "cd " + output_dir  # see also os.chdir above
    commands.append([("change to output dir", cmd)])

    cmd = "echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt"
    commands.append([("confirm pyro lookup filepath environment variable", cmd)])

    cmd = (
        "SplitKeys.pl "
        + one_primer
        + " map.csv < "
        + os.path.join(called_dir, sff_txt_fp)
        + " > splitkeys_log.txt 2> unassigned.fna"
    )
    commands.append([("split sff.txt via barcodes (keys)", cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the flow-cleaning command for this sample
        if platform == "flx":
            cmd = "Clean360.pl " + one_primer + " " + sample_name + " < " + sample_name + ".raw"
            commands.append([("clean flows " + sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        elif platform == "titanium":
            cmd = "CleanMinMax.pl " + one_primer + " " + sample_name + " < " + sample_name + ".raw"
            commands.append([("clean flows " + sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = (
            "mpirun -np "
            + str(numnodes)
            + " PyroDist -in "
            + sample_name
            + ".dat -out "
            + sample_name
            + " > "
            + sample_name
            + ".pdout"
        )
        commands.append([("pyrodist " + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name + " > " + sample_name + ".fcout"
        commands.append([("fcluster pyrodist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = (
            "mpirun -np "
            + str(numnodes)
            + " PyroNoise -din "
            + sample_name
            + ".dat -out "
            + sample_name
            + "_pyronoise "
            + "-lin "
            + sample_name
            + ".list -s 60.0 -c 0.01 > "
            + sample_name
            + "_pyronoise.pnout"
        )
        commands.append([("pyronoise " + sample_name, cmd)])

        cmd = (
            "Parse.pl "
            + bc_seqs[i]
            + one_primer
            + " "
            + truncate_len
            + " < "
            + sample_name
            + "_pyronoise_cd.fa"
            + " > "
            + sample_name
            + "_"
            + truncate_len
            + ".fa"
        )
        commands.append([("truncate " + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = (
            "mpirun -np "
            + str(numnodes)
            + " SeqDist -in "
            + sample_name
            + post_pyro_tail
            + ".fa > "
            + sample_name
            + post_pyro_tail
            + ".seqdist"
        )
        commands.append([("seqdist " + sample_name, cmd)])

        cmd = (
            "FCluster -in "
            + sample_name
            + post_pyro_tail
            + ".seqdist -out "
            + sample_name
            + post_pyro_tail
            + "fcl > "
            + sample_name
            + post_pyro_tail
            + ".fcout"
        )
        commands.append([("fcluster seqdist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
        # PC.354_pyronoise_cd.snout

        cmd = (
            "mpirun -np "
            + str(numnodes)
            + " SeqNoise -in "
            + sample_name
            + post_pyro_tail
            + ".fa -din "
            + sample_name
            + post_pyro_tail
            + ".seqdist -out "
            + sample_name
            + post_pyro_tail
            + "_seqnoise -lin "
            + sample_name
            + post_pyro_tail
            + "fcl.list -min "
            + sample_name
            + "_pyronoise"
            + ".mapping -s "
            + seqnoise_resolution
            + " -c 0.08 > "
            + sample_name
            + post_pyro_tail
            + ".snout"
        )
        commands.append([("seqnoise " + sample_name, cmd)])

        if not suppress_perseus:

            cmd = "Perseus -sin " + sample_name + post_pyro_tail + "_seqnoise_cd.fa > " + sample_name + ".per"
            commands.append([("Perseus " + sample_name, cmd)])

            cmd = (
                "Class.pl "
                + sample_name
                + ".per "
                + str(chimera_alpha)
                + " "
                + str(chimera_beta)
                + " > "
                + sample_name
                + ".class"
            )
            commands.append([("Class.pl " + sample_name, cmd)])

            cmd = (
                "FilterGoodClass.pl "
                + sample_name
                + post_pyro_tail
                + "_seqnoise_cd.fa "
                + sample_name
                + ".class 0.5 > "
                + sample_name
                + "_Chi.fa 2> "
                + sample_name
                + "_Good.fa"
            )
            commands.append([("FilterGoodClass " + sample_name, cmd)])

        cmd = "unweight_fasta.py -i %s -o %s -l %s" % (fasta_result_names[i], sample_name + "_unw.fna", sample_name)
        commands.append([("unweight fasta " + sample_name, cmd)])

    cmd = (
        "cat " + " ".join([sample_name + "_unw.fna" for sample_name in sample_names]) + " > " + output_filepath
    )  # this should be an abs filepath
    commands.append([("cat into one fasta file", cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
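
# Sketch of the IUPAC-degeneracy expansion run_ampliconnoise() applies to
# the primer before passing it to SplitKeys.pl; the degeneracy table here
# is a hard-coded subset for illustration only:
IUPAC_DEGENERACIES_SUBSET = {'R': 'AG', 'Y': 'CT', 'W': 'AT', 'N': 'ACGT'}


def expand_degenerate_primer(primer):
    """'CATGNR' -> 'CATG[ACGT][AG]', usable as a regex character class."""
    for char, bases in IUPAC_DEGENERACIES_SUBSET.items():
        primer = primer.replace(char, '[' + bases + ']')
    return primer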
def pick_nested_reference_otus(input_fasta_fp,
                               input_tree_fp,
                               output_dir,
                               run_id,
                               similarity_thresholds,
                               command_handler,
                               status_update_callback=print_to_stdout):


    # Prepare some variables for the later steps
    create_dir(output_dir)
    otu_dir = join(output_dir,'otus')
    create_dir(otu_dir)
    rep_set_dir = join(output_dir,'rep_set')
    create_dir(rep_set_dir)
    # currently not doing anything with taxonomies and trees
    # tax_dir = join(output_dir,'taxonomies')
    # create_dir(tax_dir)
    if input_tree_fp:
        tree_dir = join(output_dir,'trees')
        create_dir(tree_dir)
    commands = []
    files_to_remove = []
    
    logger = WorkflowLogger(generate_log_fp(output_dir))
    similarity_thresholds.sort(reverse=True)
    
    current_inseqs_fp = input_fasta_fp
    current_tree_fp = input_tree_fp
    previous_otu_map = None
    for similarity_threshold in similarity_thresholds:
        current_inseqs_basename = splitext(split(current_inseqs_fp)[1])[0]
        
        # pick otus command
        otu_fp = '%s/%d_otu_map.txt' % (otu_dir,similarity_threshold)
        clusters_fp = '%s/%d_clusters.uc' % (otu_dir,similarity_threshold)
        temp_otu_fp = '%s/%s_otus.txt' % (otu_dir, current_inseqs_basename)
        temp_log_fp = '%s/%s_otus.log' % (otu_dir, current_inseqs_basename)
        temp_clusters_fp = '%s/%s_clusters.uc' % (otu_dir, current_inseqs_basename)
        pick_otus_cmd = \
         'pick_otus.py -m uclust -DBz -i %s -s %1.2f -o %s' % (
           current_inseqs_fp,
           similarity_threshold / 100.0,
           otu_dir)
        
        commands.append([('Pick OTUs (%d)' % similarity_threshold,
                          pick_otus_cmd)])
        commands.append([('Rename OTU file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_otu_fp,otu_fp))])
        commands.append([('Rename uc file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_clusters_fp,clusters_fp))])
        files_to_remove.append(temp_log_fp)
        
        # rep set picking
        temp_rep_set_fp = get_tmp_filename(prefix='NestedReference',
                                           suffix='.fasta')
        pick_rep_set_cmd = \
         'pick_rep_set.py -m first -i %s -o %s -f %s' % (
          otu_fp, 
          temp_rep_set_fp,
          current_inseqs_fp)
        commands.append([('Pick Rep Set (%d)' % similarity_threshold,
                           pick_rep_set_cmd)])
        command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        commands = []
        
        # rename representative sequences
        rep_set_fp = '%s/%d_otus_%s.fasta' % (
          rep_set_dir,
          similarity_threshold,
          run_id)
        logger.write('Renaming OTU representative sequences so OTU ids are reference sequence ids.')
        rep_set_f = open(rep_set_fp,'w')
        for e in rename_rep_seqs(open(temp_rep_set_fp,'U')):
            rep_set_f.write('>%s\n%s\n' % e)
        rep_set_f.close()
        files_to_remove.append(temp_rep_set_fp)
        
        # filter the tree, if provided
        if current_tree_fp is not None:
            tree_fp = '%s/%d_otus_%s.tre' % (
              tree_dir,
              similarity_threshold,
              run_id)
            tree_cmd = 'filter_tree.py -i %s -f %s -o %s' %\
               (current_tree_fp,rep_set_fp,tree_fp)
            commands.append([('Filter tree (%d)' % similarity_threshold,tree_cmd)])
            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
            # prep for the next iteration
            current_tree_fp = tree_fp
        
        
        # prep for the next iteration
        remove_files(files_to_remove)
        commands = []
        files_to_remove = []
        current_inseqs_fp = rep_set_fp
        
    logger.close()
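
# Schematic of the nested iteration above: thresholds are visited in
# descending order, and each iteration's renamed rep set becomes the input
# sequences (and its filtered tree the input tree) for the next. For
# example, with similarity_thresholds=[99, 97, 94]:
#
#   input_fasta_fp --99--> rep_set/99_otus_<run_id>.fasta
#                  --97--> rep_set/97_otus_<run_id>.fasta
#                  --94--> rep_set/94_otus_<run_id>.fasta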
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError("Category '%s' is not a column header "
                                 "in your mapping file. "
                                 "Categories are case and white space sensitive. "
                                 "Valid choices are: (%s)"
                                 % (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError("Category '%s' contains only one value. "
                                 "Categories analyzed here require at least "
                                 "two values." % c)
            
    else:
        categories = []
    
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp != None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params['print_biom_table_summary'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    print_biom_table_summary_cmd = \
     "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % \
     (biom_fp, biom_table_stats_output_fp,params_str)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    commands.append([('Generate BIOM table summary',
                      print_biom_table_summary_cmd)])
    
    # filter samples with fewer observations than the requested sampling_depth. 
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered 
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
     (biom_fp,filtered_biom_fp,sampling_depth)
    commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                      filter_samples_cmd)])
    biom_fp = filtered_biom_fp
    
    # run initial commands and reset the command list
    command_handler(commands, 
                    status_update_callback, 
                    logger,
                    close_logger_on_success=False)
    commands = []
    
    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
         otu_table_fp=biom_fp, 
         mapping_fp=mapping_fp,
         output_dir=bdiv_even_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         sampling_depth=sampling_depth,
         # force suppression of distance histograms - boxplots work better
         # in this context, and are created below.
         histogram_categories=[],
         tree_fp=tree_fp,
         parallel=parallel,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)
    
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
                try:
                    params_str = get_params_str(params['make_distance_boxplots'])
                except KeyError:
                    params_str = ''
                boxplots_cmd = \
                 'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                 (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                commands.append([('Boxplots (%s)' % category,
                                  boxplots_cmd)])
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    '%s/%s_Distances.pdf' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                    '%s/%s_Stats.txt' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] % sampling_depth))
            
            index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                                '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                                '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                                '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                                '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                                '%s/%s_pc.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
    
    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
        run_alpha_rarefaction(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=arare_full_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         tree_fp=tree_fp,
         num_steps=arare_num_steps,
         parallel=parallel,
         logger=logger,
         min_rare_depth=arare_min_rare_depth,
         max_rare_depth=sampling_depth,
         suppress_md5=True,
         status_update_callback=status_update_callback)
    
        index_links.append(('Alpha rarefaction plots',
                            '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                              % arare_full_output_dir,
                            _index_headers['alpha_diversity']))
                        
        collated_alpha_diversity_fps = \
         glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                 (arare_full_output_dir,category,alpha_metric)
                compare_alpha_cmd = \
                 'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                 (collated_alpha_diversity_fp, mapping_fp, category, 
                  alpha_comparison_output_fp, params_str)
                commands.append([('Compare alpha diversity (%s, %s)' %\
                                   (category,alpha_metric),
                                  compare_alpha_cmd)])
                index_links.append(
                 ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric),
                  alpha_comparison_output_fp,
                  _index_headers['alpha_diversity']))
    
    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=None, 
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)
    

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category)
            run_summarize_taxa_through_plots(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=taxa_plots_output_dir,
             mapping_cat=category, 
             sort=True,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
    
    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_significance_fp = \
             '%s/category_significance_%s.txt' % (output_dir, category)
            try:
                params_str = get_params_str(params['otu_category_significance'])
            except KeyError:
                params_str = ''
            # Build the OTU category significance command
            category_significance_cmd = \
             'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
             (biom_fp, mapping_fp, category,
              category_significance_fp, params_str)
            commands.append([('OTU category significance (%s)' % category,
                              category_significance_cmd)])

            index_links.append(('Category significance (%s)' % category,
                        category_significance_fp,
                        _index_headers['otu_category_sig']))
    
    commands.append([('Compress the filtered BIOM table','gzip %s' % filtered_biom_fp)])
    index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
                        '%s.gz' % filtered_biom_fp,
                        _index_headers['run_summary']))
    
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
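
# A minimal sketch (not part of this codebase) of the command-handler
# contract inferred from the calls above: `commands` is a list of batches,
# each batch a list of (description, shell_command) tuples, and the handler
# owns the logger's lifecycle. The real handlers here are
# call_commands_serially and print_commands; this toy version only logs the
# commands, assuming status_update_callback accepts a status string as
# print_to_stdout suggests.
def echo_commands(commands, status_update_callback, logger,
                  close_logger_on_success=True):
    for batch in commands:
        for description, cmd in batch:
            status_update_callback(description)
            logger.write('# %s\n%s\n\n' % (description, cmd))
    if close_logger_on_success:
        logger.close()
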
def assign_tax(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
                   
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp' or
                     assignment_method == 'blast' or
                     assignment_method == 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp, assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy',assign_taxonomy_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp
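
# Hypothetical usage sketch for assign_tax; the file paths and params dict
# are invented for illustration, and the import locations are assumptions
# based on this codebase's conventions (they may differ across QIIME
# versions). params uses the nested-dict shape the get_params_str calls
# above expect.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

taxonomy_fp = assign_tax(
    repset_fasta_fp='rep_set.fasta',        # hypothetical input
    output_dir='taxonomy_out',              # hypothetical output dir
    command_handler=call_commands_serially,
    params={'assign_taxonomy': {'assignment_method': 'rdp'}},
    qiime_config=load_qiime_config())
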
Example #9
def run_pick_de_novo_otus(input_fp, 
                               output_dir, 
                               command_handler,
                               params, 
                               qiime_config,
                               parallel=False,
                               logger=None,
                               suppress_md5=False,
                               status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Pick OTUs;
          2) Pick a representative set;
          3) Align the representative set; 
          4) Assign taxonomy;
          5) Filter the alignment prior to tree building - remove positions
             which are all gaps, and specified as 0 in the lanemask
          6) Build a phylogenetic tree;
          7) Build an OTU table.
    
    """
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    cluster_failures = False
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp])
    
    # Prep the OTU picking command
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust'
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename)
    if parallel and (otu_picking_method == 'blast' or 
                     otu_picking_method == 'uclust_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the OTU picker parameters
        try:
            d = params['pick_otus'].copy()
        except KeyError:
            # Guard against params lacking a 'pick_otus' section so d is
            # always defined for the steps below.
            d = {}
        # Want to find a cleaner strategy for this: the parallel script
        # is method-specific, so doesn't take a --otu_picking_method
        # option. This works for now though.
        d.pop('otu_picking_method', None)
        
        if otu_picking_method == 'uclust_ref':
            if 'suppress_new_clusters' in d:
                del d['suppress_new_clusters']
                cluster_failures = False
            else:
                cluster_failures = True
                failure_otu_picking_method = 'uclust'
        
        params_str += ' %s' % get_params_str(d)
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -T %s' % (python_exe_fp, 
                                                        script_dir, 
                                                        otu_picking_script,
                                                        input_fp,
                                                        pick_otu_dir,
                                                        params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, input_fp, pick_otu_dir, params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])
    
    if cluster_failures:
        reference_otu_fp = otu_fp
        clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir
        
        try:
            d = params['pick_otus'].copy()
        except KeyError:
            # Ensure d is defined even when no 'pick_otus' section exists.
            d = {}
        d.pop('otu_picking_method', None)

        if 'uclust_otu_id_prefix' not in d:
            d['uclust_otu_id_prefix'] = 'DeNovoOTU'
        params_str = ' %s' % get_params_str(d)

        failures_list_fp = '%s/%s_failures.txt' % \
         (pick_otu_dir,input_basename)
        failures_fasta_fp = '%s/%s_failures.fasta' % \
         (pick_otu_dir,input_basename)
        
        filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,failures_list_fp,failures_fasta_fp)
        
        commands.append([('Generate failures fasta file',
                          filter_fasta_cmd)])
        
        # Prep the OTU picking command for the failure sequences
        failure_otu_fp = '%s/%s_failures_otus.txt' % (clustered_failures_dir,input_basename)
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -m %s %s' %\
         (python_exe_fp, script_dir, failures_fasta_fp, clustered_failures_dir, 
          failure_otu_picking_method, params_str)

        commands.append([('Pick de novo OTUs for new clusters', pick_otus_cmd)])
        
        merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
         (reference_otu_fp,failure_otu_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        otu_fp = merged_otu_map_fp

    # Prep the representative set picking command
    rep_set_dir = '%s/rep_set/' % output_dir
    create_dir(rep_set_dir)
    rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir,input_basename)
    rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir,input_basename)
    
    try:
        params_str = get_params_str(params['pick_rep_set'])
    except KeyError:
        params_str = ''
    # Build the representative set picking command
    pick_rep_set_cmd = '%s %s/pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_fp, input_fp, rep_set_log_fp,\
      rep_set_fp, params_str)
    commands.append([('Pick representative set', pick_rep_set_cmd)])
    
    # Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp' or
                     assignment_method == 'blast' or
                     assignment_method == 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the taxonomy assignment parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         '%s %s/parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (python_exe_fp, script_dir, assignment_method, rep_set_fp,\
          assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = '%s %s/assign_taxonomy.py -o %s -i %s %s' %\
         (python_exe_fp, script_dir, assign_taxonomy_dir,\
          rep_set_fp, params_str)
    
    commands.append([('Assign taxonomy',assign_taxonomy_cmd)])
    
    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_fp, taxonomy_fp, otu_table_fp, params_str)
    
    commands.append([('Make OTU table', make_otu_table_cmd)])
    
    if cluster_failures:
        reference_otu_table_fp = '%s/reference_only_otu_table.biom' % output_dir
        # Build the OTU table building command
        make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
         (python_exe_fp, script_dir, reference_otu_fp, taxonomy_fp, 
          reference_otu_table_fp, params_str)
    
        commands.append([('Make reference-only OTU table', make_otu_table_cmd)])
    
    # Prep the pynast alignment command
    try:
        alignment_method = params['align_seqs']['alignment_method']
    except KeyError:
        alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir,alignment_method)
    aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir,input_basename)
    if parallel and alignment_method == 'pynast':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the alignment parameters        
        # Want to find a cleaner strategy for this: the parallel script
        # is method-specific, so doesn't take a --alignment_method
        # option. This works for now though.
        try:
            d = params['align_seqs'].copy()
        except KeyError:
            d = {}
        try:
            del d['alignment_method']
        except KeyError:
            pass
        params_str += ' %s' % get_params_str(d)
        
        # Build the parallel pynast alignment command
        align_seqs_cmd = '%s %s/parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = '%s %s/align_seqs.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])
    
    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = '%s %s/filter_alignment.py -o %s -i %s %s' %\
     (python_exe_fp, script_dir, pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])
    
    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = '%s %s/make_phylogeny.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, filtered_aln_fp, tree_fp,\
     params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    
    return abspath(tree_fp), abspath(otu_table_fp)
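
# Hypothetical usage sketch for run_pick_de_novo_otus; the input path and
# params are invented, and the imports are assumptions based on this
# codebase's conventions. The function returns the tree and OTU-table paths
# it builds.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

tree_fp, otu_table_fp = run_pick_de_novo_otus(
    input_fp='seqs.fna',                    # hypothetical demultiplexed FASTA
    output_dir='de_novo_otus',
    command_handler=call_commands_serially,
    params={'pick_otus': {'otu_picking_method': 'uclust'}},
    qiime_config=load_qiime_config(),
    parallel=False)
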
Example #10
def create_personal_results(output_dir,
                            mapping_fp,
                            coord_fp,
                            collated_dir,
                            otu_table_fp,
                            prefs_fp,
                            personal_id_column,
                            personal_ids=None,
                            column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000,
                            alpha=0.05,
                            rep_set_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, 'U'))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
            "column header." % category_to_split)

    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header)-1, site_id_category)
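    # e.g. if personal_id_column were 'PersonalID' (a hypothetical value),
    # the default category_to_split would make this column
    # 'PersonalID&&BodySite'.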

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None: 
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    otu_table_title = splitext(basename(otu_table_fp))

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)

        personal_mapping_f = open(personal_mapping_file_fp, 'w')
        personal_mapping_f.write(
                format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                    collated_dir, personal_mapping_file_fp,
                    category_to_split, column_title, rarefaction_depth,
                    adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                    create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(join(rarefaction_dir,
                                               'average_plots'))
            personal_raw_data_dirs.append(join(rarefaction_dir,
                                               'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest, 
                                         'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = ('make_3d_plots.py -m %s -p %s -i %s -o %s '
                   '--custom_axes=\'%s\' --add_vectors=\'%s,%s\'' % (
                   personal_mapping_file_fp, prefs_fp, coord_fp,
                   pcoa_time_series_dir, time_series_category,
                   site_id_category, time_series_category))
            commands.append([(cmd_title, cmd)])
            
            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s' % (personal_mapping_file_fp,
                                                                prefs_fp, coord_fp,
                                                                pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ''
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                    otu_table_fp, personal_mapping_file_fp, person_of_interest,
                    column_title, column_title_values, category_to_split,
                    cat_values, time_series_category, area_plots_dir,
                    command_handler, status_update_callback, logger)

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(
                    output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self, otherwise
                    # otu_category_significance.py crashes with a division by
                    # zero error.
                    body_site_otu_table_f = open(body_site_otu_table_fp, 'U')
                    personal_mapping_file_f = open(personal_mapping_file_fp,
                                                   'U')
                    personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f, personal_mapping_file_f,
                            personal_id_column, person_of_interest)
                    body_site_otu_table_f.close()
                    personal_mapping_file_f.close()

                    if personal_sample_count < 1:
                        continue
                    else:
                        valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir,
                                             'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title,
                                      otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])

                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError("None of the body sites for personal ID '%s' "
                                 "could be processed because there were no "
                                 "matching samples in the rarefied OTU table."
                                 % person_of_interest)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                    create_otu_category_significance_html_tables(
                            otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, 
                            individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                    for html_filename in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                    create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                taxa_summary_plots_html=taxa_summary_plots_html,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files,
                                    personal_raw_data_dirs)


    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
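
# Hypothetical usage sketch for create_personal_results; every path below is
# invented for illustration. collated_dir would be the alpha_div_collated
# output of an alpha rarefaction run, and coord_fp a principal-coordinates
# file from a beta diversity run.
output_directories = create_personal_results(
    output_dir='personal_results',
    mapping_fp='mapping.txt',
    coord_fp='unweighted_unifrac_pc.txt',
    collated_dir='alpha_div_collated',
    otu_table_fp='otu_table.biom',
    prefs_fp='prefs.txt',
    personal_id_column='PersonalID')        # hypothetical column name
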
Example #11
def run_beta_diversity_through_plots(otu_table_fp, 
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     histogram_categories=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_3d_plots=False,
                                     suppress_2d_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
         1) Compute a beta diversity distance matrix;
         2) Perform a principal coordinates analysis on the result of
          Step 1;
         3) Generate a 3D prefs file for optimized coloring of continuous
          variables;
         4) Generate a 3D plot for all mapping fields with colors
          optimized for continuous data;
         5) Generate a 3D plot for all mapping fields with colors
          optimized for discrete data.
    
    """  
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])
    
    mapping_data, mapping_header, mapping_comments =\
     parse_mapping_file(open(mapping_fp,'U'))
    if histogram_categories:
        invalid_categories = set(histogram_categories) - set(mapping_header)
        if invalid_categories:
            raise ValueError(
             "Invalid histogram categories - these must exactly match "
             "mapping file column headers: %s" % ' '.join(invalid_categories))
    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number 
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
          get_interesting_mapping_fields(mapping_data, mapping_header) or\
          mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)
    
    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
         (output_dir, otu_table_basename, 
          sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
         '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
         (python_exe_fp, script_dir, otu_table_fp,
          even_sampled_otu_table_fp, sampling_depth)
        commands.append([
         ('Sample OTU table at %d seqs/sample' % sampling_depth,
          single_rarefaction_cmd)])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac','unweighted_unifrac']
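    # For reference, a QIIME parameters file would supply these metrics with
    # a line along these lines (format assumed from parse_qiime_parameters):
    #
    #   beta_diversity:metrics weighted_unifrac,unweighted_unifrac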
    
    # Prep the 3d prefs file generator command
    prefs_fp = '%s/prefs.txt' % output_dir
    try:
        make_prefs_file_params = params['make_prefs_file']
    except KeyError:
        # Avoid a KeyError below when no 'make_prefs_file' section exists.
        make_prefs_file_params = {}
    params_str = get_params_str(make_prefs_file_params)
    if 'mapping_headers_to_use' not in make_prefs_file_params:
        params_str = '%s --mapping_headers_to_use %s' \
         % (params_str, mapping_fields)
    # Build the 3d prefs file generator command
    prefs_cmd = \
     '%s %s/make_prefs_file.py -m %s -o %s %s' %\
     (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
    commands.append([('Build prefs file', prefs_cmd)])
    
    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:
        
        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass
        
        params_str = get_params_str(bdiv_params_copy)
            
        if tree_fp:
            params_str = '%s -t %s ' % (params_str,tree_fp)
            
        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += ' %s' % get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
             (python_exe_fp, script_dir, otu_table_fp, 
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        
        
        orig_beta_div_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
         (output_dir, beta_diversity_metric)
        commands.append([('Rename distance matrix (%s)' % beta_diversity_metric,
                         'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
        dm_fps.append((beta_diversity_metric, beta_div_fp))
        
        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])
        
        # Generate 3d plots
        if not suppress_3d_plots:
            # Prep the continuous-coloring 3d plots command
            continuous_3d_dir = '%s/%s_3d_continuous/' %\
             (output_dir, beta_diversity_metric)
            create_dir(continuous_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            continuous_3d_command = \
             '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_3d_dir,
               mapping_fp, params_str)
    
            # Prep the discrete-coloring 3d plots command
            discrete_3d_dir = '%s/%s_3d_discrete/' %\
             (output_dir, beta_diversity_metric)
            create_dir(discrete_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 3d plots command
            discrete_3d_command = \
             '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_3d_dir,
               mapping_fp, params_str)
       
            commands.append([\
              ('Make 3D plots (continuous coloring, %s)' %\
                beta_diversity_metric,continuous_3d_command),\
              ('Make 3D plots (discrete coloring, %s)' %\
                beta_diversity_metric,discrete_3d_command,)])
    
        # Generate 2d plots
        if not suppress_2d_plots:
            # Prep the continuous-coloring 2d plots command
            continuous_2d_dir = '%s/%s_2d_continuous/' %\
             (output_dir, beta_diversity_metric)
            create_dir(continuous_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 2d plots command
            continuous_2d_command = \
             '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_2d_dir,
               mapping_fp, params_str)
               
            # Prep the discrete-coloring 2d plots command
            discrete_2d_dir = '%s/%s_2d_discrete/' %\
             (output_dir, beta_diversity_metric)
            create_dir(discrete_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 2d plots command
            discrete_2d_command = \
             '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_2d_dir,
               mapping_fp, params_str)
       
            commands.append([\
              ('Make 2D plots (continuous coloring, %s)' %\
                beta_diversity_metric,continuous_2d_command),\
              ('Make 2D plots (discrete coloring, %s)' %\
                beta_diversity_metric,discrete_2d_command,)])
                
        if histogram_categories:
            # Prep the distance histograms command
            histograms_dir = '%s/%s_histograms/' %\
             (output_dir, beta_diversity_metric)
            create_dir(histograms_dir)
            try:
                params_str = get_params_str(params['make_distance_histograms'])
            except KeyError:
                params_str = ''
            # Build the make_distance_histograms command
            distance_histograms_command = \
             '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
              (python_exe_fp, script_dir, beta_div_fp, 
               histograms_dir, mapping_fp, 
               ','.join(histogram_categories), params_str)
       
            commands.append([\
              ('Make Distance Histograms (%s)' %\
                beta_diversity_metric,distance_histograms_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    
    return dm_fps
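
# Hypothetical usage sketch for run_beta_diversity_through_plots; the paths
# are invented and the imports are assumptions based on this codebase's
# conventions. The return value pairs each beta diversity metric with its
# renamed distance-matrix filepath.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

dm_fps = run_beta_diversity_through_plots(
    otu_table_fp='otu_table.biom',
    mapping_fp='mapping.txt',
    output_dir='bdiv_plots',
    command_handler=call_commands_serially,
    params={'beta_diversity': {'metrics': 'unweighted_unifrac'}},
    qiime_config=load_qiime_config(),
    tree_fp='rep_set.tre')                  # needed for UniFrac metrics
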
Example #12
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        assign_taxonomy=False,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick OTUs;
          2) If assign_taxonomy is True, choose a representative sequence
             for OTUs and assign taxonomy using a classifier.
          3) Build an OTU table with optional predefined taxonomy
             (if assign_taxonomy=False) or taxonomic assignments from step 2
             (if assign_taxonomy=True).

    """

    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref',
                                     'usearch_ref', 'sortmerna']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref' or
                     otu_picking_method == 'sortmerna'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is "
            "closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Assign taxonomy using a taxonomy classifier, if requested by the user.
    # (Alternatively, predefined taxonomic assignments will be used, if provided.)
    if assign_taxonomy:
        # Prep the representative set picking command
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)

        try:
            params_str = get_params_str(params['pick_rep_set'])
        except KeyError:
            params_str = ''
        # Build the representative set picking command
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
            (otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str)
        commands.append([('Pick representative set', pick_rep_set_cmd)])

        # Prep the taxonomy assignment command
        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
            (output_dir, assignment_method)
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
            (assign_taxonomy_dir, input_basename)
        if parallel and (assignment_method == 'rdp' or
                         assignment_method == 'blast' or
                         assignment_method == 'uclust'):
            # Grab the parallel-specific parameters
            try:
                params_str = get_params_str(params['parallel'])
            except KeyError:
                params_str = ''

            # Grab the taxonomy assignment parameters
            try:
                # Want to find a cleaner strategy for this: the parallel script
                # is method-specific, so doesn't take a --assignment_method
                # option. This works for now though.
                d = params['assign_taxonomy'].copy()
                if 'assignment_method' in d:
                    del d['assignment_method']
                params_str += ' %s' % get_params_str(d)
            except KeyError:
                pass

            # Build the parallel taxonomy assignment command
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
                (assignment_method, rep_set_fp, assign_taxonomy_dir, params_str)
        else:
            try:
                params_str = get_params_str(params['assign_taxonomy'])
            except KeyError:
                params_str = ''
            # Build the taxonomy assignment command
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
                (assign_taxonomy_dir, rep_set_fp, params_str)

        commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # If assign_taxonomy is True, this will be the path to the taxonomic
    # assignment results. If assign_taxonomy is False this will be either
    # the precomputed taxonomic assignments that the user passed in,
    # or None.
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\
        (otu_fp, taxonomy_str, otu_table_fp, params_str)

    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
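
# Hypothetical usage sketch for run_pick_closed_reference_otus; the
# reference sequences, taxonomy map, and input paths are invented, and the
# imports are assumptions based on this codebase's conventions.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

run_pick_closed_reference_otus(
    input_fp='seqs.fna',
    refseqs_fp='97_otus.fasta',             # hypothetical reference set
    output_dir='closed_ref_otus',
    taxonomy_fp='97_otu_taxonomy.txt',      # predefined taxonomy map
    command_handler=call_commands_serially,
    params={},
    qiime_config=load_qiime_config())
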
Example #13
qiime_config = load_qiime_config()

script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("Run a subset of the interface tests in verbose mode","Run interface tests for the count_seqs.py and make_otu_table.py scripts. This illustrates how to run from the qiime_test_dir directory.","%prog -t count_seqs,make_otu_table -v"))
script_info['script_usage'].append(("Run all of the interface tests","Run all script interface tests.  This illustrates how to run from the qiime_test_dir directory.","%prog"))
script_info['output_description']= ""
script_info['required_options'] = []

log_fp_prefix = 'script_test_log'
log_fp_suffix = 'txt'
default_log_fp = generate_log_fp(get_qiime_temp_dir(),
                    basefile_name=log_fp_prefix,
                    suffix=log_fp_suffix,
                    timestamp_pattern='%Y%m%d%H%M%S')
default_log_fp_help_str = join(get_qiime_temp_dir(),
                               '%s_TIMESTAMP.%s' % (log_fp_prefix,log_fp_suffix))

script_info['optional_options'] = [\
 make_option('-t','--tests',
             help='comma-separated list of the tests to run [default: all]'),
 make_option('-w','--working_dir',default=get_qiime_temp_dir(),
             help='directory where the tests should be run [default: %default]',
             type='existing_dirpath'),
 make_option('-l','--failure_log_fp',type="new_filepath",default=default_log_fp,
             help='log file to store record of failures [default: %s]' % default_log_fp_help_str)
]
script_info['version'] = __version__
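
# For reference, the default failure log path built above resolves to a
# timestamped file under the QIIME temp dir, along the lines of this
# hypothetical example:
#
#   <qiime_temp_dir>/script_test_log_20140101120000.txt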
Example #14
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_group_significance=False,
    status_update_callback=print_to_stdout,
):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )

    else:
        categories = []
    comma_separated_categories = ",".join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, "log_20*txt"))
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    for old_log_fp in old_log_fps:
        index_links.append(("Previous run log", old_log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params["biom-summarize-table"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = "biom summarize-table -i %s -o %s --suppress-md5 %s" % (
            biom_fp,
            biom_table_stats_output_fp,
            params_str,
        )
        commands.append([("Generate BIOM table summary", biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" % biom_table_stats_output_fp)
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
            biom_fp,
            filtered_biom_fp,
            sampling_depth,
        )
        commands.append(
            [
                (
                    "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                    filter_samples_cmd,
                )
            ]
        )
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" % filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # Rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" % (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([("Rarify the OTU table to %d sequences/sample" % sampling_depth, single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" % rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob("%s/*_dm.txt" % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarefy the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" % ", ".join(existing_dm_fps))
            # str.strip removes a *set* of characters and could eat metric-name
            # letters, so slice off the literal "_dm.txt" suffix instead.
            even_dm_fps = [(split(fp)[1][:-len("_dm.txt")], fp) for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params["make_distance_boxplots"])
        except KeyError:
            params_str = ""

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = "%s/%s_Distances.pdf" % (boxplots_output_dir, category)
                stats_output_fp = "%s/%s_Stats.txt" % (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                        dm_fp,
                        category,
                        boxplots_output_dir,
                        mapping_fp,
                        params_str,
                    )
                    commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n" % (category, plot_output_fp)
                    )
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        plot_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        stats_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )

            index_links.append(
                (
                    "PCoA plot (%s)" % bdiv_metric,
                    "%s/%s_emperor_pcoa_plot/index.html" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False,
            )
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" % rarefaction_plots_output_fp)

        index_links.append(("Alpha rarefaction plots", rarefaction_plots_output_fp, _index_headers["alpha_diversity"]))

        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = "%s/compare_%s" % (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                        collated_alpha_diversity_fp,
                        mapping_fp,
                        comma_separated_categories,
                        compare_alpha_output_dir,
                        params_str,
                    )
                    commands.append([("Compare alpha diversity (%s)" % alpha_metric, compare_alpha_cmd)])
                    for category in categories:
                        alpha_comparison_stat_fp = "%s/%s_stats.txt" % (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = "%s/%s_boxplots.pdf" % (compare_alpha_output_dir, category)
                        index_links.append(
                            (
                                "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                                alpha_comparison_stat_fp,
                                _index_headers["alpha_diversity"],
                            )
                        )
                        index_links.append(
                            (
                                "Alpha diversity boxplots (%s, %s)" % (category, alpha_metric),
                                alpha_comparison_boxplot_fp,
                                _index_headers["alpha_diversity"],
                            )
                        )
                else:
                    logger.write(
                        "Skipping compare_alpha_diversity.py"
                        " for %s as %s exists.\n\n" % (alpha_metric, compare_alpha_output_dir)
                    )
        else:
            logger.write("Skipping compare_alpha_diversity.py as" " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(join(taxa_plots_output_dir, "taxa_summary_plots", "*.html"))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ", ".join(existing_taxa_plot_html_fps)
            )

        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob("%s/taxa_summary_plots/*.html" % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback,
                )
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ", ".join(existing_taxa_plot_html_fps))
                )

            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )

    if not suppress_group_significance:
        params_str = get_params_str(params["group_significance"])
        # group significance tests, aka category significance
        for category in categories:
            group_signifance_fp = "%s/group_significance_%s.txt" % (output_dir, category)
            if not exists(group_signifance_fp):
                # Build the OTU category significance command
                group_significance_cmd = "group_significance.py -i %s -m %s -c %s -o %s %s" % (
                    rarefied_biom_fp,
                    mapping_fp,
                    category,
                    group_signifance_fp,
                    params_str,
                )
                commands.append([("Group significance (%s)" % category, group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" % (category, group_signifance_fp)
                )

            index_links.append(
                ("Category significance (%s)" % category, group_signifance_fp, _index_headers["group_significance"])
            )

    filtered_biom_gzip_fp = "%s.gz" % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" % filtered_biom_gzip_fp)
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            filtered_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )

    rarified_biom_gzip_fp = "%s.gz" % rarefied_biom_fp
    if not exists(rarified_biom_gzip_fp):
        commands.append([("Compress the rarified BIOM table", "gzip %s" % rarefied_biom_fp)])
    else:
        logger.write("Skipping compressing of rarified BIOM table as %s exists.\n\n" % rarified_biom_gzip_fp)
    index_links.append(
        (
            "Rarified BIOM table (sampling depth: %d)" % sampling_depth,
            rarified_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
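A minimal, hypothetical driver for the workflow above. The import locations follow QIIME 1 conventions and may differ by version; every filepath and the 'Treatment' category are placeholders, not values from this document.

from qiime.util import load_qiime_config
from qiime.parse import parse_qiime_parameters

qiime_config = load_qiime_config()
run_core_diversity_analyses(
    biom_fp='otu_table.biom',            # placeholder input table
    mapping_fp='map.txt',                # placeholder mapping file
    sampling_depth=1000,                 # samples below this count are filtered out
    output_dir='core_diversity_out',
    qiime_config=qiime_config,
    tree_fp='rep_set.tre',               # required for phylogenetic metrics
    params=parse_qiime_parameters([]),   # run with all defaults
    categories=['Treatment'])            # must be mapping file column headers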
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    demultiplexing_method = opts.demultiplexing_method
    parameter_fp = opts.parameter_fp
    read_indicator = opts.read_indicator
    barcode_indicator = opts.barcode_indicator
    mapping_indicator = opts.mapping_indicator
    mapping_extensions = opts.mapping_extensions.split(',')
    sampleid_indicator = opts.sampleid_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name enabled, "
            "--include_input_dir_path must be enabled.")

    if opts.parameter_fp:
        with open(opts.parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['split_libraries_fastq'])
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    all_fastq = []
    all_mapping = []

    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    for root, dirs, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_fastq += [abspath(join(root, fp))]

    if demultiplexing_method == 'mapping_barcode_files':
        for root, dirs, fps in walk(input_dir):
            for fp in fps:
                for mapping_extension in mapping_extensions:
                    if fp.endswith(mapping_extension):
                        all_mapping += [abspath(join(root, fp))]

        all_files = get_matching_files(all_fastq, all_mapping,
            read_indicator, barcode_indicator, mapping_indicator)
    else:
        all_files = all_fastq

    commands = create_commands_slf(all_files, demultiplexing_method, output_dir,
        params_str, leading_text, trailing_text, include_input_dir_path,
        remove_filepath_in_name, sampleid_indicator)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
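For context on the parameter handling in main() above: parse_qiime_parameters reads lines of the form script_name:parameter value into a dict of per-script dicts, and get_params_str renders one of those dicts as command-line flags. A small sketch with hypothetical parameter values (import locations follow QIIME 1 conventions and may differ by version):

from qiime.parse import parse_qiime_parameters
from qiime.workflow.util import get_params_str

param_lines = ['split_libraries_fastq:max_bad_run_length 3',
               'split_libraries_fastq:sequence_max_n 0']
params_dict = parse_qiime_parameters(param_lines)
# Renders the section as flags, e.g. '--max_bad_run_length 3 --sequence_max_n 0'
params_str = get_params_str(params_dict['split_libraries_fastq'])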
Example #16
def run_pick_closed_reference_otus(input_fp,
                                   refseqs_fp,
                                   output_dir,
                                   taxonomy_fp,
                                   command_handler,
                                   params,
                                   qiime_config,
                                   assign_taxonomy=False,
                                   parallel=False,
                                   logger=None,
                                   suppress_md5=False,
                                   status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick OTUs;
          2) If assign_taxonomy is True, choose representative sequences
             for OTUs and assign taxonomy using a classifier.
          3) Build an OTU table with optional predefined taxonomy
             (if assign_taxonomy=False) or taxonomic assignments from step 2
             (if assign_taxonomy=True).

    """

    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = [
        'blast', 'uclust_ref', 'usearch61_ref', 'usearch_ref', 'sortmerna'
    ]

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and otu_picking_method in ('blast', 'uclust_ref',
                                           'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take an --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is "
                     "closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Assign taxonomy using a taxonomy classifier, if requested by the user.
    # (Alternatively, predefined taxonomic assignments will be used, if provided.)
    if assign_taxonomy:
        # Prep the representative set picking command
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)

        try:
            params_str = get_params_str(params['pick_rep_set'])
        except KeyError:
            params_str = ''
        # Build the representative set picking command
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
            (otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str)
        commands.append([('Pick representative set', pick_rep_set_cmd)])

        # Prep the taxonomy assignment command
        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
            (output_dir, assignment_method)
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
            (assign_taxonomy_dir, input_basename)
        if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
            # Grab the parallel-specific parameters
            try:
                params_str = get_params_str(params['parallel'])
            except KeyError:
                params_str = ''

            # Grab the taxonomy assignment parameters
            try:
                # Want to find a cleaner strategy for this: the parallel script
                # is method-specific, so doesn't take a --assignment_method
                # option. This works for now though.
                d = params['assign_taxonomy'].copy()
                if 'assignment_method' in d:
                    del d['assignment_method']
                params_str += ' %s' % get_params_str(d)
            except KeyError:
                pass

            # Build the parallel taxonomy assignment command
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
                (assignment_method, rep_set_fp, assign_taxonomy_dir, params_str)
        else:
            try:
                params_str = get_params_str(params['assign_taxonomy'])
            except KeyError:
                params_str = ''
            # Build the taxonomy assignment command
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
                (assign_taxonomy_dir, rep_set_fp, params_str)

        commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # If assign_taxonomy is True, this will be the path to the taxonomic
    # assignment results. If assign_taxonomy is False this will be either
    # the precomputed taxonomic assignments that the user passed in,
    # or None.
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\
        (otu_fp, taxonomy_str, otu_table_fp, params_str)

    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
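A hedged usage sketch for run_pick_closed_reference_otus; the filepaths are placeholders for whatever closed-reference collection is in use, and the empty parameters dict means the uclust_ref default above applies. Import locations follow QIIME 1 conventions and may differ by version.

from qiime.util import load_qiime_config
from qiime.parse import parse_qiime_parameters
from qiime.workflow.util import call_commands_serially

run_pick_closed_reference_otus(
    input_fp='seqs.fna',                 # demultiplexed sequences (placeholder)
    refseqs_fp='97_otus.fasta',          # reference representative set (placeholder)
    output_dir='closed_ref_out',
    taxonomy_fp='97_otu_taxonomy.txt',   # predefined taxonomy; used because assign_taxonomy=False
    command_handler=call_commands_serially,
    params=parse_qiime_parameters([]),
    qiime_config=load_qiime_config())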
def run_core_diversity_analyses(biom_fp,
                                mapping_fp,
                                sampling_depth,
                                output_dir,
                                qiime_config,
                                command_handler=call_commands_serially,
                                tree_fp=None,
                                params=None,
                                categories=None,
                                arare_min_rare_depth=10,
                                arare_num_steps=10,
                                parallel=False,
                                suppress_taxa_summary=False,
                                suppress_beta_diversity=False,
                                suppress_alpha_diversity=False,
                                suppress_group_significance=False,
                                status_update_callback=print_to_stdout):
    """
    """
    if categories is not None:
        # Validate the categories provided by the user
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp, 'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" %
                    (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." %
                    c)

    else:
        categories = []
    comma_separated_categories = ','.join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(
            ('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" %
                     biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics', biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # Filter samples with fewer observations than the requested sampling_depth.
    # Since these get filtered for some analyses (e.g., beta diversity after
    # even sampling), it's useful to filter them here so they're excluded
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([(
            'Filter low sequence count samples from table (minimum sequence count: %d)'
            % sampling_depth, filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n" %
            filtered_biom_fp)
    biom_fp = filtered_biom_fp

    # Rarefy the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" %\
            (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([
            ('Rarefy the OTU table to %d sequences/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" %
                     rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarefy the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            # str.strip removes a *set* of characters and could eat metric-name
            # letters, so slice off the literal '_dm.txt' suffix instead.
            even_dm_fps = [(split(fp)[1][:-len('_dm.txt')], fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (
                    bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,
                                                          category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,
                                                       category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir,
                         mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category, boxplots_cmd)
                                     ])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric, plot_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))

            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' %
                 (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,
                                                     sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" %
                         rarefaction_plots_output_fp)

        index_links.append(
            ('Alpha rarefaction plots', rarefaction_plots_output_fp,
             _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = '%s/compare_%s' % \
                    (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp,
                         mapping_fp,
                         comma_separated_categories,
                         compare_alpha_output_dir,
                         params_str)
                    commands.append([
                        ('Compare alpha diversity (%s)' % alpha_metric,
                         compare_alpha_cmd)
                    ])
                    for category in categories:
                        alpha_comparison_stat_fp = '%s/%s_stats.txt' % \
                            (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = '%s/%s_boxplots.pdf' % \
                            (compare_alpha_output_dir, category)
                        index_links.append(
                            ('Alpha diversity statistics (%s, %s)' %
                             (category, alpha_metric),
                             alpha_comparison_stat_fp,
                             _index_headers['alpha_diversity']))
                        index_links.append(
                            ('Alpha diversity boxplots (%s, %s)' %
                             (category, alpha_metric),
                             alpha_comparison_boxplot_fp,
                             _index_headers['alpha_diversity']))
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n" %
                                 (alpha_metric, compare_alpha_output_dir))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,
                                                           category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' %
                                               taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html' %
                 taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html' %
                 taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_group_significance:
        try:
            params_str = get_params_str(params['group_significance'])
        except KeyError:
            params_str = ''
        # group significance tests, aka category significance
        for category in categories:
            group_signifance_fp = \
                '%s/group_significance_%s.txt' % (output_dir, category)
            if not exists(group_signifance_fp):
                # Build the OTU category significance command
                group_significance_cmd = \
                    'group_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (rarefied_biom_fp, mapping_fp, category,
                     group_signifance_fp, params_str)
                commands.append([('Group significance (%s)' % category,
                                  group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" %
                    (category, group_signifance_fp))

            index_links.append(
                ('Category significance (%s)' % category, group_signifance_fp,
                 _index_headers['group_significance']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of filtered BIOM table as %s exists.\n\n" %
            filtered_biom_gzip_fp)
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp, _index_headers['run_summary']))

    rarefied_biom_gzip_fp = '%s.gz' % rarefied_biom_fp
    if not exists(rarefied_biom_gzip_fp):
        commands.append([('Compress the rarefied BIOM table',
                          'gzip %s' % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compression of the rarefied BIOM table as %s exists.\n\n" %
            rarefied_biom_gzip_fp)
    index_links.append(
        ('Rarefied BIOM table (sampling depth: %d)' % sampling_depth,
         rarefied_biom_gzip_fp, _index_headers['run_summary']))

    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
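Both copies of run_core_diversity_analyses above rely on the same resume pattern: a step is queued only when its expected output is missing, and a skip message is logged otherwise, so rerunning the workflow in an existing output_dir redoes only the incomplete steps. A distilled sketch of that pattern (the helper name is hypothetical; logger is any object with a write method, such as WorkflowLogger):

from os.path import exists

def queue_if_missing(commands, logger, label, cmd, output_fp):
    # Queue the command only when its output doesn't exist yet; otherwise
    # log why the step was skipped, mirroring the workflow functions above.
    if not exists(output_fp):
        commands.append([(label, cmd)])
    else:
        logger.write("Skipping '%s' as %s exists.\n\n" % (label, output_fp))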
Example #18
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     histogram_categories=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_3d_plots=False,
                                     suppress_2d_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
         1) Compute a beta diversity distance matrix;
         2) Perform a principal coordinates analysis on the result of
          Step 1;
         3) Generate a 3D prefs file for optimized coloring of continuous
          variables;
         4) Generate a 3D plot for all mapping fields with colors
          optimized for continuous data;
         5) Generate a 3D plot for all mapping fields with colors
          optimized for discrete data.
    
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
     parse_mapping_file(open(mapping_fp,'U'))
    if histogram_categories:
        invalid_categories = set(histogram_categories) - set(mapping_header)
        if invalid_categories:
            raise ValueError(
                "Invalid histogram categories - these must exactly match "
                "mapping file column headers: %s" %
                ' '.join(invalid_categories))
    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
          get_interesting_mapping_fields(mapping_data, mapping_header) or\
          mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
         (output_dir, otu_table_basename,
          sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
         '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
         (python_exe_fp, script_dir, otu_table_fp,
          even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the 3d prefs file generator command
    prefs_fp = '%s/prefs.txt' % output_dir
    try:
        params_str = get_params_str(params['make_prefs_file'])
    except KeyError:
        params_str = ''
    # Use .get since 'make_prefs_file' may be absent from params entirely.
    if 'mapping_headers_to_use' not in params.get('make_prefs_file', {}):
        params_str = '%s --mapping_headers_to_use %s' \
         % (params_str, mapping_fields)
    # Build the 3d prefs file generator command
    prefs_cmd = \
     '%s %s/make_prefs_file.py -m %s -o %s %s' %\
     (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
    commands.append([('Build prefs file', prefs_cmd)])

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])


        orig_beta_div_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
         (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate 3d plots
        if not suppress_3d_plots:
            # Prep the continuous-coloring 3d plots command
            continuous_3d_dir = '%s/%s_3d_continuous/' %\
             (output_dir, beta_diversity_metric)
            create_dir(continuous_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            continuous_3d_command = \
             '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_3d_dir,
               mapping_fp, params_str)

            # Prep the discrete-coloring 3d plots command
            discrete_3d_dir = '%s/%s_3d_discrete/' %\
             (output_dir, beta_diversity_metric)
            create_dir(discrete_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 3d plots command
            discrete_3d_command = \
             '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_3d_dir,
               mapping_fp, params_str)

            commands.append([\
              ('Make 3D plots (continuous coloring, %s)' %\
                beta_diversity_metric,continuous_3d_command),\
              ('Make 3D plots (discrete coloring, %s)' %\
                beta_diversity_metric,discrete_3d_command,)])

        # Generate 2d plots
        if not suppress_2d_plots:
            # Prep the continuous-coloring 2d plots command
            continuous_2d_dir = '%s/%s_2d_continuous/' %\
             (output_dir, beta_diversity_metric)
            create_dir(continuous_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 2d plots command
            continuous_2d_command = \
             '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, prefs_fp, pc_fp, continuous_2d_dir,
               mapping_fp, params_str)

            # Prep the discrete-coloring 2d plots command
            discrete_2d_dir = '%s/%s_2d_discrete/' %\
             (output_dir, beta_diversity_metric)
            create_dir(discrete_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 2d plots command
            discrete_2d_command = \
             '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
              (python_exe_fp, script_dir, mapping_fields, pc_fp, discrete_2d_dir,
               mapping_fp, params_str)

            commands.append([\
              ('Make 2D plots (continuous coloring, %s)' %\
                beta_diversity_metric,continuous_2d_command),\
              ('Make 2D plots (discrete coloring, %s)' %\
                beta_diversity_metric,discrete_2d_command,)])

        if histogram_categories:
            # Prep the distance histograms command
            histograms_dir = '%s/%s_histograms/' %\
             (output_dir, beta_diversity_metric)
            create_dir(histograms_dir)
            try:
                params_str = get_params_str(params['make_distance_histograms'])
            except KeyError:
                params_str = ''
            # Build the make_distance_histograms command
            distance_histograms_command = \
             '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
              (python_exe_fp, script_dir, beta_div_fp,
               histograms_dir, mapping_fp,
               ','.join(histogram_categories), params_str)

            commands.append([\
              ('Make Distance Histograms (%s)' %\
                beta_diversity_metric,distance_histograms_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
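A hedged sketch of calling run_beta_diversity_through_plots directly (placeholder inputs; import locations follow QIIME 1 conventions and may differ by version). The return value is the (metric, distance matrix filepath) list assembled in the loop above, which callers such as run_core_diversity_analyses iterate over to build boxplots and index links.

from qiime.util import load_qiime_config
from qiime.parse import parse_qiime_parameters
from qiime.workflow.util import call_commands_serially

dm_fps = run_beta_diversity_through_plots(
    otu_table_fp='table_even1000.biom',  # typically an evenly sampled table
    mapping_fp='map.txt',
    output_dir='bdiv_even1000/',
    command_handler=call_commands_serially,
    params=parse_qiime_parameters([]),   # defaults: weighted/unweighted UniFrac
    qiime_config=load_qiime_config(),
    tree_fp='rep_set.tre')               # UniFrac metrics require a tree
# dm_fps is e.g. [('weighted_unifrac', 'bdiv_even1000/weighted_unifrac_dm.txt'), ...]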
def main():
    option_parser, opts, args =\
        parse_command_line_parameters(suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    demultiplexing_method = opts.demultiplexing_method
    parameter_fp = opts.parameter_fp
    read_indicator = opts.read_indicator
    barcode_indicator = opts.barcode_indicator
    mapping_indicator = opts.mapping_indicator
    mapping_extensions = opts.mapping_extensions.split(',')
    sampleid_indicator = opts.sampleid_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name enabled, "
                            "--include_input_dir_path must be enabled.")

    if opts.parameter_fp:
        with open(opts.parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['split_libraries_fastq'])
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    all_fastq = []
    all_mapping = []

    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    for root, dirs, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_fastq += [abspath(join(root, fp))]

    if demultiplexing_method == 'mapping_barcode_files':
        for root, dirs, fps in walk(input_dir):
            for fp in fps:
                for mapping_extension in mapping_extensions:
                    if fp.endswith(mapping_extension):
                        all_mapping += [abspath(join(root, fp))]

        all_files = get_matching_files(all_fastq, all_mapping, read_indicator,
                                       barcode_indicator, mapping_indicator)
    else:
        all_files = all_fastq

    commands = create_commands_slf(all_files, demultiplexing_method,
                                   output_dir, params_str, leading_text,
                                   trailing_text, include_input_dir_path,
                                   remove_filepath_in_name, sampleid_indicator)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
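
# Every workflow here builds `commands` as a list of command groups, each
# group being a list of (label, shell_command) tuples, then hands the list to
# a handler such as print_commands or call_commands_serially. The sketch
# below is a minimal handler compatible with that calling convention; it is
# illustrative only (QIIME's real handlers also run the commands and check
# their exit status).
def _dry_run_handler(commands, status_update_callback, logger,
                     close_logger_on_success=True):
    """Log and echo each command instead of executing it."""
    for command_group in commands:
        for label, command in command_group:
            status_update_callback('%s: %s' % (label, command))
            logger.write('# %s\n%s\n\n' % (label, command))
    if close_logger_on_success:
        logger.close()
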
def run_alpha_rarefaction(otu_table_fp,
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth is None:
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(
                load_table(otu_table_fp))
        max_rare_depth = median_count
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)
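    # Worked example (illustrative numbers): with min_rare_depth=10, a median
    # count of 1010 and num_steps=10, step = int((1010 - 10) / 10) = 100, so
    # rarefied tables are built at depths 10, 110, 210, ..., 1010. The `or 1`
    # above keeps step positive when the depth range is smaller than num_steps.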

    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            'parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            'multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "alpha_diversity.py -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)

    commands.append([('Alpha diversity on rarefied OTU tables',
                      alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = 'collate_alpha.py -i %s -o %s %s' %\
        (alpha_diversity_dir, alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    if not retain_intermediate_files:
        commands.append([
            ('Removing intermediate files',
             'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir))
        ])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''

    if ('std_type' in params.get('make_rarefaction_plots', {}) or
            not plot_stderr_and_stddev):
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)

        # Build the make rarefaction plot command
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir, params_str)
        commands.append([('Rarefaction plot: All metrics',
                          make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)

        # Build the make rarefaction plot commands (one for stddev, one for
        # stderr error bars)
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stddev,
             params_str)
        commands.append([('Rarefaction plot: All metrics',
                          make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stderr,
             params_str)
        commands.append([('Rarefaction plot: All metrics',
                          make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
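
# A hedged usage sketch for run_alpha_rarefaction. The import paths below
# assume a QIIME 1.9-style package layout and may need adjusting; the input
# paths are placeholders.
def _example_run_alpha_rarefaction():
    from qiime.util import load_qiime_config
    from qiime.parse import parse_qiime_parameters
    from qiime.workflow.util import call_commands_serially, print_to_stdout

    run_alpha_rarefaction(otu_table_fp='otu_table.biom',
                          mapping_fp='map.txt',
                          output_dir='arare_out/',
                          command_handler=call_commands_serially,
                          params=parse_qiime_parameters([]),
                          qiime_config=load_qiime_config(),
                          tree_fp='rep_set.tre',
                          num_steps=10,
                          min_rare_depth=10,
                          max_rare_depth=1000,
                          status_update_callback=print_to_stdout)
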
def iterative_pick_subsampled_open_reference_otus(
                              input_fps, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    # If the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. We want to
    # pre-filter all data against the input data, as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp
    
    otu_table_fps = []
    repset_fasta_fps = []
    for i,input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir,i)
        if iteration_output_exists(iteration_output_dir,min_otu_size):
            # if the output from an iteration already exists, skip that 
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger,[input_fp,refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i,input_fp))
        else:
            pick_subsampled_open_reference_otus(input_fp=input_fp,
                                     refseqs_fp=refseqs_fp,
                                     output_dir=iteration_output_dir,
                                     percent_subsample=percent_subsample,
                                     new_ref_set_id='.'.join([new_ref_set_id,str(i)]),
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     run_assign_tax=False,
                                     run_align_and_tree=False,
                                     prefilter_refseqs_fp=prefilter_refseqs_fp,
                                     prefilter_percent_id=prefilter_percent_id,
                                     min_otu_size=min_otu_size,
                                     step1_otu_map_fp=step1_otu_map_fp,
                                     step1_failures_fasta_fp=step1_failures_fasta_fp,
                                     parallel=parallel,
                                     suppress_step4=suppress_step4,
                                     logger=logger,
                                     suppress_md5=suppress_md5,
                                     denovo_otu_picking_method=denovo_otu_picking_method,
                                     reference_otu_picking_method=reference_otu_picking_method,
                                     status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the previous iteration's
        ## data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir,min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)
    
    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)        
        commands.append([("Merge OTU tables",merge_cmd)])
    
    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,final_repset_fp)
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
    
    logger.close()
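
# A hedged usage sketch for the iterative workflow above; all paths and the
# reference set id are placeholders, and the import paths assume a QIIME
# 1.9-style layout.
def _example_iterative_pick_otus():
    from qiime.util import load_qiime_config
    from qiime.parse import parse_qiime_parameters
    from qiime.workflow.util import call_commands_serially

    iterative_pick_subsampled_open_reference_otus(
        input_fps=['seqs_run1.fna', 'seqs_run2.fna'],
        refseqs_fp='reference_otus.fasta',
        output_dir='iterative_otus/',
        percent_subsample=0.001,
        new_ref_set_id='NewRefOTUs',
        command_handler=call_commands_serially,
        params=parse_qiime_parameters([]),
        qiime_config=load_qiime_config())
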
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_emperor_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots

        The steps performed by this function are:
         1) Compute a beta diversity distance matrix for each metric
         2) Perform a principal coordinates analysis on the result of step 1
         3) Generate an emperor plot for each result of step 2

    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
        parse_mapping_file(open(mapping_fp, 'U'))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename,
             sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            'single_rarefaction.py -i %s -o %s -d %d' %\
            (otu_table_fp, even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = 'parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (otu_table_fp, output_dir, beta_diversity_metric, params_str)
            commands.append([('Beta Diversity (%s)' % beta_diversity_metric,
                              beta_div_cmd)])
        else:
            beta_div_cmd = 'beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (otu_table_fp, output_dir, beta_diversity_metric, params_str)
            commands.append([('Beta Diversity (%s)' % beta_diversity_metric,
                              beta_div_cmd)])

        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = 'principal_coordinates.py -i %s -o %s %s' %\
            (beta_div_fp, pc_fp, params_str)
        commands.append([('Principal coordinates (%s)' % beta_diversity_metric,
                          pc_cmd)])

        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir,
                                                        beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            emperor_command = \
                'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp,
                                                          emperor_dir,
                                                          mapping_fp,
                                                          params_str)

            commands.append([
                ('Make emperor plots (%s)' % beta_diversity_metric,
                 emperor_command)
            ])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
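
# The (metric, distance matrix path) pairs returned above feed naturally into
# downstream statistics. A hedged sketch: compare_categories.py is a real
# QIIME 1 script, but verify the flags against your install, and 'Treatment'
# is a placeholder mapping-file category.
def _example_consume_dm_fps(dm_fps, mapping_fp, output_dir):
    """Build compare_categories.py commands, one per distance matrix."""
    commands = []
    for metric, dm_fp in dm_fps:
        cmd = 'compare_categories.py --method adonis -i %s -m %s -c Treatment -o %s/%s_adonis/' %\
            (dm_fp, mapping_fp, output_dir, metric)
        commands.append([('Compare categories (%s)' % metric, cmd)])
    return commands
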
def pick_nested_reference_otus(input_fasta_fp,
                               input_tree_fp,
                               output_dir,
                               run_id,
                               similarity_thresholds,
                               command_handler,
                               status_update_callback=print_to_stdout):

    # Prepare some variables for the later steps
    create_dir(output_dir)
    otu_dir = join(output_dir, 'otus')
    create_dir(otu_dir)
    rep_set_dir = join(output_dir, 'rep_set')
    create_dir(rep_set_dir)
    # currently not doing anything with taxonomies and trees
    # tax_dir = join(output_dir,'taxonomies')
    # create_dir(tax_dir)
    if input_tree_fp:
        tree_dir = join(output_dir, 'trees')
        create_dir(tree_dir)
    commands = []
    files_to_remove = []

    logger = WorkflowLogger(generate_log_fp(output_dir))
    similarity_thresholds.sort()
    similarity_thresholds.reverse()

    current_inseqs_fp = input_fasta_fp
    current_tree_fp = input_tree_fp
    previous_otu_map = None
    for similarity_threshold in similarity_thresholds:
        current_inseqs_basename = splitext(split(current_inseqs_fp)[1])[0]

        # pick otus command
        otu_fp = '%s/%d_otu_map.txt' % (otu_dir, similarity_threshold)
        clusters_fp = '%s/%d_clusters.uc' % (otu_dir, similarity_threshold)
        temp_otu_fp = '%s/%s_otus.txt' % (otu_dir, current_inseqs_basename)
        temp_log_fp = '%s/%s_otus.log' % (otu_dir, current_inseqs_basename)
        temp_clusters_fp = '%s/%s_clusters.uc' % (otu_dir,
                                                  current_inseqs_basename)
        pick_otus_cmd = \
            'pick_otus.py -m uclust -DBz -i %s -s %1.2f -o %s' % (
              current_inseqs_fp,
              similarity_threshold / 100.0,  # float division; int division would yield 0
              otu_dir)

        commands.append([('Pick OTUs (%d)' % similarity_threshold,
                          pick_otus_cmd)])
        commands.append([('Rename OTU file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_otu_fp, otu_fp))])
        commands.append([('Rename uc file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_clusters_fp, clusters_fp))])
        files_to_remove.append(temp_log_fp)

        # rep set picking
        temp_rep_set_fp = get_tmp_filename(prefix='NestedReference',
                                           suffix='.fasta')
        pick_rep_set_cmd = \
         'pick_rep_set.py -m first -i %s -o %s -f %s' % (
          otu_fp,
          temp_rep_set_fp,
          current_inseqs_fp)
        commands.append([('Pick Rep Set (%d)' % similarity_threshold,
                          pick_rep_set_cmd)])
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

        # rename representative sequences
        rep_set_fp = '%s/%d_otus_%s.fasta' % (rep_set_dir,
                                              similarity_threshold, run_id)
        logger.write(
            'Renaming OTU representative sequences so OTU ids are reference sequence ids.'
        )
        rep_set_f = open(rep_set_fp, 'w')
        for e in rename_rep_seqs(open(temp_rep_set_fp, 'U')):
            rep_set_f.write('>%s\n%s\n' % e)
        rep_set_f.close()
        files_to_remove.append(temp_rep_set_fp)

        # filter the tree, if provided
        if current_tree_fp is not None:
            tree_fp = '%s/%d_otus_%s.tre' % (tree_dir, similarity_threshold,
                                             run_id)
            tree_cmd = 'filter_tree.py -i %s -f %s -o %s' %\
                (current_tree_fp, rep_set_fp, tree_fp)
            commands.append([('Filter tree (%d)' % similarity_threshold,
                              tree_cmd)])
            command_handler(commands,
                            status_update_callback,
                            logger,
                            close_logger_on_success=False)
            # prep for the next iteration
            current_tree_fp = tree_fp

        # prep for the next iteration
        remove_files(files_to_remove)
        commands = []
        files_to_remove = []
        current_inseqs_fp = rep_set_fp

    logger.close()
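
# A hedged usage sketch for pick_nested_reference_otus: thresholds are
# whole-number percent identities (the function itself sorts them in
# descending order), and the inputs are placeholders.
def _example_pick_nested_reference_otus():
    from qiime.workflow.util import call_commands_serially

    pick_nested_reference_otus(input_fasta_fp='ref_99_otus.fasta',
                               input_tree_fp='ref_99_otus.tre',
                               output_dir='nested_otus/',
                               run_id='myrun',
                               similarity_thresholds=[97, 94, 91],
                               command_handler=call_commands_serially)
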
def run_jackknifed_beta_diversity(otu_table_fp,
                                  tree_fp,
                                  seqs_per_sample,
                                  output_dir,
                                  command_handler,
                                  params,
                                  qiime_config,
                                  mapping_fp,
                                  parallel=False,
                                  logger=None,
                                  suppress_md5=False,
                                  status_update_callback=print_to_stdout,
                                  master_tree=None):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Compute beta diversity distance matrix from otu table (and
           tree, if applicable)
          2) Build rarefied OTU tables;
          3) Build UPGMA tree from full distance matrix;
          4) Compute distance matrices for rarefied OTU tables;
          5) Build UPGMA trees from rarefied OTU table distance matrices;
          5.5) Build a consensus tree from the rarefied UPGMA trees
          6) Compare rarefied OTU table distance matrix UPGMA trees
           to the full UPGMA tree and write a support file and newick tree
           with support values as node labels.

        master_tree can be 'full' or 'consensus', default full
    """
    # Prepare some variables for the later steps
    if master_tree is None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str, tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = 'beta_diversity.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)
    commands.append([
        ('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics),
         beta_div_cmd)
    ])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
        'multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
        (otu_table_fp, seqs_per_sample, rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)

        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,
                                            otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = 'upgma_cluster.py -i %s -o %s %s' %\
            (distance_matrix_fp, full_tree_fp, params_str)
        commands.append([
            ('UPGMA on full distance matrix: %s' % beta_diversity_metric,
             hierarchical_cluster_cmd)
        ])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        try:
            d = params['beta_diversity'].copy()
        except KeyError:
            d = {}
        d.pop('metrics', None)
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str, tree_fp)
        if parallel:
            params_str += ' %s' % get_params_str(params['parallel'])
            # Build the parallel beta diversity command (for rarefied OTU
            # tables)
            beta_div_rarefied_cmd = \
                'parallel_beta_diversity.py -T -i %s -o %s %s' %\
                (rarefaction_dir, dm_dir, params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
                'beta_diversity.py -i %s -o %s %s' %\
                (rarefaction_dir, dm_dir, params_str)
        commands.append([('Beta diversity on rarefied OTU tables (%s)' %
                          beta_diversity_metric, beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)

        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
            'upgma_cluster.py -i %s -o %s %s' % (dm_dir, upgma_dir, params_str)
        commands.append([
            ('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric,
             hierarchical_cluster_cmd)
        ])

        # Build the consensus tree command
        consensus_tree_cmd =\
            'consensus_tree.py -i %s -o %s %s' %\
            (upgma_dir, metric_output_dir + "/rare_upgma_consensus.tre",
             params_str)
        commands.append([('consensus on rarefied distance matrices (%s)' %
                          beta_diversity_metric, consensus_tree_cmd)])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''

        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' %
                               (master_tree, ))
        tree_compare_cmd = 'tree_compare.py -s %s -m %s -o %s %s' %\
            (upgma_dir, master_tree_fp, tree_compare_dir, params_str)
        commands.append([('Tree compare (%s)' % beta_diversity_metric,
                          tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = 'principal_coordinates.py -i %s -o %s %s' %\
            (dm_dir, pcoa_dir, params_str)
        commands.append([('Principal coordinates (%s)' % beta_diversity_metric,
                          pcoa_cmd)])

        # Prep the emperor plots command
        emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir
        create_dir(emperor_dir)
        try:
            params_str = get_params_str(params['make_emperor'])
        except KeyError:
            params_str = ''
        emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\
            (pcoa_dir, emperor_dir, mapping_fp, params_str)
        commands.append([('emperor plots (%s)' % beta_diversity_metric,
                          emperor_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
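
# A hedged usage sketch for run_jackknifed_beta_diversity: seqs_per_sample
# should sit below the smallest per-sample sequence count so every sample
# survives rarefaction. Paths are placeholders; import paths assume a QIIME
# 1.9-style layout.
def _example_run_jackknifed_beta_diversity():
    from qiime.util import load_qiime_config
    from qiime.parse import parse_qiime_parameters
    from qiime.workflow.util import call_commands_serially

    run_jackknifed_beta_diversity(otu_table_fp='otu_table.biom',
                                  tree_fp='rep_set.tre',
                                  seqs_per_sample=100,
                                  output_dir='jackknifed_bdiv/',
                                  command_handler=call_commands_serially,
                                  params=parse_qiime_parameters([]),
                                  qiime_config=load_qiime_config(),
                                  mapping_fp='map.txt',
                                  master_tree='consensus')
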
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout):
    """
    """
    if categories is not None:
        # Validate the categories provided by the user
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp, 'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError("Category '%s' is not a column header "
                                 "in your mapping file. "
                                 "Categories are case and white space sensitive. Valid "
                                 "choices are: (%s)" % (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError("Category '%s' contains only one value. "
                                 "Categories analyzed here require at least two values." % c)
    else:
        categories = []
    
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    
    # begin logging
    old_log_fps = glob(join(output_dir,'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(('Previous run log',old_log_fp,_index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
         "biom summarize-table -i %s -o %s --suppress-md5 %s" % \
         (biom_fp, biom_table_stats_output_fp,params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" \
                     % biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    
    # filter samples with fewer observations than the requested sampling_depth. 
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered 
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
         (biom_fp,filtered_biom_fp,sampling_depth)
        commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                          filter_samples_cmd)])
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" \
                     % filtered_biom_fp)
    biom_fp = filtered_biom_fp
    
    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands, 
                        status_update_callback, 
                        logger,
                        close_logger_on_success=False)
        commands = []
    
    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
        # Need to check for the existence of any distance matrices, since the user 
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
             otu_table_fp=biom_fp, 
             mapping_fp=mapping_fp,
             output_dir=bdiv_even_output_dir,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             sampling_depth=sampling_depth,
             tree_fp=tree_fp,
             parallel=parallel,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" \
                         % ', '.join(existing_dm_fps))
            # Recover metric names by dropping the '_dm.txt' suffix
            # (str.strip would remove arbitrary characters, not the suffix).
            even_dm_fps = [(split(fp)[1][:-len('_dm.txt')], fp)
                           for fp in existing_dm_fps]
        
        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''
        
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                     'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                     (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write("Skipping make_distance_boxplots.py for %s as %s exists.\n\n" \
                                 % (category, plot_output_fp))
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    plot_output_fp,
                                    _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                    stats_output_fp,
                                    _index_headers['beta_diversity_even'] % sampling_depth))
            
            index_links.append(('PCoA plot (%s)' % bdiv_metric,
                                '%s/%s_emperor_pcoa_plot/index.html' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                                '%s/%s_pc.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
    
    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
        rarefaction_plots_output_fp = \
         '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=arare_full_output_dir,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             tree_fp=tree_fp,
             num_steps=arare_num_steps,
             parallel=parallel,
             logger=logger,
             min_rare_depth=arare_min_rare_depth,
             max_rare_depth=sampling_depth,
             suppress_md5=True,
             status_update_callback=status_update_callback)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" \
                         % rarefaction_plots_output_fp)
    
        index_links.append(('Alpha rarefaction plots',
                            rarefaction_plots_output_fp,
                            _index_headers['alpha_diversity']))
                        
        collated_alpha_diversity_fps = \
         glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
            
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                 (arare_full_output_dir,category,alpha_metric)
                if not exists(alpha_comparison_output_fp):
                    compare_alpha_cmd = \
                     'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                     (collated_alpha_diversity_fp, mapping_fp, category, 
                      alpha_comparison_output_fp, params_str)
                    commands.append([('Compare alpha diversity (%s, %s)' %\
                                       (category,alpha_metric),
                                      compare_alpha_cmd)])
                else:
                    logger.write("Skipping compare_alpha_diversity.py for %s as %s exists.\n\n" \
                                 % (category, alpha_comparison_output_fp))
                index_links.append(
                 ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric),
                  alpha_comparison_output_fp,
                  _index_headers['alpha_diversity']))
    
    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can 
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = \
            glob(join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=taxa_plots_output_dir,
             mapping_cat=None, 
             sort=True,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)
        else:
            logger.write("Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n" \
                         % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category)
            # need to check for existence of any html files, since the user can 
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                 otu_table_fp=biom_fp,
                 mapping_fp=mapping_fp,
                 output_dir=taxa_plots_output_dir,
                 mapping_cat=category, 
                 sort=True,
                 command_handler=command_handler,
                 params=params,
                 qiime_config=qiime_config,
                 logger=logger,
                 suppress_md5=True,
                 status_update_callback=status_update_callback)
            else:
                logger.write("Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n" \
                             % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
    
    if not suppress_otu_category_significance:
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # OTU category significance
        for category in categories:
            category_significance_fp = \
                '%s/category_significance_%s.txt' % (output_dir, category)
            if not exists(category_significance_fp):
                # Build the OTU category significance command
                category_significance_cmd = \
                    'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (biom_fp, mapping_fp, category,
                     category_significance_fp, params_str)
                commands.append([('OTU category significance (%s)' % category,
                                  category_significance_cmd)])
            else:
                logger.write("Skipping otu_category_significance.py for %s as %s exists.\n\n"
                             % (category, category_significance_fp))

            index_links.append(('Category significance (%s)' % category,
                                category_significance_fp,
                                _index_headers['otu_category_sig']))
    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table','gzip %s' % filtered_biom_fp)])
        index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
                            filtered_biom_gzip_fp,
                            _index_headers['run_summary']))
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" \
                     % filtered_biom_gzip_fp)
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()
    
    generate_index_page(index_links,index_fp)
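
# A hedged usage sketch for run_core_diversity_analyses: categories must name
# real mapping-file columns with at least two values each; paths are
# placeholders and the import path assumes a QIIME 1.9-style layout.
def _example_run_core_diversity_analyses():
    from qiime.util import load_qiime_config

    run_core_diversity_analyses(biom_fp='otu_table.biom',
                                mapping_fp='map.txt',
                                sampling_depth=1000,
                                output_dir='core_div/',
                                qiime_config=load_qiime_config(),
                                tree_fp='rep_set.tre',
                                categories=['Treatment'])
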
def run_summarize_taxa_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     mapping_cat,
                                     sort,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     logger=None,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation for summarizing taxonomies and generating plots

        The steps performed by this function are:
          1) Summarize OTU by Category
          2) Summarize Taxonomy
          3) Plot Taxonomy Summary

    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp])

    # if mapping category not passed via command-line,
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat = params['collapse_samples']['collapse_fields']
        except KeyError:
            mapping_cat = None

    try:
        params_str = get_params_str(params['collapse_samples'])
        # Need to remove the mapping category option, since it is defined above.
        # Using this method since we don't want to change the params dict
        split_params = params_str.split('--')
        updated_params_str = []
        for i in split_params:
            if not i.startswith('collapse_fields'):
                updated_params_str.append(i)
        params_str = '--'.join(updated_params_str)
    except KeyError:
        params_str = ''

    if mapping_cat:
        base_filename = mapping_cat.replace(' ', '-').replace(',', '')
        output_biom_fp = join(output_dir, '%s_otu_table.biom' % base_filename)
        output_map_fp = join(output_dir, '%s_map.txt' % base_filename)
        # Build the collapse samples command
        collapse_samples_cmd = \
            "collapse_samples.py -m %s -b %s --output_biom_fp %s --output_mapping_fp %s --collapse_fields '%s' %s" %\
            (mapping_fp, otu_table_fp, output_biom_fp, output_map_fp, mapping_cat, params_str)

        commands.append([('Collapse samples in OTU table by categories',
                          collapse_samples_cmd)])

        otu_table_fp = output_biom_fp

    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except KeyError:
            params_str = ''

        # define output otu table
        sorted_fp = join(output_dir,
                         splitext(split(otu_table_fp)[-1])[0] + '_sorted.biom')

        if mapping_cat or params_str == '':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s" % (otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s -m %s %s" %\
                (otu_table_fp, sorted_fp, mapping_fp, params_str)

        commands.append([('Sort OTU Table', sort_otu_table_cmd)])

        # redefine otu_table_fp to use
        otu_table_fp = sorted_fp

    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except KeyError:
        params_str = ''

    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except KeyError:
        sum_taxa_levels = None

    # Build the summarize taxonomy command
    summarize_taxa_cmd = 'summarize_taxa.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)

    commands.append([('Summarize Taxonomy', summarize_taxa_cmd)])

    sum_taxa_fps = []

    if sum_taxa_levels:
        basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))
    else:
        basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
        # these are the default levels from summarize_taxa.py, but we cannot
        # import the script to get these values
        for i in [2, 3, 4, 5, 6]:
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)

    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''
    # Build the plot taxa summary plot command(s)

    plot_taxa_summary_cmd =\
        'plot_taxa_summary.py -i %s -o %s %s' %\
        (','.join(sum_taxa_fps), taxa_summary_plots_dir, params_str)

    commands.append([('Plot Taxonomy Summary', plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
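# A minimal usage sketch for the workflow function above (not part of the
# original listing). The name run_summarize_taxa_through_plots and the order
# of its leading positional parameters are assumptions inferred from the
# variables used in the body (otu_table_fp, mapping_fp, mapping_cat, sort);
# check the signature in your QIIME install before relying on it.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

run_summarize_taxa_through_plots(
    otu_table_fp='otu_table.biom',
    mapping_fp='map.txt',
    output_dir='taxa_summary/',
    mapping_cat='Treatment',   # collapse samples by this mapping column
    sort=False,                # skip the sort_otu_table.py step
    command_handler=call_commands_serially,
    params={'summarize_taxa': {'level': '2,3,4,5,6'}},
    qiime_config=load_qiime_config())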
Example #27
def assign_tax(repset_fasta_fp,
               output_dir,
               command_handler,
               params,
               qiime_config,
               parallel=False,
               logger=None,
               status_update_callback=print_to_stdout):

    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
        (output_dir, assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
        (assign_taxonomy_dir, input_basename)
    if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
            'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
            (assignment_method, repset_fasta_fp,
             assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
            (assign_taxonomy_dir, repset_fasta_fp, params_str)
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp
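# Sketch of chaining assign_tax into the downstream biom metadata step (file
# paths are placeholders; not part of the original listing):
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

taxonomy_fp = assign_tax(
    repset_fasta_fp='rep_set.fna',
    output_dir='otus/',
    command_handler=call_commands_serially,
    params={'assign_taxonomy': {'assignment_method': 'uclust'}},
    qiime_config=load_qiime_config())
# taxonomy_fp now points at
# otus/uclust_assigned_taxonomy/rep_set_tax_assignments.txt, ready to be fed
# to 'biom add-metadata' as done in the iterative workflow below.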
Example #28
def run_alpha_rarefaction(otu_table_fp, 
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.
    
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])
    
    if max_rare_depth is None:
        min_count, max_count, median_count, mean_count, counts_per_sample =\
         compute_counts_per_sample_stats(parse_biom_table(open(otu_table_fp,'U')))
        max_rare_depth = median_count
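    # 'or 1' on the next line guards against a step size of 0 when
    # (max_rare_depth - min_rare_depth) < num_steps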
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)
    
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])        
        # Build the rarefaction command
        rarefaction_cmd = \
         '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
         (python_exe_fp, script_dir, otu_table_fp, min_rare_depth, max_rare_depth,
          step, rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
         '%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
         (python_exe_fp, script_dir, otu_table_fp, min_rare_depth, max_rare_depth,
          step, rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])
    
    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])   
        # Build the alpha diversity command
        alpha_diversity_cmd = \
         "%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
         (python_exe_fp, script_dir, rarefaction_dir, alpha_diversity_dir,
          params_str)
    else:  
        # Build the alpha diversity command
        alpha_diversity_cmd = \
         "%s %s/alpha_diversity.py -i %s -o %s %s" %\
         (python_exe_fp, script_dir, rarefaction_dir, alpha_diversity_dir,
          params_str)

    commands.append(\
     [('Alpha diversity on rarefied OTU tables',alpha_diversity_cmd)])
     
    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, alpha_diversity_dir, \
      alpha_collated_dir, params_str)
    commands.append([('Collate alpha',alpha_collated_cmd)])
    
    if not retain_intermediate_files:
        commands.append([('Removing intermediate files',
                          'rm -r %s %s' % (rarefaction_dir,alpha_diversity_dir))])
    else:
        commands.append([('Skipping removal of intermediate files.','')])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''
    
    # .get avoids a KeyError when make_rarefaction_plots is absent from params
    if 'std_type' in params.get('make_rarefaction_plots', {}) or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)
        
        # Build the make rarefaction plot command(s)
        #for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
             '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
             (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
              rarefaction_plot_dir, params_str)
        commands.append(
            [('Rarefaction plot: All metrics', make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)
        
        # Build the make rarefaction plot command(s)
        # for metric in alpha_diversity_metrics:
        make_rarefaction_plot_cmd =\
             '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
             (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
              rarefaction_plot_dir_stddev, params_str)
        commands.append(
            [('Rarefaction plot: All metrics', make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
             '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
             (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
              rarefaction_plot_dir_stderr, params_str)
        commands.append(
            [('Rarefaction plot: All metrics', make_rarefaction_plot_cmd)])
   
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
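# Sketch of a typical invocation (paths and depths are placeholders, not part
# of the original listing). With min_rare_depth=10, max_rare_depth=1000 and
# num_steps=10, step is int((1000 - 10) / 10) = 99, so tables are rarefied at
# depths 10, 109, 208, ..., 1000.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

run_alpha_rarefaction(
    otu_table_fp='otu_table.biom',
    mapping_fp='map.txt',
    output_dir='arare/',
    command_handler=call_commands_serially,
    params={'alpha_diversity': {'metrics': 'PD_whole_tree,chao1'}},
    qiime_config=load_qiime_config(),
    tree_fp='rep_set.tre',
    min_rare_depth=10,
    max_rare_depth=1000)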
Example #29
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. We want to
    # pre-filter all data against the input data, as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback,
                minimum_failure_threshold=minimum_failure_threshold)
        # perform post-iteration file shuffling whether the previous iteration's
        # data previously existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp

        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))

        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(
                table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
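# Sketch: running the iterative workflow over two sequencing runs (all paths
# are placeholders, not part of the original listing). Each iteration's
# new_refseqs.fna becomes the reference collection for the next iteration.
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

iterative_pick_subsampled_open_reference_otus(
    input_fps=['run1_seqs.fna', 'run2_seqs.fna'],
    refseqs_fp='reference_otus.fasta',
    output_dir='iterative_otus/',
    percent_subsample=0.001,
    new_ref_set_id='NewRef',
    command_handler=call_commands_serially,
    params={},
    qiime_config=load_qiime_config())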
Example #30
def run_beta_diversity_through_plots(otu_table_fp, 
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_emperor_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots
    
        The steps performed by this function are:
         1) Compute a beta diversity distance matrix for each metric
         2) Perform a principal coordinates analysis on the result of step 1
         3) Generate an emperor plot for each result of step 2
    
    """  
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])
    
    mapping_data, mapping_header, mapping_comments =\
     parse_mapping_file(open(mapping_fp,'U'))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number 
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
          get_interesting_mapping_fields(mapping_data, mapping_header) or\
          mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)
    
    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
         (output_dir, otu_table_basename, 
          sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
         '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
         (python_exe_fp, script_dir, otu_table_fp,
          even_sampled_otu_table_fp, sampling_depth)
        commands.append([
         ('Sample OTU table at %d seqs/sample' % sampling_depth,
          single_rarefaction_cmd)])
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac','unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:
        
        # Prep the beta-diversity command
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass
        
        params_str = get_params_str(bdiv_params_copy)
            
        if tree_fp:
            params_str = '%s -t %s ' % (params_str,tree_fp)
            
        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
             (python_exe_fp, script_dir, otu_table_fp,
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
             (python_exe_fp, script_dir, otu_table_fp, 
              output_dir, beta_diversity_metric, params_str)
            commands.append(\
             [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        
        
        orig_beta_div_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
         (output_dir, beta_diversity_metric)
        commands.append([('Rename distance matrix (%s)' % beta_diversity_metric,
                         'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
        dm_fps.append((beta_diversity_metric, beta_div_fp))
        
        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])
        
        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir, beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            emperor_command = \
             'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp,
                                                       emperor_dir,
                                                       mapping_fp,
                                                       params_str)
       
            commands.append([('Make emperor plots (%s)' % beta_diversity_metric,
                              emperor_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    
    return dm_fps
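# The return value pairs each metric with its renamed distance matrix, e.g.
# [('weighted_unifrac', 'bdiv/weighted_unifrac_dm.txt'), ...], so downstream
# steps can be driven per metric. A sketch (placeholder paths, not part of
# the original listing):
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

dm_fps = run_beta_diversity_through_plots(
    'otu_table.biom', 'map.txt', 'bdiv', call_commands_serially,
    {}, load_qiime_config(), tree_fp='rep_set.tre')
for metric, dm_fp in dm_fps:
    print('%s distance matrix: %s' % (metric, dm_fp))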
Example #31
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None,
                      truncate_len=None):
    """ Run the ampliconnoise pipeline
    
        The steps performed by this function are:
1. Split input sff.txt file into one file per sample

2. Run scripts required for PyroNoise

3. Run scripts required for SeqNoise

4. Run scripts requred for Perseus (chimera removal)

5. Merge output files into one file similar to the output of split_libraries.py

    output_filepath should be absolute
    seqnoise_resolution should be string
    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting
    """
    map_data, headers, comments = parse_mapping_file(open(mapping_fp, 'U'))
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no '
                               'default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no '
                               'default for platform ' + platform)

    sample_names = []  # these are filenames minus extension, and are sample IDs
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # it's unclear why the primer isn't just removed at this point,
        # but that's done later
        # primer += (map_data[i][headers.index('LinkerPrimerSequence')])
        # for char, bases in IUPAC_DNA_ambiguities.items():
        #     primer = primer.replace(char,'['+''.join(bases)+']')

        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir, 'map.csv'), 'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i] + ',' + bc_seqs[i] + '\n')
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
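    # e.g. truncate_len '220' yields post_pyro_tail '_220', so per-sample
    # results are named like PC.354_220_seqnoise_cd.fa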
    if suppress_perseus:
        fasta_result_names = [
            sample_name + post_pyro_tail + '_seqnoise_cd.fa'
            for sample_name in sample_names
        ]
    else:
        fasta_result_names = [sample_name + '_Good.fa' \
          for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append(
        [('confirm pyro lookup filepath environment variable', cmd)])

    cmd = 'SplitKeys.pl '+one_primer+' map.csv < '+\
        os.path.join(called_dir,sff_txt_fp)+\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):

        # Build the summarize taxonomy command
        if platform == 'flx':
            cmd = 'Clean360.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows ' + sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows ' + sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np "+str(numnodes)+" PyroDist -in "+\
          sample_name+".dat -out "+sample_name+ " > "+sample_name+".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+".fdist -out "+sample_name+\
          " > "+sample_name+".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np "+str(numnodes)+" PyroNoise -din "+\
            sample_name+".dat -out "+\
            sample_name+"_pyronoise "+"-lin "+\
            sample_name+".list -s 60.0 -c 0.01 > "+\
            sample_name+"_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl '+bc_seqs[i]+one_primer+' '+truncate_len+' < '+\
            sample_name+'_pyronoise_cd.fa'+' > '+ sample_name+'_'+\
            truncate_len+'.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = "mpirun -np "+str(numnodes)+" SeqDist -in "+\
            sample_name+post_pyro_tail+\
            ".fa > "+sample_name+post_pyro_tail+".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+"fcl > "+\
            sample_name+post_pyro_tail+".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
        # PC.354_pyronoise_cd.snout

        cmd = "mpirun -np "+str(numnodes)+" SeqNoise -in "+\
            sample_name+post_pyro_tail+\
            ".fa -din "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+\
            "_seqnoise -lin "+sample_name+post_pyro_tail+'fcl.list -min '+\
            sample_name+'_pyronoise'+\
            '.mapping -s '+seqnoise_resolution+' -c 0.08 > '+\
            sample_name+post_pyro_tail+'.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:

            cmd = 'Perseus -sin '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa > ' +\
                sample_name+'.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl '+sample_name+'.per '+\
                str(chimera_alpha) + ' '+ str(chimera_beta)+\
                ' > '+sample_name+'.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa '+\
                sample_name+'.class 0.5 > '+sample_name+'_Chi.fa 2> '+\
                sample_name+'_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
         (python_exe_fp, script_dir, fasta_result_names[i],
         sample_name+'_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
      ' '.join([sample_name+'_unw.fna' for sample_name in sample_names]) +\
      ' > ' + output_filepath # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
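# The docstring above notes that PYRO_LOOKUP_FILE must be set before the
# AmpliconNoise binaries run. A defensive pre-check such as this sketch (not
# part of the original workflow) fails fast rather than letting SplitKeys.pl
# or PyroNoise die partway through the pipeline:
import os

if 'PYRO_LOOKUP_FILE' not in os.environ:
    raise RuntimeError('PYRO_LOOKUP_FILE is not set; AmpliconNoise commands '
                       'will not find their flowgram lookup table.')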
Example #32
def run_jackknifed_beta_diversity(otu_table_fp,
                                  tree_fp,
                                  seqs_per_sample,
                                  output_dir,
                                  command_handler,
                                  params,
                                  qiime_config,
                                  mapping_fp,
                                  parallel=False,
                                  logger=None,
                                  suppress_md5=False,
                                  status_update_callback=print_to_stdout,
                                  master_tree=None):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Compute beta diversity distance matrix from otu table (and
           tree, if applicable)
          2) Build rarefied OTU tables;
          3) Build UPGMA tree from full distance matrix;
          4) Compute distance matrices for rarefied OTU tables;
          5) Build UPGMA trees from rarefied OTU table distance matrices;
          5.5) Build a consensus tree from the rarefied UPGMA trees
          6) Compare rarefied OTU table distance matrix UPGMA trees
           to the full UPGMA tree and write support file and newick tree
           with support values as node labels.
           
        master_tree can be 'full' or 'consensus', default full
    """
    # Prepare some variables for the later steps
    if master_tree is None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[otu_table_fp,mapping_fp,tree_fp])
    
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac','unweighted_unifrac']
    
    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str,tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    commands.append(\
     [('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics), beta_div_cmd)])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
     '%s %s/multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, seqs_per_sample,
      rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
         (output_dir, beta_diversity_metric, otu_table_basename)
    
        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = '%s %s/upgma_cluster.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, distance_matrix_fp, full_tree_fp, params_str)
        commands.append(\
         [('UPGMA on full distance matrix: %s' % beta_diversity_metric,\
           hierarchical_cluster_cmd)])
           
        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        try:
            d = params['beta_diversity'].copy()
            del d['metrics']
        except KeyError:
            # d must still be defined for the get_params_str call below
            d = {}
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str,tree_fp)
        if parallel:
            params_str += ' %s' % get_params_str(params['parallel'])        
            # Build the parallel beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
             '%s %s/parallel_beta_diversity.py -T -i %s -o %s %s' %\
             (python_exe_fp, script_dir, rarefaction_dir, dm_dir, params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
             '%s %s/beta_diversity.py -i %s -o %s %s' %\
             (python_exe_fp, script_dir, rarefaction_dir, dm_dir, params_str)
        commands.append(\
         [('Beta diversity on rarefied OTU tables (%s)' % beta_diversity_metric,
           beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied 
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)

        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied 
        # distance matrices)
        hierarchical_cluster_cmd =\
         '%s %s/upgma_cluster.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, dm_dir, upgma_dir, params_str)
        commands.append(\
         [('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric,
           hierarchical_cluster_cmd)])
        

        # Build the consensus tree command
        consensus_tree_cmd =\
         '%s %s/consensus_tree.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, upgma_dir, metric_output_dir + "/rare_upgma_consensus.tre",
            params_str)
        commands.append(\
         [('consensus on rarefied distance matrices (%s)' % beta_diversity_metric,
           consensus_tree_cmd)])
           
           
        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''

        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' % (master_tree,))
        tree_compare_cmd = '%s %s/tree_compare.py -s %s -m %s -o %s %s' %\
         (python_exe_fp, script_dir, upgma_dir, master_tree_fp, 
          tree_compare_dir, params_str)
        commands.append(\
         [('Tree compare (%s)' % beta_diversity_metric, tree_compare_cmd)])
           
        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, dm_dir, pcoa_dir, params_str)
        commands.append(\
         [('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)])
         
        # Prep the emperor plots command
        emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir
        create_dir(emperor_dir)
        try:
            params_str = get_params_str(params['make_emperor'])
        except KeyError:
            params_str = ''
        emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\
         (pcoa_dir, emperor_dir, mapping_fp, params_str)
        commands.append(\
         [('emperor plots (%s)' % beta_diversity_metric, emperor_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
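# Sketch: jackknifed beta diversity at an even depth of 100 seqs/sample,
# comparing the rarefied UPGMA trees against their consensus rather than the
# full-table tree (placeholder paths, not part of the original listing):
from qiime.util import load_qiime_config
from qiime.workflow.util import call_commands_serially

run_jackknifed_beta_diversity(
    otu_table_fp='otu_table.biom',
    tree_fp='rep_set.tre',
    seqs_per_sample=100,
    output_dir='jackknife/',
    command_handler=call_commands_serially,
    params={},
    qiime_config=load_qiime_config(),
    mapping_fp='map.txt',
    master_tree='consensus')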
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)

        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False


    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. This is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger, prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)' %
                 (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(step1_failures_fasta_f)

    # number of failures sequences is greater than the threshold,
    # continue to step 2,3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:

        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        logger.write('# Subsample the failures fasta file using API\n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', %f)"\n\n' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp),
                      percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        if reference_otu_picking_method == 'sortmerna':
            if 'sortmerna_db' in params['pick_otus']:
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)

        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

        index_links.append(
            ('Final map of OTU identifier to sequence identifiers (i.e., "OTU map")',
             merged_otu_map_fp,
             _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                            step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step, step4_cmd)])

        # Merge the OTU maps. Note that we explicitly use the '>' operator;
        # otherwise, passing the --force flag on the script interface would
        # append the newly created maps to the previously created map
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures', step4_rep_set_cmd)])
    else:
        # Merge the OTU maps. Note that we explicitly use the '>' operator;
        # otherwise, passing the --force flag on the script interface would
        # append the newly created maps to the previously created map
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the final failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (failures_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(('Final map of OTU identifier to sequence identifiers excluding '
                        'OTUs with fewer than %d sequences' % min_otu_size,
                        otu_no_singletons_fp,
                        _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', %d)"\n\n' %
                 (abspath(otu_fp),
                  abspath(otu_no_singletons_fp),
                  min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference between
    # what we need the reference set to be and what we need the repseqs to
    # be: the reference set needs to be a superset of the input reference
    # set, while the repset needs to contain only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in parse_fasta(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    logger.write('# Write non-singleton otus representative sequences ' +
                 'from step1 to the final rep set file: %s\n\n' % final_repset_fp)
    # copy the full input refseqs file to the new refseqs_fp
    copyfile(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    logger.write('# Copy the full input refseqs file to the new refseq file\n' +
                 'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    if run_step_2_and_3:
        for otu_id, seq in parse_fasta(open(step2_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in parse_fasta(open(step4_repset_fasta_fp, 'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table excluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table excluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            filtered_otu_table = filter_otus_from_otu_table(table,
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []


    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
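
The command batches built throughout these workflows share one shape: a list of batches, each batch a list of (description, shell command string) tuples, handed to a command handler with the call signature seen above. A minimal stand-in handler, shown purely as a hedged sketch (echo_commands is not a QIIME function), makes that contract concrete:

# Hedged sketch of the command-handler contract assumed by the workflows
# above; echo_commands only prints each batch instead of executing it.
def echo_commands(commands, status_update_callback, logger=None,
                  close_logger_on_success=True):
    for batch in commands:
        for description, cmd in batch:
            status_update_callback(description)
            print cmd
    if logger is not None and close_logger_on_success:
        logger.close()

# Each batch is a list of (description, shell command) tuples:
commands = [[('Pick rep set',
              'pick_rep_set.py -i otus.txt -o rep_set.fna -f seqs.fna')]]
echo_commands(commands, status_update_callback=lambda s: None)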
Example #34
def run_summarize_taxa_through_plots(otu_table_fp, 
                                     mapping_fp,
                                     output_dir,
                                     mapping_cat,
                                     sort,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     logger=None, 
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation for summarizing taxonomies and generating plots
    
        The steps performed by this function are:
          1) Summarize OTU by Category
          2) Summarize Taxonomy
          3) Plot Taxonomy Summary
          
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[otu_table_fp,mapping_fp])
    
    # if mapping category not passed via command-line, 
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat = params['summarize_otu_by_cat']['mapping_category']
        except KeyError:
            mapping_cat = None
        
    try:
        params_str = get_params_str(params['summarize_otu_by_cat'])
        # Need to remove the mapping category option, since it is defined above.
        # Using this method since we don't want to change the params dict
        split_params = params_str.split('--')
        updated_params_str = []
        for i in split_params:
            if not i.startswith('mapping_category'):
                updated_params_str.append(i)
        params_str = '--'.join(updated_params_str)
    except KeyError:
        params_str = ''
    
    if mapping_cat:
        output_fp=join(output_dir,'%s_otu_table.biom' % (mapping_cat.replace(' ','-')))
        # Build the summarize otu by category command
        summarize_otu_by_cat_cmd = \
         "%s %s/summarize_otu_by_cat.py -m %s -i %s -o %s -c '%s' %s" %\
         (python_exe_fp, script_dir, mapping_fp, otu_table_fp, output_fp,
          mapping_cat, params_str)
        
        commands.append(\
         [('Summarize OTU table by Category',summarize_otu_by_cat_cmd)])
         
        otu_table_fp=output_fp
    
    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except KeyError:
            params_str = ''
            
        # define output otu table
        sorted_fp=join(output_dir,
                       splitext(split(otu_table_fp)[-1])[0]+'_sorted.biom')
        
        if mapping_cat or params_str=='':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
             "%s %s/sort_otu_table.py -i %s -o %s" %\
             (python_exe_fp, script_dir, otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
             "%s %s/sort_otu_table.py -i %s -o %s -m %s %s" %\
             (python_exe_fp, script_dir, otu_table_fp, sorted_fp,
              mapping_fp, params_str)
        
        commands.append([('Sort OTU Table',sort_otu_table_cmd)])

        # redefine otu_table_fp to use
        otu_table_fp=sorted_fp
    
    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except KeyError:
        params_str = ''

    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except KeyError:
        sum_taxa_levels = None
        
    # Build the summarize taxonomy command
    summarize_taxa_cmd = '%s %s/summarize_taxa.py -i %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    
    commands.append([('Summarize Taxonomy',summarize_taxa_cmd)])

    sum_taxa_fps=[]
    
    if sum_taxa_levels:
        basename=join(output_dir,splitext(split(otu_table_fp)[-1])[0])
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename+'_L%s.txt' % (str(i)))
    else:
        basename=join(output_dir,splitext(split(otu_table_fp)[-1])[0])
        # this is the default levels from summarize_taxa, but cannot import
        # script to get these values
        for i in [2,3,4,5,6]:
            sum_taxa_fps.append(basename+'_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)
        
    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''
    # Build the plot taxa summary plot command(s)

    plot_taxa_summary_cmd =\
         '%s %s/plot_taxa_summary.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, ','.join(sum_taxa_fps),
          taxa_summary_plots_dir, params_str)
    
    commands.append(\
         [('Plot Taxonomy Summary',plot_taxa_summary_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
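
A hedged usage sketch for run_summarize_taxa_through_plots as defined above; the file paths are placeholders, and the imports follow the QIIME 1.x layout (module locations can differ between versions):

from qiime.util import load_qiime_config
from qiime.parse import parse_qiime_parameters
from qiime.workflow import call_commands_serially  # module path varies by QIIME version

run_summarize_taxa_through_plots(
    otu_table_fp='otu_table.biom',      # placeholder input table
    mapping_fp='map.txt',               # placeholder mapping file
    output_dir='taxa_plots/',
    mapping_cat=None,                   # or a mapping column to collapse by
    sort=True,
    command_handler=call_commands_serially,
    params=parse_qiime_parameters([]),  # empty parameters file
    qiime_config=load_qiime_config())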
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout,
):
    """
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )

    else:
        categories = []

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []

    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params["print_biom_table_summary"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    print_biom_table_summary_cmd = "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % (
        biom_fp,
        biom_table_stats_output_fp,
        params_str,
    )
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))
    commands.append([("Generate BIOM table summary", print_biom_table_summary_cmd)])

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
        biom_fp,
        filtered_biom_fp,
        sampling_depth,
    )
    commands.append(
        [
            (
                "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                filter_samples_cmd,
            )
        ]
    )
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list
    command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
    commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=bdiv_even_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            sampling_depth=sampling_depth,
            # force suppression of distance histograms - boxplots work better
            # in this context, and are created below.
            histogram_categories=[],
            tree_fp=tree_fp,
            parallel=parallel,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                try:
                    params_str = get_params_str(params["make_distance_boxplots"])
                except KeyError:
                    params_str = ""
                boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                    dm_fp,
                    category,
                    boxplots_output_dir,
                    mapping_fp,
                    params_str,
                )
                commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        "%s/%s_Distances.pdf" % (boxplots_output_dir, category),
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        "%s/%s_Stats.txt" % (boxplots_output_dir, category),
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )

            index_links.append(
                (
                    "3D plot (%s, continuous coloring)" % bdiv_metric,
                    "%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "3D plot (%s, discrete coloring)" % bdiv_metric,
                    "%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "2D plot (%s, continuous coloring)" % bdiv_metric,
                    "%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "2D plot (%s, discrete coloring)" % bdiv_metric,
                    "%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )

    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        run_alpha_rarefaction(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=arare_full_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            tree_fp=tree_fp,
            num_steps=arare_num_steps,
            parallel=parallel,
            logger=logger,
            min_rare_depth=arare_min_rare_depth,
            max_rare_depth=sampling_depth,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )

        index_links.append(
            (
                "Alpha rarefaction plots",
                "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir,
                _index_headers["alpha_diversity"],
            )
        )

        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = "%s/%s_%s.txt" % (arare_full_output_dir, category, alpha_metric)
                compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                    collated_alpha_diversity_fp,
                    mapping_fp,
                    category,
                    alpha_comparison_output_fp,
                    params_str,
                )
                commands.append([("Compare alpha diversity (%s, %s)" % (category, alpha_metric), compare_alpha_cmd)])
                index_links.append(
                    (
                        "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                        alpha_comparison_output_fp,
                        _index_headers["alpha_diversity"],
                    )
                )

    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        run_summarize_taxa_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=taxa_plots_output_dir,
            mapping_cat=None,
            sort=True,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )

        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=category,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )

            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )

    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_significance_fp = "%s/category_significance_%s.txt" % (output_dir, category)
            try:
                params_str = get_params_str(params["otu_category_significance"])
            except KeyError:
                params_str = ""
            # Build the OTU category significance command
            category_significance_cmd = "otu_category_significance.py -i %s -m %s -c %s -o %s %s" % (
                biom_fp,
                mapping_fp,
                category,
                category_significance_fp,
                params_str,
            )
            commands.append([("OTU category significance (%s)" % category, category_significance_cmd)])

            index_links.append(
                ("Category significance (%s)" % category, category_significance_fp, _index_headers["otu_category_sig"])
            )

    commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            "%s.gz" % filtered_biom_fp,
            _index_headers["run_summary"],
        )
    )

    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links, index_fp)
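
And a corresponding sketch for run_core_diversity_analyses; the sampling depth, category, and paths are illustrative placeholders, with load_qiime_config imported as in the previous sketch:

run_core_diversity_analyses(
    biom_fp='otu_table.biom',
    mapping_fp='map.txt',
    sampling_depth=100,                 # placeholder even-sampling depth
    output_dir='core_diversity/',
    qiime_config=load_qiime_config(),
    tree_fp='rep_set.tre',              # required for phylogenetic metrics
    categories=['Treatment'],           # must be mapping file columns
    parallel=False)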
def pick_subsampled_open_reference_otus(
    input_fp,
    refseqs_fp,
    output_dir,
    percent_subsample,
    new_ref_set_id,
    command_handler,
    params,
    qiime_config,
    prefilter_refseqs_fp=None,
    run_assign_tax=True,
    run_align_and_tree=True,
    prefilter_percent_id=0.60,
    min_otu_size=2,
    step1_otu_map_fp=None,
    step1_failures_fasta_fp=None,
    parallel=False,
    suppress_step4=False,
    logger=None,
    suppress_md5=False,
    denovo_otu_picking_method="uclust",
    reference_otu_picking_method="uclust_ref",
    status_update_callback=print_to_stdout,
):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the
             representative set from step 4 as the reference set.

    """
    # only uclust and usearch61 are currently allowed for otu picking
    allowed_denovo_otu_picking_methods = ["uclust", "usearch61"]
    allowed_reference_otu_picking_methods = ["uclust_ref", "usearch61_ref"]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods, (
        "Unknown de novo OTU picking method: %s. Known methods are: %s"
        % (denovo_otu_picking_method, ",".join(allowed_denovo_otu_picking_methods))
    )

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods, (
        "Unknown reference OTU picking method: %s. Known methods are: %s"
        % (reference_otu_picking_method, ",".join(allowed_reference_otu_picking_methods))
    )

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = "%s/step1_otus" % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = "%s/prefilter_otus/" % output_dir
            prefilter_failures_list_fp = "%s/%s_failures.txt" % (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp,
                prefilter_dir,
                reference_otu_picking_method,
                prefilter_refseqs_fp,
                parallel,
                params,
                logger,
                prefilter_percent_id,
            )
            commands.append([("Pick Reference OTUs (prefilter)", prefilter_pick_otu_cmd)])

            prefiltered_input_fp = "%s/prefiltered_%s%s" % (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = "filter_fasta.py -f %s -o %s -s %s -n" % (
                input_fp,
                prefiltered_input_fp,
                prefilter_failures_list_fp,
            )
            commands.append([("Filter prefilter failures from input", filter_fasta_cmd)])

            # Call the command handler on the list of commands
            command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)
            commands = []

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?"
                )

        # Build the OTU picking command
        step1_dir = "%s/step1_otus" % output_dir
        step1_otu_map_fp = "%s/%s_otus.txt" % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method, refseqs_fp, parallel, params, logger
        )
        commands.append([("Pick Reference OTUs", step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = "%s/%s_failures.txt" % (step1_dir, input_basename)
        step1_failures_fasta_fp = "%s/failures.fasta" % step1_dir
        step1_filter_fasta_cmd = "filter_fasta.py -f %s -s %s -o %s" % (
            input_fp,
            step1_failures_list_fp,
            step1_failures_fasta_fp,
        )

        commands.append([("Generate full failures fasta file", step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = "%s/step1_rep_set.fna" % step1_dir
    step1_pick_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([("Pick rep set", step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = "%s/subsampled_failures.fasta" % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp, percent_subsample)

    logger.write(
        "# Subsample the failures fasta file using API \n"
        + 'python -c "import qiime; qiime.util.subsample_fasta'
        + "('%s', '%s', %f)\"\n\n"
        % (abspath(step1_failures_fasta_fp), abspath(step2_input_fasta_fp), percent_subsample)
    )

    # Prep the OTU picking command for the subsampled failures
    step2_dir = "%s/step2_otus/" % output_dir
    step2_cmd = pick_denovo_otus(
        step2_input_fasta_fp, step2_dir, new_ref_set_id, denovo_otu_picking_method, params, logger
    )
    step2_otu_map_fp = "%s/subsampled_failures_otus.txt" % step2_dir

    commands.append([("Pick de novo OTUs for new clusters", step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = "%s/step2_rep_set.fna" % step2_dir
    step2_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (
        step2_otu_map_fp,
        step2_repset_fasta_fp,
        step2_input_fasta_fp,
    )
    commands.append([("Pick representative set for subsampled failures", step2_rep_set_cmd)])

    step3_dir = "%s/step3_otus/" % output_dir
    step3_otu_map_fp = "%s/failures_otus.txt" % step3_dir
    step3_failures_list_fp = "%s/failures_failures.txt" % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger,
    )

    commands.append([("Pick reference OTUs using de novo rep set", step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = "%s/final_otu_map.txt" % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = "%s/failures_failures.fasta" % step3_dir
        step3_filter_fasta_cmd = "filter_fasta.py -f %s -s %s -o %s" % (
            step1_failures_fasta_fp,
            step3_failures_list_fp,
            step3_failures_fasta_fp,
        )
        commands.append([("Create fasta file of step3 failures", step3_filter_fasta_cmd)])

        step4_dir = "%s/step4_otus/" % output_dir
        step4_cmd = pick_denovo_otus(
            step3_failures_fasta_fp,
            step4_dir,
            ".".join([new_ref_set_id, "CleanUp"]),
            denovo_otu_picking_method,
            params,
            logger,
        )
        step4_otu_map_fp = "%s/failures_failures_otus.txt" % step4_dir
        commands.append([("Pick de novo OTUs on step3 failures", step4_cmd)])
        # Merge the OTU maps. Note that we explicitly use the '>' operator;
        # otherwise, passing the --force flag on the script interface would
        # append the newly created maps to the previously created map
        cat_otu_tables_cmd = "cat %s %s %s > %s" % (
            step1_otu_map_fp,
            step3_otu_map_fp,
            step4_otu_map_fp,
            merged_otu_map_fp,
        )
        commands.append([("Merge OTU maps", cat_otu_tables_cmd)])
        step4_repset_fasta_fp = "%s/step4_rep_set.fna" % step4_dir
        step4_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (
            step4_otu_map_fp,
            step4_repset_fasta_fp,
            step3_failures_fasta_fp,
        )
        commands.append([("Pick representative set for subsampled failures", step4_rep_set_cmd)])

    else:
        # Merge the OTU maps. Note that we explicitly use the '>' operator;
        # otherwise, passing the --force flag on the script interface would
        # append the newly created maps to the previously created map
        cat_otu_tables_cmd = "cat %s %s > %s" % (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([("Merge OTU maps", cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append(
            [
                (
                    "Move final failures file to top-level directory",
                    "mv %s %s/final_failures.txt" % (step3_failures_list_fp, output_dir),
                )
            ]
        )

    command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = "%s/final_otu_map_mc%d.txt" % (output_dir, min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp, min_otu_size)

    logger.write(
        "# Filter singletons from the otu map using API \n"
        + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map'
        + "('%s', '%s', %d)\"\n\n" % (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size)
    )

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference between
    # what we need the reference set to be and what we need the repseqs to
    # be: the reference set needs to be a superset of the input reference
    # set, while the repset needs to contain only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = "%s/rep_set.fna" % output_dir
    final_repset_f = open(final_repset_fp, "w")
    new_refseqs_fp = "%s/new_refseqs.fna" % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, "U")):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write(">%s\n%s\n" % (otu_id, seq))
    logger.write(
        "# Write non-singleton otus representative sequences "
        + "from step1 to the final rep set file: %s\n\n" % final_repset_fp
    )
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, "a")
    new_refseqs_f.write("\n")
    logger.write(
        "# Copy the full input refseqs file to the new refseq file\n" + "cp %s %s\n\n" % (refseqs_fp, new_refseqs_fp)
    )
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, "U")):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write(">%s\n%s\n" % (otu_id, seq))
            final_repset_f.write(">%s\n%s\n" % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp, "U")):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write(">%s\n%s\n" % (otu_id, seq))
                final_repset_f.write(">%s\n%s\n" % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()
    logger.write(
        "# Write non-singleton otus representative sequences from "
        + "step 2 and step 4 to the final representative set and the new reference"
        + " set (%s and %s respectively)\n\n" % (final_repset_fp, new_refseqs_fp)
    )

    # Prep the make_otu_table.py command
    otu_table_fp = "%s/otu_table_mc%d.biom" % (output_dir, min_otu_size)
    make_otu_table_cmd = "make_otu_table.py -i %s -o %s" % (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = "%s/otu_table_mc%d_w_tax.biom" % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = "%s/otu_table_mc%d_w_tax_no_pynast_failures.biom" % (
            output_dir,
            min_otu_size,
        )
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = "%s/otu_table_mc%d_w_tax.biom" % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = "%s/otu_table_mc%d_no_pynast_failures.biom" % (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback,
            )

            # Add taxa to otu table
            add_metadata_cmd = (
                "biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy"
                % (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            )
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp], error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback,
            )

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(align_and_tree_input_otu_table, "U")),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, "U")),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True,
            )
            otu_table_f = open(pynast_failure_filtered_otu_table_fp, "w")
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
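
A hedged usage sketch for the pick_subsampled_open_reference_otus variant above; the paths and identifiers are placeholders, with the QIIME helpers imported as in the earlier sketches:

pick_subsampled_open_reference_otus(
    input_fp='seqs.fna',                # demultiplexed input sequences
    refseqs_fp='refseqs.fna',           # reference collection
    output_dir='open_ref_otus/',
    percent_subsample=0.001,            # fraction of failures to subsample
    new_ref_set_id='NewOTU',            # prefix for de novo OTU ids
    command_handler=call_commands_serially,
    params=parse_qiime_parameters([]),
    qiime_config=load_qiime_config())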
Example #37
def run_pick_closed_reference_otus(
                              input_fp, 
                              refseqs_fp,
                              output_dir,
                              taxonomy_fp,
                              command_handler,
                              params,
                              qiime_config,
                              parallel=False,
                              logger=None,
                              suppress_md5=False,
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          1) Pick OTUs;
          2) Build an OTU table with optional pre-defined taxonmy.
    
    """
    
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast','uclust_ref','usearch61_ref']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
     "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
     % (otu_picking_method,' '.join(reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger,[input_fp,refseqs_fp,taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir,input_basename)
    if parallel and (otu_picking_method == 'blast' or 
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -r %s -T %s' %\
          (python_exe_fp, 
           script_dir, 
           otu_picking_script,
           input_fp,
           pick_otu_dir,
           refseqs_fp,
           params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -r %s -m %s %s' %\
         (python_exe_fp,
          script_dir,
          input_fp,
          pick_otu_dir,
          refseqs_fp,
          otu_picking_method,
          params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_fp, taxonomy_str, otu_table_fp, params_str)
    
    commands.append([('Make OTU table', make_otu_table_cmd)])
    

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
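
A matching sketch for run_pick_closed_reference_otus; the paths are placeholders, and taxonomy_fp may be None to skip taxonomy annotation of the OTU table:

run_pick_closed_reference_otus(
    input_fp='seqs.fna',
    refseqs_fp='refseqs.fna',
    output_dir='closed_ref_otus/',
    taxonomy_fp=None,                   # or a reference taxonomy map
    command_handler=call_commands_serially,
    params=parse_qiime_parameters([]),
    qiime_config=load_qiime_config())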
script_info['script_usage'] = []
script_info['script_usage'].append((
    "Run a subset of the interface tests in verbose mode",
    "Run interface tests for the count_seqs.py and make_otu_table.py scripts. This illustrates how to run from the qiime_test_dir directory.",
    "%prog -t count_seqs,make_otu_table -v"))
script_info['script_usage'].append((
    "Run all of the interface tests",
    "Run all script interface tests.  This illustrates how to run from the qiime_test_dir directory.",
    "%prog"))
script_info['output_description'] = ""
script_info['required_options'] = []

log_fp_prefix = 'script_test_log'
log_fp_suffix = 'txt'
default_log_fp = generate_log_fp(get_qiime_temp_dir(),
                                 basefile_name=log_fp_prefix,
                                 suffix=log_fp_suffix,
                                 timestamp_pattern='%Y%m%d%H%M%S')
default_log_fp_help_str = join(
    get_qiime_temp_dir(), '%s_TIMESTAMP.%s' % (log_fp_prefix, log_fp_suffix))

script_info['optional_options'] = [
    make_option('-t', '--tests',
                help='comma-separated list of the tests to run [default: all]'),
    make_option('-w', '--working_dir', default=get_qiime_temp_dir(),
                help='directory where the tests should be run '
                     '[default: %default]',
                type='existing_dirpath'),
    make_option('-l', '--failure_log_fp', type='new_filepath',
                default=default_log_fp,
                help='log file to store record of failures [default: %s]' %
                     default_log_fp_help_str)
]
script_info['version'] = __version__
Example #39
def run_ampliconnoise(mapping_fp,
    output_dir, command_handler, params, qiime_config,
    logger=None, status_update_callback=print_to_stdout,
    chimera_alpha=-3.8228,chimera_beta=0.6200, sff_txt_fp=None, numnodes=2,
    suppress_perseus=True, output_filepath=None, platform='flx',
    seqnoise_resolution=None, truncate_len=None):
    """ Run the ampliconnoise pipeline
    
        The steps performed by this function are:
        1. Split input sff.txt file into one file per sample

        2. Run scripts required for PyroNoise

        3. Run scripts required for SeqNoise

        4. Run scripts required for Perseus (chimera removal)

        5. Merge output files into one file similar to the output of split_libraries.py

    output_filepath should be absolute
    seqnoise_resolution should be string
    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting
    """
    map_data,headers,comments = parse_mapping_file(open(mapping_fp,'U'))
    create_dir(output_dir)

    if seqnoise_resolution == None:
        if platform=='flx': seqnoise_resolution = '30.0'
        elif platform=='titanium': seqnoise_resolution = '25.0'
        else: raise RuntimeError('seqnoise_resolution not set, and no'+\
            ' default for platform '+platform)

    if truncate_len == None:
        if platform=='flx': truncate_len = '220'
        elif platform=='titanium': truncate_len = '400'
        else: raise RuntimeError('truncate_len not set, and no'+\
            ' default for platform '+platform)

    sample_names = [] # these are filenames minus extension, and are sample IDs
    primer_seqs = [] # same order as sample_names
    bc_seqs = [] # same order as sample_names
    for i in range(len(map_data)):
        sample_names.append(map_data[i][headers.index('SampleID')])
        bc_seqs.append(map_data[i][headers.index('BarcodeSequence')])
        # not sure why we don't just take the primer off now,
        # but that's done later
        # primer += (map_data[i][headers.index('LinkerPrimerSequence')])
        # for char, bases in IUPAC_DNA_ambiguities.items():
        #     primer = primer.replace(char,'['+''.join(bases)+']')

        primer = (map_data[i][headers.index('LinkerPrimerSequence')])
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char,'['+''.join(bases)+']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger,[mapping_fp,sff_txt_fp])

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    fh = open(os.path.join(output_dir,'map.csv'),'w')
    for i in range(len(sample_names)):
        fh.write(sample_names[i]+','+bc_seqs[i]+'\n')
    fh.close()

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_'+truncate_len
    if suppress_perseus == True:
        fasta_result_names = [sample_name + post_pyro_tail+'_seqnoise_cd.fa'
          for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + '_Good.fa' \
          for sample_name in sample_names]

    cmd = 'cd '+output_dir # see also os.chdir above
    commands.append([('change to output dir', cmd)])
    
    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
        cmd)])


    cmd = 'SplitKeys.pl '+one_primer+' map.csv < '+\
        os.path.join(called_dir,sff_txt_fp)+\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    for i, sample_name in enumerate(sample_names):
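        # Per-sample AmpliconNoise pipeline: clean flows, then
        # PyroDist -> FCluster -> PyroNoise (flowgram denoising), truncate,
        # then SeqDist -> FCluster -> SeqNoise (sequence denoising), and
        # optionally Perseus for chimera removal.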

        # Build the platform-specific flowgram cleaning command
        if platform == 'flx':
            cmd = 'Clean360.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows '+sample_name, cmd)])

            # these run through the whole sff file once per sample, I think
            # cmd = "FlowsFA.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl '+one_primer+' '+sample_name+' < '+\
                sample_name+'.raw'
            commands.append([('clean flows '+sample_name, cmd)])

            # cmd = "FlowsMinMax.pl " + primer_seqs[i] + ' '+sample_name +' < '+\
            #     os.path.join(called_dir,sff_txt_fp)
            # commands.append([('extract flows '+sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = "mpirun -np "+str(numnodes)+" PyroDist -in "+\
          sample_name+".dat -out "+sample_name+ " > "+sample_name+".pdout"
        commands.append([('pyrodist '+sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+".fdist -out "+sample_name+\
          " > "+sample_name+".fcout"
        commands.append([('fcluster pyrodist '+sample_name, cmd)])

# e.g.:
# mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
# PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = "mpirun -np "+str(numnodes)+" PyroNoise -din "+\
            sample_name+".dat -out "+\
            sample_name+"_pyronoise "+"-lin "+\
            sample_name+".list -s 60.0 -c 0.01 > "+\
            sample_name+"_pyronoise.pnout"
        commands.append([('pyronoise '+sample_name, cmd)])

        cmd = 'Parse.pl '+bc_seqs[i]+one_primer+' '+truncate_len+' < '+\
            sample_name+'_pyronoise_cd.fa'+' > '+ sample_name+'_'+\
            truncate_len+'.fa'
        commands.append([('truncate '+sample_name, cmd)])

        # filenames from here on carry the post_pyro_tail truncation suffix
        cmd = "mpirun -np "+str(numnodes)+" SeqDist -in "+\
            sample_name+post_pyro_tail+\
            ".fa > "+sample_name+post_pyro_tail+".seqdist"
        commands.append([('seqdist '+sample_name, cmd)])

        cmd = "FCluster -in "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+"fcl > "+\
            sample_name+post_pyro_tail+".fcout"
        commands.append([('fcluster seqdist '+sample_name, cmd)])

# e.g.:
# mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
# PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
# PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
# PC.354_pyronoise_cd.snout

        cmd = "mpirun -np "+str(numnodes)+" SeqNoise -in "+\
            sample_name+post_pyro_tail+\
            ".fa -din "+sample_name+post_pyro_tail+".seqdist -out "+\
            sample_name+post_pyro_tail+\
            "_seqnoise -lin "+sample_name+post_pyro_tail+'fcl.list -min '+\
            sample_name+'_pyronoise'+\
            '.mapping -s '+seqnoise_resolution+' -c 0.08 > '+\
            sample_name+post_pyro_tail+'.snout'
        commands.append([('seqnoise '+sample_name, cmd)])


        if suppress_perseus == False: 

            cmd = 'Perseus -sin '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa > ' +\
                sample_name+'.per'
            commands.append([('Perseus '+sample_name, cmd)])

            cmd = 'Class.pl '+sample_name+'.per '+\
                str(chimera_alpha) + ' '+ str(chimera_beta)+\
                ' > '+sample_name+'.class'
            commands.append([('Class.pl '+sample_name, cmd)])

            cmd = 'FilterGoodClass.pl '+sample_name+post_pyro_tail+\
                '_seqnoise_cd.fa '+\
                sample_name+'.class 0.5 > '+sample_name+'_Chi.fa 2> '+\
                sample_name+'_Good.fa'
            commands.append([('FilterGoodClass '+sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
         (python_exe_fp, script_dir, fasta_result_names[i], 
         sample_name+'_unw.fna', sample_name)
        commands.append([('unweight fasta '+sample_name, cmd)])

    cmd = 'cat ' +\
      ' '.join([sample_name+'_unw.fna' for sample_name in sample_names]) +\
      ' > ' + output_filepath # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
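
For orientation, a hypothetical invocation of run_ampliconnoise (every filepath below is a placeholder; call_commands_serially and load_qiime_config are the standard QIIME helpers). Note that output_filepath must be absolute and PYRO_LOOKUP_FILE must be exported beforehand.

run_ampliconnoise(
    mapping_fp='Fasting_Map.txt',
    output_dir='/abs/path/anoise_out',
    command_handler=call_commands_serially,
    params={},  # typically the dict returned by parse_qiime_parameters
    qiime_config=load_qiime_config(),
    sff_txt_fp='Fasting_Example.sff.txt',
    platform='titanium',
    output_filepath='/abs/path/anoise_out/ampliconnoise_seqs.fna')
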
Example #40
def assign_taxonomy_multiple_times(input_dirs, output_dir, assignment_methods,
        reference_seqs_fp, id_to_taxonomy_fp,
        confidences=None, e_values=None, rtax_modes=None,
        uclust_min_consensus_fractions=None, uclust_similarities=None,
        uclust_max_accepts=None, input_fasta_filename='rep_set.fna',
        clean_otu_table_filename='otu_table_mc2_no_pynast_failures.biom',
        read_1_seqs_filename='seqs1.fna', read_2_seqs_filename='seqs2.fna',
        rtax_read_id_regexes=None, rtax_amplicon_id_regexes=None,
        rtax_header_id_regexes=None, rdp_max_memory=4000,
        command_handler=call_commands_serially,
        status_update_callback=no_status_updates, force=False):
    """ Performs sanity checks on passed arguments and directories. Builds 
        commands for each method and sends them off to be executed. """
    ## Check if output directory exists
    try:
        create_dir(output_dir, fail_on_exist=not force)
    except OSError:
        raise WorkflowError("Output directory '%s' already exists. Please "
                "choose a different directory, or force overwrite with -f."
                % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    # We're going to zip these with the input directories.
    num_dirs = len(input_dirs)
    if rtax_read_id_regexes is None:
        rtax_read_id_regexes = [None] * num_dirs
    if rtax_amplicon_id_regexes is None:
        rtax_amplicon_id_regexes = [None] * num_dirs
    if rtax_header_id_regexes is None:
        rtax_header_id_regexes = [None] * num_dirs

    if num_dirs != len(rtax_read_id_regexes) or \
       num_dirs != len(rtax_amplicon_id_regexes) or \
       num_dirs != len(rtax_header_id_regexes):
        raise WorkflowError("The number of RTAX regular expressions must "
                            "match the number of input directories.")

    for input_dir, rtax_read_id_regex, rtax_amplicon_id_regex, \
            rtax_header_id_regex in zip(input_dirs, rtax_read_id_regexes,
                         rtax_amplicon_id_regexes, rtax_header_id_regexes):
        ## Make sure the input dataset directory exists.
        if not isdir(input_dir):
            raise WorkflowError("The input dataset directory '%s' does not "
                                "exist." % input_dir)

        input_dir_name = split(normpath(input_dir))[1]
        output_dataset_dir = join(output_dir, input_dir_name)
        input_fasta_fp = join(input_dir, input_fasta_filename)
        clean_otu_table_fp = join(input_dir, clean_otu_table_filename)
        read_1_seqs_fp = join(input_dir, read_1_seqs_filename)
        read_2_seqs_fp = join(input_dir, read_2_seqs_filename)

        logger.write("\nCreating output subdirectory '%s' if it doesn't "
                     "already exist.\n" % output_dataset_dir)
        create_dir(output_dataset_dir)

        for method in assignment_methods:
            ## Method is RDP
            if method == 'rdp':
                ## Check for execution parameters required by RDP method
                if confidences is None:
                    raise WorkflowError("You must specify at least one "
                                        "confidence level.")
                ## Generate command for RDP
                commands = _generate_rdp_commands(output_dataset_dir,
                                                  input_fasta_fp,
                                                  reference_seqs_fp,
                                                  id_to_taxonomy_fp,
                                                  clean_otu_table_fp,
                                                  confidences,
                                                  rdp_max_memory=rdp_max_memory)
                        
            ## Method is BLAST
            elif method == 'blast':
                ## Check for execution parameters required by BLAST method
                if e_values is None:
                    raise WorkflowError("You must specify at least one "
                                        "E-value.")
                ## Generate command for BLAST
                commands = _generate_blast_commands(output_dataset_dir,
                                                    input_fasta_fp,
                                                    reference_seqs_fp,
                                                    id_to_taxonomy_fp,
                                                    clean_otu_table_fp,
                                                    e_values)
                        
            ## Method is Mothur
            elif method == 'mothur':
                ## Check for execution parameters required by Mothur method
                if confidences is None:
                    raise WorkflowError("You must specify at least one "
                                        "confidence level.")
                ## Generate command for mothur
                commands = _generate_mothur_commands(output_dataset_dir,
                                                     input_fasta_fp,
                                                     reference_seqs_fp,
                                                     id_to_taxonomy_fp,
                                                     clean_otu_table_fp,
                                                     confidences)

            ## Method is RTAX
            elif method == 'rtax':
                ## Check for execution parameters required by RTAX method
                if rtax_modes is None:
                    raise WorkflowError("You must specify at least one mode "
                                        "to run RTAX in.")
                for mode in rtax_modes:
                    if mode not in ['single', 'paired']:
                        raise WorkflowError("Invalid rtax mode '%s'. Must be "
                                            "'single' or 'paired'." % mode)

                ## Generate command for rtax
                commands = _generate_rtax_commands(output_dataset_dir,
                                                   input_fasta_fp,
                                                   reference_seqs_fp,
                                                   id_to_taxonomy_fp,
                                                   clean_otu_table_fp,
                                                   rtax_modes,
                                                   read_1_seqs_fp,
                                                   read_2_seqs_fp,
                                                   rtax_read_id_regex,
                                                   rtax_amplicon_id_regex,
                                                   rtax_header_id_regex)

            ## Method is uclust
            elif method == 'uclust':
                ## Check for execution parameters required by uclust method
                if uclust_min_consensus_fractions is None:
                    raise WorkflowError("You must specify at least one uclust "
                                        "minimum consensus fraction.")
                if uclust_similarities is None:
                    raise WorkflowError("You must specify at least one uclust "
                                        "similarity.")
                if uclust_max_accepts is None:
                    raise WorkflowError("You must specify at least one uclust "
                                        "max accepts.")
                ## Generate command for uclust
                commands = _generate_uclust_commands(output_dataset_dir,
                                                     input_fasta_fp,
                                                     reference_seqs_fp,
                                                     id_to_taxonomy_fp,
                                                     clean_otu_table_fp,
                                                     uclust_min_consensus_fractions,
                                                     uclust_similarities,
                                                     uclust_max_accepts)

            ## Unsupported method
            else:
                raise WorkflowError("Unrecognized or unsupported taxonomy "
                                    "assignment method '%s'." % method)

            # send command for current method to command handler
            for command in commands:
                start = time()

                # call_commands_serially needs a list of commands, so here's
                # a length-one command list.
                command_handler([command], status_update_callback, logger,
                                close_logger_on_success=False)
                end = time()
                logger.write('Time (s): %d\n\n' % (end - start))

    logger.close()
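
For orientation, a hypothetical call comparing two assignment methods across two datasets (all paths and parameter values below are placeholders):

assign_taxonomy_multiple_times(
    input_dirs=['dataset1', 'dataset2'],
    output_dir='taxa_comparison_out',
    assignment_methods=['rdp', 'blast'],
    reference_seqs_fp='refseqs.fasta',
    id_to_taxonomy_fp='id_to_taxonomy.txt',
    confidences=[0.5, 0.8],  # required by the rdp (and mothur) methods
    e_values=[1e-10],        # required by the blast method
    force=True)
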
Example #41
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
                   
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    ## Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir,alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir,input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir,input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)
    
    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the alignment parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])
    
    
    ## Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
     (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])
    
    
    ## Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
     (filtered_aln_fp, tree_fp,params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    if exists(tree_fp):
        remove_files([tree_fp])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return failures_fp
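
A hypothetical call, for orientation (paths are placeholders); the returned failures filepath lets callers drop PyNAST failures from downstream OTU tables:

failures_fp = align_and_tree(
    repset_fasta_fp='/abs/path/rep_set.fna',
    output_dir='/abs/path/out',
    command_handler=call_commands_serially,
    params={},  # typically the dict returned by parse_qiime_parameters
    qiime_config=load_qiime_config())
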
Example #42
def pick_subsampled_open_reference_otus(input_fp, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])
    
    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp
    
    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])
            
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])
            
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            
        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])
        
        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
    
    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])
    
    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])
    
    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])
    
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures', 
                          step3_filter_fasta_cmd)])
        
        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for step3 failures',step4_rep_set_cmd)])
        
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])    
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                      'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])
    
    command_handler(commands,
        status_update_callback,
        logger=logger,
        close_logger_on_success=False)
    commands = []
    
    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)
    
    ## make the final representative seqs file and a new refseqs file that 
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be. 
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be 
    ## reads from this run so we don't hit issues building a tree using 
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write 
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()
    
    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'add_metadata.py -i %s --observation_mapping_fp %s -o %s --sc_separated taxonomy --observation_header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
            
    if close_logger_on_success:
        logger.close()
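
For orientation, a hypothetical end-to-end call (every path and value below is a placeholder):

pick_subsampled_open_reference_otus(
    input_fp='/abs/path/seqs.fna',
    refseqs_fp='/abs/path/refseqs.fna',
    output_dir='/abs/path/open_ref_otus',
    percent_subsample=0.1,
    new_ref_set_id='NewRef0',
    command_handler=call_commands_serially,
    params={},  # typically the dict returned by parse_qiime_parameters
    qiime_config=load_qiime_config())
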
Example #43
def pick_subsampled_open_reference_otus(input_fp, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              run_assign_tax=True,
                              run_align_and_tree=True,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              suppress_md5=False,
                              denovo_otu_picking_method='uclust',
                              reference_otu_picking_method='uclust_ref',
                              status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ['uclust','usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref','usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
     "Unknown de novo OTU picking method: %s. Known methods are: %s"\
     % (denovo_otu_picking_method,
        ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
     "Unknown reference OTU picking method: %s. Known methods are: %s"\
     % (reference_otu_picking_method,
        ','.join(allowed_reference_otu_picking_methods))
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    if not suppress_md5:
        log_input_md5s(logger,[input_fp,
                               refseqs_fp,
                               step1_otu_map_fp,
                               step1_failures_fasta_fp])
    
    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp == None:
        prefilter_refseqs_fp = refseqs_fp
    
    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id != None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])
            
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])
            
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            
        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])
        
        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
    
    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])
    
    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])
    
    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])
    
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures', 
                          step3_filter_fasta_cmd)])
        
        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for step3 failures',step4_rep_set_cmd)])
        
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])    
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                      'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])
    
    command_handler(commands,
        status_update_callback,
        logger=logger,
        close_logger_on_success=False)
    commands = []
    
    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)
    
    ## make the final representative seqs file and a new refseqs file that 
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be. 
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be 
    ## reads from this run so we don't hit issues building a tree using 
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write 
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()
    
    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    
    commands = []
    
    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
         '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,min_otu_size)
    
    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp],error_on_missing=False)
        
            taxonomy_fp = assign_tax(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
             (tax_input_otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_metadata_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %\
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)
        
            pynast_failures_fp = align_and_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(align_and_tree_input_otu_table,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(pynast_failure_filtered_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
        
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
            
    if close_logger_on_success:
        logger.close()
Example #44
def generate_most_wanted_list(
        output_dir, otu_table_fps, rep_set_fp, gg_fp, nt_fp, mapping_fp,
        mapping_category, top_n, min_abundance, max_abundance, min_categories,
        num_categories_to_plot, max_gg_similarity, max_nt_similarity, e_value,
        word_size, merged_otu_table_fp, suppress_taxonomic_output,
        jobs_to_start, command_handler, status_update_callback, force):
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError(
                "Output directory '%s' already exists. Please "
                "choose a different directory, or force overwrite with -f." %
                output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))
    commands, blast_results_fp, rep_set_cands_failures_fp, \
        master_otu_table_ms_fp = _get_most_wanted_filtering_commands(
            output_dir, otu_table_fps,
            rep_set_fp, gg_fp, nt_fp, mapping_fp, mapping_category,
            min_abundance, max_abundance, min_categories, max_gg_similarity,
            e_value, word_size, merged_otu_table_fp, jobs_to_start)

    # Execute the commands, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=False)
    commands = []

    # We'll sort the BLAST results by percent identity (ascending) and pick the
    # top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)
    top_n_mw = _get_top_n_blast_results(open(blast_results_fp, 'U'), top_n,
                                        max_nt_similarity)

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to include
    # in our report.
    logger.write("Reading in filtered candidate sequences and latest filtered "
                 "and collapsed OTU table.\n\n")
    mw_seqs = _get_rep_set_lookup(open(rep_set_cands_failures_fp, 'U'))
    master_otu_table_ms = parse_biom_table(open(master_otu_table_ms_fp, 'U'))

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    output_img_dir = join(output_dir, 'img')
    try:
        makedirs(output_img_dir)
    except OSError:
        # It already exists, which is okay since we already know we are in
        # 'force' mode from above.
        pass

    tsv_lines, html_table_lines, mw_fasta_lines, plot_fps, plot_data_fps = \
            _format_top_n_results_table(top_n_mw,
                mw_seqs, master_otu_table_ms, output_img_dir, mapping_category,
                suppress_taxonomic_output, num_categories_to_plot)

    mw_tsv_rel_fp = 'most_wanted_otus.txt'
    mw_tsv_fp = join(output_dir, mw_tsv_rel_fp)
    mw_tsv_f = open(mw_tsv_fp, 'w')
    mw_tsv_f.write(tsv_lines)
    mw_tsv_f.close()

    mw_fasta_rel_fp = 'most_wanted_otus.fasta'
    mw_fasta_fp = join(output_dir, mw_fasta_rel_fp)
    mw_fasta_f = open(mw_fasta_fp, 'w')
    mw_fasta_f.write(mw_fasta_lines)
    mw_fasta_f.close()

    html_dl_links = (
        '<a href="%s" target="_blank">Download table in tab-'
        'separated value (TSV) format</a><br /><a href="%s" '
        'target="_blank">Download OTU sequence data in FASTA format</a>' %
        (mw_tsv_rel_fp, mw_fasta_rel_fp))
    html_lines = '%s<div>%s<br /><br />%s<br />%s</div>%s' % (
        html_header, html_dl_links, html_table_lines, html_dl_links,
        html_footer)

    mw_html_f = open(join(output_dir, 'most_wanted_otus.html'), 'w')
    mw_html_f.write(html_lines)
    mw_html_f.close()
    logger.close()
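
Finally, a hypothetical invocation of generate_most_wanted_list (every path and threshold below is a placeholder chosen for illustration):

generate_most_wanted_list(
    output_dir='most_wanted_out',
    otu_table_fps=['otu_table1.biom', 'otu_table2.biom'],
    rep_set_fp='rep_set.fna',
    gg_fp='gg_otus.fasta',
    nt_fp='/abs/path/nt',  # placeholder BLAST database path
    mapping_fp='map.txt',
    mapping_category='BodySite',
    top_n=100,
    min_abundance=0,
    max_abundance=0.01,
    min_categories=5,
    num_categories_to_plot=5,
    max_gg_similarity=0.97,
    max_nt_similarity=0.97,
    e_value=1e-10,
    word_size=28,
    merged_otu_table_fp=None,
    suppress_taxonomic_output=False,
    jobs_to_start=1,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
    force=True)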