def main():
    """Compare two taxa summary files and write the results to disk.

    Parses the command line, insists on exactly two taxa summary files, runs
    compare_taxa_summaries on them and writes into the output directory:

    * one "sorted and filled" taxa summary per input file (results[0] and
      results[1]), named after the input file,
    * 'overall_comparison.txt' (results[2]),
    * 'detailed_comparisons.txt' (results[3]), only when detailed
      comparisons were requested.

    Usage errors are reported through option_parser.error().
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        # Deliberately not a bare except: Ctrl-C / SystemExit must propagate
        # instead of being reported as an output directory problem.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        # NOTE(review): assumes parse_sample_id_map reads the whole file
        # before returning -- confirm against its implementation.
        with open(opts.sample_id_map_fp, 'U') as sample_id_map_f:
            sample_id_map = parse_sample_id_map(sample_id_map_f)

    # Both inputs stay open until the comparison has finished and are then
    # closed deterministically, even if parsing or comparing raises.
    with open(opts.taxa_summary_fps[0], 'U') as ts_f1, \
            open(opts.taxa_summary_fps[1], 'U') as ts_f2:
        results = compare_taxa_summaries(
            parse_taxa_summary_table(ts_f1),
            parse_taxa_summary_table(ts_f2),
            opts.comparison_mode,
            correlation_type=opts.correlation_type,
            tail_type=opts.tail_type,
            num_permutations=opts.num_permutations,
            confidence_level=opts.confidence_level,
            perform_detailed_comparisons=opts.perform_detailed_comparisons,
            sample_id_map=sample_id_map,
            expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the same,
    # append a number to each filename so the second does not overwrite the
    # first.
    same_filenames = (basename(opts.taxa_summary_fps[0]) ==
                      basename(opts.taxa_summary_fps[1]))

    for file_num, (orig_ts_fp, filled_ts_lines) in enumerate(
            zip(opts.taxa_summary_fps, results[:2])):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        with open(join(opts.output_dir, filled_ts_fp), 'w') as filled_ts_f:
            filled_ts_f.write(filled_ts_lines)

    # Write the overall comparison result.
    with open(join(opts.output_dir, 'overall_comparison.txt'),
              'w') as overall_comp_f:
        overall_comp_f.write(results[2])

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        with open(join(opts.output_dir, 'detailed_comparisons.txt'),
                  'w') as corr_vec_f:
            corr_vec_f.write(results[3])
def main():
    """Compare two taxa summary files and write the results to disk.

    Parses the command line, insists on exactly two taxa summary files, runs
    compare_taxa_summaries on them and writes into the output directory:

    * one "sorted and filled" taxa summary per input file (results[0] and
      results[1]), named after the input file,
    * 'overall_comparison.txt' (results[2]),
    * 'detailed_comparisons.txt' (results[3]), only when detailed
      comparisons were requested.

    Usage errors are reported through option_parser.error().
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    if len(opts.taxa_summary_fps) != 2:
        option_parser.error("Exactly two taxa summary files are required. You "
                            "provided %d." % len(opts.taxa_summary_fps))

    # Create the output dir if it doesn't already exist.
    try:
        create_dir(opts.output_dir)
    except Exception:
        # Deliberately not a bare except: Ctrl-C / SystemExit must propagate
        # instead of being reported as an output directory problem.
        option_parser.error("Could not create or access output directory "
                            "specified with the -o option.")

    sample_id_map = None
    if opts.sample_id_map_fp:
        # NOTE(review): assumes parse_sample_id_map reads the whole file
        # before returning -- confirm against its implementation.
        with open(opts.sample_id_map_fp, 'U') as sample_id_map_f:
            sample_id_map = parse_sample_id_map(sample_id_map_f)

    # Both inputs stay open until the comparison has finished and are then
    # closed deterministically, even if parsing or comparing raises.
    with open(opts.taxa_summary_fps[0], 'U') as ts_f1, \
            open(opts.taxa_summary_fps[1], 'U') as ts_f2:
        results = compare_taxa_summaries(
            parse_taxa_summary_table(ts_f1),
            parse_taxa_summary_table(ts_f2),
            opts.comparison_mode, correlation_type=opts.correlation_type,
            tail_type=opts.tail_type, num_permutations=opts.num_permutations,
            confidence_level=opts.confidence_level,
            perform_detailed_comparisons=opts.perform_detailed_comparisons,
            sample_id_map=sample_id_map,
            expected_sample_id=opts.expected_sample_id)

    # Write out the sorted and filled taxa summaries, basing their
    # filenames on the original input filenames. If the filenames are the same,
    # append a number to each filename so the second does not overwrite the
    # first.
    same_filenames = (basename(opts.taxa_summary_fps[0]) ==
                      basename(opts.taxa_summary_fps[1]))

    for file_num, (orig_ts_fp, filled_ts_lines) in enumerate(
            zip(opts.taxa_summary_fps, results[:2])):
        filename_suffix = '_sorted_and_filled'
        if same_filenames:
            filename_suffix += '_%d' % file_num
        filled_ts_fp = add_filename_suffix(orig_ts_fp, filename_suffix)
        with open(join(opts.output_dir, filled_ts_fp), 'w') as filled_ts_f:
            filled_ts_f.write(filled_ts_lines)

    # Write the overall comparison result.
    with open(join(opts.output_dir, 'overall_comparison.txt'),
              'w') as overall_comp_f:
        overall_comp_f.write(results[2])

    # Write the correlation vector containing the pairwise sample comparisons.
    if opts.perform_detailed_comparisons:
        with open(join(opts.output_dir, 'detailed_comparisons.txt'),
                  'w') as corr_vec_f:
            corr_vec_f.write(results[3])
def _generate_taxa_processing_commands(assigned_taxonomy_dir, input_fasta_fp,
                                       clean_otu_table_fp, run_id):
    """Return the add-metadata and summarize-taxa commands for one run.

    Two one-element lists are returned, each holding a single
    (description, command string) tuple: first the 'biom add-metadata'
    command that attaches the taxonomy assignments to the clean OTU table,
    then the 'summarize_taxa.py' command that runs on the resulting table.
    Both the taxonomy assignments file and the table with taxa are located
    in assigned_taxonomy_dir.
    """
    fasta_root = splitext(basename(input_fasta_fp))[0]
    taxa_assignments_fp = join(assigned_taxonomy_dir,
                               fasta_root + '_tax_assignments.txt')

    table_w_taxa_name = add_filename_suffix(clean_otu_table_fp, '_w_taxa')
    otu_table_w_taxa_fp = join(assigned_taxonomy_dir, table_w_taxa_name)

    add_md_title = 'Adding metadata (%s)' % run_id
    add_md_template = ('biom add-metadata -i %s -o %s '
                       '--observation-metadata-fp %s --sc-separated taxonomy '
                       '--observation-header OTUID,taxonomy')
    add_md_cmd = add_md_template % (clean_otu_table_fp, otu_table_w_taxa_fp,
                                    taxa_assignments_fp)

    summarize_title = 'Summarizing taxa (%s)' % run_id
    summarize_cmd = 'summarize_taxa.py -i %s -o %s' % (otu_table_w_taxa_fp,
                                                       assigned_taxonomy_dir)

    return [(add_md_title, add_md_cmd)], [(summarize_title, summarize_cmd)]
Beispiel #4
0
def _generate_taxa_processing_commands(assigned_taxonomy_dir, input_fasta_fp,
                                       clean_otu_table_fp, run_id):
    """Assemble the add-taxa and summarize-taxa commands for one run.

    Returns a pair of one-element lists. Each list contains a single
    (description, command string) tuple: the first runs 'add_taxa.py' to
    combine the clean OTU table with the taxonomy assignments, the second
    runs 'summarize_taxa.py' on the table that add_taxa.py is told to write.
    All derived paths live inside assigned_taxonomy_dir.
    """
    # <fasta basename without extension>_tax_assignments.txt
    assignments_filename = (splitext(basename(input_fasta_fp))[0] +
                            '_tax_assignments.txt')
    taxa_assignments_fp = join(assigned_taxonomy_dir, assignments_filename)
    otu_table_w_taxa_fp = join(
        assigned_taxonomy_dir,
        add_filename_suffix(clean_otu_table_fp, '_w_taxa'))

    add_taxa_step = (
        'Adding taxa (%s)' % run_id,
        'add_taxa.py -i %s -o %s -t %s' % (clean_otu_table_fp,
                                           otu_table_w_taxa_fp,
                                           taxa_assignments_fp),
    )
    summarize_taxa_step = (
        'Summarizing taxa (%s)' % run_id,
        'summarize_taxa.py -i %s -o %s' % (otu_table_w_taxa_fp,
                                           assigned_taxonomy_dir),
    )

    return [add_taxa_step], [summarize_taxa_step]
Beispiel #5
0
def _all_metric_results_exist(dissim_dir, metrics, analysis_type,
                              category_name):
    """Return True if every metric dir under dissim_dir has its result files."""
    required_metric_files = ['dm.txt', 'map.txt', 'pc.txt']
    if analysis_type == 'gradient':
        required_metric_files.append('%s_dm.txt' % category_name)

    # all() stops at the first metric whose files are missing.
    return all(has_results(join(dissim_dir, metric[0]), required_metric_files)
               for metric in metrics)


def _build_metric_commands(dissim_dir, metrics, analysis_type, category_name,
                           otu_table_fp, map_fp, tree_fp):
    """Create each metric dir and return the commands that populate it."""
    metric_cmds = []
    for metric in metrics:
        metric_dir = join(dissim_dir, metric[0])
        create_dir(metric_dir)
        dm_fp = join(metric_dir, 'dm.txt')

        if analysis_type == 'gradient':
            metric_cmds.append(
                'distance_matrix_from_mapping.py -i %s -c %s -o %s' %
                (map_fp, category_name,
                 join(metric_dir, '%s_dm.txt' % category_name)))

        metric_cmds.append('beta_diversity.py -i %s -o %s -m %s -t %s' %
                           (otu_table_fp, metric_dir, metric[0], tree_fp))

        # Rename <metric>_<table root>.txt (presumably the file written by
        # beta_diversity.py) to the fixed name the result check looks for.
        bdiv_dm_fp = join(metric_dir, '%s_%s.txt' %
                          (metric[0], splitext(basename(otu_table_fp))[0]))
        metric_cmds.append('mv %s %s' % (bdiv_dm_fp, dm_fp))
        metric_cmds.append('cp %s %s' % (map_fp, join(metric_dir, 'map.txt')))
        metric_cmds.append('principal_coordinates.py -i %s -o %s' %
                           (dm_fp, join(metric_dir, 'pc.txt')))
    return metric_cmds


def _build_simulated_data_commands(analysis_type, out_dir, even_otu_table_fp,
                                   map_fp, tree_fp, workflow):
    """Build the shell commands that generate the simulated datasets.

    Creates the directory tree
    <out_dir>/simulated/<category>/<trial>/<sample size>/<dissimilarity>/<metric>
    and returns one ' && '-joined command string for every
    category/trial/sample size/dissimilarity combination whose results are
    not already all present.

    Two strategies are used, depending on the requested sample size:

    * sample size <= number of samples in even_otu_table_fp: the table and
      map are subset first (choose_data_subset.py is run immediately via
      run_command if the subset is missing) and simsam.py is then applied to
      the subset with a replicate count of 1;
    * otherwise: simsam.py is applied to the full table/map with a replicate
      count from get_simsam_rep_num, and choose_data_subset.py then picks
      the requested number of samples from the simulated data into a
      'subset' directory.

    Parameters:
        analysis_type: passed to choose_data_subset.py via -t; the value
            'gradient' additionally adds a category distance matrix step.
        out_dir: directory under which 'simulated' is created.
        even_otu_table_fp: path to the input OTU table.
        map_fp: path to the input mapping file.
        tree_fp: tree path handed to simsam.py and beta_diversity.py.
        workflow: mapping providing 'categories', 'num_sim_data_trials',
            'sample_sizes', 'dissim' and 'metrics'. Only element [0] of each
            category and each metric is used here.

    Returns:
        A list of command strings.

    Raises:
        AssertionError: if the subset table or map does not contain exactly
            the requested number of samples.
    """
    cmds = []

    data_type_dir = join(out_dir, 'simulated')
    create_dir(data_type_dir)

    num_samps = get_num_samples_in_table(even_otu_table_fp)

    for category in workflow['categories']:
        category_name = category[0]
        category_dir = join(data_type_dir, category_name)
        create_dir(category_dir)

        for trial_num in range(workflow['num_sim_data_trials']):
            trial_num_dir = join(category_dir, '%d' % trial_num)
            create_dir(trial_num_dir)

            for samp_size in workflow['sample_sizes']:
                samp_size_dir = join(trial_num_dir, '%d' % samp_size)
                create_dir(samp_size_dir)

                # With enough real samples we subset first and simulate from
                # the subset; otherwise we simulate first and subset after.
                subset_first = samp_size <= num_samps

                if subset_first:
                    simsam_rep_num = 1

                    simsam_in_otu_table_fp = join(samp_size_dir,
                                                  basename(even_otu_table_fp))
                    simsam_in_map_fp = join(samp_size_dir, basename(map_fp))

                    if not has_results(
                            samp_size_dir,
                            required_files=[basename(simsam_in_otu_table_fp),
                                            basename(simsam_in_map_fp)]):
                        run_command(
                            'choose_data_subset.py -t %s -i %s -m %s -c %s '
                            '-n %d -o %s' %
                            (analysis_type, even_otu_table_fp, map_fp,
                             category_name, samp_size, samp_size_dir))
                    assert get_num_samples_in_table(
                        simsam_in_otu_table_fp) == samp_size, \
                        "Subset OTU table does not have the requested " \
                        "number of samples."
                    assert get_num_samples_in_map(
                        simsam_in_map_fp) == samp_size, \
                        "Subset mapping file does not have the requested " \
                        "number of samples."
                else:
                    # We need to simulate more samples than we originally have.
                    simsam_rep_num = get_simsam_rep_num(samp_size, num_samps)
                    simsam_in_otu_table_fp = even_otu_table_fp
                    simsam_in_map_fp = map_fp

                for d in workflow['dissim']:
                    dissim_dir = join(samp_size_dir, repr(d))
                    create_dir(dissim_dir)

                    simsam_suffix = '_n%d_d%r' % (simsam_rep_num, d)
                    simsam_map_fp = join(
                        dissim_dir,
                        add_filename_suffix(simsam_in_map_fp, simsam_suffix))
                    simsam_otu_table_fp = join(
                        dissim_dir,
                        add_filename_suffix(simsam_in_otu_table_fp,
                                            simsam_suffix))

                    # Check for simulated table/map, the post-simulation
                    # subset (if this strategy has one) and the various
                    # distance matrices / coordinates files.
                    required_simsam_files = [basename(simsam_map_fp),
                                             basename(simsam_otu_table_fp)]
                    has_simsam_files = has_results(
                        dissim_dir, required_files=required_simsam_files)

                    if subset_first:
                        has_subset_files = True
                        final_otu_table_fp = simsam_otu_table_fp
                        final_map_fp = simsam_map_fp
                    else:
                        subset_dir = join(dissim_dir, 'subset')
                        has_subset_files = has_results(
                            subset_dir,
                            required_files=[basename(simsam_map_fp),
                                            basename(simsam_otu_table_fp)])
                        final_otu_table_fp = join(
                            subset_dir, basename(simsam_otu_table_fp))
                        final_map_fp = join(subset_dir,
                                            basename(simsam_map_fp))

                    has_metric_files = _all_metric_results_exist(
                        dissim_dir, workflow['metrics'], analysis_type,
                        category_name)

                    if (has_simsam_files and has_subset_files and
                            has_metric_files):
                        continue

                    cmd = ['simsam.py -i %s -t %s -o %s -d %r -n %d -m %s' %
                           (simsam_in_otu_table_fp, tree_fp, dissim_dir, d,
                            simsam_rep_num, simsam_in_map_fp)]

                    if not subset_first:
                        cmd.append(
                            'choose_data_subset.py -t %s -i %s -m %s -c %s '
                            '-n %d -o %s' %
                            (analysis_type, simsam_otu_table_fp,
                             simsam_map_fp, category_name, samp_size,
                             subset_dir))

                    cmd.extend(_build_metric_commands(
                        dissim_dir, workflow['metrics'], analysis_type,
                        category_name, final_otu_table_fp, final_map_fp,
                        tree_fp))
                    cmds.append(' && '.join(cmd))
    return cmds
Beispiel #6
0
def create_personal_results(
    output_dir,
    mapping_fp,
    coord_fp,
    collated_dir,
    otu_table_fp,
    prefs_fp,
    personal_id_column,
    personal_ids=None,
    column_title="Self",
    individual_titles=None,
    category_to_split="BodySite",
    time_series_category="WeeksSinceStart",
    rarefaction_depth=10000,
    alpha=0.05,
    rep_set_fp=None,
    body_site_rarefied_otu_table_dir=None,
    retain_raw_data=False,
    suppress_alpha_rarefaction=False,
    suppress_beta_diversity=False,
    suppress_taxa_summary_plots=False,
    suppress_alpha_diversity_boxplots=False,
    suppress_otu_category_significance=False,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
):
    """Generate the personalized results directory for each individual.

    For every requested personal ID a subdirectory of output_dir is created
    containing a personal mapping file, an index.html page and, unless the
    corresponding suppress_* flag is set, alpha diversity boxplots, alpha
    rarefaction plots, beta diversity plots, time series taxa summary plots
    and OTU category significance tables.

    Parameters:
        output_dir: directory to create (if needed) and write everything to.
        mapping_fp: path of the mapping file to parse.
        coord_fp: coordinates file passed to make_3d_plots.py (-i).
        collated_dir: passed to make_rarefaction_plots.py (-i) and to the
            alpha diversity boxplot generation.
        otu_table_fp: OTU table to rarefy/split and to build taxa summary
            plots from.
        prefs_fp: preferences file passed to the plotting scripts (-p).
        personal_id_column: mapping file column holding the personal IDs.
        personal_ids: personal IDs to process; None means every personal ID
            found in the mapping file.
        column_title: name of the column inserted into the personal mapping
            file header; also used as the category for
            otu_category_significance.py.
        individual_titles: forwarded to create_personal_mapping_file and
            create_otu_category_significance_html_tables.
        category_to_split: mapping file column used to split tables
            (body site).
        time_series_category: mapping file column used for the custom axis
            and vectors of the time series beta diversity plots.
        rarefaction_depth: depth passed to single_rarefaction.py (-d) and to
            the alpha diversity boxplot generation.
        alpha: forwarded to create_otu_category_significance_html_tables.
        rep_set_fp: forwarded to
            create_otu_category_significance_html_tables.
        body_site_rarefied_otu_table_dir: directory of already rarefied,
            per-body-site OTU tables; when given, rarefying and splitting are
            skipped.
        retain_raw_data: if False, intermediate files and directories are
            removed via clean_up_raw_data_files.
        suppress_*: skip the corresponding analysis.
        command_handler: callable used to run (or print) the commands.
        status_update_callback: forwarded to command_handler.

    Returns:
        The list of analysis output directories that were set up.

    Raises:
        ValueError: if personal_id_column, category_to_split or
            time_series_category is not a mapping file column header, if a
            requested personal ID is not in the mapping file, or if none of
            an individual's body sites could be processed (not raised in
            print-only mode).
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, "support_files")
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), "my_microbes", "support_files"), support_files_dir)

    # NOTE(review): the logger is only closed explicitly on the success path
    # at the end of this function -- confirm that command_handler takes care
    # of it when a command fails.
    logger = WorkflowLogger(generate_log_fp(output_dir))

    # NOTE(review): assumes parse_mapping_file reads the whole file before
    # returning -- confirm against its implementation.
    with open(mapping_fp, "U") as mapping_f:
        mapping_data, header, comments = parse_mapping_file(mapping_f)
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column " "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file " "column header." % category_to_split)

    # Insert the new column just before the last column of the header.
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = "%s&&%s" % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError(
                    "'%s' is not a personal ID in the mapping " "file column '%s'." % (pid, personal_id_column)
                )

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column " "header." % time_series_category)

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir, add_filename_suffix(otu_table_fp, "_even%d" % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = "Rarefying OTU table"
            cmd = "single_rarefaction.py -i %s -o %s -d %s" % (otu_table_fp, rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, "per_body_site_otu_tables")

            cmd_title = "Splitting rarefied OTU table by body site"
            cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
                rarefied_otu_table_fp,
                mapping_fp,
                category_to_split,
                per_body_site_dir,
            )
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest, "mapping_file.txt")
        html_fp = join(output_dir, person_of_interest, "index.html")

        personal_mapping_data = create_personal_mapping_file(
            mapping_data, person_of_interest, personal_id_index, bodysite_index, individual_titles
        )

        with open(personal_mapping_file_fp, "w") as personal_mapping_f:
            personal_mapping_f.write(format_mapping_file(header, personal_mapping_data, comments))
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = {e[column_title_index] for e in personal_mapping_data}
        cat_index = header.index(category_to_split)
        cat_values = {e[cat_index] for e in personal_mapping_data}

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ""
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest, "adiv_boxplots")
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" % person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir,
                personal_mapping_file_fp,
                category_to_split,
                column_title,
                rarefaction_depth,
                adiv_boxplots_dir,
            )

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename) for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest, "alpha_rarefaction")
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = "Creating rarefaction plots (%s)" % person_of_interest
            cmd = "make_rarefaction_plots.py -i %s -m %s -p %s -o %s" % (
                collated_dir,
                personal_mapping_file_fp,
                prefs_fp,
                rarefaction_dir,
            )
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(join(rarefaction_dir, "average_plots"))
            personal_raw_data_dirs.append(join(rarefaction_dir, "average_tables"))

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, "beta_diversity")
            pcoa_time_series_dir = join(output_dir, person_of_interest, "beta_diversity_time_series")
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = "Creating beta diversity time series plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=" % (
                personal_mapping_file_fp,
                prefs_fp,
                coord_fp,
                pcoa_time_series_dir,
            ) + "'%s' --add_vectors='%s,%s'" % (time_series_category, site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = "Creating beta diversity plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py  -m %s -p %s -i %s -o %s" % (personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ""
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, "time_series")
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp,
                personal_mapping_file_fp,
                person_of_interest,
                column_title,
                column_title_values,
                category_to_split,
                cat_values,
                time_series_category,
                area_plots_dir,
                command_handler,
                status_update_callback,
                logger,
            )

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ""
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest, "otu_category_significance")
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(
                    per_body_site_dir, add_filename_suffix(rarefied_otu_table_fp, "_%s" % cat_value)
                )

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self, otherwise
                    # otu_category_significance.py crashes with a division by
                    # zero error.
                    with open(body_site_otu_table_fp, "U") as body_site_otu_table_f, open(
                        personal_mapping_file_fp, "U"
                    ) as personal_mapping_file_f:
                        personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f, personal_mapping_file_f, personal_id_column, person_of_interest
                        )

                        if personal_sample_count < 1:
                            continue
                        else:
                            valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir, "otu_cat_sig_%s.txt" % cat_value)

                    cmd_title = "Testing for significant differences in " 'OTU abundances in "%s" body site (%s)' % (
                        cat_value,
                        person_of_interest,
                    )
                    cmd = "otu_category_significance.py -i %s -m %s -c %s " "-o %s" % (
                        body_site_otu_table_fp,
                        personal_mapping_file_fp,
                        column_title,
                        otu_cat_output_fp,
                    )
                    commands.append([(cmd_title, cmd)])

                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError(
                    "None of the body sites for personal ID '%s' "
                    "could be processed because there were no "
                    "matching samples in the rarefied OTU table." % person_of_interest
                )

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = create_otu_category_significance_html_tables(
                otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, individual_titles, rep_set_fp=rep_set_fp
            )

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [
                join(rel_otu_cat_sig_dir, html_filename) for html_filename in otu_cat_sig_html_filenames
            ]

            otu_category_significance_html = create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest,
            html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html,
        )

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files, personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
Beispiel #7
0
def _generate_taxa_summary_plots(
    otu_table_fp,
    personal_map_fp,
    personal_id,
    personal_cat,
    personal_cat_values,
    body_site_cat,
    body_site_cat_values,
    time_series_cat,
    output_dir,
    command_handler,
    status_update_callback,
    logger,
):
    """Run the per-individual steps that end in taxa summary plots.

    The OTU table is split by ``personal_cat`` and then by ``body_site_cat``.
    Every per-body-site table that exists on disk is summarized by
    ``time_series_cat``, sorted, and passed to summarize_taxa.py. The taxa
    summaries belonging to the first two entries of ``personal_cat_values``
    are then made compatible with compare_taxa_summaries.py and plotted with
    plot_taxa_summary.py. Every batch of commands is executed through
    ``command_handler`` with ``status_update_callback`` and ``logger``.

    Returns a ``(files, dirs)`` pair of lists containing the intermediate
    file and directory paths recorded while building the commands.

    Raises ValueError when the two sides do not produce matching taxa summary
    files, or when no body site could be processed (the latter check is
    skipped when ``command_handler`` is ``print_commands``).
    """
    # Prefix to be used for taxa summary dirs. Will be
    # <taxa_summary_dir_prefix>_<self|other>_<body site>/.
    ts_dir_prefix = "taxa_summaries"

    removable_files = []
    removable_dirs = []

    def execute(batch):
        # All batches share the same callback/logger and keep the logger open.
        command_handler(batch, status_update_callback, logger, close_logger_on_success=False)

    def taxa_summary_dir(cat_value, site):
        return join(output_dir, "%s_%s_%s" % (ts_dir_prefix, cat_value, site))

    def usable_summaries(cat_value, site):
        # Returns the sorted per-level taxa summary paths for one side, or
        # None when this body site has to be skipped for that side.
        side_dir = taxa_summary_dir(cat_value, site)
        if not exists(side_dir):
            return None

        # Check that we have 2+ weeks (samples were previously collapsed into
        # weeks for self and other). If we don't have 2+ weeks,
        # plot_taxa_summary.py will fail, so the body site is skipped.
        weeks_fp = join(side_dir, "%s_otu_table_sorted.biom" % time_series_cat)
        with open(weeks_fp, "U") as weeks_f:
            enough_weeks = not (_count_num_samples(weeks_f) < 2)
        if not enough_weeks:
            return None

        return sorted(glob(join(side_dir, "%s_otu_table_sorted_L*.txt" % time_series_cat)))

    ## Split OTU table into self/other per-body-site tables
    execute(
        [
            [
                (
                    "Splitting OTU table into self/other (%s)" % personal_id,
                    "split_otu_table.py -i %s -m %s -f %s -o %s"
                    % (otu_table_fp, personal_map_fp, personal_cat, output_dir),
                )
            ]
        ]
    )

    # Create taxa summaries for self and other, per body site.
    for cat_value in personal_cat_values:
        cat_biom_fp = join(output_dir, add_filename_suffix(otu_table_fp, "_%s" % cat_value))
        removable_files.extend([cat_biom_fp, join(output_dir, "mapping_%s.txt" % cat_value)])

        sites_dir = join(output_dir, cat_value)
        split_by_site = (
            'Splitting "%s" OTU table by body site (%s)' % (cat_value, personal_id),
            "split_otu_table.py -i %s -m %s -f %s -o %s"
            % (cat_biom_fp, personal_map_fp, body_site_cat, sites_dir),
        )
        removable_dirs.append(sites_dir)
        execute([[split_by_site]])

        site_commands = []
        for site in body_site_cat_values:
            site_table_fp = join(sites_dir, add_filename_suffix(cat_biom_fp, "_%s" % site))

            # We won't always get an OTU table if the mapping file
            # category contains samples that aren't in the OTU table
            # (e.g. the 'na' state for body site).
            if not exists(site_table_fp):
                continue

            side_dir = taxa_summary_dir(cat_value, site)
            create_dir(side_dir)
            removable_dirs.append(side_dir)

            summarized_fp = join(side_dir, "%s_otu_table.biom" % time_series_cat)
            sorted_fp = join(side_dir, "%s_otu_table_sorted.biom" % time_series_cat)

            # Summarize, sort, then summarize taxa -- one command group each.
            site_commands.append(
                [
                    (
                        "Summarizing OTU table by category (%s)" % personal_id,
                        "summarize_otu_by_cat.py -i %s -c %s -o %s -m %s "
                        % (personal_map_fp, site_table_fp, summarized_fp, time_series_cat),
                    )
                ]
            )
            site_commands.append(
                [
                    (
                        "Sorting OTU table (%s)" % personal_id,
                        "sort_otu_table.py -i %s -o %s" % (summarized_fp, sorted_fp),
                    )
                ]
            )
            site_commands.append(
                [
                    (
                        "Summarizing taxa (%s)" % personal_id,
                        "summarize_taxa.py -i %s -o %s" % (sorted_fp, side_dir),
                    )
                ]
            )

            create_comparative_taxa_plots_html(site, join(output_dir, "%s_comparative.html" % site))

        execute(site_commands)

    # Make each corresponding taxa summary compatible so that coloring matches
    # between them. We want to be able to compare self versus other at each
    # body site.
    plot_commands = []
    processed_sites = []
    both_sides = list(personal_cat_values)
    for site in body_site_cat_values:
        first_summaries = usable_summaries(both_sides[0], site)
        if first_summaries is None:
            continue

        second_summaries = usable_summaries(both_sides[1], site)
        if second_summaries is None:
            continue

        if len(first_summaries) != len(second_summaries):
            raise ValueError("There are not an equal number of taxa summaries to compare between self and other.")

        compatible_dir = join(output_dir, "compatible_ts_%s" % site)
        removable_dirs.append(compatible_dir)

        compatible_by_side = defaultdict(list)
        for first_fp, second_fp in zip(first_summaries, second_summaries):
            if basename(first_fp) != basename(second_fp):
                raise ValueError("Could not find matching taxa summaries between self and other to compare.")

            # Make taxa summaries compatible.
            plot_commands.append(
                [
                    (
                        "Making compatible taxa summaries (%s)" % personal_id,
                        "compare_taxa_summaries.py -i %s,%s -o %s -m paired -n 0"
                        % (first_fp, second_fp, compatible_dir),
                    )
                ]
            )

            compatible_by_side[both_sides[0]].append(
                join(compatible_dir, add_filename_suffix(first_fp, "_sorted_and_filled_0"))
            )
            compatible_by_side[both_sides[1]].append(
                join(compatible_dir, add_filename_suffix(second_fp, "_sorted_and_filled_1"))
            )

        for cat_value in personal_cat_values:
            # Plot taxa summaries.
            joined_fps = ",".join(sorted(compatible_by_side[cat_value]))
            plots_dir = join(output_dir, "taxa_plots_%s_%s" % (cat_value, site), "taxa_summary_plots")
            plot_commands.append(
                [
                    (
                        "Plot taxa summaries (%s)" % personal_id,
                        "plot_taxa_summary.py -i %s -o %s -a numeric" % (joined_fps, plots_dir),
                    )
                ]
            )

        # If we've gotten this far, we'll be able to process this body site
        # (i.e. there are enough weeks).
        processed_sites.append(site)

    # Hack to allow print-only mode.
    if not (command_handler is print_commands or processed_sites):
        raise ValueError(
            "None of the body sites for personal ID '%s' could "
            "be processed because there were not enough weeks "
            "to create taxa summary plots." % personal_id
        )

    execute(plot_commands)

    return removable_files, removable_dirs
Beispiel #8
0
def create_personal_results(output_dir,
                            mapping_fp,
                            coord_fp,
                            collated_dir,
                            otu_table_fp,
                            prefs_fp,
                            personal_id_column,
                            personal_ids=None,
                            column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000,
                            alpha=0.05,
                            rep_set_fp=None,
                            parameter_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    """Generate the personalized results pages, one directory per individual.

    For every personal ID a subdirectory of ``output_dir`` is created holding
    a personal mapping file, the outputs of each analysis step that is not
    suppressed, and an ``index.html`` page.

    Arguments:
        output_dir - top-level output directory (created if necessary)
        mapping_fp - path to the mapping file
        coord_fp - coordinates file passed to make_3d_plots.py
        collated_dir - collated alpha diversity directory, passed to the
            boxplot generation and to make_rarefaction_plots.py
        otu_table_fp - path to the OTU table
        prefs_fp - prefs file passed to the plotting scripts
        personal_id_column - mapping file column holding the personal IDs
        personal_ids - personal IDs to process; all IDs found in the mapping
            file are used when None
        column_title - name of the column added to the personal mapping file
        individual_titles - passed through to create_personal_mapping_file
            and to the OTU category significance HTML formatting
        category_to_split - mapping file column used to split by body site
        time_series_category - mapping file column used for time series
        rarefaction_depth - depth passed to single_rarefaction.py
        alpha - passed to the OTU category significance HTML formatting
        rep_set_fp - passed to the OTU category significance HTML formatting
        parameter_fp - optional parameter file appended to the
            summarize_taxa_through_plots.py command
        body_site_rarefied_otu_table_dir - directory of already rarefied
            per-body-site OTU tables; when None they are generated here
        retain_raw_data - if True, the recorded raw data files and
            directories are not removed at the end
        suppress_* - skip the corresponding analysis step
        command_handler, status_update_callback - used to run every batch of
            commands

    Returns the list of per-individual output directories that were
    recorded while running the analysis steps.

    Raises ValueError if ``personal_id_column``, ``category_to_split`` or
    ``time_series_category`` is not a mapping file column header, or if one
    of ``personal_ids`` is not present in the mapping file.
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    # Close the mapping file as soon as it has been parsed.
    with open(mapping_fp, 'U') as mapping_f:
        mapping_data, header, comments = parse_mapping_file(mapping_f)

    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
            "column header." % category_to_split)

    # The new column goes in just before the last column of the header.
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header)-1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    # The header is final from here on, so these lookups are the same for
    # every individual.
    column_title_index = header.index(column_title)
    cat_index = header.index(category_to_split)

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)

        # The context manager guarantees the handle is closed even if
        # formatting or writing fails.
        with open(personal_mapping_file_fp, 'w') as personal_mapping_f:
            personal_mapping_f.write(
                    format_mapping_file(header, personal_mapping_data,
                                        comments))
        raw_data_files.append(personal_mapping_file_fp)

        column_title_values = set(e[column_title_index]
                                  for e in personal_mapping_data)
        cat_values = set(e[cat_index] for e in personal_mapping_data)

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                    collated_dir, personal_mapping_file_fp,
                    category_to_split, column_title, rarefaction_depth,
                    adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                    create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            raw_data_dirs.append(join(rarefaction_dir, 'average_plots'))
            raw_data_dirs.append(join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = ("make_3d_plots.py -m %s -p %s -i %s -o %s "
                   "--custom_axes='%s' --add_vectors='%s,%s'" % (
                   personal_mapping_file_fp, prefs_fp, coord_fp,
                   pcoa_time_series_dir, time_series_category,
                   site_id_category, time_series_category))
            commands.append([(cmd_title, cmd)])

            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py  -m %s -p %s -i %s -o %s' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, 'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            ## Split OTU table into self/other per-body-site tables
            commands = []
            cmd_title = 'Splitting OTU table into self/other (%s)' % \
                        person_of_interest
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (otu_table_fp,
                    personal_mapping_file_fp, column_title, area_plots_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            for column_title_value in column_title_values:
                biom_fp = join(area_plots_dir,
                               add_filename_suffix(otu_table_fp,
                                                   '_%s' % column_title_value))
                column_title_map_fp = join(area_plots_dir, 'mapping_%s.txt' %
                                                           column_title_value)
                raw_data_files.append(biom_fp)
                raw_data_files.append(column_title_map_fp)

                body_site_dir = join(area_plots_dir, column_title_value)

                commands = []
                cmd_title = 'Splitting "%s" OTU table by body site (%s)' % \
                            (column_title_value, person_of_interest)
                cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (biom_fp,
                        personal_mapping_file_fp, category_to_split,
                        body_site_dir)
                commands.append([(cmd_title, cmd)])
                raw_data_dirs.append(body_site_dir)

                # The split has to run before the existence checks below.
                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

                commands = []
                for cat_value in cat_values:
                    body_site_otu_table_fp = join(body_site_dir,
                            add_filename_suffix(biom_fp, '_%s' % cat_value))

                    # We won't always get an OTU table if the mapping file
                    # category contains samples that aren't in the OTU table
                    # (e.g. the 'na' state for body site).
                    if exists(body_site_otu_table_fp):
                        plots = join(area_plots_dir, 'taxa_plots_%s_%s' % (
                            column_title_value, cat_value))

                        cmd_title = 'Creating taxa summary plots (%s)' % \
                                    person_of_interest
                        cmd = ('summarize_taxa_through_plots.py -i %s '
                               '-o %s -c %s -m %s -s' %
                               (body_site_otu_table_fp, plots,
                                time_series_category,
                                personal_mapping_file_fp))
                        if parameter_fp is not None:
                            cmd += ' -p %s' % parameter_fp

                        commands.append([(cmd_title, cmd)])

                        # Glob patterns; they are expanded during cleanup.
                        raw_data_files.append(join(plots, '*.biom'))
                        raw_data_files.append(join(plots, '*.txt'))

                        create_comparative_taxa_plots_html(cat_value,
                                join(area_plots_dir, '%s_comparative.html' %
                                                     cat_value))

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    otu_cat_output_fp = join(otu_cat_sig_dir,
                                             'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title,
                                      otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                    format_otu_category_significance_tables_as_html(
                            otu_cat_sig_output_fps, alpha, otu_cat_sig_dir,
                            individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                    for html_filename in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                    create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)

    logger.close()

    # Clean up the unnecessary raw data files and directories. glob will only
    # grab paths that exist.
    if not retain_raw_data:
        for raw_data_fp_glob in raw_data_files:
            remove_files(glob(raw_data_fp_glob))

        for raw_data_dir_glob in raw_data_dirs:
            for dir_to_remove in glob(raw_data_dir_glob):
                rmtree(dir_to_remove)

    return output_directories
Beispiel #9
0
def generate_most_wanted_list(output_dir, otu_table_fp, rep_set_fp, gg_fp,
        nt_fp, mapping_fp, mapping_category, top_n, min_abundance,
        max_abundance, min_categories, max_gg_similarity, e_value,
        word_size, jobs_to_start, command_handler, status_update_callback,
        force):
    """Build the "most wanted" OTU report as TSV and HTML tables.

    A chain of filtering commands is run through ``command_handler``: drop
    OTUs listed in ``gg_fp``, keep OTUs within ``min_abundance`` and
    ``max_abundance``, collapse by ``mapping_category``, keep OTUs present in
    at least ``min_categories`` groups, filter the representative set, run
    uclust_ref against ``gg_fp`` at ``max_gg_similarity`` and BLAST the
    failures against ``nt_fp``. The BLAST hits are then sorted by percent
    identity (ascending) and the first ``top_n`` are written to
    ``top_<n>_most_wanted_otus.txt`` and ``top_<n>_most_wanted_otus.html`` in
    ``output_dir``, with one pie chart per OTU saved under ``img/``.

    Raises WorkflowError if ``output_dir`` cannot be created and ``force`` is
    false, or if an OTU's counts do not line up with the table's samples.
    """
    # Imported locally because it is only needed to release the pie chart
    # figures created in the reporting loop below.
    from matplotlib.pyplot import close as close_figure

    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError("Output directory '%s' already exists. Please "
                    "choose a different directory, or force overwrite with -f."
                    % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))
    commands = []

    # First filter to keep only new (non-GG) OTUs.
    novel_otu_table_fp = join(output_dir, add_filename_suffix(otu_table_fp,
                                                              '_novel'))
    commands.append([('Filtering out all GG reference OTUs',
            'filter_otus_from_otu_table.py -i %s -o %s -e %s' %
            (otu_table_fp, novel_otu_table_fp, gg_fp))])

    # Next filter to keep only abundant otus in the specified range (looking
    # only at extremely abundant OTUs has the problem of yielding too many
    # that are similar to stuff in the nt database).
    novel_abund_otu_table_fp = join(output_dir,
            add_filename_suffix(novel_otu_table_fp, '_min%d_max%d' %
            (min_abundance, max_abundance)))
    commands.append([('Filtering out all OTUs that do not fall within the '
            'specified abundance threshold',
            'filter_otus_from_otu_table.py -i %s -o %s -n %d -x %d' %
            (novel_otu_table_fp, novel_abund_otu_table_fp, min_abundance,
             max_abundance))])

    # Next, collapse by mapping_category.
    otu_table_by_samp_type_fp = join(output_dir,
            add_filename_suffix(novel_abund_otu_table_fp, '_%s' %
            mapping_category))
    commands.append([('Collapsing OTU table by %s' % mapping_category,
            'summarize_otu_by_cat.py -c %s -o %s -m %s -i %s' %
            (novel_abund_otu_table_fp, otu_table_by_samp_type_fp,
             mapping_category, mapping_fp))])

    # Filter to contain only otus in the specified minimum number of sample
    # types.
    otu_table_by_samp_type_ms_fp = join(output_dir, add_filename_suffix(
            otu_table_by_samp_type_fp, '_ms%d' % min_categories))
    commands.append([('Filtering OTU table to include only OTUs that appear '
            'in at least %d sample groups' % min_categories,
            'filter_otus_from_otu_table.py -i %s -o %s -s %d' %
            (otu_table_by_samp_type_fp, otu_table_by_samp_type_ms_fp,
             min_categories))])

    # Now that we have a filtered down OTU table of good candidate OTUs, filter
    # the corresponding representative set to include only these candidate
    # sequences.
    candidate_rep_set_fp = join(output_dir, add_filename_suffix(
            rep_set_fp, '_most_wanted_candidates'))
    commands.append([('Filtering representative set to include only the '
            'latest candidate OTUs',
            'filter_fasta.py -f %s -o %s -b %s' %
            (rep_set_fp, candidate_rep_set_fp, otu_table_by_samp_type_ms_fp))])

    # Find the otus that don't hit GG at a certain maximum similarity
    # threshold.
    uclust_output_dir = join(output_dir, 'most_wanted_candidates_%s_%s' %
            (basename(gg_fp), str(max_gg_similarity)))
    commands.append([('Running uclust to get list of sequences that don\'t '
            'hit the maximum GG similarity threshold',
            'parallel_pick_otus_uclust_ref.py -i %s -o %s -r %s -s %s -O %d' %
            (candidate_rep_set_fp, uclust_output_dir, gg_fp,
             str(max_gg_similarity), jobs_to_start))])

    # Filter the candidate sequences to only include the failures from uclust.
    cand_gg_dis_rep_set_fp = join(output_dir,
            add_filename_suffix(candidate_rep_set_fp, '_failures'))
    commands.append([('Filtering candidate sequences to only include uclust '
            'failures',
            'filter_fasta.py -f %s -s %s -o %s' %
            (candidate_rep_set_fp, join(uclust_output_dir,
             splitext(basename(candidate_rep_set_fp))[0] + '_failures.txt'),
             cand_gg_dis_rep_set_fp))])

    # BLAST the failures against nt.
    blast_output_dir = join(output_dir, 'blast_output')
    commands.append([('BLASTing candidate sequences against nt database',
            'parallel_blast.py -i %s -o %s -r %s -D -e %f -w %d -O %d' %
            (cand_gg_dis_rep_set_fp, blast_output_dir, nt_fp, e_value,
             word_size, jobs_to_start))])

    # Execute the commands we have so far, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # We'll sort the BLAST results by percent identity (ascending) and pick the
    # top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)
    blast_results_fp = join(blast_output_dir,
        splitext(basename(cand_gg_dis_rep_set_fp))[0] + '_blast_out.txt')
    top_n_mw = []
    with open(blast_results_fp, 'U') as blast_results:
        for line in blast_results:
            # Skip headers.
            line = line.strip()
            if line and not line.startswith('#'):
                fields = line.split('\t')
                # Columns used: query ID, subject ID, percent identity.
                top_n_mw.append((fields[0], fields[1], float(fields[2])))
    top_n_mw = sorted(top_n_mw, key=itemgetter(2))[:top_n]

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to include
    # in our report.
    logger.write("Reading in candidate sequences and latest filtered and "
                 "collapsed OTU table.\n\n")
    mw_seqs = {}
    with open(cand_gg_dis_rep_set_fp, 'U') as cand_seqs_f:
        for seq_id, seq in MinimalFastaParser(cand_seqs_f):
            # Only the first whitespace-delimited token is used as the key.
            mw_seqs[seq_id.strip().split()[0]] = seq
    with open(otu_table_by_samp_type_ms_fp, 'U') as otu_table_f:
        otu_table_by_samp_type_ms = parse_biom_table(otu_table_f)
    samp_types = otu_table_by_samp_type_ms.SampleIds

    # The pie charts all go into one directory, so create it once up front
    # (and only when there is something to report).
    if top_n_mw:
        try:
            makedirs(join(output_dir, 'img'))
        except OSError:
            # It already exists, which is okay since we already know we are in
            # 'force' mode from above.
            pass

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    mw_tsv_fp = join(output_dir, 'top_%d_most_wanted_otus.txt' % top_n)
    mw_html_fp = join(output_dir, 'top_%d_most_wanted_otus.html' % top_n)
    with open(mw_tsv_fp, 'w') as mw_tsv_f:
        with open(mw_html_fp, 'w') as mw_html_f:
            tsv_header = 'OTU ID\tSequence\tGreengenes taxonomy\t' + \
                         'NCBI nt closest match\tNCBI nt % identity'
            mw_tsv_f.write(tsv_header + '\n')

            # The HTML table has one extra column for the pie chart.
            tsv_header += '\tAbundance by %s' % mapping_category
            html_header = ''.join('<th>%s</th>' % col
                                  for col in tsv_header.split('\t'))
            mw_html_f.write('<table><tr>' + html_header + '</tr>')

            for otu_id, subject_id, percent_identity in top_n_mw:
                # Grab all necessary information to be included in our report.
                seq = mw_seqs[otu_id]
                tax = otu_table_by_samp_type_ms.ObservationMetadata[
                    otu_table_by_samp_type_ms.getObservationIndex(otu_id)][
                    'taxonomy']
                # The fourth '|'-separated field of the subject ID is used as
                # the identifier in the NCBI link.
                gb_id = subject_id.split('|')[3]
                ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % gb_id

                # Compute the abundance of each most wanted OTU in each sample
                # grouping and create a pie chart to go in the HTML table.
                counts = otu_table_by_samp_type_ms.observationData(otu_id)
                if len(counts) != len(samp_types):
                    raise WorkflowError("The number of observation counts "
                                        "does not match the number of "
                                        "samples in the OTU table.")

                # We need a relative path to the image.
                pie_chart_fp = join('img', 'abundance_by_%s_%s.png' %
                                    (mapping_category, otu_id))

                # Piechart code modified from matplotlib example:
                # http://matplotlib.sourceforge.net/examples/pylab_examples/
                #   pie_demo.html
                fig = figure(figsize=(6,6))
                try:
                    axes([0.1, 0.1, 0.8, 0.8])
                    # Will auto-normalize the counts.
                    pie(counts, labels=samp_types, autopct='%1.1f%%',
                        shadow=True)
                    savefig(join(output_dir, pie_chart_fp))
                finally:
                    # Release the figure; otherwise one figure per OTU stays
                    # open for the rest of the run.
                    close_figure(fig)

                mw_tsv_f.write('%s\t%s\t%s\t%s\t%s\n' %
                               (otu_id, seq, tax, gb_id, percent_identity))

                mw_html_f.write('<tr><td>%s</td><td>%s</td><td>%s</td>'
                        '<td><a href="%s" target="_blank">%s</a></td>'
                        '<td>%s</td><td><img src="%s" /></td></tr>' %
                        (otu_id, seq, tax, ncbi_link, gb_id,
                         percent_identity, pie_chart_fp))
            mw_html_f.write('</table>')
    logger.close()
Beispiel #10
0
def _get_most_wanted_filtering_commands(output_dir, otu_table_fps, rep_set_fp,
                                        gg_fp, nt_fp, mapping_fp,
                                        mapping_category, min_abundance,
                                        max_abundance, min_categories,
                                        max_gg_similarity, e_value, word_size,
                                        merged_otu_table_fp, jobs_to_start):
    """Build the ordered list of filtering commands for the workflow.

    No command is executed here; this function only assembles command
    strings and the output filepaths they will produce.

    Arguments:
        output_dir - directory under which every generated filepath is placed
        otu_table_fps - OTU table filepaths to filter, collapse, and merge;
            only consulted when merged_otu_table_fp is None
        rep_set_fp - representative set filepath passed to filter_fasta.py
        gg_fp - filepath used both as the exclusion list when filtering OTUs
            and as the uclust reference
        nt_fp - filepath passed to parallel_blast.py via -r
        mapping_fp - mapping filepath used to filter samples and to collapse
            the OTU tables
        mapping_category - category name used when collapsing the OTU tables
            (also embedded in output filenames)
        min_abundance, max_abundance - abundance bounds, formatted as
            integers into both the filenames and the -n/-x options
        min_categories - integer passed to filter_otus_from_otu_table.py -s
        max_gg_similarity - similarity threshold passed to uclust via -s
        e_value - numeric e-value passed to parallel_blast.py via -e
        word_size - integer passed to parallel_blast.py via -w
        merged_otu_table_fp - if not None, this table is used as the master
            OTU table and the per-table filtering/merging commands are
            skipped entirely
        jobs_to_start - integer passed via -O to the parallel scripts

    Returns a 4-tuple:
        (commands, blast_results_fp, rep_set_cands_failures_fp,
         master_otu_table_ms_fp)
    where commands is a list of single-element lists, each holding one
    (description, command string) tuple, in the order they are meant to run.
    """
    commands = []
    otu_tables_to_merge = []

    if merged_otu_table_fp is None:
        for otu_table_fp in otu_table_fps:
            # First filter to keep only new (non-GG) OTUs.
            novel_otu_table_fp = join(
                output_dir, add_filename_suffix(otu_table_fp, '_novel'))
            commands.append([
                ('Filtering out all GG reference OTUs',
                 'filter_otus_from_otu_table.py -i %s -o %s -e %s' %
                 (otu_table_fp, novel_otu_table_fp, gg_fp))
            ])

            # Next filter to keep only abundant otus in the specified range
            # (looking only at extremely abundant OTUs has the problem of
            # yielding too many that are similar to stuff in the nt
            # database).
            novel_abund_otu_table_fp = join(
                output_dir,
                add_filename_suffix(
                    novel_otu_table_fp,
                    '_min%d_max%d' % (min_abundance, max_abundance)))
            commands.append([
                ('Filtering out all OTUs that do not fall within the '
                 'specified abundance threshold',
                 'filter_otus_from_otu_table.py -i %s -o %s -n %d -x %d' %
                 (novel_otu_table_fp, novel_abund_otu_table_fp, min_abundance,
                  max_abundance))
            ])

            # Remove samples from the table that aren't in the mapping file.
            novel_abund_filtered_otu_table_fp = join(
                output_dir,
                add_filename_suffix(novel_abund_otu_table_fp,
                                    '_known_samples'))
            commands.append([
                ('Filtering out samples that are not in the mapping '
                 'file', 'filter_samples_from_otu_table.py -i %s -o %s '
                 '--sample_id_fp %s' %
                 (novel_abund_otu_table_fp, novel_abund_filtered_otu_table_fp,
                  mapping_fp))
            ])

            # Next, collapse by mapping_category.
            otu_table_by_samp_type_fp = join(
                output_dir,
                add_filename_suffix(novel_abund_filtered_otu_table_fp,
                                    '_%s' % mapping_category))
            commands.append([
                ('Collapsing OTU table by %s' % mapping_category,
                 'summarize_otu_by_cat.py -c %s -o %s -m %s -i %s' %
                 (novel_abund_filtered_otu_table_fp, otu_table_by_samp_type_fp,
                  mapping_category, mapping_fp))
            ])
            otu_tables_to_merge.append(otu_table_by_samp_type_fp)

        # Merge all collapsed OTU tables.
        master_otu_table_fp = join(
            output_dir, 'master_otu_table_novel_min%d_max%d_%s.biom' %
            (min_abundance, max_abundance, mapping_category))
        commands.append([('Merging collapsed OTU tables',
                          'merge_otu_tables.py -i %s -o %s' %
                          (','.join(otu_tables_to_merge), master_otu_table_fp))
                         ])
    else:
        # A pre-merged table was supplied, so none of the per-table commands
        # above are generated.
        master_otu_table_fp = merged_otu_table_fp

    # Filter to contain only otus in the specified minimum number of sample
    # types.
    master_otu_table_ms_fp = join(
        output_dir,
        add_filename_suffix(master_otu_table_fp, '_ms%d' % min_categories))
    commands.append([
        ('Filtering OTU table to include only OTUs that appear '
         'in at least %d sample groups' % min_categories,
         'filter_otus_from_otu_table.py -i %s -o %s -s %d' %
         (master_otu_table_fp, master_otu_table_ms_fp, min_categories))
    ])

    # Now that we have a filtered down OTU table of good candidate OTUs, filter
    # the corresponding representative set to include only these candidate
    # sequences.
    rep_set_cands_fp = join(output_dir,
                            add_filename_suffix(rep_set_fp, '_candidates'))
    commands.append([
        ('Filtering representative set to include only the '
         'latest candidate OTUs', 'filter_fasta.py -f %s -o %s -b %s' %
         (rep_set_fp, rep_set_cands_fp, master_otu_table_ms_fp))
    ])

    # Find the otus that don't hit GG at a certain maximum similarity
    # threshold.
    uclust_output_dir = join(
        output_dir, 'most_wanted_candidates_%s_%s' %
        (basename(gg_fp), str(max_gg_similarity)))
    commands.append([
        ('Running uclust to get list of sequences that don\'t '
         'hit the maximum GG similarity threshold',
         'parallel_pick_otus_uclust_ref.py -i %s -o %s -r %s -s %s -O %d' %
         (rep_set_cands_fp, uclust_output_dir, gg_fp, str(max_gg_similarity),
          jobs_to_start))
    ])

    # Filter the rep set to only include the failures from uclust. The
    # failures list is looked up inside the uclust output directory under the
    # candidate rep set's base name with a '_failures.txt' suffix.
    rep_set_cands_failures_fp = join(
        output_dir, add_filename_suffix(rep_set_cands_fp, '_failures'))
    uclust_failures_fp = join(
        uclust_output_dir,
        splitext(basename(rep_set_cands_fp))[0] + '_failures.txt')
    commands.append([
        ('Filtering candidate sequences to only include uclust '
         'failures', 'filter_fasta.py -f %s -s %s -o %s' %
         (rep_set_cands_fp, uclust_failures_fp, rep_set_cands_failures_fp))
    ])

    # BLAST the failures against nt.
    #
    # The e-value is rendered with repr() rather than '%f': '%f' always uses
    # six fixed decimal places, so a typical small threshold such as 1e-10
    # would be written to the command line as 0.000000. float() keeps the
    # requirement that e_value be numeric.
    e_value_str = repr(float(e_value))
    blast_output_dir = join(output_dir, 'blast_output')
    commands.append([
        ('BLASTing filtered candidate sequences against nt '
         'database',
         'parallel_blast.py -i %s -o %s -r %s -D -e %s -w %d -O %d' %
         (rep_set_cands_failures_fp, blast_output_dir, nt_fp, e_value_str,
          word_size, jobs_to_start))
    ])

    blast_results_fp = join(
        blast_output_dir,
        splitext(basename(rep_set_cands_failures_fp))[0] + '_blast_out.txt')

    return (commands, blast_results_fp, rep_set_cands_failures_fp,
            master_otu_table_ms_fp)