def main():
    """Command-line entry point: build and run extract_barcodes.py commands.

    Walks ``input_dir`` recursively for fastq files, optionally pairs
    read1/read2 files, generates one extract_barcodes.py command per
    file (or pair), and either prints the commands or executes them
    serially with workflow logging.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    paired_data = opts.paired_data
    parameter_fp = opts.parameter_fp
    read1_indicator = opts.read1_indicator
    read2_indicator = opts.read2_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    # Stripping the filepath without including the input dir would leave
    # nothing unique to name the outputs by.
    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name is enabled, "
                            "--include_input_dir_path must also be enabled.")

    # Only the extract_barcodes section of the parameters file is forwarded
    # to the generated commands.
    if opts.parameter_fp:
        with open(opts.parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['extract_barcodes'])
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    all_files = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    # Collect every fastq under input_dir (recursively).
    # NOTE(review): ``dir`` shadows the builtin of the same name; harmless
    # here since it is unused, but worth renaming when next touched.
    for root, dir, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_files += [abspath(join(root, fp))]

    # When data are paired, replace the flat file list with matched pairs.
    # ``bc_pairs`` is only bound on this branch, but it is also never read
    # in this function.
    if paired_data:
        all_files, bc_pairs = get_pairs(all_files, read1_indicator,
                                        read2_indicator)

    commands = create_commands_eb(all_files, paired_data, output_dir,
                                  params_str, leading_text, trailing_text,
                                  include_input_dir_path,
                                  remove_filepath_in_name)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback = no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
def main():
    """Command-line entry point: build and run join_paired_ends.py commands.

    Walks ``input_dir`` recursively for fastq files, pairs forward/reverse
    reads (optionally matching barcode files as well), generates one
    join_paired_ends.py command per pair, and either prints the commands
    or executes them serially with workflow logging.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    parameter_fp = opts.parameter_fp
    read1_indicator = opts.read1_indicator
    read2_indicator = opts.read2_indicator
    match_barcodes = opts.match_barcodes
    barcode_indicator = opts.barcode_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    # Stripping the filepath without including the input dir would leave
    # nothing unique to name the outputs by.
    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name is enabled, "
                            "--include_input_dir_path must also be enabled.")

    # Only the join_paired_ends section of the parameters file is forwarded
    # to the generated commands.
    if opts.parameter_fp:
        with open(opts.parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['join_paired_ends'])
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    all_files = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    # Collect every fastq under input_dir (recursively).
    # NOTE(review): ``dir`` shadows the builtin of the same name; harmless
    # here since it is unused, but worth renaming when next touched.
    for root, dir, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_files += [abspath(join(root, fp))]

    # Match read1/read2 (and, if requested, barcode) files into pairs.
    pairs, bc_pairs = get_pairs(all_files, read1_indicator, read2_indicator,
                                match_barcodes, barcode_indicator)

    commands = create_commands_jpe(pairs, output_dir,
                                   params_str, leading_text, trailing_text,
                                   include_input_dir_path,
                                   remove_filepath_in_name,
                                   match_barcodes, bc_pairs)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
def pick_denovo_otus(input_fp,
                     output_dir,
                     new_ref_set_id,
                     otu_picking_method,
                     params,
                     logger):
    """Build the pick_otus.py command string for de novo OTU picking.

    Parameters
    ----------
    input_fp : str
        Path to the input sequence file.
    output_dir : str
        Directory pick_otus.py should write to.
    new_ref_set_id : str
        Identifier used to prefix the new de novo OTU ids.
    otu_picking_method : str
        OTU picking method name, passed explicitly via -m.
    params : dict
        Parsed QIIME parameters; the "pick_otus" section (minus
        "otu_picking_method") is forwarded as extra options.
    logger : WorkflowLogger
        Unused here; kept for signature compatibility with related helpers.

    Returns
    -------
    str
        The pick_otus.py shell command.
    """
    # Copy so the caller's params are never mutated, and drop
    # "otu_picking_method" since it is passed explicitly via -m.
    # Fixed: the original's ``try/except KeyError: pass`` left ``d``
    # undefined when params had no "pick_otus" section, so the assignment
    # below raised NameError instead of falling back to an empty dict.
    d = params.get("pick_otus", {}).copy()
    d.pop("otu_picking_method", None)
    d["uclust_otu_id_prefix"] = "%s.ReferenceOTU" % new_ref_set_id
    params_str = " %s" % get_params_str(d)
    # Build the OTU picking command
    result = "pick_otus.py -i %s -o %s -m %s %s" % (input_fp, output_dir,
                                                    otu_picking_method,
                                                    params_str)
    return result
def pick_denovo_otus(input_fp,
                     output_dir,
                     new_ref_set_id,
                     otu_picking_method,
                     params,
                     logger):
    """Build the pick_otus.py command string for de novo OTU picking.

    Parameters
    ----------
    input_fp : str
        Path to the input sequence file.
    output_dir : str
        Directory pick_otus.py should write to.
    new_ref_set_id : str
        Identifier used to prefix the new de novo OTU ids.
    otu_picking_method : str
        OTU picking method name, passed explicitly via -m.
    params : dict
        Parsed QIIME parameters; the 'pick_otus' section (minus
        'otu_picking_method') is forwarded as extra options.
    logger : WorkflowLogger
        Unused here; kept for signature compatibility with related helpers.

    Returns
    -------
    str
        The pick_otus.py shell command.
    """
    # Work on a copy so the caller's params are never mutated; the method
    # is supplied via -m, so drop any 'otu_picking_method' entry.
    # Fixed: previously a missing 'pick_otus' section left ``d`` unbound
    # (the except clause only passed), raising NameError on the next line.
    d = params.get('pick_otus', {}).copy()
    d.pop('otu_picking_method', None)
    d['uclust_otu_id_prefix'] = '%s.ReferenceOTU' % new_ref_set_id
    params_str = ' %s' % get_params_str(d)
    # Build the OTU picking command
    result = 'pick_otus.py -i %s -o %s -m %s %s' %\
        (input_fp, output_dir, otu_picking_method, params_str)
    return result
def run_core_diversity_analyses(
        biom_fp,
        mapping_fp,
        sampling_depth,
        output_dir,
        qiime_config,
        command_handler=call_commands_serially,
        tree_fp=None,
        params=None,
        categories=None,
        arare_min_rare_depth=10,
        arare_num_steps=10,
        parallel=False,
        suppress_taxa_summary=False,
        suppress_beta_diversity=False,
        suppress_alpha_diversity=False,
        suppress_otu_category_significance=False,
        status_update_callback=print_to_stdout):
    """Run the QIIME core diversity analyses workflow.

    Summarizes and filters the input BIOM table, then (unless suppressed)
    runs beta diversity with plots and distance boxplots, alpha rarefaction
    with alpha-diversity comparisons, taxa summaries (overall and per
    category), and OTU category significance. Steps whose output files
    already exist are skipped, so the workflow is re-runnable. An HTML
    index of all results is written to ``output_dir/index.html``.
    """
    if categories != None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp, 'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                                   "in your mapping file. "
                                   "Categories are case and white space sensitive. Valid "
                                   "choices are: (%s)" % (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                                   "Categories analyzed here require at least two values."
                                   % c)
    else:
        # No categories: category-specific analyses below become no-ops.
        categories = []

    # prep some variables
    if params == None:
        params = parse_qiime_parameters([])
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging: link both the new log and any logs from previous runs
    # into the index page.
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp != None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s --suppress-md5 %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" \
                     % biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                          filter_samples_cmd)])
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" \
                     % filtered_biom_fp)
    # All downstream analyses use the filtered table.
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                sampling_depth=sampling_depth,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" \
                         % ', '.join(existing_dm_fps))
            # Recover (metric, dm_fp) tuples from the existing filenames.
            # NOTE(review): str.strip('_dm.txt') strips any of those
            # characters from both ends, not the literal suffix -- this
            # happens to work for the standard metric names but would
            # corrupt a metric name ending in one of "_dm.txt"'s chars.
            even_dm_fps = [(split(fp)[1].strip('_dm.txt'), fp) for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir, category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write("Skipping make_distance_boxplots.py for %s as %s exists.\n\n" \
                                 % (category, plot_output_fp))
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    plot_output_fp,
                                    _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                    stats_output_fp,
                                    _index_headers['beta_diversity_even'] % sampling_depth))

            # Per-metric links (independent of category).
            index_links.append(('PCoA plot (%s)' % bdiv_metric,
                                '%s/%s_emperor_pcoa_plot/index.html' % \
                                (bdiv_even_output_dir, bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                (bdiv_even_output_dir, bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                                '%s/%s_pc.txt' % \
                                (bdiv_even_output_dir, bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" \
                         % rarefaction_plots_output_fp)

        index_links.append(('Alpha rarefaction plots',
                            rarefaction_plots_output_fp,
                            _index_headers['alpha_diversity']))

        # Compare alpha diversity across each category for every collated
        # alpha-diversity metric produced by the rarefaction workflow.
        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                    (arare_full_output_dir, category, alpha_metric)
                if not exists(alpha_comparison_output_fp):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp, mapping_fp, category,
                         alpha_comparison_output_fp, params_str)
                    commands.append([('Compare alpha diversity (%s, %s)' %\
                                      (category, alpha_metric),
                                      compare_alpha_cmd)])
                else:
                    logger.write("Skipping compare_alpha_diversity.py for %s as %s exists.\n\n" \
                                 % (category, alpha_comparison_output_fp))
                index_links.append(
                    ('Alpha diversity statistics (%s, %s)' % (category, alpha_metric),
                     alpha_comparison_output_fp,
                     _index_headers['alpha_diversity']))

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(join(output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write("Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n" \
                         % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                            % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                            % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        # Repeat the taxa summaries grouped by each mapping category.
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir, category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob('%s/taxa_summary_plots/*.html' % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write("Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n" \
                             % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] % category))

    if not suppress_otu_category_significance:
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # OTU category significance
        for category in categories:
            category_signifance_fp = \
                '%s/category_significance_%s.txt' % (output_dir, category)
            if not exists(category_signifance_fp):
                # Build the OTU cateogry significance command
                category_significance_cmd = \
                    'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (biom_fp, mapping_fp, category,
                     category_signifance_fp, params_str)
                commands.append([('OTU category significance (%s)' % category,
                                  category_significance_cmd)])
            else:
                logger.write("Skipping otu_category_significance.py for %s as %s exists.\n\n" \
                             % (category, category_signifance_fp))

            index_links.append(('Category significance (%s)' % category,
                                category_signifance_fp,
                                _index_headers['otu_category_sig']))

    # Compress the filtered table for the results index (skip on re-runs).
    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table', 'gzip %s' % filtered_biom_fp)])
        index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
                            filtered_biom_gzip_fp,
                            _index_headers['run_summary']))
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" \
                     % filtered_biom_gzip_fp)

    # Run any remaining commands (this closes the logger on success);
    # otherwise close the logger explicitly.
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
def pick_reference_otus( input_fp, output_dir, otu_picking_method, refseqs_fp, parallel, params, logger, similarity_override=None ): params_copy = deepcopy(params) if "pick_otus" in params_copy and "refseqs_fp" in params_copy["pick_otus"]: raise WorkflowError( "Cannot pass pick_otus:refseqs_fp in parameters file. This can only be" " passed on the command line or through the API." ) if similarity_override is not None: logger.write("Overridding similiary with %1.3f.\n" % similarity_override) if "pick_otus" in params_copy: params_copy["pick_otus"]["similarity"] = str(similarity_override) else: params_copy["pick_otus"] = {"similarity": str(similarity_override)} if parallel and otu_picking_method == "uclust_ref": # Grab the parallel-specific parameters try: params_str = get_params_str(params_copy["parallel"]) except KeyError: params_str = "" # Grab the OTU picker parameters try: # Want to find a cleaner strategy for this: the parallel script # is method-specific, so doesn't take a --otu_picking_method # option. This works for now though. if "otu_picking_method" in params_copy["pick_otus"]: del params_copy["pick_otus"]["otu_picking_method"] except KeyError: pass params_str += " %s" % get_params_str(params_copy["pick_otus"]) otu_picking_script = "parallel_pick_otus_%s.py" % otu_picking_method # Build the OTU picking command pick_otus_cmd = "%s -i %s -o %s -r %s -T %s" % ( otu_picking_script, input_fp, output_dir, refseqs_fp, params_str, ) else: try: params_str = get_params_str(params_copy["pick_otus"]) except KeyError: params_str = "" # Since this is reference-based OTU picking we always want to # suppress new clusters -- force it here. params_str += " --suppress_new_clusters" logger.write("Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n") # Build the OTU picking command pick_otus_cmd = "pick_otus.py -i %s -o %s -r %s -m %s %s" % ( input_fp, output_dir, refseqs_fp, otu_picking_method, params_str, ) return pick_otus_cmd
def main():
    """Command-line entry point: build and run split_libraries_fastq.py commands.

    Walks ``input_dir`` recursively for fastq files and, depending on the
    demultiplexing method, optionally matches them with barcode and mapping
    files. One split_libraries_fastq.py command is generated per input set,
    then the commands are printed or executed serially with workflow logging.
    """
    option_parser, opts, args =\
        parse_command_line_parameters(suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    demultiplexing_method = opts.demultiplexing_method
    parameter_fp = opts.parameter_fp
    read_indicator = opts.read_indicator
    barcode_indicator = opts.barcode_indicator
    mapping_indicator = opts.mapping_indicator
    mapping_extensions = opts.mapping_extensions.split(',')
    sampleid_indicator = opts.sampleid_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    # Stripping the filepath without including the input dir would leave
    # nothing unique to name the outputs by.
    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name enabled, "
                            "--include_input_dir_path must be enabled.")

    # Only the split_libraries_fastq section of the parameters file is
    # forwarded to the generated commands.
    if opts.parameter_fp:
        with open(opts.parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        params_str = get_params_str(params_dict['split_libraries_fastq'])
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    all_fastq = []
    all_mapping = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    # Collect every fastq under input_dir (recursively).
    # NOTE(review): ``dir`` shadows the builtin of the same name; harmless
    # here since it is unused, but worth renaming when next touched.
    for root, dir, fps in walk(input_dir):
        for fp in fps:
            for extension in extensions:
                if fp.endswith(extension):
                    all_fastq += [abspath(join(root, fp))]

    if demultiplexing_method == 'mapping_barcode_files':
        # Also collect mapping files and match reads/barcodes/mapping
        # files into per-run sets.
        for root, dir, fps in walk(input_dir):
            for fp in fps:
                for mapping_extension in mapping_extensions:
                    if fp.endswith(mapping_extension):
                        all_mapping += [abspath(join(root, fp))]
        all_files = get_matching_files(all_fastq, all_mapping,
                                       read_indicator, barcode_indicator,
                                       mapping_indicator)
    else:
        all_files = all_fastq

    commands = create_commands_slf(all_files, demultiplexing_method,
                                   output_dir, params_str, leading_text,
                                   trailing_text, include_input_dir_path,
                                   remove_filepath_in_name,
                                   sampleid_indicator)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    """Align the representative set with PyNAST, filter the alignment, and
    build a phylogenetic tree.

    Parameters
    ----------
    repset_fasta_fp : str
        Path to the representative-set FASTA file to align.
    output_dir : str
        Directory results are written to; the tree goes to rep_set.tre.
    command_handler : callable
        Executes the assembled list of shell commands.
    params : dict
        Parsed QIIME parameters for the align_seqs, filter_alignment and
        make_phylogeny sections.
    qiime_config : dict
        QIIME configuration, forwarded to the logger when one is created.
    parallel : bool, optional
        If True, use parallel_align_seqs_pynast.py for the alignment step.
    logger : WorkflowLogger, optional
        If None, a new logger is created here and closed on success.
    status_update_callback : callable, optional
        Progress callback passed through to the command handler.

    Returns
    -------
    str
        Path where PyNAST alignment failures will be written.
    """
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    # Fixed: identity comparison with None (was `logger == None`), matching
    # the other align_and_tree implementation in this file.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    # Start from a clean alignment directory so stale results never mix in.
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        # Grab the aligner parameters; the parallel script is
        # method-specific, so it doesn't take an --alignment_method option.
        try:
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
        (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
        (filtered_aln_fp, tree_fp, params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    # Remove any stale tree so the new run's output is unambiguous.
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return failures_fp
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     histogram_categories=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_3d_plots=False,
                                     suppress_2d_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
         1) Compute a beta diversity distance matrix;
         2) Perform a principal coordinates analysis on the result of Step 1;
         3) Generate a 3D prefs file for optimized coloring of continuous
            variables;
         4) Generate a 3D plot for all mapping fields with colors optimized
            for continuous data;
         5) Generate a 3D plot for all mapping fields with colors optimized
            for discrete data.

        Returns a list of (metric, distance_matrix_fp) tuples.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # Create our own logger (closed on success) only if none was supplied.
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
        parse_mapping_file(open(mapping_fp, 'U'))
    # Fail early if requested histogram categories are not mapping columns.
    if histogram_categories:
        invalid_categories = set(histogram_categories) - set(mapping_header)
        if invalid_categories:
            raise ValueError,\
                "Invalid histogram categories - these must exactly match "+\
                "mapping file column headers: %s" % (' '.join(invalid_categories))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename,
             sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
            (python_exe_fp, script_dir, otu_table_fp,
             even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)])
        # All later steps operate on the evenly-sampled table.
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the 3d prefs file generator command
    prefs_fp = '%s/prefs.txt' % output_dir
    try:
        params_str = get_params_str(params['make_prefs_file'])
    except KeyError:
        params_str = ''
    # NOTE(review): this lookup is not guarded -- it raises KeyError when
    # params has no 'make_prefs_file' entry and is a plain dict. Presumably
    # callers pass a defaultdict from parse_qiime_parameters; confirm.
    if not 'mapping_headers_to_use' in params['make_prefs_file']:
        params_str = '%s --mapping_headers_to_use %s' \
            % (params_str, mapping_fields)
    # Build the 3d prefs file generator command
    prefs_cmd = \
        '%s %s/make_prefs_file.py -m %s -o %s %s' %\
        (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
    commands.append([('Build prefs file', prefs_cmd)])

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:
        # Prep the beta-diversity command; 'metrics' is passed explicitly,
        # so remove it from the forwarded options.
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass
        params_str = get_params_str(bdiv_params_copy)
        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)
        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (python_exe_fp, script_dir, otu_table_fp,
                 output_dir, beta_diversity_metric, params_str)
            commands.append(\
                [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (python_exe_fp, script_dir, otu_table_fp,
                 output_dir, beta_diversity_metric, params_str)
            commands.append(\
                [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])

        # Rename the metric's output to the <metric>_dm.txt convention.
        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([('Rename distance matrix (%s)' % beta_diversity_metric,
                          'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(\
            [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate 3d plots
        if not suppress_3d_plots:
            # Prep the continuous-coloring 3d plots command
            continuous_3d_dir = '%s/%s_3d_continuous/' %\
                (output_dir, beta_diversity_metric)
            create_dir(continuous_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            continuous_3d_command = \
                '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, prefs_fp, pc_fp,
                 continuous_3d_dir, mapping_fp, params_str)

            # Prep the discrete-coloring 3d plots command
            discrete_3d_dir = '%s/%s_3d_discrete/' %\
                (output_dir, beta_diversity_metric)
            create_dir(discrete_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 3d plots command
            discrete_3d_command = \
                '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, mapping_fields, pc_fp,
                 discrete_3d_dir, mapping_fp, params_str)

            commands.append([\
                ('Make 3D plots (continuous coloring, %s)' %\
                 beta_diversity_metric, continuous_3d_command),\
                ('Make 3D plots (discrete coloring, %s)' %\
                 beta_diversity_metric, discrete_3d_command,)])

        # Generate 2d plots
        if not suppress_2d_plots:
            # Prep the continuous-coloring 2d plots command
            continuous_2d_dir = '%s/%s_2d_continuous/' %\
                (output_dir, beta_diversity_metric)
            create_dir(continuous_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 2d plots command
            continuous_2d_command = \
                '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, prefs_fp, pc_fp,
                 continuous_2d_dir, mapping_fp, params_str)

            # Prep the discrete-coloring 2d plots command
            discrete_2d_dir = '%s/%s_2d_discrete/' %\
                (output_dir, beta_diversity_metric)
            create_dir(discrete_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 2d plots command
            discrete_2d_command = \
                '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, mapping_fields, pc_fp,
                 discrete_2d_dir, mapping_fp, params_str)

            commands.append([\
                ('Make 2D plots (continuous coloring, %s)' %\
                 beta_diversity_metric, continuous_2d_command),\
                ('Make 2D plots (discrete coloring, %s)' %\
                 beta_diversity_metric, discrete_2d_command,)])

        if histogram_categories:
            # Prep the distance histograms command
            histograms_dir = '%s/%s_histograms/' %\
                (output_dir, beta_diversity_metric)
            create_dir(histograms_dir)
            try:
                params_str = get_params_str(params['make_distance_histograms'])
            except KeyError:
                params_str = ''
            # Build the make_distance_histograms command
            distance_histograms_command = \
                '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
                (python_exe_fp, script_dir, beta_div_fp,
                 histograms_dir, mapping_fp,
                 ','.join(histogram_categories), params_str)

            commands.append([\
                ('Make Distance Histograms (%s)' %\
                 beta_diversity_metric, distance_histograms_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    """Align the representative sequence set and build a phylogenetic tree.

        Queues three steps (executed by command_handler):
          1) align the representative sequences with PyNAST
             (align_seqs.py, or parallel_align_seqs_pynast.py if
             parallel=True);
          2) filter the alignment (filter_alignment.py);
          3) build a tree from the filtered alignment (make_phylogeny.py).

        Returns the expected path of the PyNAST failures fasta file
        (the commands themselves are what actually create it).
    """
    # input_dir / input_ext are unpacking by-products; only the basename is
    # used to name the output files.
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    # Create our own logger (and close it on success) only when the caller
    # didn't supply one.
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    # Start from a clean alignment directory if a previous run left one.
    if exists(pynast_dir):
        rmtree(pynast_dir)
    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
            (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
        (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
        (filtered_aln_fp, tree_fp, params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    # Remove a stale tree from a previous run BEFORE the queued commands
    # execute, so make_phylogeny.py writes a fresh one.
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return failures_fp
def run_pick_closed_reference_otus(input_fp,
                                   refseqs_fp,
                                   output_dir,
                                   taxonomy_fp,
                                   command_handler,
                                   params,
                                   qiime_config,
                                   assign_taxonomy=False,
                                   parallel=False,
                                   logger=None,
                                   suppress_md5=False,
                                   status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick OTUs;
          2) If assignment_taxonomy is True, choose representative
             sequence for OTUs and assign taxonomy using a classifier.
          3) Build an OTU table with optional predefined taxonomy
             (if assign_taxonomy=False) or taxonomic assignments from
             step 2 (if assign_taxonomy=True).
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = [
        'blast', 'uclust_ref', 'usearch61_ref', 'usearch_ref', 'sortmerna'
    ]

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        # no method specified in the params file -- use the default
        otu_picking_method = 'uclust_ref'
    # NOTE(review): assert is stripped under `python -O`; validation would be
    # safer as an explicit raise, but callers may rely on AssertionError here.
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # Create our own logger (and close it on success) only when the caller
    # didn't supply one.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    # Only these three methods have parallel wrapper scripts.
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is "
                     "closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Assign taxonomy using a taxonomy classifier, if request by the user.
    # (Alternatively predefined taxonomic assignments will be used, if provided.)
    if assign_taxonomy:
        # Prep the representative set picking command
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)
        try:
            params_str = get_params_str(params['pick_rep_set'])
        except KeyError:
            params_str = ''
        # Build the representative set picking command
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
            (otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str)
        commands.append([('Pick representative set', pick_rep_set_cmd)])

        # Prep the taxonomy assignment command
        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
            (output_dir, assignment_method)
        # Rebind taxonomy_fp to the classifier output; the OTU table step
        # below uses whichever value taxonomy_fp holds at that point.
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
            (assign_taxonomy_dir, input_basename)
        if parallel and (assignment_method == 'rdp' or
                         assignment_method == 'blast' or
                         assignment_method == 'uclust'):
            # Grab the parallel-specific parameters
            try:
                params_str = get_params_str(params['parallel'])
            except KeyError:
                params_str = ''
            # Grab the taxonomy assignment parameters
            try:
                # Want to find a cleaner strategy for this: the parallel script
                # is method-specific, so doesn't take a --assignment_method
                # option. This works for now though.
                d = params['assign_taxonomy'].copy()
                if 'assignment_method' in d:
                    del d['assignment_method']
                params_str += ' %s' % get_params_str(d)
            except KeyError:
                pass
            # Build the parallel taxonomy assignment command
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
                (assignment_method, rep_set_fp, assign_taxonomy_dir,
                 params_str)
        else:
            try:
                params_str = get_params_str(params['assign_taxonomy'])
            except KeyError:
                params_str = ''
            # Build the taxonomy assignment command
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
                (assign_taxonomy_dir, rep_set_fp, params_str)
        commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # If assign_taxonomy is True, this will be the path to the taxonomic
    # assignment results. If assign_taxonomy is False this will be either
    # the precomputed taxonomic assignments that the user passed in,
    # or None.
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\
        (otu_fp, taxonomy_str, otu_table_fp, params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_summarize_taxa_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     mapping_cat,
                                     sort,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     logger=None,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation for summarizing taxonomies and generating plots

        The steps performed by this function are:
          1) Summarize OTU by Category (optional collapse/sort of the table)
          2) Summarize Taxonomy
          3) Plot Taxonomy Summary

        mapping_cat: category (or comma-separated categories) to collapse
            samples by; falls back to params['collapse_samples']
            ['collapse_fields'] when not supplied, and is skipped entirely
            when neither is available.
        sort: when True, sort the (possibly collapsed) OTU table before
            summarizing.

        All missing-parameter lookups intentionally default to empty
        options; only KeyError is treated as "parameter not provided".
        (Previously these were bare ``except:`` clauses, which also
        swallowed KeyboardInterrupt/SystemExit and genuine bugs.)
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    # Create our own logger (and close it on success) only when the caller
    # didn't supply one.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp])

    # if mapping category not passed via command-line,
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat = params['collapse_samples']['collapse_fields']
        except KeyError:
            mapping_cat = None

    try:
        params_str = get_params_str(params['collapse_samples'])
        # Need to remove the mapping category option, since it is defined
        # above. Using this method since we don't want to change the params
        # dict.
        split_params = params_str.split('--')
        updated_params_str = []
        for i in split_params:
            if not i.startswith('collapse_fields'):
                updated_params_str.append(i)
        params_str = '--'.join(updated_params_str)
    except KeyError:
        params_str = ''

    if mapping_cat:
        # Sanitize the category string for use in output filenames.
        base_filename = mapping_cat.replace(' ', '-').replace(',', '')
        output_biom_fp = join(output_dir, '%s_otu_table.biom' % base_filename)
        output_map_fp = join(output_dir, '%s_map.txt' % base_filename)
        # Build the collapse samples command
        collapse_samples_cmd = \
            "collapse_samples.py -m %s -b %s --output_biom_fp %s --output_mapping_fp %s --collapse_fields '%s' %s" %\
            (mapping_fp, otu_table_fp, output_biom_fp, output_map_fp,
             mapping_cat, params_str)
        commands.append([('Collapse samples in OTU table by categories',
                          collapse_samples_cmd)])
        # Downstream steps operate on the collapsed table.
        otu_table_fp = output_biom_fp

    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except KeyError:
            params_str = ''
        # define output otu table
        sorted_fp = join(output_dir,
                         splitext(split(otu_table_fp)[-1])[0] + '_sorted.biom')
        if mapping_cat or params_str == '':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s" % (otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s -m %s %s" %\
                (otu_table_fp, sorted_fp, mapping_fp, params_str)
        commands.append([('Sort OTU Table', sort_otu_table_cmd)])
        # redefine otu_table_fp to use
        otu_table_fp = sorted_fp

    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except KeyError:
        params_str = ''
    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except KeyError:
        sum_taxa_levels = None
    # Build the summarize taxonomy command
    summarize_taxa_cmd = 'summarize_taxa.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)
    commands.append([('Summarize Taxonomy', summarize_taxa_cmd)])

    # Predict the per-level output filenames summarize_taxa.py will write,
    # so they can be fed to the plotting step.
    sum_taxa_fps = []
    basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
    if sum_taxa_levels:
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))
    else:
        # this is the default levels from summarize_taxa, but cannot import
        # script to get these values
        for i in [2, 3, 4, 5, 6]:
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)
    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''
    # Build the plot taxa summary plot command(s)
    plot_taxa_summary_cmd =\
        'plot_taxa_summary.py -i %s -o %s %s' %\
        (','.join(sum_taxa_fps), taxa_summary_plots_dir, params_str)
    commands.append([('Plot Taxonomy Summary', plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_jackknifed_beta_diversity(otu_table_fp,
                                  tree_fp,
                                  seqs_per_sample,
                                  output_dir,
                                  command_handler,
                                  params,
                                  qiime_config,
                                  mapping_fp,
                                  parallel=False,
                                  logger=None,
                                  suppress_md5=False,
                                  status_update_callback=print_to_stdout,
                                  master_tree=None):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Compute beta diversity distance matrix from otu table (and
             tree, if applicable)
          2) Build rarefied OTU tables;
          3) Build UPGMA tree from full distance matrix;
          4) Compute distance matrics for rarefied OTU tables;
          5) Build UPGMA trees from rarefied OTU table distance matrices;
          5.5) Build a consensus tree from the rarefied UPGMA trees
          6) Compare rarefied OTU table distance matrix UPGMA trees
             to tree full UPGMA tree and write support file and newick
             tree with support values as node labels.

        master_tree can be 'full' or 'consensus', default full
    """
    # Prepare some variables for the later steps
    if master_tree is None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    # Create our own logger (and close it on success) only when the caller
    # didn't supply one.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str, tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = 'beta_diversity.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)
    commands.append([
        ('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics),
         beta_div_cmd)
    ])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
        'multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
        (otu_table_fp, seqs_per_sample, rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,
                                            otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = 'upgma_cluster.py -i %s -o %s %s' %\
            (distance_matrix_fp, full_tree_fp, params_str)
        commands.append([
            ('UPGMA on full distance matrix: %s' % beta_diversity_metric,
             hierarchical_cluster_cmd)
        ])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories.
        # BUG FIX: the previous code assigned params_str = {} on KeyError
        # (wrong type, immediately overwritten) and raised NameError on
        # get_params_str(d) when params had no 'beta_diversity' entry at
        # all; now d simply defaults to an empty dict.
        try:
            d = params['beta_diversity'].copy()
        except KeyError:
            d = {}
        d.pop('metrics', None)
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str, tree_fp)
        if parallel:
            # NOTE(review): params['parallel'] is accessed unguarded here,
            # unlike the KeyError-tolerant lookups elsewhere -- confirm a
            # 'parallel' section is always present when parallel=True.
            params_str += ' %s' % get_params_str(params['parallel'])
            # Build the parallel beta diversity command (for rarefied OTU
            # tables)
            beta_div_rarefied_cmd = \
                'parallel_beta_diversity.py -T -i %s -o %s %s' %\
                (rarefaction_dir, dm_dir, params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU
            # tables)
            beta_div_rarefied_cmd = \
                'beta_diversity.py -i %s -o %s %s' %\
                (rarefaction_dir, dm_dir, params_str)
        commands.append([('Beta diversity on rarefied OTU tables (%s)' %
                          beta_diversity_metric, beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
            'upgma_cluster.py -i %s -o %s %s' % (dm_dir, upgma_dir,
                                                 params_str)
        commands.append([
            ('UPGMA on rarefied distance matrix (%s)' %
             beta_diversity_metric, hierarchical_cluster_cmd)
        ])

        # Build the consensus tree command
        # NOTE(review): params_str here still holds the upgma_cluster
        # options (consensus_tree has no params section of its own);
        # preserved as-is -- confirm this is intentional.
        consensus_tree_cmd =\
            'consensus_tree.py -i %s -o %s %s' %\
            (upgma_dir, metric_output_dir + "/rare_upgma_consensus.tre",
             params_str)
        commands.append([('consensus on rarefied distance matrices (%s)' %
                          beta_diversity_metric, consensus_tree_cmd)])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''
        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' %
                               (master_tree, ))
        tree_compare_cmd = 'tree_compare.py -s %s -m %s -o %s %s' %\
            (upgma_dir, master_tree_fp, tree_compare_dir, params_str)
        commands.append([('Tree compare (%s)' % beta_diversity_metric,
                          tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = 'principal_coordinates.py -i %s -o %s %s' %\
            (dm_dir, pcoa_dir, params_str)
        commands.append([('Principal coordinates (%s)' %
                          beta_diversity_metric, pcoa_cmd)])

        # Prep the emperor plots command
        emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir
        create_dir(emperor_dir)
        try:
            params_str = get_params_str(params['make_emperor'])
        except KeyError:
            params_str = ''
        emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\
            (pcoa_dir, emperor_dir, mapping_fp, params_str)
        commands.append([('emperor plots (%s)' % beta_diversity_metric,
                          emperor_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_emperor_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots

        The steps performed by this function are:
          1) Compute a beta diversity distance matrix for each metric
          2) Peform a principal coordinates analysis on the result of step 1
          3) Generate an emperor plot for each result of step 2

        Returns a list of (metric, distance_matrix_fp) tuples.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    # Create our own logger (and close it on success) only when the caller
    # didn't supply one.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
        parse_mapping_file(open(mapping_fp, 'U'))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename,
             sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            'single_rarefaction.py -i %s -o %s -d %d' %\
            (otu_table_fp, even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        # All remaining steps operate on the rarefied table.
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command; the 'metrics' entry is stripped
        # from the params copy because each metric is passed explicitly.
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                # BUG FIX: a separating space is required here -- without it
                # the last beta_diversity option and the first parallel
                # option were concatenated into one token when no tree was
                # supplied (sibling workflows already use this ' %s' form).
                params_str += ' %s' % get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = 'parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (otu_table_fp, output_dir, beta_diversity_metric, params_str)
            commands.append([('Beta Diversity (%s)' % beta_diversity_metric,
                              beta_div_cmd)])
        else:
            beta_div_cmd = 'beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (otu_table_fp, output_dir, beta_diversity_metric, params_str)
            commands.append([('Beta Diversity (%s)' % beta_diversity_metric,
                              beta_div_cmd)])

        # Rename the metric-and-table-derived output to a stable name.
        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = 'principal_coordinates.py -i %s -o %s %s' %\
            (beta_div_fp, pc_fp, params_str)
        commands.append([('Principal coordinates (%s)' %
                          beta_diversity_metric, pc_cmd)])

        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir,
                                                        beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the emperor plot command
            emperor_command = \
                'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp,
                                                          emperor_dir,
                                                          mapping_fp,
                                                          params_str)
            # BUG FIX: label previously read 'Make emperor plots, %s)'
            # (unbalanced parenthesis in the logged step name).
            commands.append([
                ('Make emperor plots (%s)' % beta_diversity_metric,
                 emperor_command)
            ])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
def run_alpha_rarefaction(otu_table_fp, mapping_fp, output_dir, command_handler, params, qiime_config, tree_fp=None, num_steps=10, parallel=False, logger=None, min_rare_depth=10, max_rare_depth=None, suppress_md5=False, status_update_callback=print_to_stdout, plot_stderr_and_stddev=False, retain_intermediate_files=True): """ Run the data preparation steps of Qiime The steps performed by this function are: 1) Generate rarefied OTU tables; 2) Compute alpha diversity metrics for each rarefied OTU table; 3) Collate alpha diversity results; 4) Generate alpha rarefaction plots. """ # Prepare some variables for the later steps otu_table_dir, otu_table_filename = split(otu_table_fp) otu_table_basename, otu_table_ext = splitext(otu_table_filename) create_dir(output_dir) commands = [] if logger is None: logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config) close_logger_on_success = True else: close_logger_on_success = False if not suppress_md5: log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp]) if max_rare_depth is None: min_count, max_count, median_count, mean_count, counts_per_sample =\ compute_counts_per_sample_stats( load_table(otu_table_fp)) max_rare_depth = median_count step = int((max_rare_depth - min_rare_depth) / num_steps) or 1 max_rare_depth = int(max_rare_depth) rarefaction_dir = '%s/rarefaction/' % output_dir create_dir(rarefaction_dir) try: params_str = get_params_str(params['multiple_rarefactions']) except KeyError: params_str = '' if parallel: params_str += ' %s' % get_params_str(params['parallel']) # Build the rarefaction command rarefaction_cmd = \ 'parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\ (otu_table_fp, min_rare_depth, max_rare_depth, step, rarefaction_dir, params_str) else: # Build the rarefaction command rarefaction_cmd = \ 'multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\ (otu_table_fp, min_rare_depth, max_rare_depth, step, rarefaction_dir, params_str) 
commands.append([('Alpha rarefaction', rarefaction_cmd)]) # Prep the alpha diversity command alpha_diversity_dir = '%s/alpha_div/' % output_dir create_dir(alpha_diversity_dir) try: params_str = get_params_str(params['alpha_diversity']) except KeyError: params_str = '' if tree_fp: params_str += ' -t %s' % tree_fp if parallel: params_str += ' %s' % get_params_str(params['parallel']) # Build the alpha diversity command alpha_diversity_cmd = \ "parallel_alpha_diversity.py -T -i %s -o %s %s" %\ (rarefaction_dir, alpha_diversity_dir, params_str) else: # Build the alpha diversity command alpha_diversity_cmd = \ "alpha_diversity.py -i %s -o %s %s" %\ (rarefaction_dir, alpha_diversity_dir, params_str) commands.append([('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)]) # Prep the alpha diversity collation command alpha_collated_dir = '%s/alpha_div_collated/' % output_dir create_dir(alpha_collated_dir) try: params_str = get_params_str(params['collate_alpha']) except KeyError: params_str = '' # Build the alpha diversity collation command alpha_collated_cmd = 'collate_alpha.py -i %s -o %s %s' %\ (alpha_diversity_dir, alpha_collated_dir, params_str) commands.append([('Collate alpha', alpha_collated_cmd)]) if not retain_intermediate_files: commands.append([ ('Removing intermediate files', 'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir)) ]) else: commands.append([('Skipping removal of intermediate files.', '')]) # Prep the make rarefaction plot command(s) try: params_str = get_params_str(params['make_rarefaction_plots']) except KeyError: params_str = '' if 'std_type' in params[ 'make_rarefaction_plots'] or not plot_stderr_and_stddev: rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir create_dir(rarefaction_plot_dir) # Build the make rarefaction plot command(s) # for metric in alpha_diversity_metrics: make_rarefaction_plot_cmd =\ 'make_rarefaction_plots.py -i %s -m %s -o %s %s' %\ (alpha_collated_dir, mapping_fp, rarefaction_plot_dir, 
params_str) commands.append([('Rarefaction plot: %s' % 'All metrics', make_rarefaction_plot_cmd)]) else: rarefaction_plot_dir_stddev = '%s/alpha_rarefaction_plots_stddev/' % output_dir rarefaction_plot_dir_stderr = '%s/alpha_rarefaction_plots_stderr/' % output_dir create_dir(rarefaction_plot_dir_stddev) create_dir(rarefaction_plot_dir_stderr) # Build the make rarefaction plot command(s) # for metric in alpha_diversity_metrics: make_rarefaction_plot_cmd =\ 'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\ (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stddev, params_str) commands.append([('Rarefaction plot: %s' % 'All metrics', make_rarefaction_plot_cmd)]) make_rarefaction_plot_cmd =\ 'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\ (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stderr, params_str) commands.append([('Rarefaction plot: %s' % 'All metrics', make_rarefaction_plot_cmd)]) # Call the command handler on the list of commands command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
def run_beta_diversity_through_plots(otu_table_fp, mapping_fp, output_dir,
                                     command_handler, params, qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     histogram_categories=None,
                                     tree_fp=None, parallel=False,
                                     logger=None, suppress_3d_plots=False,
                                     suppress_2d_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
         1) Compute a beta diversity distance matrix;
         2) Peform a principal coordinates analysis on the result of Step 1;
         3) Generate a 3D prefs file for optimized coloring of continuous variables;
         4) Generate a 3D plot for all mapping fields with colors optimized for continuous data;
         5) Generate a 3D plot for all mapping fields with colors optimized for discrete data.

        Returns a list of (beta_diversity_metric, distance_matrix_fp) tuples.

        Commands are built as shell-command strings and executed via
        command_handler; nothing is computed in-process here.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # Only close the logger on success when we created it ourselves; a
    # caller-supplied logger remains the caller's responsibility.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    # Close the mapping file handle deterministically (was previously
    # left open until garbage collection).
    with open(mapping_fp, 'U') as mapping_f:
        mapping_data, mapping_header, mapping_comments =\
            parse_mapping_file(mapping_f)
    # Fail early if the user asked for histograms of unknown categories.
    if histogram_categories:
        invalid_categories = set(histogram_categories) - set(mapping_header)
        if invalid_categories:
            # Parenthesized raise form: valid in both Python 2 and 3, and
            # consistent with the other workflow functions in this module.
            raise ValueError(
                "Invalid histogram categories - these must exactly match " +
                "mapping file column headers: %s" %
                (' '.join(invalid_categories)))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename, sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
            (python_exe_fp, script_dir, otu_table_fp,
             even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)])
        # All downstream steps operate on the evenly-sampled table.
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the 3d prefs file generator command
    prefs_fp = '%s/prefs.txt' % output_dir
    try:
        params_str = get_params_str(params['make_prefs_file'])
    except KeyError:
        params_str = ''
    # BUG FIX: the original indexed params['make_prefs_file'] unguarded here,
    # raising KeyError when no make_prefs_file section exists even though the
    # try/except above explicitly anticipates that case. Use .get() instead.
    if 'mapping_headers_to_use' not in params.get('make_prefs_file', {}):
        params_str = '%s --mapping_headers_to_use %s' \
            % (params_str, mapping_fields)
    # Build the 3d prefs file generator command
    prefs_cmd = \
        '%s %s/make_prefs_file.py -m %s -o %s %s' %\
        (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
    commands.append([('Build prefs file', prefs_cmd)])

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:
        # Prep the beta-diversity command; the 'metrics' entry is consumed
        # here (one command per metric) so it must not be forwarded as a
        # script option.
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass
        params_str = get_params_str(bdiv_params_copy)
        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)
        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (python_exe_fp, script_dir, otu_table_fp, output_dir,
                 beta_diversity_metric, params_str)
            commands.append(
                [('Beta Diversity (%s)' % beta_diversity_metric,
                  beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (python_exe_fp, script_dir, otu_table_fp, output_dir,
                 beta_diversity_metric, params_str)
            commands.append(
                [('Beta Diversity (%s)' % beta_diversity_metric,
                  beta_div_cmd)])

        # beta_diversity.py names its output after metric + input table;
        # rename to the <metric>_dm.txt convention used downstream.
        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(
            [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate 3d plots
        if not suppress_3d_plots:
            # Prep the continuous-coloring 3d plots command
            continuous_3d_dir = '%s/%s_3d_continuous/' %\
                (output_dir, beta_diversity_metric)
            create_dir(continuous_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            continuous_3d_command = \
                '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, prefs_fp, pc_fp,
                 continuous_3d_dir, mapping_fp, params_str)

            # Prep the discrete-coloring 3d plots command
            discrete_3d_dir = '%s/%s_3d_discrete/' %\
                (output_dir, beta_diversity_metric)
            create_dir(discrete_3d_dir)
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 3d plots command
            discrete_3d_command = \
                '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, mapping_fields, pc_fp,
                 discrete_3d_dir, mapping_fp, params_str)

            commands.append([
                ('Make 3D plots (continuous coloring, %s)' %
                 beta_diversity_metric, continuous_3d_command),
                ('Make 3D plots (discrete coloring, %s)' %
                 beta_diversity_metric, discrete_3d_command,)])

        # Generate 2d plots
        if not suppress_2d_plots:
            # Prep the continuous-coloring 2d plots command
            continuous_2d_dir = '%s/%s_2d_continuous/' %\
                (output_dir, beta_diversity_metric)
            create_dir(continuous_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 2d plots command
            continuous_2d_command = \
                '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, prefs_fp, pc_fp,
                 continuous_2d_dir, mapping_fp, params_str)

            # Prep the discrete-coloring 2d plots command
            discrete_2d_dir = '%s/%s_2d_discrete/' %\
                (output_dir, beta_diversity_metric)
            create_dir(discrete_2d_dir)
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''
            # Build the discrete-coloring 2d plots command
            discrete_2d_command = \
                '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, mapping_fields, pc_fp,
                 discrete_2d_dir, mapping_fp, params_str)

            commands.append([
                ('Make 2D plots (continuous coloring, %s)' %
                 beta_diversity_metric, continuous_2d_command),
                ('Make 2D plots (discrete coloring, %s)' %
                 beta_diversity_metric, discrete_2d_command,)])

        if histogram_categories:
            # Prep the distance histograms command
            histograms_dir = '%s/%s_histograms/' %\
                (output_dir, beta_diversity_metric)
            create_dir(histograms_dir)
            try:
                params_str = get_params_str(params['make_distance_histograms'])
            except KeyError:
                params_str = ''
            # Build the make_distance_histograms command
            distance_histograms_command = \
                '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
                (python_exe_fp, script_dir, beta_div_fp, histograms_dir,
                 mapping_fp, ','.join(histogram_categories), params_str)
            commands.append([
                ('Make Distance Histograms (%s)' % beta_diversity_metric,
                 distance_histograms_command)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return dm_fps
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_group_significance=False,
    status_update_callback=print_to_stdout,
):
    """Run the QIIME core diversity analyses workflow.

    Filters and rarefies the input BIOM table, then (unless suppressed)
    runs beta diversity, alpha rarefaction, taxa summaries, and group
    significance, writing an index.html linking all results. The workflow
    is resumable: each step is skipped when its output already exists,
    with a note written to the run log.

    categories, when provided, must be mapping-file column headers with at
    least two values each; a ValueError is raised otherwise.
    """
    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )
    else:
        categories = []
    comma_separated_categories = ",".join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []
    # begin logging; link previous run logs too so resumed runs keep history
    old_log_fps = glob(join(output_dir, "log_20*txt"))
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    for old_log_fp in old_log_fps:
        index_links.append(("Previous run log", old_log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)
    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params["biom-summarize-table"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = "biom summarize-table -i %s -o %s --suppress-md5 %s" % (
            biom_fp,
            biom_table_stats_output_fp,
            params_str,
        )
        commands.append([("Generate BIOM table summary", biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" % biom_table_stats_output_fp)
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))
    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
            biom_fp,
            filtered_biom_fp,
            sampling_depth,
        )
        commands.append(
            [
                (
                    "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                    filter_samples_cmd,
                )
            ]
        )
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" % filtered_biom_fp)
    # downstream steps operate on the filtered table
    biom_fp = filtered_biom_fp
    # rarify the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" % (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([("Rarify the OTU table to %d sequences/sample" % sampling_depth, single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" % rarefied_biom_fp)
    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        commands = []
    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob("%s/*_dm.txt" % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarify the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" % ", ".join(existing_dm_fps))
            # BUG FIX: was split(fp)[1].strip("_dm.txt") — str.strip treats
            # its argument as a character SET, so metric names ending in any
            # of '_dmtx.' were over-stripped (e.g. 'abund_jaccard' ->
            # 'abund_jaccar'). Slice the fixed-length suffix off instead.
            even_dm_fps = [(split(fp)[1][:-len("_dm.txt")], fp) for fp in existing_dm_fps]
        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params["make_distance_boxplots"])
        except KeyError:
            params_str = ""
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = "%s/%s_Distances.pdf" % (boxplots_output_dir, category)
                stats_output_fp = "%s/%s_Stats.txt" % (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                        dm_fp,
                        category,
                        boxplots_output_dir,
                        mapping_fp,
                        params_str,
                    )
                    commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n" % (category, plot_output_fp)
                    )
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        plot_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        stats_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
            index_links.append(
                (
                    "PCoA plot (%s)" % bdiv_metric,
                    "%s/%s_emperor_pcoa_plot/index.html" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False,
            )
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" % rarefaction_plots_output_fp)
        index_links.append(("Alpha rarefaction plots", rarefaction_plots_output_fp, _index_headers["alpha_diversity"]))
        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""
        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = "%s/compare_%s" % (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                        collated_alpha_diversity_fp,
                        mapping_fp,
                        comma_separated_categories,
                        compare_alpha_output_dir,
                        params_str,
                    )
                    commands.append([("Compare alpha diversity (%s)" % alpha_metric, compare_alpha_cmd)])
                    for category in categories:
                        alpha_comparison_stat_fp = "%s/%s_stats.txt" % (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = "%s/%s_boxplots.pdf" % (compare_alpha_output_dir, category)
                        index_links.append(
                            (
                                "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                                alpha_comparison_stat_fp,
                                _index_headers["alpha_diversity"],
                            )
                        )
                        index_links.append(
                            (
                                "Alpha diversity boxplots (%s, %s)" % (category, alpha_metric),
                                alpha_comparison_boxplot_fp,
                                _index_headers["alpha_diversity"],
                            )
                        )
                else:
                    logger.write(
                        "Skipping compare_alpha_diversity.py"
                        " for %s as %s exists.\n\n" % (alpha_metric, compare_alpha_output_dir)
                    )
        else:
            logger.write("Skipping compare_alpha_diversity.py as" " no categories were provided.\n\n")
    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(join(taxa_plots_output_dir, "taxa_summary_plots", "*.html"))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ", ".join(existing_taxa_plot_html_fps)
            )
        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob("%s/taxa_summary_plots/*.html" % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback,
                )
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ", ".join(existing_taxa_plot_html_fps))
                )
            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
    if not suppress_group_significance:
        # BUG FIX: this was the only params lookup in the function without a
        # KeyError guard; a parameters file lacking a group_significance
        # section crashed the workflow here.
        try:
            params_str = get_params_str(params["group_significance"])
        except KeyError:
            params_str = ""
        # group significance tests, aka category significance
        for category in categories:
            group_signifance_fp = "%s/group_significance_%s.txt" % (output_dir, category)
            if not exists(group_signifance_fp):
                # Build the OTU cateogry significance command
                group_significance_cmd = "group_significance.py -i %s -m %s -c %s -o %s %s" % (
                    rarefied_biom_fp,
                    mapping_fp,
                    category,
                    group_signifance_fp,
                    params_str,
                )
                commands.append([("Group significance (%s)" % category, group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" % (category, group_signifance_fp)
                )
            index_links.append(
                ("Category significance (%s)" % category, group_signifance_fp, _index_headers["group_significance"])
            )
    filtered_biom_gzip_fp = "%s.gz" % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" % filtered_biom_gzip_fp)
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            filtered_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )
    rarified_biom_gzip_fp = "%s.gz" % rarefied_biom_fp
    if not exists(rarified_biom_gzip_fp):
        commands.append([("Compress the rarified BIOM table", "gzip %s" % rarefied_biom_fp)])
    else:
        logger.write("Skipping compressing of rarified BIOM table as %s exists.\n\n" % rarified_biom_gzip_fp)
    index_links.append(
        (
            "Rarified BIOM table (sampling depth: %d)" % sampling_depth,
            rarified_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        # nothing left to run; close the log ourselves since the handler won't
        logger.close()
    generate_index_page(index_links, index_fp)
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout,
):
    """Run the (legacy) QIIME core diversity analyses workflow.

    Filters the input BIOM table to sampling_depth, then (unless suppressed)
    runs beta diversity through plots, alpha rarefaction, taxa summaries, and
    OTU category significance, and writes an index.html linking all outputs.

    NOTE(review): unlike the sibling implementation in this file, this
    version has no resume support -- every step is re-run unconditionally.
    Uses Python 2 `raise X, (...)` syntax.
    """
    if categories != None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = parse_mapping_file_to_dict(open(mapping_fp, "U"))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            # Category must be an exact mapping-file column header ...
            if c not in metadata_map.CategoryNames:
                raise ValueError, (
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            # ... and must have at least two distinct values to be comparable.
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, (
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )
    else:
        categories = []
    # prep some variables
    if params == None:
        params = parse_qiime_parameters([])
    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []  # (link text, path, index section header) triples
    commands = []  # batches of (description, shell command) for command_handler
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp != None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)
    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params["print_biom_table_summary"])
    except KeyError:
        # no section in the parameters file -> run with defaults
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    print_biom_table_summary_cmd = "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % (
        biom_fp,
        biom_table_stats_output_fp,
        params_str,
    )
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))
    commands.append([("Generate BIOM table summary", print_biom_table_summary_cmd)])
    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
        biom_fp,
        filtered_biom_fp,
        sampling_depth,
    )
    commands.append(
        [
            (
                "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                filter_samples_cmd,
            )
        ]
    )
    # all downstream analyses operate on the filtered table
    biom_fp = filtered_biom_fp
    # run initial commands and reset the command list
    command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
    commands = []
    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=bdiv_even_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            sampling_depth=sampling_depth,
            # force suppression of distance histograms - boxplots work better
            # in this context, and are created below.
            histogram_categories=[],
            tree_fp=tree_fp,
            parallel=parallel,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )
        # one set of boxplots per (metric, category); plot/index links per metric
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                try:
                    params_str = get_params_str(params["make_distance_boxplots"])
                except KeyError:
                    params_str = ""
                boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                    dm_fp,
                    category,
                    boxplots_output_dir,
                    mapping_fp,
                    params_str,
                )
                commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        "%s/%s_Distances.pdf" % (boxplots_output_dir, category),
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        "%s/%s_Stats.txt" % (boxplots_output_dir, category),
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
            index_links.append(
                (
                    "3D plot (%s, continuous coloring)" % bdiv_metric,
                    "%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "3D plot (%s, discrete coloring)" % bdiv_metric,
                    "%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "2D plot (%s, continuous coloring)" % bdiv_metric,
                    "%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "2D plot (%s, discrete coloring)" % bdiv_metric,
                    "%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html" % (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        run_alpha_rarefaction(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=arare_full_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            tree_fp=tree_fp,
            num_steps=arare_num_steps,
            parallel=parallel,
            logger=logger,
            min_rare_depth=arare_min_rare_depth,
            max_rare_depth=sampling_depth,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )
        index_links.append(
            (
                "Alpha rarefaction plots",
                "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir,
                _index_headers["alpha_diversity"],
            )
        )
        # one compare_alpha_diversity run per (category, collated metric file)
        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                # metric name is the collated file's basename without extension
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = "%s/%s_%s.txt" % (arare_full_output_dir, category, alpha_metric)
                compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                    collated_alpha_diversity_fp,
                    mapping_fp,
                    category,
                    alpha_comparison_output_fp,
                    params_str,
                )
                commands.append([("Compare alpha diversity (%s, %s)" % (category, alpha_metric), compare_alpha_cmd)])
                index_links.append(
                    (
                        "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                        alpha_comparison_output_fp,
                        _index_headers["alpha_diversity"],
                    )
                )
    if not suppress_taxa_summary:
        # whole-run taxa summary (no category grouping)
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        run_summarize_taxa_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=taxa_plots_output_dir,
            mapping_cat=None,
            sort=True,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback,
        )
        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        # per-category taxa summaries, samples collapsed by category value
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=category,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_signifance_fp = "%s/category_significance_%s.txt" % (output_dir, category)
            try:
                params_str = get_params_str(params["otu_category_significance"])
            except KeyError:
                params_str = ""
            # Build the OTU cateogry significance command
            category_significance_cmd = "otu_category_significance.py -i %s -m %s -c %s -o %s %s" % (
                biom_fp,
                mapping_fp,
                category,
                category_signifance_fp,
                params_str,
            )
            commands.append([("OTU category significance (%s)" % category, category_significance_cmd)])
            index_links.append(
                ("Category significance (%s)" % category, category_signifance_fp, _index_headers["otu_category_sig"])
            )
    # gzip the filtered table to save space; link the compressed copy
    commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            "%s.gz" % filtered_biom_fp,
            _index_headers["run_summary"],
        )
    )
    # run all remaining queued commands, then emit the HTML index
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links, index_fp)
def assign_tax(repset_fasta_fp,
               output_dir,
               command_handler,
               params,
               qiime_config,
               parallel=False,
               logger=None,
               status_update_callback=print_to_stdout):
    """Assign taxonomy to a representative-set FASTA file.

    Builds a single taxonomy-assignment command (serial, or parallel for
    the rdp/blast/uclust methods), hands it to ``command_handler`` for
    execution, and returns the path where the tax-assignments file will
    be written by that command.
    """
    _, repset_filename = split(repset_fasta_fp)
    repset_basename, _ = splitext(repset_filename)

    # We only own (and therefore close on success) a logger we created.
    close_logger_on_success = logger is None
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    # Assignment method defaults to uclust when not given in params.
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'

    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' % (output_dir,
                                                       assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % (assign_taxonomy_dir,
                                                 repset_basename)

    if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
        # Parallel-run parameters, when a parallel section was supplied.
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        try:
            # The parallel script is method-specific, so it does not accept
            # an --assignment_method option; strip it before forwarding the
            # remaining assign_taxonomy parameters.
            method_params = params['assign_taxonomy'].copy()
            if 'assignment_method' in method_params:
                del method_params['assignment_method']
            params_str += ' %s' % get_params_str(method_params)
        except KeyError:
            pass
        assign_taxonomy_cmd = \
            'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
            (assignment_method,
             repset_fasta_fp,
             assign_taxonomy_dir,
             params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
            (assign_taxonomy_dir, repset_fasta_fp, params_str)

    # Start from a clean output directory so stale results from an earlier
    # run cannot leak into this one.
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)

    commands = [[('Assign taxonomy', assign_taxonomy_cmd)]]

    # Run the single-command workflow.
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick OTUs;
          2) Build an OTU table with optional pre-defined taxonomy.

        All work is expressed as shell commands and executed via
        command_handler; nothing is returned.
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # Own the logger (and close it on success) only when we created it.
    # (fixed: identity comparison with None, was `logger == None`)
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    # Every supported reference-based picker has a parallel variant, so a
    # membership test replaces the original chained equality comparisons.
    if parallel and otu_picking_method in reference_otu_picking_methods:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # The parallel script is method-specific, so it doesn't take an
            # --otu_picking_method option; strip it before forwarding.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -r %s -T %s' %\
            (python_exe_fp,
             script_dir,
             otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (python_exe_fp,
             script_dir,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # taxonomy_fp is optional; only pass -t when one was provided.
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s %s -o %s %s' %\
        (python_exe_fp,
         script_dir,
         otu_fp,
         taxonomy_str,
         otu_table_fp,
         params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def pick_reference_otus(input_fp,
                        output_dir,
                        otu_picking_method,
                        refseqs_fp,
                        parallel,
                        params,
                        logger,
                        similarity_override=None):
    """Build (but do not execute) a reference-based OTU picking command.

    Returns the command string for the serial or parallel picker. When
    similarity_override is given, it replaces any pick_otus:similarity
    setting (used for pre-filtering).

    Raises WorkflowError if pick_otus:refseqs_fp is present in params,
    since the reference set may only be supplied via refseqs_fp.
    """
    # Work on a deep copy so the caller's params are never mutated.
    params_copy = deepcopy(params)
    if 'pick_otus' in params_copy and 'refseqs_fp' in params_copy['pick_otus']:
        raise WorkflowError(
            "Cannot pass pick_otus:refseqs_fp in parameters file. This can only be"
            " passed on the command line or through the API.")

    if similarity_override is not None:
        logger.write('Similiarity of %1.3f being used for pre-filtering.\n'
                     % similarity_override)
        if 'pick_otus' in params_copy:
            params_copy['pick_otus']['similarity'] = str(similarity_override)
        else:
            params_copy['pick_otus'] = {'similarity': str(similarity_override)}

    if parallel and (otu_picking_method == 'uclust_ref' or
                     otu_picking_method == "sortmerna"):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params_copy['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # The parallel script is method-specific, so it doesn't take an
            # --otu_picking_method option; strip it before forwarding.
            pick_otus_params = params_copy['pick_otus']
            if 'otu_picking_method' in pick_otus_params:
                del pick_otus_params['otu_picking_method']
            # BUG FIX: this formatting call previously sat outside the
            # try/except, so a parameters file without a pick_otus section
            # (and no similarity_override) raised an uncaught KeyError.
            params_str += ' %s' % get_params_str(pick_otus_params)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script,
             input_fp,
             output_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params_copy['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp,
             output_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    return pick_otus_cmd
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        assign_taxonomy=False,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick OTUs;
          2) If assignment_taxonomy is True, choose representative sequence
             for OTUs and assign taxonomy using a classifier.
          3) Build an OTU table with optional predefined taxonomy
             (if assign_taxonomy=False) or taxonomic assignments from step 2
             (if assign_taxonomy=True).

        All steps are built as shell-command strings and executed through
        command_handler; nothing is returned.
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref',
                                     'usearch_ref', 'sortmerna']

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    # NOTE(review): assert is stripped under `python -O`; an explicit raise
    # would be more robust for input validation, but callers may rely on
    # AssertionError here, so it is left unchanged.
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    # Only close the logger on success if this function created it.
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref' or
                     otu_picking_method == 'usearch61_ref' or
                     otu_picking_method == 'sortmerna'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is "
            "closed-reference OTU picking.\n\n")

        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Assign taxonomy using a taxonomy classifier, if request by the user.
    # (Alternatively predefined taxonomic assignments will be used, if provided.)
    if assign_taxonomy:
        # Prep the representative set picking command
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)

        try:
            params_str = get_params_str(params['pick_rep_set'])
        except KeyError:
            params_str = ''

        # Build the representative set picking command
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
            (otu_fp,
             input_fp,
             rep_set_log_fp,
             rep_set_fp,
             params_str)
        commands.append([('Pick representative set', pick_rep_set_cmd)])

        # Prep the taxonomy assignment command; method defaults to uclust.
        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
            (output_dir, assignment_method)
        # This rebinds the taxonomy_fp parameter: downstream, the OTU table
        # is built from these classifier results instead of any
        # user-supplied taxonomy file.
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
            (assign_taxonomy_dir, input_basename)
        if parallel and (assignment_method == 'rdp' or
                         assignment_method == 'blast' or
                         assignment_method == 'uclust'):
            # Grab the parallel-specific parameters
            try:
                params_str = get_params_str(params['parallel'])
            except KeyError:
                params_str = ''

            # Grab the taxonomy assignment parameters
            try:
                # Want to find a cleaner strategy for this: the parallel script
                # is method-specific, so doesn't take a --assignment_method
                # option. This works for now though.
                d = params['assign_taxonomy'].copy()
                if 'assignment_method' in d:
                    del d['assignment_method']
                params_str += ' %s' % get_params_str(d)
            except KeyError:
                pass

            # Build the parallel taxonomy assignment command
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
                (assignment_method, rep_set_fp, assign_taxonomy_dir, params_str)
        else:
            try:
                params_str = get_params_str(params['assign_taxonomy'])
            except KeyError:
                params_str = ''
            # Build the taxonomy assignment command
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
                (assign_taxonomy_dir, rep_set_fp, params_str)

        commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # If assign_taxonomy is True, this will be the path to the taxonomic
    # assignment results. If assign_taxonomy is False this will be either
    # the precomputed taxonomic assignments that the user passed in,
    # or None.
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' %\
        (otu_fp,
         taxonomy_str,
         otu_table_fp,
         params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_core_diversity_analyses(
        biom_fp,
        mapping_fp,
        sampling_depth,
        output_dir,
        qiime_config,
        command_handler=call_commands_serially,
        tree_fp=None,
        params=None,
        categories=None,
        arare_min_rare_depth=10,
        arare_num_steps=10,
        parallel=False,
        suppress_taxa_summary=False,
        suppress_beta_diversity=False,
        suppress_alpha_diversity=False,
        suppress_group_significance=False,
        status_update_callback=print_to_stdout):
    """Run QIIME's core diversity analyses workflow.

    Summarizes and filters the input BIOM table, rarifies it to
    sampling_depth, then (unless suppressed) runs beta diversity, alpha
    rarefaction, taxa summaries, and group significance, writing an
    index.html linking all results into output_dir. Steps whose output
    files already exist are skipped, so the workflow is re-runnable.

    Raises ValueError if any requested category is missing from the
    mapping file or has only a single value.
    """
    if categories is not None:
        # Validate categories provided by the users
        with open(mapping_fp, 'U') as mapping_f:
            # (fixed: the mapping file handle was previously never closed)
            mapping_data, mapping_comments = \
                parse_mapping_file_to_dict(mapping_f)
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" %
                    (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values."
                    % c)
    else:
        categories = []
    comma_separated_categories = ','.join(categories)
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(
            ('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n"
                     % biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([(
            'Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
            filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n"
            % filtered_biom_fp)
    # All downstream non-rarefied analyses use the filtered table.
    biom_fp = filtered_biom_fp

    # rarify the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" %\
            (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([
            ('Rarify the OTU table to %d sequences/sample' % sampling_depth,
             single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n"
                     % rarefied_biom_fp)

    # run initial commands and reset the command list
    if len(commands) > 0:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarify the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            even_dm_fps = [(split(fp)[1].strip('_dm.txt'), fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (
                    bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (boxplots_output_dir,
                                                          category)
                stats_output_fp = '%s/%s_Stats.txt' % (boxplots_output_dir,
                                                       category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir, mapping_fp,
                         params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric,
                     plot_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))

            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' % (bdiv_even_output_dir,
                                                         bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,
                                                     sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n"
                         % rarefaction_plots_output_fp)

        index_links.append(
            ('Alpha rarefaction plots',
             rarefaction_plots_output_fp,
             _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = '%s/compare_%s' % \
                    (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp,
                         mapping_fp,
                         comma_separated_categories,
                         compare_alpha_output_dir,
                         params_str)
                    commands.append([
                        ('Compare alpha diversity (%s)' % alpha_metric,
                         compare_alpha_cmd)])
                    for category in categories:
                        alpha_comparison_stat_fp = '%s/%s_stats.txt' % \
                            (compare_alpha_output_dir, category)
                        alpha_comparison_boxplot_fp = '%s/%s_boxplots.pdf' % \
                            (compare_alpha_output_dir, category)
                        index_links.append(
                            ('Alpha diversity statistics (%s, %s)'
                             % (category, alpha_metric),
                             alpha_comparison_stat_fp,
                             _index_headers['alpha_diversity']))
                        index_links.append(
                            ('Alpha diversity boxplots (%s, %s)'
                             % (category, alpha_metric),
                             alpha_comparison_boxplot_fp,
                             _index_headers['alpha_diversity']))
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n"
                                 % (alpha_metric, compare_alpha_output_dir))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))

        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,
                                                           category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob(
                '%s/taxa_summary_plots/*.html' % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_group_significance:
        # BUG FIX: this was the only params lookup in the workflow not
        # guarded by try/except KeyError, so a parameters file without a
        # group_significance section crashed the whole run.
        try:
            params_str = get_params_str(params['group_significance'])
        except KeyError:
            params_str = ''

        # group significance tests, aka category significance
        for category in categories:
            group_signifance_fp = \
                '%s/group_significance_%s.txt' % (output_dir, category)
            if not exists(group_signifance_fp):
                # Build the OTU cateogry significance command
                group_significance_cmd = \
                    'group_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (rarefied_biom_fp, mapping_fp, category,
                     group_signifance_fp, params_str)
                commands.append([('Group significance (%s)' % category,
                                  group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n"
                    % (category, group_signifance_fp))

            index_links.append(
                ('Category significance (%s)' % category,
                 group_signifance_fp,
                 _index_headers['group_significance']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of filtered BIOM table as %s exists.\n\n"
            % filtered_biom_gzip_fp)
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp,
         _index_headers['run_summary']))

    rarified_biom_gzip_fp = '%s.gz' % rarefied_biom_fp
    if not exists(rarified_biom_gzip_fp):
        commands.append([('Compress the rarified BIOM table',
                          'gzip %s' % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of rarified BIOM table as %s exists.\n\n"
            % rarified_biom_gzip_fp)
    index_links.append(
        ('Rarified BIOM table (sampling depth: %d)' % sampling_depth,
         rarified_biom_gzip_fp,
         _index_headers['run_summary']))

    # Run any remaining commands; if there are none, close the logger
    # ourselves since command_handler won't get the chance to.
    if len(commands) > 0:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
def run_alpha_rarefaction(otu_table_fp,
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

        When max_rare_depth is None, the median per-sample sequence count
        of the input table is used as the maximum rarefaction depth.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # Own the logger (and close it on success) only when we created it.
    # (fixed: identity comparison with None, was `logger == None`)
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth is None:
        # Default the maximum depth to the median per-sample count.
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(
                parse_biom_table(open(otu_table_fp, 'U')))
        max_rare_depth = median_count
    # `or 1` guards against a zero step when the depth range is smaller
    # than num_steps.
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)

    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            '%s %s/multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (python_exe_fp, script_dir, otu_table_fp, min_rare_depth,
             max_rare_depth, step, rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += ' %s' % get_params_str(params['parallel'])
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "%s %s/alpha_diversity.py -i %s -o %s %s" %\
            (python_exe_fp, script_dir, rarefaction_dir,
             alpha_diversity_dir, params_str)
    commands.append(
        [('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = '%s %s/collate_alpha.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, alpha_diversity_dir,
         alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    if not retain_intermediate_files:
        commands.append([('Removing intermediate files',
                          'rm -r %s %s' % (rarefaction_dir,
                                           alpha_diversity_dir))])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])

    # Prep the make rarefaction plot command(s)
    try:
        params_str = get_params_str(params['make_rarefaction_plots'])
    except KeyError:
        params_str = ''

    # BUG FIX: the membership test previously indexed
    # params['make_rarefaction_plots'] directly, raising KeyError when the
    # parameters file had no such section; .get with a default avoids that.
    if ('std_type' in params.get('make_rarefaction_plots', {}) or
            not plot_stderr_and_stddev):
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)

        # Build the make rarefaction plot command(s)
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])
    else:
        rarefaction_plot_dir_stddev = \
            '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = \
            '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)

        # Build one plot command per error-bar type.
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stddev, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])
        make_rarefaction_plot_cmd =\
            '%s %s/make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (python_exe_fp, script_dir, alpha_collated_dir, mapping_fp,
             rarefaction_plot_dir_stderr, params_str)
        commands.append(
            [('Rarefaction plot: %s' % 'All metrics',
              make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_pick_de_novo_otus(input_fp, output_dir, command_handler, params, qiime_config, parallel=False, logger=None, suppress_md5=False, status_update_callback=print_to_stdout):
    """Build and run the de novo OTU picking workflow as a list of shell commands.

    The steps performed by this function are:
      1) Pick OTUs;
      2) Pick a representative set;
      3) Align the representative set;
      4) Assign taxonomy;
      5) Filter the alignment prior to tree building - remove positions
         which are all gaps, and specified as 0 in the lanemask;
      6) Build a phylogenetic tree;
      7) Build an OTU table.

    Parameters:
      input_fp: path to the input sequence (fasta) file.
      output_dir: directory where all step outputs are written (created here).
      command_handler: callable that executes the accumulated command list.
      params: dict of per-script parameter dicts (as parsed from a QIIME
        parameters file); missing sections simply yield empty option strings.
      qiime_config: QIIME config dict; 'python_exe_fp' is read here.
      parallel: if True, use the parallel variants of OTU picking, taxonomy
        assignment, and alignment where the chosen method supports them.
      logger: existing WorkflowLogger, or None to create one (in which case
        it is closed when the command handler finishes successfully).
      suppress_md5: if True, skip logging the MD5 of the input file.
      status_update_callback: progress-reporting callable passed through to
        the command handler.

    Returns:
      (abspath of the tree file, abspath of the OTU table file).
    """
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    # Set when parallel uclust_ref runs without suppress_new_clusters: the
    # reads that fail to hit the reference are clustered de novo afterwards.
    cluster_failures = False
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we own its lifetime.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp])

    # Prep the OTU picking command
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust'
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and (otu_picking_method == 'blast' or
                     otu_picking_method == 'uclust_ref'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            del d['otu_picking_method']
        except KeyError:
            pass

        if otu_picking_method == 'uclust_ref':
            try:
                # If the user asked to suppress new clusters, keep that flag
                # out of the generic param string and do NOT cluster failures.
                suppress_new_clusters = d['suppress_new_clusters']
                del d['suppress_new_clusters']
                cluster_failures = False
            except KeyError:
                # No suppress flag: failures get clustered de novo below.
                cluster_failures = True
                failure_otu_picking_method = 'uclust'

        params_str += ' %s' % get_params_str(d)
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -T %s' % (python_exe_fp,
                                                        script_dir,
                                                        otu_picking_script,
                                                        input_fp,
                                                        pick_otu_dir,
                                                        params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, input_fp, pick_otu_dir, params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    if cluster_failures:
        # Cluster the reference-picking failures de novo, then merge the two
        # OTU maps so downstream steps see a single map.
        reference_otu_fp = otu_fp
        clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir

        try:
            d = params['pick_otus'].copy()
            del d['otu_picking_method']
        except KeyError:
            pass

        if 'uclust_otu_id_prefix' not in d:
            # Prefix keeps de novo OTU ids distinct from reference OTU ids.
            d['uclust_otu_id_prefix'] = 'DeNovoOTU'
        params_str = ' %s' % get_params_str(d)

        failures_list_fp = '%s/%s_failures.txt' % \
            (pick_otu_dir, input_basename)
        failures_fasta_fp = '%s/%s_failures.fasta' % \
            (pick_otu_dir, input_basename)

        filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, failures_list_fp, failures_fasta_fp)

        commands.append([('Generate failures fasta file',
                          filter_fasta_cmd)])

        # Prep the OTU picking command for
        failure_otu_fp = '%s/%s_failures_otus.txt' % (clustered_failures_dir, input_basename)
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -m %s %s' %\
            (python_exe_fp, script_dir, failures_fasta_fp,
             clustered_failures_dir, failure_otu_picking_method, params_str)

        commands.append([('Pick de novo OTUs for new clusters', pick_otus_cmd)])

        merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
            (reference_otu_fp, failure_otu_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Downstream steps (rep set picking, OTU table) use the merged map.
        otu_fp = merged_otu_map_fp

    # Prep the representative set picking command
    rep_set_dir = '%s/rep_set/' % output_dir
    create_dir(rep_set_dir)
    rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
    rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)

    try:
        params_str = get_params_str(params['pick_rep_set'])
    except KeyError:
        params_str = ''
    # Build the representative set picking command
    pick_rep_set_cmd = '%s %s/pick_rep_set.py -i %s -f %s -l %s -o %s %s' %\
        (python_exe_fp, script_dir, otu_fp, input_fp, rep_set_log_fp,
         rep_set_fp, params_str)
    commands.append([('Pick representative set', pick_rep_set_cmd)])

    # Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
        (output_dir, assignment_method)
    taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % \
        (assign_taxonomy_dir, input_basename)
    if parallel and (assignment_method == 'rdp' or
                     assignment_method == 'blast' or
                     assignment_method == 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the taxonomy assignment parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
            '%s %s/parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
            (python_exe_fp, script_dir, assignment_method, rep_set_fp,
             assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = '%s %s/assign_taxonomy.py -o %s -i %s %s' %\
            (python_exe_fp, script_dir, assign_taxonomy_dir,
             rep_set_fp, params_str)

    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
        (python_exe_fp, script_dir, otu_fp, taxonomy_fp, otu_table_fp,
         params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    if cluster_failures:
        # Also build a table restricted to the reference-matched OTUs.
        reference_otu_table_fp = '%s/reference_only_otu_table.biom' % output_dir
        # Build the OTU table building command
        make_otu_table_cmd = '%s %s/make_otu_table.py -i %s -t %s -o %s %s' %\
            (python_exe_fp, script_dir, reference_otu_fp, taxonomy_fp,
             reference_otu_table_fp, params_str)
        commands.append([('Make reference-only OTU table', make_otu_table_cmd)])

    # Prep the pynast alignment command
    try:
        alignment_method = params['align_seqs']['alignment_method']
    except KeyError:
        alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir, input_basename)
    if parallel and alignment_method == 'pynast':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the alignment parameters
        # Want to find a cleaner strategy for this: the parallel script
        # is method-specific, so doesn't take a --alignment_method
        # option. This works for now though.
        try:
            d = params['align_seqs'].copy()
        except KeyError:
            d = {}
        try:
            del d['alignment_method']
        except KeyError:
            pass
        params_str += ' %s' % get_params_str(d)

        # Build the parallel pynast alignment command
        align_seqs_cmd = '%s %s/parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
            (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = '%s %s/align_seqs.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, rep_set_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    # Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' %\
        (pynast_dir, input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = '%s %s/filter_alignment.py -o %s -i %s %s' %\
        (python_exe_fp, script_dir, pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = '%s %s/make_phylogeny.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, filtered_aln_fp, tree_fp,
         params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return abspath(tree_fp), abspath(otu_table_fp)
def run_beta_diversity_through_plots(otu_table_fp, mapping_fp, output_dir, command_handler, params, qiime_config, color_by_interesting_fields_only=True, sampling_depth=None, tree_fp=None, parallel=False, logger=None, suppress_emperor_plots=False, suppress_md5=False, status_update_callback=print_to_stdout):
    """Compute beta diversity distance matrices, run PCoA, and generate emperor plots

    The steps performed by this function are:
     1) Compute a beta diversity distance matrix for each metric
     2) Peform a principal coordinates analysis on the result of step 1
     3) Generate an emperor plot for each result of step 2

    Parameters:
      otu_table_fp: path to the input BIOM OTU table.
      mapping_fp: path to the sample metadata mapping file.
      output_dir: directory where all outputs are written (created here).
      command_handler: callable that executes the accumulated command list.
      params: dict of per-script parameter dicts; missing sections simply
        yield empty option strings.
      qiime_config: QIIME config dict; 'python_exe_fp' is read here.
      color_by_interesting_fields_only: restrict mapping fields to the
        "interesting" ones (see comment below) when building the field list.
      sampling_depth: if truthy, rarefy the OTU table to this depth first
        and run everything on the rarefied table.
      tree_fp: optional tree path, appended as -t for phylogenetic metrics.
      parallel: if True, use parallel_beta_diversity.py per metric.
      logger: existing WorkflowLogger, or None to create one here (and close
        it on success).
      suppress_emperor_plots: if True, skip the emperor plotting step.
      suppress_md5: if True, skip logging input MD5s.
      status_update_callback: progress callable passed to the handler.

    Returns:
      list of (metric_name, distance_matrix_fp) tuples, one per metric.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we own its lifetime.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    mapping_data, mapping_header, mapping_comments =\
        parse_mapping_file(open(mapping_fp, 'U'))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename, sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
            (python_exe_fp, script_dir, otu_table_fp,
             even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)])
        # From here on, all analyses run on the rarefied table.
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:
        # Prep the beta-diversity command. Work on a copy of the params so
        # the 'metrics' entry can be dropped (we pass one metric at a time).
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters
            try:
                params_str += get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (python_exe_fp, script_dir, otu_table_fp, output_dir,
                 beta_diversity_metric, params_str)
            commands.append(
                [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])
        else:
            beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (python_exe_fp, script_dir, otu_table_fp, output_dir,
                 beta_diversity_metric, params_str)
            commands.append(
                [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])

        # beta_diversity.py names its output after the metric and table;
        # rename it to the shorter <metric>_dm.txt convention.
        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([('Rename distance matrix (%s)' % beta_diversity_metric,
                          'mv %s %s' % (orig_beta_div_fp, beta_div_fp))])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(
            [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir, beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the continuous-coloring 3d plots command
            emperor_command = \
                'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp, emperor_dir,
                                                          mapping_fp, params_str)
            commands.append([('Make emperor plots, %s)' % beta_diversity_metric,
                              emperor_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
def run_jackknifed_beta_diversity(otu_table_fp, tree_fp, seqs_per_sample, output_dir, command_handler, params, qiime_config, mapping_fp, parallel=False, logger=None, suppress_md5=False, status_update_callback=print_to_stdout, master_tree=None):
    """Run the jackknifed beta diversity workflow.

    The steps performed by this function are:
     1) Compute beta diversity distance matrix from otu table (and
        tree, if applicable)
     2) Build rarefied OTU tables;
     3) Build UPGMA tree from full distance matrix;
     4) Compute distance matrics for rarefied OTU tables;
     5) Build UPGMA trees from rarefied OTU table distance matrices;
     5.5) Build a consensus tree from the rarefied UPGMA trees
     6) Compare rarefied OTU table distance matrix UPGMA trees
        to tree full UPGMA tree and write support file and newick tree
        with support values as node labels.

    Parameters:
      otu_table_fp: path to the input BIOM OTU table.
      tree_fp: optional tree path, appended as -t for phylogenetic metrics.
      seqs_per_sample: rarefaction depth (passed as -d).
      output_dir: directory where all outputs are written (created here).
      command_handler: callable that executes the accumulated command list.
      params: dict of per-script parameter dicts; missing sections simply
        yield empty option strings.
      qiime_config: QIIME config dict; 'python_exe_fp' is read here.
      mapping_fp: sample metadata mapping file (used by the emperor plots).
      parallel: if True, use parallel_beta_diversity.py on the rarefied
        tables.
      logger: existing WorkflowLogger, or None to create one here (and close
        it on success).
      suppress_md5: if True, skip logging input MD5s.
      status_update_callback: progress callable passed to the handler.
      master_tree: 'full' or 'consensus' -- which UPGMA tree the rarefied
        trees are compared against; default (None) means 'full'.

    Raises:
      RuntimeError: if master_tree is neither 'full' nor 'consensus'.
    """
    # Prepare some variables for the later steps
    if master_tree == None:
        master_tree = 'full'
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we own its lifetime.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str, tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = '%s %s/beta_diversity.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    commands.append(
        [('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics),
          beta_div_cmd)])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
        '%s %s/multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
        (python_exe_fp, script_dir, otu_table_fp, seqs_per_sample,
         rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)

        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir, otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = '%s %s/upgma_cluster.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, distance_matrix_fp, full_tree_fp,
             params_str)
        commands.append(
            [('UPGMA on full distance matrix: %s' % beta_diversity_metric,
              hierarchical_cluster_cmd)])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        try:
            d = params['beta_diversity'].copy()
            del d['metrics']
        except KeyError:
            # BUG FIX: the original assigned ``params_str = {}`` here, which
            # left ``d`` undefined and crashed with NameError on the next
            # line whenever params had no 'beta_diversity' section.
            d = {}
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str, tree_fp)
        if parallel:
            params_str += ' %s' % get_params_str(params['parallel'])
            # Build the parallel beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
                '%s %s/parallel_beta_diversity.py -T -i %s -o %s %s' %\
                (python_exe_fp, script_dir, rarefaction_dir, dm_dir,
                 params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
                '%s %s/beta_diversity.py -i %s -o %s %s' %\
                (python_exe_fp, script_dir, rarefaction_dir, dm_dir,
                 params_str)
        commands.append(
            [('Beta diversity on rarefied OTU tables (%s)' %
              beta_diversity_metric, beta_div_rarefied_cmd)])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
            '%s %s/upgma_cluster.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, dm_dir, upgma_dir, params_str)
        commands.append(
            [('UPGMA on rarefied distance matrix (%s)' %
              beta_diversity_metric, hierarchical_cluster_cmd)])

        # Build the consensus tree command
        # NOTE: reuses params_str from the upgma_cluster section above;
        # consensus_tree has no section of its own in the params file.
        consensus_tree_cmd =\
            '%s %s/consensus_tree.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, upgma_dir,
             metric_output_dir + "/rare_upgma_consensus.tre", params_str)
        commands.append(
            [('consensus on rarefied distance matrices (%s)' %
              beta_diversity_metric, consensus_tree_cmd)])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''

        # Build the tree compare command
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        elif master_tree == "consensus":
            master_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre"
        else:
            raise RuntimeError('master tree method "%s" not found' % (master_tree,))
        tree_compare_cmd = '%s %s/tree_compare.py -s %s -m %s -o %s %s' %\
            (python_exe_fp, script_dir, upgma_dir, master_tree_fp,
             tree_compare_dir, params_str)
        commands.append(
            [('Tree compare (%s)' % beta_diversity_metric, tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, dm_dir, pcoa_dir, params_str)
        commands.append(
            [('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)])

        # Prep the emperor plots command
        emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir
        create_dir(emperor_dir)
        try:
            params_str = get_params_str(params['make_emperor'])
        except KeyError:
            params_str = ''
        emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\
            (pcoa_dir, emperor_dir, mapping_fp, params_str)
        commands.append(
            [('emperor plots (%s)' % beta_diversity_metric, emperor_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def assign_tax(repset_fasta_fp, output_dir, command_handler, params, qiime_config, parallel=False, logger=None, status_update_callback=print_to_stdout):
    """Build and run a taxonomy assignment command for a rep set fasta.

    Parameters:
      repset_fasta_fp: path to the representative set fasta file.
      output_dir: directory under which the assignment directory is created.
      command_handler: callable that executes the accumulated command list.
      params: dict of per-script parameter dicts; missing sections simply
        yield empty option strings.
      qiime_config: QIIME config dict (used only for logger construction).
      parallel: if True and the method supports it (rdp/blast/uclust), use
        the parallel assignment script.
      logger: existing WorkflowLogger, or None to create one here (and close
        it on success).
      status_update_callback: progress callable passed to the handler.

    Returns:
      path to the taxonomy assignments file the command will produce.
    """
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we own its lifetime.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
        (output_dir, assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
        (assign_taxonomy_dir, input_basename)
    if parallel and (assignment_method == 'rdp' or
                     assignment_method == 'blast' or
                     assignment_method == 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
            'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
            (assignment_method, repset_fasta_fp, assign_taxonomy_dir,
             params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
            (assign_taxonomy_dir, repset_fasta_fp, params_str)

    # Remove any previous assignment output so the script starts clean.
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp
def run_summarize_taxa_through_plots(otu_table_fp, mapping_fp, output_dir, mapping_cat, sort, command_handler, params, qiime_config, logger=None, suppress_md5=False, status_update_callback=print_to_stdout):
    """Run the data preparation for summarizing taxonomies and generating plots

    The steps performed by this function are:
     1) Summarize OTU by Category
     2) Summarize Taxonomy
     3) Plot Taxonomy Summary

    Parameters:
      otu_table_fp: path to the input BIOM OTU table.
      mapping_fp: path to the sample metadata mapping file.
      output_dir: directory where all outputs are written (created here).
      mapping_cat: mapping category to collapse samples by, or a falsy value
        to fall back to summarize_otu_by_cat:mapping_category in params (or
        no collapsing at all).
      sort: if True, sort the OTU table before summarizing.
      command_handler: callable that executes the accumulated command list.
      params: dict of per-script parameter dicts; missing sections simply
        yield empty option strings.
      qiime_config: QIIME config dict; 'python_exe_fp' is read here.
      logger: existing WorkflowLogger, or None to create one here (and close
        it on success).
      suppress_md5: if True, skip logging input MD5s.
      status_update_callback: progress callable passed to the handler.

    All ``except:`` clauses of the original were narrowed to
    ``except KeyError:`` -- the only exception each guarded lookup can
    legitimately raise -- so that real programming errors (and, in
    Python 2, KeyboardInterrupt/SystemExit) are no longer silently
    swallowed.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger == None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        # We created the logger, so we own its lifetime.
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp])

    # if mapping category not passed via command-line,
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat = params['summarize_otu_by_cat']['mapping_category']
        except KeyError:
            mapping_cat = None

    try:
        params_str = get_params_str(params['summarize_otu_by_cat'])
        # Need to remove the mapping category option, since it is defined above.
        # Using this method since we don't want to change the params dict
        split_params = params_str.split('--')
        updated_params_str = []
        for i in split_params:
            if not i.startswith('mapping_category'):
                updated_params_str.append(i)
        params_str = '--'.join(updated_params_str)
    except KeyError:
        params_str = ''

    if mapping_cat:
        output_fp = join(output_dir,
                         '%s_otu_table.biom' % (mapping_cat.replace(' ', '-')))
        # Build the summarize otu by category command
        summarize_otu_by_cat_cmd = \
            "%s %s/summarize_otu_by_cat.py -m %s -i %s -o %s -c '%s' %s" %\
            (python_exe_fp, script_dir, mapping_fp, otu_table_fp, output_fp,
             mapping_cat, params_str)
        commands.append(
            [('Summarize OTU table by Category', summarize_otu_by_cat_cmd)])
        # Downstream steps operate on the collapsed table.
        otu_table_fp = output_fp

    # Build the sort OTU table command
    if sort:
        # Prep the sort_otu_table command
        try:
            params_str = get_params_str(params['sort_otu_table'])
        except KeyError:
            params_str = ''

        # define output otu table
        sorted_fp = join(output_dir,
                         splitext(split(otu_table_fp)[-1])[0] + '_sorted.biom')

        if mapping_cat or params_str == '':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
                "%s %s/sort_otu_table.py -i %s -o %s" %\
                (python_exe_fp, script_dir, otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
                "%s %s/sort_otu_table.py -i %s -o %s -m %s %s" %\
                (python_exe_fp, script_dir, otu_table_fp, sorted_fp,
                 mapping_fp, params_str)

        commands.append([('Sort OTU Table', sort_otu_table_cmd)])

        # redefine otu_table_fp to use
        otu_table_fp = sorted_fp

    # Prep the summarize taxonomy command
    try:
        params_str = get_params_str(params['summarize_taxa'])
    except KeyError:
        params_str = ''

    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except KeyError:
        sum_taxa_levels = None

    # Build the summarize taxonomy command
    summarize_taxa_cmd = '%s %s/summarize_taxa.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, otu_table_fp, output_dir, params_str)
    commands.append([('Summarize Taxonomy', summarize_taxa_cmd)])

    # Predict the per-level output filenames summarize_taxa.py will write.
    sum_taxa_fps = []
    basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
    if sum_taxa_levels:
        for i in sum_taxa_levels.split(','):
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))
    else:
        # this is the default levels from summarize_taxa, but cannot import
        # script to get these values
        for i in [2, 3, 4, 5, 6]:
            sum_taxa_fps.append(basename + '_L%s.txt' % (str(i)))

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)
    try:
        params_str = get_params_str(params['plot_taxa_summary'])
    except KeyError:
        params_str = ''

    # Build the plot taxa summary plot command(s)
    plot_taxa_summary_cmd =\
        '%s %s/plot_taxa_summary.py -i %s -o %s %s' %\
        (python_exe_fp, script_dir, ','.join(sum_taxa_fps),
         taxa_summary_plots_dir, params_str)
    commands.append(
        [('Plot Taxonomy Summary', plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def pick_reference_otus(input_fp, output_dir, otu_picking_method, refseqs_fp, parallel, params, logger, similarity_override=None):
    """Build (but do not run) a reference-based OTU picking command string.

    Parameters:
      input_fp: path to the input sequence fasta file.
      output_dir: directory the OTU picking script should write to.
      otu_picking_method: method name passed to pick_otus.py -m (or used to
        select the parallel script variant).
      refseqs_fp: path to the reference sequence collection (-r).
      parallel: if True and the method is uclust_ref, build the parallel
        command instead of the serial one.
      params: dict of per-script parameter dicts; deep-copied so the
        caller's dict is never mutated.
      logger: WorkflowLogger (or any object with a write method) used to
        record decisions made here.
      similarity_override: if not None, force pick_otus:similarity to this
        value (stringified) regardless of the params file.

    Returns:
      the OTU picking command string.

    Raises:
      WorkflowError: if pick_otus:refseqs_fp is present in params (it may
        only be supplied via the command line / API).

    Fixes relative to the original: Python-2-only ``raise X, msg``
    statement replaced with the equivalent (2/3-compatible) call form;
    typos in the similarity-override log message corrected; ``!= None``
    replaced with ``is not None``.
    """
    # Deep copy so the similarity override and method deletion below do not
    # leak back into the caller's params dict.
    params_copy = deepcopy(params)
    if 'pick_otus' in params_copy and 'refseqs_fp' in params_copy['pick_otus']:
        raise WorkflowError(
            "Cannot pass pick_otus:refseqs_fp in parameters file. This can only be"
            " passed on the command line or through the API.")

    if similarity_override is not None:
        logger.write('Overriding similarity with %1.3f.\n' % similarity_override)
        if 'pick_otus' in params_copy:
            params_copy['pick_otus']['similarity'] = str(similarity_override)
        else:
            params_copy['pick_otus'] = {'similarity': str(similarity_override)}

    if parallel and otu_picking_method == 'uclust_ref':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params_copy['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            if 'otu_picking_method' in params_copy['pick_otus']:
                del params_copy['pick_otus']['otu_picking_method']
        except KeyError:
            pass

        params_str += ' %s' % get_params_str(params_copy['pick_otus'])
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
            (otu_picking_script, input_fp, output_dir, refseqs_fp, params_str)
    else:
        try:
            params_str = get_params_str(params_copy['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (input_fp, output_dir, refseqs_fp, otu_picking_method, params_str)

    return pick_otus_cmd
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_otu_category_significance=False,
    status_update_callback=print_to_stdout):
    """Run the QIIME core diversity analyses workflow and build an index page.

    Validates the requested mapping-file categories, filters samples below
    sampling_depth from the BIOM table, then (unless suppressed) runs the
    beta diversity, alpha rarefaction, taxa summary, and OTU category
    significance sub-workflows, collecting links to their outputs and
    writing them to output_dir/index.html.

    biom_fp: input BIOM table filepath
    mapping_fp: metadata mapping filepath
    sampling_depth: even sampling depth; samples with fewer sequences are
     filtered from all analyses
    output_dir: directory for all output (created if needed)
    qiime_config: QIIME config dict, passed to the workflow logger
    command_handler: callable that executes lists of commands (defaults to
     serial execution)
    tree_fp: optional phylogenetic tree filepath (for phylogenetic metrics)
    params: QIIME parameters dict; defaults to empty parameters
    categories: mapping-file column names to analyze; each must exist and
     have at least two values
    """
    if categories != None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
            parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                # A single-valued category cannot support any comparison.
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values."
                 % c)
    else:
        categories= []

    # prep some variables
    if params == None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    # (link text, filepath, section header) tuples for the index page
    index_links = []
    commands = []

    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,_index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp != None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params['print_biom_table_summary'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    print_biom_table_summary_cmd = \
     "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % \
     (biom_fp, biom_table_stats_output_fp,params_str)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    commands.append([('Generate BIOM table summary',
                      print_biom_table_summary_cmd)])

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
     (biom_fp,filtered_biom_fp,sampling_depth)
    commands.append([('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
                      filter_samples_cmd)])
    # NOTE: all downstream analyses use the filtered table from here on.
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list; the logger stays
    # open because the sub-workflows below share it
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=False)
    commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
        # returns (beta diversity metric, distance matrix filepath) pairs
        even_dm_fps = run_beta_diversity_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=bdiv_even_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         sampling_depth=sampling_depth,
         # force suppression of distance histograms - boxplots work better
         # in this context, and are created below.
         histogram_categories=[],
         tree_fp=tree_fp,
         parallel=parallel,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)

        for bdiv_metric, dm_fp in even_dm_fps:
            # per-category distance boxplots and statistics
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
                try:
                    params_str = get_params_str(params['make_distance_boxplots'])
                except KeyError:
                    params_str = ''
                boxplots_cmd = \
                 'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                 (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
                commands.append([('Boxplots (%s)' % category,
                                  boxplots_cmd)])
                index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                    '%s/%s_Distances.pdf' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                    '%s/%s_Stats.txt' % \
                                     (boxplots_output_dir,category),
                                    _index_headers['beta_diversity_even'] % sampling_depth))

            # per-metric PCoA plot and matrix links (produced by the beta
            # diversity sub-workflow above)
            index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                                '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                                '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                                '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                                '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                                 (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Distance matrix (%s)' % bdiv_metric,
                                '%s/%s_dm.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                                '%s/%s_pc.txt' % \
                                 (bdiv_even_output_dir,bdiv_metric),
                                _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        ## Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
        run_alpha_rarefaction(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=arare_full_output_dir,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         tree_fp=tree_fp,
         num_steps=arare_num_steps,
         parallel=parallel,
         logger=logger,
         min_rare_depth=arare_min_rare_depth,
         max_rare_depth=sampling_depth,
         suppress_md5=True,
         status_update_callback=status_update_callback)

        index_links.append(('Alpha rarefaction plots',
                            '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                              % arare_full_output_dir,
                            _index_headers['alpha_diversity']))

        # one collated file per alpha diversity metric
        collated_alpha_diversity_fps = \
         glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = \
             get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                # metric name is the collated file's basename sans extension
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                 (arare_full_output_dir,category,alpha_metric)
                compare_alpha_cmd = \
                 'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                 (collated_alpha_diversity_fp, mapping_fp, category,
                  alpha_comparison_output_fp, params_str)
                commands.append([('Compare alpha diversity (%s, %s)' %\
                                   (category,alpha_metric),
                                  compare_alpha_cmd)])
                index_links.append(
                 ('Alpha diversity statistics (%s, %s)' % (category,alpha_metric),
                  alpha_comparison_output_fp,
                  _index_headers['alpha_diversity']))

    if not suppress_taxa_summary:
        # overall (non-categorical) taxa summary
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=None,
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger,
         suppress_md5=True,
         status_update_callback=status_update_callback)

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            _index_headers['taxa_summary']))
        # one additional taxa summary per requested category, with samples
        # grouped by that category
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,category)
            run_summarize_taxa_through_plots(
             otu_table_fp=biom_fp,
             mapping_fp=mapping_fp,
             output_dir=taxa_plots_output_dir,
             mapping_cat=category,
             sort=True,
             command_handler=command_handler,
             params=params,
             qiime_config=qiime_config,
             logger=logger,
             suppress_md5=True,
             status_update_callback=status_update_callback)

            index_links.append(('Taxa summary bar plots',
                                '%s/taxa_summary_plots/bar_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] %
                                 category))
            index_links.append(('Taxa summary area plots',
                                '%s/taxa_summary_plots/area_charts.html'\
                                  % taxa_plots_output_dir,
                                _index_headers['taxa_summary_categorical'] %
                                 category))

    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_signifance_fp = \
             '%s/category_significance_%s.txt' % (output_dir, category)
            try:
                params_str = get_params_str(params['otu_category_significance'])
            except KeyError:
                params_str = ''
            # Build the OTU category significance command
            category_significance_cmd = \
             'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
             (biom_fp, mapping_fp, category,
              category_signifance_fp, params_str)
            commands.append([('OTU category significance (%s)' % category,
                              category_significance_cmd)])

            index_links.append(('Category significance (%s)' % category,
                        category_signifance_fp,
                        _index_headers['otu_category_sig']))

    # compress the filtered table and link it from the run summary section
    commands.append([('Compress the filtered BIOM table','gzip %s' % filtered_biom_fp)])
    index_links.append(('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
                        '%s.gz' % filtered_biom_fp,
                        _index_headers['run_summary']))

    # run the remaining commands (logger closes on success here) and
    # write the index page linking all generated output
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)