def main():
    """Build extract_barcodes commands for every fastq file under a directory.

    Command-line options are read via ``parse_command_line_parameters``.
    The input directory is walked recursively for files ending in
    ``.fastq.gz``, ``.fastq``, ``.fq.gz`` or ``.fq``; when ``--paired_data``
    is set the files are first matched up with ``get_pairs``.  The commands
    built by ``create_commands_eb`` are then either printed (``--print_only``)
    or run serially.
    """
    option_parser, opts, args = parse_command_line_parameters(
        suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    paired_data = opts.paired_data
    parameter_fp = opts.parameter_fp
    read1_indicator = opts.read1_indicator
    read2_indicator = opts.read2_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name is enabled, "
                            "--include_input_dir_path must also be enabled.")

    if parameter_fp:
        with open(parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        # A parameters file is not required to carry an extract_barcodes
        # section; fall back to no extra options, as the workflow functions
        # in this file do for their own sections.
        try:
            params_str = get_params_str(params_dict['extract_barcodes'])
        except KeyError:
            params_str = ""
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    # str.endswith accepts a tuple, so one call covers every extension.
    extensions = ('.fastq.gz', '.fastq', '.fq.gz', '.fq')
    all_files = []
    for root, _dirs, fps in walk(input_dir):
        for fp in fps:
            if fp.endswith(extensions):
                all_files.append(abspath(join(root, fp)))

    if paired_data:
        # The second value returned by get_pairs is not needed here.
        all_files, _bc_pairs = get_pairs(all_files, read1_indicator,
                                         read2_indicator)

    commands = create_commands_eb(all_files, paired_data, output_dir,
                                  params_str, leading_text, trailing_text,
                                  include_input_dir_path,
                                  remove_filepath_in_name)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
def run_core_diversity_analyses(
        biom_fp,
        mapping_fp,
        sampling_depth,
        output_dir,
        qiime_config,
        command_handler=call_commands_serially,
        tree_fp=None,
        params=None,
        categories=None,
        arare_min_rare_depth=10,
        arare_num_steps=10,
        parallel=False,
        suppress_taxa_summary=False,
        suppress_beta_diversity=False,
        suppress_alpha_diversity=False,
        suppress_otu_category_significance=False,
        status_update_callback=print_to_stdout):
    """Run the core diversity analyses and write an HTML index of results.

    The steps performed are: summarize the input BIOM table; filter samples
    with fewer than ``sampling_depth`` observations; then, unless the
    corresponding ``suppress_*`` flag is set, run beta diversity (plus
    per-category distance boxplots), alpha rarefaction (plus per-category
    alpha diversity comparisons), taxa summaries (overall and per category)
    and OTU category significance.  Finally the filtered table is gzipped
    and ``index.html`` is generated in ``output_dir``.

    biom_fp: path to the input BIOM table
    mapping_fp: path to the metadata mapping file
    sampling_depth: integer depth used for filtering and even sampling
    output_dir: directory that receives all output (created if needed)
    qiime_config: passed through to the logger and the sub-workflows
    command_handler: callable invoked as
     ``command_handler(commands, status_update_callback, logger, ...)``
    tree_fp: optional tree path, passed to the diversity workflows
    params: parsed parameters; defaults to ``parse_qiime_parameters([])``
    categories: optional list of mapping file column headers to analyze
    arare_min_rare_depth, arare_num_steps: alpha rarefaction settings
    parallel: passed through to the diversity workflows

    Raises ValueError if a category is not a mapping file column header or
    contains only a single value.
    """
    if categories is not None:
        # Validate categories provided by the users
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_comments = \
                parse_mapping_file_to_dict(mapping_f)
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)"
                    % (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values."
                    % c)
    else:
        categories = []

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run print_biom_table_summary.py on input BIOM table
    try:
        params_str = get_params_str(params['print_biom_table_summary'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    print_biom_table_summary_cmd = \
        "print_biom_table_summary.py -i %s -o %s --suppress_md5 %s" % \
        (biom_fp, biom_table_stats_output_fp, params_str)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))
    commands.append([('Generate BIOM table summary',
                      print_biom_table_summary_cmd)])

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    filter_samples_cmd = \
        "filter_samples_from_otu_table.py -i %s -o %s -n %d" % \
        (biom_fp, filtered_biom_fp, sampling_depth)
    commands.append([(
        'Filter low sequence count samples from table '
        '(minimum sequence count: %d)' % sampling_depth,
        filter_samples_cmd)])
    # everything downstream works on the filtered table
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list
    command_handler(commands,
                    status_update_callback,
                    logger,
                    close_logger_on_success=False)
    commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        even_dm_fps = run_beta_diversity_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=bdiv_even_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            sampling_depth=sampling_depth,
            # force suppression of distance histograms - boxplots work better
            # in this context, and are created below.
            histogram_categories=[],
            tree_fp=tree_fp,
            parallel=parallel,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback)

        # The boxplot parameters do not depend on metric or category, so
        # look them up once rather than on every loop iteration.
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        bdiv_header = _index_headers['beta_diversity_even'] % sampling_depth
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % \
                    (bdiv_even_output_dir, bdiv_metric)
                boxplots_cmd = \
                    'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' % \
                    (dm_fp, category, boxplots_output_dir, mapping_fp,
                     params_str)
                commands.append([('Boxplots (%s)' % category,
                                  boxplots_cmd)])
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric,
                     '%s/%s_Distances.pdf' % (boxplots_output_dir, category),
                     bdiv_header))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     '%s/%s_Stats.txt' % (boxplots_output_dir, category),
                     bdiv_header))

            # per-metric links (independent of the categories)
            index_links.append(
                ('3D plot (%s, continuous coloring)' % bdiv_metric,
                 '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' %
                 (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('3D plot (%s, discrete coloring)' % bdiv_metric,
                 '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' %
                 (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('2D plot (%s, continuous coloring)' % bdiv_metric,
                 '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' %
                 (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('2D plot (%s, discrete coloring)' % bdiv_metric,
                 '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' %
                 (bdiv_even_output_dir, bdiv_metric, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 bdiv_header))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir, sampling_depth)
        run_alpha_rarefaction(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=arare_full_output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            tree_fp=tree_fp,
            num_steps=arare_num_steps,
            parallel=parallel,
            logger=logger,
            min_rare_depth=arare_min_rare_depth,
            max_rare_depth=sampling_depth,
            suppress_md5=True,
            status_update_callback=status_update_callback)

        index_links.append(
            ('Alpha rarefaction plots',
             '%s/alpha_rarefaction_plots/rarefaction_plots.html'
             % arare_full_output_dir,
             _index_headers['alpha_diversity']))

        # one collated file per alpha diversity metric
        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                # metric name is the collated file's basename sans extension
                alpha_metric = \
                    splitext(split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                    (arare_full_output_dir, category, alpha_metric)
                compare_alpha_cmd = \
                    'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' % \
                    (collated_alpha_diversity_fp, mapping_fp, category,
                     alpha_comparison_output_fp, params_str)
                commands.append([
                    ('Compare alpha diversity (%s, %s)' %
                     (category, alpha_metric),
                     compare_alpha_cmd)])
                index_links.append(
                    ('Alpha diversity statistics (%s, %s)' %
                     (category, alpha_metric),
                     alpha_comparison_output_fp,
                     _index_headers['alpha_diversity']))

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        run_summarize_taxa_through_plots(
            otu_table_fp=biom_fp,
            mapping_fp=mapping_fp,
            output_dir=taxa_plots_output_dir,
            mapping_cat=None,
            sort=True,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            logger=logger,
            suppress_md5=True,
            status_update_callback=status_update_callback)

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))

        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % \
                (output_dir, category)
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=category,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html'
                 % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html'
                 % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_otu_category_significance:
        # OTU category significance
        for category in categories:
            category_signifance_fp = \
                '%s/category_significance_%s.txt' % (output_dir, category)
            try:
                params_str = \
                    get_params_str(params['otu_category_significance'])
            except KeyError:
                params_str = ''
            # Build the OTU category significance command
            category_significance_cmd = \
                'otu_category_significance.py -i %s -m %s -c %s -o %s %s' % \
                (biom_fp, mapping_fp, category,
                 category_signifance_fp, params_str)
            commands.append([('OTU category significance (%s)' % category,
                              category_significance_cmd)])

            index_links.append(('Category significance (%s)' % category,
                                category_signifance_fp,
                                _index_headers['otu_category_sig']))

    commands.append([('Compress the filtered BIOM table',
                      'gzip %s' % filtered_biom_fp)])
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         '%s.gz' % filtered_biom_fp,
         _index_headers['run_summary']))

    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links, index_fp)
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_reference_otus workflow on multiple inputs
         and handle processing of the results.

        Each entry of input_fps is processed in its own sub-directory
        (output_dir/0/, output_dir/1/, ...).  The new_refseqs.fna written by
        one iteration is used as refseqs_fp for the next.  Iterations whose
        output already exists are skipped.  The per-iteration OTU tables are
        then merged, a master representative set is built and, depending on
        run_assign_tax / run_align_and_tree, taxonomy is added to the merged
        table and/or OTUs whose representative failed PyNAST alignment are
        filtered out.

        If logger is None a WorkflowLogger is created here and closed on
        completion.  A logger supplied by the caller is left open, so the
        caller keeps control of its lifetime.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # used the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            # taxonomy assignment and alignment/tree building are deferred
            # until all iterations have been merged (see below)
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback)

        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.

        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % \
            (output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % \
            (output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures.  The context managers
            # make sure all three file handles are released even if parsing
            # or filtering raises.
            with open(align_and_tree_input_otu_table, 'U') as biom_f:
                input_otu_table = parse_biom_table(biom_f)
            with open(pynast_failures_fp, 'U') as failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    input_otu_table,
                    get_seq_ids_from_fasta_file(failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            with open(pynast_failure_filtered_otu_table_fp, 'w') as otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # Only close a logger that was created in this function.
    if close_logger_on_success:
        logger.close()
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime

        The steps performed by this function are:
          1) Pick OTUs;
          2) Build an OTU table with optional pre-defined taxonomy.

        The OTU picking method is read from
        params['pick_otus']['otu_picking_method'] and defaults to
        'uclust_ref'.  An AssertionError is raised for any method other than
        'blast', 'uclust_ref' or 'usearch61_ref'.

        If logger is None a WorkflowLogger is created in output_dir and the
        command handler is asked to close it on success; a logger passed in
        by the caller is left open.
    """
    # confirm that a valid otu picking method was supplied before doing
    # any work
    reference_otu_picking_methods = ['blast', 'uclust_ref', 'usearch61_ref']
    # methods handled by the parallel branch below
    parallel_otu_picking_methods = ('blast', 'uclust_ref', 'usearch61_ref')

    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in reference_otu_picking_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_basename = splitext(split(input_fp)[1])[0]
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # Prep the OTU picking command
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    if parallel and otu_picking_method in parallel_otu_picking_methods:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific (the method is part of the script name), so
            # the otu_picking_method entry is dropped before the remaining
            # options are passed on. This works for now though.
            d = params['pick_otus'].copy()
            if 'otu_picking_method' in d:
                del d['otu_picking_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/%s -i %s -o %s -r %s -T %s' %\
            (python_exe_fp,
             script_dir,
             otu_picking_script,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is closed-reference OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s -r %s -m %s %s' %\
            (python_exe_fp,
             script_dir,
             input_fp,
             pick_otu_dir,
             refseqs_fp,
             otu_picking_method,
             params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # Prep the OTU table building command
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # taxonomy is optional: only pass -t when a taxonomy file was supplied
    if taxonomy_fp:
        taxonomy_str = '-t %s' % taxonomy_fp
    else:
        taxonomy_str = ''
    # Build the OTU table building command
    make_otu_table_cmd = '%s %s/make_otu_table.py -i %s %s -o %s %s' %\
        (python_exe_fp, script_dir, otu_fp, taxonomy_str, otu_table_fp,
         params_str)

    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_core_diversity_analyses(biom_fp,
                                mapping_fp,
                                sampling_depth,
                                output_dir,
                                qiime_config,
                                command_handler=call_commands_serially,
                                tree_fp=None,
                                params=None,
                                categories=None,
                                arare_min_rare_depth=10,
                                arare_num_steps=10,
                                parallel=False,
                                suppress_taxa_summary=False,
                                suppress_beta_diversity=False,
                                suppress_alpha_diversity=False,
                                suppress_group_significance=False,
                                status_update_callback=print_to_stdout):
    """Run the core diversity analyses and write an HTML index of results.

    The steps performed are: summarize the input BIOM table; filter samples
    with fewer than ``sampling_depth`` observations; rarefy the filtered
    table to ``sampling_depth``; then, unless the corresponding
    ``suppress_*`` flag is set, run beta diversity (plus per-category
    distance boxplots), alpha rarefaction (plus alpha diversity comparisons),
    taxa summaries (overall and per category) and group significance tests.
    Finally both tables are gzipped and ``index.html`` is generated in
    ``output_dir``.

    Every step is skipped, with a note written to the log, when its output
    already exists, so a run can be resumed in the same ``output_dir``.
    Index links are emitted for skipped steps as well.

    biom_fp: path to the input BIOM table
    mapping_fp: path to the metadata mapping file
    sampling_depth: integer depth used for filtering and rarefaction
    output_dir: directory that receives all output (created if needed)
    qiime_config: passed through to the logger and the sub-workflows
    command_handler: callable invoked as
     ``command_handler(commands, status_update_callback, logger, ...)``
    tree_fp: optional tree path, passed to the diversity workflows
    params: parsed parameters; defaults to ``parse_qiime_parameters([])``
    categories: optional list of mapping file column headers to analyze
    arare_min_rare_depth, arare_num_steps: alpha rarefaction settings
    parallel: passed through to the diversity workflows

    Raises ValueError if a category is not a mapping file column header or
    contains only a single value.
    """
    if categories is not None:
        # Validate categories provided by the users
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_comments = \
                parse_mapping_file_to_dict(mapping_f)
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" %
                    (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values."
                    % c)
    else:
        categories = []
    comma_separated_categories = ','.join(categories)

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging. Logs of earlier runs are collected before the new log
    # path is generated so they can be linked as previous runs.
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(
            ('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n"
                     % biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = \
            "filter_samples_from_otu_table.py -i %s -o %s -n %d" % \
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([(
            'Filter low sequence count samples from table '
            '(minimum sequence count: %d)' % sampling_depth,
            filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n"
            % filtered_biom_fp)
    # everything downstream works on the filtered table
    biom_fp = filtered_biom_fp

    # rarify the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = \
            "single_rarefaction.py -i %s -o %s -d %d" % \
            (biom_fp, rarefied_biom_fp, sampling_depth)
        commands.append([
            ('Rarify the OTU table to %d sequences/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n"
                     % rarefied_biom_fp)

    # run initial commands and reset the command list
    if commands:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the
        # user can select which will be generated.
        dm_suffix = '_dm.txt'
        existing_dm_fps = glob('%s/*%s' % (bdiv_even_output_dir, dm_suffix))
        if not existing_dm_fps:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarify the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            # Recover the metric name by slicing off the file name suffix.
            # str.strip() must not be used for this: it removes any of the
            # given *characters* from both ends, which mangles names such as
            # 'manhattan' or 'binary_jaccard'.
            even_dm_fps = [(split(fp)[1][:-len(dm_suffix)], fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        bdiv_header = _index_headers['beta_diversity_even'] % sampling_depth
        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % (
                    bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % (
                    boxplots_output_dir, category)
                stats_output_fp = '%s/%s_Stats.txt' % (
                    boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' % \
                        (dm_fp, category, boxplots_output_dir, mapping_fp,
                         params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric,
                     plot_output_fp,
                     bdiv_header))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     bdiv_header))

            # per-metric links (independent of the categories)
            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' %
                 (bdiv_even_output_dir, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 bdiv_header))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 bdiv_header))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' % \
            arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n"
                         % rarefaction_plots_output_fp)

        index_links.append(
            ('Alpha rarefaction plots',
             rarefaction_plots_output_fp,
             _index_headers['alpha_diversity']))

        # one collated file per alpha diversity metric
        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''

        if categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                # metric name is the collated file's basename sans extension
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = '%s/compare_%s' % \
                    (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' % \
                        (collated_alpha_diversity_fp,
                         mapping_fp,
                         comma_separated_categories,
                         compare_alpha_output_dir,
                         params_str)
                    commands.append([
                        ('Compare alpha diversity (%s)' % alpha_metric,
                         compare_alpha_cmd)
                    ])
                else:
                    logger.write("Skipping compare_alpha_diversity.py"
                                 " for %s as %s exists.\n\n"
                                 % (alpha_metric, compare_alpha_output_dir))
                # Link the results whether they were just scheduled or were
                # already present from an earlier run, as every other step in
                # this function does; otherwise a resumed run would produce
                # an index without these entries.
                for category in categories:
                    alpha_comparison_stat_fp = '%s/%s_stats.txt' % \
                        (compare_alpha_output_dir, category)
                    alpha_comparison_boxplot_fp = '%s/%s_boxplots.pdf' % \
                        (compare_alpha_output_dir, category)
                    index_links.append(
                        ('Alpha diversity statistics (%s, %s)' %
                         (category, alpha_metric),
                         alpha_comparison_stat_fp,
                         _index_headers['alpha_diversity']))
                    index_links.append(
                        ('Alpha diversity boxplots (%s, %s)' %
                         (category, alpha_metric),
                         alpha_comparison_boxplot_fp,
                         _index_headers['alpha_diversity']))
        else:
            logger.write("Skipping compare_alpha_diversity.py as"
                         " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if not existing_taxa_plot_html_fps:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))

        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))

        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % \
                (output_dir, category)
            # need to check for existence of any html files, since the user
            # can select only certain ones to be generated
            existing_taxa_plot_html_fps = glob(
                '%s/taxa_summary_plots/*.html' % taxa_plots_output_dir)
            if not existing_taxa_plot_html_fps:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))

            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html'
                 % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html'
                 % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_group_significance:
        # A missing group_significance section means "no extra options",
        # exactly like every other parameter section read in this function.
        try:
            params_str = get_params_str(params['group_significance'])
        except KeyError:
            params_str = ''
        # group significance tests, aka category significance
        for category in categories:
            group_signifance_fp = \
                '%s/group_significance_%s.txt' % (output_dir, category)
            if not exists(group_signifance_fp):
                # Build the group significance command
                group_significance_cmd = \
                    'group_significance.py -i %s -m %s -c %s -o %s %s' % \
                    (rarefied_biom_fp, mapping_fp, category,
                     group_signifance_fp, params_str)
                commands.append([('Group significance (%s)' % category,
                                  group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n"
                    % (category, group_signifance_fp))

            index_links.append(('Category significance (%s)' % category,
                                group_signifance_fp,
                                _index_headers['group_significance']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of filtered BIOM table as %s exists.\n\n"
            % filtered_biom_gzip_fp)
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp,
         _index_headers['run_summary']))

    rarified_biom_gzip_fp = '%s.gz' % rarefied_biom_fp
    if not exists(rarified_biom_gzip_fp):
        commands.append([('Compress the rarified BIOM table',
                          'gzip %s' % rarefied_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of rarified BIOM table as %s exists.\n\n"
            % rarified_biom_gzip_fp)
    index_links.append(
        ('Rarified BIOM table (sampling depth: %d)' % sampling_depth,
         rarified_biom_gzip_fp,
         _index_headers['run_summary']))

    # With nothing left to run the handler is not called, so the logger has
    # to be closed here instead.
    if commands:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        assign_taxonomy=False,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """Assemble and launch the closed-reference OTU picking workflow.

    The command list built here consists of:
        1) an OTU picking command run against ``refseqs_fp``;
        2) only when ``assign_taxonomy`` is True, a representative set
           picking command followed by a taxonomy assignment command;
        3) an OTU table building command, given ``-t`` with either the
           assignments from step 2 or the ``taxonomy_fp`` passed in
           (omitted when that value is falsy).

    The list is then handed to ``command_handler`` in a single call.

    input_fp: sequence file handed to the OTU picker.
    refseqs_fp: reference sequences handed to the OTU picker.
    output_dir: directory (created here) all output paths are built under.
    taxonomy_fp: predefined taxonomy file; replaced by the generated
     assignments path when ``assign_taxonomy`` is True.
    command_handler: callable invoked as
     ``command_handler(commands, status_update_callback, logger=...,
     close_logger_on_success=...)``.
    params: dict of per-script parameter dicts; missing sections are
     treated as "no extra options".
    qiime_config: forwarded to ``WorkflowLogger`` when a logger is created.
    parallel: use the ``parallel_*`` scripts for the methods that have one.
    logger: existing logger to write to; when None a ``WorkflowLogger`` is
     created and the handler is asked to close it on success.
    suppress_md5: skip logging md5 sums of the input files.

    Raises AssertionError when the configured OTU picking method is not a
    reference-based method.
    """

    def section_params_str(section):
        # Option string for one section of params ('' when it is absent).
        try:
            return get_params_str(params[section])
        except KeyError:
            return ''

    def parallel_params_str(section, method_option):
        # The parallel scripts are method-specific and therefore do not
        # accept the method-selection option, so it is dropped from a copy
        # of the section before the string is generated.
        combined = section_params_str('parallel')
        try:
            section_copy = params[section].copy()
            section_copy.pop(method_option, None)
            combined += ' %s' % get_params_str(section_copy)
        except KeyError:
            pass
        return combined

    # Reject an unsupported OTU picking method before any work is done.
    valid_methods = [
        'blast', 'uclust_ref', 'usearch61_ref', 'usearch_ref', 'sortmerna'
    ]
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in valid_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(valid_methods))

    input_basename = splitext(split(input_fp)[1])[0]
    create_dir(output_dir)

    # A logger created here is ours to close; a caller-supplied one is not.
    close_logger_on_success = logger is None
    if close_logger_on_success:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    workflow = []

    # --- OTU picking ---
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)

    methods_with_parallel_script = ('blast', 'uclust_ref', 'usearch61_ref')
    if parallel and otu_picking_method in methods_with_parallel_script:
        picker_options = parallel_params_str('pick_otus',
                                             'otu_picking_method')
        picker_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' % (
            picker_script, input_fp, pick_otu_dir, refseqs_fp,
            picker_options)
    else:
        # Closed-reference picking must never create new clusters, so the
        # flag is forced regardless of what the parameters say.
        picker_options = section_params_str('pick_otus')
        picker_options += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is "
                     "closed-reference OTU picking.\n\n")
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' % (
            input_fp, pick_otu_dir, refseqs_fp, otu_picking_method,
            picker_options)
    workflow.append([('Pick OTUs', pick_otus_cmd)])

    # --- Optional: representative set + taxonomy assignment ---
    if assign_taxonomy:
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' % (
            otu_fp, input_fp, rep_set_log_fp, rep_set_fp,
            section_params_str('pick_rep_set'))
        workflow.append([('Pick representative set', pick_rep_set_cmd)])

        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' % (
            output_dir, assignment_method)
        # From here on the generated assignments replace any taxonomy file
        # that was passed in.
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % (
            assign_taxonomy_dir, input_basename)

        if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' % (
                    assignment_method, rep_set_fp, assign_taxonomy_dir,
                    parallel_params_str('assign_taxonomy',
                                        'assignment_method'))
        else:
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' % (
                assign_taxonomy_dir, rep_set_fp,
                section_params_str('assign_taxonomy'))
        workflow.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # --- OTU table ---
    otu_table_fp = '%s/otu_table.biom' % output_dir
    table_options = section_params_str('make_otu_table')
    # taxonomy_fp is the generated assignments (assign_taxonomy=True), the
    # caller's predefined assignments, or a falsy value meaning "none".
    taxonomy_str = ('-t %s' % taxonomy_fp) if taxonomy_fp else ''
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' % (
        otu_fp, taxonomy_str, otu_table_fp, table_options)
    workflow.append([('Make OTU table', make_otu_table_cmd)])

    # Hand the complete command list to the command handler.
    command_handler(workflow,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def _write_retained_rep_seqs(rep_set_fp, otus_to_keep, output_files):
    """Copy the FASTA records of retained OTUs to every open output file.

    A record is retained when the first whitespace-delimited token of its
    header is in ``otus_to_keep``. The input file is closed before returning.
    """
    with open(rep_set_fp, 'U') as rep_set_f:
        for otu_id, seq in parse_fasta(rep_set_f):
            if otu_id.split()[0] in otus_to_keep:
                record = '>%s\n%s\n' % (otu_id, seq)
                for output_f in output_files:
                    output_f.write(record)


def pick_subsampled_open_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_assign_tax=True,
        run_align_and_tree=True,
        prefilter_percent_id=None,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        suppress_index_page=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """ Run subsampled open-reference OTU picking

        The steps performed by this function are:
          - (optional) Prefilter the input against prefilter_refseqs_fp when
            prefilter_percent_id is given.
          - Step 1: pick reference OTUs against refseqs_fp (skipped when both
            step1_otu_map_fp and step1_failures_fasta_fp are supplied) and
            pick a representative set for them.
          - Steps 2 and 3, only when the number of step 1 failures exceeds
            minimum_failure_threshold: subsample the failures with
            percent_subsample, pick OTUs de novo on the subsample, pick a
            representative set, and pick reference OTUs on all failures
            using that representative set as the reference.
          - Step 4, unless suppress_step4: pick OTUs de novo on the remaining
            failures.
          - Merge the OTU maps, drop OTUs with fewer than min_otu_size
            sequences, write the final representative set and the new
            reference set, and build the OTU table.
          - (optional) Assign taxonomy (run_assign_tax) and align sequences /
            build a tree (run_align_and_tree), producing the corresponding
            OTU tables.
          - Write an index page listing the outputs unless
            suppress_index_page.

        command_handler is called several times, always with
         close_logger_on_success=False; a logger created by this function is
         closed here at the end.

        Raises AssertionError for an unsupported de novo or reference OTU
         picking method, and ValueError when the prefilter leaves an empty
         input file.

        NOTE: when reference_otu_picking_method is 'sortmerna' and steps 2/3
         run, 'sortmerna_db' is deleted from the caller's params['pick_otus']
         in place.
    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = [
        'uclust_ref', 'usearch61_ref', 'sortmerna'
    ]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data', log_fp, _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(
            logger,
            [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id) * 100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Everything downstream works on the prefiltered sequences.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # if the number of failure sequences is greater than the threshold,
    # continue to step 2 and 3
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:
        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
            '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        # The closing double quote of the logged command belongs before the
        # trailing newlines, as in the filter_otus_from_otu_map log entry
        # further down.
        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')"\n\n'
                     % (abspath(step1_failures_fasta_fp),
                        abspath(step2_input_fasta_fp),
                        percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp. A missing 'pick_otus' section or
        # a missing 'sortmerna_db' entry simply means there is nothing to
        # remove.
        if reference_otu_picking_method == 'sortmerna':
            try:
                del params['pick_otus']['sortmerna_db']
            except KeyError:
                pass

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)

        commands.append([('Pick reference OTUs using de novo rep set',
                          step3_cmd)])

    index_links.append((
        'Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
        merged_otu_map_fp,
        _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp,
                 step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            # no step 3 map exists; an empty argument keeps `cat` valid
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script interface
        # would append the newly created maps to the map that was previously
        # created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            # NOTE(review): step1_failures_list_fp is only assigned when
            # step 1 is run by this function; with pre-existing step 1
            # inputs this branch would raise NameError -- confirm intended
            # handling with the callers.
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the final failures file to the top-level directory
        commands.append([
            ('Move final failures file to top-level directory',
             'mv %s %s/final_failures.txt' % (failures_fp, output_dir))
        ])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write(
        '# Filter singletons from the otu map using API \n' +
        'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
        '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                            abspath(otu_no_singletons_fp),
                                            min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(('OTU representative sequences',
                        final_repset_fp,
                        _index_headers['sequences']))
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append((
        'New reference sequences (i.e., OTU representative sequences plus input '
        'reference sequences)',
        new_refseqs_fp,
        _index_headers['sequences']))

    # Context managers guarantee both output files are closed even if one
    # of the intermediate steps raises.
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to
        # the final rep set file
        _write_retained_rep_seqs(step1_repset_fasta_fp,
                                 otus_to_keep,
                                 [final_repset_f])
        logger.write('# Write non-singleton otus representative sequences ' +
                     'from step1 to the final rep set file: %s\n\n'
                     % final_repset_fp)

        # copy the full input refseqs file to the new refseqs_fp
        copyfile(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # make sure appended records start on a fresh line
            new_refseqs_f.write('\n')
            logger.write(
                '# Copy the full input refseqs file to the new refseq file\n' +
                'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

            # iterate over all representative sequences from step2 and
            # step4 and write those corresponding to non-singleton otus to
            # the final representative set file and the new reference
            # sequences file.
            if run_step_2_and_3:
                _write_retained_rep_seqs(step2_repset_fasta_fp,
                                         otus_to_keep,
                                         [new_refseqs_f, final_repset_f])
            if not suppress_step4:
                _write_retained_rep_seqs(step4_repset_fasta_fp,
                                         otus_to_keep,
                                         [new_refseqs_f, final_repset_f])

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 2 and step 4 to the final representative set and the new reference' +
            ' set (%s and %s respectively)\n\n' % (final_repset_fp,
                                                   new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write(
            '# Write non-singleton otus representative sequences from ' +
            'step 4 to the final representative set and the new reference' +
            ' set (%s and %s respectively)\n\n' % (final_repset_fp,
                                                   new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size,
            otu_table_w_tax_fp,
            _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                 min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST and including OTU taxonomy assignments'
            % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and including OTU '
            'taxonomy assignments' % min_otu_size,
            otu_table_w_tax_fp,
            _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append((
            'OTU table exluding OTUs with fewer than %d sequences and sequences that '
            'fail to align with PyNAST' % min_otu_size,
            pynast_failure_filtered_otu_table_fp,
            _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(('OTU taxonomic assignments',
                                taxonomy_fp,
                                _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(('OTU phylogenetic tree',
                            rep_set_tree_fp,
                            _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures. The failures file
            # stays open until filtering is done, then is closed.
            table = load_table(align_and_tree_input_otu_table)
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
def align_and_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):
    """Align a representative set with PyNAST, filter it and build a tree.

    Three commands are queued and passed to ``command_handler`` in one
    call: sequence alignment (``parallel_align_seqs_pynast.py`` when
    ``parallel`` is set, ``align_seqs.py`` otherwise), alignment filtering
    and phylogeny construction. An existing PyNAST output directory and an
    existing ``rep_set.tre`` under ``output_dir`` are removed first.

    params: dict of per-script parameter dicts; a missing section
     contributes no extra options.
    logger: existing logger to use; when None a ``WorkflowLogger`` is
     created and the handler is asked to close it on success.

    Returns the path where the alignment failures FASTA file is expected
    (``<output_dir>/pynast_aligned_seqs/<input basename>_failures.fasta``).
    """

    def section_params_str(section):
        # Option string for one section of params ('' when it is absent).
        try:
            return get_params_str(params[section])
        except KeyError:
            return ''

    input_basename = splitext(split(repset_fasta_fp)[1])[0]

    # A logger created here is ours to close; a caller-supplied one is not.
    close_logger_on_success = logger is None
    if close_logger_on_success:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    # Paths produced by the alignment step.
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, 'pynast')
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' % (pynast_dir,
                                                         input_basename)
    tree_fp = '%s/rep_set.tre' % output_dir

    # Start from a clean alignment directory.
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        align_options = section_params_str('parallel')
        try:
            # The parallel script is PyNAST-specific and does not accept
            # --alignment_method, so drop it from a copy of the section.
            align_section = params['align_seqs'].copy()
            align_section.pop('alignment_method', None)
            align_options += ' %s' % get_params_str(align_section)
        except KeyError:
            pass
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' % (
            repset_fasta_fp, pynast_dir, align_options)
    else:
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' % (
            repset_fasta_fp, pynast_dir, section_params_str('align_seqs'))

    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' % (
        pynast_dir, aln_fp, section_params_str('filter_alignment'))

    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' % (
        filtered_aln_fp, tree_fp, section_params_str('make_phylogeny'))

    workflow = [
        [('Align sequences', align_seqs_cmd)],
        [('Filter alignment', filter_alignment_cmd)],
        [('Build phylogenetic tree', make_phylogeny_cmd)],
    ]

    # Remove a tree left over from an earlier run before rebuilding it.
    if exists(tree_fp):
        remove_files([tree_fp])

    # Hand the complete command list to the command handler.
    command_handler(workflow,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return failures_fp
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_emperor_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Compute beta diversity distance matrices, run PCoA, and generate emperor plots

        The steps performed by this function are:
         1) (optional) Sample the OTU table at sampling_depth seqs/sample
         2) Compute a beta diversity distance matrix for each metric
         3) Perform a principal coordinates analysis on the result of step 2
         4) Generate an emperor plot for each result of step 3, unless
            suppress_emperor_plots is set

        Metrics come from params['beta_diversity']['metrics'] (comma
         separated); when that is missing, weighted and unweighted UniFrac
         are used.

        All commands are passed to command_handler in a single call. When
         logger is None a WorkflowLogger is created and the handler is asked
         to close it on success.

        Returns a list of (metric, distance matrix filepath) tuples, one per
         metric.
    """
    # Prepare some variables for the later steps
    otu_table_dir, otu_table_filename = split(otu_table_fp)
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    # Close the mapping file as soon as it has been parsed instead of
    # leaking the handle.
    with open(mapping_fp, 'U') as mapping_f:
        mapping_data, mapping_header, mapping_comments =\
            parse_mapping_file(mapping_f)

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    # NOTE(review): mapping_fields is not used by any command built below;
    # the computation is kept so that parsing/validation errors still
    # surface here -- confirm whether it can be dropped.
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename,
             sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            'single_rarefaction.py -i %s -o %s -d %d' %\
            (otu_table_fp, even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        # All downstream steps operate on the evenly sampled table.
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_dir, otu_table_filename = split(even_sampled_otu_table_fp)
        otu_table_basename, otu_table_ext = splitext(otu_table_filename)

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command. 'metrics' is removed from a copy
        # of the parameters because each metric is passed explicitly below.
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters. They are joined with a
            # separating space (as everywhere else in this module) so they
            # cannot fuse with the preceding option when no tree is given.
            try:
                params_str += ' %s' % get_params_str(params['parallel'])
            except KeyError:
                pass
            beta_div_cmd = 'parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (otu_table_fp, output_dir, beta_diversity_metric, params_str)
        else:
            beta_div_cmd = 'beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (otu_table_fp, output_dir, beta_diversity_metric, params_str)
        commands.append([('Beta Diversity (%s)' % beta_diversity_metric,
                          beta_div_cmd)])

        # Rename the distance matrix to a name that does not depend on the
        # OTU table's basename.
        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''

        # Build the principal coordinates command
        pc_cmd = 'principal_coordinates.py -i %s -o %s %s' %\
            (beta_div_fp, pc_fp, params_str)
        commands.append([('Principal coordinates (%s)' % beta_diversity_metric,
                          pc_cmd)])

        # Generate emperor plots
        if not suppress_emperor_plots:
            # Prep the emperor plots command
            emperor_dir = '%s/%s_emperor_pcoa_plot/' % (output_dir,
                                                        beta_diversity_metric)
            create_dir(emperor_dir)
            try:
                params_str = get_params_str(params['make_emperor'])
            except KeyError:
                params_str = ''
            # Build the emperor plots command
            emperor_command = \
                'make_emperor.py -i %s -o %s -m %s %s' % (pc_fp,
                                                          emperor_dir,
                                                          mapping_fp,
                                                          params_str)

            # Label formatted like the other per-metric status labels.
            commands.append([
                ('Make emperor plots (%s)' % beta_diversity_metric,
                 emperor_command)
            ])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
def run_alpha_rarefaction(otu_table_fp,
                          mapping_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          tree_fp=None,
                          num_steps=10,
                          parallel=False,
                          logger=None,
                          min_rare_depth=10,
                          max_rare_depth=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout,
                          plot_stderr_and_stddev=False,
                          retain_intermediate_files=True):
    """ Build and run the alpha rarefaction workflow

        The steps performed by this function are:
          1) Generate rarefied OTU tables;
          2) Compute alpha diversity metrics for each rarefied OTU table;
          3) Collate alpha diversity results;
          4) Generate alpha rarefaction plots.

        The shell commands for these steps are collected in order and
        handed to command_handler in a single call at the end.

        otu_table_fp: path to the input OTU table
        mapping_fp: path to the mapping file (passed to
         make_rarefaction_plots.py via -m)
        output_dir: directory receiving all results (passed to create_dir)
        command_handler: callable invoked as
         command_handler(commands, status_update_callback, logger=...,
         close_logger_on_success=...)
        params: mapping of script name -> dict of that script's options.
         The sections read here are 'multiple_rarefactions', 'parallel',
         'alpha_diversity', 'collate_alpha' and 'make_rarefaction_plots';
         any of them may be missing.
        qiime_config: passed to WorkflowLogger when no logger is supplied
        tree_fp: optional tree path, appended to the alpha diversity
         command as -t
        num_steps: number of rarefaction steps used to derive the step size
        parallel: if True, use the parallel_* scripts and append the
         'parallel' options to their command lines
        logger: existing logger to write to; if None, a WorkflowLogger is
         created and the command handler is asked to close it on success
        min_rare_depth: lower rarefaction depth
        max_rare_depth: upper rarefaction depth; if None, the median
         count per sample of the OTU table is used
        suppress_md5: if True, do not log md5 sums of the input files
        status_update_callback: forwarded to command_handler
        plot_stderr_and_stddev: if True, and no 'std_type' option is set
         for make_rarefaction_plots, generate two sets of plots (stddev
         and stderr) instead of one
        retain_intermediate_files: if False, add a command removing the
         rarefaction and alpha diversity directories
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    if max_rare_depth is None:
        min_count, max_count, median_count, mean_count, counts_per_sample =\
            compute_counts_per_sample_stats(load_table(otu_table_fp))
        max_rare_depth = median_count
    # "or 1" guards against a zero step when the depth range is smaller
    # than num_steps.
    step = int((max_rare_depth - min_rare_depth) / num_steps) or 1
    max_rare_depth = int(max_rare_depth)

    # Parallel-specific options, shared by the rarefaction and the alpha
    # diversity commands. A missing 'parallel' section means "no extra
    # options" rather than an error.
    parallel_params_str = ''
    if parallel:
        try:
            parallel_params_str = ' %s' % get_params_str(params['parallel'])
        except KeyError:
            pass

    # Prep the rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions'])
    except KeyError:
        params_str = ''
    if parallel:
        params_str += parallel_params_str
        # Build the rarefaction command
        rarefaction_cmd = \
            'parallel_multiple_rarefactions.py -T -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    else:
        # Build the rarefaction command
        rarefaction_cmd = \
            'multiple_rarefactions.py -i %s -m %s -x %s -s %s -o %s %s' %\
            (otu_table_fp, min_rare_depth, max_rare_depth, step,
             rarefaction_dir, params_str)
    commands.append([('Alpha rarefaction', rarefaction_cmd)])

    # Prep the alpha diversity command
    alpha_diversity_dir = '%s/alpha_div/' % output_dir
    create_dir(alpha_diversity_dir)
    try:
        params_str = get_params_str(params['alpha_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str += ' -t %s' % tree_fp
    if parallel:
        params_str += parallel_params_str
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "parallel_alpha_diversity.py -T -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)
    else:
        # Build the alpha diversity command
        alpha_diversity_cmd = \
            "alpha_diversity.py -i %s -o %s %s" %\
            (rarefaction_dir, alpha_diversity_dir, params_str)
    commands.append(
        [('Alpha diversity on rarefied OTU tables', alpha_diversity_cmd)])

    # Prep the alpha diversity collation command
    alpha_collated_dir = '%s/alpha_div_collated/' % output_dir
    create_dir(alpha_collated_dir)
    try:
        params_str = get_params_str(params['collate_alpha'])
    except KeyError:
        params_str = ''
    # Build the alpha diversity collation command
    alpha_collated_cmd = 'collate_alpha.py -i %s -o %s %s' %\
        (alpha_diversity_dir, alpha_collated_dir, params_str)
    commands.append([('Collate alpha', alpha_collated_cmd)])

    if not retain_intermediate_files:
        commands.append([
            ('Removing intermediate files',
             'rm -r %s %s' % (rarefaction_dir, alpha_diversity_dir))
        ])
    else:
        commands.append([('Skipping removal of intermediate files.', '')])

    # Prep the make rarefaction plot command(s). The section is looked up
    # once so that a missing section cannot raise when checking for
    # 'std_type' below.
    try:
        plot_params = params['make_rarefaction_plots']
    except KeyError:
        plot_params = {}
        params_str = ''
    else:
        params_str = get_params_str(plot_params)

    if 'std_type' in plot_params or not plot_stderr_and_stddev:
        rarefaction_plot_dir = '%s/alpha_rarefaction_plots/' % output_dir
        create_dir(rarefaction_plot_dir)

        # Build the make rarefaction plot command
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])
    else:
        # No explicit std_type and both variants requested: generate one
        # set of plots per std_type, each in its own directory.
        rarefaction_plot_dir_stddev = \
            '%s/alpha_rarefaction_plots_stddev/' % output_dir
        rarefaction_plot_dir_stderr = \
            '%s/alpha_rarefaction_plots_stderr/' % output_dir
        create_dir(rarefaction_plot_dir_stddev)
        create_dir(rarefaction_plot_dir_stderr)

        # Build the make rarefaction plot command(s)
        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stddev' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stddev,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])

        make_rarefaction_plot_cmd =\
            'make_rarefaction_plots.py -i %s -m %s -o %s %s --std_type stderr' %\
            (alpha_collated_dir, mapping_fp, rarefaction_plot_dir_stderr,
             params_str)
        commands.append([('Rarefaction plot: %s' % 'All metrics',
                          make_rarefaction_plot_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_beta_diversity_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     color_by_interesting_fields_only=True,
                                     sampling_depth=None,
                                     histogram_categories=None,
                                     tree_fp=None,
                                     parallel=False,
                                     logger=None,
                                     suppress_3d_plots=False,
                                     suppress_2d_plots=False,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """ Build and run the beta diversity through plots workflow

        The steps performed by this function are:
          1) Optionally sample the OTU table at even depth;
          2) Generate a prefs file for coloring the plots;
          3) For each beta diversity metric:
             a) Compute a beta diversity distance matrix and rename it to
                <metric>_dm.txt;
             b) Perform a principal coordinates analysis on the result;
             c) Generate 3D plots (continuous and discrete coloring)
                unless suppress_3d_plots;
             d) Generate 2D plots (continuous and discrete coloring)
                unless suppress_2d_plots;
             e) Generate distance histograms if histogram_categories
                were provided.

        The shell commands for these steps are collected in order and
        handed to command_handler in a single call at the end.

        otu_table_fp: path to the input OTU table
        mapping_fp: path to the mapping file
        output_dir: directory receiving all results (passed to create_dir)
        command_handler: callable invoked as
         command_handler(commands, status_update_callback, logger=...,
         close_logger_on_success=...)
        params: mapping of script name -> dict of that script's options.
         The sections read here are 'beta_diversity', 'make_prefs_file',
         'parallel', 'principal_coordinates', 'make_3d_plots',
         'make_2d_plots' and 'make_distance_histograms'; any of them may
         be missing.
        qiime_config: must provide 'python_exe_fp'; also passed to
         WorkflowLogger when no logger is supplied
        color_by_interesting_fields_only: if True, color by the fields
         returned by get_interesting_mapping_fields, falling back to all
         mapping file headers if that result is empty
        sampling_depth: if set, the OTU table is first rarefied to this
         depth and the rarefied table is used for all later steps
        histogram_categories: optional list of mapping file column
         headers to generate distance histograms for
        tree_fp: optional tree path, appended to the beta diversity
         command as -t
        parallel: if True, use parallel_beta_diversity.py and append the
         'parallel' options to its command line
        logger: existing logger to write to; if None, a WorkflowLogger is
         created and the command handler is asked to close it on success
        suppress_3d_plots: if True, skip the 3D plots
        suppress_2d_plots: if True, skip the 2D plots
        suppress_md5: if True, do not log md5 sums of the input files
        status_update_callback: forwarded to command_handler

        Returns a list of (metric, distance matrix filepath) tuples, one
        per beta diversity metric.

        Raises ValueError if any of histogram_categories is not a mapping
        file column header.
    """
    # Only the basename and extension of the OTU table are needed, to
    # name the files derived from it.
    otu_table_filename = split(otu_table_fp)[1]
    otu_table_basename, otu_table_ext = splitext(otu_table_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    # Close the mapping file as soon as it has been parsed.
    with open(mapping_fp, 'U') as mapping_f:
        mapping_data, mapping_header, mapping_comments =\
            parse_mapping_file(mapping_f)

    if histogram_categories:
        invalid_categories = set(histogram_categories) - set(mapping_header)
        if invalid_categories:
            # sorted() keeps the error message deterministic.
            raise ValueError(
                "Invalid histogram categories - these must exactly match "
                "mapping file column headers: %s" %
                ' '.join(sorted(invalid_categories)))

    # Get the interesting mapping fields to color by -- if none are
    # interesting, take all of them. Interesting is defined as those
    # which have greater than one value and fewer values than the number
    # of samples
    if color_by_interesting_fields_only:
        mapping_fields =\
            get_interesting_mapping_fields(mapping_data, mapping_header) or\
            mapping_header
    else:
        mapping_fields = mapping_header
    mapping_fields = ','.join(mapping_fields)

    if sampling_depth:
        # Sample the OTU table at even depth
        even_sampled_otu_table_fp = '%s/%s_even%d%s' %\
            (output_dir, otu_table_basename,
             sampling_depth, otu_table_ext)
        single_rarefaction_cmd = \
            '%s %s/single_rarefaction.py -i %s -o %s -d %d' %\
            (python_exe_fp, script_dir, otu_table_fp,
             even_sampled_otu_table_fp, sampling_depth)
        commands.append([
            ('Sample OTU table at %d seqs/sample' % sampling_depth,
             single_rarefaction_cmd)
        ])
        # All later steps operate on the rarefied table.
        otu_table_fp = even_sampled_otu_table_fp
        otu_table_basename = \
            splitext(split(even_sampled_otu_table_fp)[1])[0]

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the prefs file generator command. The section is looked up
    # once so that a missing section cannot raise when checking for
    # 'mapping_headers_to_use' below.
    prefs_fp = '%s/prefs.txt' % output_dir
    try:
        prefs_params = params['make_prefs_file']
    except KeyError:
        prefs_params = {}
        params_str = ''
    else:
        params_str = get_params_str(prefs_params)
    if 'mapping_headers_to_use' not in prefs_params:
        params_str = '%s --mapping_headers_to_use %s' \
            % (params_str, mapping_fields)
    # Build the prefs file generator command
    prefs_cmd = \
        '%s %s/make_prefs_file.py -m %s -o %s %s' %\
        (python_exe_fp, script_dir, mapping_fp, prefs_fp, params_str)
    commands.append([('Build prefs file', prefs_cmd)])

    dm_fps = []
    for beta_diversity_metric in beta_diversity_metrics:

        # Prep the beta-diversity command. 'metrics' is dropped from the
        # options because each metric is run separately via --metrics.
        try:
            bdiv_params_copy = params['beta_diversity'].copy()
        except KeyError:
            bdiv_params_copy = {}
        try:
            del bdiv_params_copy['metrics']
        except KeyError:
            pass

        params_str = get_params_str(bdiv_params_copy)

        if tree_fp:
            params_str = '%s -t %s ' % (params_str, tree_fp)

        # Build the beta-diversity command
        if parallel:
            # Grab the parallel-specific parameters. They are joined with
            # a space so they cannot run into the preceding option when
            # no tree was supplied.
            try:
                params_str = '%s %s' % (params_str,
                                        get_params_str(params['parallel']))
            except KeyError:
                pass
            beta_div_cmd = \
                '%s %s/parallel_beta_diversity.py -i %s -o %s --metrics %s -T %s' %\
                (python_exe_fp, script_dir, otu_table_fp,
                 output_dir, beta_diversity_metric, params_str)
        else:
            beta_div_cmd = \
                '%s %s/beta_diversity.py -i %s -o %s --metrics %s %s' %\
                (python_exe_fp, script_dir, otu_table_fp,
                 output_dir, beta_diversity_metric, params_str)
        commands.append(
            [('Beta Diversity (%s)' % beta_diversity_metric, beta_div_cmd)])

        # Rename <metric>_<table basename>.txt to <metric>_dm.txt
        orig_beta_div_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)
        beta_div_fp = '%s/%s_dm.txt' % \
            (output_dir, beta_diversity_metric)
        commands.append([
            ('Rename distance matrix (%s)' % beta_diversity_metric,
             'mv %s %s' % (orig_beta_div_fp, beta_div_fp))
        ])
        dm_fps.append((beta_diversity_metric, beta_div_fp))

        # Prep the principal coordinates command
        pc_fp = '%s/%s_pc.txt' % (output_dir, beta_diversity_metric)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the principal coordinates command
        pc_cmd = '%s %s/principal_coordinates.py -i %s -o %s %s' %\
            (python_exe_fp, script_dir, beta_div_fp, pc_fp, params_str)
        commands.append(
            [('Principal coordinates (%s)' % beta_diversity_metric, pc_cmd)])

        # Generate 3d plots
        if not suppress_3d_plots:
            # The same make_3d_plots options apply to both colorings.
            try:
                params_str = get_params_str(params['make_3d_plots'])
            except KeyError:
                params_str = ''

            # Build the continuous-coloring 3d plots command
            continuous_3d_dir = '%s/%s_3d_continuous/' %\
                (output_dir, beta_diversity_metric)
            create_dir(continuous_3d_dir)
            continuous_3d_command = \
                '%s %s/make_3d_plots.py -p %s -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, prefs_fp, pc_fp,
                 continuous_3d_dir, mapping_fp, params_str)

            # Build the discrete-coloring 3d plots command
            discrete_3d_dir = '%s/%s_3d_discrete/' %\
                (output_dir, beta_diversity_metric)
            create_dir(discrete_3d_dir)
            discrete_3d_command = \
                '%s %s/make_3d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, mapping_fields, pc_fp,
                 discrete_3d_dir, mapping_fp, params_str)

            commands.append([
                ('Make 3D plots (continuous coloring, %s)' %
                 beta_diversity_metric, continuous_3d_command),
                ('Make 3D plots (discrete coloring, %s)' %
                 beta_diversity_metric, discrete_3d_command)])

        # Generate 2d plots
        if not suppress_2d_plots:
            # The same make_2d_plots options apply to both colorings.
            try:
                params_str = get_params_str(params['make_2d_plots'])
            except KeyError:
                params_str = ''

            # Build the continuous-coloring 2d plots command
            continuous_2d_dir = '%s/%s_2d_continuous/' %\
                (output_dir, beta_diversity_metric)
            create_dir(continuous_2d_dir)
            continuous_2d_command = \
                '%s %s/make_2d_plots.py -p %s -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, prefs_fp, pc_fp,
                 continuous_2d_dir, mapping_fp, params_str)

            # Build the discrete-coloring 2d plots command
            discrete_2d_dir = '%s/%s_2d_discrete/' %\
                (output_dir, beta_diversity_metric)
            create_dir(discrete_2d_dir)
            discrete_2d_command = \
                '%s %s/make_2d_plots.py -b "%s" -i %s -o %s -m %s %s' %\
                (python_exe_fp, script_dir, mapping_fields, pc_fp,
                 discrete_2d_dir, mapping_fp, params_str)

            commands.append([
                ('Make 2D plots (continuous coloring, %s)' %
                 beta_diversity_metric, continuous_2d_command),
                ('Make 2D plots (discrete coloring, %s)' %
                 beta_diversity_metric, discrete_2d_command)])

        if histogram_categories:
            # Prep the distance histograms command
            histograms_dir = '%s/%s_histograms/' %\
                (output_dir, beta_diversity_metric)
            create_dir(histograms_dir)
            try:
                params_str = get_params_str(params['make_distance_histograms'])
            except KeyError:
                params_str = ''
            # Build the make_distance_histograms command
            distance_histograms_command = \
                '%s %s/make_distance_histograms.py -d %s -o %s -m %s -f "%s" %s' %\
                (python_exe_fp, script_dir, beta_div_fp,
                 histograms_dir, mapping_fp,
                 ','.join(histogram_categories), params_str)

            commands.append([
                ('Make Distance Histograms (%s)' %
                 beta_diversity_metric, distance_histograms_command)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return dm_fps
def create_personal_results(output_dir,
                            mapping_fp,
                            coord_fp,
                            collated_dir,
                            otu_table_fp,
                            prefs_fp,
                            personal_id_column,
                            personal_ids=None,
                            column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000,
                            alpha=0.05,
                            rep_set_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    """ Generate the personalized results pages, one per individual

        For each personal ID a subdirectory of output_dir is created
        containing a personal mapping file, an index.html page and,
        unless suppressed, alpha diversity boxplots, alpha rarefaction
        plots, beta diversity plots, time series taxa summary plots and
        OTU category significance tables.

        output_dir: directory receiving all results
        mapping_fp: path to the mapping file
        coord_fp: passed to make_3d_plots.py via -i
        collated_dir: passed to make_rarefaction_plots.py via -i and to
         the alpha diversity boxplot generation
        otu_table_fp: path to the OTU table
        prefs_fp: passed to make_rarefaction_plots.py and
         make_3d_plots.py via -p
        personal_id_column: mapping file column header holding the
         personal IDs
        personal_ids: personal IDs to generate results for; if None, all
         personal IDs found in the mapping file are used
        column_title: header of the column added to the personal mapping
         files (inserted before the last column)
        individual_titles: forwarded to create_personal_mapping_file and
         create_otu_category_significance_html_tables
        category_to_split: mapping file column header used to split the
         data (e.g. by body site)
        time_series_category: mapping file column header used as the
         custom axis of the beta diversity time series plots
        rarefaction_depth: depth at which the OTU table is rarefied
        alpha: forwarded to create_otu_category_significance_html_tables
        rep_set_fp: forwarded to
         create_otu_category_significance_html_tables
        body_site_rarefied_otu_table_dir: directory of already rarefied,
         per-body-site OTU tables; if None, they are generated here
        retain_raw_data: if False, raw data files and directories are
         cleaned up once they are no longer needed
        suppress_*: skip the corresponding analysis
        command_handler: callable invoked as command_handler(commands,
         status_update_callback, logger, close_logger_on_success=False)
        status_update_callback: forwarded to command_handler

        Returns the list of output directories that were registered for
        the generated analyses.

        Raises ValueError if personal_id_column, category_to_split or
        time_series_category is not a mapping file column header, if a
        requested personal ID is not in the mapping file, or if none of
        an individual's body sites could be processed for OTU category
        significance (unless command_handler is print_commands).
    """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    # Close the mapping file as soon as it has been parsed.
    with open(mapping_fp, 'U') as mapping_f:
        mapping_data, header, comments = parse_mapping_file(mapping_f)

    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
                         "column header." % category_to_split)

    # Insert the new column before the last column.
    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                                     add_filename_suffix(
                                         otu_table_fp,
                                         '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (
                otu_table_fp, rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                rarefied_otu_table_fp, mapping_fp, category_to_split,
                per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(
            mapping_data, person_of_interest, personal_id_index,
            bodysite_index, individual_titles)

        with open(personal_mapping_file_fp, 'w') as personal_mapping_f:
            personal_mapping_f.write(
                format_mapping_file(header, personal_mapping_data, comments))
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set(e[column_title_index]
                                  for e in personal_mapping_data)
        cat_index = header.index(category_to_split)
        cat_values = set(e[cat_index] for e in personal_mapping_data)

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir, personal_mapping_file_fp,
                category_to_split, column_title, rarefaction_depth,
                adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                create_alpha_diversity_boxplots_html(plot_fps)

        # Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                collated_dir, personal_mapping_file_fp, prefs_fp,
                rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(
                join(rarefaction_dir, 'average_plots'))
            personal_raw_data_dirs.append(
                join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        # Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest,
                                        'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=' % (
                personal_mapping_file_fp, prefs_fp, coord_fp,
                pcoa_time_series_dir) +\
                '\'%s\' --add_vectors=\'%s,%s\'' % (time_series_category,
                                                    site_id_category,
                                                    time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s' % (
                personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        # Time series taxa summary plots steps
        taxa_summary_plots_html = ''
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest,
                                  'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp, personal_mapping_file_fp,
                person_of_interest, column_title, column_title_values,
                category_to_split, cat_values, time_series_category,
                area_plots_dir, command_handler, status_update_callback,
                logger)

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(
                output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(
                    per_body_site_dir,
                    add_filename_suffix(rarefied_otu_table_fp,
                                        '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self, otherwise
                    # otu_category_significance.py crashes with a division by
                    # zero error.
                    with open(body_site_otu_table_fp, 'U') as \
                            body_site_otu_table_f:
                        with open(personal_mapping_file_fp, 'U') as \
                                personal_mapping_file_f:
                            personal_sample_count = \
                                _count_per_individual_samples(
                                    body_site_otu_table_f,
                                    personal_mapping_file_f,
                                    personal_id_column,
                                    person_of_interest)

                    if personal_sample_count < 1:
                        continue
                    else:
                        valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir,
                                             'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                     cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title,
                                      otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])

                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError("None of the body sites for personal ID '%s' "
                                 "could be processed because there were no "
                                 "matching samples in the rarefied OTU table."
                                 % person_of_interest)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                create_otu_category_significance_html_tables(
                    otu_cat_sig_output_fps, alpha, otu_cat_sig_dir,
                    individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                                    for html_filename in
                                    otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest, html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html)

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files,
                                    personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
def run_pick_closed_reference_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        taxonomy_fp,
        command_handler,
        params,
        qiime_config,
        assign_taxonomy=False,
        parallel=False,
        logger=None,
        suppress_md5=False,
        status_update_callback=print_to_stdout):
    """ Build and run the closed-reference OTU picking workflow

        The steps performed by this function are:
          1) Pick OTUs against the reference sequences;
          2) If assign_taxonomy is True, choose a representative sequence
             for each OTU and assign taxonomy using a classifier;
          3) Build an OTU table, using the predefined taxonomy in
             taxonomy_fp (if assign_taxonomy is False) or the taxonomic
             assignments from step 2 (if assign_taxonomy is True).

        The shell commands for these steps are collected in order and
        handed to command_handler in a single call at the end.
    """
    # Refuse to do any work unless the requested OTU picking method is a
    # reference-based one.
    valid_methods = ['blast', 'uclust_ref', 'usearch61_ref',
                     'usearch_ref', 'sortmerna']
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust_ref'
    assert otu_picking_method in valid_methods,\
        "Invalid OTU picking method supplied: %s. Valid choices are: %s"\
        % (otu_picking_method, ' '.join(valid_methods))

    # The input file's basename (no directory, no extension) is used to
    # name the files derived from it.
    input_basename = splitext(split(input_fp)[1])[0]
    create_dir(output_dir)
    commands = []

    # A logger created here is owned by this workflow, so the command
    # handler is asked to close it on success; a caller's logger is not.
    close_logger_on_success = logger is None
    if close_logger_on_success:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, taxonomy_fp])

    # ---- Step 1: OTU picking ----
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)
    # Methods for which the parallel branch below is taken.
    parallel_methods = ('blast', 'uclust_ref', 'usearch61_ref', 'sortmerna')
    if parallel and otu_picking_method in parallel_methods:
        # Start from the parallel-specific options...
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        # ...then add the OTU picker options. The parallel script is
        # method-specific, so the method-selection option is dropped.
        try:
            picker_options = params['pick_otus'].copy()
            picker_options.pop('otu_picking_method', None)
            params_str += ' %s' % get_params_str(picker_options)
        except KeyError:
            pass
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' % (
            'parallel_pick_otus_%s.py' % otu_picking_method,
            input_fp, pick_otu_dir, refseqs_fp, params_str)
    else:
        try:
            params_str = get_params_str(params['pick_otus'])
        except KeyError:
            params_str = ''
        # Closed-reference picking must never create new clusters, so the
        # flag is always appended in this branch.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is "
            "closed-reference OTU picking.\n\n")
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' % (
            input_fp, pick_otu_dir, refseqs_fp, otu_picking_method,
            params_str)

    commands.append([('Pick OTUs', pick_otus_cmd)])

    # ---- Step 2 (optional): representative set + taxonomy assignment ----
    # When skipped, the caller-supplied taxonomy_fp (possibly None) is
    # used for the OTU table instead.
    if assign_taxonomy:
        # Representative set picking
        rep_set_dir = '%s/rep_set/' % output_dir
        create_dir(rep_set_dir)
        rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
        rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)

        try:
            params_str = get_params_str(params['pick_rep_set'])
        except KeyError:
            params_str = ''
        pick_rep_set_cmd = 'pick_rep_set.py -i %s -f %s -l %s -o %s %s' % (
            otu_fp, input_fp, rep_set_log_fp, rep_set_fp, params_str)
        commands.append([('Pick representative set', pick_rep_set_cmd)])

        # Taxonomy assignment
        try:
            assignment_method = params['assign_taxonomy']['assignment_method']
        except KeyError:
            assignment_method = 'uclust'
        assign_taxonomy_dir = '%s/%s_assigned_taxonomy' % (
            output_dir, assignment_method)
        # From here on taxonomy_fp names the assignment results, replacing
        # whatever the caller passed in.
        taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % (
            assign_taxonomy_dir, input_basename)

        if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
            # Start from the parallel-specific options...
            try:
                params_str = get_params_str(params['parallel'])
            except KeyError:
                params_str = ''
            # ...then add the assignment options. The parallel script is
            # method-specific, so --assignment_method is dropped.
            try:
                assigner_options = params['assign_taxonomy'].copy()
                assigner_options.pop('assignment_method', None)
                params_str += ' %s' % get_params_str(assigner_options)
            except KeyError:
                pass
            assign_taxonomy_cmd = \
                'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' % (
                    assignment_method, rep_set_fp, assign_taxonomy_dir,
                    params_str)
        else:
            try:
                params_str = get_params_str(params['assign_taxonomy'])
            except KeyError:
                params_str = ''
            assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' % (
                assign_taxonomy_dir, rep_set_fp, params_str)

        commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # ---- Step 3: OTU table ----
    otu_table_fp = '%s/otu_table.biom' % output_dir
    try:
        params_str = get_params_str(params['make_otu_table'])
    except KeyError:
        params_str = ''
    # taxonomy_fp is the assignment results from step 2, the caller's
    # precomputed assignments, or None (in which case -t is omitted).
    taxonomy_str = '-t %s' % taxonomy_fp if taxonomy_fp else ''
    make_otu_table_cmd = 'make_otu_table.py -i %s %s -o %s %s' % (
        otu_fp, taxonomy_str, otu_table_fp, params_str)

    commands.append([('Make OTU table', make_otu_table_cmd)])

    # Hand the whole command list over for execution.
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    suppress_taxa_summary=False,
    suppress_beta_diversity=False,
    suppress_alpha_diversity=False,
    suppress_group_significance=False,
    status_update_callback=print_to_stdout,
):
    """Run the core diversity analyses workflow and write an HTML index page.

    The workflow, in order:

    1. summarizes the input BIOM table (``biom summarize-table``);
    2. filters samples with fewer than ``sampling_depth`` observations;
    3. rarefies the filtered table to ``sampling_depth``;
    4. unless suppressed, runs beta diversity on the rarefied table plus
       distance boxplots for every category;
    5. unless suppressed, runs alpha rarefaction on the filtered table and
       compares alpha diversity across the categories;
    6. unless suppressed, builds taxa summary plots (overall and one set
       per category);
    7. unless suppressed, runs ``group_significance.py`` per category on
       the rarefied table;
    8. gzips the filtered and rarefied tables and writes
       ``<output_dir>/index.html`` linking to the results.

    A step whose expected output already exists is skipped and a note is
    written to the log, so the function can be re-run on the same
    ``output_dir``.

    Parameters
    ----------
    biom_fp : path to the input BIOM table.
    mapping_fp : path to the metadata mapping file.
    sampling_depth : int used both as the minimum per-sample count for
        filtering and as the rarefaction depth (formatted with ``%d``).
    output_dir : directory that receives all results; passed to
        ``create_dir``.
    qiime_config : passed through to the logger and the sub-workflows.
    command_handler : callable invoked with the list of commands, the
        status callback and the logger.
    tree_fp : optional tree path, forwarded to the beta diversity and alpha
        rarefaction workflows.
    params : parsed parameters; defaults to ``parse_qiime_parameters([])``.
    categories : optional list of mapping file column headers to analyze;
        ``None`` disables all per-category analyses.
    arare_min_rare_depth, arare_num_steps : forwarded to
        ``run_alpha_rarefaction`` as ``min_rare_depth`` and ``num_steps``.
    parallel : forwarded to the beta diversity and alpha rarefaction
        workflows.
    suppress_taxa_summary, suppress_beta_diversity,
    suppress_alpha_diversity, suppress_group_significance : skip the
        corresponding section when true.
    status_update_callback : forwarded to ``command_handler`` and the
        sub-workflows.

    Raises
    ------
    ValueError
        If a requested category is not a mapping file column header, or
        has only a single value.
    """
    if categories is not None:
        # Validate categories provided by the users. The mapping file is
        # opened in a ``with`` block so the handle is closed after parsing.
        with open(mapping_fp, "U") as mapping_f:
            mapping_data, mapping_comments = parse_mapping_file_to_dict(mapping_f)
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)" % (c, ", ".join(metadata_map.CategoryNames))
                )
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values." % c
                )
    else:
        categories = []
    comma_separated_categories = ",".join(categories)

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = "%s/index.html" % output_dir
    index_links = []
    commands = []

    # begin logging; logs from earlier runs are collected before the new log
    # path is generated so they can be linked as "previous" logs
    old_log_fps = glob(join(output_dir, "log_20*txt"))
    log_fp = generate_log_fp(output_dir)
    index_links.append(("Master run log", log_fp, _index_headers["run_summary"]))
    for old_log_fp in old_log_fps:
        index_links.append(("Previous run log", old_log_fp, _index_headers["run_summary"]))
    logger = WorkflowLogger(log_fp, params=params, qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params["biom-summarize-table"])
    except KeyError:
        params_str = ""
    biom_table_stats_output_fp = "%s/biom_table_summary.txt" % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = "biom summarize-table -i %s -o %s --suppress-md5 %s" % (
            biom_fp,
            biom_table_stats_output_fp,
            params_str,
        )
        commands.append([("Generate BIOM table summary", biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n" % biom_table_stats_output_fp)
    index_links.append(("BIOM table statistics", biom_table_stats_output_fp, _index_headers["run_summary"]))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = "filter_samples_from_otu_table.py -i %s -o %s -n %d" % (
            biom_fp,
            filtered_biom_fp,
            sampling_depth,
        )
        commands.append(
            [
                (
                    "Filter low sequence count samples from table (minimum sequence count: %d)" % sampling_depth,
                    filter_samples_cmd,
                )
            ]
        )
    else:
        logger.write("Skipping filter_samples_from_otu_table.py as %s exists.\n\n" % filtered_biom_fp)
    # every later step works from the filtered table
    biom_fp = filtered_biom_fp

    # rarify the BIOM table to sampling_depth
    rarefied_biom_fp = "%s/table_even%d.biom" % (output_dir, sampling_depth)
    if not exists(rarefied_biom_fp):
        single_rarefaction_cmd = "single_rarefaction.py -i %s -o %s -d %d" % (
            biom_fp,
            rarefied_biom_fp,
            sampling_depth,
        )
        commands.append([("Rarify the OTU table to %d sequences/sample" % sampling_depth, single_rarefaction_cmd)])
    else:
        logger.write("Skipping single_rarefaction.py as %s exists.\n\n" % rarefied_biom_fp)

    # run initial commands and reset the command list; the logger stays open
    # because the remaining steps keep writing to it
    if commands:
        command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = "%s/bdiv_even%d/" % (output_dir, sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob("%s/*_dm.txt" % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=rarefied_biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                # Note: we pass sampling depth=None here as
                # we rarify the BIOM table above and pass that
                # in here.
                sampling_depth=None,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write("Skipping beta_diversity_through_plots.py as %s exist(s).\n\n" % ", ".join(existing_dm_fps))
            # Recover the metric name by slicing off the "_dm.txt" suffix that
            # the glob pattern guarantees. str.strip() must not be used here:
            # it removes any of the given *characters* from both ends and
            # would mangle names such as "manhattan" or "chord".
            dm_suffix = "_dm.txt"
            even_dm_fps = [(split(fp)[1][: -len(dm_suffix)], fp) for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params["make_distance_boxplots"])
        except KeyError:
            params_str = ""

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = "%s/%s_boxplots/" % (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = "%s/%s_Distances.pdf" % (boxplots_output_dir, category)
                stats_output_fp = "%s/%s_Stats.txt" % (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = "make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s" % (
                        dm_fp,
                        category,
                        boxplots_output_dir,
                        mapping_fp,
                        params_str,
                    )
                    commands.append([("Boxplots (%s)" % category, boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n" % (category, plot_output_fp)
                    )
                index_links.append(
                    (
                        "Distance boxplots (%s)" % bdiv_metric,
                        plot_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )
                index_links.append(
                    (
                        "Distance boxplots statistics (%s)" % bdiv_metric,
                        stats_output_fp,
                        _index_headers["beta_diversity_even"] % sampling_depth,
                    )
                )

            index_links.append(
                (
                    "PCoA plot (%s)" % bdiv_metric,
                    "%s/%s_emperor_pcoa_plot/index.html" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Distance matrix (%s)" % bdiv_metric,
                    "%s/%s_dm.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )
            index_links.append(
                (
                    "Principal coordinate matrix (%s)" % bdiv_metric,
                    "%s/%s_pc.txt" % (bdiv_even_output_dir, bdiv_metric),
                    _index_headers["beta_diversity_even"] % sampling_depth,
                )
            )

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = "%s/arare_max%d/" % (output_dir, sampling_depth)
        rarefaction_plots_output_fp = "%s/alpha_rarefaction_plots/rarefaction_plots.html" % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback,
                retain_intermediate_files=False,
            )
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n" % rarefaction_plots_output_fp)
        index_links.append(("Alpha rarefaction plots", rarefaction_plots_output_fp, _index_headers["alpha_diversity"]))

        collated_alpha_diversity_fps = glob("%s/alpha_div_collated/*txt" % arare_full_output_dir)
        try:
            params_str = get_params_str(params["compare_alpha_diversity"])
        except KeyError:
            params_str = ""

        if len(categories) > 0:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
                compare_alpha_output_dir = "%s/compare_%s" % (arare_full_output_dir, alpha_metric)
                if not exists(compare_alpha_output_dir):
                    compare_alpha_cmd = "compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s" % (
                        collated_alpha_diversity_fp,
                        mapping_fp,
                        comma_separated_categories,
                        compare_alpha_output_dir,
                        params_str,
                    )
                    commands.append([("Compare alpha diversity (%s)" % alpha_metric, compare_alpha_cmd)])
                else:
                    logger.write(
                        "Skipping compare_alpha_diversity.py"
                        " for %s as %s exists.\n\n" % (alpha_metric, compare_alpha_output_dir)
                    )
                # Link the per-category results whether or not the comparison
                # was just scheduled: the index page is regenerated on every
                # run, so on a re-run results from an earlier run must still
                # be linked (as is done for every other analysis here).
                for category in categories:
                    alpha_comparison_stat_fp = "%s/%s_stats.txt" % (compare_alpha_output_dir, category)
                    alpha_comparison_boxplot_fp = "%s/%s_boxplots.pdf" % (compare_alpha_output_dir, category)
                    index_links.append(
                        (
                            "Alpha diversity statistics (%s, %s)" % (category, alpha_metric),
                            alpha_comparison_stat_fp,
                            _index_headers["alpha_diversity"],
                        )
                    )
                    index_links.append(
                        (
                            "Alpha diversity boxplots (%s, %s)" % (category, alpha_metric),
                            alpha_comparison_boxplot_fp,
                            _index_headers["alpha_diversity"],
                        )
                    )
        else:
            logger.write("Skipping compare_alpha_diversity.py as" " no categories were provided.\n\n")

    if not suppress_taxa_summary:
        taxa_plots_output_dir = "%s/taxa_plots/" % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated
        existing_taxa_plot_html_fps = glob(join(taxa_plots_output_dir, "taxa_summary_plots", "*.html"))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback,
            )
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ", ".join(existing_taxa_plot_html_fps)
            )
        index_links.append(
            (
                "Taxa summary bar plots",
                "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        index_links.append(
            (
                "Taxa summary area plots",
                "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                _index_headers["taxa_summary"],
            )
        )
        for category in categories:
            taxa_plots_output_dir = "%s/taxa_plots_%s/" % (output_dir, category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob("%s/taxa_summary_plots/*.html" % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback,
                )
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ", ".join(existing_taxa_plot_html_fps))
                )
            index_links.append(
                (
                    "Taxa summary bar plots",
                    "%s/taxa_summary_plots/bar_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )
            index_links.append(
                (
                    "Taxa summary area plots",
                    "%s/taxa_summary_plots/area_charts.html" % taxa_plots_output_dir,
                    _index_headers["taxa_summary_categorical"] % category,
                )
            )

    if not suppress_group_significance:
        # Guarded like every other parameter lookup in this function so a
        # plain dict without a 'group_significance' section also works.
        try:
            params_str = get_params_str(params["group_significance"])
        except KeyError:
            params_str = ""
        # group significance tests, aka category significance
        for category in categories:
            group_signifance_fp = "%s/group_significance_%s.txt" % (output_dir, category)
            if not exists(group_signifance_fp):
                # Build the group significance command
                group_significance_cmd = "group_significance.py -i %s -m %s -c %s -o %s %s" % (
                    rarefied_biom_fp,
                    mapping_fp,
                    category,
                    group_signifance_fp,
                    params_str,
                )
                commands.append([("Group significance (%s)" % category, group_significance_cmd)])
            else:
                logger.write(
                    "Skipping group_significance.py for %s as %s exists.\n\n" % (category, group_signifance_fp)
                )
            index_links.append(
                ("Category significance (%s)" % category, group_signifance_fp, _index_headers["group_significance"])
            )

    # compress the intermediate tables; the index links to the .gz files
    filtered_biom_gzip_fp = "%s.gz" % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([("Compress the filtered BIOM table", "gzip %s" % filtered_biom_fp)])
    else:
        logger.write("Skipping compressing of filtered BIOM table as %s exists.\n\n" % filtered_biom_gzip_fp)
    index_links.append(
        (
            "Filtered BIOM table (minimum sequence count: %d)" % sampling_depth,
            filtered_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )

    rarified_biom_gzip_fp = "%s.gz" % rarefied_biom_fp
    if not exists(rarified_biom_gzip_fp):
        commands.append([("Compress the rarified BIOM table", "gzip %s" % rarefied_biom_fp)])
    else:
        logger.write("Skipping compressing of rarified BIOM table as %s exists.\n\n" % rarified_biom_gzip_fp)
    index_links.append(
        (
            "Rarified BIOM table (sampling depth: %d)" % sampling_depth,
            rarified_biom_gzip_fp,
            _index_headers["run_summary"],
        )
    )

    # run whatever is still queued; if nothing is queued the logger has to be
    # closed here because no command handler call will do it
    if commands:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
def run_core_diversity_analyses(
        biom_fp,
        mapping_fp,
        sampling_depth,
        output_dir,
        qiime_config,
        command_handler=call_commands_serially,
        tree_fp=None,
        params=None,
        categories=None,
        arare_min_rare_depth=10,
        arare_num_steps=10,
        parallel=False,
        suppress_taxa_summary=False,
        suppress_beta_diversity=False,
        suppress_alpha_diversity=False,
        suppress_otu_category_significance=False,
        status_update_callback=print_to_stdout):
    """Run the core diversity analyses workflow and write an HTML index page.

    The workflow summarizes the input BIOM table, filters samples with
    fewer than ``sampling_depth`` observations, and then (unless the
    corresponding ``suppress_*`` flag is set) runs beta diversity with
    per-category distance boxplots, alpha rarefaction with per-category
    alpha diversity comparisons, taxa summary plots (overall and per
    category) and ``otu_category_significance.py`` per category.  Finally
    the filtered table is gzipped and ``<output_dir>/index.html`` is
    written with links to the results.

    A step whose expected output already exists is skipped and a note is
    written to the log.

    Parameters
    ----------
    biom_fp : path to the input BIOM table.
    mapping_fp : path to the metadata mapping file.
    sampling_depth : int used as the minimum per-sample count for
        filtering, as the even sampling depth passed to the beta diversity
        workflow and as the maximum alpha rarefaction depth.
    output_dir : directory that receives all results; passed to
        ``create_dir``.
    qiime_config : passed through to the logger and the sub-workflows.
    command_handler : callable invoked with the list of commands, the
        status callback and the logger.
    tree_fp : optional tree path, forwarded to the beta diversity and alpha
        rarefaction workflows.
    params : parsed parameters; defaults to ``parse_qiime_parameters([])``.
    categories : optional list of mapping file column headers to analyze;
        ``None`` disables all per-category analyses.
    arare_min_rare_depth, arare_num_steps : forwarded to
        ``run_alpha_rarefaction`` as ``min_rare_depth`` and ``num_steps``.
    parallel : forwarded to the beta diversity and alpha rarefaction
        workflows.
    suppress_taxa_summary, suppress_beta_diversity,
    suppress_alpha_diversity, suppress_otu_category_significance : skip
        the corresponding section when true.
    status_update_callback : forwarded to ``command_handler`` and the
        sub-workflows.

    Raises
    ------
    ValueError
        If a requested category is not a mapping file column header, or
        has only a single value.
    """
    if categories is not None:
        # Validate categories provided by the users. The mapping file is
        # opened in a ``with`` block so the handle is closed after parsing.
        with open(mapping_fp, 'U') as mapping_f:
            mapping_data, mapping_comments = \
                parse_mapping_file_to_dict(mapping_f)
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError(
                    "Category '%s' is not a column header "
                    "in your mapping file. "
                    "Categories are case and white space sensitive. Valid "
                    "choices are: (%s)"
                    % (c, ', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError(
                    "Category '%s' contains only one value. "
                    "Categories analyzed here require at least two values."
                    % c)
    else:
        categories = []

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])

    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []

    # begin logging; logs from earlier runs are collected before the new log
    # path is generated so they can be linked as "previous" logs
    old_log_fps = glob(join(output_dir, 'log_20*txt'))
    log_fp = generate_log_fp(output_dir)
    index_links.append(
        ('Master run log', log_fp, _index_headers['run_summary']))
    for old_log_fp in old_log_fps:
        index_links.append(
            ('Previous run log', old_log_fp, _index_headers['run_summary']))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp, mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger, input_fps)

    # run 'biom summarize-table' on input BIOM table
    try:
        params_str = get_params_str(params['biom-summarize-table'])
    except KeyError:
        params_str = ''
    biom_table_stats_output_fp = '%s/biom_table_summary.txt' % output_dir
    if not exists(biom_table_stats_output_fp):
        biom_table_summary_cmd = \
            "biom summarize-table -i %s -o %s --suppress-md5 %s" % \
            (biom_fp, biom_table_stats_output_fp, params_str)
        commands.append([('Generate BIOM table summary',
                          biom_table_summary_cmd)])
    else:
        logger.write("Skipping 'biom summarize-table' as %s exists.\n\n"
                     % biom_table_stats_output_fp)
    index_links.append(('BIOM table statistics',
                        biom_table_stats_output_fp,
                        _index_headers['run_summary']))

    # filter samples with fewer observations than the requested sampling_depth.
    # since these get filtered for some analyses (eg beta diversity after
    # even sampling) it's useful to filter them here so they're filtered
    # from all analyses.
    filtered_biom_fp = "%s/table_mc%d.biom" % (output_dir, sampling_depth)
    if not exists(filtered_biom_fp):
        filter_samples_cmd = \
            "filter_samples_from_otu_table.py -i %s -o %s -n %d" %\
            (biom_fp, filtered_biom_fp, sampling_depth)
        commands.append([
            ('Filter low sequence count samples from table (minimum sequence count: %d)' % sampling_depth,
             filter_samples_cmd)])
    else:
        logger.write(
            "Skipping filter_samples_from_otu_table.py as %s exists.\n\n"
            % filtered_biom_fp)
    # every later step works from the filtered table
    biom_fp = filtered_biom_fp

    # run initial commands and reset the command list; the logger stays open
    # because the remaining steps keep writing to it
    if commands:
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)
        commands = []

    if not suppress_beta_diversity:
        bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,
                                                    sampling_depth)
        # Need to check for the existence of any distance matrices, since the user
        # can select which will be generated.
        existing_dm_fps = glob('%s/*_dm.txt' % bdiv_even_output_dir)
        if len(existing_dm_fps) == 0:
            even_dm_fps = run_beta_diversity_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=bdiv_even_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                sampling_depth=sampling_depth,
                tree_fp=tree_fp,
                parallel=parallel,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping beta_diversity_through_plots.py as %s exist(s).\n\n"
                % ', '.join(existing_dm_fps))
            # Recover the metric name by slicing off the '_dm.txt' suffix that
            # the glob pattern guarantees. str.strip() must not be used here:
            # it removes any of the given *characters* from both ends and
            # would mangle names such as 'manhattan' or 'chord'.
            dm_suffix = '_dm.txt'
            even_dm_fps = [(split(fp)[1][:-len(dm_suffix)], fp)
                           for fp in existing_dm_fps]

        # Get make_distance_boxplots parameters
        try:
            params_str = get_params_str(params['make_distance_boxplots'])
        except KeyError:
            params_str = ''

        for bdiv_metric, dm_fp in even_dm_fps:
            for category in categories:
                boxplots_output_dir = '%s/%s_boxplots/' % \
                    (bdiv_even_output_dir, bdiv_metric)
                plot_output_fp = '%s/%s_Distances.pdf' % \
                    (boxplots_output_dir, category)
                stats_output_fp = '%s/%s_Stats.txt' % \
                    (boxplots_output_dir, category)
                if not exists(plot_output_fp):
                    boxplots_cmd = \
                        'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
                        (dm_fp, category, boxplots_output_dir,
                         mapping_fp, params_str)
                    commands.append([('Boxplots (%s)' % category,
                                      boxplots_cmd)])
                else:
                    logger.write(
                        "Skipping make_distance_boxplots.py for %s as %s exists.\n\n"
                        % (category, plot_output_fp))
                index_links.append(
                    ('Distance boxplots (%s)' % bdiv_metric,
                     plot_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))
                index_links.append(
                    ('Distance boxplots statistics (%s)' % bdiv_metric,
                     stats_output_fp,
                     _index_headers['beta_diversity_even'] % sampling_depth))

            index_links.append(
                ('PCoA plot (%s)' % bdiv_metric,
                 '%s/%s_emperor_pcoa_plot/index.html' %
                 (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Distance matrix (%s)' % bdiv_metric,
                 '%s/%s_dm.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))
            index_links.append(
                ('Principal coordinate matrix (%s)' % bdiv_metric,
                 '%s/%s_pc.txt' % (bdiv_even_output_dir, bdiv_metric),
                 _index_headers['beta_diversity_even'] % sampling_depth))

    if not suppress_alpha_diversity:
        # Alpha rarefaction workflow
        arare_full_output_dir = '%s/arare_max%d/' % (output_dir,
                                                     sampling_depth)
        rarefaction_plots_output_fp = \
            '%s/alpha_rarefaction_plots/rarefaction_plots.html' \
            % arare_full_output_dir
        if not exists(rarefaction_plots_output_fp):
            run_alpha_rarefaction(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=arare_full_output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                tree_fp=tree_fp,
                num_steps=arare_num_steps,
                parallel=parallel,
                logger=logger,
                min_rare_depth=arare_min_rare_depth,
                max_rare_depth=sampling_depth,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write("Skipping alpha_rarefaction.py as %s exists.\n\n"
                         % rarefaction_plots_output_fp)
        index_links.append(('Alpha rarefaction plots',
                            rarefaction_plots_output_fp,
                            _index_headers['alpha_diversity']))

        collated_alpha_diversity_fps = \
            glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
        try:
            params_str = get_params_str(params['compare_alpha_diversity'])
        except KeyError:
            params_str = ''
        for category in categories:
            for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
                alpha_metric = splitext(
                    split(collated_alpha_diversity_fp)[1])[0]
                alpha_comparison_output_fp = '%s/%s_%s.txt' % \
                    (arare_full_output_dir, category, alpha_metric)
                if not exists(alpha_comparison_output_fp):
                    compare_alpha_cmd = \
                        'compare_alpha_diversity.py -i %s -m %s -c %s -o %s -n 999 %s' %\
                        (collated_alpha_diversity_fp, mapping_fp, category,
                         alpha_comparison_output_fp, params_str)
                    commands.append([
                        ('Compare alpha diversity (%s, %s)' %
                         (category, alpha_metric),
                         compare_alpha_cmd)])
                else:
                    logger.write(
                        "Skipping compare_alpha_diversity.py for %s as %s exists.\n\n"
                        % (category, alpha_comparison_output_fp))
                index_links.append(
                    ('Alpha diversity statistics (%s, %s)' %
                     (category, alpha_metric),
                     alpha_comparison_output_fp,
                     _index_headers['alpha_diversity']))

    if not suppress_taxa_summary:
        taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
        # need to check for existence of any html files, since the user can
        # select only certain ones to be generated. The plots are written
        # below taxa_plots_output_dir (see the links added after this check),
        # so that is where existing output has to be looked for.
        existing_taxa_plot_html_fps = glob(
            join(taxa_plots_output_dir, 'taxa_summary_plots', '*.html'))
        if len(existing_taxa_plot_html_fps) == 0:
            run_summarize_taxa_through_plots(
                otu_table_fp=biom_fp,
                mapping_fp=mapping_fp,
                output_dir=taxa_plots_output_dir,
                mapping_cat=None,
                sort=True,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                logger=logger,
                suppress_md5=True,
                status_update_callback=status_update_callback)
        else:
            logger.write(
                "Skipping summarize_taxa_through_plots.py for as %s exist(s).\n\n"
                % ', '.join(existing_taxa_plot_html_fps))
        index_links.append(
            ('Taxa summary bar plots',
             '%s/taxa_summary_plots/bar_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        index_links.append(
            ('Taxa summary area plots',
             '%s/taxa_summary_plots/area_charts.html' % taxa_plots_output_dir,
             _index_headers['taxa_summary']))
        for category in categories:
            taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,
                                                           category)
            # need to check for existence of any html files, since the user can
            # select only certain ones to be generated
            existing_taxa_plot_html_fps = glob(
                '%s/taxa_summary_plots/*.html' % taxa_plots_output_dir)
            if len(existing_taxa_plot_html_fps) == 0:
                run_summarize_taxa_through_plots(
                    otu_table_fp=biom_fp,
                    mapping_fp=mapping_fp,
                    output_dir=taxa_plots_output_dir,
                    mapping_cat=category,
                    sort=True,
                    command_handler=command_handler,
                    params=params,
                    qiime_config=qiime_config,
                    logger=logger,
                    suppress_md5=True,
                    status_update_callback=status_update_callback)
            else:
                logger.write(
                    "Skipping summarize_taxa_through_plots.py for %s as %s exist(s).\n\n"
                    % (category, ', '.join(existing_taxa_plot_html_fps)))
            index_links.append(
                ('Taxa summary bar plots',
                 '%s/taxa_summary_plots/bar_charts.html'
                 % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))
            index_links.append(
                ('Taxa summary area plots',
                 '%s/taxa_summary_plots/area_charts.html'
                 % taxa_plots_output_dir,
                 _index_headers['taxa_summary_categorical'] % category))

    if not suppress_otu_category_significance:
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # OTU category significance
        for category in categories:
            category_signifance_fp = \
                '%s/category_significance_%s.txt' % (output_dir, category)
            if not exists(category_signifance_fp):
                # Build the OTU category significance command
                category_significance_cmd = \
                    'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
                    (biom_fp, mapping_fp, category,
                     category_signifance_fp, params_str)
                commands.append([
                    ('OTU category significance (%s)' % category,
                     category_significance_cmd)])
            else:
                logger.write(
                    "Skipping otu_category_significance.py for %s as %s exists.\n\n"
                    % (category, category_signifance_fp))
            index_links.append(('Category significance (%s)' % category,
                                category_signifance_fp,
                                _index_headers['otu_category_sig']))

    filtered_biom_gzip_fp = '%s.gz' % filtered_biom_fp
    if not exists(filtered_biom_gzip_fp):
        commands.append([('Compress the filtered BIOM table',
                          'gzip %s' % filtered_biom_fp)])
    else:
        logger.write(
            "Skipping compressing of filtered BIOM table as %s exists.\n\n"
            % filtered_biom_gzip_fp)
    # Link the compressed table whether or not it was compressed in this run:
    # the index page is regenerated on every run, so the link must not depend
    # on the gzip step having just been scheduled.
    index_links.append(
        ('Filtered BIOM table (minimum sequence count: %d)' % sampling_depth,
         filtered_biom_gzip_fp,
         _index_headers['run_summary']))

    # run whatever is still queued; if nothing is queued the logger has to be
    # closed here because no command handler call will do it
    if commands:
        command_handler(commands, status_update_callback, logger)
    else:
        logger.close()

    generate_index_page(index_links, index_fp)
def run_jackknifed_beta_diversity(otu_table_fp,
                                  tree_fp,
                                  seqs_per_sample,
                                  output_dir,
                                  command_handler,
                                  params,
                                  qiime_config,
                                  mapping_fp,
                                  parallel=False,
                                  logger=None,
                                  suppress_md5=False,
                                  status_update_callback=print_to_stdout,
                                  master_tree=None):
    """ Run the jackknifed beta diversity workflow

        The commands built by this function are:
          1) Compute beta diversity distance matrices from the full otu
             table (and tree, if applicable);
          2) Build rarefied OTU tables at seqs_per_sample;
         and then, for each beta diversity metric:
          3) Build UPGMA tree from the full distance matrix;
          4) Compute distance matrices for the rarefied OTU tables;
          5) Build UPGMA trees from the rarefied distance matrices;
          6) Build a consensus tree from the rarefied UPGMA trees;
          7) Compare the rarefied UPGMA trees to the master tree
             (tree_compare.py);
          8) Run principal coordinates on the rarefied distance matrices;
          9) Generate emperor plots from the principal coordinates.

        All commands are collected and passed to command_handler in a
        single call at the end.

        master_tree can be 'full' or 'consensus', default full; any other
         value raises RuntimeError.
        If logger is None a WorkflowLogger is created in output_dir and
         closed on success; a logger passed in is left open.
        Unless suppress_md5 is set, the input file paths are passed to
         log_input_md5s.
    """
    # Prepare some variables for the later steps
    if master_tree is None:
        master_tree = 'full'
    # Validate up front so that a bad value fails before any directory is
    # created or command is built.
    if master_tree not in ('full', 'consensus'):
        raise RuntimeError('master tree method "%s" not found' %
                           (master_tree, ))

    otu_table_basename = splitext(split(otu_table_fp)[1])[0]
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp, tree_fp])

    try:
        beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    except KeyError:
        beta_diversity_metrics = ['weighted_unifrac', 'unweighted_unifrac']

    # Prep the beta-diversity command
    try:
        params_str = get_params_str(params['beta_diversity'])
    except KeyError:
        params_str = ''
    if tree_fp:
        params_str = '%s -t %s' % (params_str, tree_fp)
    # Build the beta-diversity command
    beta_div_cmd = 'beta_diversity.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)
    commands.append([
        ('Beta Diversity (%s)' % ', '.join(beta_diversity_metrics),
         beta_div_cmd)
    ])

    # Prep rarefaction command
    rarefaction_dir = '%s/rarefaction/' % output_dir
    create_dir(rarefaction_dir)
    try:
        params_str = get_params_str(params['multiple_rarefactions_even_depth'])
    except KeyError:
        params_str = ''
    # Build the rarefaction command
    rarefaction_cmd = \
        'multiple_rarefactions_even_depth.py -i %s -d %d -o %s %s' %\
        (otu_table_fp, seqs_per_sample, rarefaction_dir, params_str)
    commands.append([('Rarefaction', rarefaction_cmd)])

    # Begin iterating over beta diversity distance metrics, if more than one
    # was provided
    for beta_diversity_metric in beta_diversity_metrics:
        metric_output_dir = '%s/%s/' % (output_dir, beta_diversity_metric)
        distance_matrix_fp = '%s/%s_%s.txt' % \
            (output_dir, beta_diversity_metric, otu_table_basename)

        # Prep the hierarchical clustering command (for full distance matrix)
        full_tree_fp = '%s/%s_upgma.tre' % (metric_output_dir,
                                            otu_table_basename)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for full distance matrix)
        hierarchical_cluster_cmd = 'upgma_cluster.py -i %s -o %s %s' %\
            (distance_matrix_fp, full_tree_fp, params_str)
        commands.append([
            ('UPGMA on full distance matrix: %s' % beta_diversity_metric,
             hierarchical_cluster_cmd)
        ])

        # Prep the beta diversity command (for rarefied OTU tables)
        dm_dir = '%s/rare_dm/' % metric_output_dir
        create_dir(dm_dir)
        # the metrics parameter needs to be ignored as we need to run
        # beta_diversity one metric at a time to keep the per-metric
        # output files in separate directories
        try:
            d = params['beta_diversity'].copy()
        except KeyError:
            # no beta_diversity parameters were supplied at all
            d = {}
        d.pop('metrics', None)
        params_str = get_params_str(d) + ' -m %s ' % beta_diversity_metric
        if tree_fp:
            params_str = '%s -t %s' % (params_str, tree_fp)
        if parallel:
            # Guarded like the other parameter lookups so a params dict
            # without a 'parallel' section does not abort the workflow.
            try:
                parallel_params_str = get_params_str(params['parallel'])
            except KeyError:
                parallel_params_str = ''
            params_str += ' %s' % parallel_params_str
            # Build the parallel beta diversity command (for rarefied OTU
            # tables)
            beta_div_rarefied_cmd = \
                'parallel_beta_diversity.py -T -i %s -o %s %s' %\
                (rarefaction_dir, dm_dir, params_str)
        else:
            # Build the serial beta diversity command (for rarefied OTU tables)
            beta_div_rarefied_cmd = \
                'beta_diversity.py -i %s -o %s %s' %\
                (rarefaction_dir, dm_dir, params_str)
        commands.append([
            ('Beta diversity on rarefied OTU tables (%s)' %
             beta_diversity_metric,
             beta_div_rarefied_cmd)
        ])

        # Prep the hierarchical clustering command (for rarefied
        # distance matrices)
        upgma_dir = '%s/rare_upgma/' % metric_output_dir
        create_dir(upgma_dir)
        try:
            params_str = get_params_str(params['upgma_cluster'])
        except KeyError:
            params_str = ''
        # Build the hierarchical clustering command (for rarefied
        # distance matrices)
        hierarchical_cluster_cmd =\
            'upgma_cluster.py -i %s -o %s %s' % (dm_dir, upgma_dir, params_str)
        commands.append([
            ('UPGMA on rarefied distance matrix (%s)' % beta_diversity_metric,
             hierarchical_cluster_cmd)
        ])

        # Build the consensus tree command
        # NOTE(review): params_str here still holds the upgma_cluster
        # parameters; this is kept as-is -- confirm whether consensus_tree.py
        # is meant to receive its own parameter section.
        consensus_tree_fp = metric_output_dir + "/rare_upgma_consensus.tre"
        consensus_tree_cmd =\
            'consensus_tree.py -i %s -o %s %s' %\
            (upgma_dir, consensus_tree_fp, params_str)
        commands.append([
            ('consensus on rarefied distance matrices (%s)' %
             beta_diversity_metric,
             consensus_tree_cmd)
        ])

        # Prep the tree compare command
        tree_compare_dir = '%s/upgma_cmp/' % metric_output_dir
        create_dir(tree_compare_dir)
        try:
            params_str = get_params_str(params['tree_compare'])
        except KeyError:
            params_str = ''

        # Build the tree compare command (master_tree was validated above)
        if master_tree == "full":
            master_tree_fp = full_tree_fp
        else:
            master_tree_fp = consensus_tree_fp
        tree_compare_cmd = 'tree_compare.py -s %s -m %s -o %s %s' %\
            (upgma_dir, master_tree_fp, tree_compare_dir, params_str)
        commands.append([('Tree compare (%s)' % beta_diversity_metric,
                          tree_compare_cmd)])

        # Prep the PCoA command
        pcoa_dir = '%s/pcoa/' % metric_output_dir
        create_dir(pcoa_dir)
        try:
            params_str = get_params_str(params['principal_coordinates'])
        except KeyError:
            params_str = ''
        # Build the PCoA command
        pcoa_cmd = 'principal_coordinates.py -i %s -o %s %s' %\
            (dm_dir, pcoa_dir, params_str)
        commands.append([
            ('Principal coordinates (%s)' % beta_diversity_metric, pcoa_cmd)
        ])

        # Prep the emperor plots command
        emperor_dir = '%s/emperor_pcoa_plots/' % metric_output_dir
        create_dir(emperor_dir)
        try:
            params_str = get_params_str(params['make_emperor'])
        except KeyError:
            params_str = ''
        emperor_cmd = 'make_emperor.py -i %s -o %s -m %s %s' %\
            (pcoa_dir, emperor_dir, mapping_fp, params_str)
        commands.append([
            ('emperor plots (%s)' % beta_diversity_metric, emperor_cmd)
        ])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def assign_tax(repset_fasta_fp,
               output_dir,
               command_handler,
               params,
               qiime_config,
               parallel=False,
               logger=None,
               status_update_callback=print_to_stdout):
    """Build and run the taxonomy assignment command for a rep set.

    The assignment method is read from
    params['assign_taxonomy']['assignment_method'] and falls back to
    'uclust' when that key is absent. When ``parallel`` is set and the
    method is rdp, blast or uclust, the method-specific
    parallel_assign_taxonomy_<method>.py script is used; otherwise
    assign_taxonomy.py is used.

    Any existing <output_dir>/<method>_assigned_taxonomy directory is
    deleted before the command is handed to ``command_handler``.

    If ``logger`` is None a WorkflowLogger is created here and the command
    handler is asked to close it on success; a caller-supplied logger is
    left open.

    Returns the path at which the taxonomy assignments are expected:
    <output_dir>/<method>_assigned_taxonomy/<rep set basename>_tax_assignments.txt
    """
    input_basename = splitext(split(repset_fasta_fp)[1])[0]

    # Only a logger created here is closed by the command handler.
    close_logger_on_success = logger is None
    if close_logger_on_success:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    # Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'

    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' % (output_dir,
                                                       assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % (assign_taxonomy_dir,
                                                 input_basename)

    if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # The parallel script is method-specific, so it does not take
            # an --assignment_method option: strip it from a copy so the
            # caller's params dict is left untouched.
            method_params = params['assign_taxonomy'].copy()
            method_params.pop('assignment_method', None)
            params_str += ' %s' % get_params_str(method_params)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = (
            'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %
            (assignment_method, repset_fasta_fp, assign_taxonomy_dir,
             params_str))
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''

        # Build the serial taxonomy assignment command
        assign_taxonomy_cmd = (
            'assign_taxonomy.py -o %s -i %s %s' %
            (assign_taxonomy_dir, repset_fasta_fp, params_str))

    # Start from a clean output directory for this method.
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)

    commands = [[('Assign taxonomy', assign_taxonomy_cmd)]]

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return taxonomy_fp
def run_summarize_taxa_through_plots(otu_table_fp,
                                     mapping_fp,
                                     output_dir,
                                     mapping_cat,
                                     sort,
                                     command_handler,
                                     params,
                                     qiime_config,
                                     logger=None,
                                     suppress_md5=False,
                                     status_update_callback=print_to_stdout):
    """Summarize taxonomy for an OTU table and plot the summaries.

    The commands built (in order) are:
        1) collapse_samples.py  - only when a mapping category is given,
           either via ``mapping_cat`` or via
           params['collapse_samples']['collapse_fields']
        2) sort_otu_table.py    - only when ``sort`` is true
        3) summarize_taxa.py
        4) plot_taxa_summary.py

    Per-script options are taken from the matching section of ``params``;
    a missing or unusable section silently yields no extra options.

    ``output_dir`` and ``output_dir``/taxa_summary_plots/ are created.
    If ``logger`` is None a WorkflowLogger is created here and the command
    handler is asked to close it on success. Nothing is returned.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [otu_table_fp, mapping_fp])

    def _params_str_for(section):
        # Best-effort lookup of a script's options. ``except Exception``
        # (rather than a bare except) keeps the "no options" fallback while
        # letting KeyboardInterrupt/SystemExit propagate.
        try:
            return get_params_str(params[section])
        except Exception:
            return ''

    # if mapping category not passed via command-line,
    # check if it is passed in params file
    if not mapping_cat:
        try:
            mapping_cat = params['collapse_samples']['collapse_fields']
        except Exception:
            mapping_cat = None

    try:
        params_str = get_params_str(params['collapse_samples'])
        # collapse_fields is passed explicitly on the command line below, so
        # drop it from the option string. This is done on the string rather
        # than the dict so the caller's params are not modified.
        params_str = '--'.join(
            [option for option in params_str.split('--')
             if not option.startswith('collapse_fields')])
    except Exception:
        params_str = ''

    if mapping_cat:
        base_filename = mapping_cat.replace(' ', '-').replace(',', '')
        output_biom_fp = join(output_dir,
                              '%s_otu_table.biom' % base_filename)
        output_map_fp = join(output_dir, '%s_map.txt' % base_filename)

        # Build the collapse samples command
        collapse_samples_cmd = \
            "collapse_samples.py -m %s -b %s --output_biom_fp %s --output_mapping_fp %s --collapse_fields '%s' %s" %\
            (mapping_fp, otu_table_fp, output_biom_fp, output_map_fp,
             mapping_cat, params_str)
        commands.append([('Collapse samples in OTU table by categories',
                          collapse_samples_cmd)])

        # Downstream steps operate on the collapsed table.
        otu_table_fp = output_biom_fp

    # Build the sort OTU table command
    if sort:
        params_str = _params_str_for('sort_otu_table')

        # define output otu table
        sorted_fp = join(output_dir,
                         splitext(split(otu_table_fp)[-1])[0] + '_sorted.biom')

        if mapping_cat or params_str == '':
            # for this case we don't have a collapsed mapping file so must
            # handle separately
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s" % (otu_table_fp, sorted_fp)
        else:
            sort_otu_table_cmd = \
                "sort_otu_table.py -i %s -o %s -m %s %s" %\
                (otu_table_fp, sorted_fp, mapping_fp, params_str)

        commands.append([('Sort OTU Table', sort_otu_table_cmd)])

        # Downstream steps operate on the sorted table.
        otu_table_fp = sorted_fp

    # Build the summarize taxonomy command
    params_str = _params_str_for('summarize_taxa')
    try:
        sum_taxa_levels = params['summarize_taxa']['level']
    except Exception:
        sum_taxa_levels = None

    summarize_taxa_cmd = 'summarize_taxa.py -i %s -o %s %s' %\
        (otu_table_fp, output_dir, params_str)
    commands.append([('Summarize Taxonomy', summarize_taxa_cmd)])

    # Work out the per-level summary files that are passed on to the
    # plotting script.
    if sum_taxa_levels:
        levels = sum_taxa_levels.split(',')
    else:
        # this is the default levels from summarize_taxa, but cannot import
        # script to get these values
        levels = [2, 3, 4, 5, 6]
    basename = join(output_dir, splitext(split(otu_table_fp)[-1])[0])
    sum_taxa_fps = [basename + '_L%s.txt' % str(level) for level in levels]

    # Prep the plot taxa summary plot command(s)
    taxa_summary_plots_dir = '%s/taxa_summary_plots/' % output_dir
    create_dir(taxa_summary_plots_dir)
    params_str = _params_str_for('plot_taxa_summary')

    # Build the plot taxa summary plot command(s)
    plot_taxa_summary_cmd =\
        'plot_taxa_summary.py -i %s -o %s %s' %\
        (','.join(sum_taxa_fps), taxa_summary_plots_dir, params_str)
    commands.append([('Plot Taxonomy Summary', plot_taxa_summary_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=None,
        min_otu_size=2,
        run_assign_tax=True,
        run_align_and_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        suppress_md5=False,
        denovo_otu_picking_method='uclust',
        reference_otu_picking_method='uclust_ref',
        status_update_callback=print_to_stdout,
        minimum_failure_threshold=100000):
    """Run pick_subsampled_open_reference_otus over several inputs.

    Each entry of ``input_fps`` is processed in its own numbered
    subdirectory of ``output_dir`` (0, 1, ...). The new_refseqs.fna written
    by one iteration becomes the reference set of the next one, while the
    prefilter keeps using ``prefilter_refseqs_fp`` (or the original
    ``refseqs_fp`` when that is None). Iterations for which
    iteration_output_exists() reports existing output are skipped, which
    allows failed runs to be continued.

    After all iterations the per-iteration OTU tables are merged, a master
    rep set is built and, depending on ``run_assign_tax`` and
    ``run_align_and_tree``, taxonomy is added to the merged table and OTUs
    that failed PyNAST alignment are filtered from it. Final outputs that
    already exist and are non-empty are not rebuilt.

    If ``logger`` is None a WorkflowLogger is created here and closed
    before returning; a caller-supplied logger is left open. Nothing is
    returned.
    """
    create_dir(output_dir)
    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input data as lower percent identity
    # searches with uclust can be slow, so we want the reference collection
    # to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            # NOTE(review): suppress_index_page and
            # minimum_failure_threshold are not parameters of every
            # pick_subsampled_open_reference_otus definition - confirm the
            # definition in effect accepts them.
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id, str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_assign_tax=False,
                run_align_and_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                suppress_md5=suppress_md5,
                suppress_index_page=True,
                denovo_otu_picking_method=denovo_otu_picking_method,
                reference_otu_picking_method=reference_otu_picking_method,
                status_update_callback=status_update_callback,
                minimum_failure_threshold=minimum_failure_threshold)

        # perform post-iteration file shuffling whether the previous
        # iteration's data previously existed or was just computed.

        # step1 otu map and failures can only be used for the first
        # iteration as subsequent iterations need to use updated refseqs
        # files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        otu_table_fps.append(
            '%s/otu_table_mc%d.biom' % (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has
    # historically been a frequent failure, so is sometimes run manually in
    # failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
            (','.join(otu_table_fps), otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,
                                            final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (
                output_dir, min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (
                output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
                getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures. The failures file is
            # kept open only while its ids are consumed, then closed.
            table = load_table(align_and_tree_input_otu_table)
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf,
                    negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    # Only close a logger created here; a caller-supplied logger stays open
    # for the caller to keep using.
    if close_logger_on_success:
        logger.close()
def pick_subsampled_open_reference_otus(
    input_fp,
    refseqs_fp,
    output_dir,
    percent_subsample,
    new_ref_set_id,
    command_handler,
    params,
    qiime_config,
    prefilter_refseqs_fp=None,
    run_assign_tax=True,
    run_align_and_tree=True,
    prefilter_percent_id=0.60,
    min_otu_size=2,
    step1_otu_map_fp=None,
    step1_failures_fasta_fp=None,
    parallel=False,
    suppress_step4=False,
    logger=None,
    suppress_md5=False,
    denovo_otu_picking_method="uclust",
    reference_otu_picking_method="uclust_ref",
    status_update_callback=print_to_stdout,
):
    """Run the subsampled open-reference OTU picking workflow.

    The steps performed by this function are:
      - (optional prefilter) discard input sequences that fail to hit
        prefilter_refseqs_fp at prefilter_percent_id.
      - Step 1: pick reference OTUs against refseqs_fp, unless both
        step1_otu_map_fp and step1_failures_fasta_fp are supplied.
      - Step 2: subsample the step 1 failures (percent_subsample), pick
        OTUs de novo on the subsample and pick a rep set for them.
      - Step 3: pick reference OTUs on all step 1 failures using the
        step 2 rep set as the reference.
      - Step 4 (unless suppress_step4): pick OTUs de novo on the step 3
        failures.
      - Merge the OTU maps, filter OTUs below min_otu_size, write the final
        rep set (rep_set.fna) and a new reference set (new_refseqs.fna),
        and build the OTU table.
      - Optionally assign taxonomy and/or align, build a tree and filter
        PyNAST failures from the OTU table.

    If ``logger`` is None a WorkflowLogger is created here and closed
    before returning; a caller-supplied logger is left open.

    Raises AssertionError for an unknown OTU picking method and ValueError
    if the prefilter discards every input sequence. Nothing is returned.
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ["uclust", "usearch61"]
    allowed_reference_otu_picking_methods = ["uclust_ref", "usearch61_ref"]
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods, (
        "Unknown de novo OTU picking method: %s. Known methods are: %s"
        % (denovo_otu_picking_method,
           ",".join(allowed_denovo_otu_picking_methods))
    )
    assert reference_otu_picking_method in \
        allowed_reference_otu_picking_methods, (
            "Unknown reference OTU picking method: %s. Known methods are: %s"
            % (reference_otu_picking_method,
               ",".join(allowed_reference_otu_picking_methods))
        )

    # Prepare some variables for the later steps
    input_basename, input_ext = splitext(split(input_fp)[1])
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp, refseqs_fp, step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = "%s/step1_otus" % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = "%s/prefilter_otus/" % output_dir
            prefilter_failures_list_fp = "%s/%s_failures.txt" % (
                prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp,
                prefilter_dir,
                reference_otu_picking_method,
                prefilter_refseqs_fp,
                parallel,
                params,
                logger,
                prefilter_percent_id,
            )
            commands.append([("Pick Reference OTUs (prefilter)",
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = "%s/prefiltered_%s%s" % (
                prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = "filter_fasta.py -f %s -o %s -s %s -n" % (
                input_fp,
                prefiltered_input_fp,
                prefilter_failures_list_fp,
            )
            commands.append([("Filter prefilter failures from input",
                              filter_fasta_cmd)])

            # Call the command handler on the list of commands
            command_handler(commands, status_update_callback, logger=logger,
                            close_logger_on_success=False)
            commands = []

            # All later steps work on the prefiltered sequences.
            input_fp = prefiltered_input_fp
            input_basename, input_ext = splitext(split(input_fp)[1])
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?"
                )

        # Build the OTU picking command
        step1_dir = "%s/step1_otus" % output_dir
        step1_otu_map_fp = "%s/%s_otus.txt" % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method, refseqs_fp,
            parallel, params, logger
        )
        commands.append([("Pick Reference OTUs", step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = "%s/%s_failures.txt" % (step1_dir,
                                                         input_basename)
        step1_failures_fasta_fp = "%s/failures.fasta" % step1_dir
        step1_filter_fasta_cmd = "filter_fasta.py -f %s -s %s -o %s" % (
            input_fp,
            step1_failures_list_fp,
            step1_failures_fasta_fp,
        )
        commands.append([("Generate full failures fasta file",
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands, status_update_callback, logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = "%s/step1_rep_set.fna" % step1_dir
    step1_pick_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (
        step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([("Pick rep set", step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = "%s/subsampled_failures.fasta" % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    # The closing double quote of the logged command belongs before the
    # trailing newlines so the logged line can be copied and run as-is.
    logger.write(
        "# Subsample the failures fasta file using API \n"
        + 'python -c "import qiime; qiime.util.subsample_fasta'
        + "('%s', '%s', '%f')\"\n\n"
        % (abspath(step1_failures_fasta_fp), abspath(step2_input_fasta_fp),
           percent_subsample)
    )

    # Prep the OTU picking command for the subsampled failures
    step2_dir = "%s/step2_otus/" % output_dir
    step2_cmd = pick_denovo_otus(
        step2_input_fasta_fp, step2_dir, new_ref_set_id,
        denovo_otu_picking_method, params, logger
    )
    step2_otu_map_fp = "%s/subsampled_failures_otus.txt" % step2_dir

    commands.append([("Pick de novo OTUs for new clusters", step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = "%s/step2_rep_set.fna" % step2_dir
    step2_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (
        step2_otu_map_fp,
        step2_repset_fasta_fp,
        step2_input_fasta_fp,
    )
    commands.append([("Pick representative set for subsampled failures",
                      step2_rep_set_cmd)])

    step3_dir = "%s/step3_otus/" % output_dir
    step3_otu_map_fp = "%s/failures_otus.txt" % step3_dir
    step3_failures_list_fp = "%s/failures_failures.txt" % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger,
    )

    commands.append([("Pick reference OTUs using de novo rep set",
                      step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = "%s/final_otu_map.txt" % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = "%s/failures_failures.fasta" % step3_dir
        step3_filter_fasta_cmd = "filter_fasta.py -f %s -s %s -o %s" % (
            step1_failures_fasta_fp,
            step3_failures_list_fp,
            step3_failures_fasta_fp,
        )
        commands.append([("Create fasta file of step3 failures",
                          step3_filter_fasta_cmd)])

        step4_dir = "%s/step4_otus/" % output_dir
        step4_cmd = pick_denovo_otus(
            step3_failures_fasta_fp,
            step4_dir,
            ".".join([new_ref_set_id, "CleanUp"]),
            denovo_otu_picking_method,
            params,
            logger,
        )
        step4_otu_map_fp = "%s/failures_failures_otus.txt" % step4_dir
        commands.append([("Pick de novo OTUs on step3 failures", step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = "cat %s %s %s > %s" % (
            step1_otu_map_fp,
            step3_otu_map_fp,
            step4_otu_map_fp,
            merged_otu_map_fp,
        )
        commands.append([("Merge OTU maps", cat_otu_tables_cmd)])

        step4_repset_fasta_fp = "%s/step4_rep_set.fna" % step4_dir
        step4_rep_set_cmd = "pick_rep_set.py -i %s -o %s -f %s" % (
            step4_otu_map_fp,
            step4_repset_fasta_fp,
            step3_failures_fasta_fp,
        )
        commands.append([("Pick representative set for subsampled failures",
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that was
        # previously created
        cat_otu_tables_cmd = "cat %s %s > %s" % (
            step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([("Merge OTU maps", cat_otu_tables_cmd)])

    # Move the step 3 failures file to the top-level directory
    commands.append(
        [
            (
                "Move final failures file to top-level directory",
                "mv %s %s/final_failures.txt" % (step3_failures_list_fp,
                                                 output_dir),
            )
        ]
    )

    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp

    # Filter singletons from the otu map
    otu_no_singletons_fp = "%s/final_otu_map_mc%d.txt" % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    logger.write(
        "# Filter singletons from the otu map using API \n"
        + 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map'
        + "('%s', '%s', '%d')\"\n\n"
        % (abspath(otu_fp), abspath(otu_no_singletons_fp), min_otu_size)
    )

    def _write_kept_seqs(fasta_fp, out_files):
        # Copy every record whose OTU id (first token of the header) is in
        # otus_to_keep to each handle in out_files; the input is closed as
        # soon as it has been read.
        with open(fasta_fp, "U") as fasta_f:
            for otu_id, seq in MinimalFastaParser(fasta_f):
                if otu_id.split()[0] in otus_to_keep:
                    for out_f in out_files:
                        out_f.write(">%s\n%s\n" % (otu_id, seq))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were
    # observed in this data set, and we want reps for the step1 reference
    # otus to be reads from this run so we don't hit issues building a tree
    # using sequences of very different lengths. so...
    final_repset_fp = "%s/rep_set.fna" % output_dir
    new_refseqs_fp = "%s/new_refseqs.fna" % output_dir
    with open(final_repset_fp, "w") as final_repset_f:
        # write non-singleton otus representative sequences from step1 to
        # the final rep set file
        _write_kept_seqs(step1_repset_fasta_fp, (final_repset_f,))
        logger.write(
            "# Write non-singleton otus representative sequences "
            + "from step1 to the final rep set file: %s\n\n" % final_repset_fp
        )

        # copy the full input refseqs file to the new refseqs_fp
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, "a") as new_refseqs_f:
            new_refseqs_f.write("\n")
            logger.write(
                "# Copy the full input refseqs file to the new refseq file\n"
                + "cp %s %s\n\n" % (refseqs_fp, new_refseqs_fp)
            )

            # write the step2 (and step4) representative sequences of
            # non-singleton otus to both the new reference set and the
            # final representative set.
            _write_kept_seqs(step2_repset_fasta_fp,
                             (new_refseqs_f, final_repset_f))
            if not suppress_step4:
                _write_kept_seqs(step4_repset_fasta_fp,
                                 (new_refseqs_f, final_repset_f))

    logger.write(
        "# Write non-singleton otus representative sequences from "
        + "step 2 and step 4 to the final representative set and the new reference"
        + " set (%s and %s respectively)\n\n" % (final_repset_fp,
                                                 new_refseqs_fp)
    )

    # Prep the make_otu_table.py command
    otu_table_fp = "%s/otu_table_mc%d.biom" % (output_dir, min_otu_size)
    make_otu_table_cmd = "make_otu_table.py -i %s -o %s" % (
        otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands, status_update_callback, logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what combination
    # of taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = "%s/otu_table_mc%d_w_tax.biom" % (
            output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            "%s/otu_table_mc%d_w_tax_no_pynast_failures.biom" % (
                output_dir,
                min_otu_size,
            )
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = "%s/otu_table_mc%d_w_tax.biom" % (
            output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            "%s/otu_table_mc%d_no_pynast_failures.biom" % (
                output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback,
            )

            # Add taxa to otu table
            add_metadata_cmd = (
                "biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy"
                % (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            )
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands, status_update_callback, logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and \
                getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback,
            )

            # Build OTU table without PyNAST failures. Both inputs stay
            # open only while they are being consumed.
            with open(align_and_tree_input_otu_table, "U") as otu_table_in_f:
                with open(pynast_failures_fp, "U") as pynast_failures_f:
                    filtered_otu_table = filter_otus_from_otu_table(
                        parse_biom_table(otu_table_in_f),
                        get_seq_ids_from_fasta_file(pynast_failures_f),
                        0,
                        inf,
                        0,
                        inf,
                        negate_ids_to_keep=True,
                    )
            with open(pynast_failure_filtered_otu_table_fp, "w") as \
                    otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands, status_update_callback, logger=logger,
                            close_logger_on_success=False)
            commands = []

    # Only close a logger that was created here.
    if close_logger_on_success:
        logger.close()
def run_ampliconnoise(mapping_fp,
                      output_dir,
                      command_handler,
                      params,
                      qiime_config,
                      logger=None,
                      status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228,
                      chimera_beta=0.6200,
                      sff_txt_fp=None,
                      numnodes=2,
                      suppress_perseus=True,
                      output_filepath=None,
                      platform='flx',
                      seqnoise_resolution=None,
                      truncate_len=None):
    """Run the ampliconnoise pipeline.

    The steps performed by this function are:
        1. Split input sff.txt file into one file per sample
        2. Run scripts required for PyroNoise
        3. Run scripts required for SeqNoise
        4. Run scripts required for Perseus (chimera removal), unless
           suppress_perseus is set
        5. Merge output files into one file similar to the output of
           split_libraries.py

    output_filepath should be an absolute path.
    seqnoise_resolution and truncate_len should be strings (they are
    concatenated into the command lines); when None they default per
    platform: 'flx' -> '30.0' / '220', 'titanium' -> '25.0' / '400'.

    The process working directory is changed to output_dir and is not
    restored. map.csv is written into output_dir.

    Environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as
    they may not inherit the correct environment variable setting.

    Raises RuntimeError if a default is needed for an unknown platform, if
    the mapping file does not contain exactly one distinct primer, or if
    the platform is not supported.
    """
    # Close the mapping file as soon as it has been parsed.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform == 'flx':
            seqnoise_resolution = '30.0'
        elif platform == 'titanium':
            seqnoise_resolution = '25.0'
        else:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)

    if truncate_len is None:
        if platform == 'flx':
            truncate_len = '220'
        elif platform == 'titanium':
            truncate_len = '400'
        else:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[headers.index('SampleID')])
        bc_seqs.append(row[headers.index('BarcodeSequence')])
        # Expand IUPAC ambiguity codes into bracketed character classes,
        # e.g. a code mapping to bases "AG" becomes "[AG]".
        primer = row[headers.index('LinkerPrimerSequence')]
        for char, bases in IUPAC_DNA_ambiguities.items():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # Resolve map.csv before changing directory: joining a relative
    # output_dir after the chdir would point at output_dir/output_dir.
    map_csv_fp = os.path.join(os.path.abspath(output_dir), 'map.csv')

    # execute commands in output_dir
    called_dir = os.getcwd()
    os.chdir(output_dir)
    with open(map_csv_fp, 'w') as map_csv_f:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            map_csv_f.write(sample_name + ',' + bc_seq + '\n')

    # Use truthiness once so the choice of result files and the decision to
    # run Perseus can never disagree (e.g. for suppress_perseus=None).
    run_perseus = not suppress_perseus

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if run_perseus:
        fasta_result_names = [sample_name + '_Good.fa'
                              for sample_name in sample_names]
    else:
        fasta_result_names = [
            sample_name + post_pyro_tail + '_seqnoise_cd.fa'
            for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    mpirun = "mpirun -np " + str(numnodes)
    for i, sample_name in enumerate(sample_names):
        # Files produced after truncation share this stem.
        truncated = sample_name + post_pyro_tail

        # Build the platform-specific flow cleaning command
        if platform == 'flx':
            cmd = 'Clean360.pl ' + one_primer + ' ' + sample_name + ' < ' +\
                sample_name + '.raw'
            commands.append([('clean flows ' + sample_name, cmd)])
        elif platform == 'titanium':
            cmd = 'CleanMinMax.pl ' + one_primer + ' ' + sample_name +\
                ' < ' + sample_name + '.raw'
            commands.append([('clean flows ' + sample_name, cmd)])
        else:
            raise RuntimeError("platform " + platform + " not supported")

        cmd = mpirun + " PyroDist -in " +\
            sample_name + ".dat -out " + sample_name +\
            " > " + sample_name + ".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name +\
            " > " + sample_name + ".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = mpirun + " PyroNoise -din " +\
            sample_name + ".dat -out " +\
            sample_name + "_pyronoise " + "-lin " +\
            sample_name + ".list -s 60.0 -c 0.01 > " +\
            sample_name + "_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + bc_seqs[i] + one_primer + ' ' + truncate_len +\
            ' < ' + sample_name + '_pyronoise_cd.fa' + ' > ' +\
            sample_name + '_' + truncate_len + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # now start with post_pyro_tail
        cmd = mpirun + " SeqDist -in " +\
            truncated + ".fa > " + truncated + ".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in " + truncated + ".seqdist -out " +\
            truncated + "fcl > " + truncated + ".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping
        # -s 30.0 -c 0.08 > PC.354_pyronoise_cd.snout
        cmd = mpirun + " SeqNoise -in " +\
            truncated + ".fa -din " + truncated + ".seqdist -out " +\
            truncated + "_seqnoise -lin " + truncated + 'fcl.list -min ' +\
            sample_name + '_pyronoise' +\
            '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +\
            truncated + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if run_perseus:
            cmd = 'Perseus -sin ' + truncated + '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + truncated + '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = '%s %s/unweight_fasta.py -i %s -o %s -l %s' %\
            (python_exe_fp, script_dir, fasta_result_names[i],
             sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([name + '_unw.fna' for name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
def assign_taxonomy_multiple_times(input_dirs, output_dir, assignment_methods,
        reference_seqs_fp, id_to_taxonomy_fp,
        confidences=None, e_values=None, rtax_modes=None,
        uclust_min_consensus_fractions=None, uclust_similarities=None,
        uclust_max_accepts=None, input_fasta_filename='rep_set.fna',
        clean_otu_table_filename='otu_table_mc2_no_pynast_failures.biom',
        read_1_seqs_filename='seqs1.fna', read_2_seqs_filename='seqs2.fna',
        rtax_read_id_regexes=None, rtax_amplicon_id_regexes=None,
        rtax_header_id_regexes=None, rdp_max_memory=4000,
        command_handler=call_commands_serially,
        status_update_callback=no_status_updates, force=False):
    """Run one or more taxonomy assignment methods over several datasets.

    All arguments are sanity-checked first, so that a bad method name, a
    missing method parameter or a missing input directory is reported
    before any directory or log file is created and before any (possibly
    long-running) command is executed. Then, for every input directory and
    every assignment method, the commands are generated and handed to
    command_handler one at a time; the wall-clock time of each command is
    written to the workflow log.

    Parameters:
        input_dirs: sequence of dataset directories. Each must exist; the
            last path component names the per-dataset output subdirectory.
        output_dir: top-level output directory. Created here; if it
            already exists and force is False a WorkflowError is raised.
        assignment_methods: iterable of method names; supported values
            are 'rdp', 'blast', 'mothur', 'rtax' and 'uclust'.
        reference_seqs_fp, id_to_taxonomy_fp: passed through to the
            command generators.
        confidences: required for 'rdp' and 'mothur'.
        e_values: required for 'blast'.
        rtax_modes: required for 'rtax'; each entry must be 'single' or
            'paired'.
        uclust_min_consensus_fractions, uclust_similarities,
        uclust_max_accepts: all required for 'uclust'.
        input_fasta_filename, clean_otu_table_filename,
        read_1_seqs_filename, read_2_seqs_filename: file names that are
            joined onto each input directory.
        rtax_read_id_regexes, rtax_amplicon_id_regexes,
        rtax_header_id_regexes: optional per-input-directory lists; when
            given, each must have one entry per input directory.
        rdp_max_memory: passed to the RDP command generator.
        command_handler: callable invoked as
            command_handler([command], status_update_callback, logger,
            close_logger_on_success=False).
        status_update_callback: passed through to command_handler.
        force: when True, an existing output_dir is tolerated.

    Raises:
        WorkflowError: on any invalid argument, a missing input directory,
            or a pre-existing output directory without force.
    """
    # Fail fast: nothing below this block should be reachable with
    # arguments that are known to be unusable.
    _check_taxonomy_assignment_args(
        assignment_methods, confidences, e_values, rtax_modes,
        uclust_min_consensus_fractions, uclust_similarities,
        uclust_max_accepts)

    # We're going to zip these with the input directories.
    num_dirs = len(input_dirs)
    if rtax_read_id_regexes is None:
        rtax_read_id_regexes = [None] * num_dirs
    if rtax_amplicon_id_regexes is None:
        rtax_amplicon_id_regexes = [None] * num_dirs
    if rtax_header_id_regexes is None:
        rtax_header_id_regexes = [None] * num_dirs

    if num_dirs != len(rtax_read_id_regexes) or \
       num_dirs != len(rtax_amplicon_id_regexes) or \
       num_dirs != len(rtax_header_id_regexes):
        raise WorkflowError("The number of RTAX regular expressions must "
                            "match the number of input directories.")

    # Make sure every input dataset directory exists before starting work
    # on any of them.
    for input_dir in input_dirs:
        if not isdir(input_dir):
            raise WorkflowError("The input dataset directory '%s' does not "
                                "exist." % input_dir)

    # Check if output directory exists
    try:
        create_dir(output_dir, fail_on_exist=not force)
    except OSError:
        raise WorkflowError("Output directory '%s' already exists. Please "
                "choose a different directory, or force overwrite with -f."
                % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    for input_dir, rtax_read_id_regex, rtax_amplicon_id_regex, \
            rtax_header_id_regex in zip(input_dirs, rtax_read_id_regexes,
                                        rtax_amplicon_id_regexes,
                                        rtax_header_id_regexes):
        input_dir_name = split(normpath(input_dir))[1]
        output_dataset_dir = join(output_dir, input_dir_name)
        input_fasta_fp = join(input_dir, input_fasta_filename)
        clean_otu_table_fp = join(input_dir, clean_otu_table_filename)
        read_1_seqs_fp = join(input_dir, read_1_seqs_filename)
        read_2_seqs_fp = join(input_dir, read_2_seqs_filename)

        logger.write("\nCreating output subdirectory '%s' if it doesn't "
                     "already exist.\n" % output_dataset_dir)
        create_dir(output_dataset_dir)

        for method in assignment_methods:
            if method == 'rdp':
                commands = _generate_rdp_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp,
                        id_to_taxonomy_fp, clean_otu_table_fp, confidences,
                        rdp_max_memory=rdp_max_memory)
            elif method == 'blast':
                commands = _generate_blast_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp,
                        id_to_taxonomy_fp, clean_otu_table_fp, e_values)
            elif method == 'mothur':
                commands = _generate_mothur_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp,
                        id_to_taxonomy_fp, clean_otu_table_fp, confidences)
            elif method == 'rtax':
                commands = _generate_rtax_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp,
                        id_to_taxonomy_fp, clean_otu_table_fp, rtax_modes,
                        read_1_seqs_fp, read_2_seqs_fp, rtax_read_id_regex,
                        rtax_amplicon_id_regex, rtax_header_id_regex)
            elif method == 'uclust':
                commands = _generate_uclust_commands(output_dataset_dir,
                        input_fasta_fp, reference_seqs_fp,
                        id_to_taxonomy_fp, clean_otu_table_fp,
                        uclust_min_consensus_fractions, uclust_similarities,
                        uclust_max_accepts)
            else:
                # Defensive only: the up-front check rejects unknown
                # methods already.
                raise WorkflowError("Unrecognized or unsupported taxonomy "
                                    "assignment method '%s'." % method)

            # send command for current method to command handler
            for command in commands:
                start = time()

                # call_commands_serially needs a list of commands so here's
                # a length one command list.
                command_handler([command], status_update_callback, logger,
                                close_logger_on_success=False)
                end = time()
                logger.write('Time (s): %d\n\n' % (end - start))

    logger.close()


def _check_taxonomy_assignment_args(assignment_methods, confidences,
                                    e_values, rtax_modes,
                                    uclust_min_consensus_fractions,
                                    uclust_similarities, uclust_max_accepts):
    """Raise WorkflowError unless every requested method is runnable.

    Checks that each entry of assignment_methods is a supported method
    name and that the execution parameters that method requires were
    supplied (i.e. are not None).
    """
    for method in assignment_methods:
        if method in ('rdp', 'mothur'):
            if confidences is None:
                raise WorkflowError("You must specify at least one "
                                    "confidence level.")
        elif method == 'blast':
            if e_values is None:
                raise WorkflowError("You must specify at least one "
                                    "E-value.")
        elif method == 'rtax':
            if rtax_modes is None:
                raise WorkflowError("You must specify at least one mode "
                                    "to run RTAX in.")
            for mode in rtax_modes:
                if mode not in ['single', 'paired']:
                    raise WorkflowError("Invalid rtax mode '%s'. Must be "
                                        "'single' or 'paired'." % mode)
        elif method == 'uclust':
            if uclust_min_consensus_fractions is None:
                raise WorkflowError("You must specify at least one uclust "
                                    "minimum consensus fraction.")
            if uclust_similarities is None:
                raise WorkflowError("You must specify at least one uclust "
                                    "similarity.")
            if uclust_max_accepts is None:
                raise WorkflowError("You must specify at least one uclust "
                                    "max accepts.")
        else:
            raise WorkflowError("Unrecognized or unsupported taxonomy "
                                "assignment method '%s'." % method)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=None,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        suppress_index_page=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout,
                                        minimum_failure_threshold=100000):
    """ Run the subsampled open-reference OTU picking workflow

        The steps performed by this function are:
          1. Pick reference OTUs against refseqs_fp (optionally preceded by
             a prefilter pass against prefilter_refseqs_fp). Skipped when
             both step1_otu_map_fp and step1_failures_fasta_fp are given.
          2. Subsample the step 1 failures (percent_subsample), pick OTUs
             de novo on the subsample and pick their representative set.
          3. Pick reference OTUs on all step 1 failures using the step 2
             representative set as the reference set.
          4. Unless suppress_step4, pick OTUs de novo on the remaining
             failures (step 3 failures, or step 1 failures when steps 2
             and 3 did not run).

        Steps 2 and 3 only run when the number of sequences in the step 1
        failures fasta file is greater than minimum_failure_threshold.

        Afterwards the OTU maps are merged, OTUs with fewer than
        min_otu_size sequences are filtered, the final representative set
        and a new reference set (input references plus new representative
        sequences) are written, the OTU table is built, and optionally
        taxonomy is assigned (run_assign_tax) and sequences are aligned
        and a tree is built (run_align_and_tree). Unless
        suppress_index_page, an index.html is generated in output_dir.

        command_handler is called as
         command_handler(commands, status_update_callback, logger=logger,
                         close_logger_on_success=False).

        If logger is None a WorkflowLogger is created in output_dir and
         closed at the end; a logger passed in by the caller is left open.

        NOTE: when reference_otu_picking_method is 'sortmerna' and steps 2
         and 3 run, the 'sortmerna_db' entry is removed from
         params['pick_otus'] in place, so the caller's params changes too.

        Raises AssertionError for an unknown OTU picking method and
         ValueError if the prefilter discards every input sequence.
    """
    # for now only allowing uclust/usearch/sortmerna+sumaclust for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61', 'sumaclust']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref',
                                             'sortmerna']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    index_links = []
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        log_fp = generate_log_fp(output_dir)
        logger = WorkflowLogger(log_fp,
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
        index_links.append(
            ('Run summary data',
             log_fp,
             _index_headers['run_summary']))
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not already
    # complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        # With pre-existing step 1 results there is no failures *list*
        # file known to this function; record that explicitly so the
        # suppress_step4 branch below does not hit an undefined name.
        step1_failures_list_fp = None
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append(
                [('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append(
                [('Filter prefilter failures from input', filter_fasta_cmd)])
            index_links.append(
                ('Pre-filtered sequence identifiers '
                 '(failed to hit reference at %1.1f%% identity)'
                 % (float(prefilter_percent_id)*100),
                 prefilter_failures_list_fp,
                 _index_headers['sequences']))

            # Call the command handler on the list of commands
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # From here on the prefiltered file is the workflow input.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            if getsize(prefiltered_input_fp) == 0:
                raise ValueError(
                    "All sequences were discarded by the prefilter. "
                    "Are the input sequences in the same orientation "
                    "in your input file and reference file (you can "
                    "add 'pick_otus:enable_rev_strand_match True' to "
                    "your parameters file if not)? Are you using the "
                    "correct reference file?")

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # count number of sequences in step 1 failures fasta file
    with open(abspath(step1_failures_fasta_fp), 'U') as step1_failures_fasta_f:
        num_failure_seqs, mean, std = count_seqs_from_file(
            step1_failures_fasta_f)

    # if the number of failures sequences is greater than the threshold,
    # continue to step 2, 3 and 4
    run_step_2_and_3 = num_failure_seqs > minimum_failure_threshold

    if run_step_2_and_3:
        # Subsample the failures fasta file to retain (roughly) the
        # percent_subsample
        step2_dir = '%s/step2_otus/' % output_dir
        create_dir(step2_dir)
        step2_input_fasta_fp = \
            '%s/subsampled_failures.fasta' % step2_dir
        subsample_fasta(step1_failures_fasta_fp,
                        step2_input_fasta_fp,
                        percent_subsample)

        # The closing double quote of the logged shell command belongs
        # before the trailing newlines, not after them.
        logger.write('# Subsample the failures fasta file using API \n' +
                     'python -c "import qiime; qiime.util.subsample_fasta' +
                     '(\'%s\', \'%s\', \'%f\')"\n\n' %
                     (abspath(step1_failures_fasta_fp),
                      abspath(step2_input_fasta_fp),
                      percent_subsample))

        # Prep the OTU picking command for the subsampled failures
        step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                     step2_dir,
                                     new_ref_set_id,
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

        commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

        # Prep the rep set picking command for the subsampled failures
        step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
        step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step2_rep_set_cmd)])

        step3_dir = '%s/step3_otus/' % output_dir
        step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
        step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir

        # remove the indexed reference database from the dictionary of
        # parameters as it must be forced to build a new database
        # using the step2_repset_fasta_fp
        # NOTE(review): this deletes from the caller's params in place, so
        # every later step (and the caller) sees the change; kept as-is
        # because callers may depend on it.
        if reference_otu_picking_method == 'sortmerna':
            # .get() so a params dict without a 'pick_otus' section does
            # not raise KeyError here.
            if 'sortmerna_db' in params.get('pick_otus', {}):
                del params['pick_otus']['sortmerna_db']

        step3_cmd = pick_reference_otus(
            step1_failures_fasta_fp,
            step3_dir,
            reference_otu_picking_method,
            step2_repset_fasta_fp,
            parallel,
            params,
            logger)

        commands.append([
            ('Pick reference OTUs using de novo rep set', step3_cmd)])

    index_links.append(
        ('Final map of OTU identifier to sequence identifers (i.e., "OTU map")',
         merged_otu_map_fp,
         _index_headers['otu_maps']))

    if not suppress_step4:
        step4_dir = '%s/step4_otus/' % output_dir
        if run_step_2_and_3:
            step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
            step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
                (step1_failures_fasta_fp,
                 step3_failures_list_fp, step3_failures_fasta_fp)
            commands.append([('Create fasta file of step3 failures',
                              step3_filter_fasta_cmd)])

            failures_fp = step3_failures_fasta_fp
            failures_otus_fp = 'failures_failures_otus.txt'
            failures_step = 'step3'
        else:
            # Steps 2 and 3 did not run: cluster the step 1 failures
            # directly and leave the step 3 slot of the merge empty.
            failures_fp = step1_failures_fasta_fp
            failures_otus_fp = 'failures_otus.txt'
            failures_step = 'step1'
            step3_otu_map_fp = ""

        step4_cmd = pick_denovo_otus(failures_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)

        step4_otu_map_fp = '%s/%s' % (step4_dir, failures_otus_fp)
        commands.append([('Pick de novo OTUs on %s failures' % failures_step,
                          step4_cmd)])

        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that
        # was previously created
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp,
             step4_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp, failures_fp)
        commands.append(
            [('Pick representative set for subsampled failures',
              step4_rep_set_cmd)])
    else:
        # Merge the otu maps, note that we are explicitly using the '>'
        # operator otherwise passing the --force flag on the script
        # interface would append the newly created maps to the map that
        # was previously created
        if run_step_2_and_3:
            failures_fp = step3_failures_list_fp
        else:
            failures_fp = step1_failures_list_fp
            step3_otu_map_fp = ""

        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        # Move the final failures file to the top-level directory
        if failures_fp is not None:
            commands.append(
                [('Move final failures file to top-level directory',
                  'mv %s %s/final_failures.txt' % (failures_fp, output_dir))])
        else:
            # Pre-existing step 1 results were supplied and steps 2/3 did
            # not run, so there is no failures list file to move.
            logger.write("No failures list file is available to move to "
                         "%s/final_failures.txt (pre-existing step 1 "
                         "results were used).\n\n" % output_dir)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)

    otus_to_keep = filter_otus_from_otu_map(
        otu_fp,
        otu_no_singletons_fp,
        min_otu_size)

    index_links.append(
        ('Final map of OTU identifier to sequence identifers excluding '
         'OTUs with fewer than %d sequences' % min_otu_size,
         otu_no_singletons_fp,
         _index_headers['otu_maps']))

    logger.write('# Filter singletons from the otu map using API \n' +
                 'python -c "import qiime; qiime.filter.filter_otus_from_otu_map' +
                 '(\'%s\', \'%s\', \'%d\')"\n\n' % (abspath(otu_fp),
                                                    abspath(
                                                        otu_no_singletons_fp),
                                                    min_otu_size))

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    index_links.append(
        ('OTU representative sequences',
         final_repset_fp,
         _index_headers['sequences']))
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    index_links.append(
        ('New reference sequences (i.e., OTU representative sequences plus input '
         'reference sequences)',
         new_refseqs_fp,
         _index_headers['sequences']))

    # All file handles below are managed by `with` so they are closed even
    # if parsing or writing fails part-way through.
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to
        # the final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in parse_fasta(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
        logger.write('# Write non-singleton otus representative sequences ' +
                     'from step1 to the final rep set file: %s\n\n'
                     % final_repset_fp)

        # copy the full input refseqs file to the new refseqs_fp
        copyfile(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # make sure appended records start on a fresh line
            new_refseqs_f.write('\n')
            logger.write(
                '# Copy the full input refseqs file to the new refseq file\n' +
                'cp %s %s\n\n' % (refseqs_fp, new_refseqs_fp))

            # iterate over all representative sequences from step2 and
            # step4 and write those corresponding to non-singleton otus to
            # the final representative set file and the new reference
            # sequences file.
            denovo_repset_fps = []
            if run_step_2_and_3:
                denovo_repset_fps.append(step2_repset_fasta_fp)
            if not suppress_step4:
                denovo_repset_fps.append(step4_repset_fasta_fp)
            for denovo_repset_fp in denovo_repset_fps:
                with open(denovo_repset_fp, 'U') as denovo_repset_f:
                    for otu_id, seq in parse_fasta(denovo_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    # steps 1-4 executed
    if run_step_2_and_3:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 2 and step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))
    # only steps 1 and 4 executed
    else:
        logger.write('# Write non-singleton otus representative sequences from ' +
                     'step 4 to the final representative set and the new reference' +
                     ' set (%s and %s respectively)\n\n' % (final_repset_fp, new_refseqs_fp))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)

    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])
    index_links.append(
        ('OTU table exluding OTUs with fewer than %d sequences' % min_otu_size,
         otu_table_fp,
         _index_headers['otu_tables']))

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)

        align_and_tree_input_otu_table = otu_table_w_tax_fp
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                  min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST and including OTU taxonomy assignments' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and including OTU '
             'taxonomy assignments' % min_otu_size,
             otu_table_w_tax_fp,
             _index_headers['otu_tables']))

    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)
        index_links.append(
            ('OTU table exluding OTUs with fewer than %d sequences and sequences that '
             'fail to align with PyNAST' % min_otu_size,
             pynast_failure_filtered_otu_table_fp,
             _index_headers['otu_tables']))

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write(
                "Final output file exists (%s). Will not rebuild." %
                otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            index_links.append(
                ('OTU taxonomic assignments',
                 taxonomy_fp,
                 _index_headers['taxa_assignments']))

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        rep_set_tree_fp = join(output_dir, 'rep_set.tre')
        index_links.append(
            ('OTU phylogenetic tree',
             rep_set_tree_fp,
             _index_headers['trees']))
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            table = load_table(align_and_tree_input_otu_table)
            with open(pynast_failures_fp, 'U') as pynast_failures_f:
                filtered_otu_table = filter_otus_from_otu_table(
                    table,
                    get_seq_ids_from_fasta_file(pynast_failures_f),
                    0, inf, 0, inf, negate_ids_to_keep=True)
            write_biom_table(filtered_otu_table,
                             pynast_failure_filtered_otu_table_fp)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()

    if not suppress_index_page:
        index_fp = '%s/index.html' % output_dir
        generate_index_page(index_links, index_fp)
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """ Run the subsampled open-reference OTU picking workflow

        The steps performed by this function are:
          1. Pick reference OTUs against refseqs_fp (optionally preceded by
             a prefilter pass against prefilter_refseqs_fp). Skipped when
             both step1_otu_map_fp and step1_failures_fasta_fp are given.
          2. Subsample the step 1 failures (percent_subsample), pick OTUs
             de novo on the subsample and pick their representative set.
          3. Pick reference OTUs on all step 1 failures using the step 2
             representative set as the reference set.
          4. Unless suppress_step4, pick OTUs de novo on the step 3
             failures.

        Afterwards the OTU maps are merged, OTUs with fewer than
        min_otu_size sequences are filtered, the final representative set
        and a new reference set (input references plus new representative
        sequences) are written, the OTU table is built, and optionally
        taxonomy is assigned (run_assign_tax) and sequences are aligned
        and a tree is built (run_align_and_tree).

        command_handler is called as
         command_handler(commands, status_update_callback, logger=logger,
                         close_logger_on_success=False).

        If logger is None a WorkflowLogger is created in output_dir and
         closed at the end; a logger passed in by the caller is left open.

        Raises AssertionError for an unknown OTU picking method.
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))

    assert reference_otu_picking_method in allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # if the user has not passed a different reference collection for the
    # pre-filter, use the main refseqs_fp. this is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: Closed-reference OTU picking on the input file (if not
    # already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
                (prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
                (prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
                (input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # From here on the prefiltered file is the workflow input.
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        # Build the OTU picking command
        step1_dir = \
            '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
            '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
            (step1_dir, input_basename)
        step1_failures_fasta_fp = \
            '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (input_fp, step1_failures_list_fp, step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
        '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Subsample the failures fasta file to retain (roughly) the
    # percent_subsample
    step2_input_fasta_fp = \
        '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    # Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
        (step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp,
        step3_dir,
        reference_otu_picking_method,
        step2_repset_fasta_fp,
        parallel,
        params,
        logger)

    commands.append([
        ('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
            (step1_failures_fasta_fp, step3_failures_list_fp,
             step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps. '>' (truncate) rather than '>>' (append) is
        # used on purpose: when the workflow is re-run into an existing
        # output directory, appending would add the new maps to the map
        # left over from the previous run.
        cat_otu_tables_cmd = 'cat %s %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp,
             merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
            (step4_otu_map_fp, step4_repset_fasta_fp,
             step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps ('>' rather than '>>' so that a re-run does
        # not append to a map left over from a previous run)
        cat_otu_tables_cmd = 'cat %s %s > %s' %\
            (step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    # make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs.
    # this is clunky. first, we need to do this without singletons to match
    # the otu map without singletons. next, there is a difference in what
    # we need the reference set to be and what we need the repseqs to be.
    # the reference set needs to be a superset of the input reference set
    # to this set. the repset needs to be only the sequences that were observed
    # in this data set, and we want reps for the step1 reference otus to be
    # reads from this run so we don't hit issues building a tree using
    # sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir

    # All file handles below are managed by `with` so they are closed even
    # if parsing or writing fails part-way through.
    with open(final_repset_fp, 'w') as final_repset_f:
        # write non-singleton otus representative sequences from step1 to
        # the final rep set file
        with open(step1_repset_fasta_fp, 'U') as step1_repset_f:
            for otu_id, seq in MinimalFastaParser(step1_repset_f):
                if otu_id.split()[0] in otus_to_keep:
                    final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

        # copy the full input refseqs file to the new refseqs_fp
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            # make sure appended records start on a fresh line
            new_refseqs_f.write('\n')

            # iterate over all representative sequences from step2 and
            # step4 and write those corresponding to non-singleton otus to
            # the final representative set file and the new reference
            # sequences file.
            denovo_repset_fps = [step2_repset_fasta_fp]
            if not suppress_step4:
                denovo_repset_fps.append(step4_repset_fasta_fp)
            for denovo_repset_fp in denovo_repset_fps:
                with open(denovo_repset_fp, 'U') as denovo_repset_f:
                    for otu_id, seq in MinimalFastaParser(denovo_repset_f):
                        if otu_id.split()[0] in otus_to_keep:
                            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
        (otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    # initialize output file names - these differ based on what combination of
    # taxonomy assignment and alignment/tree building is happening.
    if run_assign_tax and run_align_and_tree:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
        align_and_tree_input_otu_table = otu_table_w_tax_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,
                                                                  min_otu_size)
    elif run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = \
            '%s/otu_table_mc%d_w_tax.biom' % (output_dir, min_otu_size)
    elif run_align_and_tree:
        align_and_tree_input_otu_table = otu_table_fp
        pynast_failure_filtered_otu_table_fp = \
            '%s/otu_table_mc%d_no_pynast_failures.biom' % (output_dir,
                                                           min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'add_metadata.py -i %s --observation_mapping_fp %s -o %s --sc_separated taxonomy --observation_header OTUID,taxonomy' %\
                (tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and\
           getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures
            with open(align_and_tree_input_otu_table, 'U') as otu_table_in_f:
                with open(pynast_failures_fp, 'U') as pynast_failures_f:
                    filtered_otu_table = filter_otus_from_otu_table(
                        parse_biom_table(otu_table_in_f),
                        get_seq_ids_from_fasta_file(pynast_failures_f),
                        0, inf, 0, inf, negate_ids_to_keep=True)
            with open(pynast_failure_filtered_otu_table_fp, 'w') as otu_table_f:
                otu_table_f.write(format_biom_table(filtered_otu_table))

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
def run_pick_de_novo_otus(input_fp,
                          output_dir,
                          command_handler,
                          params,
                          qiime_config,
                          parallel=False,
                          logger=None,
                          suppress_md5=False,
                          status_update_callback=print_to_stdout):
    """Build the de novo OTU picking workflow and pass it to command_handler.

    The commands are queued in this order:
      1) Pick OTUs (for parallel uclust_ref runs that do not set
         'suppress_new_clusters': also cluster the failures de novo and
         merge the two OTU maps);
      2) Pick a representative set;
      3) Assign taxonomy;
      4) Build an OTU table (plus a reference-only OTU table when the
         failures were clustered);
      5) Align the representative set;
      6) Filter the alignment prior to tree building;
      7) Build a phylogenetic tree.

    Parameters:
      input_fp: path of the input sequence file.
      output_dir: directory that receives all results; created here.
      command_handler: callable invoked once with the full command list.
      params: mapping of script name -> dict of that script's options
        (e.g. params['pick_otus']['otu_picking_method']).
      qiime_config: mapping that must contain 'python_exe_fp'.
      parallel: when true, the parallel_* scripts are used for the
        methods that have one.
      logger: existing logger to use; when None a WorkflowLogger is
        created and command_handler is asked to close it on success.
      suppress_md5: when true, the input file md5 is not logged.
      status_update_callback: forwarded to command_handler.

    Returns:
      (tree_fp, otu_table_fp) as absolute paths.
    """
    input_basename = splitext(split(input_fp)[1])[0]
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    cluster_failures = False

    def section_params_str(section):
        # Formatted options of one params section, '' when it is absent.
        try:
            return get_params_str(params[section])
        except KeyError:
            return ''

    def section_without(section, key):
        # Copy of one params section ({} when absent) with `key` dropped.
        try:
            section_params = params[section].copy()
        except KeyError:
            section_params = {}
        section_params.pop(key, None)
        return section_params

    def script_call(script_name):
        # '<python> <script_dir>/<script_name>' prefix shared by all calls.
        return '%s %s/%s' % (python_exe_fp, script_dir, script_name)

    close_logger_on_success = logger is None
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    if not suppress_md5:
        log_input_md5s(logger, [input_fp])

    # ---- OTU picking ----
    try:
        otu_picking_method = params['pick_otus']['otu_picking_method']
    except KeyError:
        otu_picking_method = 'uclust'
    pick_otu_dir = '%s/%s_picked_otus' % (output_dir, otu_picking_method)
    otu_fp = '%s/%s_otus.txt' % (pick_otu_dir, input_basename)

    if parallel and otu_picking_method in ('blast', 'uclust_ref'):
        params_str = section_params_str('parallel')
        # The parallel script is method-specific, so it does not take an
        # --otu_picking_method option.
        pick_otus_params = section_without('pick_otus', 'otu_picking_method')
        if otu_picking_method == 'uclust_ref':
            if 'suppress_new_clusters' in pick_otus_params:
                del pick_otus_params['suppress_new_clusters']
            else:
                # New clusters are wanted: the failures get clustered
                # de novo further down.
                cluster_failures = True
        params_str += ' %s' % get_params_str(pick_otus_params)
        pick_otus_cmd = '%s -i %s -o %s -T %s' % (
            script_call('parallel_pick_otus_%s.py' % otu_picking_method),
            input_fp, pick_otu_dir, params_str)
    else:
        pick_otus_cmd = '%s -i %s -o %s %s' % (
            script_call('pick_otus.py'),
            input_fp, pick_otu_dir, section_params_str('pick_otus'))
    commands.append([('Pick OTUs', pick_otus_cmd)])

    if cluster_failures:
        reference_otu_fp = otu_fp
        clustered_failures_dir = '%s/failure_otus/' % pick_otu_dir

        failure_params = section_without('pick_otus', 'otu_picking_method')
        failure_params.setdefault('uclust_otu_id_prefix', 'DeNovoOTU')
        params_str = ' %s' % get_params_str(failure_params)

        failures_list_fp = '%s/%s_failures.txt' % (
            pick_otu_dir, input_basename)
        failures_fasta_fp = '%s/%s_failures.fasta' % (
            pick_otu_dir, input_basename)
        filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' % (
            input_fp, failures_list_fp, failures_fasta_fp)
        commands.append([('Generate failures fasta file', filter_fasta_cmd)])

        # The failures are always clustered with uclust.
        failure_otu_fp = '%s/%s_failures_otus.txt' % (
            clustered_failures_dir, input_basename)
        pick_otus_cmd = '%s -i %s -o %s -m %s %s' % (
            script_call('pick_otus.py'), failures_fasta_fp,
            clustered_failures_dir, 'uclust', params_str)
        commands.append([('Pick de novo OTUs for new clusters',
                          pick_otus_cmd)])

        merged_otu_map_fp = '%s/merged_otu_map.txt' % clustered_failures_dir
        # '>' rather than '>>': appending would duplicate every OTU in the
        # merged map if a previous run already wrote this file.
        cat_otu_tables_cmd = 'cat %s %s > %s' % (
            reference_otu_fp, failure_otu_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        otu_fp = merged_otu_map_fp

    # ---- Representative set ----
    rep_set_dir = '%s/rep_set/' % output_dir
    create_dir(rep_set_dir)
    rep_set_fp = '%s/%s_rep_set.fasta' % (rep_set_dir, input_basename)
    rep_set_log_fp = '%s/%s_rep_set.log' % (rep_set_dir, input_basename)
    pick_rep_set_cmd = '%s -i %s -f %s -l %s -o %s %s' % (
        script_call('pick_rep_set.py'), otu_fp, input_fp, rep_set_log_fp,
        rep_set_fp, section_params_str('pick_rep_set'))
    commands.append([('Pick representative set', pick_rep_set_cmd)])

    # ---- Taxonomy assignment ----
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'uclust'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' % (
        output_dir, assignment_method)
    taxonomy_fp = '%s/%s_rep_set_tax_assignments.txt' % (
        assign_taxonomy_dir, input_basename)

    if parallel and assignment_method in ('rdp', 'blast', 'uclust'):
        params_str = section_params_str('parallel')
        try:
            # The parallel script is method-specific, so it does not take
            # an --assignment_method option.
            taxonomy_params = params['assign_taxonomy'].copy()
            taxonomy_params.pop('assignment_method', None)
            params_str += ' %s' % get_params_str(taxonomy_params)
        except KeyError:
            # No assign_taxonomy section: only the parallel options apply.
            pass
        assign_taxonomy_cmd = '%s -i %s -o %s -T %s' % (
            script_call('parallel_assign_taxonomy_%s.py' % assignment_method),
            rep_set_fp, assign_taxonomy_dir, params_str)
    else:
        assign_taxonomy_cmd = '%s -o %s -i %s %s' % (
            script_call('assign_taxonomy.py'), assign_taxonomy_dir,
            rep_set_fp, section_params_str('assign_taxonomy'))
    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    # ---- OTU table ----
    otu_table_fp = '%s/otu_table.biom' % output_dir
    params_str = section_params_str('make_otu_table')
    make_otu_table_cmd = '%s -i %s -t %s -o %s %s' % (
        script_call('make_otu_table.py'), otu_fp, taxonomy_fp,
        otu_table_fp, params_str)
    commands.append([('Make OTU table', make_otu_table_cmd)])

    if cluster_failures:
        # Second table built from the reference OTU map only, with the
        # same make_otu_table options.
        reference_otu_table_fp = \
            '%s/reference_only_otu_table.biom' % output_dir
        make_otu_table_cmd = '%s -i %s -t %s -o %s %s' % (
            script_call('make_otu_table.py'), reference_otu_fp, taxonomy_fp,
            reference_otu_table_fp, params_str)
        commands.append([('Make reference-only OTU table',
                          make_otu_table_cmd)])

    # ---- Alignment ----
    try:
        alignment_method = params['align_seqs']['alignment_method']
    except KeyError:
        alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_rep_set_aligned.fasta' % (pynast_dir, input_basename)

    if parallel and alignment_method == 'pynast':
        params_str = section_params_str('parallel')
        # The parallel script is method-specific, so it does not take an
        # --alignment_method option.
        align_params = section_without('align_seqs', 'alignment_method')
        params_str += ' %s' % get_params_str(align_params)
        align_seqs_cmd = '%s -i %s -o %s -T %s' % (
            script_call('parallel_align_seqs_pynast.py'),
            rep_set_fp, pynast_dir, params_str)
    else:
        align_seqs_cmd = '%s -i %s -o %s %s' % (
            script_call('align_seqs.py'), rep_set_fp, pynast_dir,
            section_params_str('align_seqs'))
    commands.append([('Align sequences', align_seqs_cmd)])

    # ---- Alignment filtering ----
    filtered_aln_fp = '%s/%s_rep_set_aligned_pfiltered.fasta' % (
        pynast_dir, input_basename)
    filter_alignment_cmd = '%s -o %s -i %s %s' % (
        script_call('filter_alignment.py'), pynast_dir, aln_fp,
        section_params_str('filter_alignment'))
    commands.append([('Filter alignment', filter_alignment_cmd)])

    # ---- Tree building ----
    tree_fp = '%s/rep_set.tre' % output_dir
    make_phylogeny_cmd = '%s -i %s -o %s %s' % (
        script_call('make_phylogeny.py'), filtered_aln_fp, tree_fp,
        section_params_str('make_phylogeny'))
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)

    return abspath(tree_fp), abspath(otu_table_fp)
def pick_nested_reference_otus(input_fasta_fp,
                               input_tree_fp,
                               output_dir,
                               run_id,
                               similarity_thresholds,
                               command_handler,
                               status_update_callback=print_to_stdout):
    """Pick nested reference OTUs at successively lower thresholds.

    Thresholds are processed from highest to lowest. For each one, OTUs
    are picked with uclust on the current input sequences, a
    representative set is picked and rewritten through rename_rep_seqs,
    and (when a tree was supplied) the current tree is filtered to that
    representative set. The representative set and tree produced for one
    threshold are the inputs for the next threshold.

    Parameters:
      input_fasta_fp: sequences used for the first (highest) threshold.
      input_tree_fp: tree to filter alongside the sequences; a false
        value (None or '') disables tree filtering.
      output_dir: receives the 'otus', 'rep_set' and (optionally) 'trees'
        sub-directories, all created here.
      run_id: identifier embedded in the rep set and tree file names.
      similarity_thresholds: iterable of percent similarities (e.g. 97);
        it is not modified.
      command_handler: callable used to run each batch of commands.
      status_update_callback: forwarded to command_handler.
    """
    create_dir(output_dir)
    otu_dir = join(output_dir, 'otus')
    create_dir(otu_dir)
    rep_set_dir = join(output_dir, 'rep_set')
    create_dir(rep_set_dir)
    # taxonomies are currently not processed, so no taxonomy directory
    if input_tree_fp:
        tree_dir = join(output_dir, 'trees')
        create_dir(tree_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    current_inseqs_fp = input_fasta_fp
    current_tree_fp = input_tree_fp

    # sorted() instead of sort()/reverse() so the caller's list is left
    # untouched.
    for similarity_threshold in sorted(similarity_thresholds, reverse=True):
        current_inseqs_basename = splitext(split(current_inseqs_fp)[1])[0]

        # final names, and the names pick_otus.py derives from its input
        otu_fp = '%s/%d_otu_map.txt' % (otu_dir, similarity_threshold)
        clusters_fp = '%s/%d_clusters.uc' % (otu_dir, similarity_threshold)
        temp_otu_fp = '%s/%s_otus.txt' % (otu_dir, current_inseqs_basename)
        temp_log_fp = '%s/%s_otus.log' % (otu_dir, current_inseqs_basename)
        temp_clusters_fp = '%s/%s_clusters.uc' % (
            otu_dir, current_inseqs_basename)

        # Divide by 100.0: with integer thresholds a plain '/ 100' is
        # integer division under Python 2 and would always pass '-s 0.00'.
        pick_otus_cmd = \
            'pick_otus.py -m uclust -DBz -i %s -s %1.2f -o %s' % (
                current_inseqs_fp, similarity_threshold / 100.0, otu_dir)

        temp_rep_set_fp = get_tmp_filename(prefix='NestedReference',
                                           suffix='.fasta')
        pick_rep_set_cmd = \
            'pick_rep_set.py -m first -i %s -o %s -f %s' % (
                otu_fp, temp_rep_set_fp, current_inseqs_fp)

        commands = [
            [('Pick OTUs (%d)' % similarity_threshold, pick_otus_cmd)],
            [('Rename OTU file (%d)' % similarity_threshold,
              'mv %s %s' % (temp_otu_fp, otu_fp))],
            [('Rename uc file (%d)' % similarity_threshold,
              'mv %s %s' % (temp_clusters_fp, clusters_fp))],
            [('Pick Rep Set (%d)' % similarity_threshold,
              pick_rep_set_cmd)],
        ]
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)

        # rename representative sequences
        rep_set_fp = '%s/%d_otus_%s.fasta' % (
            rep_set_dir, similarity_threshold, run_id)
        logger.write('Renaming OTU representative sequences so OTU ids are reference sequence ids.')
        # rename_rep_seqs is consumed while both files are still open.
        with open(rep_set_fp, 'w') as rep_set_f:
            with open(temp_rep_set_fp, 'U') as temp_rep_set_f:
                for e in rename_rep_seqs(temp_rep_set_f):
                    rep_set_f.write('>%s\n%s\n' % e)

        # filter the tree, if provided (same truthiness test that guarded
        # the creation of tree_dir above)
        if current_tree_fp:
            tree_fp = '%s/%d_otus_%s.tre' % (
                tree_dir, similarity_threshold, run_id)
            tree_cmd = 'filter_tree.py -i %s -f %s -o %s' % (
                current_tree_fp, rep_set_fp, tree_fp)
            command_handler(
                [[('Filter tree (%d)' % similarity_threshold, tree_cmd)]],
                status_update_callback,
                logger,
                close_logger_on_success=False)
            # prep for the next iteration
            current_tree_fp = tree_fp

        # clean up this iteration's temporary files and prep for the next
        remove_files([temp_log_fp, temp_rep_set_fp])
        current_inseqs_fp = rep_set_fp

    logger.close()
def main():
    """Build and run (or print) split_libraries_fastq commands.

    Walks opts.input_dir for fastq files (and, for the
    'mapping_barcode_files' demultiplexing method, for mapping files with
    one of the user-supplied extensions), builds the commands with
    create_commands_slf and passes them to print_commands when
    --print_only is set, otherwise to call_commands_serially.
    """
    option_parser, opts, args = \
        parse_command_line_parameters(suppress_verbose=True, **script_info)

    input_dir = opts.input_dir
    demultiplexing_method = opts.demultiplexing_method
    read_indicator = opts.read_indicator
    barcode_indicator = opts.barcode_indicator
    mapping_indicator = opts.mapping_indicator
    mapping_extensions = opts.mapping_extensions.split(',')
    sampleid_indicator = opts.sampleid_indicator
    leading_text = opts.leading_text
    trailing_text = opts.trailing_text
    include_input_dir_path = opts.include_input_dir_path
    output_dir = abspath(opts.output_dir)
    remove_filepath_in_name = opts.remove_filepath_in_name
    print_only = opts.print_only

    if remove_filepath_in_name and not include_input_dir_path:
        option_parser.error("If --remove_filepath_in_name enabled, "
                            "--include_input_dir_path must be enabled.")

    if opts.parameter_fp:
        with open(opts.parameter_fp, 'U') as parameter_f:
            params_dict = parse_qiime_parameters(parameter_f)
        try:
            params_str = get_params_str(params_dict['split_libraries_fastq'])
        except KeyError:
            # Parameter file without a split_libraries_fastq section.
            params_str = ""
    else:
        params_dict = {}
        params_str = ""

    create_dir(output_dir)

    def find_files(suffixes):
        # Absolute paths of all files under input_dir whose name ends
        # with any of the suffixes; each file is listed once even when
        # several suffixes match it.
        suffixes = tuple(suffixes)
        found = []
        for root, _subdirs, filenames in walk(input_dir):
            for filename in filenames:
                if filename.endswith(suffixes):
                    found.append(abspath(join(root, filename)))
        return found

    all_fastq = find_files(['.fastq.gz', '.fastq', '.fq.gz', '.fq'])

    if demultiplexing_method == 'mapping_barcode_files':
        all_mapping = find_files(mapping_extensions)
        all_files = get_matching_files(all_fastq, all_mapping,
                                       read_indicator, barcode_indicator,
                                       mapping_indicator)
    else:
        all_files = all_fastq

    commands = create_commands_slf(all_files, demultiplexing_method,
                                   output_dir, params_str, leading_text,
                                   trailing_text, include_input_dir_path,
                                   remove_filepath_in_name,
                                   sampleid_indicator)

    qiime_config = load_qiime_config()
    if print_only:
        command_handler = print_commands
    else:
        command_handler = call_commands_serially
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params_dict,
                            qiime_config=qiime_config)

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback=no_status_updates,
                    logger=logger,
                    close_logger_on_success=True)
def pick_nested_reference_otus(input_fasta_fp,
                               input_tree_fp,
                               output_dir,
                               run_id,
                               similarity_thresholds,
                               command_handler,
                               status_update_callback=print_to_stdout):
    """Pick nested reference OTUs at successively lower thresholds.

    Thresholds are handled in descending order. Each round picks OTUs
    with uclust on the current input sequences, picks a representative
    set, rewrites it through rename_rep_seqs and, if a tree was given,
    filters the current tree down to that representative set. The outputs
    of one round are the inputs of the next.

    Parameters:
      input_fasta_fp: sequences used for the highest threshold.
      input_tree_fp: tree filtered alongside the sequences; None or ''
        turns tree filtering off.
      output_dir: parent of the 'otus', 'rep_set' and (optional) 'trees'
        directories, which are created here.
      run_id: identifier placed in the rep set and tree file names.
      similarity_thresholds: iterable of percent similarities (e.g. 97);
        left unmodified.
      command_handler: callable that executes each batch of commands.
      status_update_callback: forwarded to command_handler.
    """
    # Prepare the output directories
    create_dir(output_dir)
    otu_dir = join(output_dir, 'otus')
    create_dir(otu_dir)
    rep_set_dir = join(output_dir, 'rep_set')
    create_dir(rep_set_dir)
    # currently not doing anything with taxonomies
    if input_tree_fp:
        tree_dir = join(output_dir, 'trees')
        create_dir(tree_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    current_inseqs_fp = input_fasta_fp
    current_tree_fp = input_tree_fp

    # Work on a sorted copy: sorting in place would reorder the list the
    # caller passed in.
    descending_thresholds = sorted(similarity_thresholds, reverse=True)

    for similarity_threshold in descending_thresholds:
        current_inseqs_basename = splitext(split(current_inseqs_fp)[1])[0]

        # pick otus command
        otu_fp = '%s/%d_otu_map.txt' % (otu_dir, similarity_threshold)
        clusters_fp = '%s/%d_clusters.uc' % (otu_dir, similarity_threshold)
        temp_otu_fp = '%s/%s_otus.txt' % (otu_dir, current_inseqs_basename)
        temp_log_fp = '%s/%s_otus.log' % (otu_dir, current_inseqs_basename)
        temp_clusters_fp = '%s/%s_clusters.uc' % (
            otu_dir, current_inseqs_basename)

        # Float divisor on purpose: '97 / 100' is 0 under Python 2
        # integer division, which would format as '-s 0.00'.
        pick_otus_cmd = \
            'pick_otus.py -m uclust -DBz -i %s -s %1.2f -o %s' % (
                current_inseqs_fp, similarity_threshold / 100.0, otu_dir)

        # rep set picking
        temp_rep_set_fp = get_tmp_filename(prefix='NestedReference',
                                           suffix='.fasta')
        pick_rep_set_cmd = \
            'pick_rep_set.py -m first -i %s -o %s -f %s' % (
                otu_fp, temp_rep_set_fp, current_inseqs_fp)

        commands = []
        commands.append([('Pick OTUs (%d)' % similarity_threshold,
                          pick_otus_cmd)])
        commands.append([('Rename OTU file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_otu_fp, otu_fp))])
        commands.append([('Rename uc file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_clusters_fp, clusters_fp))])
        commands.append([('Pick Rep Set (%d)' % similarity_threshold,
                          pick_rep_set_cmd)])
        command_handler(commands,
                        status_update_callback,
                        logger,
                        close_logger_on_success=False)

        # rename representative sequences
        rep_set_fp = '%s/%d_otus_%s.fasta' % (
            rep_set_dir, similarity_threshold, run_id)
        logger.write('Renaming OTU representative sequences so OTU ids are reference sequence ids.')
        # Both handles are closed even if renaming or writing fails; the
        # renamed records are written while the input is still open.
        with open(rep_set_fp, 'w') as rep_set_f:
            with open(temp_rep_set_fp, 'U') as temp_rep_set_f:
                for e in rename_rep_seqs(temp_rep_set_f):
                    rep_set_f.write('>%s\n%s\n' % e)

        # filter the tree, if provided; uses the same truthiness check
        # that decided whether tree_dir was created
        if current_tree_fp:
            tree_fp = '%s/%d_otus_%s.tre' % (
                tree_dir, similarity_threshold, run_id)
            tree_cmd = 'filter_tree.py -i %s -f %s -o %s' % (
                current_tree_fp, rep_set_fp, tree_fp)
            tree_commands = [
                [('Filter tree (%d)' % similarity_threshold, tree_cmd)]]
            command_handler(tree_commands,
                            status_update_callback,
                            logger,
                            close_logger_on_success=False)
            # prep for the next iteration
            current_tree_fp = tree_fp

        # prep for the next iteration
        remove_files([temp_log_fp, temp_rep_set_fp])
        current_inseqs_fp = rep_set_fp

    logger.close()
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_assign_tax=True,
                                        run_align_and_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        suppress_md5=False,
                                        denovo_otu_picking_method='uclust',
                                        reference_otu_picking_method='uclust_ref',
                                        status_update_callback=print_to_stdout):
    """Run the subsampled open-reference OTU picking workflow.

    The steps performed by this function are:
      - (optional) prefilter the input against prefilter_refseqs_fp;
      - step 1: pick reference OTUs against refseqs_fp, unless an
        existing step 1 OTU map and failures fasta are both supplied;
      - subsample the step 1 failures with percent_subsample;
      - step 2: pick OTUs de novo on the subsampled failures and pick
        their representative sequences;
      - step 3: pick reference OTUs on all step 1 failures, using the
        step 2 representative set as the reference;
      - step 4 (unless suppress_step4): pick OTUs de novo on the step 3
        failures;
      - merge the OTU maps, drop OTUs smaller than min_otu_size, write
        the final representative set and a new reference set, and build
        the OTU table;
      - optionally assign taxonomy and/or align and build a tree, and
        write an OTU table without the PyNAST failures.

    Parameters:
      input_fp, refseqs_fp: input sequences and reference sequences.
      output_dir: directory that receives all results; created here.
      percent_subsample: forwarded to subsample_fasta.
      new_ref_set_id: id forwarded to pick_denovo_otus for new OTUs
        (step 4 uses '<new_ref_set_id>.CleanUp').
      command_handler: callable used to run each batch of commands.
      params, qiime_config: forwarded to the helper functions and logger.
      prefilter_refseqs_fp: reference for the prefilter; defaults to
        refseqs_fp.
      prefilter_percent_id: prefilter identity; None disables the
        prefilter.
      logger: existing logger; when None a WorkflowLogger is created and
        closed at the end of this function.

    Raises:
      AssertionError: if either OTU picking method is not supported.
    """
    # for now only allowing uclust and usearch61 for otu picking
    allowed_denovo_otu_picking_methods = ['uclust', 'usearch61']
    allowed_reference_otu_picking_methods = ['uclust_ref', 'usearch61_ref']
    # NOTE(review): asserts are stripped under -O; they are kept so that
    # callers continue to see AssertionError.
    assert denovo_otu_picking_method in allowed_denovo_otu_picking_methods,\
        "Unknown de novo OTU picking method: %s. Known methods are: %s"\
        % (denovo_otu_picking_method,
           ','.join(allowed_denovo_otu_picking_methods))
    assert reference_otu_picking_method in \
        allowed_reference_otu_picking_methods,\
        "Unknown reference OTU picking method: %s. Known methods are: %s"\
        % (reference_otu_picking_method,
           ','.join(allowed_reference_otu_picking_methods))

    # Prepare some variables for the later steps
    input_basename, input_ext = splitext(split(input_fp)[1])
    create_dir(output_dir)
    commands = []

    close_logger_on_success = logger is None
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)

    if not suppress_md5:
        log_input_md5s(logger, [input_fp,
                                refseqs_fp,
                                step1_otu_map_fp,
                                step1_failures_fasta_fp])

    # If no separate reference collection was passed for the prefilter,
    # use the main refseqs_fp.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    # Step 1: closed-reference OTU picking on the input file (if not
    # already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_failures_list_fp = '%s/%s_failures.txt' % (
                prefilter_dir, input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(
                input_fp, prefilter_dir, reference_otu_picking_method,
                prefilter_refseqs_fp, parallel, params, logger,
                prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' % (
                prefilter_dir, input_basename, input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' % (
                input_fp, prefiltered_input_fp, prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            # everything downstream works on the prefiltered input
            input_fp = prefiltered_input_fp
            input_basename, input_ext = splitext(split(input_fp)[1])

        # Build the OTU picking command
        step1_dir = '%s/step1_otus' % output_dir
        step1_otu_map_fp = '%s/%s_otus.txt' % (step1_dir, input_basename)
        step1_pick_otu_cmd = pick_reference_otus(
            input_fp, step1_dir, reference_otu_picking_method,
            refseqs_fp, parallel, params, logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        # Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % (
            step1_dir, input_basename)
        step1_failures_fasta_fp = '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' % (
            input_fp, step1_failures_list_fp, step1_failures_fasta_fp)
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Run now: the failures fasta must exist before it is subsampled
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' % (
        step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    # Subsample the failures fasta file (done directly, not queued)
    step2_input_fasta_fp = '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)

    # Step 2: de novo OTUs and rep set for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir
    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' % (
        step2_otu_map_fp, step2_repset_fasta_fp, step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

    # Step 3: reference OTUs on all failures against the step 2 rep set
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
        step1_failures_fasta_fp, step3_dir, reference_otu_picking_method,
        step2_repset_fasta_fp, parallel, params, logger)
    commands.append([('Pick reference OTUs using de novo rep set',
                      step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    # The merge commands below use '>' rather than '>>': appending would
    # duplicate every OTU in the merged map if a previous run already
    # wrote this file.
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' % (
            step1_failures_fasta_fp, step3_failures_list_fp,
            step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])

        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s > %s' % (
            step1_otu_map_fp, step3_otu_map_fp, step4_otu_map_fp,
            merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])

        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' % (
            step4_otu_map_fp, step4_repset_fasta_fp, step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',
                          step4_rep_set_cmd)])
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s > %s' % (
            step1_otu_map_fp, step3_otu_map_fp, merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Without step 4 the step 3 failures are the final failures:
        # move that file to the top-level directory.
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' % (
                              step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (
        output_dir, min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(merged_otu_map_fp,
                                            otu_no_singletons_fp,
                                            min_otu_size)

    # Make the final representative seqs file and a new refseqs file that
    # could be used in subsequent otu picking runs. Both only get OTUs
    # that survived the size filter. The new reference set is the input
    # reference set plus the new (step 2 / step 4) representatives; the
    # final rep set holds the step 1, step 2 and step 4 representatives.
    final_repset_fp = '%s/rep_set.fna' % output_dir
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir

    def write_kept_rep_seqs(rep_set_fasta_fp, destinations):
        # Copy the records of retained OTUs from one rep set fasta file
        # into every open destination file.
        with open(rep_set_fasta_fp, 'U') as rep_set_f:
            for otu_id, seq in MinimalFastaParser(rep_set_f):
                if otu_id.split()[0] in otus_to_keep:
                    record = '>%s\n%s\n' % (otu_id, seq)
                    for destination in destinations:
                        destination.write(record)

    with open(final_repset_fp, 'w') as final_repset_f:
        # step 1 representatives go to the final rep set only
        write_kept_rep_seqs(step1_repset_fasta_fp, [final_repset_f])

        # copy the full input refseqs file to the new refseqs_fp, then
        # append the new representatives to it
        copy(refseqs_fp, new_refseqs_fp)
        with open(new_refseqs_fp, 'a') as new_refseqs_f:
            new_refseqs_f.write('\n')
            write_kept_rep_seqs(step2_repset_fasta_fp,
                                [new_refseqs_f, final_repset_f])
            if not suppress_step4:
                write_kept_rep_seqs(step4_repset_fasta_fp,
                                    [new_refseqs_f, final_repset_f])

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' % (
        otu_no_singletons_fp, otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    # initialize output file names - these differ based on what
    # combination of taxonomy assignment and alignment/tree building is
    # happening.
    if run_assign_tax:
        tax_input_otu_table_fp = otu_table_fp
        otu_table_w_tax_fp = '%s/otu_table_mc%d_w_tax.biom' % (
            output_dir, min_otu_size)
    if run_align_and_tree:
        if run_assign_tax:
            align_and_tree_input_otu_table = otu_table_w_tax_fp
            pynast_failure_filtered_otu_table_fp = \
                '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (
                    output_dir, min_otu_size)
        else:
            align_and_tree_input_otu_table = otu_table_fp
            pynast_failure_filtered_otu_table_fp = \
                '%s/otu_table_mc%d_no_pynast_failures.biom' % (
                    output_dir, min_otu_size)

    if run_assign_tax:
        if exists(otu_table_w_tax_fp) and getsize(otu_table_w_tax_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % otu_table_w_tax_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp], error_on_missing=False)

            taxonomy_fp = assign_tax(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_metadata_cmd = 'biom add-metadata -i %s --observation-metadata-fp %s -o %s --sc-separated taxonomy --observation-header OTUID,taxonomy' % (
                tax_input_otu_table_fp, taxonomy_fp, otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_metadata_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if run_align_and_tree:
        if exists(pynast_failure_filtered_otu_table_fp) and \
                getsize(pynast_failure_filtered_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild."
                         % pynast_failure_filtered_otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([pynast_failure_filtered_otu_table_fp],
                         error_on_missing=False)

            pynast_failures_fp = align_and_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Build OTU table without PyNAST failures. The table is fully
            # filtered and formatted while both inputs are still open,
            # and the output file is only opened once there is content
            # to write.
            with open(align_and_tree_input_otu_table, 'U') as otu_table_in_f:
                with open(pynast_failures_fp, 'U') as pynast_failures_f:
                    filtered_otu_table = filter_otus_from_otu_table(
                        parse_biom_table(otu_table_in_f),
                        get_seq_ids_from_fasta_file(pynast_failures_f),
                        0, inf, 0, inf, negate_ids_to_keep=True)
                    formatted_otu_table = \
                        format_biom_table(filtered_otu_table)
            with open(pynast_failure_filtered_otu_table_fp, 'w') as \
                    otu_table_f:
                otu_table_f.write(formatted_otu_table)

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    if close_logger_on_success:
        logger.close()
def generate_most_wanted_list(
        output_dir, otu_table_fps, rep_set_fp, gg_fp, nt_fp, mapping_fp,
        mapping_category, top_n, min_abundance, max_abundance,
        min_categories, num_categories_to_plot, max_gg_similarity,
        max_nt_similarity, e_value, word_size, merged_otu_table_fp,
        suppress_taxonomic_output, jobs_to_start, command_handler,
        status_update_callback, force):
    """Run the most wanted OTUs workflow and write the result files.

    The filtering commands built by _get_most_wanted_filtering_commands are
    executed through command_handler. The BLAST results they produce are
    then reduced to the top_n entries and written to output_dir as
    'most_wanted_otus.txt' (TSV), 'most_wanted_otus.fasta' and
    'most_wanted_otus.html'. Plots are written beneath output_dir/img by
    _format_top_n_results_table.

    Arguments:
        output_dir: directory to create and write all results into.
        otu_table_fps, rep_set_fp, gg_fp, nt_fp, mapping_fp,
        min_abundance, max_abundance, min_categories, max_gg_similarity,
        e_value, word_size, merged_otu_table_fp, jobs_to_start: forwarded
            unchanged to _get_most_wanted_filtering_commands.
        mapping_category: forwarded to both the filtering step and the
            results formatter.
        top_n: number of BLAST results to keep; also reported in the log.
        max_nt_similarity: forwarded to _get_top_n_blast_results.
        num_categories_to_plot, suppress_taxonomic_output: forwarded to
            _format_top_n_results_table.
        command_handler: callable invoked as
            command_handler(commands, status_update_callback, logger,
            close_logger_on_success=False).
        status_update_callback: passed through to command_handler.
        force: if true, an OSError raised while creating output_dir is
            ignored so an existing directory can be reused.

    Raises:
        WorkflowError: if output_dir cannot be created and force is false.
    """
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError(
                "Output directory '%s' already exists. Please "
                "choose a different directory, or force overwrite with -f."
                % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    commands, blast_results_fp, rep_set_cands_failures_fp, \
        master_otu_table_ms_fp = _get_most_wanted_filtering_commands(
            output_dir, otu_table_fps, rep_set_fp, gg_fp, nt_fp, mapping_fp,
            mapping_category, min_abundance, max_abundance, min_categories,
            max_gg_similarity, e_value, word_size, merged_otu_table_fp,
            jobs_to_start)

    # Execute the commands, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # We'll sort the BLAST results by percent identity (ascending) and pick
    # the top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)
    # NOTE(review): the input handles below are closed as soon as each helper
    # returns; this assumes the helpers consume the file fully before
    # returning - confirm against their definitions.
    with open(blast_results_fp, 'U') as blast_results_f:
        top_n_mw = _get_top_n_blast_results(blast_results_f, top_n,
                                            max_nt_similarity)

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to
    # include in our report.
    logger.write("Reading in filtered candidate sequences and latest filtered "
                 "and collapsed OTU table.\n\n")
    with open(rep_set_cands_failures_fp, 'U') as rep_set_f:
        mw_seqs = _get_rep_set_lookup(rep_set_f)
    with open(master_otu_table_ms_fp, 'U') as master_otu_table_f:
        master_otu_table_ms = parse_biom_table(master_otu_table_f)

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    output_img_dir = join(output_dir, 'img')
    try:
        makedirs(output_img_dir)
    except OSError:
        # It already exists, which is okay since we already know we are in
        # 'force' mode from above.
        pass

    tsv_lines, html_table_lines, mw_fasta_lines, plot_fps, plot_data_fps = \
        _format_top_n_results_table(
            top_n_mw, mw_seqs, master_otu_table_ms, output_img_dir,
            mapping_category, suppress_taxonomic_output,
            num_categories_to_plot)

    # The relative names are reused as link targets in the HTML report, so
    # the TSV and FASTA files must sit next to the HTML file in output_dir.
    mw_tsv_rel_fp = 'most_wanted_otus.txt'
    with open(join(output_dir, mw_tsv_rel_fp), 'w') as mw_tsv_f:
        mw_tsv_f.write(tsv_lines)

    mw_fasta_rel_fp = 'most_wanted_otus.fasta'
    with open(join(output_dir, mw_fasta_rel_fp), 'w') as mw_fasta_f:
        mw_fasta_f.write(mw_fasta_lines)

    html_dl_links = (
        '<a href="%s" target="_blank">Download table in tab-'
        'separated value (TSV) format</a><br /><a href="%s" '
        'target="_blank">Download OTU sequence data in FASTA format</a>'
        % (mw_tsv_rel_fp, mw_fasta_rel_fp))
    # The download links are emitted both above and below the table.
    html_lines = '%s<div>%s<br /><br />%s<br />%s</div>%s' % (
        html_header, html_dl_links, html_table_lines, html_dl_links,
        html_footer)

    with open(join(output_dir, 'most_wanted_otus.html'), 'w') as mw_html_f:
        mw_html_f.write(html_lines)

    logger.close()