def run_make_otu_heatmap_html(otu_table_fp, mapping_fp, output_dir, params,
                              qiime_config, command_handler, tree_fp,
                              status_update_callback=print_to_stdout):
    """ This function calls the make_otu_heatmap_html script """
    
    # define upper-level values
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    commands = []
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # get the user-defined parameters
    try:
        params_str = get_params_str(params['make_otu_heatmap_html'])
    except KeyError:
        params_str = ''

    # Build the make_otu_heatmap_html command
    heatmap_cmd = '%s %s/make_otu_heatmap_html.py -i %s -m %s -t %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, mapping_fp,tree_fp, output_dir, 
      params_str)
    
    commands.append([('OTU Heatmap' , heatmap_cmd)])
     
    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)

    return True
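
# A minimal usage sketch for run_make_otu_heatmap_html; all file paths below
# are hypothetical, and params/qiime_config would normally come from
# parse_qiime_parameters and load_qiime_config. Shown commented out so the
# module stays importable.
#
# run_make_otu_heatmap_html('otu_table.biom',
#                           'mapping.txt',
#                           'heatmap_output/',
#                           parse_qiime_parameters([]),
#                           load_qiime_config(),
#                           call_commands_serially,
#                           'rep_set.tre')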
def rarefy_otu_table(data_access, otu_table_fname, otu_table_file_dir, 
                     otu_table_file_dir_db, otutable_rarefied_at, meta_id, 
                     otu_table_filepath, otu_table_filepath_db, zip_fpath):
    """ Rarefy the OTU table is specified by user """
    
    otu_table_basename, otu_table_ext = os.path.splitext(otu_table_fname)
    
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    commands = []
    command_handler = call_commands_serially
    status_update_callback = no_status_updates
    logger = WorkflowLogger(generate_log_fp('/tmp/'),
                            params={},
                            qiime_config=qiime_config)
    
    # get the date to put in the db
    run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    
    # Sample the OTU table at even depth
    new_fname='%s_even%d%s' % (otu_table_basename, otutable_rarefied_at, 
                               otu_table_ext)
    even_sampled_otu_table_fp = os.path.join(otu_table_file_dir, new_fname)
    single_rarefaction_cmd = \
     '%s %s/single_rarefaction.py -i %s -o %s -d %d' % \
     (python_exe_fp, script_dir, otu_table_filepath,
      even_sampled_otu_table_fp, otutable_rarefied_at)
    commands.append([('Sample OTU table at %d seqs/sample' % \
                     otutable_rarefied_at, single_rarefaction_cmd)])
      
    otu_table_filepath=even_sampled_otu_table_fp
    otu_table_filepath_db=os.path.join(otu_table_file_dir_db, new_fname)
    
    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)
    
    # Insert the rarefied OTU table filepath to the DB
    valid=data_access.addMetaAnalysisFiles(True, int(meta_id), 
                                           otu_table_filepath_db, 
                                           'OTUTABLE', run_date, 
                                           'OTU_TABLE')
    if not valid:
        raise ValueError('There was an issue uploading the filepaths to the DB!')
    
    # zip the rarefied OTU table
    cmd_call='cd %s; zip %s %s' % (otu_table_file_dir, zip_fpath, 
                                   otu_table_filepath.split('/')[-1])
    system(cmd_call)
    
    return
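
# The system() call above interpolates unquoted paths into a shell command.
# A sketch of a shell-free alternative using the standard library (an
# alternative approach, not what this module does):
#
# import subprocess
# subprocess.check_call(['zip', zip_fpath,
#                        os.path.basename(otu_table_filepath)],
#                       cwd=otu_table_file_dir)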
def _start_logging(self,
                   params,
                   args,
                   argv,
                   logger):
    if logger is None:
        self.logger = WorkflowLogger(
            generate_log_fp(params['master_script_log_dir']),
            params={},
            qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        self.logger = logger
        close_logger_on_success = False

    self.logger.write('Command:\n')
    self.logger.write(' '.join(argv))
    self.logger.write('\n\n')

    log_input_md5s(self.logger,
                   [params[p] for p in self._input_file_parameter_ids])

    return close_logger_on_success
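
# A hypothetical caller sketch for _start_logging, following the
# close_logger_on_success convention used by the command handlers in this
# module (the logger is only closed by the code that created it):
#
# close_logger_on_success = self._start_logging(params, args, argv, logger)
# command_handler(commands,
#                 status_update_callback,
#                 logger=self.logger,
#                 close_logger_on_success=close_logger_on_success)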
def pick_nested_reference_otus(input_fasta_fp,
                               input_tree_fp,
                               output_dir,
                               run_id,
                               similarity_thresholds,
                               command_handler,
                               status_update_callback=print_to_stdout):
    """ Pick nested reference OTUs at decreasing similarity thresholds """

    # Prepare some variables for the later steps
    create_dir(output_dir)
    otu_dir = join(output_dir,'otus')
    create_dir(otu_dir)
    rep_set_dir = join(output_dir,'rep_set')
    create_dir(rep_set_dir)
    # currently not doing anything with taxonomies and trees
    # tax_dir = join(output_dir,'taxonomies')
    # create_dir(tax_dir)
    if input_tree_fp:
        tree_dir = join(output_dir,'trees')
        create_dir(tree_dir)
    commands = []
    files_to_remove = []
    
    logger = WorkflowLogger(generate_log_fp(output_dir))
    similarity_thresholds.sort(reverse=True)
    
    current_inseqs_fp = input_fasta_fp
    current_tree_fp = input_tree_fp
    previous_otu_map = None
    for similarity_threshold in similarity_thresholds:
        current_inseqs_basename = splitext(split(current_inseqs_fp)[1])[0]
        
        # pick otus command
        otu_fp = '%s/%d_otu_map.txt' % (otu_dir,similarity_threshold)
        clusters_fp = '%s/%d_clusters.uc' % (otu_dir,similarity_threshold)
        temp_otu_fp = '%s/%s_otus.txt' % (otu_dir, current_inseqs_basename)
        temp_log_fp = '%s/%s_otus.log' % (otu_dir, current_inseqs_basename)
        temp_clusters_fp = '%s/%s_clusters.uc' % (otu_dir, current_inseqs_basename)
        pick_otus_cmd = \
         'pick_otus.py -m uclust -DBz -i %s -s %1.2f -o %s' % (
           current_inseqs_fp,
           similarity_threshold/100.0,
           otu_dir)
        
        commands.append([('Pick OTUs (%d)' % similarity_threshold,
                          pick_otus_cmd)])
        commands.append([('Rename OTU file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_otu_fp,otu_fp))])
        commands.append([('Rename uc file (%d)' % similarity_threshold,
                          'mv %s %s' % (temp_clusters_fp,clusters_fp))])
        files_to_remove.append(temp_log_fp)
        
        # rep set picking
        temp_rep_set_fp = get_tmp_filename(prefix='NestedReference',
                                           suffix='.fasta')
        pick_rep_set_cmd = \
         'pick_rep_set.py -m first -i %s -o %s -f %s' % (
          otu_fp, 
          temp_rep_set_fp,
          current_inseqs_fp)
        commands.append([('Pick Rep Set (%d)' % similarity_threshold,
                           pick_rep_set_cmd)])
        command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        commands = []
        
        # rename representative sequences
        rep_set_fp = '%s/%d_otus_%s.fasta' % (
          rep_set_dir,
          similarity_threshold,
          run_id)
        logger.write('Renaming OTU representative sequences so OTU ids are '
                     'reference sequence ids.\n\n')
        rep_set_f = open(rep_set_fp,'w')
        for e in rename_rep_seqs(open(temp_rep_set_fp,'U')):
            rep_set_f.write('>%s\n%s\n' % e)
        rep_set_f.close()
        files_to_remove.append(temp_rep_set_fp)
        
        # filter the tree, if provided
        if current_tree_fp is not None:
            tree_fp = '%s/%d_otus_%s.tre' % (
              tree_dir,
              similarity_threshold,
              run_id)
            tree_cmd = 'filter_tree.py -i %s -f %s -o %s' %\
               (current_tree_fp,rep_set_fp,tree_fp)
            commands.append([('Filter tree (%d)' % similarity_threshold,tree_cmd)])
            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
            # prep for the next iteration
            current_tree_fp = tree_fp
        
        
        # prep for the next iteration
        remove_files(files_to_remove)
        commands = []
        files_to_remove = []
        current_inseqs_fp = rep_set_fp
        
    logger.close()
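
# A minimal usage sketch for pick_nested_reference_otus; the input paths and
# run id are hypothetical. Thresholds are integer percent identities; the
# function itself sorts them from highest to lowest.
#
# pick_nested_reference_otus(input_fasta_fp='ref_seqs.fasta',
#                            input_tree_fp='ref_seqs.tre',
#                            output_dir='nested_otus/',
#                            run_id='example_run',
#                            similarity_thresholds=[99, 97, 94],
#                            command_handler=call_commands_serially)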
qiime_config = load_qiime_config()

script_info = {}
script_info['brief_description'] = ""
script_info['script_description'] = ""
script_info['script_usage'] = []
script_info['script_usage'].append(("Run a subset of the interface tests in verbose mode","Run interface tests for the add_taxa.py and make_otu_table.py scripts. This illustrates how to run from the qiime_test_dir directory.","%prog -i $PWD/ -l $HOME/qime_script_tests.log -t add_taxa,make_otu_table -v"))
script_info['script_usage'].append(("Run all of the interface tests","Run all script interface tests.  This illustrates how to run from the qiime_test_dir directory.","%prog -i $PWD/ -l $HOME/all_qime_script_tests.log"))
script_info['output_description']= ""
script_info['required_options'] = []

log_fp_prefix = 'script_test_log'
log_fp_suffix = 'txt'
default_log_fp = generate_log_fp(get_qiime_temp_dir(),
                    basefile_name=log_fp_prefix,
                    suffix=log_fp_suffix,
                    timestamp_pattern='%Y%m%d%H%M%S')
default_log_fp_help_str = join(get_qiime_temp_dir(),
                               '%s_TIMESTAMP.%s' % (log_fp_prefix,log_fp_suffix))

script_info['optional_options'] = [\
 make_option('-t','--tests',
             help='comma-separated list of the tests to run [default: all]'),
 make_option('-w','--working_dir',default=get_qiime_temp_dir(),
             help='directory where the tests should be run [default: %default]',
             type='existing_dirpath'),
 make_option('-q','--qiime_scripts_dir',default=qiime_config['qiime_scripts_dir'],
             help='directory containing scripts to test [default: %default]',
             type='existing_dirpath'),
 make_option('-l','--failure_log_fp',type="new_filepath",default=default_log_fp,
             help='log file to store record of failures [default: %s]' % default_log_fp_help_str)
]
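
# In a QIIME 1 script, the script_info dictionary above would typically be
# consumed in main() via parse_command_line_parameters; a sketch assuming
# that standard pattern:
#
# from qiime.util import parse_command_line_parameters
#
# def main():
#     option_parser, opts, args = parse_command_line_parameters(**script_info)
#     # ... run the requested script interface tests ...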
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    status_update_callback=print_to_stdout):
    """
    """

    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)
            
    else:
        categories = []

    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,'Log files'))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)
    
    
    bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
    even_dm_fps = run_beta_diversity_through_plots(
     otu_table_fp=biom_fp, 
     mapping_fp=mapping_fp,
     output_dir=bdiv_even_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     sampling_depth=sampling_depth,
     # force suppression of distance histograms - boxplots work better
     # in this context, and are created below.
     histogram_categories=[],
     tree_fp=tree_fp,
     parallel=parallel,
     logger=logger,
     status_update_callback=status_update_callback)
    
    for bdiv_metric, dm_fp in even_dm_fps:
        for category in categories:
            boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
            try:
                params_str = get_params_str(params['make_distance_boxplots'])
            except KeyError:
                params_str = ''
            boxplots_cmd = \
             'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
             (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
            commands.append([('Boxplots (%s)' % category,
                              boxplots_cmd)])
            index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                '%s/%s_Distances.pdf' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                '%s/%s_Stats.txt' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            
        index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Distance matrix (%s)' % bdiv_metric,
                            '%s/%s_dm.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                            '%s/%s_pc.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        
    ## Alpha rarefaction workflow
    arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
    run_qiime_alpha_rarefaction(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=arare_full_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     tree_fp=tree_fp,
     num_steps=arare_num_steps,
     parallel=parallel,
     logger=logger,
     min_rare_depth=arare_min_rare_depth,
     max_rare_depth=sampling_depth,
     status_update_callback=status_update_callback)
    
    index_links.append(('Alpha rarefaction plots',
                        '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                          % arare_full_output_dir,
                        "Alpha rarefaction results"))
                        
    collated_alpha_diversity_fps = \
     glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
    try:
        params_str = get_params_str(params['compare_alpha_diversity'])
    except KeyError:
        params_str = ''
    for c in categories:
        for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
            alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
            alpha_comparison_output_fp = '%s/%s_%s.txt' % \
             (arare_full_output_dir,c,alpha_metric)
            compare_alpha_cmd = \
             'compare_alpha_diversity.py -i %s -m %s -c %s -d %s -o %s -n 999 %s' %\
             (collated_alpha_diversity_fp, mapping_fp, c, 
              sampling_depth, alpha_comparison_output_fp, params_str)
            commands.append([('Compare alpha diversity (%s, %s)' %\
                               (c,alpha_metric),
                              compare_alpha_cmd)])
            index_links.append(
             ('Alpha diversity statistics (%s, %s)' % (c,alpha_metric),
              alpha_comparison_output_fp,
              "Alpha rarefaction results"))
    
    taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
    run_summarize_taxa_through_plots(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=taxa_plots_output_dir,
     mapping_cat=None, 
     sort=True,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     logger=logger, 
     status_update_callback=status_update_callback)
    

    index_links.append(('Taxa summary bar plots',
                        '%s/taxa_summary_plots/bar_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    index_links.append(('Taxa summary area plots',
                        '%s/taxa_summary_plots/area_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    for c in categories:
        taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,c)
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=c, 
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger, 
         status_update_callback=status_update_callback)

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
    
    # OTU category significance
    for category in categories:
        category_significance_fp = \
         '%s/category_significance_%s.txt' % (output_dir, category)
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # Build the OTU category significance command
        category_significance_cmd = \
         'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
         (biom_fp, mapping_fp, category,
          category_significance_fp, params_str)
        commands.append([('OTU category significance (%s)' % category,
                          category_significance_cmd)])

        index_links.append(('Category significance (%s)' % category,
                    category_significance_fp,
                    "Category results"))
    
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
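
# A minimal usage sketch for run_core_diversity_analyses; the paths, depth,
# and categories are hypothetical.
#
# run_core_diversity_analyses('otu_table.biom',
#                             'mapping.txt',
#                             sampling_depth=1000,
#                             output_dir='core_diversity/',
#                             qiime_config=load_qiime_config(),
#                             tree_fp='rep_set.tre',
#                             categories=['Treatment', 'BodySite'])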
def pick_subsampled_open_reference_otus(input_fp,
                                        refseqs_fp,
                                        output_dir,
                                        percent_subsample,
                                        new_ref_set_id,
                                        command_handler,
                                        params,
                                        qiime_config,
                                        prefilter_refseqs_fp=None,
                                        run_tax_align_tree=True,
                                        prefilter_percent_id=0.60,
                                        min_otu_size=2,
                                        step1_otu_map_fp=None,
                                        step1_failures_fasta_fp=None,
                                        parallel=False,
                                        suppress_step4=False,
                                        logger=None,
                                        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of QIIME
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'
    
    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(logger,[input_fp,refseqs_fp,step1_otu_map_fp,step1_failures_fasta_fp])
    
    # if the user has not passed a different reference collection for the pre-filter,
    # use the main refseqs_fp. this is useful if the user wants to provide a smaller
    # reference collection, or to use the input reference collection when running in
    # iterative mode (rather than an iteration's new refseqs)
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp
    
    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)', prefilter_pick_otu_cmd)])
            
            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input', filter_fasta_cmd)])
            
            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)
            
        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)
        
        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])
        
        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
    
    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set',step1_pick_rep_set_cmd)])
    
    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp,
                    step2_input_fasta_fp,
                    percent_subsample)
    
    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp,
                                 step2_dir,
                                 new_ref_set_id,
                                 denovo_otu_picking_method,
                                 params,
                                 logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])
    
    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',step2_rep_set_cmd)])

    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(
     step1_failures_fasta_fp,
     step3_dir,
     reference_otu_picking_method,
     step2_repset_fasta_fp,
     parallel,
     params,
     logger)
    
    commands.append([
     ('Pick reference OTUs using de novo rep set',step3_cmd)])
    
    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir
    
    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures', 
                          step3_filter_fasta_cmd)])
        
        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp,
                                     step4_dir,
                                     '.'.join([new_ref_set_id,'CleanUp']),
                                     denovo_otu_picking_method,
                                     params,
                                     logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for subsampled failures',step4_rep_set_cmd)])
        
    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps',cat_otu_tables_cmd)])    
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                      'mv %s %s/final_failures.txt' % (step3_failures_list_fp,output_dir))])
    
    command_handler(commands,
        status_update_callback,
        logger=logger,
        close_logger_on_success=False)
    commands = []
    
    otu_fp = merged_otu_map_fp
    # Filter singletons from the otu map
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp,otu_no_singletons_fp,min_otu_size)
    
    ## make the final representative seqs file and a new refseqs file that 
    ## could be used in subsequent otu picking runs.
    ## this is clunky. first, we need to do this without singletons to match
    ## the otu map without singletons. next, there is a difference in what
    ## we need the reference set to be and what we need the repseqs to be. 
    ## the reference set needs to be a superset of the input reference set
    ## to this set. the repset needs to be only the sequences that were observed
    ## in this data set, and we want reps for the step1 reference otus to be 
    ## reads from this run so we don't hit issues building a tree using 
    ## sequences of very different lengths. so...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp,'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write non-singleton otus representative sequences from step1 to the
    # final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    # copy the full input refseqs file to the new refseqs_fp
    copy(refseqs_fp,new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp,'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write 
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp,'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id,seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id,seq))
    new_refseqs_f.close()
    final_repset_f.close()
    
    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table",make_otu_table_cmd)])
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    
    commands = []
    
    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table",add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp,'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
            0,inf,0,inf,negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp,'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []
            
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=close_logger_on_success)
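
# filter_otus_from_otu_map (used above) drops OTUs with fewer than
# min_otu_size member sequences and returns the ids that survive. A minimal
# sketch of that contract (an illustration under assumed behavior, not the
# actual qiime implementation), for a tab-delimited otu map where each line
# is an otu id followed by its member sequence ids:
def _filter_otu_map_sketch(otu_map_fp, output_fp, min_count):
    otus_to_keep = set()
    out_f = open(output_fp, 'w')
    for line in open(otu_map_fp, 'U'):
        fields = line.strip().split('\t')
        # field 0 is the otu id; the remainder are member sequence ids
        if len(fields) - 1 >= min_count:
            otus_to_keep.add(fields[0])
            out_f.write(line)
    out_f.close()
    return otus_to_keep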
def iterative_pick_subsampled_open_reference_otus(
        input_fps,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        run_tax_align_tree=True,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    # if the user has not passed a different reference collection for the pre-filter,
    # use the input refseqs_fp for all iterations. we want to pre-filter all data against
    # the input data as lower percent identity searches with uclust can be slow, so we
    # want the reference collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    otu_table_fps = []
    repset_fasta_fps = []
    for i, input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir, i)
        if iteration_output_exists(iteration_output_dir, min_otu_size):
            # if the output from an iteration already exists, skip that
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger, [input_fp, refseqs_fp])
            logger.write(
                'Iteration %d (input file: %s) output data already exists. '
                'Skipping and moving to next.\n\n' % (i, input_fp))
        else:
            pick_subsampled_open_reference_otus(
                input_fp=input_fp,
                refseqs_fp=refseqs_fp,
                output_dir=iteration_output_dir,
                percent_subsample=percent_subsample,
                new_ref_set_id='.'.join([new_ref_set_id,
                                         str(i)]),
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                run_tax_align_tree=False,
                prefilter_refseqs_fp=prefilter_refseqs_fp,
                prefilter_percent_id=prefilter_percent_id,
                min_otu_size=min_otu_size,
                step1_otu_map_fp=step1_otu_map_fp,
                step1_failures_fasta_fp=step1_failures_fasta_fp,
                parallel=parallel,
                suppress_step4=suppress_step4,
                logger=logger,
                status_update_callback=status_update_callback)
        ## perform post-iteration file shuffling whether the iteration's output
        ## already existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' %
                             (iteration_output_dir, min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)

    # Merge OTU tables - check for existence first as this step has historically
    # been a frequent failure, so is sometimes run manually in failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)
        commands.append([("Merge OTU tables", merge_cmd)])

    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps, final_repset_fp)

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    if run_tax_align_tree:
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        final_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." %
                         otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp, final_otu_table_fp],
                         error_on_missing=False)

            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                repset_fasta_fp=final_repset_fp,
                output_dir=output_dir,
                command_handler=command_handler,
                params=params,
                qiime_config=qiime_config,
                parallel=parallel,
                logger=logger,
                status_update_callback=status_update_callback)

            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
             (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table", add_taxa_cmd)])

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                parse_biom_table(open(otu_table_w_tax_fp, 'U')),
                get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
                0,
                inf,
                0,
                inf,
                negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp, 'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()

            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []

    logger.close()
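
# A minimal usage sketch for the iterative workflow; the per-iteration input
# files and the reference set are hypothetical. Each iteration's
# new_refseqs.fna becomes the reference collection for the next iteration,
# as handled above.
#
# iterative_pick_subsampled_open_reference_otus(
#     input_fps=['run1_seqs.fna', 'run2_seqs.fna'],
#     refseqs_fp='reference_otus.fasta',
#     output_dir='iterative_otus/',
#     percent_subsample=0.001,
#     new_ref_set_id='NewRef',
#     command_handler=call_commands_serially,
#     params=parse_qiime_parameters([]),
#     qiime_config=load_qiime_config())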
def assign_tax(repset_fasta_fp,
               output_dir,
               command_handler,
               params,
               qiime_config,
               parallel=False,
               logger=None,
               status_update_callback=print_to_stdout):
    """ Assign taxonomy to a representative set of sequences """
    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and assignment_method in ('rdp', 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp,assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy',assign_taxonomy_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp
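
# A minimal usage sketch for assign_tax; the rep set path is hypothetical.
# The returned taxonomy_fp points into the <method>_assigned_taxonomy
# directory created above.
#
# taxonomy_fp = assign_tax('rep_set.fna',
#                          output_dir='otus/',
#                          command_handler=call_commands_serially,
#                          params=parse_qiime_parameters([]),
#                          qiime_config=load_qiime_config())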
def create_personal_results(output_dir,
                            mapping_fp,
                            coord_fp,
                            collated_dir,
                            otu_table_fp,
                            prefs_fp,
                            personal_id_column,
                            personal_ids=None,
                            column_title='Self',
                            individual_titles=None,
                            category_to_split='BodySite',
                            time_series_category='WeeksSinceStart',
                            rarefaction_depth=10000,
                            alpha=0.05,
                            rep_set_fp=None,
                            parameter_fp=None,
                            body_site_rarefied_otu_table_dir=None,
                            retain_raw_data=False,
                            suppress_alpha_rarefaction=False,
                            suppress_beta_diversity=False,
                            suppress_taxa_summary_plots=False,
                            suppress_alpha_diversity_boxplots=False,
                            suppress_otu_category_significance=False,
                            command_handler=call_commands_serially,
                            status_update_callback=no_status_updates):
    """ Generate personalized results pages for each personal ID """
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, 'support_files')
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), 'my_microbes', 'support_files'),
                 support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, 'U'))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column "
                         "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file "
            "column header." % category_to_split)

    header = header[:-1] + [column_title] + [header[-1]]

    # column that differentiates between body-sites within a single individual
    # used for the creation of the vectors in make_3d_plots.py, this data is
    # created by concatenating the two columns when writing the mapping file
    site_id_category = '%s&&%s' % (personal_id_column, category_to_split)
    header.insert(len(header)-1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError("'%s' is not a personal ID in the mapping "
                                 "file column '%s'." %
                                 (pid, personal_id_column))

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column "
                         "header." % time_series_category)

    otu_table_title = splitext(basename(otu_table_fp))[0]

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir,
                add_filename_suffix(otu_table_fp,
                                    '_even%d' % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = 'Rarefying OTU table'
            cmd = 'single_rarefaction.py -i %s -o %s -d %s' % (otu_table_fp,
                    rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, 'per_body_site_otu_tables')

            cmd_title = 'Splitting rarefied OTU table by body site'
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (
                    rarefied_otu_table_fp, mapping_fp, category_to_split,
                    per_body_site_dir)
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest,
                                        'mapping_file.txt')
        html_fp = join(output_dir, person_of_interest, 'index.html')

        personal_mapping_data = create_personal_mapping_file(mapping_data,
                person_of_interest, personal_id_index, bodysite_index,
                individual_titles)

        personal_mapping_f = open(personal_mapping_file_fp, 'w')
        personal_mapping_f.write(
                format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index]
                                   for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ''
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest,
                                     'adiv_boxplots')
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" %
                         person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                    collated_dir, personal_mapping_file_fp,
                    category_to_split, column_title, rarefaction_depth,
                    adiv_boxplots_dir)

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename)
                        for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = \
                    create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest,
                                   'alpha_rarefaction')
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = 'Creating rarefaction plots (%s)' % person_of_interest
            cmd = 'make_rarefaction_plots.py -i %s -m %s -p %s -o %s' % (
                    collated_dir, personal_mapping_file_fp, prefs_fp,
                    rarefaction_dir)
            commands.append([(cmd_title, cmd)])

            raw_data_dirs.append(join(rarefaction_dir, 'average_plots'))
            raw_data_dirs.append(join(rarefaction_dir, 'average_tables'))

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, 'beta_diversity')
            pcoa_time_series_dir = join(output_dir, person_of_interest, 
                                         'beta_diversity_time_series')
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = 'Creating beta diversity time series plots (%s)' % \
                        person_of_interest
            cmd = ('make_3d_plots.py -m %s -p %s -i %s -o %s '
                   '--custom_axes=\'%s\' --add_vectors=\'%s,%s\'' % (
                    personal_mapping_file_fp, prefs_fp, coord_fp,
                    pcoa_time_series_dir, time_series_category,
                    site_id_category, time_series_category))
            commands.append([(cmd_title, cmd)])
            
            cmd_title = 'Creating beta diversity plots (%s)' % \
                        person_of_interest
            cmd = 'make_3d_plots.py -m %s -p %s -i %s -o %s' % (personal_mapping_file_fp,
                                                                prefs_fp, coord_fp,
                                                                pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

        ## Time series taxa summary plots steps
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, 'time_series')
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            ## Split OTU table into self/other per-body-site tables
            commands = []
            cmd_title = 'Splitting OTU table into self/other (%s)' % \
                        person_of_interest
            cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (otu_table_fp,
                    personal_mapping_file_fp, column_title, area_plots_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            for column_title_value in column_title_values:
                biom_fp = join(area_plots_dir,
                               add_filename_suffix(otu_table_fp,
                                                   '_%s' % column_title_value))
                column_title_map_fp = join(area_plots_dir, 'mapping_%s.txt' %
                                                           column_title_value)
                raw_data_files.append(biom_fp)
                raw_data_files.append(column_title_map_fp)

                body_site_dir = join(area_plots_dir, column_title_value)

                commands = []
                cmd_title = 'Splitting "%s" OTU table by body site (%s)' % \
                            (column_title_value, person_of_interest)
                cmd = 'split_otu_table.py -i %s -m %s -f %s -o %s' % (biom_fp,
                        personal_mapping_file_fp, category_to_split,
                        body_site_dir)
                commands.append([(cmd_title, cmd)])
                raw_data_dirs.append(body_site_dir)

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

                commands = []
                for cat_value in cat_values:
                    body_site_otu_table_fp = join(body_site_dir,
                            add_filename_suffix(biom_fp, '_%s' % cat_value))

                    # We won't always get an OTU table if the mapping file
                    # category contains samples that aren't in the OTU table
                    # (e.g. the 'na' state for body site).
                    if exists(body_site_otu_table_fp):
                        plots = join(area_plots_dir, 'taxa_plots_%s_%s' % (
                            column_title_value, cat_value))

                        cmd_title = 'Creating taxa summary plots (%s)' % \
                                    person_of_interest
                        cmd = ('summarize_taxa_through_plots.py -i %s '
                               '-o %s -c %s -m %s -s' %
                               (body_site_otu_table_fp, plots,
                                time_series_category,
                                personal_mapping_file_fp))
                        if parameter_fp is not None:
                            cmd += ' -p %s' % parameter_fp
                            
                        commands.append([(cmd_title, cmd)])

                        raw_data_files.append(join(plots, '*.biom'))
                        raw_data_files.append(join(plots, '*.txt'))

                        create_comparative_taxa_plots_html(cat_value, 
                                join(area_plots_dir, '%s_comparative.html' %
                                                     cat_value))

                command_handler(commands, status_update_callback, logger,
                                close_logger_on_success=False)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ''
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest,
                                   'otu_category_significance')
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(per_body_site_dir,
                        add_filename_suffix(rarefied_otu_table_fp,
                                            '_%s' % cat_value))

                if exists(body_site_otu_table_fp):
                    otu_cat_output_fp = join(otu_cat_sig_dir,
                                             'otu_cat_sig_%s.txt' % cat_value)

                    cmd_title = ('Testing for significant differences in '
                                 'OTU abundances in "%s" body site (%s)' % (
                                 cat_value, person_of_interest))
                    cmd = ('otu_category_significance.py -i %s -m %s -c %s '
                           '-o %s' % (body_site_otu_table_fp,
                                      personal_mapping_file_fp,
                                      column_title,
                                      otu_cat_output_fp))
                    commands.append([(cmd_title, cmd)])
                    raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            command_handler(commands, status_update_callback, logger,
                            close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = \
                    format_otu_category_significance_tables_as_html(
                            otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, 
                            individual_titles, rep_set_fp=rep_set_fp)

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [join(rel_otu_cat_sig_dir, html_filename)
                    for html_filename in otu_cat_sig_html_filenames]

            otu_category_significance_html = \
                    create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(person_of_interest, html_fp,
                alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
                otu_category_significance_html=otu_category_significance_html)

    logger.close()

    # Clean up the unnecessary raw data files and directories. glob will only
    # grab paths that exist.
    if not retain_raw_data:
        for raw_data_fp_glob in raw_data_files:
            remove_files(glob(raw_data_fp_glob))

        for raw_data_dir_glob in raw_data_dirs:
            for dir_to_remove in glob(raw_data_dir_glob):
                rmtree(dir_to_remove)

    return output_directories
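
# A minimal sketch of the glob-based cleanup idiom used above (hypothetical
# paths; glob only returns paths that exist, so missing files are skipped):
#
# for raw_fp in glob('personal_out/*/mapping_file.txt'):
#     remove_files([raw_fp])
# for raw_dir in glob('personal_out/per_body_site_otu_tables'):
#     rmtree(raw_dir)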
Example No. 11
def generate_most_wanted_list(output_dir, otu_table_fp, rep_set_fp, gg_fp,
        nt_fp, mapping_fp, mapping_category, top_n, min_abundance,
        max_abundance, min_categories, max_gg_similarity, e_value,
        word_size, jobs_to_start, command_handler, status_update_callback,
        force):
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError("Output directory '%s' already exists. Please "
                    "choose a different directory, or force overwrite with -f."
                    % output_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))
    commands = []

    # First filter to keep only new (non-GG) OTUs.
    novel_otu_table_fp = join(output_dir, add_filename_suffix(otu_table_fp,
                                                              '_novel'))
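    # e.g. a hypothetical 'otu_table.biom' input becomes
    # 'otu_table_novel.biom' under output_dir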
    commands.append([('Filtering out all GG reference OTUs',
            'filter_otus_from_otu_table.py -i %s -o %s -e %s' %
            (otu_table_fp, novel_otu_table_fp, gg_fp))])

    # Next filter to keep only abundant otus in the specified range (looking
    # only at extremely abundant OTUs has the problem of yielding too many
    # that are similar to stuff in the nt database).
    novel_abund_otu_table_fp = join(output_dir,
            add_filename_suffix(novel_otu_table_fp, '_min%d_max%d' %
            (min_abundance, max_abundance)))
    commands.append([('Filtering out all OTUs that do not fall within the '
            'specified abundance threshold',
            'filter_otus_from_otu_table.py -i %s -o %s -n %d -x %d' %
            (novel_otu_table_fp, novel_abund_otu_table_fp, min_abundance,
             max_abundance))])

    # Next, collapse by mapping_category.
    otu_table_by_samp_type_fp = join(output_dir,
            add_filename_suffix(novel_abund_otu_table_fp, '_%s' %
            mapping_category))
    commands.append([('Collapsing OTU table by %s' % mapping_category,
            'summarize_otu_by_cat.py -c %s -o %s -m %s -i %s' %
            (novel_abund_otu_table_fp, otu_table_by_samp_type_fp,
             mapping_category, mapping_fp))])

    # Filter to contain only otus in the specified minimum number of sample
    # types.
    otu_table_by_samp_type_ms_fp = join(output_dir, add_filename_suffix(
            otu_table_by_samp_type_fp, '_ms%d' % min_categories))
    commands.append([('Filtering OTU table to include only OTUs that appear '
            'in at least %d sample groups' % min_categories,
            'filter_otus_from_otu_table.py -i %s -o %s -s %d' %
            (otu_table_by_samp_type_fp, otu_table_by_samp_type_ms_fp,
             min_categories))])

    # Now that we have a filtered down OTU table of good candidate OTUs, filter
    # the corresponding representative set to include only these candidate
    # sequences.
    candidate_rep_set_fp = join(output_dir, add_filename_suffix(
            rep_set_fp, '_most_wanted_candidates'))
    commands.append([('Filtering representative set to include only the '
            'latest candidate OTUs',
            'filter_fasta.py -f %s -o %s -b %s' %
            (rep_set_fp, candidate_rep_set_fp, otu_table_by_samp_type_ms_fp))])

    # Find the otus that don't hit GG at a certain maximum similarity
    # threshold.
    uclust_output_dir = join(output_dir, 'most_wanted_candidates_%s_%s' %
            (basename(gg_fp), str(max_gg_similarity)))
    commands.append([('Running uclust to get list of sequences that don\'t '
            'hit the maximum GG similarity threshold',
            'parallel_pick_otus_uclust_ref.py -i %s -o %s -r %s -s %s -O %d' %
            (candidate_rep_set_fp, uclust_output_dir, gg_fp,
             str(max_gg_similarity), jobs_to_start))])

    # Filter the candidate sequences to only include the failures from uclust.
    cand_gg_dis_rep_set_fp = join(output_dir,
            add_filename_suffix(candidate_rep_set_fp, '_failures'))
    commands.append([('Filtering candidate sequences to only include uclust '
            'failures',
            'filter_fasta.py -f %s -s %s -o %s' %
            (candidate_rep_set_fp, join(uclust_output_dir,
             splitext(basename(candidate_rep_set_fp))[0] + '_failures.txt'),
             cand_gg_dis_rep_set_fp))])

    # BLAST the failures against nt.
    blast_output_dir = join(output_dir, 'blast_output')
    commands.append([('BLASTing candidate sequences against nt database',
            'parallel_blast.py -i %s -o %s -r %s -D -e %f -w %d -O %d' %
            (cand_gg_dis_rep_set_fp, blast_output_dir, nt_fp, e_value,
             word_size, jobs_to_start))])

    # Execute the commands we have so far, but keep the logger open because
    # we're going to write additional status updates as we process the data.
    command_handler(commands, status_update_callback, logger,
                    close_logger_on_success=False)

    # We'll sort the BLAST results by percent identity (ascending) and pick the
    # top n.
    logger.write("Reading in BLAST results, sorting by percent identity, "
                 "and picking the top %d OTUs.\n\n" % top_n)
    blast_results = open(join(blast_output_dir,
        splitext(basename(cand_gg_dis_rep_set_fp))[0] + '_blast_out.txt'), 'U')
    top_n_mw = []
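    # Each non-comment line is tab-delimited BLAST tabular output; fields
    # 0-2 are query id, subject id, and percent identity, so each tuple
    # below is (otu_id, nt_subject_id, percent_identity).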
    for line in blast_results:
        # Skip headers.
        line = line.strip()
        if line and not line.startswith('#'):
            line = line.split('\t')
            top_n_mw.append((line[0], line[1], float(line[2])))
    top_n_mw = sorted(top_n_mw, key=itemgetter(2))[:top_n]

    # Read in our filtered down candidate seqs file and latest filtered and
    # collapsed OTU table. We'll need to compute some stats on these to include
    # in our report.
    logger.write("Reading in candidate sequences and latest filtered and "
                 "collapsed OTU table.\n\n")
    mw_seqs = {}
    for seq_id, seq in MinimalFastaParser(open(cand_gg_dis_rep_set_fp, 'U')):
        seq_id = seq_id.strip().split()[0]
        mw_seqs[seq_id] = seq
    otu_table_by_samp_type_ms = parse_biom_table(
            open(otu_table_by_samp_type_ms_fp, 'U'))

    # Write results out to tsv and HTML table.
    logger.write("Writing most wanted OTUs results to TSV and HTML "
                 "tables.\n\n")
    mw_tsv_f = open(join(output_dir,
                    'top_%d_most_wanted_otus.txt' % top_n), 'w')
    mw_html_f = open(join(output_dir,
                    'top_%d_most_wanted_otus.html' % top_n), 'w')
    tsv_header = 'OTU ID\tSequence\tGreengenes taxonomy\t' + \
                 'NCBI nt closest match\tNCBI nt % identity'
    mw_tsv_f.write(tsv_header + '\n')

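    # The HTML table has one extra column (the abundance pie chart), so it
    # is appended to the header only after the TSV header has been written.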
    tsv_header += '\tAbundance by %s' % mapping_category
    html_header = ''
    for col in tsv_header.split('\t'):
        html_header += '<th>%s</th>' % col
    mw_html_f.write('<table><tr>' + html_header + '</tr>')

    for otu_id, subject_id, percent_identity in top_n_mw:
        # Grab all necessary information to be included in our report.
        seq = mw_seqs[otu_id]
        tax = otu_table_by_samp_type_ms.ObservationMetadata[
            otu_table_by_samp_type_ms.getObservationIndex(otu_id)]['taxonomy']
        gb_id = subject_id.split('|')[3]
        ncbi_link = 'http://www.ncbi.nlm.nih.gov/nuccore/%s' % gb_id

        # Compute the abundance of each most wanted OTU in each sample
        # grouping and create a pie chart to go in the HTML table.
        samp_types = otu_table_by_samp_type_ms.SampleIds
        counts = otu_table_by_samp_type_ms.observationData(otu_id)
        if len(counts) != len(samp_types):
            raise WorkflowError("The number of observation counts does not "
                                "match the number of samples in the OTU "
                                "table.")

        # Piechart code modified from matplotlib example:
        # http://matplotlib.sourceforge.net/examples/pylab_examples/
        #   pie_demo.html
        figure(figsize=(6,6))
        ax = axes([0.1, 0.1, 0.8, 0.8])
        # Will auto-normalize the counts.
        pie(counts, labels=samp_types, autopct='%1.1f%%', shadow=True)

        output_img_dir = join(output_dir, 'img')
        try:
            makedirs(output_img_dir)
        except OSError:
            # It already exists, which is okay since we already know we are in
            # 'force' mode from above.
            pass

        # We need a relative path to the image.
        pie_chart_fp = join('img', 'abundance_by_%s_%s.png' %
                            (mapping_category, otu_id))
        savefig(join(output_dir, pie_chart_fp))

        mw_tsv_f.write('%s\t%s\t%s\t%s\t%s\n' %
                       (otu_id, seq, tax, gb_id, percent_identity))

        mw_html_f.write('<tr><td>%s</td><td>%s</td><td>%s</td>'
                '<td><a href="%s" target="_blank">%s</a></td><td>%s</td><td>'
                '<img src="%s" /></td></tr>' % (otu_id, seq, tax, ncbi_link,
                gb_id, percent_identity, pie_chart_fp))
    mw_html_f.write('</table>')
    mw_tsv_f.close()
    mw_html_f.close()
    logger.close()
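
# A minimal usage sketch for this workflow (all paths and settings are
# hypothetical; command_handler and status_update_callback would typically
# be QIIME's call_commands_serially and no_status_updates):
#
# generate_most_wanted_list('mw_out', 'otu_table.biom', 'rep_set.fna',
#         'gg_97_otus.fasta', 'nt', 'map.txt', 'SampleType', top_n=100,
#         min_abundance=10, max_abundance=1000, min_categories=2,
#         max_gg_similarity=0.97, e_value=1e-10, word_size=28,
#         jobs_to_start=4, command_handler=call_commands_serially,
#         status_update_callback=no_status_updates, force=True)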
Example No. 12
def assign_taxonomy_multiple_times(input_dirs,
                                   output_dir,
                                   assignment_methods,
                                   reference_seqs_fp,
                                   input_fasta_filename,
                                   clean_otu_table_filename,
                                   id_to_taxonomy_fp=None,
                                   confidences=None,
                                   e_values=None,
                                   command_handler=call_commands_serially,
                                   rdp_max_memory=None,
                                   status_update_callback=print_to_stdout,
                                   force=False,
                                   read_1_seqs_fp=None,
                                   read_2_seqs_fp=None):
    """ Performs sanity checks on passed arguments and directories. Builds 
        commands for each method and sends them off to be executed. """
    ## Check if temp output directory exists
    try:
        makedirs(output_dir)
    except OSError:
        if not force:
            raise WorkflowError(
                "Output directory '%s' already exists. Please "
                "choose a different directory, or force overwrite with -f." %
                output_dir)

    ## Check for inputs that are universally required
    if assignment_methods is None:
        raise WorkflowError("You must specify at least one method:"
                            "'rdp', 'blast', 'mothur', or 'rtax'.")
    if input_fasta_filename is None:
        raise WorkflowError("You must provide an input fasta filename.")
    if clean_otu_table_filename is None:
        raise WorkflowError("You must provide a clean otu table filename.")
    if id_to_taxonomy_fp is None:
        raise WorkflowError("You must provide an ID to taxonomy map filename.")

    logger = WorkflowLogger(generate_log_fp(output_dir))
    time_results = []

    for input_dir in input_dirs:
        ## Make sure the input dataset directory exists.
        if not isdir(input_dir):
            raise WorkflowError("The input directory '%s' does not exist." %
                                input_dir)

        input_dir_name = split(normpath(input_dir))[1]
        output_dataset_dir = join(output_dir, input_dir_name)
        input_fasta_fp = join(input_dir, input_fasta_filename)
        clean_otu_table_fp = join(input_dir, clean_otu_table_filename)

        logger.write("\nCreating output subdirectory '%s' if it doesn't "
                     "already exist.\n" % output_dataset_dir)
        try:
            makedirs(output_dataset_dir)
        except OSError:
            # It already exists, which is okay since we already know we are in
            # 'force' mode from above.
            pass

        for method in assignment_methods:
            ## Method is RDP
            if method == 'rdp':
                ## Check for execution parameters required by RDP method
                if confidences is None:
                    raise WorkflowError("You must specify at least one "
                                        "confidence level.")
                ## Generate command for RDP
                commands = _generate_rdp_commands(
                    output_dataset_dir,
                    input_fasta_fp,
                    reference_seqs_fp,
                    id_to_taxonomy_fp,
                    clean_otu_table_fp,
                    confidences,
                    rdp_max_memory=rdp_max_memory)

            ## Method is BLAST
            elif method == 'blast':
                ## Check for execution parameters required by BLAST method
                if e_values is None:
                    raise WorkflowError("You must specify at least one "
                                        "E value.")
                ## Generate command for BLAST
                commands = _generate_blast_commands(
                    output_dataset_dir, input_fasta_fp, reference_seqs_fp,
                    id_to_taxonomy_fp, clean_otu_table_fp, e_values)

            ## Method is Mothur
            elif method == 'mothur':
                ## Check for execution parameters required by Mothur method
                if confidences is None:
                    raise WorkflowError("You must specify at least one "
                                        "confidence level.")
                ## Generate command for mothur
                commands = _generate_mothur_commands(
                    output_dataset_dir, input_fasta_fp, reference_seqs_fp,
                    id_to_taxonomy_fp, clean_otu_table_fp, confidences)

            ## Method is RTAX
            elif method == 'rtax':
                ## Check for execution parameters required by RTAX method
                if read_1_seqs_fp is None:
                    raise WorkflowError("You must specify a file containing "
                                        "the first read from pair-end "
                                        "sequencing.")
                ## Generate command for rtax
                commands = _generate_rtax_commands(
                    output_dataset_dir,
                    input_fasta_fp,
                    reference_seqs_fp,
                    id_to_taxonomy_fp,
                    clean_otu_table_fp,
                    read_1_seqs_fp,
                    read_2_seqs_fp=read_2_seqs_fp)

            ## Unsupported method
            else:
                raise WorkflowError("Unrecognized or unsupported taxonomy "
                                    "assignment method '%s'." % method)

            # send command for current method to command handler
            for command in commands:
                # call_commands_serially needs a list of commands, so wrap
                # the current command in a length-one list.
                c = [command]
                start = time()
                command_handler(c,
                                status_update_callback,
                                logger,
                                close_logger_on_success=False)
                end = time()
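                # Recover the dataset name from the command string: take the
                # token that follows '-i' and keep its parent directory name
                # (the input dataset directory).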
                input_file = command[0][1].split()[
                    command[0][1].split().index('-i') + 1].split('/')[-2]
                if 'Assigning' in command[0][0]:
                    time_results.append(
                        (input_file, ' '.join(command[0][0].split()[2:]),
                         end - start))

    # Write out the per-command timing results collected above.
    logger.write('\n\nAssignment times (seconds):\n')
    for t in time_results:
        # Write each result as: input dataset, method, param, time (seconds).
        # First clean up the '(method, param)' string.
        method, param = t[1].split(', ')
        method = method.lstrip('(')
        param = param.rstrip(')')

        logger.write('%s\t%s\t%s\t%s\n' % (t[0], method, param, str(t[2])))

    logger.close()
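
# A minimal usage sketch (hypothetical directories and filenames; the
# confidences/e_values lists only need to cover the methods requested):
#
# assign_taxonomy_multiple_times(['dataset1', 'dataset2'], 'multi_tax_out',
#         ['rdp', 'blast'], 'ref_seqs.fasta', 'rep_set.fna',
#         'clean_otu_table.biom', id_to_taxonomy_fp='id_to_tax.txt',
#         confidences=[0.5, 0.8], e_values=[1e-10], force=True)
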
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #get all the options
    cd_dir=path.join(opts.fs_fp,'sumtaxa')
    tmp_prefix=get_tmp_filename('',suffix='').strip()
    output_dir=path.join(opts.fs_fp,'sumtaxa','sum_taxa_'+tmp_prefix)
    web_fp=path.join(opts.web_fp,'sumtaxa','sum_taxa_'+tmp_prefix)
    otu_table_fp=opts.otu_table_fp
    mapping_file_fp=opts.mapping_file_fp
    file_name_prefix=opts.fname_prefix
    user_id=int(opts.user_id)
    meta_id=int(opts.meta_id)
    bdiv_rarefied_at=int(opts.bdiv_rarefied_at)
    jobs_to_start=opts.jobs_to_start.split(',')
    tree_fp=opts.tree_fp
    command_handler=call_commands_serially
    status_update_callback=no_status_updates
    zip_fpath=opts.zip_fpath
    zip_fpath_db=opts.zip_fpath_db
    run_date=opts.run_date
    force=True
    
    # get database connection
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"
        pass
        
    # parse params
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.params_path
    
    params=parse_qiime_parameters(parameter_f)
    
    # write output directory
    try:
        makedirs(output_dir)
    except OSError:
        if force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            print "Output directory already exists. Please choose "+\
             "a different directory, or force overwrite with -f."
            exit(1)
    
    create_dir(output_dir)
    commands = []
    
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    #start preparing the script call
    sum_taxa_cmd='%s %s/summarize_taxa_through_plots.py -i %s -m %s -o %s -p %s -s -f' %\
        (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, output_dir,\
         opts.params_path)
    
    chart_types=params['plot_taxa_summary']['chart_type'].split(',')

    html_fpaths=[]
    for ctype in chart_types:
        html_fpaths.append((path.join(web_fp,'taxa_summary_plots',
                                         '%s_charts.html' % (ctype)),
                                         'SUMTAXA'))
    
    commands.append([('Summarize Taxonomy',sum_taxa_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)
    
    
    #zip the files produced
    cmd_call='cd %s; zip -r %s %s' % (output_dir,\
                                      zip_fpath, './*')
    system(cmd_call)

    #add html links to DB for easy display
    for i in html_fpaths:
        valid=data_access.addMetaAnalysisFiles(True,int(meta_id),i[0],
                                               'SUMTAXA',run_date,i[1].upper())
        if not valid:
            raise ValueError, 'There was an issue uploading the filepaths to the DB!'     
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #get all the options
    cd_dir=path.join(opts.fs_fp,'arare')
    tmp_prefix=get_tmp_filename('',suffix='').strip()
    output_dir=path.join(opts.fs_fp,'arare','arare_'+tmp_prefix)
    web_fp=path.join(opts.web_fp,'arare','arare_'+tmp_prefix)
    otu_table_fp=opts.otu_table_fp
    mapping_file_fp=opts.mapping_file_fp
    file_name_prefix=opts.fname_prefix
    user_id=int(opts.user_id)
    meta_id=int(opts.meta_id)
    bdiv_rarefied_at=int(opts.bdiv_rarefied_at)
    jobs_to_start=opts.jobs_to_start
    tree_fp=opts.tree_fp
    command_handler=call_commands_serially
    status_update_callback=no_status_updates
    zip_fpath=opts.zip_fpath
    zip_fpath_db=opts.zip_fpath_db
    run_date=opts.run_date
    force=True
    
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        print "NOT IMPORTING QIIMEDATAACCESS"
        pass
        
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.params_path
    
    params=parse_qiime_parameters(parameter_f)
    
    try:
        makedirs(output_dir)
    except OSError:
        if force:
            pass
        else:
            # Since the analysis can take quite a while, I put this check
            # in to help users avoid overwriting previous output.
            print "Output directory already exists. Please choose "+\
             "a different directory, or force overwrite with -f."
            exit(1)
    
    commands=[]
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # determine whether to run alpha-diversity in serial or parallel
    serial_or_parallel = params['serial_or_parallel']['method']
    if serial_or_parallel=='Serial':
        arare_cmd='%s %s/alpha_rarefaction.py -i %s -m %s -o %s -t %s -p %s -f' %\
            (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, \
             output_dir,tree_fp,opts.params_path)
    else:
        arare_cmd='%s %s/alpha_rarefaction.py -i %s -m %s -o %s -t %s -a -O 50 -p %s -f' %\
            (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp, \
             output_dir,tree_fp,opts.params_path)
    
    commands.append([('Alpha-Rarefaction',arare_cmd)])
    
    command_handler(commands, status_update_callback, logger)

    #zip the distance matrices
    cmd_call='cd %s; zip -r %s %s' % (cd_dir,zip_fpath,'arare_'+tmp_prefix)
    system(cmd_call)

    #convert link into web-link
    web_link=path.join(web_fp, 'alpha_rarefaction_plots',
                       'rarefaction_plots.html')
    
    #add the distance matrices
    valid=data_access.addMetaAnalysisFiles(True, int(meta_id), web_link, 
                                           'ARARE', run_date, 'ARARE')
    if not valid:
        raise ValueError, 'There was an issue uploading the filepaths to the DB!'
Example No. 15
def create_personal_results(
    output_dir,
    mapping_fp,
    coord_fp,
    collated_dir,
    otu_table_fp,
    prefs_fp,
    personal_id_column,
    personal_ids=None,
    column_title="Self",
    individual_titles=None,
    category_to_split="BodySite",
    time_series_category="WeeksSinceStart",
    rarefaction_depth=10000,
    alpha=0.05,
    rep_set_fp=None,
    body_site_rarefied_otu_table_dir=None,
    retain_raw_data=False,
    suppress_alpha_rarefaction=False,
    suppress_beta_diversity=False,
    suppress_taxa_summary_plots=False,
    suppress_alpha_diversity_boxplots=False,
    suppress_otu_category_significance=False,
    command_handler=call_commands_serially,
    status_update_callback=no_status_updates,
):
    # Create our output directory and copy over the resources the personalized
    # pages need (e.g. javascript, images, etc.).
    create_dir(output_dir)

    support_files_dir = join(output_dir, "support_files")
    if not exists(support_files_dir):
        copytree(join(get_project_dir(), "my_microbes", "support_files"), support_files_dir)

    logger = WorkflowLogger(generate_log_fp(output_dir))

    mapping_data, header, comments = parse_mapping_file(open(mapping_fp, "U"))
    try:
        personal_id_index = header.index(personal_id_column)
    except ValueError:
        raise ValueError("Personal ID field '%s' is not a mapping file column " "header." % personal_id_column)
    try:
        bodysite_index = header.index(category_to_split)
    except ValueError:
        raise ValueError("Category to split field '%s' is not a mapping file " "column header." % category_to_split)

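    # Insert the self/other column just before the last column, which QIIME
    # mapping files require to be Description.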
    header = header[:-1] + [column_title] + [header[-1]]

    # Column that differentiates between body sites within a single
    # individual; it is used to create the vectors in make_3d_plots.py. The
    # data are created by concatenating the two columns when the mapping
    # file is written.
    site_id_category = "%s&&%s" % (personal_id_column, category_to_split)
    header.insert(len(header) - 1, site_id_category)

    all_personal_ids = get_personal_ids(mapping_data, personal_id_index)
    if personal_ids is None:
        personal_ids = all_personal_ids
    else:
        for pid in personal_ids:
            if pid not in all_personal_ids:
                raise ValueError(
                    "'%s' is not a personal ID in the mapping " "file column '%s'." % (pid, personal_id_column)
                )

    if time_series_category not in header:
        raise ValueError("Time series field '%s' is not a mapping file column " "header." % time_series_category)

    otu_table_title = splitext(basename(otu_table_fp))[0]

    output_directories = []
    raw_data_files = []
    raw_data_dirs = []

    # Rarefy the OTU table and split by body site here (instead of on a
    # per-individual basis) as we can use the same rarefied and split tables
    # for each individual.
    if not suppress_otu_category_significance:
        rarefied_otu_table_fp = join(output_dir, add_filename_suffix(otu_table_fp, "_even%d" % rarefaction_depth))

        if body_site_rarefied_otu_table_dir is None:
            commands = []
            cmd_title = "Rarefying OTU table"
            cmd = "single_rarefaction.py -i %s -o %s -d %s" % (otu_table_fp, rarefied_otu_table_fp, rarefaction_depth)
            commands.append([(cmd_title, cmd)])
            raw_data_files.append(rarefied_otu_table_fp)

            per_body_site_dir = join(output_dir, "per_body_site_otu_tables")

            cmd_title = "Splitting rarefied OTU table by body site"
            cmd = "split_otu_table.py -i %s -m %s -f %s -o %s" % (
                rarefied_otu_table_fp,
                mapping_fp,
                category_to_split,
                per_body_site_dir,
            )
            commands.append([(cmd_title, cmd)])
            raw_data_dirs.append(per_body_site_dir)

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)
        else:
            per_body_site_dir = body_site_rarefied_otu_table_dir

    for person_of_interest in personal_ids:
        # Files to clean up on a per-individual basis.
        personal_raw_data_files = []
        personal_raw_data_dirs = []

        create_dir(join(output_dir, person_of_interest))

        personal_mapping_file_fp = join(output_dir, person_of_interest, "mapping_file.txt")
        html_fp = join(output_dir, person_of_interest, "index.html")

        personal_mapping_data = create_personal_mapping_file(
            mapping_data, person_of_interest, personal_id_index, bodysite_index, individual_titles
        )

        personal_mapping_f = open(personal_mapping_file_fp, "w")
        personal_mapping_f.write(format_mapping_file(header, personal_mapping_data, comments))
        personal_mapping_f.close()
        personal_raw_data_files.append(personal_mapping_file_fp)

        column_title_index = header.index(column_title)
        column_title_values = set([e[column_title_index] for e in personal_mapping_data])
        cat_index = header.index(category_to_split)
        cat_values = set([e[cat_index] for e in personal_mapping_data])

        # Generate alpha diversity boxplots, split by body site, one per
        # metric. We run this one first because it completes relatively
        # quickly and it does not call any QIIME scripts.
        alpha_diversity_boxplots_html = ""
        if not suppress_alpha_diversity_boxplots:
            adiv_boxplots_dir = join(output_dir, person_of_interest, "adiv_boxplots")
            create_dir(adiv_boxplots_dir)
            output_directories.append(adiv_boxplots_dir)

            logger.write("\nGenerating alpha diversity boxplots (%s)\n\n" % person_of_interest)

            plot_filenames = _generate_alpha_diversity_boxplots(
                collated_dir,
                personal_mapping_file_fp,
                category_to_split,
                column_title,
                rarefaction_depth,
                adiv_boxplots_dir,
            )

            # Create relative paths for use with the index page.
            rel_boxplot_dir = basename(normpath(adiv_boxplots_dir))
            plot_fps = [join(rel_boxplot_dir, plot_filename) for plot_filename in plot_filenames]

            alpha_diversity_boxplots_html = create_alpha_diversity_boxplots_html(plot_fps)

        ## Alpha rarefaction steps
        if not suppress_alpha_rarefaction:
            rarefaction_dir = join(output_dir, person_of_interest, "alpha_rarefaction")
            output_directories.append(rarefaction_dir)

            commands = []
            cmd_title = "Creating rarefaction plots (%s)" % person_of_interest
            cmd = "make_rarefaction_plots.py -i %s -m %s -p %s -o %s" % (
                collated_dir,
                personal_mapping_file_fp,
                prefs_fp,
                rarefaction_dir,
            )
            commands.append([(cmd_title, cmd)])

            personal_raw_data_dirs.append(join(rarefaction_dir, "average_plots"))
            personal_raw_data_dirs.append(join(rarefaction_dir, "average_tables"))

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Beta diversity steps
        if not suppress_beta_diversity:
            pcoa_dir = join(output_dir, person_of_interest, "beta_diversity")
            pcoa_time_series_dir = join(output_dir, person_of_interest, "beta_diversity_time_series")
            output_directories.append(pcoa_dir)
            output_directories.append(pcoa_time_series_dir)

            commands = []
            cmd_title = "Creating beta diversity time series plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py -m %s -p %s -i %s -o %s --custom_axes=" % (
                personal_mapping_file_fp,
                prefs_fp,
                coord_fp,
                pcoa_time_series_dir,
            ) + "'%s' --add_vectors='%s,%s'" % (time_series_category, site_id_category, time_series_category)
            commands.append([(cmd_title, cmd)])

            cmd_title = "Creating beta diversity plots (%s)" % person_of_interest
            cmd = "make_3d_plots.py  -m %s -p %s -i %s -o %s" % (personal_mapping_file_fp, prefs_fp, coord_fp, pcoa_dir)
            commands.append([(cmd_title, cmd)])

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

        ## Time series taxa summary plots steps
        taxa_summary_plots_html = ""
        if not suppress_taxa_summary_plots:
            area_plots_dir = join(output_dir, person_of_interest, "time_series")
            create_dir(area_plots_dir)
            output_directories.append(area_plots_dir)

            files_to_remove, dirs_to_remove = _generate_taxa_summary_plots(
                otu_table_fp,
                personal_mapping_file_fp,
                person_of_interest,
                column_title,
                column_title_values,
                category_to_split,
                cat_values,
                time_series_category,
                area_plots_dir,
                command_handler,
                status_update_callback,
                logger,
            )

            personal_raw_data_files.extend(files_to_remove)
            personal_raw_data_dirs.extend(dirs_to_remove)

            taxa_summary_plots_html = create_taxa_summary_plots_html(output_dir, person_of_interest, cat_values)

        # Generate OTU category significance tables (per body site).
        otu_cat_sig_output_fps = []
        otu_category_significance_html = ""
        if not suppress_otu_category_significance:
            otu_cat_sig_dir = join(output_dir, person_of_interest, "otu_category_significance")
            create_dir(otu_cat_sig_dir)
            output_directories.append(otu_cat_sig_dir)

            # For each body-site rarefied OTU table, run
            # otu_category_significance.py using self versus other category.
            # Keep track of each output file that is created because we need to
            # parse these later on.
            commands = []
            valid_body_sites = []
            for cat_value in cat_values:
                body_site_otu_table_fp = join(
                    per_body_site_dir, add_filename_suffix(rarefied_otu_table_fp, "_%s" % cat_value)
                )

                if exists(body_site_otu_table_fp):
                    # Make sure we have at least one sample for Self, otherwise
                    # otu_category_significance.py crashes with a division by
                    # zero error.
                    with open(body_site_otu_table_fp, "U") as body_site_otu_table_f, open(
                        personal_mapping_file_fp, "U"
                    ) as personal_mapping_file_f:
                        personal_sample_count = _count_per_individual_samples(
                            body_site_otu_table_f, personal_mapping_file_f, personal_id_column, person_of_interest
                        )

                        if personal_sample_count < 1:
                            continue
                        else:
                            valid_body_sites.append(cat_value)

                    otu_cat_output_fp = join(otu_cat_sig_dir, "otu_cat_sig_%s.txt" % cat_value)

                    cmd_title = "Testing for significant differences in " 'OTU abundances in "%s" body site (%s)' % (
                        cat_value,
                        person_of_interest,
                    )
                    cmd = "otu_category_significance.py -i %s -m %s -c %s " "-o %s" % (
                        body_site_otu_table_fp,
                        personal_mapping_file_fp,
                        column_title,
                        otu_cat_output_fp,
                    )
                    commands.append([(cmd_title, cmd)])

                    personal_raw_data_files.append(otu_cat_output_fp)
                    otu_cat_sig_output_fps.append(otu_cat_output_fp)

            # Hack to allow print-only mode.
            if command_handler is not print_commands and not valid_body_sites:
                raise ValueError(
                    "None of the body sites for personal ID '%s' "
                    "could be processed because there were no "
                    "matching samples in the rarefied OTU table." % person_of_interest
                )

            command_handler(commands, status_update_callback, logger, close_logger_on_success=False)

            # Reformat otu category significance tables.
            otu_cat_sig_html_filenames = create_otu_category_significance_html_tables(
                otu_cat_sig_output_fps, alpha, otu_cat_sig_dir, individual_titles, rep_set_fp=rep_set_fp
            )

            # Create relative paths for use with the index page.
            rel_otu_cat_sig_dir = basename(normpath(otu_cat_sig_dir))
            otu_cat_sig_html_fps = [
                join(rel_otu_cat_sig_dir, html_filename) for html_filename in otu_cat_sig_html_filenames
            ]

            otu_category_significance_html = create_otu_category_significance_html(otu_cat_sig_html_fps)

        # Create the index.html file for the current individual.
        create_index_html(
            person_of_interest,
            html_fp,
            taxa_summary_plots_html=taxa_summary_plots_html,
            alpha_diversity_boxplots_html=alpha_diversity_boxplots_html,
            otu_category_significance_html=otu_category_significance_html,
        )

        # Clean up the unnecessary raw data files and directories for the
        # current individual. glob will only grab paths that exist.
        if not retain_raw_data:
            clean_up_raw_data_files(personal_raw_data_files, personal_raw_data_dirs)

    # Clean up any remaining raw data files that weren't created on a
    # per-individual basis.
    if not retain_raw_data:
        clean_up_raw_data_files(raw_data_files, raw_data_dirs)

    logger.close()

    return output_directories
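
# A minimal usage sketch (hypothetical paths; the keyword arguments keep
# their defaults shown in the signature above, so 'BodySite' and
# 'WeeksSinceStart' must be columns in map.txt):
#
# dirs = create_personal_results('personal_out', 'map.txt',
#         'unweighted_unifrac_pc.txt', 'alpha_collated', 'otu_table.biom',
#         'prefs.txt', 'PersonalID')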
Example No. 16
def tax_align_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):

    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp'
                     or assignment_method == 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp,assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    ## Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    ## Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
     (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    ## Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
     (filtered_aln_fp, tree_fp,params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp, failures_fp
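
# A minimal usage sketch (hypothetical inputs; params would normally come
# from parse_qiime_parameters and qiime_config from load_qiime_config()):
#
# taxonomy_fp, failures_fp = tax_align_tree('rep_set.fna', 'tat_out',
#         call_commands_serially, params, qiime_config, parallel=False)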
Example No. 18
def pick_subsampled_open_referenence_otus(
        input_fp,
        refseqs_fp,
        output_dir,
        percent_subsample,
        new_ref_set_id,
        command_handler,
        params,
        qiime_config,
        prefilter_refseqs_fp=None,
        run_tax_align_tree=True,
        prefilter_percent_id=0.60,
        min_otu_size=2,
        step1_otu_map_fp=None,
        step1_failures_fasta_fp=None,
        parallel=False,
        suppress_step4=False,
        logger=None,
        status_update_callback=print_to_stdout):
    """ Run the data preparation steps of Qiime 
    
        The steps performed by this function are:
          - Pick reference OTUs against refseqs_fp
          - Subsample the failures to n sequences.
          - Pick OTUs de novo on the n failures.
          - Pick representative sequences for the resulting OTUs.
          - Pick reference OTUs on all failures using the 
             representative set from step 4 as the reference set.
    
    """
    # for now only allowing uclust for otu picking
    denovo_otu_picking_method = 'uclust'
    reference_otu_picking_method = 'uclust_ref'

    # Prepare some variables for the later steps
    input_dir, input_filename = split(input_fp)
    input_basename, input_ext = splitext(input_filename)
    create_dir(output_dir)
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    log_input_md5s(
        logger,
        [input_fp, refseqs_fp, step1_otu_map_fp, step1_failures_fasta_fp])

    # If the user has not passed a different reference collection for the
    # prefilter, use the main refseqs_fp. This is useful if the user wants
    # to provide a smaller reference collection, or to use the input
    # reference collection when running in iterative mode (rather than an
    # iteration's new refseqs).
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp

    ## Step 1: Closed-reference OTU picking on the input file (if not already complete)
    if step1_otu_map_fp and step1_failures_fasta_fp:
        step1_dir = '%s/step1_otus' % output_dir
        create_dir(step1_dir)
        logger.write("Using pre-existing reference otu map and failures.\n\n")
    else:
        if prefilter_percent_id is not None:
            prefilter_dir = '%s/prefilter_otus/' % output_dir
            prefilter_otu_map_fp = \
             '%s/%s_otus.txt' % (prefilter_dir,input_basename)
            prefilter_failures_list_fp = '%s/%s_failures.txt' % \
             (prefilter_dir,input_basename)
            prefilter_pick_otu_cmd = pick_reference_otus(\
             input_fp,prefilter_dir,reference_otu_picking_method,
             prefilter_refseqs_fp,parallel,params,logger,prefilter_percent_id)
            commands.append([('Pick Reference OTUs (prefilter)',
                              prefilter_pick_otu_cmd)])

            prefiltered_input_fp = '%s/prefiltered_%s%s' %\
             (prefilter_dir,input_basename,input_ext)
            filter_fasta_cmd = 'filter_fasta.py -f %s -o %s -s %s -n' %\
             (input_fp,prefiltered_input_fp,prefilter_failures_list_fp)
            commands.append([('Filter prefilter failures from input',
                              filter_fasta_cmd)])

            input_fp = prefiltered_input_fp
            input_dir, input_filename = split(input_fp)
            input_basename, input_ext = splitext(input_filename)

        ## Build the OTU picking command
        step1_dir = \
         '%s/step1_otus' % output_dir
        step1_otu_map_fp = \
         '%s/%s_otus.txt' % (step1_dir,input_basename)
        step1_pick_otu_cmd = pick_reference_otus(\
         input_fp,step1_dir,reference_otu_picking_method,
         refseqs_fp,parallel,params,logger)
        commands.append([('Pick Reference OTUs', step1_pick_otu_cmd)])

        ## Build the failures fasta file
        step1_failures_list_fp = '%s/%s_failures.txt' % \
         (step1_dir,input_basename)
        step1_failures_fasta_fp = \
         '%s/failures.fasta' % step1_dir
        step1_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (input_fp,step1_failures_list_fp,step1_failures_fasta_fp)

        commands.append([('Generate full failures fasta file',
                          step1_filter_fasta_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    step1_repset_fasta_fp = \
     '%s/step1_rep_set.fna' % step1_dir
    step1_pick_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step1_otu_map_fp, step1_repset_fasta_fp, input_fp)
    commands.append([('Pick rep set', step1_pick_rep_set_cmd)])

    ## Subsample the failures fasta file to retain (roughly) the
    ## percent_subsample
    step2_input_fasta_fp = \
     '%s/subsampled_failures.fasta' % step1_dir
    subsample_fasta(step1_failures_fasta_fp, step2_input_fasta_fp,
                    percent_subsample)

    ## Prep the OTU picking command for the subsampled failures
    step2_dir = '%s/step2_otus/' % output_dir
    step2_cmd = pick_denovo_otus(step2_input_fasta_fp, step2_dir,
                                 new_ref_set_id, denovo_otu_picking_method,
                                 params, logger)
    step2_otu_map_fp = '%s/subsampled_failures_otus.txt' % step2_dir

    commands.append([('Pick de novo OTUs for new clusters', step2_cmd)])

    ## Prep the rep set picking command for the subsampled failures
    step2_repset_fasta_fp = '%s/step2_rep_set.fna' % step2_dir
    step2_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
     (step2_otu_map_fp,step2_repset_fasta_fp,step2_input_fasta_fp)
    commands.append([('Pick representative set for subsampled failures',
                      step2_rep_set_cmd)])

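    ## Cluster the full set of step 1 failures against the de novo rep set
    ## from step 2; reads that still fail here are either picked de novo in
    ## step 4 (below) or written out as the final failures when step 4 is
    ## suppressed.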
    step3_dir = '%s/step3_otus/' % output_dir
    step3_otu_map_fp = '%s/failures_otus.txt' % step3_dir
    step3_failures_list_fp = '%s/failures_failures.txt' % step3_dir
    step3_cmd = pick_reference_otus(step1_failures_fasta_fp, step3_dir,
                                    reference_otu_picking_method,
                                    step2_repset_fasta_fp, parallel, params,
                                    logger)

    commands.append([('Pick reference OTUs using de novo rep set', step3_cmd)])

    # name the final otu map
    merged_otu_map_fp = '%s/final_otu_map.txt' % output_dir

    if not suppress_step4:
        step3_failures_fasta_fp = '%s/failures_failures.fasta' % step3_dir
        step3_filter_fasta_cmd = 'filter_fasta.py -f %s -s %s -o %s' %\
         (step1_failures_fasta_fp,step3_failures_list_fp,step3_failures_fasta_fp)
        commands.append([('Create fasta file of step3 failures',
                          step3_filter_fasta_cmd)])

        step4_dir = '%s/step4_otus/' % output_dir
        step4_cmd = pick_denovo_otus(step3_failures_fasta_fp, step4_dir,
                                     '.'.join([new_ref_set_id, 'CleanUp']),
                                     denovo_otu_picking_method, params, logger)
        step4_otu_map_fp = '%s/failures_failures_otus.txt' % step4_dir
        commands.append([('Pick de novo OTUs on step3 failures', step4_cmd)])
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,step4_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        step4_repset_fasta_fp = '%s/step4_rep_set.fna' % step4_dir
        step4_rep_set_cmd = 'pick_rep_set.py -i %s -o %s -f %s' %\
         (step4_otu_map_fp,step4_repset_fasta_fp,step3_failures_fasta_fp)
        commands.append([('Pick representative set for step3 failures',
                          step4_rep_set_cmd)])

    else:
        # Merge the otu maps
        cat_otu_tables_cmd = 'cat %s %s >> %s' %\
             (step1_otu_map_fp,step3_otu_map_fp,merged_otu_map_fp)
        commands.append([('Merge OTU maps', cat_otu_tables_cmd)])
        # Move the step 3 failures file to the top-level directory
        commands.append([('Move final failures file to top-level directory',
                          'mv %s %s/final_failures.txt' %
                          (step3_failures_list_fp, output_dir))])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)
    commands = []

    otu_fp = merged_otu_map_fp
    # Filter OTUs with fewer than min_otu_size sequences from the OTU map
    # (singletons, by default); filter_otus_from_otu_map writes the filtered
    # map and returns the ids of the OTUs that were retained
    otu_no_singletons_fp = '%s/final_otu_map_mc%d.txt' % (output_dir,
                                                          min_otu_size)
    otus_to_keep = filter_otus_from_otu_map(otu_fp, otu_no_singletons_fp,
                                            min_otu_size)

    ## Make the final representative seqs file and a new refseqs file that
    ## can be used in subsequent OTU picking runs.
    ## This is clunky. First, singletons must be excluded to match the OTU
    ## map without singletons. Second, the reference set and the rep set have
    ## different requirements: the reference set needs to be a superset of
    ## the input reference set, while the rep set should contain only the
    ## sequences that were observed in this data set. We want the reps for
    ## the step1 reference OTUs to be reads from this run so we don't hit
    ## issues building a tree from sequences of very different lengths. So...
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_f = open(final_repset_fp, 'w')
    new_refseqs_fp = '%s/new_refseqs.fna' % output_dir
    # write the representative sequences for non-singleton OTUs from step1
    # to the final rep set file
    for otu_id, seq in MinimalFastaParser(open(step1_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    # copy the full input refseqs file to the new refseqs_fp, then append a
    # newline in case the copied file does not end with one
    copy(refseqs_fp, new_refseqs_fp)
    new_refseqs_f = open(new_refseqs_fp, 'a')
    new_refseqs_f.write('\n')
    # iterate over all representative sequences from step2 and step4 and write
    # those corresponding to non-singleton otus to the final representative set
    # file and the new reference sequences file.
    for otu_id, seq in MinimalFastaParser(open(step2_repset_fasta_fp, 'U')):
        if otu_id.split()[0] in otus_to_keep:
            new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
            final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    if not suppress_step4:
        for otu_id, seq in MinimalFastaParser(open(step4_repset_fasta_fp,
                                                   'U')):
            if otu_id.split()[0] in otus_to_keep:
                new_refseqs_f.write('>%s\n%s\n' % (otu_id, seq))
                final_repset_f.write('>%s\n%s\n' % (otu_id, seq))
    new_refseqs_f.close()
    final_repset_f.close()

    # Prep the make_otu_table.py command
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir, min_otu_size)
    make_otu_table_cmd = 'make_otu_table.py -i %s -o %s' %\
     (otu_no_singletons_fp,otu_table_fp)
    commands.append([("Make the otu table", make_otu_table_cmd)])

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=False)

    commands = []

    if run_tax_align_tree:
        taxonomy_fp, pynast_failures_fp = tax_align_tree(
            repset_fasta_fp=final_repset_fp,
            output_dir=output_dir,
            command_handler=command_handler,
            params=params,
            qiime_config=qiime_config,
            parallel=parallel,
            logger=logger,
            status_update_callback=status_update_callback)

        # Add taxa to otu table
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
         (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
        commands.append([("Add taxa to OTU table", add_taxa_cmd)])

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

        # Build OTU table without PyNAST failures
        otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        filtered_otu_table = filter_otus_from_otu_table(
            parse_biom_table(open(otu_table_w_tax_fp, 'U')),
            get_seq_ids_from_fasta_file(open(pynast_failures_fp, 'U')),
            0,
            inf,
            0,
            inf,
            negate_ids_to_keep=True)
        otu_table_f = open(otu_table_fp, 'w')
        otu_table_f.write(format_biom_table(filtered_otu_table))
        otu_table_f.close()

        command_handler(commands,
                        status_update_callback,
                        logger=logger,
                        close_logger_on_success=False)
        commands = []

    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
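
# The subsample_fasta call in step 2 above is not defined in this file. A
# minimal sketch of its assumed behavior -- retain each failure sequence
# independently with probability percent_subsample -- is given below for
# illustration; the actual QIIME implementation may differ (e.g., in how it
# handles randomness).
def subsample_fasta_sketch(input_fasta_fp, output_fasta_fp, percent_subsample):
    """ Write roughly percent_subsample of the input sequences (sketch) """
    from random import random
    from cogent.parse.fasta import MinimalFastaParser
    output_f = open(output_fasta_fp, 'w')
    for seq_id, seq in MinimalFastaParser(open(input_fasta_fp, 'U')):
        # keep each sequence with probability percent_subsample
        if random() < percent_subsample:
            output_f.write('>%s\n%s\n' % (seq_id, seq))
    output_f.close()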
def iterative_pick_subsampled_open_referenence_otus(
                              input_fps, 
                              refseqs_fp,
                              output_dir,
                              percent_subsample,
                              new_ref_set_id,
                              command_handler,
                              params,
                              qiime_config,
                              prefilter_refseqs_fp=None,
                              prefilter_percent_id=0.60,
                              min_otu_size=2,
                              run_tax_align_tree=True,
                              step1_otu_map_fp=None,
                              step1_failures_fasta_fp=None,
                              parallel=False,
                              suppress_step4=False,
                              logger=None,
                              status_update_callback=print_to_stdout):
    """ Call the pick_subsampled_open_referenence_otus workflow on multiple inputs
         and handle processing of the results.
    """
    create_dir(output_dir)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    # if the user has not passed a different reference collection for the
    # pre-filter, use the input refseqs_fp for all iterations. we want to
    # pre-filter all data against the input reference set, as lower percent
    # identity searches with uclust can be slow, so we want the reference
    # collection to stay at a reasonable size.
    if prefilter_refseqs_fp is None:
        prefilter_refseqs_fp = refseqs_fp
    
    otu_table_fps = []
    repset_fasta_fps = []
    for i,input_fp in enumerate(input_fps):
        iteration_output_dir = '%s/%d/' % (output_dir,i)
        if iteration_output_exists(iteration_output_dir,min_otu_size):
            # if the output from an iteration already exists, skip that 
            # iteration (useful for continuing failed runs)
            log_input_md5s(logger,[input_fp,refseqs_fp])
            logger.write('Iteration %d (input file: %s) output data already exists. '
                         'Skipping and moving to next.\n\n' % (i,input_fp))
        else:
            pick_subsampled_open_referenence_otus(input_fp=input_fp,
                                     refseqs_fp=refseqs_fp,
                                     output_dir=iteration_output_dir,
                                     percent_subsample=percent_subsample,
                                     new_ref_set_id='.'.join([new_ref_set_id,str(i)]),
                                     command_handler=command_handler,
                                     params=params,
                                     qiime_config=qiime_config,
                                     run_tax_align_tree=False,
                                     prefilter_refseqs_fp=prefilter_refseqs_fp,
                                     prefilter_percent_id=prefilter_percent_id,
                                     min_otu_size=min_otu_size,
                                     step1_otu_map_fp=step1_otu_map_fp,
                                     step1_failures_fasta_fp=step1_failures_fasta_fp,
                                     parallel=parallel,
                                     suppress_step4=suppress_step4,
                                     logger=logger,
                                     status_update_callback=status_update_callback)
        ## Perform post-iteration file shuffling whether the iteration's
        ## output already existed or was just computed.
        # step1 otu map and failures can only be used for the first iteration
        # as subsequent iterations need to use updated refseqs files
        step1_otu_map_fp = step1_failures_fasta_fp = None
        new_refseqs_fp = '%s/new_refseqs.fna' % iteration_output_dir
        refseqs_fp = new_refseqs_fp
        otu_table_fps.append('%s/otu_table_mc%d.biom' % (iteration_output_dir,min_otu_size))
        repset_fasta_fps.append('%s/rep_set.fna' % iteration_output_dir)
    
    # Merge OTU tables - check for existence first, as this step has
    # historically been a frequent point of failure and is therefore
    # sometimes run manually after failed runs.
    otu_table_fp = '%s/otu_table_mc%d.biom' % (output_dir,min_otu_size)
    if not (exists(otu_table_fp) and getsize(otu_table_fp) > 0):
        merge_cmd = 'merge_otu_tables.py -i %s -o %s' %\
         (','.join(otu_table_fps),otu_table_fp)        
        commands.append([("Merge OTU tables",merge_cmd)])
    
    # Build master rep set
    final_repset_fp = '%s/rep_set.fna' % output_dir
    final_repset_from_iteration_repsets_fps(repset_fasta_fps,final_repset_fp)
    
    command_handler(commands,
            status_update_callback,
            logger=logger,
            close_logger_on_success=False)
    commands = []
    
    if run_tax_align_tree:
        otu_table_w_tax_fp = \
         '%s/otu_table_mc%d_w_tax.biom' % (output_dir,min_otu_size)
        final_otu_table_fp = \
         '%s/otu_table_mc%d_w_tax_no_pynast_failures.biom' % (output_dir,min_otu_size)
        if exists(final_otu_table_fp) and getsize(final_otu_table_fp) > 0:
            logger.write("Final output file exists (%s). Will not rebuild." % otu_table_fp)
        else:
            # remove files from partially completed runs
            remove_files([otu_table_w_tax_fp,final_otu_table_fp],error_on_missing=False)
        
            taxonomy_fp, pynast_failures_fp = tax_align_tree(
                       repset_fasta_fp=final_repset_fp,
                       output_dir=output_dir,
                       command_handler=command_handler,
                       params=params,
                       qiime_config=qiime_config,
                       parallel=parallel,
                       logger=logger,
                       status_update_callback=status_update_callback)
        
            # Add taxa to otu table
            add_taxa_cmd = 'add_taxa.py -i %s -t %s -o %s' %\
             (otu_table_fp,taxonomy_fp,otu_table_w_tax_fp)
            commands.append([("Add taxa to OTU table",add_taxa_cmd)])
        
            command_handler(commands,
                status_update_callback,
                logger=logger,
                close_logger_on_success=False)
            commands = []
        
            # Build OTU table without PyNAST failures
            filtered_otu_table = filter_otus_from_otu_table(
                  parse_biom_table(open(otu_table_w_tax_fp,'U')),
                  get_seq_ids_from_fasta_file(open(pynast_failures_fp,'U')),
                  0,inf,0,inf,negate_ids_to_keep=True)
            otu_table_f = open(final_otu_table_fp,'w')
            otu_table_f.write(format_biom_table(filtered_otu_table))
            otu_table_f.close()
    
            command_handler(commands,
                            status_update_callback,
                            logger=logger,
                            close_logger_on_success=False)
            commands = []
    
    logger.close()
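
# final_repset_from_iteration_repsets_fps (used above to build the master rep
# set) is not defined in this file. A minimal sketch of its assumed behavior
# follows: merge the per-iteration rep sets, keeping the first representative
# seen for each OTU id. This is illustrative only; the actual QIIME helper
# may differ.
def final_repset_from_iteration_repsets_fps_sketch(repset_fasta_fps,
                                                   final_repset_fp):
    """ Merge iteration rep sets; first representative per OTU id wins """
    from cogent.parse.fasta import MinimalFastaParser
    seen_otu_ids = set()
    final_repset_f = open(final_repset_fp, 'w')
    for repset_fasta_fp in repset_fasta_fps:
        for seq_id, seq in MinimalFastaParser(open(repset_fasta_fp, 'U')):
            otu_id = seq_id.split()[0]
            if otu_id not in seen_otu_ids:
                seen_otu_ids.add(otu_id)
                final_repset_f.write('>%s\n%s\n' % (seq_id, seq))
    final_repset_f.close()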
def run_other_qiime_analysis(data_access, fs_fp, web_fp, otu_table_filepath, 
                             map_filepath, file_name_prefix, user_id, meta_id, 
                             params_path, rarefied_at, jobs_to_start, tree_fp, 
                             zip_fpath, zip_fpath_db):

    # get the date to put in the db
    run_date=datetime.now().strftime("%d/%m/%Y/%H/%M/%S")
    
    # Prepare the params for submitting new jobs to the torque-poller
    params=[]
    params.append('fs_fp=%s' % fs_fp)
    params.append('web_fp=%s' % web_fp)
    params.append('otu_table_fp=%s' % otu_table_filepath)
    params.append('mapping_file_fp=%s' % map_filepath)
    params.append('fname_prefix=%s' % file_name_prefix)
    params.append('user_id=%s' % user_id)
    params.append('meta_id=%s' % meta_id)
    params.append('params_path=%s' % params_path)
    params.append('bdiv_rarefied_at=%s' % rarefied_at)
    params.append('jobs_to_start=%s' % jobs_to_start)
    params.append('tree_fp=%s' % tree_fp)
    params.append('run_date=%s' % run_date)
    params.append('zip_fpath=%s' % zip_fpath)
    params.append('zip_fpath_db=%s' % zip_fpath_db)
    job_input='!!'.join(params)
    
    # Determine which meta-analyses the user selected 
    analyses_to_start=jobs_to_start.split(',')
    
    # Prepare TopiaryExplorer job
    if 'showTE' in analyses_to_start:
        tree_fpath=path.abspath('%s/software/gg_otus_4feb2011/trees/gg_97_otus_4feb2011.tre' % (os.environ['HOME']))
        python_exe_fp = qiime_config['python_exe_fp']
        commands=[]
        command_handler=call_commands_serially
        status_update_callback=no_status_updates
        logger = WorkflowLogger(generate_log_fp('/tmp/'),
                                params={},
                                qiime_config=qiime_config)
        
        #define topiary explorer fpaths
        jnlp_fname=path.splitext(path.split(otu_table_filepath)[-1])[0]+'.jnlp'
        tep_fname=path.splitext(path.split(otu_table_filepath)[-1])[0] + '.tep'
        jnlp_filepath_web=path.join(web_fp, 'topiaryexplorer_files', jnlp_fname)
        jnlp_filepath_web_tep=path.join(web_fp,'topiaryexplorer_files', 
                                        tep_fname)
        
        # define the host URL used for the JNLP link
        if ServerConfig.home=='/home/wwwdevuser/':
            host_name='http://webdev.microbio.me/qiime'
        else:
            host_name='http://www.microbio.me/qiime'
            
        jnlp_filepath_web_tep_url=path.join(host_name, jnlp_filepath_web_tep)
        output_dir=os.path.join(fs_fp, 'topiaryexplorer_files')
        
        #build command
        make_tep_cmd='%s %s/make_tep.py -i %s -m %s -t %s -o %s -u %s -w' %\
        (python_exe_fp, script_dir, otu_table_filepath, map_filepath, 
         tree_fpath, output_dir, jnlp_filepath_web_tep_url)
        
        commands.append([('Make TopiaryExplorer jnlp', make_tep_cmd)])

        # Call the command handler on the list of commands
        command_handler(commands, status_update_callback, logger)
        
        #zip Topiary Explorer jnlp file
        cmd_call='cd %s; zip %s %s' % (output_dir,zip_fpath,jnlp_fname)
        system(cmd_call)
        
        #zip Topiary Explorer project file
        cmd_call='cd %s; zip %s %s' % (output_dir,zip_fpath,tep_fname)
        system(cmd_call)
        
        valid=data_access.addMetaAnalysisFiles(True, int(meta_id), 
                                               jnlp_filepath_web, 'OTUTABLE', 
                                               run_date, 'TOPIARYEXPLORER')
        if not valid:
            raise ValueError, 'There was an issue uploading the filepaths to the DB!'
            
    # Generate and Submit Beta-Diversity Job
    if 'bdiv' in analyses_to_start:
        job_type='betaDiversityThroughPlots'

        # Submit the Beta Diversity jobs
        try:
            # Attempt the submission
            submitQiimeJob(meta_id, user_id, job_type, job_input, data_access)
        
        except Exception, e:
            raise ValueError,e
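
# job_input above is a '!!'-delimited string of key=value pairs. A minimal
# sketch of how the receiving poller could unpack it back into a dict; the
# actual poller-side code is not shown in this file, so this is an assumption
# based only on the format built above.
def parse_job_input(job_input):
    """ Unpack '!!'-delimited key=value pairs into a dict (sketch) """
    params = {}
    for pair in job_input.split('!!'):
        # split on the first '=' only, since values may contain '='
        key, value = pair.split('=', 1)
        params[key] = value
    return params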
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    #get all the options
    tmp_prefix=get_tmp_filename('',suffix='').strip()
    output_dir=path.join(opts.fs_fp,'bdiv',tmp_prefix)
    web_fp=path.join(opts.web_fp,'bdiv',tmp_prefix)
    otu_table_fp=opts.otu_table_fp
    mapping_file_fp=opts.mapping_file_fp
    file_name_prefix=opts.fname_prefix
    user_id=int(opts.user_id)
    meta_id=int(opts.meta_id)
    bdiv_rarefied_at=int(opts.bdiv_rarefied_at)
    jobs_to_start=opts.jobs_to_start.split(',')
    tree_fp=opts.tree_fp
    command_handler=call_commands_serially
    status_update_callback=no_status_updates
    zip_fpath=opts.zip_fpath
    zip_fpath_db=opts.zip_fpath_db
    run_date=opts.run_date
    force=True
    
    # Connect to the database for adding fpaths
    try:
        from data_access_connections import data_access_factory
        from enums import ServerConfig
        import cx_Oracle
        data_access = data_access_factory(ServerConfig.data_access_type)
    except ImportError:
        # if this import fails, later DB calls will raise a NameError
        print "NOT IMPORTING QIIMEDATAACCESS"
    
    # open and get params
    try:
        parameter_f = open(opts.params_path)
    except IOError:
        raise IOError,\
         "Can't open parameters file (%s). Does it exist? Do you have read access?"\
         % opts.params_path
    
    params=parse_qiime_parameters(parameter_f)
    
    create_dir(output_dir)
    commands = []
    
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # get the beta_diversity metrics, so we can determine the filepaths based
    # on these
    beta_diversity_metrics = params['beta_diversity']['metrics'].split(',')
    
    # determine if beta-diversity should be run in serial or parallel
    serial_or_parallel = params['serial_or_parallel']['method']
    
    # start preparing the script call. The serial and parallel variants
    # differ only in the '-a -O 50' options, and the distance histogram
    # fields ('-c') are only added when requested.
    beta_div_cmd='%s %s/beta_diversity_through_plots.py -i %s -m %s -o %s -t %s -p %s' %\
        (python_exe_fp, script_dir, otu_table_fp, mapping_file_fp,
         output_dir, tree_fp, opts.params_path)
    if 'disthist_bdiv_plots' in jobs_to_start:
        beta_div_cmd+=' -c %s' % params['make_distance_histograms']['fields']
    if serial_or_parallel!='Serial':
        beta_div_cmd+=' -a -O 50'
    beta_div_cmd+=' -f'
    
    # add in optional parameters depending on whether they are supplied
    if bdiv_rarefied_at:
        beta_div_cmd+=" -e %s" % (str(bdiv_rarefied_at))
    
    html_fpaths=[]
    
    # add 3d plots params
    if '3d_bdiv_plots' not in jobs_to_start:  
        beta_div_cmd+=" --suppress_3d_plots"
    else:
        for met in beta_diversity_metrics:
            html_fpaths.append((path.join(web_fp,'%s_3d_discrete' % (met),
                                '%s_pc_3D_PCoA_plots.html' % (met)),
                                '3D_DISCRETE_PLOT'))
            html_fpaths.append((path.join(web_fp,'%s_3d_continuous' % (met),
                                         '%s_pc_3D_PCoA_plots.html' % (met)), 
                                         '3D_CONTINUOUS_PLOT'))
                                         
    # add 2d plots params
    if '2d_bdiv_plots' not in jobs_to_start:
        beta_div_cmd+=" --suppress_2d_plots"
    else:
        for met in beta_diversity_metrics:
            html_fpaths.append((path.join(web_fp,'%s_2d_discrete' % (met),
                                         '%s_pc_2D_PCoA_plots.html' % (met)),
                                         '2D_DISCRETE_PLOT'))
            html_fpaths.append((path.join(web_fp,'%s_2d_continuous' % (met),
                                         '%s_pc_2D_PCoA_plots.html' % (met)),
                                          '2D_CONTINUOUS_PLOT'))
    
    # add distance histograms params; note that the command does not
    # explicitly suppress distance histograms when they are not requested
    if 'disthist_bdiv_plots' in jobs_to_start:
        for met in beta_diversity_metrics:
            html_fpaths.append((path.join(web_fp,'%s_histograms' % (met),
                                    '%s_dm_distance_histograms.html' % (met)),
                                         'DISTANCE_HISTOGRAM'))
    
    commands.append([('Beta Diversity Through Plots',beta_div_cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)
    
    
    #zip the files produced
    cmd_call='cd %s; zip -r %s %s' % (output_dir,\
                                      zip_fpath, './*')
    system(cmd_call)

    #add html links to DB for easy display
    for html_fpath, plot_type in html_fpaths:
        valid=data_access.addMetaAnalysisFiles(True,int(meta_id),html_fpath,
                                               'BDIV',run_date,
                                               plot_type.upper())
        if not valid:
            raise ValueError, 'There was an issue uploading the filepaths to the DB!'