from copy import deepcopy

def pick_reference_otus(input_fp,
                        output_dir,
                        otu_picking_method,
                        refseqs_fp,
                        parallel,
                        params,
                        logger,
                        similarity_override=None):
    params_copy = deepcopy(params)
    if similarity_override is not None:
        logger.write('Overriding similarity with %1.3f.\n' % similarity_override)
        if 'pick_otus' in params_copy:
            params_copy['pick_otus']['similarity'] = str(similarity_override)
        else:
            params_copy['pick_otus'] = {'similarity':str(similarity_override)}
    
    if parallel and otu_picking_method == 'uclust_ref':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params_copy['parallel'])
        except KeyError:
            params_str = ''
        
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            if 'otu_picking_method' in params_copy['pick_otus']:
                del params_copy['pick_otus']['otu_picking_method']
        except KeyError:
            pass
        
        params_str += ' %s' % get_params_str(params_copy['pick_otus'])
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
          (otu_picking_script,
           input_fp,
           output_dir,
           refseqs_fp,
           params_str)
    else:
        try:
            params_str = get_params_str(params_copy['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write("Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n")
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
         (input_fp,
          output_dir,
          refseqs_fp,
          otu_picking_method,
          params_str)
    return pick_otus_cmd
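# get_params_str (a QIIME workflow helper used throughout these examples)
# renders one section of the nested parameters dict as a command-line option
# string. A minimal sketch of its contract as inferred from how it is used
# here (a value of None marks a bare flag, as with 'disable_primers' further
# below); this is an illustration, not the QIIME source:
def get_params_str_sketch(params):
    result = []
    for param_id, param_value in params.items():
        # each key becomes a long-form option
        result.append('--%s' % param_id)
        if param_value is not None:
            result.append(str(param_value))
    return ' '.join(result)

# Hypothetical call to pick_reference_otus showing the expected input shapes;
# the paths are placeholders, and any object with a write() method serves as
# the logger:
# import sys
# cmd = pick_reference_otus('/tmp/seqs.fna', '/tmp/uclust_ref_otus',
#                           'uclust_ref', '/refs/97_otus.fasta',
#                           parallel=False, params={'pick_otus': {}},
#                           logger=sys.stdout, similarity_override=0.97)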
# Example 2
def run_make_otu_heatmap_html(otu_table_fp,mapping_fp,output_dir, params,
                              qiime_config,
                              command_handler,tree_fp,
                              status_update_callback=print_to_stdout):
    """ This function calls the make_otu_heatmap_html script """
    
    # define upper-level values
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    commands = []
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # get the user-defined parameters
    try:
        params_str = get_params_str(params['make_otu_heatmap_html'])
    except KeyError:
        params_str = ''

    # Build the make_otu_heatmap_html command
    heatmap_cmd = '%s %s/make_otu_heatmap_html.py -i %s -m %s -t %s -o %s %s' %\
     (python_exe_fp, script_dir, otu_table_fp, mapping_fp,tree_fp, output_dir, 
      params_str)
    
    commands.append([('OTU Heatmap', heatmap_cmd)])
     
    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger)

    return True
# Example 3
def pick_denovo_otus(input_fp, output_dir, new_ref_set_id, otu_picking_method,
                     params, logger):
    # The method is passed explicitly via -m below, so drop it from the
    # params dict; fall back to an empty dict if there is no pick_otus
    # section, which previously left d undefined.
    d = params.get('pick_otus', {}).copy()
    d.pop('otu_picking_method', None)

    d['uclust_otu_id_prefix'] = '%s.ReferenceOTU' % new_ref_set_id
    params_str = ' %s' % get_params_str(d)
    # Build the OTU picking command
    result = 'pick_otus.py -i %s -o %s -m %s %s' %\
     (input_fp, output_dir, otu_picking_method, params_str)

    return result
def run_process_sff_through_split_lib(study_id,run_prefix,sff_input_fp,
    mapping_fp, output_dir, 
    command_handler, params, qiime_config,
    convert_to_flx=False, write_to_all_fasta=False,
    status_update_callback=print_to_stdout):
    """ NOTE: Parts of this function are a directly copied from the
        run_qiime_data_preparation function from the workflow.py library file 
        in QIIME.
    
        The steps performed by this function are:
          1) Process SFFs to generate .fna, .qual and flowgram file.
             (process_sff.py)
          2) De-multiplex sequences. (split_libraries.py)
          
    """

    # Prepare some variables for the later steps
    sff_filenames=sff_input_fp.split(',')
    commands = []
    create_dir(output_dir)
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    # generate a log file
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    make_flowgram=True
    split_lib_fasta_input_files=[]
    split_lib_qual_input_files=[]
    denoise_flow_input_files=[]

    # make a copy of the mapping file
    copied_mapping=split(mapping_fp)[-1]
    mapping_input_fp_copy=join(output_dir, copied_mapping)
    copy_mapping_cmd='cp %s %s' % (mapping_fp,mapping_input_fp_copy)
    commands.append([('CopyMapping', copy_mapping_cmd)])
    
    # iterate over SFFs and match to the mapping file
    for sff_input_fp in sff_filenames:
        # GENERATE THE MD5 HERE AND STORE IN THE DATABASE AFTER FILE 
        # SUCCESSFULLY PROCESSED
        
        # Copy the SFF into the processed files directory
        copied_sff=split(sff_input_fp)[-1]
        sff_input_fp_copy=join(output_dir, copied_sff)

        #Generate filenames for split_libraries
        input_dir, input_filename = split(sff_input_fp)

        if is_gzip(sff_input_fp) and sff_input_fp.endswith('.gz'):
            input_basename, input_ext = splitext(splitext(input_filename)[0])
        else:
            input_basename, input_ext = splitext(input_filename)

        # Convert sff file into fasta, qual and flowgram file
        if convert_to_flx:
            if study_id in ['496','968','969','1069','1002','1066','1194','1195','1457','1458','1460','1536','1918','1962']:
                ### this function is for handling files where the barcode and
                ### linkerprimer are all lowercase (i.e. HMP data or SRA data)
                
                # write process_sff command
                process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s -t --no_trim --use_sfftools' %\
                                  (python_exe_fp, script_dir, sff_input_fp,
                                   output_dir)
                #process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s -t' % (python_exe_fp, script_dir, sff_input_fp, output_dir)
                
                commands.append([('ProcessSFFs', process_sff_cmd)])
                
                # define output fasta from process_sff
                no_trim_fasta_fp=join(output_dir,input_basename + '_FLX.fna')
                
                # define pprospector scripts dir
                pprospector_scripts_dir=join(ServerConfig.home,'software',
                                                 'pprospector','scripts')
                
                # clean fasta - basically converting lowercase to uppercase
                clean_fasta_cmd = '%s %s/clean_fasta.py -f %s -o %s' %\
                                      (python_exe_fp, pprospector_scripts_dir, 
                                       no_trim_fasta_fp,output_dir)
                
                commands.append([('CleanFasta', clean_fasta_cmd)])
                
                # move the cleaned file to be consistent with other processes
                cleaned_fasta_fp=join(output_dir,input_basename + \
                                      '_FLX_filtered.fasta')
                moved_fasta_fp=join(output_dir,input_basename + '_FLX.fna')
                mv_cmd='mv %s %s' %  (cleaned_fasta_fp,moved_fasta_fp)

                commands.append([('RenameFasta',mv_cmd)])
                
                # update the split-lib files to use the cleaned file
                split_lib_fasta_input_files.append(moved_fasta_fp)
                split_lib_qual_input_files.append(join(output_dir,
                                                input_basename + '_FLX.qual'))
                denoise_flow_input_files.append(join(output_dir,
                                                input_basename + '_FLX.txt'))
            else:
                # write process_sff command
                process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s -t' %\
                                  (python_exe_fp, script_dir, sff_input_fp,
                                   output_dir)
                
                commands.append([('ProcessSFFs', process_sff_cmd)])
                
                # get filepaths for generated files
                split_lib_fasta_input_files.append(join(output_dir,
                                                input_basename + '_FLX.fna'))
                split_lib_qual_input_files.append(join(output_dir,
                                                input_basename + '_FLX.qual'))
                denoise_flow_input_files.append(join(output_dir,
                                                input_basename + '_FLX.txt'))
                
                
        else:
            # write process_sff command
            process_sff_cmd = '%s %s/process_sff.py -i %s -f -o %s' %\
                                (python_exe_fp, script_dir, sff_input_fp,
                                 output_dir)
            
            commands.append([('ProcessSFFs', process_sff_cmd)])
            
            # get filepaths for generated files
            split_lib_fasta_input_files.append(join(output_dir,input_basename + '.fna'))
            split_lib_qual_input_files.append(join(output_dir,input_basename + '.qual'))
            denoise_flow_input_files.append(join(output_dir,input_basename + '.txt'))
        

    split_lib_fasta_input=','.join(split_lib_fasta_input_files)
    split_lib_qual_input=','.join(split_lib_qual_input_files)
    denoise_flow_input=','.join(denoise_flow_input_files)
    
    # If dataset is metagenomic disable primer check
    data_access = data_access_factory(ServerConfig.data_access_type)
    study_info=data_access.getStudyInfo(study_id,12171)
    if study_info['investigation_type'].lower() == 'metagenome':
        params['split_libraries']['disable_primers']=None
    
    # create split-libraries folder
    split_library_output=join(output_dir,'split_libraries')
    create_dir(split_library_output)
    
    # get params string
    try:
        params_str = get_params_str(params['split_libraries'])
    except KeyError:
        params_str = ''
    
    # Build the split libraries command
    split_libraries_cmd = '%s %s/split_libraries.py -f %s -q %s -m %s -o %s %s'%\
     (python_exe_fp, script_dir, split_lib_fasta_input, split_lib_qual_input,
      mapping_fp, split_library_output, params_str)
    commands.append([('SplitLibraries', split_libraries_cmd)])
        
    input_fp=join(split_library_output,'seqs.fna')
    
    # create per sample fastq files
    fastq_output=join(split_library_output,'per_sample_fastq')
    create_dir(fastq_output)
    try:
        params_str = get_params_str(params['convert_fastaqual_fastq'])
    except KeyError:
        params_str = ''
        
    input_qual_fp=join(split_library_output,'seqs_filtered.qual')
    
    # build the convert fasta/qual to fastq command
    create_fastq_cmd = '%s %s/convert_fastaqual_fastq.py -f %s -q %s -o %s %s'%\
     (python_exe_fp, script_dir, input_fp, input_qual_fp,
      fastq_output, params_str)
      
    commands.append([('Create FASTQ', create_fastq_cmd)])
   
    # Call the command handler on the list of commands
    command_handler(commands,status_update_callback,logger=logger)
    
    # Return the fasta file paths
    return split_lib_fasta_input_files
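# Every workflow function above hands its accumulated command list to a
# command_handler callable (QIIME provides call_commands_serially for this).
# A simplified stand-in that illustrates the contract -- commands is a list
# of groups, each group a list of (description, shell_command) pairs -- using
# only the standard library; a sketch, not the QIIME implementation:
import subprocess

def call_commands_serially_sketch(commands, status_update_callback, logger,
                                  close_logger_on_success=True):
    for command_group in commands:
        for description, cmd in command_group:
            status_update_callback('Executing: %s' % description)
            logger.write('# %s\n%s\n\n' % (description, cmd))
            return_code = subprocess.call(cmd, shell=True)
            if return_code != 0:
                raise RuntimeError('Command failed (exit %d): %s'
                                   % (return_code, cmd))
    if close_logger_on_success:
        logger.close()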
            input_str = '-i {0} --sample_id {1}'.format(filenames[0], sample_and_prep)
        except Exception, e:
            error = 'Failed to obtain sample and sequence prep info for study_id {0} and run_prefix {1}\n'.format(study_id, run_prefix)
            error += 'SQL was: \n {0} \n'.format(sql)
            error += 'Original exception was: \n {0}'.format(str(e))
            raise Exception(error)
    else:
        input_str=get_split_libraries_fastq_params_and_file_types(filenames, mapping_fp)

    # create split_libraries folder
    split_library_output=join(output_dir,'split_libraries')
    create_dir(split_library_output)

    # get params string
    try:
        params_str = get_params_str(params['split_libraries_fastq'])
    except KeyError:
        params_str = ''

    # Build the split libraries command
    split_libraries_cmd = '%s %s/split_libraries_fastq.py -o %s -m %s %s %s' % \
     (python_exe_fp, script_dir, split_library_output, mapping_input_fp_copy,
      input_str,params_str)

    commands.append([('SplitLibraries', split_libraries_cmd)])

    # define the generated files
    input_fp=join(split_library_output,'seqs.fna')

    # create per sample fastq files
    fastq_output=join(split_library_output,'per_sample_fastq')
# Example 7
from copy import deepcopy

def pick_reference_otus(input_fp,
                        output_dir,
                        otu_picking_method,
                        refseqs_fp,
                        parallel,
                        params,
                        logger,
                        similarity_override=None):
    params_copy = deepcopy(params)
    if 'pick_otus' in params_copy and 'refseqs_fp' in params_copy['pick_otus']:
        raise WorkflowError, \
         ("Cannot pass pick_otus:refseqs_fp in parameters file. This can only be"
          " passed on the command line or through the API.")
    if similarity_override is not None:
        logger.write('Overriding similarity with %1.3f.\n' %
                     similarity_override)
        if 'pick_otus' in params_copy:
            params_copy['pick_otus']['similarity'] = str(similarity_override)
        else:
            params_copy['pick_otus'] = {'similarity': str(similarity_override)}

    if parallel and otu_picking_method == 'uclust_ref':
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params_copy['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            if 'otu_picking_method' in params_copy['pick_otus']:
                del params_copy['pick_otus']['otu_picking_method']
        except KeyError:
            pass

        params_str += ' %s' % get_params_str(params_copy['pick_otus'])
        otu_picking_script = 'parallel_pick_otus_%s.py' % otu_picking_method
        # Build the OTU picking command
        pick_otus_cmd = '%s -i %s -o %s -r %s -T %s' %\
          (otu_picking_script,
           input_fp,
           output_dir,
           refseqs_fp,
           params_str)
    else:
        try:
            params_str = get_params_str(params_copy['pick_otus'])
        except KeyError:
            params_str = ''
        # Since this is reference-based OTU picking we always want to
        # suppress new clusters -- force it here.
        params_str += ' --suppress_new_clusters'
        logger.write(
            "Forcing --suppress_new_clusters as this is reference-based OTU picking.\n\n"
        )
        # Build the OTU picking command
        pick_otus_cmd = 'pick_otus.py -i %s -o %s -r %s -m %s %s' %\
         (input_fp,
          output_dir,
          refseqs_fp,
          otu_picking_method,
          params_str)
    return pick_otus_cmd
# Example 8
def tax_align_tree(repset_fasta_fp,
                   output_dir,
                   command_handler,
                   params,
                   qiime_config,
                   parallel=False,
                   logger=None,
                   status_update_callback=print_to_stdout):

    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False

    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp'
                     or assignment_method == 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp,assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy', assign_taxonomy_cmd)])

    ## Prep the pynast alignment command
    alignment_method = 'pynast'
    pynast_dir = '%s/%s_aligned_seqs' % (output_dir, alignment_method)
    aln_fp = '%s/%s_aligned.fasta' % (pynast_dir, input_basename)
    failures_fp = '%s/%s_failures.fasta' % (pynast_dir, input_basename)
    if exists(pynast_dir):
        rmtree(pynast_dir)

    if parallel:
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''

        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --alignment_method
            # option. This works for now though.
            d = params['align_seqs'].copy()
            if 'alignment_method' in d:
                del d['alignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass

        # Build the parallel pynast alignment command
        align_seqs_cmd = 'parallel_align_seqs_pynast.py -i %s -o %s -T %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['align_seqs'])
        except KeyError:
            params_str = ''
        # Build the pynast alignment command
        align_seqs_cmd = 'align_seqs.py -i %s -o %s %s' %\
         (repset_fasta_fp, pynast_dir, params_str)
    commands.append([('Align sequences', align_seqs_cmd)])

    ## Prep the alignment filtering command
    filtered_aln_fp = '%s/%s_aligned_pfiltered.fasta' %\
     (pynast_dir,input_basename)
    try:
        params_str = get_params_str(params['filter_alignment'])
    except KeyError:
        params_str = ''
    # Build the alignment filtering command
    filter_alignment_cmd = 'filter_alignment.py -o %s -i %s %s' %\
     (pynast_dir, aln_fp, params_str)
    commands.append([('Filter alignment', filter_alignment_cmd)])

    ## Prep the tree building command
    tree_fp = '%s/rep_set.tre' % output_dir
    try:
        params_str = get_params_str(params['make_phylogeny'])
    except KeyError:
        params_str = ''
    # Build the tree building command
    make_phylogeny_cmd = 'make_phylogeny.py -i %s -o %s %s' %\
     (filtered_aln_fp, tree_fp,params_str)
    commands.append([('Build phylogenetic tree', make_phylogeny_cmd)])
    if exists(tree_fp):
        remove_files([tree_fp])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp, failures_fp
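# A hypothetical invocation of tax_align_tree, showing the nested shape of
# the params dict it reads (script name -> option name -> value). The paths
# and option values are placeholders, and qiime_config is assumed to come
# from QIIME's load_qiime_config():
# params = {
#     'assign_taxonomy': {'assignment_method': 'rdp'},
#     'align_seqs': {'min_percent_id': '75.0'},
#     'parallel': {'jobs_to_start': '4'},
# }
# taxonomy_fp, failures_fp = tax_align_tree('/tmp/rep_set.fasta', '/tmp/out',
#                                           call_commands_serially, params,
#                                           qiime_config, parallel=True)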
# Example 9
def run_core_diversity_analyses(
    biom_fp,
    mapping_fp,
    sampling_depth,
    output_dir,
    qiime_config,
    command_handler=call_commands_serially,
    tree_fp=None,
    params=None,
    categories=None,
    arare_min_rare_depth=10,
    arare_num_steps=10,
    parallel=False,
    status_update_callback=print_to_stdout):
    """
    """

    if categories is not None:
        # Validate categories provided by the users
        mapping_data, mapping_comments = \
         parse_mapping_file_to_dict(open(mapping_fp,'U'))
        metadata_map = MetadataMap(mapping_data, mapping_comments)
        for c in categories:
            if c not in metadata_map.CategoryNames:
                raise ValueError, ("Category '%s' is not a column header "
                 "in your mapping file. "
                 "Categories are case and white space sensitive. Valid "
                 "choices are: (%s)" % (c,', '.join(metadata_map.CategoryNames)))
            if metadata_map.hasSingleCategoryValue(c):
                raise ValueError, ("Category '%s' contains only one value. "
                 "Categories analyzed here require at least two values." % c)
            
    else:
        categories = []
    
    # prep some variables
    if params is None:
        params = parse_qiime_parameters([])
        
    create_dir(output_dir)
    index_fp = '%s/index.html' % output_dir
    index_links = []
    commands = []
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    
    # begin logging
    log_fp = generate_log_fp(output_dir)
    index_links.append(('Master run log',log_fp,'Log files'))
    logger = WorkflowLogger(log_fp,
                            params=params,
                            qiime_config=qiime_config)
    input_fps = [biom_fp,mapping_fp]
    if tree_fp is not None:
        input_fps.append(tree_fp)
    log_input_md5s(logger,input_fps)
    
    
    bdiv_even_output_dir = '%s/bdiv_even%d/' % (output_dir,sampling_depth)
    even_dm_fps = run_beta_diversity_through_plots(
     otu_table_fp=biom_fp, 
     mapping_fp=mapping_fp,
     output_dir=bdiv_even_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     sampling_depth=sampling_depth,
     # force suppression of distance histograms - boxplots work better
     # in this context, and are created below.
     histogram_categories=[],
     tree_fp=tree_fp,
     parallel=parallel,
     logger=logger,
     status_update_callback=status_update_callback)
    
    for bdiv_metric, dm_fp in even_dm_fps:
        for category in categories:
            boxplots_output_dir = '%s/%s_boxplots/' % (bdiv_even_output_dir,bdiv_metric)
            try:
                params_str = get_params_str(params['make_distance_boxplots'])
            except KeyError:
                params_str = ''
            boxplots_cmd = \
             'make_distance_boxplots.py -d %s -f %s -o %s -m %s -n 999 %s' %\
             (dm_fp, category, boxplots_output_dir, mapping_fp, params_str)
            commands.append([('Boxplots (%s)' % category,
                              boxplots_cmd)])
            index_links.append(('Distance boxplots (%s)' % bdiv_metric,
                                '%s/%s_Distances.pdf' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            index_links.append(('Distance boxplots statistics (%s)' % bdiv_metric,
                                '%s/%s_Stats.txt' % \
                                 (boxplots_output_dir,category),
                                'Beta diversity results (even sampling: %d)' % sampling_depth))
            
        index_links.append(('3D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_3d_continuous/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('3D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_3d_discrete/%s_pc_3D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, continuous coloring)' % bdiv_metric,
                            '%s/%s_2d_continuous/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('2D plot (%s, discrete coloring)' % bdiv_metric,
                            '%s/%s_2d_discrete/%s_pc_2D_PCoA_plots.html' % \
                             (bdiv_even_output_dir,bdiv_metric,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Distance matrix (%s)' % bdiv_metric,
                            '%s/%s_dm.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        index_links.append(('Principal coordinate matrix (%s)' % bdiv_metric,
                            '%s/%s_pc.txt' % \
                             (bdiv_even_output_dir,bdiv_metric),
                            'Beta diversity results (even sampling: %d)' % sampling_depth))
        
    ## Alpha rarefaction workflow
    arare_full_output_dir = '%s/arare_max%d/' % (output_dir,sampling_depth)
    run_qiime_alpha_rarefaction(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=arare_full_output_dir,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     tree_fp=tree_fp,
     num_steps=arare_num_steps,
     parallel=parallel,
     logger=logger,
     min_rare_depth=arare_min_rare_depth,
     max_rare_depth=sampling_depth,
     status_update_callback=status_update_callback)
    
    index_links.append(('Alpha rarefaction plots',
                        '%s/alpha_rarefaction_plots/rarefaction_plots.html'\
                          % arare_full_output_dir,
                        "Alpha rarefaction results"))
                        
    collated_alpha_diversity_fps = \
     glob('%s/alpha_div_collated/*txt' % arare_full_output_dir)
    try:
        params_str = get_params_str(params['compare_alpha_diversity'])
    except KeyError:
        params_str = ''
    for c in categories:
        for collated_alpha_diversity_fp in collated_alpha_diversity_fps:
            alpha_metric = splitext(split(collated_alpha_diversity_fp)[1])[0]
            alpha_comparison_output_fp = '%s/%s_%s.txt' % \
             (arare_full_output_dir,c,alpha_metric)
            compare_alpha_cmd = \
             'compare_alpha_diversity.py -i %s -m %s -c %s -d %s -o %s -n 999 %s' %\
             (collated_alpha_diversity_fp, mapping_fp, c, 
              sampling_depth, alpha_comparison_output_fp, params_str)
            commands.append([('Compare alpha diversity (%s, %s)' %\
                               (c, alpha_metric),
                              compare_alpha_cmd)])
            index_links.append(
             ('Alpha diversity statistics (%s, %s)' % (c, alpha_metric),
              alpha_comparison_output_fp,
              "Alpha rarefaction results"))
    
    taxa_plots_output_dir = '%s/taxa_plots/' % output_dir
    run_summarize_taxa_through_plots(
     otu_table_fp=biom_fp,
     mapping_fp=mapping_fp,
     output_dir=taxa_plots_output_dir,
     mapping_cat=None, 
     sort=True,
     command_handler=command_handler,
     params=params,
     qiime_config=qiime_config,
     logger=logger, 
     status_update_callback=status_update_callback)
    

    index_links.append(('Taxa summary bar plots',
                        '%s/taxa_summary_plots/bar_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    index_links.append(('Taxa summary area plots',
                        '%s/taxa_summary_plots/area_charts.html'\
                          % taxa_plots_output_dir,
                        "Taxonomic summary results"))
    for c in categories:
        taxa_plots_output_dir = '%s/taxa_plots_%s/' % (output_dir,c)
        run_summarize_taxa_through_plots(
         otu_table_fp=biom_fp,
         mapping_fp=mapping_fp,
         output_dir=taxa_plots_output_dir,
         mapping_cat=c, 
         sort=True,
         command_handler=command_handler,
         params=params,
         qiime_config=qiime_config,
         logger=logger, 
         status_update_callback=status_update_callback)

        index_links.append(('Taxa summary bar plots',
                            '%s/taxa_summary_plots/bar_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
        index_links.append(('Taxa summary area plots',
                            '%s/taxa_summary_plots/area_charts.html'\
                              % taxa_plots_output_dir,
                            "Taxonomic summary results (by %s)" % c))
    
    # OTU category significance
    for category in categories:
        category_significance_fp = \
         '%s/category_significance_%s.txt' % (output_dir, category)
        try:
            params_str = get_params_str(params['otu_category_significance'])
        except KeyError:
            params_str = ''
        # Build the OTU category significance command
        category_significance_cmd = \
         'otu_category_significance.py -i %s -m %s -c %s -o %s %s' %\
         (biom_fp, mapping_fp, category, 
          category_significance_fp, params_str)
        commands.append([('OTU category significance (%s)' % category, 
                          category_significance_cmd)])
                          
        index_links.append(('Category significance (%s)' % category,
                    category_significance_fp,
                    "Category results"))
    
    command_handler(commands, status_update_callback, logger)
    generate_index_page(index_links,index_fp)
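# index_links entries are (link_text, target_fp, section_title) tuples that
# generate_index_page renders into output_dir/index.html. A simplified
# stand-in sketching that behavior (QIIME's own generate_index_page also
# handles details such as relative paths); assumes plain HTML output with
# no escaping:
def generate_index_page_sketch(index_links, index_fp):
    # group the links by their section title
    sections = {}
    for link_text, target_fp, section in index_links:
        sections.setdefault(section, []).append((link_text, target_fp))
    index_f = open(index_fp, 'w')
    index_f.write('<html><body>\n')
    for section, links in sections.items():
        index_f.write('<h2>%s</h2>\n<ul>\n' % section)
        for link_text, target_fp in links:
            index_f.write('<li><a href="%s">%s</a></li>\n'
                          % (target_fp, link_text))
        index_f.write('</ul>\n')
    index_f.write('</body></html>\n')
    index_f.close()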
def run_chain_pick_otus(fasta_file, output_dir, command_handler, params, 
                        qiime_config, parallel=False,
                        status_update_callback=print_to_stdout):
    """ NOTE: Parts of this function are a directly copied from the
        run_qiime_data_preparation function from the workflow.py library file 
        in QIIME.
    
        The steps performed by this function are:
            1) Pick OTUs;

    """
    
    # Prepare some variables for the later steps
    #split_lib_fasta_filenames=fasta_files.split(',')
    otu_maps_to_merge=[]
    commands = []

    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()

    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    ### Starting the chain OTU picking ###
    # Perform exact match pre-filtering
    exact_match_otus_dir=join(output_dir,'pick_otus_exact')
    pick_otus_cmd = '%s %s/pick_otus.py -m prefix_suffix -i %s -o %s -p 5000' %\
        (python_exe_fp, script_dir, fasta_file, exact_match_otus_dir)
    
    commands.append([('Pick OTUs: Exact match', pick_otus_cmd)])
    
    # Pick Rep set from exact match pre-filtering
    exact_match_basename=splitext(split(fasta_file)[-1])[0]
    exact_otu_fp=join(exact_match_otus_dir,exact_match_basename+'_otus.txt')
    exact_match_fna = join(exact_match_otus_dir,exact_match_basename) + \
                                                                '_exact_rep.fna'
    otu_maps_to_merge.append(exact_otu_fp)
    
    pick_rep_set_exact_cmd = '%s %s/pick_rep_set.py -i %s -f %s -o %s ' %\
        (python_exe_fp, script_dir, exact_otu_fp, fasta_file, exact_match_fna)

    commands.append([('Pick Rep Set: Exact match', pick_rep_set_exact_cmd)])

    # Do exact-match database pre-filtering
    leftover_fasta = join(output_dir, 'leftover.fasta')
    db_otu_map = join(output_dir, 'otu_map.txt')
    web_app_scripts_dir = join(split(realpath(__file__))[0], 'scripts')
    find_db_otus_command = '%s %s/find_otus_in_database.py -i %s -f %s -m %s' %\
        (python_exe_fp, web_app_scripts_dir, exact_match_fna, leftover_fasta,\
         db_otu_map)
        
    commands.append([('Find Database OTU Hits', find_db_otus_command)])
    
    # Prep the UCLUST_REF OTU picking command
    otu_picking_method = params['pick_otus']['otu_picking_method'].upper()
    otu_picking_similarity = int(float(params['pick_otus']['similarity'])*100)
    pick_otu_dir = '%s/picked_otus_%s_%s' % (output_dir,otu_picking_method,\
                                             otu_picking_similarity)
    uclust_otu_fp = join(pick_otu_dir,\
                         splitext(split(leftover_fasta)[-1])[0]+'_otus.txt')
    uclust_failure_fp = join(pick_otu_dir,\
                        splitext(split(leftover_fasta)[-1])[0]+'_failures.txt')

    # Grab the OTU picker parameters
    try:
        # Want to find a cleaner strategy for this: the parallel script
        # is method-specific, so doesn't take a --otu_picking_method
        # option. This works for now though.
        d = params['pick_otus'].copy()
        del d['otu_picking_method']
        params_str = ' %s' % get_params_str(d)
    except KeyError:
        # guarantee params_str is defined even without a pick_otus section
        params_str = ''
        
    if parallel:
        # Grab the parallel-specific parameters
        
        # Grab the OTU picker parameters
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though. Use pop() so a missing
            # option doesn't abort the rebuild of params_str halfway.
            d = params['pick_otus'].copy()
            d.pop('otu_picking_method', None)
            d.pop('clustering_algorithm', None)
            d.pop('suppress_new_clusters', None)
            params_str = ' %s' % get_params_str(d)
        except KeyError:
            pass
        
        try:
            params_str += ' %s' % get_params_str(params['parallel'])
        except KeyError:
            params_str += ''
        
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/parallel_pick_otus_uclust_ref.py -i %s -T -o %s %s' %\
         (python_exe_fp, script_dir, leftover_fasta, pick_otu_dir, params_str)
        
    else:
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --otu_picking_method
            # option. This works for now though.
            d = params['pick_otus'].copy()
            params_str = ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the OTU picking command
        pick_otus_cmd = '%s %s/pick_otus.py -i %s -o %s %s' %\
         (python_exe_fp, script_dir, leftover_fasta, pick_otu_dir, params_str)
    
    commands.append([('Pick OTUs: uclust_ref', pick_otus_cmd)])
    
    # Must now merge the otu file produced from database matching and the file
    # produced by uclust_ref - they are of the same kind but need to be mashed
    # together
    combined_otu_file = join(output_dir, 'combined_otu_map.txt')
    otu_map_files = [db_otu_map, uclust_otu_fp]
    otu_maps_to_merge.append(combined_otu_file)
    combine_otu_maps_cmd = '%s %s/combine_otu_map_files.py -i %s -o %s' %\
          (python_exe_fp, web_app_scripts_dir, ','.join(otu_map_files), 
           combined_otu_file)
        
    commands.append([('Combine OTU maps', combine_otu_maps_cmd)])
    
    # Run merge_otu_maps.py on the newly combined file and the originally 
    # produced otu map
    merged_otus_fp = join(output_dir,'exact_uclust_ref_otus.txt')
    merge_otus_cmd = '%s %s/merge_otu_maps.py -i %s -o %s' %\
          (python_exe_fp, script_dir, ','.join(otu_maps_to_merge), 
           merged_otus_fp)
          
    commands.append([('Merge OTUs', merge_otus_cmd)])
    
    # Deal with failures produced in uclust_ref
    all_failures_fp = join(output_dir,'all_failures.txt')
    merge_otus_failures_cmd = '%s %s/merge_otu_maps.py -f %s -i %s -o %s' %\
          (python_exe_fp, script_dir, uclust_failure_fp, exact_otu_fp, 
           all_failures_fp)
          
    commands.append([('Merge OTUs - Failures', merge_otus_failures_cmd)])
    
    # Make OTU Table
    otu_biom_fp = join(output_dir,'exact_uclust_ref_otu_table.biom')
    make_otu_biom_cmd='%s %s/make_otu_table.py -i %s -o %s' %\
            (python_exe_fp, script_dir, merged_otus_fp, otu_biom_fp)

    commands.append([('Make Biom File', make_otu_biom_cmd)])
    
    # Convert to classic OTU table
    otu_table_fp = join(output_dir,'exact_uclust_ref_otu_table.txt')
    make_otu_table_cmd='%s %s/software/biom-format/scripts/convert_biom.py -i %s -o %s -b' %\
            (python_exe_fp, environ['HOME'], otu_biom_fp, otu_table_fp)

    commands.append([('Make OTU Table', make_otu_table_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger)
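# The chained picking above leans on merge_otu_maps.py to expand OTU maps
# through one another: member ids in a later map are OTU ids of an earlier
# map and must be traced back to the original sequence identifiers. A
# self-contained sketch of those semantics for tab-delimited maps (one OTU
# per line: otu_id<TAB>member1<TAB>member2...); an illustration of the file
# format, not the QIIME implementation:
def merge_otu_maps_sketch(otu_map_fps, merged_fp):
    expanded = None
    for fp in otu_map_fps:
        current = {}
        for line in open(fp, 'U'):
            if not line.strip():
                continue
            fields = line.strip().split('\t')
            otu_id, members = fields[0], fields[1:]
            if expanded is None:
                # first map: members are already raw sequence ids
                current[otu_id] = members
            else:
                # later maps: expand each member through the previous map
                seqs = []
                for member in members:
                    seqs.extend(expanded.get(member, [member]))
                current[otu_id] = seqs
        expanded = current
    merged_f = open(merged_fp, 'w')
    for otu_id, members in expanded.items():
        merged_f.write('%s\t%s\n' % (otu_id, '\t'.join(members)))
    merged_f.close()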
def assign_tax(repset_fasta_fp,
               output_dir,
               command_handler,
               params,
               qiime_config,
               parallel=False,
               logger=None,
               status_update_callback=print_to_stdout):

    input_dir, input_filename = split(repset_fasta_fp)
    input_basename, input_ext = splitext(input_filename)
    commands = []
    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    
    ## Prep the taxonomy assignment command
    try:
        assignment_method = params['assign_taxonomy']['assignment_method']
    except KeyError:
        assignment_method = 'rdp'
    assign_taxonomy_dir = '%s/%s_assigned_taxonomy' %\
     (output_dir,assignment_method)
    taxonomy_fp = '%s/%s_tax_assignments.txt' % \
     (assign_taxonomy_dir,input_basename)
    if parallel and (assignment_method == 'rdp' or assignment_method == 'blast'):
        # Grab the parallel-specific parameters
        try:
            params_str = get_params_str(params['parallel'])
        except KeyError:
            params_str = ''
        
        try:
            # Want to find a cleaner strategy for this: the parallel script
            # is method-specific, so doesn't take a --assignment_method
            # option. This works for now though.
            d = params['assign_taxonomy'].copy()
            if 'assignment_method' in d:
                del d['assignment_method']
            params_str += ' %s' % get_params_str(d)
        except KeyError:
            pass
            
        # Build the parallel taxonomy assignment command
        assign_taxonomy_cmd = \
         'parallel_assign_taxonomy_%s.py -i %s -o %s -T %s' %\
         (assignment_method, repset_fasta_fp,assign_taxonomy_dir, params_str)
    else:
        try:
            params_str = get_params_str(params['assign_taxonomy'])
        except KeyError:
            params_str = ''
        # Build the taxonomy assignment command
        assign_taxonomy_cmd = 'assign_taxonomy.py -o %s -i %s %s' %\
         (assign_taxonomy_dir,repset_fasta_fp, params_str)
    if exists(assign_taxonomy_dir):
        rmtree(assign_taxonomy_dir)
    commands.append([('Assign taxonomy',assign_taxonomy_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
    return taxonomy_fp
def run_process_illumina_through_split_lib(study_id,run_prefix,input_fp,
    mapping_fp, output_dir, 
    command_handler, params, qiime_config,
    write_to_all_fasta=False,
    status_update_callback=print_to_stdout):
    """ NOTE: Parts of this function are a directly copied from the
        run_qiime_data_preparation function from the workflow.py library file 
        in QIIME.
    
        The steps performed by this function are:
          1) De-multiplex sequences. (split_libraries_fastq.py)
    
    """

    # Prepare some variables for the later steps
    filenames=input_fp.split(',')
    commands = []
    create_dir(output_dir)
    python_exe_fp = qiime_config['python_exe_fp']
    script_dir = get_qiime_scripts_dir()
    logger = WorkflowLogger(generate_log_fp(output_dir),
                            params=params,
                            qiime_config=qiime_config)
    
    # copy the mapping file
    copied_mapping=split(mapping_fp)[-1]
    mapping_input_fp_copy=join(output_dir, copied_mapping)
    copy_mapping_cmd='cp %s %s' % (mapping_fp,mapping_input_fp_copy)
    commands.append([('CopyMapping', copy_mapping_cmd)])

    # sort the filenames
    filenames.sort()
    
    # determine which file is seq-file and which is barcode-file and associate
    # to mapping file
    input_str=get_split_libraries_fastq_params_and_file_types(filenames,
                                                              mapping_fp)
    
    # create split_libraries folder
    split_library_output=join(output_dir,'split_libraries')
    create_dir(split_library_output)
    
    # get params string
    try:
        params_str = get_params_str(params['split_libraries_fastq'])
    except KeyError:
        params_str = ''
    
    # Build the split libraries command
    split_libraries_cmd = '%s %s/split_libraries_fastq.py -o %s -m %s %s %s' % \
     (python_exe_fp, script_dir, split_library_output, mapping_input_fp_copy,
      input_str,params_str)
    
    commands.append([('SplitLibraries', split_libraries_cmd)])
    
    # define the generated files
    input_fp=join(split_library_output,'seqs.fna')
    
    # create per sample fastq files
    fastq_output=join(split_library_output,'per_sample_fastq')
    create_dir(fastq_output)
    
    """
    # not used for the one-off
    try:
        params_str = get_params_str(params['convert_fastaqual_fastq'])
    except KeyError:
        params_str = ''
    """
    
    # build the per-sample fastq command
    input_qual_fp=join(split_library_output,'seqs.qual')
    create_fastq_cmd = '%s %s/git/qiime_web_app/python_code/scripts/make_per_sample_fastq.py -i %s -q %s -o %s' % \
    (python_exe_fp, environ['HOME'], input_fp, input_qual_fp, fastq_output)
    
    """
    # TURN ON when convert_fastaqual_fastq can handle Illumina qual file
    create_fastq_cmd = '%s %s/convert_fastaqual_fastq.py -f %s -q %s -o %s %s'%\
     (python_exe_fp, script_dir, input_fp, input_qual_fp,
      fastq_output, params_str)
    """
    commands.append([('Create FASTQ', create_fastq_cmd)])
    
    # Call the command handler on the list of commands
    command_handler(commands,status_update_callback,logger=logger)

    # Return the fasta file paths
    return filenames