Example #1
def filter_fastq(workflow,
                 method,
                 fastq_file,
                 output_folder,
                 threads,
                 maxee,
                 trunc_len,
                 fastq_ascii,
                 qmax=45):
    """ Filter the fastq files using the maxee value
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis - usearch (default) or vsearch
        fastq_file (string): The path to the fastq file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        maxee (int): The maxee value to use for filtering.
        trunc_len (int): The value to use for max length.
        fastq_ascii (int): The ASCII offset for the fastq quality scores (usearch only).
        qmax (int): Max quality value, increased from the default of 43 to allow for Ion Torrent data.
    Requires:
        usearch or vsearch
        
    Returns:
        string: A path to the filtered fasta file
        string: A path to the full fasta file
        
    """

    # get the name of the final merged fastq file
    fasta_filtered_file = utilities.name_files(
        "all_samples_concatenated_filtered.fasta", output_folder)
    fasta_discarded_file = utilities.name_files(
        "all_samples_concatenated_discarded.fasta", output_folder)
    if method == "vsearch":
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]]",
            depends=[fastq_file, TrackedExecutable("vsearch")],
            targets=[fasta_filtered_file, fasta_discarded_file],
            args=[threads, maxee, trunc_len],
            name="vsearch_fastq_filter")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; "+\
            "usearch -fastq_filter [depends[0]] -fastq_maxee [args[1]] -fastaout [targets[0]] -threads [args[0]] -fastaout_discarded [targets[1]] -fastq_trunclen [args[2]] -fastq_qmax [args[3]] -fastq_ascii [args[4]]",
            depends=[fastq_file,TrackedExecutable("usearch")],
            targets=[fasta_filtered_file, fasta_discarded_file],
            args=[threads, maxee, trunc_len, qmax, fastq_ascii],
            name="usearch_fastq_filter")

    # create a fasta file of all reads (including the discarded reads)
    fasta_file = utilities.name_files("all_samples_concatenated.fasta",
                                      output_folder)
    workflow.add_task("cat [depends[0]] [depends[1]] > [targets[0]]",
                      depends=[fasta_filtered_file, fasta_discarded_file],
                      targets=fasta_file)

    return fasta_filtered_file, fasta_file
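
A minimal usage sketch for this task function, assuming anadama2 is installed and filter_fastq is importable from the surrounding module; all paths and parameter values below are hypothetical:

# Hypothetical driver script wiring filter_fastq into an AnADAMA2 workflow.
from anadama2 import Workflow

workflow = Workflow(version="0.1", description="16S filtering demo")
filtered_fasta, full_fasta = filter_fastq(
    workflow,
    method="vsearch",                  # any other value falls back to usearch
    fastq_file="demo/all_samples_concatenated.fastq",
    output_folder="demo/output",
    threads=4,
    maxee=1,
    trunc_len=200,
    fastq_ascii=33)
workflow.go()                          # tasks only execute when the workflow runs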
Example #2
def picrust(workflow, otu_table_biom, output_folder):
    """ Runs picrust normalize, then predict
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        otu_table_biom (string): The path to the biom file (closed reference otu table).
        output_folder (string): The path of the output folder.
        
    Requires:
        Picrust v1.1: Software to predict metagenome function.
        
    Returns:
        string: The path to the categorized function file in biom format.
        string: The path to the predicted metagenomes file in biom format.
    
    """

    # normalize the otu table
    normalized_otu_table = utilities.name_files(
        "all_samples_normalize_by_copy_number.biom", output_folder)
    # first remove target file as picrust will not overwrite
    # expects biom file is json (not hdf5) format
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "+\
        "normalize_by_copy_number.py -i [depends[0]] -o [targets[0]]",
        depends=[otu_table_biom,TrackedExecutable("normalize_by_copy_number.py")],
        targets=normalized_otu_table,
        name="normalize_by_copy_number.py")

    # predict metagenomes
    predict_metagenomes_table = utilities.name_files(
        "all_samples_predict_metagenomes.biom", output_folder)
    # first remove target file as picrust will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; "+\
        "predict_metagenomes.py -i [depends[0]] -o [targets[0]]",
        depends=[normalized_otu_table,TrackedExecutable("predict_metagenomes.py")],
        targets=predict_metagenomes_table,
        name="predict_metagenomes.py")

    # categorize by function
    categorized_function_table = utilities.name_files(
        "all_samples_categorize_by_function.biom", output_folder)
    # first remove target file as picrust will not overwrite
    workflow.add_task(
        "remove_if_exists.py [targets[0]] ; " + \
        "categorize_by_function.py -i [depends[0]] -o [targets[0]] --level 3 -c KEGG_Pathways",
        depends=[predict_metagenomes_table, TrackedExecutable("categorize_by_function.py")],
        targets=categorized_function_table,
        name="categorize_by_function.py")

    return categorized_function_table, predict_metagenomes_table
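
The three PICRUSt steps form a simple chain (normalize, then predict, then categorize by function), each task consuming the previous task's target. A hedged call sketch with a hypothetical closed-reference OTU table path:

# The OTU table must be a JSON-format biom file, as noted in the comments above.
categorized_biom, predicted_biom = picrust(
    workflow,
    otu_table_biom="demo/output/all_samples_closed_reference.biom",
    output_folder="demo/output/picrust")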
Example #3
def cluster_otus(workflow, method, fasta_file, reference_fasta, output_folder):
    """ Cluster the otus with usearch
    
    Args:
        workflow (anadama2.workflow): an instance of the workflow class.
        method (string): the tool for sequence analysis - usearch (default) or vsearch
        fasta_file (string): the path to the fasta file (filtered and dereplicated).
        reference_fasta (string): the path to reference fasta db
        output_folder (string): the path of the output folder.
        
    Requires:
        usearch or vsearch
        
    Returns:
        string: Path to the fasta file of clustered, non-chimeric OTU sequences

    """

    # get the name of the output files
    output_fasta = utilities.name_files("all_samples_otus_nonchimeras.fasta",
                                        output_folder)

    if method == "vsearch":
        output_txt = utilities.name_files("all_samples_vsearch_otus.txt",
                                          output_folder)
        all_otus = utilities.name_files("all_otus.fasta", output_folder)
        workflow.add_task(
            "vsearch --cluster_size [depends[0]] --consout [targets[0]] --id 0.97 --relabel 'OTU' --uc [targets[1]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=[all_otus, output_txt],
            name="vsearch_cluster_otus")

        workflow.add_task(
            "vsearch --uchime_ref [depends[0]] --nonchimeras [targets[0]] --strand plus --db [args[0]]",
            depends=[all_otus, TrackedExecutable("vsearch")],
            targets=[output_fasta],
            args=[reference_fasta],
            name="vsearch_nochim")
    else:
        output_txt = utilities.name_files("all_samples_uparse_otus.txt",
                                          output_folder)
        workflow.add_task(
            "usearch -cluster_otus [depends[0]] -otus [targets[0]] -relabel 'OTU' -uparseout [targets[1]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=[output_fasta, output_txt],
            name="usearch_cluster_otus")

    return output_fasta
Example #4
def merge_fastq(workflow, input_files, output_folder):
    """ Merge all of the fastq files into a single fastq file
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq files.
        output_folder (string): The path of the output folder.
        
    Requires:
        None
        
    Returns:
        string: A path to the merged file
        
    """

    # get the name of the final merged fastq file
    all_samples_fastq = utilities.name_files("all_samples_concatenated.fastq",
                                             output_folder)

    workflow.add_task("merge_fastq.py [args[0]] _renamed.fastq [targets[0]]",
                      depends=input_files,
                      targets=all_samples_fastq,
                      args=os.path.dirname(input_files[0]))

    return all_samples_fastq
Example #5
def convert_from_biom_to_tsv_list(workflow, input_files, output):
    """ Convert any biom files in the provided set to tsv format.

    Accepts either a list or a dict of input files and returns the same type,
    with each biom path replaced by the path to the converted tsv file.
    """

    if isinstance(input_files, dict):
        converted_files = {}
    else:
        converted_files = []

    for filename in input_files:
        if filename.endswith(".biom"):
            new_tsv = utilities.name_files(filename.replace(".biom", ".tsv"),
                                           output,
                                           subfolder="biom_to_tsv",
                                           create_folder=True)
            convert_from_biom_to_tsv(workflow, filename, new_tsv)
            if isinstance(input_files, dict):
                converted_files[new_tsv] = input_files[filename]
            else:
                converted_files.append(new_tsv)
        else:
            if isinstance(input_files, dict):
                converted_files[filename] = input_files[filename]
            else:
                converted_files.append(filename)

    return converted_files
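
Because the return value mirrors the input type, callers can pass either a plain list of paths or a dict mapping paths to labels; a small sketch of both call styles, with hypothetical paths and assuming convert_from_biom_to_tsv is defined elsewhere in the same module:

# List input: biom paths are swapped for the converted tsv paths (approximate naming).
tsv_list = convert_from_biom_to_tsv_list(
    workflow, ["a.biom", "b.tsv"], "demo/output")

# Dict input: keys are rewritten, values (e.g. sample labels) are carried over unchanged.
tsv_dict = convert_from_biom_to_tsv_list(
    workflow, {"a.biom": "sampleA", "b.tsv": "sampleB"}, "demo/output")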
Example #6
def sort_by_size(workflow, method, fasta_file, output_folder, min_size):
    """ Sort reads by size, removing those that are not of min size
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis - usearch (default) or vsearch
        fasta_file (string): The path to the fasta file (filtered and dereplicated).
        output_folder (string): The path of the output folder.
        min_size (int): Min size of the reads to filter.
        
    Requires:
        usearch or vsearch
        
    Returns:
        string: Path to the fasta file sorted by size
    """

    # get the name of the output files
    output_file = utilities.name_files("all_samples_sorted.fasta",
                                       output_folder)
    if method == "vsearch":
        workflow.add_task(
            "vsearch --sortbysize [depends[0]] --output [targets[0]] --minsize [args[0]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=output_file,
            args=min_size,
            name="vsearch_sortbysize")
    else:
        workflow.add_task(
            "usearch -sortbysize [depends[0]] -fastaout [targets[0]] -minsize [args[0]]",
            depends=[fasta_file, TrackedExecutable("usearch")],
            targets=output_file,
            args=min_size,
            name="usearch_sortbysize")

    return output_file
Example #7
def excel_to_csv(workflow, input_files, output_dir):
    """Converts an Excel file to a CSV file. Only attempts to convert the 
    first worksheet in the file and ignores the rest.

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 workflow object.
        input_files (list): A list containing all Excel files to be converted.
        output_dir (string): The output directory to write converted CSV files
            to.

    Requires:
        None

    Returns:
        list: A list of newly-converted CSV files.
    """
    output_files = bb_utils.name_files(map(os.path.basename, input_files),
                                       output_dir,
                                       extension='csv')

    def _convert_excel_csv(task):
        """Helper function passed to AnADAMA2 doing the lifting of converting 
        the supplied Excel file to a CSV file using the pandas python library.
        """
        excel_file = task.depends[0].name
        csv_out_file = task.targets[0].name

        excel_df = pd.read_excel(excel_file)
        excel_df.to_csv(csv_out_file)

    workflow.add_task_group(_convert_excel_csv,
                            depends=input_files,
                            targets=output_files)

    return output_files
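
This uses the AnADAMA2 pattern of passing a Python function (rather than a command string) as the task action; the helper receives a task object whose depends and targets attributes carry the resolved file names. A hedged usage sketch with hypothetical file names:

# Each Excel file becomes one conversion task in the group.
csv_files = excel_to_csv(
    workflow,
    input_files=["metadata/visits.xlsx", "metadata/samples.xlsx"],
    output_dir="metadata/csv")
# csv_files -> ["metadata/csv/visits.csv", "metadata/csv/samples.csv"] (approximate)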
Example #8
def build_otu_tables(workflow, reference_taxonomy, reference_fasta,
                     reference_mapping_results_uc, otu_mapping_results_uc,
                     otu_fasta, original_fasta, output_folder):
    """ Build the open/closed reference otu tables, denovo table, and corresponding fasta files

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        reference_taxonomy (string): The path to the reference taxonomy file.
        reference_fasta (string): The path to the reference fasta file.
        reference_mapping_results_uc (string): The path to the reference mapping uc results file.
        otu_mapping_results_uc (string): The path to the otu mapping uc results file.
        otu_fasta (string): The path to the fasta file of otu sequences.
        original_fasta (string): The path to the fasta file (not qc or truncated).
        output_folder (string): The path of the output folder.
        
    Requires:
        None
        
    Returns:
        list: The path to the closed reference otu files (tsv and fasta)

    """

    # name the output files
    open_ref_tsv = files.SixteenS.path("otu_table_open_reference",
                                       output_folder)
    open_ref_fasta = utilities.name_files("all_samples_open_reference.fasta",
                                          output_folder)
    closed_ref_tsv = files.SixteenS.path("otu_table_closed_reference",
                                         output_folder)
    closed_ref_fasta = utilities.name_files(
        "all_samples_closed_reference.fasta", output_folder)
    denovo_tsv = utilities.name_files("all_samples_denovo_otu_table.tsv",
                                      output_folder)
    read_counts = files.SixteenS.path("read_count_table", output_folder)


    workflow.add_task(
        "create_otu_tables_from_alignments.py [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]] [depends[5]] "+\
        "[targets[0]] [targets[1]] [targets[2]] [targets[3]] [targets[4]] [targets[5]]",
        depends=[reference_taxonomy, reference_fasta, reference_mapping_results_uc, otu_mapping_results_uc, otu_fasta, original_fasta],
        targets=[open_ref_tsv,open_ref_fasta,closed_ref_tsv,closed_ref_fasta,denovo_tsv,read_counts])

    return closed_ref_tsv, closed_ref_fasta
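
A hedged call sketch showing how the six inputs line up with the positional arguments of create_otu_tables_from_alignments.py; all paths are hypothetical and would normally come from the earlier clustering and alignment tasks:

closed_tsv, closed_fasta = build_otu_tables(
    workflow,
    reference_taxonomy="demo/refs/taxonomy.txt",
    reference_fasta="demo/refs/reference_otus.fasta",
    reference_mapping_results_uc="demo/output/reference_mapping.uc",
    otu_mapping_results_uc="demo/output/otu_mapping.uc",
    otu_fasta="demo/output/all_samples_otus_nonchimeras.fasta",
    original_fasta="demo/output/all_samples_concatenated.fasta",
    output_folder="demo/output")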
Example #9
def run_picrust2(task, threads, otus=False):
    """ Run picrust2, first changing sequence ids to avoid all numeric (as per picrust2 tutorial) """

    picrust2_input_dir = os.path.dirname(task.depends[0].name)
    picrust2_output_dir = os.path.dirname(task.targets[0].name)

    if otus:
        reformat_input_fasta = utilities.name_files(
            task.depends[0].name,
            picrust2_input_dir,
            tag="picrust_reformatted_input",
            create_folder=True)
        with open(task.depends[0].name) as file_handle:
            with open(reformat_input_fasta, "w") as file_handle_write:
                for line in file_handle:
                    if line.startswith(">"):
                        line = line.replace(">", ">seq")
                    file_handle_write.write(line)
    else:
        reformat_input_fasta = task.depends[0].name

    reformat_input_tsv = utilities.name_files(task.depends[1].name,
                                              picrust2_input_dir,
                                              tag="picrust_reformatted_input")
    with open(task.depends[1].name) as file_handle:
        with open(reformat_input_tsv, "w") as file_handle_write:
            header = file_handle.readline()
            header = "\t".join(header.split("\t")[:-1]) + "\n"
            file_handle_write.write(header)
            for line in file_handle:
                if otus:
                    line = "seq" + line
                line = "\t".join(line.split("\t")[:-1]) + "\n"
                file_handle_write.write(line)

    utilities.run_task(
        "remove_if_exists.py [args[0]] --is-folder ; picrust2_pipeline.py -s [args[1]] -i [args[2]] -o [args[0]] -p [args[3]]",
        depends=task.depends,
        targets=task.targets,
        args=[
            picrust2_output_dir, reformat_input_fasta, reformat_input_tsv,
            threads
        ])
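
run_picrust2 is written as a task action that receives the AnADAMA2 task object, so the extra threads/otus parameters must be bound before it is handed to the workflow. One way to do that, sketched with functools.partial and hypothetical depends/targets paths:

import functools

# depends[0] is the OTU/ASV fasta, depends[1] the count table tsv, and
# targets[0] is a file expected inside the picrust2 output folder.
workflow.add_task(
    functools.partial(run_picrust2, threads=8, otus=True),
    depends=["demo/output/all_samples_otus_nonchimeras.fasta",
             "demo/output/all_samples_otu_table.tsv"],
    targets=["demo/output/picrust2/pathways_out/path_abun_unstrat.tsv.gz"],
    name="picrust2")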
Example #10
def batch_convert_tsv_to_biom(workflow, tsv_files):
    """Batch converts tsv files to the biom format. BIOM files will be 
    deposited in the same folder as source TSV files and will carry the 
    same filenames.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        tsv_files (list): A list containing all TSV files to be converted 
            to BIOM format.
    
    Requires:
        Biom v2: A tool for general use formatting of biological data.

    Returns: 
        list: A list containing paths to all converted BIOM files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = Workflow()

        tsv_files = ['/tmp/foo.tsv', '/tmp/bar.tsv', '/tmp/baz.tsv']
        biom_files = common.batch_convert_tsv_to_biom(workflow, tsv_files)

        print(biom_files)
        ## ['/tmp/foo.biom', '/tmp/bar.biom', '/tmp/baz.biom']
    """
    biom_files = []

    tsv_fnames = bb_utils.sample_names(tsv_files, '.tsv')
    tsv_dir = os.path.dirname(tsv_files[0])

    biom_dir = os.path.join(tsv_dir, 'biom')
    bb_utils.create_folders(biom_dir)

    biom_files = [
        os.path.join(biom_dir, biom_fname) for biom_fname in
        bb_utils.name_files(tsv_fnames, biom_dir, extension='biom')
    ]

    for (tsv_file, biom_file) in zip(tsv_files, biom_files):
        convert_to_biom_from_tsv(workflow, tsv_file, biom_file)

    return biom_files
Example #11
def truncate(workflow, method, input_files, output_folder, threads, trunc_len,
             fastq_ascii):
    """ Truncate the fasta sequences by length
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis - usearch (default) or vsearch
        input_files (list): A list of paths to fastq files.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        trunc_len (int): The value to use for max length.
        fastq_ascii (int): The ASCII offset for the fastq quality scores (usearch only).
        
    Requires:
        usearch or vsearch
        
    Returns:
        list: Paths to the truncated files
        
    """

    # get the name of the output files
    output_files = utilities.name_files(input_files,
                                        output_folder,
                                        tag="truncated")
    if method == "vsearch":
        workflow.add_task_group(
            "vsearch --fastx_filter [depends[0]]  --fastq_trunclen [args[0]]  -fastaout [targets[0]]",
            depends=input_files,
            targets=output_files,
            args=trunc_len,
            name="vsearch_fastx_truncate")
    else:
        workflow.add_task_group(
            "usearch -fastx_truncate [depends[0]] -trunclen [args[0]] -fastaout [targets[0]] -fastq_ascii [args[1]]",
            depends=input_files,
            targets=output_files,
            args=[trunc_len, fastq_ascii],
            name="usearch_fastx_truncate")

    return output_files
Example #12
def dereplicate(workflow, method, fasta_file, output_folder, threads):
    """ Dereplicate reads
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis - usearch (default) or vsearch
        fasta_file (string): The path to the filtered fasta file.
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        
    Requires:
        usearch or vsearch
        
    Returns:
        string: Path to the dereplicated fasta file
    """

    # get the name of the output files
    output_file = utilities.name_files("all_samples_dereplicated.fasta",
                                       output_folder)

    if method == "vsearch":
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; " + \
            "vsearch --derep_fulllength [depends[0]] --output [targets[0]] --sizein --sizeout --threads [args[0]]",
            depends=[fasta_file, TrackedExecutable("vsearch")],
            targets=output_file,
            args=threads,
            name="vsearch_derep_fulllength")
    else:
        workflow.add_task(
            "export OMP_NUM_THREADS=[args[0]]; "+\
            "usearch -derep_fulllength [depends[0]] -fastaout [targets[0]] -sizeout -threads [args[0]]",
            depends=[fasta_file,TrackedExecutable("usearch")],
            targets=output_file,
            args=threads,
            name="usearch_derep_fulllength")

    return output_file
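
Together with sort_by_size (Example #6) and cluster_otus (Example #3), this gives the usual dereplicate, sort, cluster chain; a hedged sketch of that chaining, where truncated_fasta is assumed to come from an earlier quality-control step and all paths and thresholds are illustrative:

derep_fasta = dereplicate(workflow, "vsearch", truncated_fasta,
                          "demo/output", threads=4)
sorted_fasta = sort_by_size(workflow, "vsearch", derep_fasta,
                            "demo/output", min_size=2)
otu_fasta = cluster_otus(workflow, "vsearch", sorted_fasta,
                         "demo/refs/reference.fasta", "demo/output")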
Example #13
def generate_md5_checksums(workflow, files):
    """Generates MD5 checksums for the provided set of files. All checksums 
    are written to a file containing the same name as the input but with the 
    "md5" extension appended.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        files (list): A list of files to generate MD5 checksums for.

    Requires:
        None

    Returns:
        list: A list of the generated md5 checksum files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = Workflow()

        files = ['/tmp/foo.txt', '/tmp/bar.txt']

        md5sum_files = common.generate_md5_checksums(workflow, files)
    """
    output_dir = os.path.dirname(files[0])
    checksum_files = bb_utils.name_files(bb_utils.sample_names(files),
                                         output_dir,
                                         extension=".md5")

    workflow.add_task_gridable('md5sum [depends[0]] > [targets[0]]',
                               depends=files,
                               targets=checksum_files)

    return checksum_files
Example #14
def demultiplex_dual(workflow, output_folder, input_files, extension,
            barcode_files, dual_barcode_path, min_phred, pair_identifier):

    """Demultiplex the files (dual indexed paired)

        Args:
            workflow (anadama2.workflow): An instance of the workflow class.
            input_files (list): A list of paths to fastq(gz) files for input to ea-utils.
            extension (string): The extension for all files.
            output_folder (string): The path of the output folder.
            barcode_files (list): A list of barcode files.
            dual_barcode_path (string): The path to the dual barcode file.
            min_phred (int): The min phred quality score to use in the demultiplex command.
            pair_identifier (string): The string in the file basename to identify
                the first pair in the set.

        Requires:
            ea-utils fastq-multx: A tool to demultiplex fastq files.

        Returns:
            list: A list of the demultiplexed files
            string: output folder of demultiplexed files

        """

    # capture the demultiplex stats in log file, one for each set of input files
    demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log",create_folder=True)
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",
                                            version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    # get barcode files
    barcode1, barcode2 = utilities.paired_files(barcode_files, extension, pair_identifier)

    # get the second pair identifier
    pair_identifier2 = pair_identifier.replace("1", "2", 1)

    try:
        file_handle = open(dual_barcode_path)
        lines = file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read dual barcode file: " + dual_barcode_path)

    run_name = os.path.basename(input_pair1[0]).replace(pair_identifier, "").replace("." + extension, "")
    demultiplex_files = set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name = line.split("\t")[0]

            if sample_name:
                nm1 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier + "." + extension
                nm2 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier2 + "." + extension
                demultiplex_files.add(nm1)
                demultiplex_files.add(nm2)

    # get the names of the expected output files
    # demultiplex_files = utilities.name_files(samples, demultiplex_output_folder, extension=extension)

    workflow.add_task(
        "fastq-multx -B [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]]\
         -o n/a -o n/a -o [args[0]]/[args[5]]_%[args[3]].[args[1]] -o [args[0]]/[args[5]]_%[args[4]].[args[1]]\
         -q [args[2]] > [targets[0]]",
        depends=[dual_barcode_path, barcode1[0], barcode2[0], input_pair1[0], input_pair2[0]],
        args=[demultiplex_output_folder, extension, min_phred, pair_identifier, pair_identifier2, run_name, fastq_multx_tracked],
        targets=[demultiplex_log, TrackedDirectory(demultiplex_output_folder)],
        name="demultiplex_dual")

    demultiplex_files = demultiplex_check(workflow, demultiplex_log, demultiplex_files)


    return demultiplex_files, demultiplex_output_folder
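
The expected per-sample output names are derived from the run name plus the sample column of the dual barcode file. A small standalone sketch of that naming convention, with all values hypothetical:

import os

extension = "fastq.gz"
pair_identifier = "_R1"
pair_identifier2 = pair_identifier.replace("1", "2", 1)        # "_R2"
run_name = os.path.basename("run1_R1.fastq.gz").replace(
    pair_identifier, "").replace("." + extension, "")           # "run1"

for sample_name in ["sampleA", "sampleB"]:
    print(run_name + "_" + sample_name + pair_identifier + "." + extension)
    print(run_name + "_" + sample_name + pair_identifier2 + "." + extension)
# run1_sampleA_R1.fastq.gz, run1_sampleA_R2.fastq.gz, ...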
Example #15
        error_ratesR_path, args.threads, args.minoverlap, args.maxmismatch)

    # construct otu
    seqtab_file_path, read_counts_steps_path, seqs_fasta_path = dadatwo.const_seq_table(
        workflow, args.output, filtered_dir, mergers_file_path, args.threads)

    # centroid alignment
    centroid_fasta = files.SixteenS.path("msa_nonchimera", args.output)
    sixteen_s.centroid_alignment(workflow,
                                 seqs_fasta_path,
                                 centroid_fasta,
                                 args.threads,
                                 task_name="clustalo_nonchimera")

    # phylogenetic tree
    closed_tree = utilities.name_files("closed_reference.tre", args.output)
    sixteen_s.create_tree(workflow, centroid_fasta, closed_tree)

    # assign taxonomy
    closed_reference_tsv = dadatwo.assign_taxonomy(workflow, args.output,
                                                   seqtab_file_path,
                                                   args.dada_db, args.threads)

    # functional profiling
    # check for picrust1 as not an option with this workflow
    if args.picrust_version == "1":
        print(
            "WARNING: PICRUSt v1 is not compatible with ASV tables so will not be run for this workflow."
        )
    else:
        categorized_function = sixteen_s.functional_profile(
Example #16
def add_metadata_to_tsv(workflow, analysis_files, metadata_file, dtype,
                        id_col, col_replace=None, col_offset=-1, 
                        metadata_rows=None, target_cols=None, 
                        aux_files=None, na_rep=""):
    """Adds metadata to the top of a tab-delimited file. This function is
    meant to be called on analysis files to append relevant metadata to the 
    analysis output found in the file. An example can be seen below:

        
        sample  Sample1 Sample2 Sample3 Sample4 Sample5 Sample6 Sample7 Sample8
        Age 87  78  3   2   32  10  39  96
        Cohort  Healthy Healthy Healthy Healthy IBD IBD IBD IBD
        Favorite_color  Yellow  Blue    Green   Yellow  Green   Blue    Green 
        Height  60  72  63  67  71  65  61  64
        Sex 0   1   0   1   1   0   1   0
        Smoking 0   0   1   0   1   1   1   0
        Star_Trek_Fan   1   1   0   0   1   0   0   1
        Weight  151 258 195 172 202 210 139 140
        Bacteria    1   1   1   1   1   1   1   1
        Bacteria|Actinobacteria|Actinobacteria  0.0507585   0.252153    0.161725   

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 workflow object.
        analysis_files (list): Target TSVs to add metadata to.
        metadata_file (string): The path to the metadata file to pull from.
        dtype (string): Data type of the analysis files that metadata is
            being added to.
        id_col (string): The column name in the supplied metadata file to 
            attempt to subset on using ID's from the analysis file.
        col_replace (list): A list of string fragments that should be searched 
            for and replaced in either of the column headers of the analysis 
            or metadata files.
        col_offset (int): In certain situations a series of metadata columns
            will be present prior to columns containing analysis results.
            In these cases an offset needs to be provided for proper creation 
            of PCL files.
        metadata_rows (int): If our analysis file already contains some
            metadata rows at the top of the file (in effect already a PCL
            file) this parameter indicates how many rows of metadata exist.
        target_cols (list): A list of columns to filter the metadata file on.
        aux_files (list): Any additional metadata files to integrate into 
            analysis files. 
        na_rep (string): String representation for any empty cell in our 
            PCL file. Defaults to an empty string.

    Requires:
        None

    Returns: 
        list: A list containing the path to all modified files.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import metadata

        workflow = Workflow()
    
        target_cols = ['age', 'sex', 'smoking']
        col_replace = ['_taxonomic_profile', '_functional_profile']
        out_files = metadata.add_metadata_to_tsv(workflow,
                                                 ['/tmp/metaphlan2.out'],
                                                 '/tmp/metadata.tsv',
                                                 'metagenomics',
                                                 'External ID',
                                                 col_replace=col_replace,
                                                 target_cols=target_cols)

        print(out_files)
        ## ['/tmp/metaphlan2.out']
    """
    metadata_df = pd.read_csv(metadata_file, dtype='str',
                              parse_dates=['date_of_receipt'])
    
    def _workflow_add_metadata_to_tsv(task):
        analysis_file = task.depends[0].name
        pcl_out = task.targets[0].name

        analysis_df = pd.read_csv(analysis_file, dtype='str', header=None)
        pcl_metadata_df = None
        header = True
            
        # Going to make the assumption that the next row following our PCL 
        # metadata rows is the row containing the ID's that we will use to merge
        # the analysis file with our metadata file and we can use these same 
        # ID's to merge the PCL metadata rows into the larger metadata file.
        if metadata_rows:
            pcl_metadata_df = analysis_df[:metadata_rows+1]
            header = None

            offset_cols = range(0, col_offset+1)
            pcl_metadata_df.drop(pcl_metadata_df.columns[offset_cols[:-1]], 
                                    axis=1,
                                    inplace=True)

            pcl_metadata_df = pcl_metadata_df.T.reset_index(drop=True).T
            pcl_metadata_df.xs(metadata_rows)[0] = id_col
            
            pcl_metadata_df = pcl_metadata_df.T
            pcl_metadata_df = reset_column_headers(pcl_metadata_df)

            analysis_df.drop(analysis_df.index[range(0,metadata_rows)], inplace=True)
            analysis_df.rename(columns=analysis_df.iloc[0], inplace=True)
        else:
            analysis_df = hmp2_utils.misc.reset_column_headers(analysis_df)

        sample_ids = analysis_df.columns.tolist()[col_offset+1:]
            
        if len(sample_ids) == 1:
            raise ValueError('Could not parse sample ID\'s:', 
                             sample_ids)

        if col_replace:
            new_ids = sample_ids
            for replace_str in col_replace:
                new_ids = [sid.replace(replace_str, '') if not pd.isnull(sid) 
                           else sid for sid in new_ids]

            if new_ids != sample_ids:
                sample_ids_map = dict(zip(sample_ids, new_ids))
                sample_ids = new_ids
    
                analysis_df.rename(columns=sample_ids_map, inplace=True)

        subset_metadata_df = metadata_df[(metadata_df.data_type == dtype) &
                                         (metadata_df[id_col].isin(sample_ids))]

        if aux_files:
            for aux_file in aux_files:
                aux_metadata_df = pd.read_table(aux_file, dtype='str')
                aux_metadata_cols = aux_metadata_df.columns.tolist()
                join_id = aux_metadata_cols[0]

                ## We need to do this in two stages. If the columns already exist
                ## here we want to update them. If they do not exist we append
                ## them.
                subset_metadata_cols = subset_metadata_df.columns.tolist()
                new_cols = set(aux_metadata_cols[1:]) - set(subset_metadata_cols)
                existing_cols = set(aux_metadata_cols[1:]).intersection(subset_metadata_cols)

                if new_cols:
                    aux_metadata_new_df = aux_metadata_df.filter(items=aux_metadata_cols[:1] + 
                                                                 list(new_cols))
                    subset_metadata_df = pd.merge(subset_metadata_df, aux_metadata_new_df, 
                                                  how='left', on=join_id)

                if existing_cols:
                    aux_metadata_existing_df = aux_metadata_df.filter(items=aux_metadata_cols[:1] + 
                                                                      list(existing_cols))
                    subset_metadata_df.set_index(join_id, inplace=True)
                    aux_metadata_existing_df.set_index(join_id, inplace=True)

                    subset_metadata_df.update(aux_metadata_existing_df)
                    subset_metadata_df.reset_index(inplace=True)

        if pcl_metadata_df is not None and not pcl_metadata_df.empty:
            subset_metadata_df = pd.merge(subset_metadata_df, pcl_metadata_df,
                                          how='left', on=id_col)

        if target_cols:
            target_cols.insert(0, id_col)
            subset_metadata_df = subset_metadata_df.filter(target_cols)

        subset_metadata_df = subset_metadata_df.T
        subset_metadata_df = reset_column_headers(subset_metadata_df)
        subset_metadata_df = subset_metadata_df.reset_index()
        subset_metadata_df.fillna('NA', inplace=True)

        _col_offset = col_offset-1 if col_offset != -1 else col_offset
        col_name = analysis_df.columns[_col_offset+1]

        col_name = '' if col_name == "index" else col_name
        subset_metadata_df.rename(columns={'index': col_name}, inplace=True)

        analysis_df.index = analysis_df.index + len(subset_metadata_df.index)

        analysis_metadata_df = pd.concat([subset_metadata_df,
                                          analysis_df], axis=0)
        analysis_metadata_df = analysis_metadata_df[analysis_df.columns]
        analysis_metadata_df.to_csv(pcl_out, 
                                    index=False, 
                                    header=header, 
                                    sep='\t',
                                    na_rep=na_rep)

    output_folder = os.path.dirname(analysis_files[0])
    pcl_files = bb_utils.name_files(analysis_files, 
                                    output_folder, 
                                    extension="pcl.tsv")

    # Because of how YAML inherits lists we'll need to see if we can't 
    # flatten this list out. 
    target_cols = funcy.flatten(target_cols)

    workflow.add_task_group(_workflow_add_metadata_to_tsv,
                            depends=analysis_files,
                            targets=pcl_files,
                            time="1*60 if file_size('[depends[0]]') < 1 else 2*60",
                            mem="4*1024 if file_size('[depends[0]]') < 1 else 3*12*1024",
                            cores=1,
                            name="Generate analysis PCL output file")

    return pcl_files
Example #17
def generate_sample_metadata(workflow, data_type, in_files, metadata_file, 
                             output_dir, id_column = 'External ID'):
    """Generates a series of individual metadata files in CSV format 
    from the provided merged metadata file. Each of the provided samples
    has a metadata file generated to accompany any product files generated 
    by the analysis pipelines.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        data_type (string): The data type of the provided samples. One of
            either 'metagenomics', 'proteomics', 'amplicon'.
        in_files (list): A list of files that should have corresponding 
            metadata files written.
        metadata_file (string): Path to the merged metadata file.
        id_column (string): The ID column to attempt to map sample names to 
            in the merged metadata file. Default set to "External ID" but 
            can change depending on the data type.
        output_dir (string): Path to output directory to write each
            sample metadata file to.

    Requires:
        None

    Returns:
        list: A list containing the path to all sample metadata files created.

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import metadata

        workflow = Workflow()
        
        samples = ['sampleA', 'sampleB']
        metadata_file = '/tmp/merged_metadata.csv'
        output_dir = '/tmp/metadata'

        metadata_files = metadata.generate_sample_metadata(workflow, 
                                                           'metagenomics',
                                                           samples, 
                                                           metadata_file, 
                                                           output_dir)
        print(metadata_files)
        ## ['/tmp/metadata/sampleA.csv', '/tmp/metadata/sampleB.csv']
    """
    metadata_df = pd.read_csv(metadata_file)
    samples = bb_utils.sample_names(in_files)

    output_metadata_files = bb_utils.name_files(samples, 
                                                output_dir, 
                                                extension = 'csv',
                                                subfolder = 'metadata',
                                                create_folder = True)
    sample_metadata_dict = dict(zip(samples, output_metadata_files))

    def _workflow_gen_metadata(task):
        metadata_subset = metadata_df.loc[(metadata_df[id_column].isin(samples)) &
                                          (metadata_df['data_type'] == data_type)]
    
        if metadata_subset.empty:
            raise ValueError('Could not find metadata associated with samples.',
                             ",".join(samples))

        for (sample_id, row) in metadata_subset.iterrows():
            sample_metadata_file = sample_metadata_dict.get(row[id_column])
            metadata_subset.xs(sample_id).to_csv(sample_metadata_file, index=False)
    
    workflow.add_task(_workflow_gen_metadata,
                      targets=output_metadata_files,  
                      depends=in_files + [metadata_file],
                      name='Generate sample metadata')

    return list(sample_metadata_dict.values())
Example #18
                                   exit_if_not_found=True)

### STEP #1: Run quality control on all input files ###
sample_names = utilities.sample_names(input_files, args.input_extension)
input_pair1, input_pair2 = utilities.paired_files(input_files,
                                                  args.input_extension,
                                                  args.pair_identifier)
paired = False
if input_pair1:
    sample_names = utilities.sample_names(input_pair1, args.input_extension,
                                          args.pair_identifier)
    qc_targets = [
        utilities.name_files([
            name + ".trimmed.1.fastq", name + ".trimmed.2.fastq",
            name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq",
            name + ".trimmed.single.12.fastq"
        ],
                             args.output,
                             subfolder="kneaddata",
                             create_folder=True) for name in sample_names
    ]
    paired = True
    for target_set, input_R1, input_R2, name in zip(qc_targets, input_pair1,
                                                    input_pair2, sample_names):
        workflow.add_task(
            "kneaddata --run-fastqc-start --input [depends[0]] --input [depends[1]] --output [args[0]] --threads [args[1]] --output-prefix [args[2]] && cat [args[3]] [args[4]] > [targets[2]]",
            depends=[input_R1, input_R2,
                     TrackedExecutable("kneaddata")],
            targets=[target_set[0], target_set[1], target_set[4]],
            args=[
                os.path.dirname(target_set[0]), args.threads, name,
                target_set[2], target_set[3]
Example #19
def main(workflow):
    args = workflow.parse_args()

    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow, 
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow, 
                                            deposited_files_mtx, 
                                            project_dirs_mtx[1],
                                            paired_end=True,
                                            compress=False,
                                            threads=args.threads)
            pair_identifier_mtx = "_R1"                                            
        else:
            paired_end_seqs = deposited_files_mtx

        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db,
                                                                           rrna_db,
                                                                           mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files but we also
        # have two other scenarios:
        #
        #       1.) No accompanying metagenomic sequences exist; in this
        #           case we will proceed just using the metatranscriptomic
        #           data.
        #       2.) Taxonomic profiles are passed directly in our MANIFEST
        #           file; here we remove these from our input files and
        #           prevent them from running through the kneaddata ->
        #           metaphlan2 portions of our pipeline
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx, file_extension_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs = bam_to_fastq(workflow, 
                                                    deposited_files_mgx, 
                                                    project_dirs_mgx[1],
                                                    paired_end=True,
                                                    compress=False,
                                                    threads=args.threads)
                    pair_identifier_mgx = "_R1"                                            
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx, pair_identifier_mgx)  

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db,
                                                                        rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')

                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                map(create_folders, [pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                    pub_wgs_func_profile_dir])

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='genes',
                                                tag='genefamilies_relab',
                                                extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='pathways',
                                                tag='pathabundance_relab',
                                                extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] 
                                                + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))
                                      
                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir, 
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                            [gene_file, ecs_file, path_file],
                                            tar_path,
                                            depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can create a set of matching cleaned
        # MTX files to corresponding MGX taxonomic profiles. If these exist
        # we want to run functional profiling with the corresponding MGX
        # taxonomic profile otherwise we will run a taxonomic profiling
        # on the MTX sequences and run functional profiling with the produced
        # taxonomic profile.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col', 'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id', 'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run them through
            # the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            func_outs_mtx = list(func_outs_mtx) + func_outs_match_mtx
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed 
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        map(create_folders, [pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                             pub_mtx_func_profile_dir])

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)
    
        workflow.go()
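Combining the matched and unmatched MTX functional outputs relies on list concatenation rather than list.extend(), since extend() mutates its list in place and returns None. A minimal standalone sketch of the pattern, with placeholder file names:

# list.extend() returns None, so assigning its result would discard the outputs;
# concatenation keeps both result sets.
func_outs_mtx = ["sampleA_genefamilies.tsv"]        # hypothetical unmatched-sample outputs
func_outs_match_mtx = ["sampleB_genefamilies.tsv"]  # hypothetical matched-sample outputs

wrong = list(func_outs_mtx).extend(func_outs_match_mtx)  # -> None
right = list(func_outs_mtx) + func_outs_match_mtx        # -> combined list

assert wrong is None
assert right == ["sampleA_genefamilies.tsv", "sampleB_genefamilies.tsv"]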
Ejemplo n.º 20
0
def stage_files(workflow,
                input_files,
                target_dir,
                delete=False,
                preserve=False,
                symlink=False):
    """Moves data files from the supplied origin directory to the supplied
    destination directory. In order to include a file verification check in
    the staging process rsync is used by default to copy files.

    If the symlink parameter is set to True this function will instead create
    symlinks from the origin directory to the target directory.

    An optional parameter may be provided to only stage files with the 
    corresponding extension.

    Args:
        workflow (anadama2.Workflow): The workflow object.
        input_files (list): A collection of input files to be staged.
        target_dir (string): Path to the destination directory where files
            should be staged.
        delete (boolean): Present in the signature but not used by this
            implementation.
        preserve (boolean): If set to True preserve the source subdirectory
            structure on the target side.
        symlink (boolean): If set to True create symlinks from the origin
            files to the destination directory instead of copying them
            with rsync (default: False).

    Requires:
        rsync v3.0.6+: A versatile file copying tool.

    Returns:
        list: A list of all files that were successfully staged

    Example:
        from anadama2 import Workflow
        from hmp2_workflows.tasks import common

        workflow = Workflow()

        staged_files = common.stage_files(workflow, 
                                          ['/tmp/fooA.sam', '/tmp/fooB.sam'],
                                          '/tmp/out_dir')

        workflow.go()
    """
    if not os.path.exists(target_dir):
        raise OSError(2, 'Target directory does not exist', target_dir)

    ## TODO: We need to preserve the file directory structure here because
    ## it tells when the files were received and is used by the website.
    target_files = bb_utils.name_files(input_files, target_dir)

    ## TODO: Figure out a better way to handle this rather than creating
    ## N rsync calls.
    stage_cmd = "remove_if_exists.py [targets[0]] ; rsync -avz [depends[0]] [targets[0]]"

    if preserve:
        stage_cmd = stage_cmd.replace(
            '-avz', '--rsync-path=\"mkdir -p `dirname '
            '[depends[0]]`\" -avz')
    if symlink:
        stage_cmd = "remove_if_exists.py [targets[0]] ; ln -s [depends[0]] [targets[0]]"

    workflow.add_task_group(stage_cmd,
                            depends=input_files,
                            targets=target_files)

    return target_files
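A slightly fuller sketch of how the symlink and preserve options might be combined in practice; the directory and file names below are invented for illustration, and the target directories are assumed to already exist since stage_files raises OSError otherwise:

from anadama2 import Workflow
from hmp2_workflows.tasks import common

workflow = Workflow()

# Symlink raw deliveries into the deposition area instead of copying them.
linked_files = common.stage_files(workflow,
                                  ['/seq/delivery/sampleA.fastq',
                                   '/seq/delivery/sampleB.fastq'],
                                  '/data/deposition',
                                  symlink=True)

# Copy processed outputs with rsync, preserving the source subdirectory layout.
copied_files = common.stage_files(workflow,
                                  ['/data/processing/2017-04-01/sampleA.tsv'],
                                  '/data/public',
                                  preserve=True)

workflow.go()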
Ejemplo n.º 21
0
def deinterleave_fastq(workflow,
                       input_files,
                       output_dir,
                       threads=1,
                       compress=True):
    """Deinterleaves a FASTQ file producing paired-end FASTQ reads.

    Args:
        workflow (anadama2.Workflow): The AnADAMA2 Workflow object.
        input_files (list): A list of FASTQ files to deinterleave.
        output_dir (string): The output directory to write paired-end reads
            to.
        compress (bool): Compress FASTQ files generated.
        threads (int): The number of threads/cores to be used if compressing
            paired-end reads.

    Requires:
        None

    Returns:
        list: A list of paired-end files.

    Example:
        from anadama2 import Workflow

        from hmp2_workflows.tasks.file_conv import deinterleave_fastq

        workflow = Workflow()
        paired_end_fastqs = deinterleave_fastq(workflow,
                                               ['foo.fastq', 'bar.fastq'],
                                               '/tmp/out_dir',
                                               threads=4)

        print paired_end_fastqs
        # [[foo_R1.fastq.gz, foo_R2.fastq.gz], [bar_R1.fastq.gz, bar_R2.fastq.gz]]
    """
    paired_end_reads = []
    deinterleave_cmd = "deinterleave_fastq.sh < [depends[0]] [targets[0]] [targets[1]]"

    out_ext = "fastq"
    if compress:
        deinterleave_cmd += " " + "compress"
        out_ext = "fastq.gz"

    deinterleave_cmd += " " + str(threads)

    mate_1_files = bb_utils.name_files(map(os.path.basename, input_files),
                                       output_dir,
                                       tag="R1",
                                       extension=out_ext)
    mate_2_files = bb_utils.name_files(map(os.path.basename, input_files),
                                       output_dir,
                                       tag="R2",
                                       extension=out_ext)

    if "gz" in out_ext:
        mate_1_files = [
            fname.replace('.fastq_R1', '_R1.fastq') for fname in mate_1_files
        ]
        mate_2_files = [
            fname.replace('.fastq_R2', '_R2.fastq') for fname in mate_2_files
        ]
    output_files = zip(mate_1_files, mate_2_files)

    workflow.add_task_group_gridable(deinterleave_cmd,
                                     depends=input_files,
                                     targets=output_files,
                                     time=5 * 60,
                                     mem=4096,
                                     cores=threads)

    return output_files
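The deinterleave_fastq.sh helper itself is not shown in this example; as a rough illustration of the transformation it performs, an interleaved FASTQ alternates mate-1 and mate-2 records (four lines per record), and deinterleaving writes them to separate R1/R2 files. A minimal pure-Python sketch of that idea, not the actual script:

import itertools

def deinterleave(interleaved_path, r1_path, r2_path):
    # Split an interleaved FASTQ (alternating R1/R2 records) into two files.
    with open(interleaved_path) as fin, \
         open(r1_path, 'w') as r1, \
         open(r2_path, 'w') as r2:
        while True:
            record_1 = list(itertools.islice(fin, 4))  # four lines per FASTQ record
            record_2 = list(itertools.islice(fin, 4))
            if len(record_1) < 4 or len(record_2) < 4:
                break
            r1.writelines(record_1)
            r2.writelines(record_2)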
Ejemplo n.º 22
0
def functional_profile(workflow, closed_reference_tsv, closed_reference_fasta,
                       picrust_version, threads, output_folder, otus):
    """ Run picrust for functional profiling
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        closed_reference_tsv (string): The path to the closed reference tsv file.
        closed_reference_fasta (string): The path to the closed reference fasta file.
        picrust_version (str): The version of picrust to use.
        threads (int): The number of threads/cores for each task.
        output_folder (string): The path of the output folder.
        otus (bool): Are the inputs from OTUs (so all numerical ids).
        
    Requires:
        Picrust v1.1 or v2: Software to predict metagenome function.
        Biom v2: A tool for general use formatting of biological data.
        
    Returns:
        string: The path to the functional data file (tsv for picrust v1, tre for picrust v2).

    """

    if picrust_version == "1":
        # convert the tsv file to biom format
        closed_reference_biom_file = utilities.name_files(closed_reference_tsv,
                                                          output_folder,
                                                          extension="biom")
        convert_to_biom_from_tsv(
            workflow,
            closed_reference_tsv,
            closed_reference_biom_file,
            options=
            "--process-obs-metadata=taxonomy --output-metadata-id=taxonomy")

        # run picrust to get functional data
        functional_data_categorized_biom, functional_data_predicted_biom = picrust(
            workflow, closed_reference_biom_file, output_folder)

        # convert the predicted biom file to tsv
        functional_data_predicted_tsv = utilities.name_files(
            functional_data_predicted_biom, output_folder, extension="tsv")
        convert_from_biom_to_tsv(workflow, functional_data_predicted_biom,
                                 functional_data_predicted_tsv)

        # convert the categorized biom file to tsv
        functional_data_categorized_tsv = utilities.name_files(
            functional_data_categorized_biom, output_folder, extension="tsv")
        convert_from_biom_to_tsv(workflow, functional_data_categorized_biom,
                                 functional_data_categorized_tsv)

        return functional_data_categorized_tsv

    else:
        # run the v2 pipeline
        functional_data_predicted_tre = utilities.name_files(
            "out.tre", output_folder, subfolder="picrust2")
        workflow.add_task(
            utilities.partial_function(run_picrust2,
                                       threads=threads,
                                       otus=otus),
            depends=[closed_reference_fasta, closed_reference_tsv],
            targets=functional_data_predicted_tre)
        return functional_data_predicted_tre
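A hedged usage sketch showing how the same call branches on the two PICRUSt versions; the paths are placeholders and the workflow object is assumed to come from the surrounding pipeline:

# Placeholder inputs for illustration only.
closed_tsv = '/data/16s/all_samples_taxonomy_closed_reference.tsv'
closed_fasta = '/data/16s/all_samples_closed_reference.fasta'

# PICRUSt v1: converts the closed reference table to biom, runs the
# normalize/predict/categorize steps and returns the categorized tsv.
categorized_tsv = functional_profile(workflow, closed_tsv, closed_fasta,
                                     picrust_version="1", threads=4,
                                     output_folder='/data/16s', otus=True)

# PICRUSt v2: runs run_picrust2 on the fasta/tsv pair and returns the
# predicted tree ('out.tre') under the picrust2 subfolder.
predicted_tre = functional_profile(workflow, closed_tsv, closed_fasta,
                                   picrust_version="2", threads=4,
                                   output_folder='/data/16s', otus=True)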
Ejemplo n.º 23
0
def merge_pairs_and_rename(workflow, method, input_files, extension,
                           output_folder, pair_identifier, threads,
                           fastq_ascii):
    """ Merge the files if pairs and rename sequence ids to match sample id
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis, usearch (default) or vsearch.
        input_files (list): A list of paths to fastq files.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        threads (int): The number of threads for each task.
        fastq_ascii (int): The fastq quality score ASCII base (e.g. 33 or 64),
            passed through to usearch.
        
    Requires:
        usearch or vsearch
        
    Returns:
        list: A list of the renamed files.
        
    """

    pair1, pair2 = utilities.paired_files(input_files, extension,
                                          pair_identifier)

    if pair1 and pair2:
        # paired input files were found

        # if the files are gzipped, first decompress as fastq_mergepairs will take in fastq.gz but the output will not be correctly formatted
        if pair1[0].endswith(".gz"):
            # get the names of the decompressed output files
            decompressed_pair1 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair1],
                output_folder,
                subfolder="merged_renamed")
            # get the names of the decompressed output files
            decompressed_pair2 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair2],
                output_folder,
                subfolder="merged_renamed")

            # add tasks to decompress the files
            workflow.add_task_group("gunzip -c [depends[0]] > [targets[0]]",
                                    depends=pair1 + pair2,
                                    targets=decompressed_pair1 +
                                    decompressed_pair2)

            # the pair files to be used for the remaining tasks are those that are decompressed
            pair1 = decompressed_pair1
            pair2 = decompressed_pair2

        # get the sample names from the input file names
        sample_names = [
            os.path.basename(file).replace(pair_identifier + ".fastq", "")
            for file in pair1
        ]

        # get the names of the output files
        stitched_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="stitched",
                                              extension="fastq",
                                              create_folder=True)
        unjoined_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="unjoined",
                                              extension="fastq")

        # run usearch to merge pairs, if input files are non-empty
        for read1, read2, stitched_output, unjoined_output in zip(
                pair1, pair2, stitched_files, unjoined_files):
            if method == 'vsearch':
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="vsearch",
                                               threads=threads),
                    depends=[read1, read2,
                             TrackedExecutable("vsearch")],
                    targets=[stitched_output, unjoined_output],
                    name="vsearch_fastq_mergepairs")
            else:
                workflow.add_task(
                    utilities.partial_function(merge_pairs,
                                               method="userach",
                                               threads=threads,
                                               fastq_ascii=fastq_ascii),
                    depends=[read1, read2,
                             TrackedExecutable("usearch")],
                    targets=[stitched_output, unjoined_output],
                    name="usearch_fastq_mergepairs")

        # merge the stitched and unjoined from the prior step
        renamed_files = utilities.name_files(sample_names,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq")
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] [depends[1]] _stitched [targets[0]]",
            depends=zip(stitched_files, unjoined_files),
            targets=renamed_files)

    else:
        # these files are not pairs and do not need to be merged
        # rename the files
        renamed_files = utilities.name_files(input_files,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq",
                                             create_folder=True)
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] '' '' [targets[0]]",
            depends=input_files,
            targets=renamed_files)

    return renamed_files
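A usage sketch with invented paths, assuming gzipped paired-end reads named with an _R1/_R2 pair identifier and a workflow object from the surrounding pipeline:

raw_reads = ['/data/raw/sample1_R1.fastq.gz', '/data/raw/sample1_R2.fastq.gz',
             '/data/raw/sample2_R1.fastq.gz', '/data/raw/sample2_R2.fastq.gz']

renamed_fastqs = merge_pairs_and_rename(workflow,
                                        method="vsearch",
                                        input_files=raw_reads,
                                        extension=".fastq.gz",
                                        output_folder='/data/16s',
                                        pair_identifier="_R1",
                                        threads=4,
                                        fastq_ascii=33)
# renamed_fastqs holds one renamed fastq per sample under the
# merged_renamed subfolder of the output folder.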
Ejemplo n.º 24
0
def taxonomic_profile(workflow,
                      method,
                      filtered_fasta_file,
                      truncated_fasta_file,
                      original_fasta_file,
                      output_folder,
                      threads,
                      percent_identity,
                      reference_usearch,
                      reference_fasta,
                      reference_taxonomy,
                      min_size,
                      bypass_msa=False):
    """ Pick otus, cluster centroids, otu mapping, reference mapping to create open/closed reference taxonomy files
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): The tool for sequence analysis - usearch (default) or vsearch.
        filtered_fasta_file (string): The path to the fasta file (filtered and dereplicated).
        truncated_fasta_file (string): The path to the fasta file (truncated not qced).
        original_fasta_file (string): The path to the fasta file (not qc or truncated).
        output_folder (string): The path of the output folder.
        threads (int): The number of threads for each task.
        percent_identity (float): The percent identity to use for alignments.
        reference_usearch (string): The path to the reference usearch formatted database.
        reference_fasta (string): The path to the reference fasta file.
        reference_taxonomy (string): The path to the reference taxonomy file.
        min_size (int): Min size of the reads to filter.
        bypass_msa (bool): Bypass msa clustering and tree generation.        

    Requires:
        usearch (as of v9, includes built-in de novo chimera filtering) or vsearch
        clustal omega: multiple sequence alignment for proteins 
        
    Returns:
        string: The path to the closed reference taxonomy tsv file.
        string: The path to the closed reference fasta file.

    """

    # first pick otus
    otu_fasta = pick_otus(workflow, method, filtered_fasta_file,
                          reference_fasta, output_folder, threads, min_size)

    # centroid OTU sequence alignment
    # get the name of the output files
    if not bypass_msa:
        centroid_fasta = files.SixteenS.path("msa_nonchimera", output_folder)
        centroid_alignment(workflow,
                           otu_fasta,
                           centroid_fasta,
                           threads,
                           task_name="clustalo_nonchimera")

    # align the reads to the otus
    otu_alignment_uc = utilities.name_files(
        "all_samples_otu_mapping_results.uc", output_folder)
    otu_alignment_tsv = utilities.name_files(
        "all_samples_otu_mapping_results.tsv", output_folder)
    global_alignment(workflow, method, truncated_fasta_file, otu_fasta,
                     percent_identity, threads, otu_alignment_uc,
                     otu_alignment_tsv)

    # align the otus to the reference database
    reference_alignment_uc = utilities.name_files(
        "all_samples_green_genes_mapping_results.uc", output_folder)
    reference_alignment_tsv = utilities.name_files(
        "all_samples_green_genes_mapping_results.tsv", output_folder)
    global_alignment(workflow,
                     method,
                     otu_fasta,
                     reference_usearch,
                     percent_identity,
                     threads,
                     reference_alignment_uc,
                     reference_alignment_tsv,
                     top_hit_only=True)

    # create the open/closed reference tables
    closed_reference_tsv, closed_ref_fasta = build_otu_tables(
        workflow, reference_taxonomy, reference_fasta, reference_alignment_uc,
        otu_alignment_uc, otu_fasta, original_fasta_file, output_folder)

    # cluster the closed reference otu fasta sequences and generate a tree
    if not bypass_msa:
        centroid_closed_fasta = files.SixteenS.path("msa_closed_reference",
                                                    output_folder)
        closed_tree = utilities.name_files("closed_reference.tre",
                                           output_folder)
        centroid_alignment(workflow,
                           closed_ref_fasta,
                           centroid_closed_fasta,
                           threads,
                           task_name="clustalo_closed_reference")
        create_tree(workflow, centroid_closed_fasta, closed_tree)

    return closed_reference_tsv, closed_ref_fasta
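A usage sketch with placeholder reference paths (for example a Green Genes formatted database); the fasta inputs are assumed to come from the earlier filter, truncation and dereplication steps:

closed_tsv, closed_fasta = taxonomic_profile(
    workflow,
    method="vsearch",
    filtered_fasta_file=filtered_fasta,    # quality filtered and dereplicated reads
    truncated_fasta_file=truncated_fasta,  # truncated but not quality filtered reads
    original_fasta_file=original_fasta,    # all reads, neither qced nor truncated
    output_folder='/data/16s',
    threads=8,
    percent_identity=0.97,
    reference_usearch='/refs/gg_13_8.udb',
    reference_fasta='/refs/gg_13_8.fasta',
    reference_taxonomy='/refs/gg_13_8_taxonomy.txt',
    min_size=2,
    bypass_msa=True)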
Ejemplo n.º 25
0
def demultiplex(workflow, input_files, extension, output_folder, barcode_file, index_files, min_phred, pair_identifier):
    """Demultiplex the files (single end or paired)
    
    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq files for input to ea-utils.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        barcode_file (string): A file of barcodes.
        index_files (list): A list of paths to the index files.
        min_phred (int): The min phred quality score to use in the demultiplex command.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        
    Requires:
        ea-utils fastq-multx: A tool to demultiplex fastq files.
        
    Returns:
        list: A list of the demultiplexed files
        string: output folder of demultiplexed files
        
    """
    
    # error if there is more than one index file
    if len(index_files) > 1:
        sys.exit("ERROR: Only one index file expected for demultiplexing step.")
    
    # read the barcode file to get the expected output files 
    try:
        file_handle=open(barcode_file)
        lines=file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read barcode file: " + barcode_file)
        
    samples=set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name=line.rstrip().split("\t")[0]
            if sample_name:
                samples.add(sample_name)
            
    # get the names of the expected output files
    demultiplex_fastq_files = utilities.name_files(samples,output_folder,subfolder="demultiplex",extension="fastq")
    
    # name the barcode file with the reverse complement barcodes added
    expanded_barcode_file = utilities.name_files("expanded_barcode_file.txt",output_folder,subfolder="demultiplex",create_folder=True)
    
    # create a file that includes the reverse complements of the barcodes
    workflow.add_task(
        "reverse_compliment_barcodes.py --input [depends[0]] --output [targets[0]]",
        depends=barcode_file,
        targets=expanded_barcode_file)
    
    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)
    
    # capture the demultiplex stats in output files, one for each set of input files
    if input_pair1:
        demultiplex_log = utilities.name_files(input_pair1[0],output_folder,subfolder="demultiplex",extension="log")
    else:
        demultiplex_log = utilities.name_files(input_files[0],output_folder,subfolder="demultiplex",extension="log")
        
    # get the output folder for all files
    demultiplex_output_folder = os.path.dirname(demultiplex_log)
    
    # get the basenames of the output files, one for each sample
    demultiplex_output_basenames = utilities.name_files(samples,output_folder,subfolder="demultiplex")
    
    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")
    
    if input_pair1 and input_pair2:
        # this run has paired input files
        # get the second pair identifier
        pair_identifier2=pair_identifier.replace("1","2",1)
        # get the names of the expected output files
        demultiplex_fastq_files_R1 = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files_R2 = [file+pair_identifier2+".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files = demultiplex_fastq_files_R1+demultiplex_fastq_files_R2
        
        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] [depends[3]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
            
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
        
    else:
        # this run has single end input files
        # get the names of the expected output files
        demultiplex_fastq_files = [file+pair_identifier+".fastq" for file in demultiplex_output_basenames]
        
        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")
            
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] -o [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")

    demultiplex_fastq_files = demultiplex_check(workflow, demultiplex_log, demultiplex_fastq_files)


    return demultiplex_fastq_files, demultiplex_output_folder
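A usage sketch for the demultiplexing step. The barcode file format implied by the parsing above is tab-delimited with the sample name in the first column and comment lines starting with '#'; the paths below are placeholders:

# barcodes.txt (illustrative):
#   #sample    barcode
#   sampleA    ACGTACGT
#   sampleB    TGCATGCA

demux_fastqs, demux_folder = demultiplex(
    workflow,
    input_files=['/data/raw/run1_R1.fastq', '/data/raw/run1_R2.fastq'],
    extension=".fastq",
    output_folder='/data/16s',
    barcode_file='/data/raw/barcodes.txt',
    index_files=['/data/raw/run1_I1.fastq'],
    min_phred=2,
    pair_identifier="_R1")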
Ejemplo n.º 26
0
workflow.add_argument("max-strains", desc="the max number of strains to profile", default=20, type=int)

# get the arguments from the command line
args = workflow.parse_args()

# get all input files with the input extension provided on the command line
# return an error if no files are found
input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True)

### STEP #1: Run taxonomic profiling on all of the filtered files ###
if not args.bypass_taxonomic_profiling:
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(workflow,
        input_files,args.output,args.threads,args.input_extension)
else:
    sample_names = utilities.sample_names(input_files,args.input_extension)
    tsv_profiles = utilities.name_files(sample_names, args.input, tag="taxonomic_profile", extension="tsv")
    # check all of the expected profiles are found
    if len(tsv_profiles) != len(list(filter(os.path.isfile,tsv_profiles))):
        sys.exit("ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n"+"\n".join(tsv_profiles))
    # run taxonomic profile steps bypassing metaphlan2
    merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(workflow,
        tsv_profiles,args.output,args.threads,"tsv",already_profiled=True)
    # look for the sam profiles
    taxonomy_sam_files = utilities.name_files(sample_names, args.input, tag="bowtie2", extension="sam")
    # if they do not all exist, then bypass strain profiling if not already set
    if len(taxonomy_sam_files) != len(list(filter(os.path.isfile,taxonomy_sam_files))):
        print("Warning: Bypassing taxonomic profiling but not all taxonomy sam files are present in the input folder. Strain profiling will be bypassed. Expecting the following input files:\n"+"\n".join(taxonomy_sam_files))
        args.bypass_strain_profiling = True

### STEP #2: Run strain profiling
# Provide taxonomic profiling output so top strains by abundance will be selected
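This example ends before the strain profiling call itself, but the comment describes selecting the top strains by abundance from the merged taxonomic profile. As a rough, illustrative sketch of that idea (not the library's own selection code), species-level rows of a merged MetaPhlAn-style table can be ranked by their mean relative abundance:

def top_species_by_abundance(merged_profile_tsv, max_strains=20):
    # Rank species-level clades by mean abundance across samples.
    # Assumes rows are clade names followed by one abundance column per sample.
    abundances = {}
    with open(merged_profile_tsv) as fh:
        fh.readline()  # skip the header row
        for line in fh:
            fields = line.rstrip("\n").split("\t")
            clade = fields[0]
            if "s__" in clade and "t__" not in clade:  # species level only
                values = [float(v) for v in fields[1:] if v]
                abundances[clade] = sum(values) / max(len(values), 1)
    ranked = sorted(abundances, key=abundances.get, reverse=True)
    return ranked[:max_strains]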
Ejemplo n.º 27
0
def main(workflow):
    args = workflow.parse_args()

    conf = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    contaminate_db = conf.get('databases').get('knead_dna')

    if data_files and data_files.get('MGX'):
        input_files = data_files.get('MGX').get('input')
        pair_identifier = data_files.get('MGX').get('pair_identifier')
        file_extension = data_files.get('MGX', {}).get('input_extension',
                                                       '.fastq')

        sample_names = get_sample_names(input_files, file_extension)

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, creation_date, 'WGS')
        (deposition_dir, processing_dir, public_dir) = project_dirs
        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))

        manifest_file = stage_files(workflow, [args.manifest_file],
                                    base_depo_dir)
        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir,
                                      symlink=True)

        if file_extension == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files,
                                           processing_dir,
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier = "_R1"
        else:
            paired_end_seqs = input_files

        qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
        (cleaned_fastqs,
         read_counts) = quality_control(workflow,
                                        paired_end_seqs,
                                        '.fastq',
                                        processing_dir,
                                        qc_threads,
                                        contaminate_db,
                                        pair_identifier=pair_identifier,
                                        remove_intermediate_output=True)

        ## Generate taxonomic profile output. Output are stored in a list
        ## and are the following:
        ##
        ##      * Merged taxonomic profile
        ##      * Individual taxonomic files
        ##      * metaphlan2 SAM files
        tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
        tax_profile_outputs = taxonomic_profile(workflow, cleaned_fastqs,
                                                processing_dir, tax_threads,
                                                '.fastq')

        ## Generate functional profile output using humann2. Outputs are the
        ## the following:
        ##
        ##      * Merged normalized genefamilies
        ##      * Merged normalized ecs
        ##      * Merged normalized pathways
        ##      * Merged genefamilies
        ##      * Merged ecs
        ##      * Merged pathways
        func_threads = args.threads_humann if args.threads_humann else args.threads
        func_profile_outputs = functional_profile(
            workflow,
            cleaned_fastqs,
            '.fastq',
            processing_dir,
            func_threads,
            tax_profile_outputs[1],
            remove_intermediate_output=True)

        ## The current biobakery workflows do not generate KOs from our genefamilies
        ## so we're going to want to do that ourselves.
        genefamilies = name_files(sample_names,
                                  os.path.join(processing_dir, 'humann2'),
                                  subfolder='main',
                                  tag='genefamilies',
                                  extension='tsv')
        pathways = name_files(sample_names,
                              os.path.join(processing_dir, 'humann2'),
                              subfolder='main',
                              tag='pathabundance',
                              extension='tsv')
        ecs = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='ecs',
                         extension='tsv')
        kos = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='kos',
                         extension='tsv')

        #(merged_norm_kos, merged_kos) = generate_ko_files(workflow,
        #                                                  genefamilies,
        #                                                  processing_dir)

        biom_files = batch_convert_tsv_to_biom(workflow,
                                               tax_profile_outputs[1])
        tax_biom_files = stage_files(workflow, biom_files, processing_dir)

        kneaddata_log_files = name_files(sample_names,
                                         os.path.join(processing_dir,
                                                      'kneaddata'),
                                         subfolder='main',
                                         extension='log')

        pub_raw_dir = os.path.join(public_dir, 'raw')
        pub_tax_profile_dir = os.path.join(public_dir, 'tax_profile')
        pub_func_profile_dir = os.path.join(public_dir, 'func_profile')
        map(create_folders,
            [pub_raw_dir, pub_tax_profile_dir, pub_func_profile_dir])

        knead_read_counts = os.path.join(processing_dir, 'counts', 'merged',
                                         'kneaddata_read_count_table.tsv')

        tax_profile_pcl = add_metadata_to_tsv(
            workflow, [tax_profile_outputs[0]],
            args.metadata_file,
            'metagenomics',
            id_col=conf.get('metadata_id_col'),
            col_replace=conf.get('analysis_col_patterns'),
            target_cols=conf.get('target_metadata_cols'),
            aux_files=[knead_read_counts])
        func_profile_pcl = add_metadata_to_tsv(
            workflow, [func_profile_outputs[0]],
            args.metadata_file,
            'metagenomics',
            id_col=conf.get('metadata_id_col'),
            col_replace=conf.get('analysis_col_patterns'),
            target_cols=conf.get('target_metadata_cols'),
            aux_files=[knead_read_counts])

        pub_files = [
            stage_files(workflow, files, target_dir)
            for (files, target_dir) in [
                (cleaned_fastqs, pub_raw_dir),
                ([tax_profile_outputs[0]], pub_tax_profile_dir),
                (tax_profile_outputs[1], pub_tax_profile_dir),
                (tax_biom_files, pub_tax_profile_dir),
                (tax_profile_pcl, pub_tax_profile_dir),
                (func_profile_outputs, pub_func_profile_dir),
                (func_profile_pcl, pub_func_profile_dir),
                (kneaddata_log_files, pub_raw_dir),
            ]
        ]

        norm_genefamilies = name_files(sample_names,
                                       os.path.join(processing_dir, 'humann2',
                                                    'relab'),
                                       subfolder='genes',
                                       tag='genefamilies_relab',
                                       extension='tsv')
        norm_ecs_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2',
                                                 'relab'),
                                    subfolder='ecs',
                                    tag='ecs_relab',
                                    extension='tsv')
        norm_path_files = name_files(sample_names,
                                     os.path.join(processing_dir, 'humann2',
                                                  'relab'),
                                     subfolder='pathways',
                                     tag='pathabundance_relab',
                                     extension='tsv')
        norm_kos_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2',
                                                 'relab'),
                                    subfolder='kos_relab',
                                    extension='tsv')

        func_tar_files = []
        for (sample, gene_file, ecs_file,
             path_file) in zip(sample_names, norm_genefamilies, norm_ecs_files,
                               norm_path_files):
            tar_path = os.path.join(pub_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_profile_outputs)
            func_tar_files.append(func_tar_file)

        workflow.go()