Example #1
def main(workflow):
    args = workflow.parse_args()

    conf = parse_cfg_file(args.config_file, section='MBX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    dataset_cfg = manifest.get('config')

    if data_files and data_files.get('MBX'):
        input_files = data_files.get('MBX').get('input')
        sample_names = get_sample_names(input_files)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                            project,
                                            creation_date,
                                            'Metabolomics')

        (deposition_dir, processing_dir, public_dir) = project_dirs
        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))

        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)

        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir)

        # Our metabolite data are just a series of spreadsheets to which we
        # want to append some metadata. If they are Excel files we will want
        # to process them and convert them to CSV first (a sketch of such a
        # conversion follows this example).
        processed_files = excel_to_csv(workflow, 
                                       deposited_files,
                                       processing_dir)

        pcl_files = add_metadata_to_tsv(workflow,
                                        processed_files,
                                        args.metadata_file,
                                        'metabolomics',
                                        conf.get('metadata_id_col'),
                                        metadata_rows=dataset_cfg.get('metadata_rows'),
                                        col_offset=dataset_cfg.get('col_offset'),
                                        target_cols=conf.get('target_metadata_cols', None))

        public_files = stage_files(workflow, pcl_files, public_dir)

        workflow.go()
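
The `excel_to_csv` helper above is project-specific. As a rough illustration only, here is a minimal sketch of what the Excel-to-CSV conversion step might look like, assuming pandas is available and ignoring the workflow task wiring the real helper performs (`excel_to_csv_sketch` is a hypothetical name, not the project's function):

# Hypothetical sketch of the Excel-to-CSV step; the project's excel_to_csv
# also registers tasks on the workflow rather than converting eagerly.
import os
import pandas as pd

def excel_to_csv_sketch(input_files, output_dir):
    csv_files = []
    for in_file in input_files:
        base = os.path.splitext(os.path.basename(in_file))[0]
        if in_file.endswith(('.xls', '.xlsx')):
            out_file = os.path.join(output_dir, base + '.csv')
            # Read the first worksheet and re-serialize it as CSV.
            pd.read_excel(in_file).to_csv(out_file, index=False)
            csv_files.append(out_file)
        else:
            # Already a text spreadsheet; pass it through untouched.
            csv_files.append(in_file)
    return csv_files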
Example #2
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='MVX')
    knead_human_genome_db = conf.get('databases').get('knead_dna')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('MVX', {}).get('input'):
        input_files = data_files.get('MVX').get('input')
        input_file_ext = data_files.get('MVX').get('input_file_extension')
        pair_identifier = data_files.get('MVX').get('pair_identifier')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'MVX')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)

        mvx_qc_output = shotgun.quality_control(
            workflow,
            input_files,
            project_dirs[1],
            args.threads, [knead_human_genome_db],
            pair_identifier=pair_identifier,
            remove_intermediate_output=True)

        paired_fastq_files = deinterleave_fastq(workflow, input_files,
                                                project_dirs[1])

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1,
                                       input_file_ext,
                                       pair_identifier=pair_identifier)
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2],
                                         compress=False)
            paired_fastq_tars.append(paired_fastq_tar)

    workflow.go()
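
`tar_files` above bundles each mate pair into an uncompressed tar for the public directory. As a hedged, standalone illustration (hypothetical name, no workflow task wiring), the core of that operation with Python's tarfile module:

# Hypothetical sketch of tarring one FASTQ mate pair; the project's
# tar_files also hooks the archive into the workflow's dependency graph.
import os
import tarfile

def tar_mate_pair_sketch(mate_1, mate_2, tar_path):
    # Mode 'w' writes an uncompressed tar, mirroring compress=False above.
    with tarfile.open(tar_path, 'w') as tar:
        for fastq in (mate_1, mate_2):
            # Store members by basename so the archive has no path prefixes.
            tar.add(fastq, arcname=os.path.basename(fastq))
    return tar_path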
Example #3
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='HG')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('HG', {}).get('input'):
        input_files = data_files.get('HG').get('input')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'HG')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)
        fastq_files = bam_to_fastq(workflow,
                                   input_files,
                                   project_dirs[1],
                                   paired_end=True,
                                   threads=args.threads,
                                   compress=False)
        paired_fastq_files = paired_files(fastq_files, '_R1')

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1, pair_identifier="_R1")
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2])
            paired_fastq_tars.append(paired_fastq_tar)

        md5sum_files = generate_md5_checksums(workflow, paired_fastq_tars)

        workflow.go()
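
`paired_files` above splits the converted FASTQs into mate-1 and mate-2 lists keyed on the '_R1' identifier. A minimal sketch of that pairing logic, under the assumption that the mate-2 identifier is obtained by swapping '1' for '2' (the project's helper may handle more naming schemes):

# Hypothetical sketch of grouping FASTQs into [mate_1_list, mate_2_list]
# by a pair identifier such as '_R1'; the real paired_files may differ.
def paired_files_sketch(fastq_files, pair_identifier):
    mate_2_identifier = pair_identifier.replace('1', '2')  # '_R1' -> '_R2'
    mate_1s = sorted(f for f in fastq_files if pair_identifier in f)
    mate_2s = sorted(f for f in fastq_files if mate_2_identifier in f)
    return [mate_1s, mate_2s]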
Example #4
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='16S')

    manifest = parse_cfg_file(args.manifest_file)
    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')

    gg_tax = conf.get('databases').get('gg_taxonomy')
    gg_usearch = conf.get('databases').get('gg_usearch')
    gg_fasta = conf.get('databases').get('gg_fasta')

    if data_files and data_files.get('16S', {}).get('input'):
        input_files = data_files.get('16S').get('input')
        input_extension = data_files.get('16S').get('file_extension')
        barcode_file = data_files.get('16S').get('barcode_file')
        pair_identifier = data_files.get('16S').get('pair_identifier')
        index_identifier = data_files.get('16S').get('index_identifier')

        index_files = []
        if index_identifier:
            index_files = [in_file for in_file in input_files
                           if index_identifier in in_file]
            input_files = set(input_files) - set(index_files)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           creation_date,
                                           '16S')

        base_depo_dir = os.path.abspath(os.path.join(project_dirs[0], '..'))
        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        sequence_files = stage_files(workflow,
                                     input_files,
                                     project_dirs[0],
                                     symlink=True)
 
        # One entry point into this pipeline is analysis already conducted
        # by Baylor/CMMR, which requires a slight branching of the pipeline.

        # We make a fraught assumption here: if only one sequence file is
        # passed in alongside a centroid FASTA file, we are dealing with
        # files generated by CMMR.
        otu_table = data_files.get('16S').get('otu_table')
        centroid_fasta = data_files.get('16S').get('centroid_fasta')
        if otu_table and centroid_fasta:
            if len(sequence_files) == 1:
                merged_fastq = sequence_files[0]

            fixed_otu_table = fix_CMMR_OTU_table_taxonomy_labels(workflow,
                                                                 otu_table,
                                                                 project_dirs[1])
        else:
            if barcode_file:
                sequence_files = demultiplex(workflow,
                                             input_files,
                                             project_dirs[1],
                                             barcode_file,
                                             index_files,
                                             conf.get('min_pred_qc_score'),
                                             pair_identifier)

            merged_fastq = merge_samples_and_rename(workflow,
                                                    sequence_files,
                                                    input_extension,
                                                    project_dirs[1],
                                                    pair_identifier,
                                                    args.threads)

            qc_fasta_outs = quality_control(workflow,
                                            merged_fastq,
                                            project_dirs[1],
                                            args.threads,
                                            conf.get('maxee'),
                                            conf.get('min_trunc_len_max'))

            if not otu_table:
                closed_ref_tsv = taxonomic_profile(workflow,
                                                   qc_fasta_outs[0],
                                                   qc_fasta_outs[1],
                                                   qc_fasta_outs[2],
                                                   project_dirs[1],
                                                   args.threads,
                                                   conf.get('percent_identity'),
                                                   gg_usearch,
                                                   gg_fasta,
                                                   gg_tax,
                                                   conf.get('min_size'))
                
                predict_metagenomes_tsv = functional_profile(workflow,
                                                             closed_ref_tsv,
                                                             project_dirs[1])

        workflow.go()
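
Every example here opens with `parse_cfg_file`, optionally scoped to a section. As a hedged sketch of one plausible implementation, assuming the config and manifest files are YAML documents whose top-level keys act as sections (the actual file format and parser may differ):

# Hypothetical sketch of a sectioned config parser; assumes YAML input with
# top-level keys as sections. The project's parse_cfg_file may differ.
import yaml

def parse_cfg_file_sketch(cfg_file, section=None):
    with open(cfg_file) as cfg_fh:
        config = yaml.safe_load(cfg_fh)
    # When a section is requested, return just that sub-mapping.
    return config.get(section) if section else config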
Example #5
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='proteomics')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')

    if data_files and data_files.get('proteomics'):
        (input_files, output_files) = data_files.get('proteomics').values()

        ## Step #1 - Verify MD5sums of all input data provided to IBDMDB
        ##
        ## Since our proteomics files come from PNNL they won't be in the
        ## same location as the Broad files, so we'll need the MD5 checksums
        ## to be supplied manually.
        validated_files = verify_files(workflow, input_files,
                                       args.checksums_file)

        ## Setup the directories where we will be depositing our files
        date_stamp = str(datetime.date.today())
        base_deposition_dir = os.path.join(conf.get('deposition_dir'), project,
                                           date_stamp)
        deposition_dir = os.path.join(base_deposition_dir, 'proteomics')
        create_folders(deposition_dir)

        processing_dir = os.path.join(conf.get('processing_dir'), project,
                                      date_stamp, 'proteomics')
        create_folders(processing_dir)

        public_dir = os.path.join(conf.get('public_dir'), project, date_stamp,
                                  'proteomics')
        create_folders(public_dir)

        ## Move the manifest file over so we have information about this
        ## batch of data in the deposition directory
        manifest_file = stage_files(workflow, [args.manifest_file],
                                    base_deposition_dir)

        ## Step 2 - Move files over to our deposition directory
        deposited_files = stage_files(workflow, validated_files,
                                      deposition_dir)

        ## Step #3 - Stage files to processing directory
        ##
        ## For the proteomics data it is OK to symlink these files over from
        ## the data deposition folder: they aren't actually processed, but
        ## they need to be in place here to show up on the website.
        files_to_process = stage_files(workflow,
                                       deposited_files,
                                       processing_dir,
                                       symlink=True)

        output_files = output_files if output_files else []

        ## We have a dataset specific metadata file that we can incorporate
        ## into the analysis output.
        if output_files and args.data_specific_metadata:
            output_files = add_metadata_to_tsv(
                workflow, output_files, args.data_specific_metadata,
                conf.get('metadata_id_col'),
                conf.get('target_metadata_cols', []))

        ## Step #4 - Stage output files to public folder
        public_files = stage_files(workflow, output_files, public_dir)

        ## TODO: We need to generate metadata files for the output files that
        ## are included with this dataset. Need to talk to George about
        ## getting the ID-mapped version of these files since they will be
        ## needed here.

        ## Step #5 - Make files web-visible by creating the complete.html file
        ## in each of our output directories.
        make_files_web_visible(workflow, [files_to_process, public_files])

        ## Step #6 - Once all the files have been staged we can go ahead and
        ## delete the raw files from their original directory as well as the
        ## MANIFEST file.

        workflow.go()
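
Step #1 above verifies manually supplied MD5 checksums. A minimal, standalone sketch of that verification with hashlib, assuming a checksums file of '<md5>  <filename>' lines (the project's verify_files is presumably similar but workflow-aware):

# Hypothetical sketch of MD5 verification against a checksums file with
# '<md5>  <filename>' lines; the real verify_files registers workflow tasks.
import hashlib
import os

def verify_files_sketch(input_files, checksums_file):
    expected = {}
    with open(checksums_file) as checksums_fh:
        for line in checksums_fh:
            parts = line.split()
            if len(parts) == 2:
                expected[os.path.basename(parts[1])] = parts[0]

    verified = []
    for in_file in input_files:
        digest = hashlib.md5()
        with open(in_file, 'rb') as in_fh:
            # Hash in 1 MiB chunks to keep memory use flat on large files.
            for chunk in iter(lambda: in_fh.read(1 << 20), b''):
                digest.update(chunk)
        if expected.get(os.path.basename(in_file)) == digest.hexdigest():
            verified.append(in_file)
    return verified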
Example #6
def main(workflow):
    args = workflow.parse_args()

    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow, 
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow, 
                                            deposited_files_mtx, 
                                            project_dirs_mtx[1],
                                            paired_end=True,
                                            compress=False,
                                            threads=args.threads)
            pair_identifier_mtx = "_R1"                                            
        else:
            paired_end_seqs = deposited_files_mtx

        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)
        else:
            ## Ensure the variable is defined for the quality_control call below.
            adapter_trim_opts = ""

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db,
                                                                           rrna_db,
                                                                           mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files, but we also
        # have two other scenarios:
        #
        #       1.) No accompanying metagenomic sequences exist; in this
        #           case we proceed using just the metatranscriptomic
        #           data.
        #       2.) Taxonomic profiles are passed directly in our MANIFEST
        #           file; here we remove these from our input files and
        #           prevent them from running through the kneaddata ->
        #           metaphlan2 portions of our pipeline.
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs = bam_to_fastq(workflow, 
                                                    deposited_files_mgx, 
                                                    project_dirs_mgx[1],
                                                    paired_end=True,
                                                    compress=False,
                                                    threads=args.threads)
                    pair_identifier_mgx = "_R1"                                            
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx, pair_identifier_mgx)  

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db,
                                                                         rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')

                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                for pub_dir in (pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                pub_wgs_func_profile_dir):
                    create_folders(pub_dir)

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                   project_dirs_mgx[1],
                                                   subfolder='genes',
                                                   tag='genefamilies_relab',
                                                   extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                 project_dirs_mgx[1],
                                                 subfolder='pathways',
                                                 tag='pathabundance_relab',
                                                 extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] + func_outs_mgx,
                                                args.metadata_file,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))
                                      
                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir,
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                              [gene_file, ecs_file, path_file],
                                              tar_path,
                                              depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to match cleaned MTX files to their corresponding MGX
        # taxonomic profiles. Where a match exists we run functional
        # profiling with the corresponding MGX taxonomic profile; otherwise
        # we run taxonomic profiling on the MTX sequences and use the
        # resulting profile for functional profiling.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col', 'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id', 'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run them through
            # the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            func_outs_mtx = list(func_outs_mtx) + func_outs_match_mtx
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed 
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        for pub_dir in (pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                        pub_mtx_func_profile_dir):
            create_folders(pub_dir)

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)
    
        workflow.go()
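
The MTX branch above hinges on `match_tax_profiles`, which pairs cleaned MTX FASTQs with MGX taxonomic profiles through the project metadata file. As a greatly simplified, hypothetical sketch that matches on a shared filename stem instead of the metadata indirection:

# Hypothetical, simplified sketch of MTX/MGX matching by filename stem;
# the real match_tax_profiles resolves sample IDs via the metadata file.
import os

def match_tax_profiles_sketch(fastq_files, tax_profiles):
    profiles_by_stem = {
        os.path.basename(p).replace('_taxonomic_profile.tsv', ''): p
        for p in tax_profiles
    }
    matched_fqs, matched_profiles = [], []
    for fastq in fastq_files:
        stem = os.path.basename(fastq).replace('.fastq', '')
        if stem in profiles_by_stem:
            matched_fqs.append(fastq)
            matched_profiles.append(profiles_by_stem[stem])
    return matched_fqs, matched_profiles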
Example #7
def main(workflow):
    args = workflow.parse_args()

    conf = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    contaminate_db = conf.get('databases').get('knead_dna')

    if data_files and data_files.get('MGX'):
        input_files = data_files.get('MGX').get('input')
        pair_identifier = data_files.get('MGX').get('pair_identifier')
        file_extension = data_files.get('MGX', {}).get('input_extension',
                                                       '.fastq')

        sample_names = get_sample_names(input_files, file_extension)

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, creation_date, 'WGS')
        (deposition_dir, processing_dir, public_dir) = project_dirs
        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))

        manifest_file = stage_files(workflow, [args.manifest_file],
                                    base_depo_dir)
        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir,
                                      symlink=True)

        if file_extension == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files,
                                           processing_dir,
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier = "_R1"
        else:
            paired_end_seqs = input_files

        qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
        (cleaned_fastqs,
         read_counts) = quality_control(workflow,
                                        paired_end_seqs,
                                        '.fastq',
                                        processing_dir,
                                        qc_threads,
                                        contaminate_db,
                                        pair_identifier=pair_identifier,
                                        remove_intermediate_output=True)

        ## Generate taxonomic profile output. Outputs are stored in a list
        ## and are the following:
        ##
        ##      * Merged taxonomic profile
        ##      * Individual taxonomic files
        ##      * metaphlan2 SAM files
        tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
        tax_profile_outputs = taxonomic_profile(workflow, cleaned_fastqs,
                                                processing_dir, tax_threads,
                                                '.fastq')

        ## Generate functional profile output using humann2. Outputs are
        ## the following:
        ##
        ##      * Merged normalized genefamilies
        ##      * Merged normalized ecs
        ##      * Merged normalized pathways
        ##      * Merged genefamilies
        ##      * Merged ecs
        ##      * Merged pathways
        func_threads = args.threads_humann if args.threads_humann else args.threads
        func_profile_outputs = functional_profile(
            workflow,
            cleaned_fastqs,
            '.fastq',
            processing_dir,
            func_threads,
            tax_profile_outputs[1],
            remove_intermediate_output=True)

        ## The current biobakery workflows do not generate KOs from our
        ## genefamilies so we're going to want to do that ourselves.
        genefamilies = name_files(sample_names,
                                  os.path.join(processing_dir, 'humann2'),
                                  subfolder='main',
                                  tag='genefamilies',
                                  extension='tsv')
        pathways = name_files(sample_names,
                              os.path.join(processing_dir, 'humann2'),
                              subfolder='main',
                              tag='pathabundance',
                              extension='tsv')
        ecs = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='ecs',
                         extension='tsv')
        kos = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='kos',
                         extension='tsv')

        #(merged_norm_kos, merged_kos) = generate_ko_files(workflow,
        #                                                  genefamilies,
        #                                                  processing_dir)

        biom_files = batch_convert_tsv_to_biom(workflow,
                                               tax_profile_outputs[1])
        tax_biom_files = stage_files(workflow, biom_files, processing_dir)

        kneaddata_log_files = name_files(sample_names,
                                         os.path.join(processing_dir,
                                                      'kneaddata'),
                                         subfolder='main',
                                         extension='log')

        pub_raw_dir = os.path.join(public_dir, 'raw')
        pub_tax_profile_dir = os.path.join(public_dir, 'tax_profile')
        pub_func_profile_dir = os.path.join(public_dir, 'func_profile')
        for pub_dir in (pub_raw_dir, pub_tax_profile_dir,
                        pub_func_profile_dir):
            create_folders(pub_dir)

        knead_read_counts = os.path.join(processing_dir, 'counts', 'merged',
                                         'kneaddata_read_count_table.tsv')

        tax_profile_pcl = add_metadata_to_tsv(
            workflow, [tax_profile_outputs[0]],
            args.metadata_file,
            'metagenomics',
            id_col=conf.get('metadata_id_col'),
            col_replace=conf.get('analysis_col_patterns'),
            target_cols=conf.get('target_metadata_cols'),
            aux_files=[knead_read_counts])
        func_profile_pcl = add_metadata_to_tsv(
            workflow, [func_profile_outputs[0]],
            args.metadata_file,
            'metagenomics',
            id_col=conf.get('metadata_id_col'),
            col_replace=conf.get('analysis_col_patterns'),
            target_cols=conf.get('target_metadata_cols'),
            aux_files=[knead_read_counts])

        pub_files = [stage_files(workflow, files, target_dir)
                     for (files, target_dir) in
                     [(cleaned_fastqs, pub_raw_dir),
                      ([tax_profile_outputs[0]], pub_tax_profile_dir),
                      (tax_profile_outputs[1], pub_tax_profile_dir),
                      (tax_biom_files, pub_tax_profile_dir),
                      (tax_profile_pcl, pub_tax_profile_dir),
                      (func_profile_outputs, pub_func_profile_dir),
                      (func_profile_pcl, pub_func_profile_dir),
                      (kneaddata_log_files, pub_raw_dir)]]

        norm_genefamilies = name_files(sample_names,
                                       os.path.join(processing_dir, 'humann2',
                                                    'relab'),
                                       subfolder='genes',
                                       tag='genefamilies_relab',
                                       extension='tsv')
        norm_ecs_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2',
                                                 'relab'),
                                    subfolder='ecs',
                                    tag='ecs_relab',
                                    extension='tsv')
        norm_path_files = name_files(sample_names,
                                     os.path.join(processing_dir, 'humann2',
                                                  'relab'),
                                     subfolder='pathways',
                                     tag='pathabundance_relab',
                                     extension='tsv')
        norm_kos_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2',
                                                 'relab'),
                                    subfolder='kos_relab',
                                    extension='tsv')

        func_tar_files = []
        for (sample, gene_file, ecs_file,
             path_file) in zip(sample_names, norm_genefamilies, norm_ecs_files,
                               norm_path_files):
            tar_path = os.path.join(pub_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_profile_outputs)
            func_tar_files.append(func_tar_file)

        workflow.go()
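
`batch_convert_tsv_to_biom` above turns the per-sample taxonomic profile TSVs into BIOM tables. A hedged sketch of that conversion by shelling out to the biom-format command line tool, assuming `biom` is on PATH (the project helper runs the conversion as workflow tasks instead):

# Hypothetical sketch of TSV -> BIOM conversion via the biom-format CLI;
# the project's batch_convert_tsv_to_biom is workflow-aware.
import os
import subprocess

def tsv_to_biom_sketch(tsv_files):
    biom_files = []
    for tsv_file in tsv_files:
        biom_file = os.path.splitext(tsv_file)[0] + '.biom'
        subprocess.check_call(['biom', 'convert',
                               '-i', tsv_file,
                               '-o', biom_file,
                               '--table-type', 'OTU table',
                               '--to-hdf5'])
        biom_files.append(biom_file)
    return biom_files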