Example 1
def main(workflow):
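    ## Viromics (MVX) workflow: stage the submitted sequence files into the
    ## deposition directory, run shotgun quality control against the human
    ## genome database, then deinterleave the FASTQ files and tar each mate
    ## pair into the public directory. Helper functions (parse_cfg_file,
    ## stage_files, etc.) are assumed to come from the project's utility modules.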
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='MVX')
    knead_human_genome_db = conf.get('databases').get('knead_dna')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('MVX', {}).get('input'):
        input_files = data_files.get('MVX').get('input')
        input_file_ext = data_files.get('MVX').get('input_file_extension')
        pair_identifier = data_files.get('MVX').get('pair_identifier')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'MVX')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)

        mvx_qc_output = shotgun.quality_control(
            workflow,
            input_files,
            project_dirs[1],
            args.threads, [knead_human_genome_db],
            pair_identifier=pair_identifier,
            remove_intermediate_output=True)

        paired_fastq_files = deinterleave_fastq(workflow, input_files,
                                                project_dirs[1])

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1,
                                       input_file_ext,
                                       pair_identifier=pair_identifier)
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2],
                                         compress=False)
            paired_fastq_tars.append(paired_fastq_tar)

    workflow.go()
Example 2
def main(workflow):
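    ## Metabolomics (MBX) workflow: stage the manifest and submitted
    ## spreadsheets, convert any Excel files to CSV, append study metadata to
    ## produce PCL files, and stage the results in the public directory.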
    args = workflow.parse_args()

    conf = parse_cfg_file(args.config_file, section='MBX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    dataset_cfg = manifest.get('config')

    if data_files and data_files.get('MBX'):
        input_files = data_files.get('MBX').get('input')
        sample_names = get_sample_names(input_files)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                            project,
                                            creation_date,
                                            'Metabolomics')

        (deposition_dir, processing_dir, public_dir) = project_dirs
        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))

        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)

        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir)

        # Our metabolite data are just a series of spreadsheets to which we
        # want to append some metadata. If they are Excel files we will want
        # to process them and convert them to CSV.
        processed_files = excel_to_csv(workflow, 
                                       deposited_files,
                                       processing_dir)

        pcl_files = add_metadata_to_tsv(workflow,
                                        processed_files,
                                        args.metadata_file,
                                        'metabolomics',
                                        conf.get('metadata_id_col'),
                                        metadata_rows=dataset_cfg.get('metadata_rows'),
                                        col_offset=dataset_cfg.get('col_offset'),
                                        target_cols=conf.get('target_metadata_cols', None))

        public_files = stage_files(workflow, pcl_files, public_dir)

        workflow.go()
Example 3
def main(args):
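    ## Remap the sample ID columns of an analysis table from an old identifier
    ## to a new identifier using the HMP2 metadata file, keep only the mapped
    ## columns, and write the result out as a tab-delimited file.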
    metadata_df = pd.read_csv(args.metadata_file, dtype='str')
    input_analysis_df = pd.read_table(args.input_analysis_file, dtype='str')
    analysis_cols = input_analysis_df.columns.tolist()[1:]
    config = parse_cfg_file(args.config_file)

    metadata_subset_df = metadata_df[metadata_df['data_type'] ==
                                     args.data_type]

    if (args.old_id not in metadata_df.columns
            or args.new_id not in metadata_df.columns):
        raise ValueError('Could not find current column identifier or new '
                         'column identifier in HMP2 metadata file.')

    (column_mapping,
     not_found) = get_column_mapping(metadata_subset_df, analysis_cols,
                                     args.old_id, args.new_id, args.data_type,
                                     config, args.no_tag)

    ## TODO: Deal with the not found IDs here at some point.
    input_analysis_df.rename(columns=column_mapping, inplace=True)

    filter_cols = list(column_mapping.values())
    filter_cols.insert(0, input_analysis_df.columns[0])
    input_analysis_df = input_analysis_df.filter(filter_cols)

    input_analysis_df.to_csv(args.output_file,
                             sep="\t",
                             index=False,
                             na_rep="NA")
Example 4
def main(workflow):
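    ## Host genome (HG) workflow: stage the submitted BAM files, convert them
    ## to paired FASTQ files, tar each mate pair into the public directory and
    ## generate MD5 checksums for the tar files.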
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='HG')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('HG', {}).get('input'):
        input_files = data_files.get('HG').get('input')

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, submission_date, 'HG')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)
        fastq_files = bam_to_fastq(workflow,
                                   input_files,
                                   project_dirs[1],
                                   paired_end=True,
                                   threads=args.threads,
                                   compress=False)
        paired_fastq_files = paired_files(fastq_files, '_R1')

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0],
                                    paired_fastq_files[1]):
            sample_name = sample_names(mate_1, pair_identifier="_R1")
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow, [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2])
            paired_fastq_tars.append(paired_fastq_tar)

        md5sum_files = generate_md5_checksums(workflow, paired_fastq_tars)

        workflow.go()
Example 5
def main(workflow):
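    ## Metadata workflow: validate the StudyTrax, Broad sample tracking and
    ## proteomics metadata files, generate and merge metadata files for the
    ## submitted data, and validate the final human-readable metadata sheet.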
    args = workflow.parse_args()

    config = parse_cfg_file(args.config_file)
    manifest = parse_cfg_file(args.manifest_file)

    if manifest.get('submitted_files'):
        data_files = manifest.get('submitted_files')

        ## Validate our metadata files
        validate_metadata_file(workflow, args.studytrax_metadata_file,
                               config.get('validators').get('studytrax'))
        validate_metadata_file(
            workflow, args.broad_sample_tracking_file,
            config.get('validators').get('broad_sample_status'))
        validate_metadata_file(workflow, args.proteomics_metadata,
                               config.get('validators').get('proteomics'))

        metadata_file = None
        metadata_dir = os.path.join(config.get('public_dir'),
                                    config.get('project'), 'metadata')

        new_metadata_files = generate_metadata_file(
            workflow, config, data_files, args.studytrax_metadata_file,
            args.broad_sample_tracking_file, args.auxillary_metadata)

        metadata_file = merge_metadata_files(new_metadata_files, metadata_dir)

        if args.metadata_file:
            metadata_file = merge_metadata_files([args.metadata_file] +
                                                 metadata_file, metadata_dir)

        final_metadata_file = make_metadata_human_readable(metadata_file)
        validate_metadata_file(workflow, final_metadata_file,
                               config.get('validators').get('hmp2_metadata'))

        #pub_metadata_file = stage_files(workflow, metadata_file,
        #                                metadata_dir)
        workflow.go()
Example 6
def main(args):
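    ## Append study metadata to a single analysis table, producing a PCL file
    ## using the column patterns and target metadata columns from the config.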
    config = parse_cfg_file(args.config_file, section=args.data_type)
    analysis_col_patterns = config.get('analysis_col_patterns')
    target_metadata_cols = funcy.flatten(config.get('target_metadata_cols'))
    data_type_label = config.get('data_type_mapping').get(args.data_type)

    output_pcl = add_metadata_to_tsv(args.input_file,
                                     args.metadata_file,
                                     data_type_label,
                                     args.id_col,
                                     analysis_col_patterns,
                                     args.drop_missing_cols,
                                     target_metadata_cols,
                                     supplement=args.supplement)
Example 7
def main(args):
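    ## Audit sample-to-metadata mapping: for each submitted data type, check
    ## which sample IDs map to the StudyTrax metadata, retry common character
    ## transcription errors, search alternative clinical columns and (for MVX)
    ## the Broad tracking sheet, and report any samples that remain unmapped.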
    manifest = parse_cfg_file(args.input_manifest)
    analysis_cfg = parse_cfg_file(args.config_file)
    mapping_cols = analysis_cfg.get('mapping_columns')

    data_files = manifest.get('submitted_files')
    studytrax_df = pd.read_csv(args.studytrax_metadata, dtype='str')
    broad_sample_df = pd.read_csv(args.broad_sample_sheet, dtype='str')

    # When we have some samples that don't map to their desired columns
    # we are going to want to run them through every single sample ID column
    # and see if we can find a hit.
    clinical_search_cols = [
        'st_q13', 'st_q12', 'st_q10', 'st_q4', 'st_q17', 'st_q11'
    ]
    broad_search_cols = [
        'Viromics', 'MbX', 'Proteomics', 'Parent Sample A', 'Parent Sample B',
        'DNA/RNA'
    ]

    if data_files:
        for (dtype, mapping_cols) in [('MVX', mapping_cols.get('MVX')),
                                      ('HTX', mapping_cols.get('HTX')),
                                      ('RRBS', mapping_cols.get('RRBS')),
                                      ('SER', mapping_cols.get('SER')),
                                      ('HG', mapping_cols.get('HG'))]:

            if not data_files.get(dtype):
                continue

            samples = set([
                os.path.splitext(os.path.basename(sample_id))[0]
                for sample_id in data_files.get(dtype).get('input')
            ])

            if not samples:
                continue

            mapping_cols = funcy.flatten(mapping_cols)
            found_samples = check_sample_mapping(samples, studytrax_df,
                                                 mapping_cols)
            print "Correctly mapped %s samples" % len(found_samples)

            missing_samples = samples - found_samples
            print "%s samples were not mapped" % len(missing_samples)
            print

            if len(missing_samples) == 0:
                continue

            # Sometimes characters were recorded incorrectly (e.g. 1 becomes I
            # or O becomes 0), so we want to isolate and test for all of these.
            for (orig_char, replace_char) in [('1', 'I'), ('O', '0'),
                                              ('I', '1'), ('0', 'O'),
                                              ('SM-', 'SM')]:
                mod_samples = map(lambda s: s.replace(orig_char, replace_char),
                                  missing_samples)
                found_samples = check_sample_mapping(mod_samples, studytrax_df,
                                                     mapping_cols)

                if found_samples:
                    print "Found %s more samples after replacing character %s with %s in sample IDs:" % (
                        len(found_samples), orig_char, replace_char)
                    print "\n".join(found_samples)
                    print

                    found_samples = map(
                        lambda s: s.replace(replace_char, orig_char),
                        found_samples)
                    missing_samples = missing_samples - set(found_samples)

            for col in clinical_search_cols:
                found_samples = check_sample_mapping(missing_samples,
                                                     studytrax_df, [col])

                if found_samples:
                    print
                    print "Found %s samples for data type %s in incorrect column %s:" % (
                        len(found_samples), dtype, col)
                    print "\n".join(list(found_samples))
                    print

                    missing_samples = missing_samples - found_samples

            # For Viromics data we can reference the Broad tracking sheet.
            if dtype == 'MVX':
                for col in broad_search_cols:
                    found_samples = check_sample_mapping(
                        missing_samples, broad_sample_df, [col])

                    if found_samples:
                        print
                        print "Found %s samples for data type %s in Broad tracking sheet, column %s:" % (
                            len(found_samples), dtype, col)
                        print "\n".join(list(found_samples))
                        print

                    missing_samples = missing_samples - found_samples

            print "Final missing samples: %s" % (len(missing_samples))
            print "\n".join(missing_samples)
Example 8
def main(workflow):
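    ## 16S workflow: stage the manifest and submitted sequence files, then
    ## either reuse a provided CMMR OTU table (fixing its taxonomy labels) or
    ## demultiplex, merge, quality control and profile the sequences
    ## taxonomically and functionally.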
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='16S')

    manifest = parse_cfg_file(args.manifest_file)
    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')

    gg_tax = conf.get('databases').get('gg_taxonomy')
    gg_usearch = conf.get('databases').get('gg_usearch')
    gg_fasta = conf.get('databases').get('gg_fasta')

    if data_files and data_files.get('16S', {}).get('input'):
        input_files = data_files.get('16S').get('input')
        input_extension = data_files.get('16S').get('file_extension')
        barcode_file = data_files.get('16S').get('barcode_file')
        pair_identifier = data_files.get('16S').get('pair_identifier')
        index_identifier = data_files.get('16S').get('index_identifier')

        index_files = []
        if index_identifier:
            index_files = [in_file for in_file in input_files if
                        index_identifier in in_file]
            input_files = set(input_files) - set(index_files)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           creation_date,
                                           '16S')

        base_depo_dir = os.path.abspath(os.path.join(project_dirs[0], '..'))
        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        sequence_files = stage_files(workflow,
                                     input_files,
                                     project_dirs[0],
                                     symlink=True)
 
        # An entry point into this pipeline is analysis already conducted by
        # Baylor/CMMR, which requires a slight branching of the pipeline.

        # We are making a fraught assumption here: if only one sequence file
        # is passed in alongside a centroid FASTA file, we assume we are
        # dealing with files generated by CMMR.
        otu_table = data_files.get('16S').get('otu_table')
        centroid_fasta = data_files.get('16S').get('centroid_fasta')
        if otu_table and centroid_fasta:
            if len(sequence_files) == 1:
                merged_fastq = sequence_files[0]

            fixed_otu_table = fix_CMMR_OTU_table_taxonomy_labels(workflow,
                                                                 otu_table,
                                                                 project_dirs[1])
        else:
            if barcode_file:
                sequence_files = demultiplex(workflow,
                                            input_files,
                                            project_dirs[1],
                                            barcode_file,
                                            index_files,
                                            conf.get('min_pred_qc_score'),
                                            pair_identifier)

            merged_fastq = merge_samples_and_rename(workflow,
                                                    sequence_files,
                                                    input_extension,
                                                    project_dirs[1],
                                                    pair_identifier,
                                                    args.threads)

            qc_fasta_outs = quality_control(workflow,
                                            merged_fastq,
                                            project_dirs[1],
                                            args.threads,
                                            conf.get('maxee'),
                                            conf.get('min_trunc_len_max'))

            if not otu_table:
                closed_ref_tsv = taxonomic_profile(workflow,
                                                   qc_fasta_outs[0],
                                                   qc_fasta_outs[1],
                                                   qc_fasta_outs[2],
                                                   project_dirs[1],
                                                   args.threads,
                                                   conf.get('percent_identity'),
                                                   gg_usearch,
                                                   gg_fasta,
                                                   gg_tax,
                                                   conf.get('min_size'))
                
                predict_metagenomes_tsv = functional_profile(workflow,
                                                             closed_ref_tsv,
                                                             project_dirs[1])

        workflow.go()
Example 9
def main(workflow):
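    ## Metatranscriptomics (MTX) workflow with optional metagenomics (MGX)
    ## support: stage and quality control both data sets, run taxonomic and
    ## functional profiling, match MTX files to MGX taxonomic profiles when
    ## available, compute DNA/RNA ratio tables and tar the per-sample humann2
    ## output for release.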
    args = workflow.parse_args()

    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow, 
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow, 
                                            deposited_files_mtx, 
                                            project_dirs_mtx[1],
                                            paired_end=True,
                                            compress=False,
                                            threads=args.threads)
            pair_identifier_mtx = "_R1"                                            
        else:
            paired_end_seqs = deposited_files_mtx

        adapter_trim_opts = ""
        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db,
                                                                           rrna_db,
                                                                           mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files but we also
        # have two other scenarios:
        #
        #       1.) No accompanying metagenomic sequences exist; in this
        #           case we will proceed just using the metatranscriptomic
        #           data.
        #       2.) Taxonomic profiles are passed directly in our MANIFEST
        #           file; here we remove these from our input files and
        #           prevent them from running through the kneaddata ->
        #           metaphlan2 portions of our pipeline.
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx, pair_identifier_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs_mgx = bam_to_fastq(workflow,
                                                    deposited_files_mgx, 
                                                    project_dirs_mgx[1],
                                                    paired_end=True,
                                                    compress=False,
                                                    threads=args.threads)
                    pair_identifier_mgx = "_R1"                                            
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx, pair_identifier_mgx)  

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db,
                                                                        rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')

                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                map(create_folders, [pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                    pub_wgs_func_profile_dir])

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='genes',
                                                tag='genefamilies_relab',
                                                extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='pathways',
                                                tag='pathabundance_relab',
                                                extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] 
                                                + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))
                                      
                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir, 
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                            [gene_file, ecs_file, path_file],
                                            tar_path,
                                            depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can create a set of matching cleaned
        # MTX files to corresponding MGX taxonomic profiles. If these exist
        # we want to run functional profiling with the corresponding MGX
        # taxonomic profile; otherwise we will run taxonomic profiling
        # on the MTX sequences and run functional profiling with the produced
        # taxonomic profile.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col', 'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id', 'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run them through
            # the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            func_outs_mtx = list(func_outs_mtx) + list(func_outs_match_mtx)
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed 
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        map(create_folders, [pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                             pub_mtx_func_profile_dir])

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)
    
        workflow.go()
Example 10
def main(args):
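    ## Build or update the HMP2 metadata sheet: merge StudyTrax, Broad
    ## tracking, proteomics and any auxiliary metadata with rows generated for
    ## newly submitted files, clean up dates, scores and site names, and write
    ## the combined table out as a dated CSV file.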
    config = parse_cfg_file(args.config)

    study_trax_df = pd.read_csv(args.studytrax_metadata, dtype='str')
    broad_sample_df = pd.read_csv(args.broad_sample_tracking,
                                  na_values=['destroyed', 'missed'],
                                  parse_dates=['Actual Date of Receipt'])
    proteomics_df = None
    metadata_df = None
    new_metadata_df = None

    date_today = datetime.date.today()
    metadata_file = os.path.join(args.output_dir,
                                 'hmp2_metadata_%s.csv' % date_today)

    ## Before we filter our metadata rows down to just the rows associated
    ## with the files we have present, we'll want a list of all the
    ## collection dates.
    collection_dates_dict = get_collection_dates(broad_sample_df)

    biopsy_date_map = None
    if args.proteomics_metadata:
        proteomics_df = pd.read_table(args.proteomics_metadata)
    if args.biopsy_dates:
        biopsy_date_map = parse_biopsy_dates(args.biopsy_dates)

    ## The update procedure assumes either that we have an existing metadata
    ## file that we are going to be appending to/updating, or that we are
    ## creating a fresh metadata sheet and will be adding the files in the
    ## manifest file to it.
    ## TODO: This needs to be re-worked to account for snagging data types as well.
    #if not args.metadata_file or args.refresh_all:
    #    sequence_files.extend(get_all_sequence_files(config.get('deposition_dir'),
    #                                                 config.get('input_extensions')))
    if args.manifest_file:
        manifest = parse_cfg_file(args.manifest_file)
        submitted_files = manifest.get('submitted_files')

        if submitted_files:
            new_metadata = []
            for (dtype, items) in submitted_files.iteritems():
                input_files = items.get('input')
                pair_identifier = items.get('pair_identifier')

                if pair_identifier:
                    (input_pair1, input_pair2) = bb_utils.paired_files(
                        input_files, pair_identifier)
                    input_files = input_pair1 if input_pair1 else input_files

                new_metadata.append(
                    get_metadata_rows(config, study_trax_df,
                                      broad_sample_df, proteomics_df,
                                      dtype, input_files, pair_identifier))

            new_metadata_df = pd.concat(new_metadata, ignore_index=True)

            #new_metadata_df[new_metadata_df['External ID'].isnull()] = None
            new_metadata_df['Site/Sub/Coll ID'] = new_metadata_df[
                'Site/Sub/Coll'].map(lambda sid: str(sid))
            #new_metadata_df['Participant ID'] = new_metadata_df['Subject'].map(lambda subj: 'C' + str(subj))
            if 'Collection #' in new_metadata_df.columns:
                new_metadata_df['visit_num'] = new_metadata_df['Collection #']
            new_metadata_df['Project'] = new_metadata_df.apply(get_project_id,
                                                               axis=1)
            new_metadata_df['ProjectSpecificID'] = pd.to_numeric(
                new_metadata_df['ProjectSpecificID'])
            new_metadata_df['Site'] = new_metadata_df['SiteName']
            new_metadata_df = new_metadata_df.apply(generate_external_id,
                                                    axis=1)

            new_metadata_df = remove_columns(new_metadata_df,
                                             config.get('drop_cols'))

    if args.metadata_file:
        metadata_df = pd.read_csv(args.metadata_file,
                                  parse_dates=['Actual Date of Receipt'])

        site_mapping = config.get('site_map')
        metadata_df['Site/Sub/Coll ID'] = metadata_df.apply(
            fix_site_sub_coll_id, args=(site_mapping, ), axis=1)
        metadata_df['PDO Number'] = metadata_df.apply(get_pdo_number, axis=1)

        if new_metadata_df is not None and not new_metadata_df.empty:
            metadata_df = pd.concat([metadata_df, new_metadata_df],
                                    ignore_index=True)
            metadata_df = metadata_df.drop_duplicates(
                subset=['External ID', 'Site/Sub/Coll ID', 'data_type'],
                keep='last')
    else:
        metadata_df = new_metadata_df

    metadata_df[metadata_df['External ID'].isnull()] = metadata_df[
        metadata_df['External ID'].isnull()].apply(generate_external_id,
                                                   axis=1)

    if args.auxillary_metadata:
        for aux_file in args.auxillary_metadata:
            supp_df = pd.read_table(aux_file)
            supp_columns = supp_df.columns.tolist()

            idx_offset = 1
            if 'data_type' in supp_columns:
                join_id = supp_columns[:1] + ['data_type']
                idx_offset = 2
            else:
                join_id = supp_columns[0]

            ## We need to do this in two stages. If the columns already exist
            ## here we want to update them. If they do not exist we append
            ## them.
            metadata_cols = metadata_df.columns.tolist()
            new_cols = set(supp_columns[idx_offset:]) - set(metadata_cols)
            existing_cols = set(
                supp_columns[idx_offset:]).intersection(metadata_cols)

            if new_cols:
                supp_new_df = supp_df.filter(items=supp_columns[:idx_offset] +
                                             list(new_cols))
                metadata_df = metadata_df.merge(supp_new_df,
                                                how='left',
                                                on=join_id)

            if existing_cols:
                supp_existing_df = supp_df.filter(
                    items=supp_columns[:idx_offset] + list(existing_cols))
                metadata_df.set_index(join_id, inplace=True)
                supp_existing_df.set_index(join_id, inplace=True)

                metadata_df.update(supp_existing_df)
                metadata_df.reset_index(inplace=True)

    if args.add_all_stool_collections:
        metadata_df = add_all_stool_collections(metadata_df, study_trax_df,
                                                broad_sample_df)

    metadata_df['Actual Date of Receipt'] = pd.to_datetime(
        metadata_df['Actual Date of Receipt'])
    metadata_df['visit_num'] = metadata_df.apply(fill_visit_nums, axis=1)

    metadata_df['hbi_score'] = pd.to_numeric(metadata_df['hbi_score'])
    if 'Site' in metadata_df.columns.tolist():
        metadata_df['SiteName'] = metadata_df['Site']
    else:
        metadata_df['Site'] = metadata_df['SiteName']

    ## Couple small remaining changes
    metadata_df.loc[metadata_df.hbi_score > 900, 'hbi_score'] = None
    metadata_df.loc[metadata_df.consent_age > 150, 'consent_age'] = None
    metadata_df.loc[metadata_df['total_reads'].astype('str').str.startswith('PDO'),
                    'total_reads'] = None
    metadata_df['Research Project'] = "ibdmdb"

    metadata_df = generate_collection_statistics(metadata_df,
                                                 collection_dates_dict,
                                                 biopsy_date_map)
    metadata_df = add_baseline_metadata_values(metadata_df, study_trax_df,
                                               config.get('baseline_cols'))

    metadata_df[metadata_df['SiteName'].isnull()] = fix_site_name(
        metadata_df[metadata_df['SiteName'].isnull()])
    metadata_df = reorder_columns(metadata_df, config.get('col_order'))
    metadata_df.drop(['Site'], axis=1, inplace=True)

    metadata_df = metadata_df.sort_values(
        ['data_type', 'Participant ID', 'visit_num'])
    metadata_df.to_csv(metadata_file, index=False)
Example 11
def main(workflow):
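    ## DCC submission workflow: for each submitted data type, map samples to
    ## metadata and MD5 checksums, then create or update the corresponding
    ## study, subject, visit, sample, prep and sequence-set objects on the
    ## iHMP DCC via the cutlass session.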
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file)
    data_type_mapping = conf.get('datatype_mapping')

    manifest = parse_cfg_file(args.manifest_file)
    data_files = manifest.get('submitted_files')

    metadata_df = pd.read_csv(args.metadata_file)
    baseline_metadata_df = pd.read_csv(args.baseline_metadata_file)
    md5sums_map = {}

    ## For every set of data files we're going to want to iterate over each  
    ## section and process the data files one section at a time, handling 
    ## the unique pieces of metadata for each data type as needed.
    if data_files:
        username = conf.get('username')
        password = conf.get('password')
        session = cutlass.iHMPSession(username, password, ssl=False)

        dcc_objs = []
        dcc_project = dcc.get_project(conf, session)
        dcc_study = dcc.crud_study(conf, 
                                   session,
                                   dcc_project.id)
        dcc_subjects = dcc.group_osdf_objects(dcc_study.subjects(),
                                              'rand_subject_id')
        dcc_subjects = dcc.crud_subjects(dcc_subjects, dcc_study, baseline_metadata_df, conf)

        for data_type in data_files:
            dtype_metadata = conf.get(data_type)

            input_files = data_files.get(data_type, {}).get('input')
            output_files = data_files.get(data_type, {}).get('output')
            md5sums_file = data_files.get(data_type).get('md5sums_file')
            file_tags = data_files.get(data_type).get('tags', [])

            if md5sums_file:
                md5sums_map.update(parse_checksums_file(md5sums_file))
            else:
                raise ValueError("MD5 checksums file is required.")

            ## Getting data files from different sources means that the
            ## identifier we use to map a file to a piece of metadata may
            ## be different. We need to account for this and create a map
            ## of the 'universal ID' back to the specific file it references.
            id_cols = conf['metadata_id_mappings'][data_type]
            seq_fname_map = dcc.create_seq_fname_map(data_type, input_files, tags=file_tags)
 
            sample_ids = seq_fname_map.keys()
            dtype_name = data_type_mapping.get(data_type)
            id_col = conf['metadata_id_mappings'][data_type]

            sample_metadata_df = metadata_df[(metadata_df[id_col].isin(sample_ids)) &
                                             (metadata_df['data_type'] == dtype_name)]

            ## Keep track of any sample IDs that are missing from the metadata.
            missing_samples = set(sample_ids) - set(sample_metadata_df[id_col].tolist())

            ## In our proteomics dataset we occasionally see two datasets tied to the same
            ## sample so we need to do a little extra work to figure out which dataset
            ## we are working with.
            is_proteomics = (data_type == "proteomics")

            ## Add an extra column to our sample metadata with the corresponding sequence file 
            ## for easy access later on.
            #sample_metadata_df['seq_file'] = None
            sample_metadata_df = sample_metadata_df.apply(dcc.map_sample_id_to_file,
                                                          args=(id_col, seq_fname_map, 
                                                                is_proteomics),
                                                          axis=1)
            #sample_metadata_df = sample_metadata_df.dropna(axis=0, subset=['seq_file'])
            
            output_files_map = None
            if output_files:
                ## Build a map of output files keyed by sample for this data type.
                output_files_map = dcc.create_output_file_map(data_type, output_files, tags=file_tags)
    
            for (subject_id, metadata) in sample_metadata_df.groupby(['Participant ID']):
                dcc_subject = dcc_subjects.get(subject_id[1:])
                if dcc_subject:
                    dcc_subject = dcc_subject[0]
                else:
                    raise ValueError('Could not find Subject object for subject ID %s' % subject_id)                        

                dcc_visits = dcc.group_osdf_objects(dcc_subject.visits(),
                                                    'visit_id')
                
                for (idx, row) in metadata.iterrows():
                    dcc_visit = dcc.crud_visit(dcc_visits, 
                                               row['visit_num'],
                                               dcc_subject.id,
                                               data_type,
                                               row,
                                               conf)
                    dcc_visits.setdefault(dcc_visit.visit_id, []).append(dcc_visit)

                    dcc_samples = dcc.group_osdf_objects(dcc_visit.samples(),
                                                         'name')
                    dcc_sample = dcc.crud_sample(dcc_samples,
                                                 row.get('site_sub_coll'),
                                                 dcc_visit.id, 
                                                 conf,
                                                 row)

                    input_dcc_objs = []
                    if data_type == "MBX": 
                        dcc_prep = dcc.crud_host_assay_prep(dcc_sample, 
                                                            conf.get('data_study'),
                                                            data_type,
                                                            conf.get(data_type),
                                                            row)

                        ## MBX is a curious case in that we have multiple raw files (and
                        ## multiple outputs), so we'll need to handle all these files in
                        ## another loop. This needs refactoring to be more elegant in the
                        ## future.
                        ##
                        ## This is going to be a bit trickier than other data types because of that.
                        input_metabolomes = row.filter(like='metabolome')

                        for (metabolome_type, metabolome_file) in input_metabolomes.iteritems():
                            metabolome_fname = os.path.basename(metabolome_file[0])
                            analysis_type = metabolome_type.split('_', 1)[-1]

                            dcc_seq_obj = dcc.crud_metabolome(dcc_prep,
                                                              metabolome_file[0],
                                                              md5sums_map.get(metabolome_fname),
                                                              dcc_sample.name,
                                                              conf.get('data_study'),
                                                              dtype_metadata,
                                                              row)
                            input_dcc_objs.append(dcc_seq_obj)
                    elif data_type == "MPX":
                        #url_param = '_raw_url'
                        dcc_prep = dcc.crud_microb_assay_prep(dcc_sample,
                                                                conf.get('data_study'),
                                                                data_type,
                                                                dtype_metadata,
                                                                row)
                        dcc_seq_obj = dcc.crud_proteome(dcc_prep,
                                                        file_md5sum,
                                                        dcc_sample.name,
                                                        dtype_metadata,
                                                        row) 
                    elif data_type == "HTX":
                        dcc_prep = dcc.crud_host_seq_prep(dcc_sample,
                                                            conf.get('data_study'),
                                                            data_type,
                                                            dtype_metadata,
                                                            row)
                        dcc_seq_obj = dcc.crud_host_tx_raw_seq_set(dcc_prep,
                                                                    file_md5sum,
                                                                    dcc_sample.name,
                                                                    conf.get(data_type),
                                                                    row)
                    elif data_type == "HG":
                        dcc_prep =  dcc.crud_host_seq_prep(dcc_sample,
                                                            conf.get('data_study'),
                                                            data_type,
                                                            dtype_metadata,
                                                            row)
                        dcc_seq_obj = dcc.crud_host_wgs_raw_seq_set(dcc_prep,
                                                                    file_md5sum,
                                                                    dcc_sample.name,
                                                                    dtype_metadata,
                                                                    row)
                    elif data_type == "MTX":
                        mtx_raw_seq_set = row.get('microb_transcriptomics_raw_seq_set')
                        mtx_raw_fname = os.path.basename(mtx_raw_seq_set[0])

                        dcc_prep = dcc.crud_wgs_dna_prep(dcc_sample,
                                                         conf.get('data_study'),
                                                         data_type,
                                                         dtype_metadata,
                                                         row)
                        dcc_seq_obj = dcc.crud_microb_transcriptomics_raw_seq_set(dcc_prep,
                                                                                  mtx_raw_seq_set[0],
                                                                                  md5sums_map.get(mtx_raw_fname),
                                                                                  dcc_sample.name,
                                                                                  dtype_metadata,
                                                                                  row)
                    elif data_type == "MGX":
                        wgs_raw_seq_set = row.get('wgs_raw_seq_set')
                        wgs_raw_fname = os.path.basename(wgs_raw_seq_set[0])

                        dcc_prep = dcc.crud_wgs_dna_prep(dcc_sample,
                                                            conf.get('data_study'),
                                                            data_type,
                                                            dtype_metadata,
                                                            row)
                        dcc_seq_obj = dcc.crud_wgs_raw_seq_set(dcc_prep,
                                                               wgs_raw_seq_set[0],
                                                               md5sums_map.get(wgs_raw_fname),
                                                               dcc_sample.name,
                                                               dtype_metadata,
                                                               row)
                    elif data_type == "MVX":
                        raw_seq_set_fname = os.path.basename(row.get('wgs_raw_seq_set')[0])
                        viral_seq_set_fname = os.path.basename(row.get('viral_seq_set')[0])

                        dcc_prep = dcc.crud_wgs_dna_prep(dcc_sample,
                                                         conf.get('data_study'),
                                                         data_type,
                                                         dtype_metadata,
                                                         row) 
                        dcc_raw_seq_set = dcc.crud_wgs_raw_seq_set(dcc_prep,
                                                                   row.get('wgs_raw_seq_set')[0],
                                                                   md5sums_map.get(raw_seq_set_fname),
                                                                   dcc_sample.name,
                                                                   dtype_metadata,
                                                                   row)
                        dcc_viral_seq_set = dcc.crud_viral_seq_set(dcc_raw_seq_set,
                                                                   row.get('viral_seq_set')[0],
                                                                   md5sums_map.get(viral_seq_set_fname),
                                                                   dtype_metadata,
                                                                   row)
                        input_dcc_objs.extend([dcc_raw_seq_set, dcc_viral_seq_set])

                    elif data_type in ('16SBP', '16S'):
                        raw_seq_set_fname = os.path.basename(row.get('16S_raw_seq_set')[0])
                        trimmed_seq_set_fname = os.path.basename(row.get('16S_trimmed_seq_set')[0])

                        dcc_prep = dcc.crud_sixs_dna_prep(dcc_sample,
                                                          conf.get('data_study'),
                                                          data_type,
                                                          dtype_metadata,
                                                          row)
                        dcc_raw_seq_set = dcc.crud_sixs_raw_seq_set(dcc_prep,
                                                                    md5sums_map.get(raw_seq_set_fname),
                                                                    dtype_metadata,
                                                                    row)
                        dcc_trimmed_seq_set = dcc.crud_sixs_trimmed_seq_set(dcc_raw_seq_set,
                                                                            md5sums_map.get(trimmed_seq_set_fname),
                                                                            dtype_metadata,
                                                                            row)
                        input_dcc_objs.extend([dcc_raw_seq_set, dcc_trimmed_seq_set])
                    elif data_type == 'RRBS':
                        raw_epigenetics_seq_set = row.get('host_epigenetics_raw_seq_set')
                        raw_epigenetics_fname = os.path.basename(raw_epigenetics_seq_set[0])

                        dcc_prep = dcc.crud_host_seq_prep(dcc_sample,
                                                          conf.get('data_study'),
                                                          data_type,
                                                          dtype_metadata,
                                                          row)
                        dcc_seq_obj = dcc.crud_host_epigenetics_raw_seq_set(session,
                                                                            dcc_prep,
                                                                            raw_epigenetics_seq_set[0],
                                                                            md5sums_map.get(raw_epigenetics_fname),
                                                                            dcc_sample.name,
                                                                            conf.get('data_study'),
                                                                            conf.get(data_type),
                                                                            row)
                    elif data_type == 'SER':
                        dcc_prep = dcc.crud_host_assay_prep(dcc_sample, 
                                                            conf.get('data_study'),
                                                            data_type,
                                                            conf.get(data_type),
                                                            row)
                        dcc_seq_obj = dcc.crud_serology(session,
                                                        dcc_prep, 
                                                        file_md5sum,
                                                        dcc_sample.name,
                                                        conf.get('data_study'),
                                                        dtype_metadata,
                                                        row)

                    if len(input_dcc_objs) == 0:
                        input_dcc_objs.append(dcc_seq_obj)
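                    ## (The MVX and 16S branches above extend input_dcc_objs themselves;
                    ## this fallback covers the other data types, which each produce a
                    ## single seq object.)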

                    ##uploaded_files = upload_data_files(workflow, input_dcc_objs)

                    ## The only output type currently supported is AbundanceMatrices,
                    ## so those are the only ones we will work with. Short-sighted and
                    ## ugly, but we can rework this later.
                    if output_files_map and row.get('External ID') in output_files_map:
                        seq_out_files = output_files_map.get(row.get('External ID'))
                        dcc_output_objs = []

                        for (output_ftype, output_files) in seq_out_files.iteritems():
                            for output_file in output_files:
                                output_filename = os.path.basename(output_file)
                                output_md5sum = md5sums_map.get(output_filename)

                                if not output_md5sum:
                                    raise ValueError("Could not find md5sum for file", output_filename)

                                if data_type == "MBX" and file_tags:
                                    output_base = os.path.splitext(output_filename)[0]

                                    ## MBX data is a bit tricky since we can have multiple inputs
                                    ## and outputs that need to be threaded together (see the
                                    ## matching sketch after this example).
                                    analysis_type = output_base.split('_', 1)[-1]
                                    dcc_parent_obj = next((p for p in input_dcc_objs if analysis_type in p.urls[0]), None)
                                else:
                                    dcc_parent_obj = input_dcc_objs[-1]

                                ## We need a special case here when dealing with Host Genomes... 
                                ## TODO: Clean this up to make this a lot better...
                                if data_type == "HG":
                                    dcc_output_obj = dcc.crud_host_variant_call(session,
                                                                                dcc_parent_obj,
                                                                                output_file,
                                                                                output_md5sum,
                                                                                conf.get('data_study'),
                                                                                dtype_metadata,
                                                                                row)
                                else:
                                    dcc_output_obj = dcc.crud_abundance_matrix(session,
                                                                               dcc_parent_obj,
                                                                               output_file,
                                                                               output_md5sum,
                                                                               dcc_sample.name,
                                                                               conf.get('data_study'),
                                                                               dtype_metadata,
                                                                               row)

                                dcc_output_objs.append(dcc_output_obj)

                        uploaded_file = upload_data_files(workflow, dcc_output_objs)
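
The MBX branch above threads multiple inputs and outputs together by matching the analysis type embedded in an output file name against each input DCC object's URL. A minimal sketch of that matching pulled out on its own (the helper name is ours; input_dcc_objs and the urls attribute come from the example above):

import os

def match_mbx_parent(output_filename, input_dcc_objs):
    """Pick the input DCC object whose first URL mentions this output's analysis type."""
    output_base = os.path.splitext(output_filename)[0]
    analysis_type = output_base.split('_', 1)[-1]

    ## Mirrors the example above: returns None when no input object matches.
    return next((obj for obj in input_dcc_objs if analysis_type in obj.urls[0]), None)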
Example no. 12
0
def main(workflow):
    args = workflow.parse_args()

    conf = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    contaminate_db = conf.get('databases').get('knead_dna')

    if data_files and data_files.get('MGX'):
        input_files = data_files.get('MGX').get('input')
        pair_identifier = data_files.get('MGX').get('pair_identifier')
        file_extension = data_files.get('MGX', {}).get('input_extension',
                                                       '.fastq')

        sample_names = get_sample_names(input_files, file_extension)

        project_dirs = create_project_dirs([
            conf.get('deposition_dir'),
            conf.get('processing_dir'),
            conf.get('public_dir')
        ], project, creation_date, 'WGS')
        (deposition_dir, processing_dir, public_dir) = project_dirs
        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))

        manifest_file = stage_files(workflow, [args.manifest_file],
                                    base_depo_dir)
        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir,
                                      symlink=True)

        if file_extension == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files,
                                           processing_dir,
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier = "_R1"
        else:
            paired_end_seqs = input_files

        qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
        (cleaned_fastqs,
         read_counts) = quality_control(workflow,
                                        paired_end_seqs,
                                        '.fastq',
                                        processing_dir,
                                        qc_threads,
                                        contaminate_db,
                                        pair_identifier=pair_identifier,
                                        remove_intermediate_output=True)

        ## Generate taxonomic profile output. Outputs are stored in a list
        ## in the following order:
        ##
        ##      * Merged taxonomic profile
        ##      * Individual taxonomic files
        ##      * metaphlan2 SAM files
        tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
        tax_profile_outputs = taxonomic_profile(workflow, cleaned_fastqs,
                                                processing_dir, tax_threads,
                                                '.fastq')
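        ## For example (index layout follows the comment above):
        ##   tax_profile_outputs[0] -> merged taxonomic profile table
        ##   tax_profile_outputs[1] -> per-sample taxonomic profile files
        ##   tax_profile_outputs[2] -> metaphlan2 SAM files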

        ## Generate functional profile output using humann2. Outputs are
        ## the following:
        ##
        ##      * Merged normalized genefamilies
        ##      * Merged normalized ecs
        ##      * Merged normalized pathways
        ##      * Merged genefamilies
        ##      * Merged ecs
        ##      * Merged pathways
        func_threads = args.threads_humann if args.threads_humann else args.threads
        func_profile_outputs = functional_profile(
            workflow,
            cleaned_fastqs,
            '.fastq',
            processing_dir,
            func_threads,
            tax_profile_outputs[1],
            remove_intermediate_output=True)

        ## The current biobakery workflows do not generate KOs from our genefamilies,
        ## so we're going to want to do that ourselves.
        genefamilies = name_files(sample_names,
                                  os.path.join(processing_dir, 'metaphlan2'),
                                  subfolder='main',
                                  tag='genefamilies',
                                  extension='tsv')
        pathways = name_files(sample_names,
                              os.path.join(processing_dir, 'humann2'),
                              subfolder='main',
                              tag='pathabundance',
                              extension='tsv')
        ecs = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='ecs',
                         extension='tsv')
        kos = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='kos',
                         extension='tsv')

        #(merged_norm_kos, merged_kos) = generate_ko_files(workflow,
        #                                                  genefamilies,
        #                                                  processing_dir)
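        ## A minimal sketch (assumptions: humann2_regroup_table is available with the
        ## uniref90_ko utility mapping installed, and workflow.add_task follows the
        ## AnADAMA2 depends/targets convention) of how the KO tables named above could
        ## be regrouped from each genefamilies file; left commented out like the call above.
        #for (gene_file, ko_file) in zip(genefamilies, kos):
        #    workflow.add_task('humann2_regroup_table --input [depends[0]] '
        #                      '--groups uniref90_ko --output [targets[0]]',
        #                      depends=[gene_file],
        #                      targets=[ko_file])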

        biom_files = batch_convert_tsv_to_biom(workflow,
                                               tax_profile_outputs[1])
        tax_biom_files = stage_files(workflow, biom_files, processing_dir)

        kneaddata_log_files = name_files(sample_names,
                                         os.path.join(processing_dir,
                                                      'kneaddata'),
                                         subfolder='main',
                                         extension='log')

        pub_raw_dir = os.path.join(public_dir, 'raw')
        pub_tax_profile_dir = os.path.join(public_dir, 'tax_profile')
        pub_func_profile_dir = os.path.join(public_dir, 'func_profile')
        for pub_dir in [pub_raw_dir, pub_tax_profile_dir, pub_func_profile_dir]:
            create_folders(pub_dir)

        knead_read_counts = os.path.join(processing_dir, 'counts', 'merged',
                                         'kneaddata_read_count_table.tsv')

        tax_profile_pcl = add_metadata_to_tsv(
            workflow, [tax_profile_outputs[0]],
            args.metadata_file,
            'metagenomics',
            id_col=conf.get('metadata_id_col'),
            col_replace=conf.get('analysis_col_patterns'),
            target_cols=conf.get('target_metadata_cols'),
            aux_files=[knead_read_counts])
        func_profile_pcl = add_metadata_to_tsv(
            workflow, [func_profile_outputs[0]],
            args.metadata_file,
            'metagenomics',
            id_col=conf.get('metadata_id_col'),
            col_replace=conf.get('analysis_col_patterns'),
            target_cols=conf.get('target_metadata_cols'),
            aux_files=[knead_read_counts])

        pub_files = [
            stage_files(workflow, files, target_dir)
            for (files, target_dir) in [
                (cleaned_fastqs, pub_raw_dir),
                ([tax_profile_outputs[0]], pub_tax_profile_dir),
                (tax_profile_outputs[1], pub_tax_profile_dir),
                (tax_biom_files, pub_tax_profile_dir),
                (tax_profile_pcl, pub_tax_profile_dir),
                (func_profile_outputs, pub_func_profile_dir),
                (func_profile_pcl, pub_func_profile_dir),
                (kneaddata_log_files, pub_raw_dir)
            ]
        ]

        norm_genefamilies = name_files(sample_names,
                                       os.path.join(processing_dir, 'humann2',
                                                    'relab'),
                                       subfolder='genes',
                                       tag='genefamilies_relab',
                                       extension='tsv')
        norm_ecs_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2',
                                                 'relab'),
                                    subfolder='ecs',
                                    tag='ecs_relab',
                                    extension='tsv')
        norm_path_files = name_files(sample_names,
                                     os.path.join(processing_dir, 'humann2',
                                                  'relab'),
                                     subfolder='pathways',
                                     tag='pathabundance_relab',
                                     extension='tsv')
        norm_kos_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2',
                                                 'relab'),
                                    subfolder='kos_relab',
                                    extension='tsv')

        func_tar_files = []
        for (sample, gene_file, ecs_file,
             path_file) in zip(sample_names, norm_genefamilies, norm_ecs_files,
                               norm_path_files):
            tar_path = os.path.join(pub_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_profile_outputs)
            func_tar_files.append(func_tar_file)

        workflow.go()
Example no. 13
0
def main(args):
    ## First parse the metadata file
    metadata_df = pd.read_csv(args.metadata_file, dtype='object')
    metadata_conf = parse_cfg_file(args.config_file)
    conf_rename_cols = metadata_conf.get('col_rename')
    conf_recode_cols = metadata_conf.get('value_recode')

    ## Then parse the data dictionary
    dictionary_df = pd.read_excel(args.data_dictionary)

    ## Now let's create a lookup mapping coded column names to their
    ## human-readable names
    col_name_lookup = pd.Series(dictionary_df['Variable Name'].values,
                                index=dictionary_df['Code'].values)
    col_name_lookup = dict((k, v) for (k, v) in col_name_lookup.iteritems())

    value_field_lookup = {}
    dictionary_df[
        dictionary_df['Pick Lists  (Value, Missing, Name)'].notnull()].apply(
            populate_value_lookup, axis=1, args=(value_field_lookup, ))

    value_field_lookup['diagnosis'] = dict(
        (key.replace('.0', ''), val)
        for (key, val) in value_field_lookup['diagnosis'].iteritems())

    ## Replace coded values
    metadata_df.replace(value_field_lookup, inplace=True)

    ## Replace some other left-over yes/no fields
    replace_yes_no = {'0': 'No', '1': 'Yes', '0.0': 'No', '1.0': 'Yes'}
    replace_cols = [
        'bx_q31', 'bx_q33', 'bx_q35', 'i_q3', 'i_q4', 'i_q5', 'i_q6', 'i_q7',
        'i_q8', 'i_q9', 'i_q10', 'i_q11', 'i_q12', 'i_q13', 'i_q14', 'i_q15',
        'ic_q1', 'ic_q5', 'ic_q6', 'i_q16', 'i_q17', 'i_q18', 'i_q19', 'i_q20',
        'i_q21', 'i_q22', 'i_q23', 'i_q24', 'i_q25', 'i_q26', 'i_q27', 'i_q28',
        'i_q29', 'i_q30', 'i_q31', 'i_q32', 'i_q33', 'i_q34', 'i_q35', 'i_q36',
        'i_q37', 'i_q38', 'i_q39', 'i_q40', 'i_q41', 'i_q42', 'i_q43', 'i_q44',
        'i_q45', 'i_q46', 'i_q47', 'i_q48', 'i_q49', 'i_q50', 'bl_q12',
        'bl_q14', 'bl_q16', 'st_q1', 'st_q9', 'st_q19', 'st_q21', 'st_q23',
        'hbi_q2', 'hbi_q9', 'hbi_q10', 'hbi_q11', 'hbi_q12', 'hbi_q13',
        'hbi_q14', 'hbi_q15', 'hbi_q16', 'sccai_q2', 'sccai_q2', 'sccai_q11',
        'sccai_q12', 'sccai_q13', 'sccai_q14', 'ses_score2', 'mbs_q15',
        'dr_q2', 'dr_q2a', 'dr_q2b', 'dr_q2c', 'dr_q3', 'dr_q4', 'dr_q5',
        'dr_q6', 'dr_q7'
    ]

    for field in replace_cols:
        metadata_df[field].replace(replace_yes_no, inplace=True)

    for (field, replace_val) in conf_recode_cols.iteritems():
        metadata_df[field].replace(replace_val, inplace=True)

    ## Drop a couple of the confusing biopsy location columns
    drop_cols = [
        'bx_q8', 'bx_q10', 'bx_q16', 'bx_q18', 'bx_q24', 'bx_q26',
        'Site/Sub/Coll'
    ]
    metadata_df.drop(drop_cols, axis=1, inplace=True)

    ## We need to sanitize some of the disease location columns
    metadata_df['mc_q4'] = metadata_df['mc_q4'].apply(
        lambda x: x.replace(" ", "").split('(')[0] if pd.notnull(x) else x)
    metadata_df['mc_q7'] = metadata_df['mc_q7'].apply(
        lambda x: x.replace(" ", "").split('(')[0] if pd.notnull(x) else x)

    ## Rename and write out new CSV file
    col_name_lookup.update(conf_rename_cols)
    metadata_df.rename(columns=col_name_lookup, inplace=True)
    metadata_df.to_csv(args.output_file, index=False)
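
The argument parser for this metadata-cleaning example is not part of the excerpt; a minimal sketch (hypothetical option names, inferred only from the args attributes that main uses) of what it would need to provide:

import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Recode, rename, and clean study metadata.')
    parser.add_argument('--metadata-file', required=True,
                        help='CSV file of raw study metadata.')
    parser.add_argument('--config-file', required=True,
                        help='Config file supplying the col_rename and value_recode sections.')
    parser.add_argument('--data-dictionary', required=True,
                        help='Excel data dictionary mapping coded names to readable names.')
    parser.add_argument('--output-file', required=True,
                        help='Path for the cleaned CSV output.')
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())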