def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='MVX')
    knead_human_genome_db = conf.get('databases').get('knead_dna')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('MVX', {}).get('input'):
        input_files = data_files.get('MVX').get('input')
        input_file_ext = data_files.get('MVX').get('input_file_extension')
        pair_identifier = data_files.get('MVX').get('pair_identifier')

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           submission_date,
                                           'MVX')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)

        mvx_qc_output = shotgun.quality_control(workflow,
                                                input_files,
                                                project_dirs[1],
                                                args.threads,
                                                [knead_human_genome_db],
                                                pair_identifier=pair_identifier,
                                                remove_intermediate_output=True)

        paired_fastq_files = deinterleave_fastq(workflow,
                                                input_files,
                                                project_dirs[1])

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0], paired_fastq_files[1]):
            sample_name = sample_names(mate_1, input_file_ext,
                                       pair_identifier=pair_identifier)
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow,
                                         [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2],
                                         compress=False)
            paired_fastq_tars.append(paired_fastq_tar)

    workflow.go()
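For reference, a minimal sketch of the manifest structure this function (and the HG/MGX/MTX variants below) expects back from parse_cfg_file. Only the keys actually read above are shown; the paths, dates, and project name are hypothetical placeholders, not values from the original files.

# Hypothetical manifest contents as returned by parse_cfg_file(args.manifest_file);
# only the keys read above are shown and all values are illustrative.
example_manifest = {
    'project': 'HMP2',
    'submission_date': '2017-04-17',
    'submitted_files': {
        'MVX': {
            'input': ['/seq/ibdmdb/raw/sampleA.fastq',
                      '/seq/ibdmdb/raw/sampleB.fastq'],
            'input_file_extension': '.fastq',
            'pair_identifier': '_R1',
        },
    },
}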
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='MBX')

    manifest = parse_cfg_file(args.manifest_file)
    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    dataset_cfg = manifest.get('config')

    if data_files and data_files.get('MBX'):
        input_files = data_files.get('MBX').get('input')
        sample_names = get_sample_names(input_files)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           creation_date,
                                           'Metabolomics')
        (deposition_dir, processing_dir, public_dir) = project_dirs
        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))

        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir)

        # Our metabolite data are just a series of spreadsheets to which we
        # want to append some metadata. If they are Excel files we will want
        # to process them and convert them to CSV.
        processed_files = excel_to_csv(workflow,
                                       deposited_files,
                                       processing_dir)
        pcl_files = add_metadata_to_tsv(workflow,
                                        processed_files,
                                        args.metadata_file,
                                        'metabolomics',
                                        conf.get('metadata_id_col'),
                                        metadata_rows=dataset_cfg.get('metadata_rows'),
                                        col_offset=dataset_cfg.get('col_offset'),
                                        target_cols=conf.get('target_metadata_cols', None))

        public_files = stage_files(workflow, pcl_files, public_dir)

    workflow.go()
def main(args):
    metadata_df = pd.read_csv(args.metadata_file, dtype='str')
    input_analysis_df = pd.read_table(args.input_analysis_file, dtype='str')
    analysis_cols = input_analysis_df.columns.tolist()[1:]
    config = parse_cfg_file(args.config_file)

    metadata_subset_df = metadata_df[metadata_df['data_type'] == args.data_type]

    if (args.old_id not in metadata_df.columns
            or args.new_id not in metadata_df.columns):
        raise ValueError('Could not find current column identifier or new '
                         'column identifier in HMP2 metadata file.')

    (column_mapping, not_found) = get_column_mapping(metadata_subset_df,
                                                     analysis_cols,
                                                     args.old_id,
                                                     args.new_id,
                                                     args.data_type,
                                                     config,
                                                     args.no_tag)

    ## TODO: Deal with the not-found IDs here at some point.
    input_analysis_df.rename(columns=column_mapping, inplace=True)

    filter_cols = column_mapping.values()
    filter_cols.insert(0, input_analysis_df.columns[0])
    input_analysis_df = input_analysis_df.filter(filter_cols)

    input_analysis_df.to_csv(args.output_file, sep="\t", index=False, na_rep="NA")
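A minimal sketch of an argument parser that would supply the attributes this script reads (metadata_file, input_analysis_file, config_file, data_type, old_id, new_id, no_tag, output_file). The flag names and help text here are assumptions for illustration, not taken from the original source.

import argparse

def parse_cli_arguments():
    """Hypothetical CLI for the ID-remapping script above; only the attribute
    names are known from main(), the flags themselves are illustrative."""
    parser = argparse.ArgumentParser(description='Rename analysis table columns '
                                                 'from one HMP2 metadata ID to another.')
    parser.add_argument('--metadata-file', dest='metadata_file', required=True)
    parser.add_argument('--input-analysis-file', dest='input_analysis_file', required=True)
    parser.add_argument('--config-file', dest='config_file', required=True)
    parser.add_argument('--data-type', dest='data_type', required=True)
    parser.add_argument('--old-id', dest='old_id', required=True)
    parser.add_argument('--new-id', dest='new_id', required=True)
    parser.add_argument('--no-tag', dest='no_tag', action='store_true')
    parser.add_argument('--output-file', dest='output_file', required=True)
    return parser.parse_args()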
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='HG')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('HG', {}).get('input'):
        input_files = data_files.get('HG').get('input')

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           submission_date,
                                           'HG')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)

        fastq_files = bam_to_fastq(workflow,
                                   input_files,
                                   project_dirs[1],
                                   paired_end=True,
                                   threads=args.threads,
                                   compress=False)
        paired_fastq_files = paired_files(fastq_files, '_R1')

        paired_fastq_tars = []
        for (mate_1, mate_2) in zip(paired_fastq_files[0], paired_fastq_files[1]):
            sample_name = sample_names(mate_1, pair_identifier="_R1")
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow,
                                         [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2])
            paired_fastq_tars.append(paired_fastq_tar)

        md5sum_files = generate_md5_checksums(workflow, paired_fastq_tars)

    workflow.go()
def main(workflow):
    args = workflow.parse_args()
    config = parse_cfg_file(args.config_file)
    manifest = parse_cfg_file(args.manifest_file)

    if manifest.get('submitted_files'):
        data_files = manifest.get('submitted_files')

        ## Validate our metadata files
        validate_metadata_file(workflow, args.studytrax_metadata_file,
                               config.get('validators').get('studytrax'))
        validate_metadata_file(workflow, args.broad_sample_tracking_file,
                               config.get('validators').get('broad_sample_status'))
        validate_metadata_file(workflow, args.proteomics_metadata,
                               config.get('validators').get('proteomics'))

        metadata_file = None
        metadata_dir = os.path.join(config.get('public_dir'),
                                    config.get('project'),
                                    'metadata')

        new_metadata_files = generate_metadata_file(workflow,
                                                    config,
                                                    data_files,
                                                    args.studytrax_metadata_file,
                                                    args.broad_sample_tracking_file,
                                                    args.auxillary_metadata)
        metadata_file = merge_metadata_files(new_metadata_files, metadata_dir)

        if args.metadata_file:
            metadata_file = merge_metadata_files([args.metadata_file] + metadata_file,
                                                 metadata_dir)

        final_metadata_file = make_metadata_human_readable(metadata_file)
        validate_metadata_file(workflow, final_metadata_file,
                               config.get('validators').get('hmp2_metadata'))

        #pub_metadata_file = stage_files(workflow, metadata_file,
        #                                metadata_dir)

    workflow.go()
def main(args):
    config = parse_cfg_file(args.config_file, section=args.data_type)
    analysis_col_patterns = config.get('analysis_col_patterns')
    target_metadata_cols = funcy.flatten(config.get('target_metadata_cols'))
    data_type_label = config.get('data_type_mapping').get(args.data_type)

    output_pcl = add_metadata_to_tsv(args.input_file,
                                     args.metadata_file,
                                     data_type_label,
                                     args.id_col,
                                     analysis_col_patterns,
                                     args.drop_missing_cols,
                                     target_metadata_cols,
                                     supplement=args.supplement)
def main(args):
    manifest = parse_cfg_file(args.input_manifest)
    analysis_cfg = parse_cfg_file(args.config_file)
    mapping_cols = analysis_cfg.get('mapping_columns')
    data_files = manifest.get('submitted_files')

    studytrax_df = pd.read_csv(args.studytrax_metadata, dtype='str')
    broad_sample_df = pd.read_csv(args.broad_sample_sheet, dtype='str')

    # When some samples don't map to their expected columns we run them
    # through every sample ID column to see if we can find a hit.
    clinical_search_cols = ['st_q13', 'st_q12', 'st_q10', 'st_q4', 'st_q17',
                            'st_q11']
    broad_search_cols = ['Viromics', 'MbX', 'Proteomics', 'Parent Sample A',
                         'Parent Sample B', 'DNA/RNA']

    if data_files:
        for (dtype, dtype_cols) in [('MVX', mapping_cols.get('MVX')),
                                    ('HTX', mapping_cols.get('HTX')),
                                    ('RRBS', mapping_cols.get('RRBS')),
                                    ('SER', mapping_cols.get('SER')),
                                    ('HG', mapping_cols.get('HG'))]:
            if not data_files.get(dtype):
                continue

            samples = set([os.path.splitext(os.path.basename(sample_id))[0]
                           for sample_id in data_files.get(dtype).get('input')])
            if not samples:
                continue

            dtype_cols = funcy.flatten(dtype_cols)
            found_samples = check_sample_mapping(samples, studytrax_df, dtype_cols)

            print "Correctly mapped %s samples" % len(found_samples)

            missing_samples = samples - found_samples
            print "%s samples were not mapped" % len(missing_samples)
            print

            if len(missing_samples) == 0:
                continue

            # Sometimes a combination of characters was recorded incorrectly
            # (i.e. 1 becomes I or O becomes 0), so we isolate and test for
            # each of these swaps.
            for (orig_char, replace_char) in [('1', 'I'), ('O', '0'),
                                              ('I', '1'), ('0', 'O'),
                                              ('SM-', 'SM')]:
                mod_samples = map(lambda s: s.replace(orig_char, replace_char),
                                  missing_samples)
                found_samples = check_sample_mapping(mod_samples,
                                                     studytrax_df,
                                                     dtype_cols)
                if found_samples:
                    print ("Found %s more samples after replacing character %s "
                           "with %s in sample IDs:" % (len(found_samples),
                                                       orig_char, replace_char))
                    print "\n".join(found_samples)
                    print

                    found_samples = map(lambda s: s.replace(replace_char, orig_char),
                                        found_samples)
                    missing_samples = missing_samples - set(found_samples)

            for col in clinical_search_cols:
                found_samples = check_sample_mapping(missing_samples,
                                                     studytrax_df,
                                                     [col])
                if found_samples:
                    print
                    print ("Found %s samples for data type %s in incorrect "
                           "column %s:" % (len(found_samples), dtype, col))
                    print "\n".join(list(found_samples))
                    print

                    missing_samples = missing_samples - found_samples

            # For Viromics data we can also reference the Broad tracking sheet.
            if dtype == 'MVX':
                for col in broad_search_cols:
                    found_samples = check_sample_mapping(missing_samples,
                                                         broad_sample_df,
                                                         [col])
                    if found_samples:
                        print
                        print ("Found %s samples for data type %s in Broad "
                               "tracking sheet, column %s:" % (len(found_samples),
                                                               dtype, col))
                        print "\n".join(list(found_samples))
                        print

                        missing_samples = missing_samples - found_samples

            print "Final missing samples: %s" % (len(missing_samples))
            print "\n".join(missing_samples)
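The helper check_sample_mapping is assumed here to take a collection of sample IDs, a metadata DataFrame, and a list of candidate ID columns, and to return the subset of IDs found in any of those columns (a set, since the caller subtracts it from missing_samples and joins it for printing). A minimal sketch under that assumption; the real helper in the project may differ:

def check_sample_mapping(samples, metadata_df, id_columns):
    """Return the subset of `samples` present in any of `id_columns` of
    `metadata_df`. Illustrative sketch only."""
    found = set()
    for col in id_columns:
        if col not in metadata_df.columns:
            continue
        known_ids = set(metadata_df[col].dropna().astype(str))
        found.update(sample for sample in samples if sample in known_ids)
    return found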
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='16S')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')

    gg_tax = conf.get('databases').get('gg_taxonomy')
    gg_usearch = conf.get('databases').get('gg_usearch')
    gg_fasta = conf.get('databases').get('gg_fasta')

    if data_files and data_files.get('16S', {}).get('input'):
        input_files = data_files.get('16S').get('input')
        input_extension = data_files.get('16S').get('file_extension')
        barcode_file = data_files.get('16S').get('barcode_file')
        pair_identifier = data_files.get('16S').get('pair_identifier')
        index_identifier = data_files.get('16S').get('index_identifier')

        if index_identifier:
            index_files = [in_file for in_file in input_files
                           if index_identifier in in_file]
            input_files = set(input_files) - set(index_files)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           creation_date,
                                           '16S')

        base_depo_dir = os.path.abspath(os.path.join(project_dirs[0], '..'))
        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        sequence_files = stage_files(workflow,
                                     input_files,
                                     project_dirs[0],
                                     symlink=True)

        # An entry point into this pipeline is any analysis conducted by
        # Baylor/CMMR, which requires a slight branching of the pipeline.
        # We make a very fraught assumption here: if only one sequence file
        # is passed in alongside a centroid FASTA file, we are dealing with
        # files generated by CMMR.
        otu_table = data_files.get('16S').get('otu_table')
        centroid_fasta = data_files.get('16S').get('centroid_fasta')

        if otu_table and centroid_fasta:
            # With CMMR-generated data we expect a single, already merged FASTQ.
            if len(sequence_files) == 1:
                merged_fastq = sequence_files[0]

            fixed_otu_table = fix_CMMR_OTU_table_taxonomy_labels(workflow,
                                                                 otu_table,
                                                                 project_dirs[1])
        else:
            if barcode_file:
                sequence_files = demultiplex(workflow,
                                             input_files,
                                             project_dirs[1],
                                             barcode_file,
                                             index_files,
                                             conf.get('min_pred_qc_score'),
                                             pair_identifier)

            merged_fastq = merge_samples_and_rename(workflow,
                                                    sequence_files,
                                                    input_extension,
                                                    project_dirs[1],
                                                    pair_identifier,
                                                    args.threads)

        qc_fasta_outs = quality_control(workflow,
                                        merged_fastq,
                                        project_dirs[1],
                                        args.threads,
                                        conf.get('maxee'),
                                        conf.get('min_trunc_len_max'))

        if not otu_table:
            closed_ref_tsv = taxonomic_profile(workflow,
                                               qc_fasta_outs[0],
                                               qc_fasta_outs[1],
                                               qc_fasta_outs[2],
                                               project_dirs[1],
                                               args.threads,
                                               conf.get('percent_identity'),
                                               gg_usearch,
                                               gg_fasta,
                                               gg_tax,
                                               conf.get('min_size'))

            predict_metagenomes_tsv = functional_profile(workflow,
                                                         closed_ref_tsv,
                                                         project_dirs[1])

    workflow.go()
def main(workflow):
    args = workflow.parse_args()
    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files_mtx,
                                           project_dirs_mtx[1],
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier_mtx = "_R1"
        else:
            paired_end_seqs = deposited_files_mtx

        adapter_trim_opts = None
        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db,
                                                                           rrna_db,
                                                                           mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files, but we also
        # have two other scenarios:
        #
        #   1.) No accompanying metagenomic sequences exist; in this case we
        #       proceed using just the metatranscriptomic data.
        #   2.) Taxonomic profiles are passed in directly via our MANIFEST
        #       file; here we remove these from our input files and prevent
        #       them from running through the kneaddata -> metaphlan2
        #       portions of our pipeline.
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs_mgx = bam_to_fastq(workflow,
                                                       deposited_files_mgx,
                                                       project_dirs_mgx[1],
                                                       paired_end=True,
                                                       compress=False,
                                                       threads=args.threads)
                    pair_identifier_mgx = "_R1"
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx,
                                                       pair_identifier_mgx)

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db, rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')
                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                map(create_folders, [pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                     pub_wgs_func_profile_dir])

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                   project_dirs_mgx[1],
                                                   subfolder='genes',
                                                   tag='genefamilies_relab',
                                                   extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                 project_dirs_mgx[1],
                                                 subfolder='pathways',
                                                 tag='pathabundance_relab',
                                                 extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))

                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir,
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                              [gene_file, ecs_file, path_file],
                                              tar_path,
                                              depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can match cleaned MTX files to
        # corresponding MGX taxonomic profiles. If these exist we run
        # functional profiling with the corresponding MGX taxonomic profile;
        # otherwise we run taxonomic profiling on the MTX sequences and use
        # the resulting taxonomic profile for functional profiling.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col',
                                                                                               'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id',
                                                                                               'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run
            # them through the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            # Combine the matched and unmatched functional profiling outputs.
            func_outs_mtx = list(func_outs_mtx) + list(func_outs_match_mtx)
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        map(create_folders, [pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                             pub_mtx_func_profile_dir])

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)

    workflow.go()
def main(args):
    config = parse_cfg_file(args.config)
    study_trax_df = pd.read_csv(args.studytrax_metadata, dtype='str')
    broad_sample_df = pd.read_csv(args.broad_sample_tracking,
                                  na_values=['destroyed', 'missed'],
                                  parse_dates=['Actual Date of Receipt'])
    proteomics_df = None
    metadata_df = None
    new_metadata_df = None

    date_today = datetime.date.today()
    metadata_file = os.path.join(args.output_dir,
                                 'hmp2_metadata_%s.csv' % date_today)

    ## Before we filter our metadata rows down to just the rows associated
    ## with the files we have present, we'll want a list of all the
    ## collection dates.
    collection_dates_dict = get_collection_dates(broad_sample_df)

    biopsy_date_map = None
    if args.proteomics_metadata:
        proteomics_df = pd.read_table(args.proteomics_metadata)
    if args.biopsy_dates:
        biopsy_date_map = parse_biopsy_dates(args.biopsy_dates)

    ## The update procedure assumes either that we have an existing metadata
    ## file that we are going to be appending to/updating, or that we are
    ## creating a fresh metadata sheet and will be adding the files in the
    ## manifest file to it.

    ## TODO: This needs to be re-worked to account for snagging datatypes as well.
    #if not args.metadata_file or args.refresh_all:
    #    sequence_files.extend(get_all_sequence_files(config.get('deposition_dir'),
    #                                                 config.get('input_extensions')))

    if args.manifest_file:
        manifest = parse_cfg_file(args.manifest_file)
        submitted_files = manifest.get('submitted_files')

        if submitted_files:
            new_metadata = []

            for (dtype, items) in submitted_files.iteritems():
                input_files = items.get('input')
                pair_identifier = items.get('pair_identifier')

                if pair_identifier:
                    (input_pair1, input_pair2) = bb_utils.paired_files(input_files,
                                                                       pair_identifier)
                    input_files = input_pair1 if input_pair1 else input_files

                new_metadata.append(get_metadata_rows(config,
                                                      study_trax_df,
                                                      broad_sample_df,
                                                      proteomics_df,
                                                      dtype,
                                                      input_files,
                                                      pair_identifier))

            new_metadata_df = pd.concat(new_metadata, ignore_index=True)
            #new_metadata_df[new_metadata_df['External ID'].isnull()] = None
            new_metadata_df['Site/Sub/Coll ID'] = new_metadata_df['Site/Sub/Coll'].map(
                lambda sid: str(sid))
            #new_metadata_df['Participant ID'] = new_metadata_df['Subject'].map(lambda subj: 'C' + str(subj))

            if 'Collection #' in new_metadata_df.columns:
                new_metadata_df['visit_num'] = new_metadata_df['Collection #']

            new_metadata_df['Project'] = new_metadata_df.apply(get_project_id, axis=1)
            new_metadata_df['ProjectSpecificID'] = pd.to_numeric(
                new_metadata_df['ProjectSpecificID'])
            new_metadata_df['Site'] = new_metadata_df['SiteName']
            new_metadata_df = new_metadata_df.apply(generate_external_id, axis=1)
            new_metadata_df = remove_columns(new_metadata_df, config.get('drop_cols'))

    if args.metadata_file:
        metadata_df = pd.read_csv(args.metadata_file,
                                  parse_dates=['Actual Date of Receipt'])
        site_mapping = config.get('site_map')
        metadata_df['Site/Sub/Coll ID'] = metadata_df.apply(fix_site_sub_coll_id,
                                                            args=(site_mapping,),
                                                            axis=1)
        metadata_df['PDO Number'] = metadata_df.apply(get_pdo_number, axis=1)

        if new_metadata_df is not None and not new_metadata_df.empty:
            metadata_df = pd.concat([metadata_df, new_metadata_df],
                                    ignore_index=True)
            metadata_df = metadata_df.drop_duplicates(subset=['External ID',
                                                              'Site/Sub/Coll ID',
                                                              'data_type'],
                                                      keep='last')
    else:
        metadata_df = new_metadata_df

    metadata_df[metadata_df['External ID'].isnull()] = metadata_df[
        metadata_df['External ID'].isnull()].apply(generate_external_id, axis=1)

    if args.auxillary_metadata:
        for aux_file in args.auxillary_metadata:
            supp_df = pd.read_table(aux_file)
            supp_columns = supp_df.columns.tolist()

            idx_offset = 1
            if 'data_type' in supp_columns:
                join_id = supp_columns[:1] + ['data_type']
                idx_offset = 2
            else:
                join_id = supp_columns[0]

            ## We need to do this in two stages. If the columns already exist
            ## here we want to update them; if they do not exist we append
            ## them.
            metadata_cols = metadata_df.columns.tolist()
            new_cols = set(supp_columns[idx_offset:]) - set(metadata_cols)
            existing_cols = set(supp_columns[idx_offset:]).intersection(metadata_cols)

            if new_cols:
                supp_new_df = supp_df.filter(items=supp_columns[:idx_offset] +
                                             list(new_cols))
                metadata_df = metadata_df.merge(supp_new_df,
                                                how='left',
                                                on=join_id)

            if existing_cols:
                supp_existing_df = supp_df.filter(items=supp_columns[:idx_offset] +
                                                  list(existing_cols))
                metadata_df.set_index(join_id, inplace=True)
                supp_existing_df.set_index(join_id, inplace=True)
                metadata_df.update(supp_existing_df)
                metadata_df.reset_index(inplace=True)

    if args.add_all_stool_collections:
        metadata_df = add_all_stool_collections(metadata_df,
                                                study_trax_df,
                                                broad_sample_df)

    metadata_df['Actual Date of Receipt'] = pd.to_datetime(
        metadata_df['Actual Date of Receipt'])
    metadata_df['visit_num'] = metadata_df.apply(fill_visit_nums, axis=1)
    metadata_df['hbi_score'] = pd.to_numeric(metadata_df['hbi_score'])

    if 'Site' in metadata_df.columns.tolist():
        metadata_df['SiteName'] = metadata_df['Site']
    else:
        metadata_df['Site'] = metadata_df['SiteName']

    ## A couple of small remaining changes
    metadata_df.ix[metadata_df.hbi_score > 900, 'hbi_score'] = None
    metadata_df.ix[metadata_df.consent_age > 150, 'consent_age'] = None
    metadata_df['total_reads'].loc[metadata_df['total_reads'].astype('str')
                                   .str.startswith('PDO')] = None
    metadata_df['Research Project'] = "ibdmdb"

    metadata_df = generate_collection_statistics(metadata_df,
                                                 collection_dates_dict,
                                                 biopsy_date_map)
    metadata_df = add_baseline_metadata_values(metadata_df,
                                               study_trax_df,
                                               config.get('baseline_cols'))
    metadata_df[metadata_df['SiteName'].isnull()] = fix_site_name(
        metadata_df[metadata_df['SiteName'].isnull()])

    metadata_df = reorder_columns(metadata_df, config.get('col_order'))
    metadata_df.drop(['Site'], 1, inplace=True)
    metadata_df = metadata_df.sort_values(['data_type', 'Participant ID',
                                           'visit_num'])
    metadata_df.to_csv(metadata_file, index=False)
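The two-stage auxiliary-metadata handling above (merge brand-new columns, update values of columns that already exist) can be seen on a toy example. The column names and values here are made up purely for illustration.

import pandas as pd

# Toy frames standing in for metadata_df and one auxiliary file (supp_df).
metadata = pd.DataFrame({'External ID': ['S1', 'S2'],
                         'reads_raw': [100, 200]})
supplement = pd.DataFrame({'External ID': ['S1', 'S2'],
                           'reads_raw': [150, 250],       # existing column -> update
                           'reads_filtered': [90, 180]})  # new column -> merge

join_id = 'External ID'
new_cols = set(supplement.columns) - set(metadata.columns) - set([join_id])
existing_cols = (set(supplement.columns) & set(metadata.columns)) - set([join_id])

# Stage 1: append columns the metadata sheet has never seen.
metadata = metadata.merge(supplement[[join_id] + list(new_cols)],
                          how='left', on=join_id)

# Stage 2: overwrite values for columns that already exist.
metadata = metadata.set_index(join_id)
metadata.update(supplement.set_index(join_id)[list(existing_cols)])
metadata = metadata.reset_index()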
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file)
    data_type_mapping = conf.get('datatype_mapping')

    manifest = parse_cfg_file(args.manifest_file)
    data_files = manifest.get('submitted_files')

    metadata_df = pd.read_csv(args.metadata_file)
    baseline_metadata_df = pd.read_csv(args.baseline_metadata_file)
    md5sums_map = {}

    ## For every set of data files we're going to want to iterate over each
    ## section and process the data files one section at a time, handling
    ## the unique pieces of metadata for each data type as needed.
    if data_files:
        username = conf.get('username')
        password = conf.get('password')
        session = cutlass.iHMPSession(username, password, ssl=False)

        dcc_objs = []
        dcc_project = dcc.get_project(conf, session)
        dcc_study = dcc.crud_study(conf, session, dcc_project.id)
        dcc_subjects = dcc.group_osdf_objects(dcc_study.subjects(),
                                              'rand_subject_id')
        dcc_subjects = dcc.crud_subjects(dcc_subjects, dcc_study,
                                         baseline_metadata_df, conf)

        for data_type in data_files:
            dtype_metadata = conf.get(data_type)
            input_files = data_files.get(data_type, {}).get('input')
            output_files = data_files.get(data_type, {}).get('output')
            md5sums_file = data_files.get(data_type).get('md5sums_file')
            file_tags = data_files.get(data_type).get('tags', [])

            if md5sums_file:
                md5sums_map.update(parse_checksums_file(md5sums_file))
            else:
                raise ValueError("MD5 checksums file is required.")

            ## Getting data files from different sources means that the
            ## identifier we use to map a file to a piece of metadata may be
            ## different. We need to account for this and create a map of the
            ## 'universal ID' back to the specific file it references.
            id_col = conf['metadata_id_mappings'][data_type]
            seq_fname_map = dcc.create_seq_fname_map(data_type, input_files,
                                                     tags=file_tags)
            sample_ids = seq_fname_map.keys()
            dtype_name = data_type_mapping.get(data_type)

            sample_metadata_df = metadata_df[(metadata_df[id_col].isin(sample_ids)) &
                                             (metadata_df['data_type'] == dtype_name)]

            ## Just in case there are samples that did not map to a metadata row
            missing_samples = set(sample_ids) - set(sample_metadata_df[id_col].tolist())

            ## In our proteomics dataset we occasionally see two datasets tied
            ## to the same sample, so we need to do a little extra work to
            ## figure out which dataset we are working with.
            is_proteomics = True if data_type == "proteomics" else False

            ## Add an extra column to our sample metadata with the
            ## corresponding sequence file for easy access later on.
            #sample_metadata_df['seq_file'] = None
            sample_metadata_df = sample_metadata_df.apply(dcc.map_sample_id_to_file,
                                                          args=(id_col,
                                                                seq_fname_map,
                                                                is_proteomics),
                                                          axis=1)
            #sample_metadata_df = sample_metadata_df.dropna(axis=0, subset=['seq_file'])

            output_files_map = None
            if output_files:
                ## Do a bunch of stuff here since we have output files
                output_files_map = dcc.create_output_file_map(data_type,
                                                              output_files,
                                                              tags=file_tags)

            for (subject_id, metadata) in sample_metadata_df.groupby(['Participant ID']):
                dcc_subject = dcc_subjects.get(subject_id[1:])
                if dcc_subject:
                    dcc_subject = dcc_subject[0]
                else:
                    raise ValueError('Could not find Subject object for subject '
                                     'ID %s' % subject_id)

                dcc_visits = dcc.group_osdf_objects(dcc_subject.visits(), 'visit_id')

                for (idx, row) in metadata.iterrows():
                    dcc_visit = dcc.crud_visit(dcc_visits, row['visit_num'],
                                               dcc_subject.id, data_type, row, conf)
                    dcc_visits.setdefault(dcc_visit.visit_id, []).append(dcc_visit)

                    dcc_samples = dcc.group_osdf_objects(dcc_visit.samples(), 'name')
                    dcc_sample = dcc.crud_sample(dcc_samples, row.get('site_sub_coll'),
                                                 dcc_visit.id, conf, row)

                    input_dcc_objs = []

                    if data_type == "MBX":
                        dcc_prep = dcc.crud_host_assay_prep(dcc_sample,
                                                            conf.get('data_study'),
                                                            data_type,
                                                            conf.get(data_type),
                                                            row)

                        ## MBX is a curious case in that we have multiple raw
                        ## files (and multiple outputs), so we'll need to handle
                        ## all of these files in another loop. This is going to
                        ## be a bit trickier than the other data types because
                        ## of that; I need to refactor this to be more elegant
                        ## in the future.
                        input_metabolomes = row.filter(like='metabolome')
                        for (metabolome_type, metabolome_file) in input_metabolomes.iteritems():
                            metabolome_fname = os.path.basename(metabolome_file[0])
                            analysis_type = metabolome_type.split('_', 1)[-1]
                            dcc_seq_obj = dcc.crud_metabolome(dcc_prep,
                                                              metabolome_file[0],
                                                              md5sums_map.get(metabolome_fname),
                                                              dcc_sample.name,
                                                              conf.get('data_study'),
                                                              dtype_metadata,
                                                              row)
                            input_dcc_objs.append(dcc_seq_obj)
                    elif data_type == "MPX":
                        #url_param = '_raw_url'
                        dcc_prep = dcc.crud_microb_assay_prep(dcc_sample,
                                                              conf.get('data_study'),
                                                              data_type,
                                                              dtype_metadata,
                                                              row)
                        dcc_seq_obj = dcc.crud_proteome(dcc_prep, file_md5sum,
                                                        dcc_sample.name,
                                                        dtype_metadata, row)
                    elif data_type == "HTX":
                        dcc_prep = dcc.crud_host_seq_prep(dcc_sample,
                                                          conf.get('data_study'),
                                                          data_type,
                                                          dtype_metadata,
                                                          row)
                        dcc_seq_obj = dcc.crud_host_tx_raw_seq_set(dcc_prep,
                                                                   file_md5sum,
                                                                   dcc_sample.name,
                                                                   conf.get(data_type),
                                                                   row)
                    elif data_type == "HG":
                        dcc_prep = dcc.crud_host_seq_prep(dcc_sample,
                                                          conf.get('data_study'),
                                                          data_type,
                                                          dtype_metadata,
                                                          row)
                        dcc_seq_obj = dcc.crud_host_wgs_raw_seq_set(dcc_prep,
                                                                    file_md5sum,
                                                                    dcc_sample.name,
                                                                    dtype_metadata,
                                                                    row)
                    elif data_type == "MTX":
                        mtx_raw_seq_set = row.get('microb_transcriptomics_raw_seq_set')
                        mtx_raw_fname = os.path.basename(mtx_raw_seq_set[0])
                        dcc_prep = dcc.crud_wgs_dna_prep(dcc_sample,
                                                         conf.get('data_study'),
                                                         data_type,
                                                         dtype_metadata,
                                                         row)
                        dcc_seq_obj = dcc.crud_microb_transcriptomics_raw_seq_set(dcc_prep,
                                                                                  mtx_raw_seq_set[0],
                                                                                  md5sums_map.get(mtx_raw_fname),
                                                                                  dcc_sample.name,
                                                                                  dtype_metadata,
                                                                                  row)
                    elif data_type == "MGX":
                        wgs_raw_seq_set = row.get('wgs_raw_seq_set')
                        wgs_raw_fname = os.path.basename(wgs_raw_seq_set[0])
                        dcc_prep = dcc.crud_wgs_dna_prep(dcc_sample,
                                                         conf.get('data_study'),
                                                         data_type,
                                                         dtype_metadata,
                                                         row)
                        dcc_seq_obj = dcc.crud_wgs_raw_seq_set(dcc_prep,
                                                               wgs_raw_seq_set[0],
                                                               md5sums_map.get(wgs_raw_fname),
                                                               dcc_sample.name,
                                                               dtype_metadata,
                                                               row)
                    elif data_type == "MVX":
                        raw_seq_set_fname = os.path.basename(row.get('wgs_raw_seq_set')[0])
                        viral_seq_set_fname = os.path.basename(row.get('viral_seq_set')[0])
                        dcc_prep = dcc.crud_wgs_dna_prep(dcc_sample,
                                                         conf.get('data_study'),
                                                         data_type,
                                                         dtype_metadata,
                                                         row)
                        dcc_raw_seq_set = dcc.crud_wgs_raw_seq_set(dcc_prep,
                                                                   row.get('wgs_raw_seq_set')[0],
                                                                   md5sums_map.get(raw_seq_set_fname),
                                                                   dcc_sample.name,
                                                                   dtype_metadata,
                                                                   row)
                        dcc_viral_seq_set = dcc.crud_viral_seq_set(dcc_raw_seq_set,
                                                                   row.get('viral_seq_set')[0],
                                                                   md5sums_map.get(viral_seq_set_fname),
                                                                   dtype_metadata,
                                                                   row)
                        input_dcc_objs.extend([dcc_raw_seq_set, dcc_viral_seq_set])
                    elif data_type == '16SBP' or data_type == "16S":
                        raw_seq_set_fname = os.path.basename(row.get('16S_raw_seq_set')[0])
                        trimmed_seq_set_fname = os.path.basename(row.get('16S_trimmed_seq_set')[0])
                        dcc_prep = dcc.crud_sixs_dna_prep(dcc_sample,
                                                          conf.get('data_study'),
                                                          data_type,
                                                          dtype_metadata,
                                                          row)
                        dcc_raw_seq_set = dcc.crud_sixs_raw_seq_set(dcc_prep,
                                                                    md5sums_map.get(raw_seq_set_fname),
                                                                    dtype_metadata,
                                                                    row)
                        dcc_trimmed_seq_set = dcc.crud_sixs_trimmed_seq_set(dcc_raw_seq_set,
                                                                            md5sums_map.get(trimmed_seq_set_fname),
                                                                            dtype_metadata,
                                                                            row)
                        input_dcc_objs.extend([dcc_raw_seq_set, dcc_trimmed_seq_set])
                    elif data_type == 'RRBS':
                        raw_epigenetics_seq_set = row.get('host_epigenetics_raw_seq_set')
                        raw_epigenetics_fname = os.path.basename(raw_epigenetics_seq_set[0])
                        dcc_prep = dcc.crud_host_seq_prep(dcc_sample,
                                                          conf.get('data_study'),
                                                          data_type,
                                                          dtype_metadata,
                                                          row)
                        dcc_seq_obj = dcc.crud_host_epigenetics_raw_seq_set(session,
                                                                            dcc_prep,
                                                                            raw_epigenetics_seq_set[0],
                                                                            md5sums_map.get(raw_epigenetics_fname),
                                                                            dcc_sample.name,
                                                                            conf.get('data_study'),
                                                                            conf.get(data_type),
                                                                            row)
                    elif data_type == 'SER':
                        dcc_prep = dcc.crud_host_assay_prep(dcc_sample,
                                                            conf.get('data_study'),
                                                            data_type,
                                                            conf.get(data_type),
                                                            row)
                        dcc_seq_obj = dcc.crud_serology(session, dcc_prep,
                                                        file_md5sum,
                                                        dcc_sample.name,
                                                        conf.get('data_study'),
                                                        dtype_metadata,
                                                        row)

                    if len(input_dcc_objs) == 0:
                        input_dcc_objs.append(dcc_seq_obj)

                    ##uploaded_files = upload_data_files(workflow, input_dcc_objs)

                    ## The only output type currently supported is
                    ## AbundanceMatrices, so those are the only ones we will
                    ## work with. Short-sighted and ugly, but we can re-work
                    ## this later.
                    if output_files_map and row.get('External ID') in output_files_map:
                        seq_out_files = output_files_map.get(row.get('External ID'))

                        dcc_output_objs = []
                        for (output_ftype, output_files) in seq_out_files.iteritems():
                            for output_file in output_files:
                                output_filename = os.path.basename(output_file)
                                output_md5sum = md5sums_map.get(output_filename)

                                if not output_md5sum:
                                    raise ValueError("Could not find md5sum for file",
                                                     output_filename)

                                if data_type == "MBX" and file_tags:
                                    ## MBX data is a bit tricky since we can have
                                    ## multiple inputs and outputs that need to be
                                    ## threaded together.
                                    output_base = os.path.splitext(output_filename)[0]
                                    analysis_type = output_base.split('_', 1)[-1]
                                    dcc_parent_obj = next((p for p in input_dcc_objs
                                                           if analysis_type in p.urls[0]),
                                                          None)
                                else:
                                    dcc_parent_obj = input_dcc_objs[-1]

                                ## We need a special case here when dealing with Host Genomes...
                                ## TODO: Clean this up to make this a lot better...
                                if data_type == "HG":
                                    dcc_output_obj = dcc.crud_host_variant_call(session,
                                                                                dcc_parent_obj,
                                                                                output_file,
                                                                                output_md5sum,
                                                                                conf.get('data_study'),
                                                                                dtype_metadata,
                                                                                row)
                                else:
                                    dcc_output_obj = dcc.crud_abundance_matrix(session,
                                                                               dcc_parent_obj,
                                                                               output_file,
                                                                               output_md5sum,
                                                                               dcc_sample.name,
                                                                               conf.get('data_study'),
                                                                               dtype_metadata,
                                                                               row)

                                dcc_output_objs.append(dcc_output_obj)

                        uploaded_file = upload_data_files(workflow, dcc_output_objs)
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')

    contaminate_db = conf.get('databases').get('knead_dna')

    if data_files and data_files.get('MGX'):
        input_files = data_files.get('MGX').get('input')
        pair_identifier = data_files.get('MGX').get('pair_identifier')
        file_extension = data_files.get('MGX', {}).get('input_extension', '.fastq')
        sample_names = get_sample_names(input_files, file_extension)

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           creation_date,
                                           'WGS')
        (deposition_dir, processing_dir, public_dir) = project_dirs

        base_depo_dir = os.path.abspath(os.path.join(deposition_dir, '..'))
        manifest_file = stage_files(workflow,
                                    [args.manifest_file],
                                    base_depo_dir)
        deposited_files = stage_files(workflow,
                                      input_files,
                                      deposition_dir,
                                      symlink=True)

        if file_extension == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files,
                                           processing_dir,
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier = "_R1"
        else:
            paired_end_seqs = input_files

        qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
        (cleaned_fastqs, read_counts) = quality_control(workflow,
                                                        paired_end_seqs,
                                                        '.fastq',
                                                        processing_dir,
                                                        qc_threads,
                                                        contaminate_db,
                                                        pair_identifier=pair_identifier,
                                                        remove_intermediate_output=True)

        ## Generate taxonomic profile output. Outputs are stored in a list
        ## and are the following:
        ##
        ##      * Merged taxonomic profile
        ##      * Individual taxonomic files
        ##      * metaphlan2 SAM files
        tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
        tax_profile_outputs = taxonomic_profile(workflow,
                                                cleaned_fastqs,
                                                processing_dir,
                                                tax_threads,
                                                '.fastq')

        ## Generate functional profile output using humann2. Outputs are the
        ## following:
        ##
        ##      * Merged normalized genefamilies
        ##      * Merged normalized ecs
        ##      * Merged normalized pathways
        ##      * Merged genefamilies
        ##      * Merged ecs
        ##      * Merged pathways
        func_threads = args.threads_humann if args.threads_humann else args.threads
        func_profile_outputs = functional_profile(workflow,
                                                  cleaned_fastqs,
                                                  '.fastq',
                                                  processing_dir,
                                                  func_threads,
                                                  tax_profile_outputs[1],
                                                  remove_intermediate_output=True)

        ## The current biobakery workflows do not generate KO's from our
        ## genefamilies, so we're going to want to do that ourselves.
        genefamilies = name_files(sample_names,
                                  os.path.join(processing_dir, 'metaphlan2'),
                                  subfolder='main',
                                  tag='genefamilies',
                                  extension='tsv')
        pathways = name_files(sample_names,
                              os.path.join(processing_dir, 'humann2'),
                              subfolder='main',
                              tag='pathabundance',
                              extension='tsv')
        ecs = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='ecs',
                         extension='tsv')
        kos = name_files(sample_names,
                         os.path.join(processing_dir, 'humann2'),
                         subfolder='regrouped',
                         tag='kos',
                         extension='tsv')
        #(merged_norm_kos, merged_kos) = generate_ko_files(workflow,
        #                                                  genefamilies,
        #                                                  processing_dir)

        biom_files = batch_convert_tsv_to_biom(workflow, tax_profile_outputs[1])
        tax_biom_files = stage_files(workflow, biom_files, processing_dir)

        kneaddata_log_files = name_files(sample_names,
                                         os.path.join(processing_dir, 'kneaddata'),
                                         subfolder='main',
                                         extension='log')

        pub_raw_dir = os.path.join(public_dir, 'raw')
        pub_tax_profile_dir = os.path.join(public_dir, 'tax_profile')
        pub_func_profile_dir = os.path.join(public_dir, 'func_profile')
        map(create_folders, [pub_raw_dir, pub_tax_profile_dir, pub_func_profile_dir])

        knead_read_counts = os.path.join(processing_dir, 'counts', 'merged',
                                         'kneaddata_read_count_table.tsv')

        tax_profile_pcl = add_metadata_to_tsv(workflow,
                                              [tax_profile_outputs[0]],
                                              args.metadata_file,
                                              'metagenomics',
                                              id_col=conf.get('metadata_id_col'),
                                              col_replace=conf.get('analysis_col_patterns'),
                                              target_cols=conf.get('target_metadata_cols'),
                                              aux_files=[knead_read_counts])
        func_profile_pcl = add_metadata_to_tsv(workflow,
                                               [func_profile_outputs[0]],
                                               args.metadata_file,
                                               'metagenomics',
                                               id_col=conf.get('metadata_id_col'),
                                               col_replace=conf.get('analysis_col_patterns'),
                                               target_cols=conf.get('target_metadata_cols'),
                                               aux_files=[knead_read_counts])

        pub_files = [stage_files(workflow, files, target_dir)
                     for (files, target_dir) in
                     [(cleaned_fastqs, pub_raw_dir),
                      ([tax_profile_outputs[0]], pub_tax_profile_dir),
                      (tax_profile_outputs[1], pub_tax_profile_dir),
                      (tax_biom_files, pub_tax_profile_dir),
                      (tax_profile_pcl, pub_tax_profile_dir),
                      (func_profile_outputs, pub_func_profile_dir),
                      (func_profile_pcl, pub_func_profile_dir),
                      (kneaddata_log_files, pub_raw_dir)]]

        norm_genefamilies = name_files(sample_names,
                                       os.path.join(processing_dir, 'humann2', 'relab'),
                                       subfolder='genes',
                                       tag='genefamilies_relab',
                                       extension='tsv')
        norm_ecs_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2', 'relab'),
                                    subfolder='ecs',
                                    tag='ecs_relab',
                                    extension='tsv')
        norm_path_files = name_files(sample_names,
                                     os.path.join(processing_dir, 'humann2', 'relab'),
                                     subfolder='pathways',
                                     tag='pathabundance_relab',
                                     extension='tsv')
        norm_kos_files = name_files(sample_names,
                                    os.path.join(processing_dir, 'humann2', 'relab'),
                                    subfolder='kos_relab',
                                    extension='tsv')

        func_tar_files = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names,
                                                            norm_genefamilies,
                                                            norm_ecs_files,
                                                            norm_path_files):
            tar_path = os.path.join(pub_func_profile_dir, "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_profile_outputs)
            func_tar_files.append(func_tar_file)

    workflow.go()
def main(args):
    ## First parse the metadata file
    metadata_df = pd.read_csv(args.metadata_file, dtype='object')
    metadata_conf = parse_cfg_file(args.config_file)
    conf_rename_cols = metadata_conf.get('col_rename')
    conf_recode_cols = metadata_conf.get('value_recode')

    ## Then parse the data dictionary
    dictionary_df = pd.read_excel(args.data_dictionary)

    ## Now let's create a lookup from the coded column names to their
    ## human-readable equivalents
    col_name_lookup = pd.Series(dictionary_df['Variable Name'].values,
                                index=dictionary_df['Code'].values)
    col_name_lookup = dict((k, v) for (k, v) in col_name_lookup.iteritems())

    value_field_lookup = {}
    dictionary_df[dictionary_df['Pick Lists (Value, Missing, Name)'].notnull()].apply(
        populate_value_lookup, axis=1, args=(value_field_lookup,))
    value_field_lookup['diagnosis'] = dict((key.replace('.0', ''), val)
                                           for (key, val)
                                           in value_field_lookup['diagnosis'].iteritems())

    ## Replace coded values
    metadata_df.replace(value_field_lookup, inplace=True)

    ## Replace some other left-over yes/no fields
    replace_yes_no = {'0': 'No', '1': 'Yes', '0.0': 'No', '1.0': 'Yes'}
    replace_cols = ['bx_q31', 'bx_q33', 'bx_q35', 'i_q3', 'i_q4', 'i_q5', 'i_q6',
                    'i_q7', 'i_q8', 'i_q9', 'i_q10', 'i_q11', 'i_q12', 'i_q13',
                    'i_q14', 'i_q15', 'ic_q1', 'ic_q5', 'ic_q6', 'i_q16', 'i_q17',
                    'i_q18', 'i_q19', 'i_q20', 'i_q21', 'i_q22', 'i_q23', 'i_q24',
                    'i_q25', 'i_q26', 'i_q27', 'i_q28', 'i_q29', 'i_q30', 'i_q31',
                    'i_q32', 'i_q33', 'i_q34', 'i_q35', 'i_q36', 'i_q37', 'i_q38',
                    'i_q39', 'i_q40', 'i_q41', 'i_q42', 'i_q43', 'i_q44', 'i_q45',
                    'i_q46', 'i_q47', 'i_q48', 'i_q49', 'i_q50', 'bl_q12', 'bl_q14',
                    'bl_q16', 'st_q1', 'st_q9', 'st_q19', 'st_q21', 'st_q23',
                    'hbi_q2', 'hbi_q9', 'hbi_q10', 'hbi_q11', 'hbi_q12', 'hbi_q13',
                    'hbi_q14', 'hbi_q15', 'hbi_q16', 'sccai_q2', 'sccai_q2',
                    'sccai_q11', 'sccai_q12', 'sccai_q13', 'sccai_q14', 'ses_score2',
                    'mbs_q15', 'dr_q2', 'dr_q2a', 'dr_q2b', 'dr_q2c', 'dr_q3',
                    'dr_q4', 'dr_q5', 'dr_q6', 'dr_q7']
    map(lambda field: metadata_df[field].replace(replace_yes_no, inplace=True),
        replace_cols)

    [metadata_df[field].replace(replace_val, inplace=True)
     for (field, replace_val) in conf_recode_cols.iteritems()]

    ## Drop a couple of the confusing biopsy location columns
    drop_cols = ['bx_q8', 'bx_q10', 'bx_q16', 'bx_q18', 'bx_q24', 'bx_q26',
                 'Site/Sub/Coll']
    map(lambda col_name: metadata_df.drop(col_name, axis=1, inplace=True),
        drop_cols)

    ## We need to sanitize some of the disease location columns
    metadata_df['mc_q4'] = metadata_df['mc_q4'].apply(
        lambda x: x.replace(" ", "").split('(')[0] if pd.notnull(x) else x)
    metadata_df['mc_q7'] = metadata_df['mc_q7'].apply(
        lambda x: x.replace(" ", "").split('(')[0] if pd.notnull(x) else x)

    ## Rename columns and write out the new CSV file
    col_name_lookup.update(conf_rename_cols)
    metadata_df.rename(columns=col_name_lookup, inplace=True)
    metadata_df.to_csv(args.output_file, index=False)
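The coded-value replacement above relies on DataFrame.replace with a nested dict, where each outer key selects a column and the inner dict maps codes to labels. A small illustration with made-up codes and labels, not values from the actual data dictionary:

import pandas as pd

df = pd.DataFrame({'diagnosis': ['1', '2', '1'],
                   'sex': ['0', '1', '0']})

# Outer keys are column names; inner dicts map coded values to readable labels.
# These particular codes/labels are illustrative only.
value_field_lookup = {'diagnosis': {'1': 'CD', '2': 'UC'},
                      'sex': {'0': 'Female', '1': 'Male'}}

df.replace(value_field_lookup, inplace=True)
print(df)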