def test_paired_files(self):
    """ Test the paired files function """
    files = ["s1.R1.fastq", "s1.R2.fastq", "s2.R1.fastq", "s2.R2.fastq"]
    expected_pairs = [["s1.R1.fastq", "s2.R1.fastq"], ["s1.R2.fastq", "s2.R2.fastq"]]

    actual_pairs = utilities.paired_files(files, ".fastq", pair_identifier=".R1")

    self.assertEqual(expected_pairs[0], actual_pairs[0])
    self.assertEqual(expected_pairs[1], actual_pairs[1])
def test_paired_files_identifier_not_found(self):
    """ Test the paired files function with an identifier that is not found """
    files = ["sample-1.R1.fastq", "sample-1.R2.fastq", "sample-2.R1.fastq", "sample-2.R2.fastq"]
    expected_pairs = [[], []]

    actual_pairs = utilities.paired_files(files, ".fastq", pair_identifier="_R1.")

    self.assertEqual(expected_pairs[0], actual_pairs[0])
    self.assertEqual(expected_pairs[1], actual_pairs[1])
def test_paired_files_identifier_includes_extension(self):
    """ Test the paired files function with an identifier that is not found
        because it includes the period from the file extension """
    files = ["sample-1.R1.fastq", "sample-1.R2.fastq", "sample-2.R1.fastq", "sample-2.R2.fastq"]
    expected_pairs = [[], []]

    actual_pairs = utilities.paired_files(files, ".fastq", pair_identifier="R1.")

    self.assertEqual(expected_pairs[0], actual_pairs[0])
    self.assertEqual(expected_pairs[1], actual_pairs[1])
def test_paired_files_duplicate_identifier_3(self):
    """ Test the paired files function with the pair identifier duplicated in the sample name """
    files = ["MR100.R1.fastq", "MR100.R2.fastq", "MR200.R1.fastq", "MR200.R2.fastq"]
    expected_pairs = [["MR100.R1.fastq", "MR200.R1.fastq"], ["MR100.R2.fastq", "MR200.R2.fastq"]]

    actual_pairs = utilities.paired_files(files, "fastq", pair_identifier="R1")

    self.assertEqual(expected_pairs[0], actual_pairs[0])
    self.assertEqual(expected_pairs[1], actual_pairs[1])
def test_paired_files_duplicate_identifier_2(self):
    """ Test the paired files function with a second identifier duplicated.
        Also test extension without leading period. """
    files = ["s_2_1.fastq", "s_2_2.fastq", "s_2_3_1.fastq", "s_2_3_2.fastq"]
    expected_pairs = [["s_2_1.fastq", "s_2_3_1.fastq"], ["s_2_2.fastq", "s_2_3_2.fastq"]]

    actual_pairs = utilities.paired_files(files, "fastq", pair_identifier="_1")

    self.assertEqual(expected_pairs[0], actual_pairs[0])
    self.assertEqual(expected_pairs[1], actual_pairs[1])
def test_paired_files_duplicate_identifier_1(self):
    """ Test the paired files function with a first identifier duplicated """
    files = ["s_1_1.fastq.gz", "s_1_2.fastq.gz", "s_1_3_1.fastq.gz", "s_1_3_2.fastq.gz"]
    expected_pairs = [["s_1_1.fastq.gz", "s_1_3_1.fastq.gz"], ["s_1_2.fastq.gz", "s_1_3_2.fastq.gz"]]

    actual_pairs = utilities.paired_files(files, ".fastq.gz", pair_identifier="_1")

    self.assertEqual(expected_pairs[0], actual_pairs[0])
    self.assertEqual(expected_pairs[1], actual_pairs[1])
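# The tests above pin down the pairing contract: the pair identifier must sit
# immediately before the extension, the extension may be given with or without
# a leading period, and earlier occurrences of the identifier in the sample
# name must not confuse the match. The sketch below illustrates that contract
# only; it is not the actual utilities.paired_files implementation.
def paired_files_sketch(files, extension, pair_identifier=".R1"):
    """Return ([pair-1 files], [pair-2 files]); files whose mate is missing are dropped."""
    if not extension.startswith("."):
        extension = "." + extension
    pair_identifier2 = pair_identifier.replace("1", "2", 1)

    pair1, pair2 = [], []
    for name in sorted(files):
        # the identifier must appear directly before the extension
        if not name.endswith(pair_identifier + extension):
            continue
        # the mate swaps the trailing identifier (e.g. ".R1" -> ".R2")
        mate = name[: -len(pair_identifier + extension)] + pair_identifier2 + extension
        if mate in files:
            pair1.append(name)
            pair2.append(mate)
    return pair1, pair2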
def main(workflow):
    args = workflow.parse_args()
    conf = parse_cfg_file(args.config_file, section='HG')

    ## Parse the manifest file containing all data files from this submission
    manifest = parse_cfg_file(args.manifest_file)
    project = manifest.get('project')
    data_files = manifest.get('submitted_files')
    submission_date = manifest.get('submission_date')

    if data_files and data_files.get('HG', {}).get('input'):
        input_files = data_files.get('HG').get('input')

        project_dirs = create_project_dirs([conf.get('deposition_dir'),
                                            conf.get('processing_dir'),
                                            conf.get('public_dir')],
                                           project,
                                           submission_date,
                                           'HG')

        deposited_files = stage_files(workflow,
                                      input_files,
                                      project_dirs[0],
                                      symlink=True)

        fastq_files = bam_to_fastq(workflow, input_files, project_dirs[1],
                                   paired_end=True, threads=args.threads,
                                   compress=False)

        paired_fastq_files = paired_files(fastq_files, '_R1')
        paired_fastq_tars = []

        for (mate_1, mate_2) in zip(paired_fastq_files[0], paired_fastq_files[1]):
            sample_name = sample_names(mate_1, pair_identifier="_R1")
            tar_path = os.path.join(project_dirs[-1], "%s.tar" % sample_name)
            paired_fastq_tar = tar_files(workflow,
                                         [mate_1, mate_2],
                                         tar_path,
                                         depends=[mate_1, mate_2])
            paired_fastq_tars.append(paired_fastq_tar)

        md5sum_files = generate_md5_checksums(workflow, paired_fastq_tars)

    workflow.go()
args.input_extension = args.input_extension.replace(".gz", "") args.input_extension = args.input_extension.replace(".bz2", "") else: # if the input files are fasta, bypass quality control qc_output_files = demultiplexed_files ### STEP #2: Run taxonomic profiling on all of the filtered files ### if not args.bypass_taxonomic_profiling: merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile( workflow, qc_output_files, args.output, args.threads, args.input_extension) elif not args.bypass_functional_profiling or not args.bypass_strain_profiling: # get the names of the taxonomic profiling files allowing for pairs input_pair1, input_pair2 = utilities.paired_files(demultiplexed_files, original_extension, args.pair_identifier) sample_names = utilities.sample_names( input_pair1 if input_pair1 else input_files, original_extension, args.pair_identifier) tsv_profiles = utilities.name_files(sample_names, demultiplex_output_folder, tag="taxonomic_profile", extension="tsv") # check all of the expected profiles are found if len(tsv_profiles) != len(list(filter(os.path.isfile, tsv_profiles))): sys.exit( "ERROR: Bypassing taxonomic profiling but all of the tsv taxonomy profile files are not found in the input folder. Expecting the following input files:\n" + "\n".join(tsv_profiles)) # run taxonomic profile steps bypassing metaphlan2 merged_taxonomic_profile, taxonomy_tsv_files, taxonomy_sam_files = shotgun.taxonomic_profile(
def demultiplex_dual(workflow, output_folder, input_files, extension, barcode_files,
                     dual_barcode_path, min_phred, pair_identifier):
    """Demultiplex the files (dual indexed paired)

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq(gz) files for input to ea-utils.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        barcode_files (list): A list of barcode files.
        dual_barcode_path (string): A path to the dual index file.
        min_phred (int): The min phred quality score to use in the demultiplex command.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.

    Requires:
        ea-utils fastq-multx: A tool to demultiplex fastq files.

    Returns:
        list: A list of the demultiplexed files
        string: output folder of demultiplexed files

    """

    # capture the demultiplex stats in a log file, one for each set of input files
    demultiplex_log = utilities.name_files(input_files[0], output_folder, subfolder="demultiplex", extension="log", create_folder=True)
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",
        version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    # get barcode files
    barcode1, barcode2 = utilities.paired_files(barcode_files, extension, pair_identifier)

    # get the second pair identifier
    pair_identifier2 = pair_identifier.replace("1", "2", 1)

    try:
        file_handle = open(dual_barcode_path)
        lines = file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read dual barcode file: " + dual_barcode_path)

    run_name = os.path.basename(input_pair1[0]).replace(pair_identifier, "").replace("." + extension, "")

    demultiplex_files = set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name = line.split("\t")[0]
            if sample_name:
                nm1 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier + "." + extension
                nm2 = demultiplex_output_folder + "/" + run_name + "_" + sample_name + pair_identifier2 + "." + extension
                demultiplex_files.add(nm1)
                demultiplex_files.add(nm2)

    # get the names of the expected output files
    # demultiplex_files = utilities.name_files(samples, demultiplex_output_folder, extension=extension)

    workflow.add_task(
        "fastq-multx -B [depends[0]] [depends[1]] [depends[2]] [depends[3]] [depends[4]] -o n/a -o n/a -o [args[0]]/[args[5]]_%[args[3]].[args[1]] -o [args[0]]/[args[5]]_%[args[4]].[args[1]] -q [args[2]] > [targets[0]]",
        depends=[dual_barcode_path, barcode1[0], barcode2[0], input_pair1[0], input_pair2[0]],
        args=[demultiplex_output_folder, extension, min_phred, pair_identifier, pair_identifier2, run_name, fastq_multx_tracked],
        targets=[demultiplex_log, TrackedDirectory(demultiplex_output_folder)],
        name="demultiplex_dual")

    demultiplex_files = demultiplex_check(workflow, demultiplex_log, demultiplex_files)

    return demultiplex_files, demultiplex_output_folder
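# Hypothetical usage of demultiplex_dual inside an AnADAMA2 workflow; every
# file name below is illustrative, not taken from the original source. Given a
# dual barcode file listing a sample "sampleA" and the inputs
# run1_R1.fastq.gz / run1_R2.fastq.gz, the loop above predicts output files
#   <demultiplex_folder>/run1_sampleA_R1.fastq.gz
#   <demultiplex_folder>/run1_sampleA_R2.fastq.gz
#
# demux_files, demux_folder = demultiplex_dual(
#     workflow,
#     output_folder=args.output,
#     input_files=["run1_R1.fastq.gz", "run1_R2.fastq.gz"],
#     extension="fastq.gz",
#     barcode_files=["run1_BC_R1.fastq.gz", "run1_BC_R2.fastq.gz"],
#     dual_barcode_path="dual_barcodes.txt",  # tab-delimited, sample name in the first column
#     min_phred=0,
#     pair_identifier="_R1",
# )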
def demultiplex(workflow, input_files, extension, output_folder, barcode_file,
                index_files, min_phred, pair_identifier):
    """Demultiplex the files (single end or paired)

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        input_files (list): A list of paths to fastq files for input to ea-utils.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        barcode_file (string): A file of barcodes.
        index_files (list): A list of paths to the index files.
        min_phred (int): The min phred quality score to use in the demultiplex command.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.

    Requires:
        ea-utils fastq-multx: A tool to demultiplex fastq files.

    Returns:
        list: A list of the demultiplexed files
        string: output folder of demultiplexed files

    """

    # error if there is more than one index file
    if len(index_files) > 1:
        sys.exit("ERROR: Only one index file expected for demultiplexing step.")

    # read the barcode file to get the expected output files
    try:
        file_handle = open(barcode_file)
        lines = file_handle.readlines()
        file_handle.close()
    except EnvironmentError:
        sys.exit("ERROR: Unable to read barcode file: " + barcode_file)

    samples = set()
    for line in lines:
        # ignore headers or comment lines
        if not line.startswith("#"):
            sample_name = line.rstrip().split("\t")[0]
            if sample_name:
                samples.add(sample_name)

    # get the names of the expected output files
    demultiplex_fastq_files = utilities.name_files(samples, output_folder, subfolder="demultiplex", extension="fastq")

    # name the barcode file with the reverse complement barcodes added
    expanded_barcode_file = utilities.name_files("expanded_barcode_file.txt", output_folder, subfolder="demultiplex", create_folder=True)

    # create a file that includes the reverse complements of the barcodes
    workflow.add_task(
        "reverse_compliment_barcodes.py --input [depends[0]] --output [targets[0]]",
        depends=barcode_file,
        targets=expanded_barcode_file)

    # check for paired input files
    input_pair1, input_pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    # capture the demultiplex stats in output files, one for each set of input files
    if input_pair1:
        demultiplex_log = utilities.name_files(input_pair1[0], output_folder, subfolder="demultiplex", extension="log")
    else:
        demultiplex_log = utilities.name_files(input_files[0], output_folder, subfolder="demultiplex", extension="log")

    # get the output folder for all files
    demultiplex_output_folder = os.path.dirname(demultiplex_log)

    # get the basenames of the output files, one for each sample
    demultiplex_output_basenames = utilities.name_files(samples, output_folder, subfolder="demultiplex")

    # create a tracked executable
    fastq_multx_tracked = TrackedExecutable("fastq-multx",
        version_command="echo 'fastq-multx' `fastq-multx 2>&1 | grep Version`")

    if input_pair1 and input_pair2:
        # this run has paired input files
        # get the second pair identifier
        pair_identifier2 = pair_identifier.replace("1", "2", 1)
        # get the names of the expected output files
        demultiplex_fastq_files_R1 = [file + pair_identifier + ".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files_R2 = [file + pair_identifier2 + ".fastq" for file in demultiplex_output_basenames]
        demultiplex_fastq_files = demultiplex_fastq_files_R1 + demultiplex_fastq_files_R2

        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] [depends[3]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%[args[2]].fastq [args[1]]/%[args[3]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_pair1[0], input_pair2[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier, pair_identifier2],
                targets=demultiplex_log,
                name="demultiplex")
    else:
        # this run has single end input files
        # get the names of the expected output files
        demultiplex_fastq_files = [file + pair_identifier + ".fastq" for file in demultiplex_output_basenames]

        if index_files:
            # this run has index files
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] [depends[2]] -o [args[1]]/%_I1_001.fastq [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, index_files[0], input_files[0], fastq_multx_tracked],
                args=[min_phred, demultiplex_output_folder, pair_identifier],
                targets=demultiplex_log,
                name="demultiplex")
        else:
            workflow.add_task(
                "fastq-multx -l [depends[0]] [depends[1]] -o [args[1]]/%[args[2]].fastq -q [args[0]] > [targets[0]]",
                depends=[expanded_barcode_file, input_files[0]],
                args=[min_phred, demultiplex_output_folder, pair_identifier, fastq_multx_tracked],
                targets=demultiplex_log,
                name="demultiplex")

    demultiplex_fastq_files = demultiplex_check(workflow, demultiplex_log, demultiplex_fastq_files)

    return demultiplex_fastq_files, demultiplex_output_folder
def _generate_metadata_file(task):
    input_files = [seq_file.name for seq_file in task.depends[:-3]]
    studytrax_metadata = task.depends[-4].name
    broad_sample_sheet = task.depends[-3].name
    auxillary_metadata = task.depends[-1].name if task.depends[-1].name != '/dev/null' else None
    metadata_out_file = task.targets[0].name
    data_type_map = config.get('dtype_mapping')

    studytrax_df = pd.read_csv(studytrax_metadata)
    broad_sample_df = pd.read_csv(broad_sample_sheet,
                                  na_values=['destroyed', 'missed'],
                                  parse_dates=['Actual Date of Receipt'])
    collection_dates_dict = m_utils.get_collection_dates(broad_sample_df)

    if pair_identifier:
        (input_pair1, input_pair2) = bb_utils.paired_files(input_files, pair_identifier)
        input_files = input_pair1 if input_pair1 else input_files

    sample_mapping = dict(zip(bb_utils.sample_names(input_files, pair_identifier),
                              map(get_sample_id_from_fname, input_files)))
    sample_ids = [sid.replace(pair_identifier, '') for sid in sample_mapping.values()]

    sample_subset_df = broad_sample_df[(broad_sample_df['Parent Sample A'].isin(sample_ids)) |
                                       (broad_sample_df['Proteomics'].isin(sample_ids)) |
                                       (broad_sample_df['MbX'].isin(sample_ids)) |
                                       (broad_sample_df['Site/Sub/Coll'].isin(sample_ids))]
    metadata_df = sample_subset_df.merge(studytrax_df,
                                         left_on='Parent Sample A',
                                         right_on='st_q4',
                                         how='left')

    ## We sometimes get a situation where our studytrax metadata is missing
    ## some of the proteomics sample IDs so we need to make sure we replicate
    ## them
    metadata_df.loc[metadata_df['st_q17'].isnull(), 'st_q17'] = metadata_df['Proteomics']
    metadata_df.loc[metadata_df['st_q11'].isnull(), 'st_q11'] = metadata_df['MbX']
    metadata_df['data_type'] = data_type_map.get(data_type)

    if proteomics_metadata:
        proteomics_df = m_utils.add_proteomics_metadata(sample_subset_df,
                                                        proteomics_metadata,
                                                        sample_mapping)
        metadata_df = metadata_df.merge(proteomics_df,
                                        on='Parent Sample A',
                                        how='left')

    metadata_df['External ID'] = metadata_df.apply(generate_external_id, axis=1)
    metadata_df['Site/Sub/Coll ID'] = metadata_df['Site/Sub/Coll'].map(lambda sid: str(sid))
    metadata_df['Site'] = metadata_df['SiteName']
    metadata_df['Participant ID'] = metadata_df['Subject'].map(lambda subj: 'C' + str(subj))
    metadata_df['visit_num'] = metadata_df['Collection #']
    metadata_df['Research Project'] = config.get('research_project')
    metadata_df['Project'] = metadata_df.apply(m_utils.get_project_id, axis=1)

    metadata_df = generate_collection_statistics(metadata_df, collection_dates_dict)
    metadata_df = metadata_df.drop(config.get('drop_cols'), axis=1)

    if auxillary_metadata:
        ## Auxiliary metadata are columns that will be added into our
        ## existing metadata rows.
        metadata_df = m_utils.add_auxiliary_metadata(metadata_df, auxillary_metadata)

    metadata_df.to_csv(metadata_out_file, index=False)
desc="the path to the run_dbcan.py script", default="/app/") # get the arguments from the command line args = workflow.parse_args() # get all input files with the input extension provided on the command line # return an error if no files are found input_files = utilities.find_files(args.input, extension=args.input_extension, exit_if_not_found=True) ### STEP #1: Run quality control on all input files ### sample_names = utilities.sample_names(input_files, args.input_extension) input_pair1, input_pair2 = utilities.paired_files(input_files, args.input_extension, args.pair_identifier) paired = False if input_pair1: sample_names = utilities.sample_names(input_pair1, args.input_extension, args.pair_identifier) qc_targets = [ utilities.name_files([ name + ".trimmed.1.fastq", name + ".trimmed.2.fastq", name + ".trimmed.single.1.fastq", name + ".trimmed.single.2.fastq", name + ".trimmed.single.12.fastq" ], args.output, subfolder="kneaddata", create_folder=True) for name in sample_names ]
def main(workflow):
    args = workflow.parse_args()
    conf_mtx = parse_cfg_file(args.config_file, section='MTX')
    conf_mgx = parse_cfg_file(args.config_file, section='MGX')
    manifest = parse_cfg_file(args.manifest_file)

    data_files = manifest.get('submitted_files')
    project = manifest.get('project')
    creation_date = manifest.get('submission_date')
    adapters_file = manifest.get('adapters_file')

    contaminate_db = conf_mtx.get('databases').get('knead_dna')
    mtx_db = conf_mtx.get('databases').get('knead_mtx')
    rrna_db = conf_mtx.get('databases').get('knead_rrna')
    adapter_sequences = conf_mtx.get('adapter_sequences')

    qc_threads = args.threads_kneaddata if args.threads_kneaddata else args.threads
    tax_threads = args.threads_metaphlan if args.threads_metaphlan else args.threads
    func_threads = args.threads_humann if args.threads_humann else args.threads

    if data_files and data_files.get('MTX', {}).get('input'):
        input_files_mtx = data_files.get('MTX').get('input')
        file_extension_mtx = data_files.get('MTX').get('input_extension', '.fastq')
        pair_identifier_mtx = data_files.get('MTX').get('pair_identifier')
        input_file_tags = data_files.get('MTX').get('tags')
        input_tax_profiles = []

        project_dirs_mtx = create_project_dirs([conf_mtx.get('deposition_dir'),
                                                conf_mtx.get('processing_dir'),
                                                conf_mtx.get('public_dir')],
                                               project,
                                               creation_date,
                                               'MTX')
        public_dir_mtx = project_dirs_mtx[-1]
        base_depo_dir = os.path.abspath(os.path.join(project_dirs_mtx[0], '..'))

        manifest_file = stage_files(workflow, [args.manifest_file], base_depo_dir)
        deposited_files_mtx = stage_files(workflow,
                                          input_files_mtx,
                                          project_dirs_mtx[0],
                                          symlink=True)

        if file_extension_mtx == ".bam":
            ## Need to sort our BAM files to be sure here...
            paired_end_seqs = bam_to_fastq(workflow,
                                           deposited_files_mtx,
                                           project_dirs_mtx[1],
                                           paired_end=True,
                                           compress=False,
                                           threads=args.threads)
            pair_identifier_mtx = "_R1"
        else:
            paired_end_seqs = deposited_files_mtx

        if adapters_file:
            adapter_trim_opts = (" --trimmomatic-options \"ILLUMINACLIP:%s:2:30:10:8:TRUE "
                                 "SLIDINGWINDOW:4:20 MINLEN:50\"" % adapters_file)

        (cleaned_fastqs_mtx, read_counts_mtx) = quality_control(workflow,
                                                                paired_end_seqs,
                                                                file_extension_mtx,
                                                                project_dirs_mtx[1],
                                                                qc_threads,
                                                                databases=[contaminate_db, rrna_db, mtx_db],
                                                                pair_identifier=pair_identifier_mtx,
                                                                additional_options=adapter_trim_opts,
                                                                remove_intermediate_output=True)

        sample_names_mtx = sample_names(cleaned_fastqs_mtx, file_extension_mtx)

        ##########################################
        #          MGX FILE PROCESSING           #
        ##########################################
        # Ideally we would be passed in a set of corresponding metagenome
        # sequence(s) to go with our metatranscriptomic files but we also
        # have two other scenarios:
        #
        #   1.) No accompanying metagenomic sequences exist; in this
        #       case we will proceed just using the metatranscriptomic
        #       data.
        #   2.) Taxonomic profiles are passed directly in via our MANIFEST
        #       file; here we remove these from our input files and
        #       prevent them from running through the kneaddata ->
        #       metaphlan2 portions of our pipeline.
        if data_files.get('MGX', {}).get('input'):
            input_files_mgx = data_files.get('MGX').get('input')
            file_extension_mgx = data_files.get('MGX').get('file_ext')
            pair_identifier_mgx = data_files.get('MGX').get('pair_identifier')
            input_tax_profiles = [in_file for in_file in input_files_mgx
                                  if 'taxonomic_profile.tsv' in in_file]
            input_files_mgx = set(input_files_mgx) - set(input_tax_profiles)

            if input_files_mgx:
                sample_names_mgx = sample_names(input_files_mgx, file_extension_mgx, file_extension_mgx)

                project_dirs_mgx = create_project_dirs([conf_mgx.get('deposition_dir'),
                                                        conf_mgx.get('processing_dir'),
                                                        conf_mgx.get('public_dir')],
                                                       project,
                                                       creation_date,
                                                       'WGS')
                public_dir_mgx = project_dirs_mgx[-1]

                deposited_files_mgx = stage_files(workflow,
                                                  input_files_mgx,
                                                  project_dirs_mgx[0],
                                                  symlink=True)

                if file_extension_mgx == ".bam":
                    ## Need to sort our BAM files to be sure here...
                    paired_end_seqs_mgx = bam_to_fastq(workflow,
                                                       deposited_files_mgx,
                                                       project_dirs_mgx[1],
                                                       paired_end=True,
                                                       compress=False,
                                                       threads=args.threads)
                    pair_identifier_mgx = "_R1"
                else:
                    paired_end_seqs_mgx = paired_files(deposited_files_mgx, pair_identifier_mgx)

                (cleaned_fastqs_mgx, read_counts_mgx) = quality_control(workflow,
                                                                        paired_end_seqs_mgx,
                                                                        project_dirs_mgx[1],
                                                                        qc_threads,
                                                                        [contaminate_db, rrna_db],
                                                                        remove_intermediate_output=True)

                tax_outs_mgx = taxonomic_profile(workflow,
                                                 cleaned_fastqs_mgx,
                                                 project_dirs_mgx[1],
                                                 tax_threads,
                                                 '*.fastq')
                func_outs_mgx = functional_profile(workflow,
                                                   cleaned_fastqs_mgx,
                                                   project_dirs_mgx[1],
                                                   func_threads,
                                                   tax_outs_mgx[1],
                                                   remove_intermediate_output=True)
                input_tax_profiles.extend(tax_outs_mgx[1])

                pub_wgs_raw_dir = os.path.join(public_dir_mgx, 'raw')
                pub_wgs_tax_profile_dir = os.path.join(public_dir_mgx, 'tax_profile')
                pub_wgs_func_profile_dir = os.path.join(public_dir_mgx, 'func_profile')
                map(create_folders, [pub_wgs_raw_dir, pub_wgs_tax_profile_dir,
                                     pub_wgs_func_profile_dir])

                norm_genefamilies_mgx = name_files(sample_names_mgx,
                                                   project_dirs_mgx[1],
                                                   subfolder='genes',
                                                   tag='genefamilies_relab',
                                                   extension='tsv')
                norm_ecs_files_mgx = name_files(sample_names_mgx,
                                                project_dirs_mgx[1],
                                                subfolder='ecs',
                                                tag='genefamilies_ecs_relab',
                                                extension='tsv')
                norm_path_files_mgx = name_files(sample_names_mgx,
                                                 project_dirs_mgx[1],
                                                 subfolder='pathways',
                                                 tag='pathabundance_relab',
                                                 extension='tsv')

                pcl_files = add_metadata_to_tsv(workflow,
                                                [tax_outs_mgx[1]] + func_outs_mgx,
                                                'metagenomics',
                                                conf_mgx.get('metadata_id_col'),
                                                conf_mgx.get('analysis_col_patterns'),
                                                conf_mgx.get('target_metadata_cols'))

                func_tar_files_wgs = []
                for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mgx,
                                                                    norm_genefamilies_mgx,
                                                                    norm_ecs_files_mgx,
                                                                    norm_path_files_mgx):
                    tar_path = os.path.join(pub_wgs_func_profile_dir,
                                            "%s_humann2.tgz" % sample)
                    func_tar_file = tar_files(workflow,
                                              [gene_file, ecs_file, path_file],
                                              tar_path,
                                              depends=func_outs_mgx)
                    func_tar_files_wgs.append(func_tar_file)

        ##########################################
        #          MTX FILE PROCESSING           #
        ##########################################
        # Here we want to see if we can create a set of matching cleaned
        # MTX files to corresponding MGX taxonomic profiles. If these exist
        # we want to run functional profiling with the corresponding MGX
        # taxonomic profile; otherwise we will run a taxonomic profiling
        # on the MTX sequences and run functional profiling with the produced
        # taxonomic profile.
        func_outs_match_mtx = []
        if input_tax_profiles:
            (matched_fqs, matched_tax_profiles) = match_tax_profiles(cleaned_fastqs_mtx,
                                                                     '.fastq',
                                                                     data_files.get('MTX').get('metadata_id_col', 'External ID'),
                                                                     input_tax_profiles,
                                                                     data_files.get('MGX').get('tax_profile_id', 'External ID'),
                                                                     args.metadata_file,
                                                                     tags=input_file_tags)

            func_outs_match_mtx = functional_profile(workflow,
                                                     matched_fqs,
                                                     project_dirs_mtx[1],
                                                     func_threads,
                                                     matched_tax_profiles,
                                                     remove_intermediate_output=True)

            # Reset the remaining MTX files left over here so that we can run them through
            # the metaphlan2 -> humann2 pipeline.
            cleaned_fastqs_mtx = set(cleaned_fastqs_mtx) - set(matched_fqs)

        if cleaned_fastqs_mtx:
            tax_outs_mtx = taxonomic_profile(workflow,
                                             cleaned_fastqs_mtx,
                                             project_dirs_mtx[1],
                                             tax_threads,
                                             '*.fastq')
            func_outs_mtx = functional_profile(workflow,
                                               cleaned_fastqs_mtx,
                                               file_extension_mtx,
                                               project_dirs_mtx[1],
                                               func_threads,
                                               tax_outs_mtx[1],
                                               remove_intermediate_output=True)
            func_outs_mtx = list(func_outs_mtx) + func_outs_match_mtx
        else:
            func_outs_mtx = func_outs_match_mtx

        # We'll need to generate DNA/RNA normalized files to be displayed
        # in our visualization output.
        (norm_gene_ratio, norm_ecs_ratio, norm_path_ratio) = norm_ratio(workflow,
                                                                        func_outs_mgx[0],
                                                                        func_outs_mgx[1],
                                                                        func_outs_mgx[2],
                                                                        func_outs_mtx[0],
                                                                        func_outs_mtx[1],
                                                                        func_outs_mtx[2],
                                                                        project_dirs_mtx[1])

        pub_mtx_raw_dir = os.path.join(public_dir_mtx, 'raw')
        pub_mtx_tax_profile_dir = os.path.join(public_dir_mtx, 'tax_profile')
        pub_mtx_func_profile_dir = os.path.join(public_dir_mtx, 'func_profile')
        map(create_folders, [pub_mtx_raw_dir, pub_mtx_tax_profile_dir,
                             pub_mtx_func_profile_dir])

        norm_genefamilies_mtx = name_files(sample_names_mtx,
                                           project_dirs_mtx[1],
                                           subfolder='genes',
                                           tag='genefamilies_relab',
                                           extension='tsv')
        norm_ecs_files_mtx = name_files(sample_names_mtx,
                                        project_dirs_mtx[1],
                                        subfolder='ecs',
                                        tag='genefamilies_ecs_relab',
                                        extension='tsv')
        norm_path_files_mtx = name_files(sample_names_mtx,
                                         project_dirs_mtx[1],
                                         subfolder='pathways',
                                         tag='pathabundance_relab',
                                         extension='tsv')

        func_tar_files_mtx = []
        for (sample, gene_file, ecs_file, path_file) in zip(sample_names_mtx,
                                                            norm_genefamilies_mtx,
                                                            norm_ecs_files_mtx,
                                                            norm_path_files_mtx):
            tar_path = os.path.join(pub_mtx_func_profile_dir,
                                    "%s_humann2.tgz" % sample)
            func_tar_file = tar_files(workflow,
                                      [gene_file, ecs_file, path_file],
                                      tar_path,
                                      depends=func_outs_mtx)
            func_tar_files_mtx.append(func_tar_file)

    workflow.go()
def main(args):
    config = parse_cfg_file(args.config)

    study_trax_df = pd.read_csv(args.studytrax_metadata, dtype='str')
    broad_sample_df = pd.read_csv(args.broad_sample_tracking,
                                  na_values=['destroyed', 'missed'],
                                  parse_dates=['Actual Date of Receipt'])
    proteomics_df = None
    metadata_df = None
    new_metadata_df = None

    date_today = datetime.date.today()
    metadata_file = os.path.join(args.output_dir, 'hmp2_metadata_%s.csv' % date_today)

    ## Before we filter our metadata rows down to just the rows associated
    ## with the files we have present, we'll want a list of all the collection
    ## dates
    collection_dates_dict = get_collection_dates(broad_sample_df)

    biopsy_date_map = None

    if args.proteomics_metadata:
        proteomics_df = pd.read_table(args.proteomics_metadata)

    if args.biopsy_dates:
        biopsy_date_map = parse_biopsy_dates(args.biopsy_dates)

    ## The update procedure assumes either that we have an existing metadata
    ## file that we are going to be appending to/updating or that we are
    ## creating a fresh metadata sheet and will be adding the files in the
    ## manifest file to it.
    ## TODO: This needs to be re-worked to account for snagging datatypes as well.
    #if not args.metadata_file or args.refresh_all:
    #    sequence_files.extend(get_all_sequence_files(config.get('deposition_dir'),
    #                                                 config.get('input_extensions')))

    if args.manifest_file:
        manifest = parse_cfg_file(args.manifest_file)
        submitted_files = manifest.get('submitted_files')

        if submitted_files:
            new_metadata = []

            for (dtype, items) in submitted_files.iteritems():
                input_files = items.get('input')
                pair_identifier = items.get('pair_identifier')

                if pair_identifier:
                    (input_pair1, input_pair2) = bb_utils.paired_files(input_files,
                                                                       pair_identifier)
                    input_files = input_pair1 if input_pair1 else input_files

                new_metadata.append(get_metadata_rows(config,
                                                      study_trax_df,
                                                      broad_sample_df,
                                                      proteomics_df,
                                                      dtype,
                                                      input_files,
                                                      pair_identifier))

            new_metadata_df = pd.concat(new_metadata, ignore_index=True)
            #new_metadata_df[new_metadata_df['External ID'].isnull()] = None
            new_metadata_df['Site/Sub/Coll ID'] = new_metadata_df['Site/Sub/Coll'].map(lambda sid: str(sid))
            #new_metadata_df['Participant ID'] = new_metadata_df['Subject'].map(lambda subj: 'C' + str(subj))

            if 'Collection #' in new_metadata_df.columns:
                new_metadata_df['visit_num'] = new_metadata_df['Collection #']

            new_metadata_df['Project'] = new_metadata_df.apply(get_project_id, axis=1)
            new_metadata_df['ProjectSpecificID'] = pd.to_numeric(new_metadata_df['ProjectSpecificID'])
            new_metadata_df['Site'] = new_metadata_df['SiteName']
            new_metadata_df = new_metadata_df.apply(generate_external_id, axis=1)
            new_metadata_df = remove_columns(new_metadata_df, config.get('drop_cols'))

    if args.metadata_file:
        metadata_df = pd.read_csv(args.metadata_file,
                                  parse_dates=['Actual Date of Receipt'])
        site_mapping = config.get('site_map')
        metadata_df['Site/Sub/Coll ID'] = metadata_df.apply(fix_site_sub_coll_id,
                                                            args=(site_mapping,),
                                                            axis=1)
        metadata_df['PDO Number'] = metadata_df.apply(get_pdo_number, axis=1)

        if new_metadata_df is not None and not new_metadata_df.empty:
            metadata_df = pd.concat([metadata_df, new_metadata_df], ignore_index=True)
            metadata_df = metadata_df.drop_duplicates(subset=['External ID',
                                                              'Site/Sub/Coll ID',
                                                              'data_type'],
                                                      keep='last')
    else:
        metadata_df = new_metadata_df
        metadata_df[metadata_df['External ID'].isnull()] = metadata_df[
            metadata_df['External ID'].isnull()].apply(generate_external_id, axis=1)

    if args.auxillary_metadata:
        for aux_file in args.auxillary_metadata:
            supp_df = pd.read_table(aux_file)
            supp_columns = supp_df.columns.tolist()

            idx_offset = 1
            if 'data_type' in supp_columns:
                join_id = supp_columns[:1] + ['data_type']
                idx_offset = 2
            else:
                join_id = supp_columns[0]

            ## We need to do this in two stages. If the columns already exist
            ## here we want to update them. If they do not exist we append
            ## them.
            metadata_cols = metadata_df.columns.tolist()
            new_cols = set(supp_columns[idx_offset:]) - set(metadata_cols)
            existing_cols = set(supp_columns[idx_offset:]).intersection(metadata_cols)

            if new_cols:
                supp_new_df = supp_df.filter(items=supp_columns[:idx_offset] + list(new_cols))
                metadata_df = metadata_df.merge(supp_new_df, how='left', on=join_id)

            if existing_cols:
                supp_existing_df = supp_df.filter(items=supp_columns[:idx_offset] + list(existing_cols))

                metadata_df.set_index(join_id, inplace=True)
                supp_existing_df.set_index(join_id, inplace=True)

                metadata_df.update(supp_existing_df)
                metadata_df.reset_index(inplace=True)

    if args.add_all_stool_collections:
        metadata_df = add_all_stool_collections(metadata_df, study_trax_df, broad_sample_df)

    metadata_df['Actual Date of Receipt'] = pd.to_datetime(metadata_df['Actual Date of Receipt'])
    metadata_df['visit_num'] = metadata_df.apply(fill_visit_nums, axis=1)
    metadata_df['hbi_score'] = pd.to_numeric(metadata_df['hbi_score'])

    if 'Site' in metadata_df.columns.tolist():
        metadata_df['SiteName'] = metadata_df['Site']
    else:
        metadata_df['Site'] = metadata_df['SiteName']

    ## A couple of small remaining changes
    metadata_df.ix[metadata_df.hbi_score > 900, 'hbi_score'] = None
    metadata_df.ix[metadata_df.consent_age > 150, 'consent_age'] = None
    metadata_df['total_reads'].loc[metadata_df['total_reads'].astype('str').str.startswith('PDO')] = None
    metadata_df['Research Project'] = "ibdmdb"

    metadata_df = generate_collection_statistics(metadata_df,
                                                 collection_dates_dict,
                                                 biopsy_date_map)
    metadata_df = add_baseline_metadata_values(metadata_df,
                                               study_trax_df,
                                               config.get('baseline_cols'))
    metadata_df[metadata_df['SiteName'].isnull()] = fix_site_name(
        metadata_df[metadata_df['SiteName'].isnull()])
    metadata_df = reorder_columns(metadata_df, config.get('col_order'))
    metadata_df.drop(['Site'], 1, inplace=True)

    metadata_df = metadata_df.sort_values(['data_type', 'Participant ID', 'visit_num'])
    metadata_df.to_csv(metadata_file, index=False)
def merge_pairs_and_rename(workflow, method, input_files, extension, output_folder, pair_identifier, threads, fastq_ascii):
    """ Merge the files if paired and rename sequence ids to match sample id

    Args:
        workflow (anadama2.workflow): An instance of the workflow class.
        method (string): tools for sequence analysis, usearch default or vsearch
        input_files (list): A list of paths to fastq files.
        extension (string): The extension for all files.
        output_folder (string): The path of the output folder.
        pair_identifier (string): The string in the file basename to identify
            the first pair in the set.
        threads (int): The number of threads for each task.
        fastq_ascii (string): The ascii offset for fastq quality scores (used with usearch).

    Requires:
        usearch or vsearch

    Returns:
        list: A list of the renamed files.

    """

    pair1, pair2 = utilities.paired_files(input_files, extension, pair_identifier)

    if pair1 and pair2:
        # paired input files were found

        # if the files are gzipped, first decompress as fastq_mergepairs will take in
        # fastq.gz but the output will not be correctly formatted
        if pair1[0].endswith(".gz"):
            # get the names of the decompressed output files
            decompressed_pair1 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair1],
                output_folder,
                subfolder="merged_renamed")
            decompressed_pair2 = utilities.name_files(
                [os.path.basename(file).replace(".gz", "") for file in pair2],
                output_folder,
                subfolder="merged_renamed")

            # add tasks to decompress the files
            workflow.add_task_group(
                "gunzip -c [depends[0]] > [targets[0]]",
                depends=pair1 + pair2,
                targets=decompressed_pair1 + decompressed_pair2)

            # the pair files to be used for the remaining tasks are those that are decompressed
            pair1 = decompressed_pair1
            pair2 = decompressed_pair2

        # get the sample names from the input file names
        sample_names = [os.path.basename(file).replace(pair_identifier + ".fastq", "") for file in pair1]

        # get the names of the output files
        stitched_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="stitched",
                                              extension="fastq",
                                              create_folder=True)
        unjoined_files = utilities.name_files(sample_names,
                                              output_folder,
                                              subfolder="merged_renamed",
                                              tag="unjoined",
                                              extension="fastq")

        # run usearch or vsearch to merge pairs, if input files are non-empty
        for read1, read2, stitched_output, unjoined_output in zip(pair1, pair2, stitched_files, unjoined_files):
            if method == 'vsearch':
                workflow.add_task(
                    utilities.partial_function(merge_pairs, method="vsearch", threads=threads),
                    depends=[read1, read2, TrackedExecutable("vsearch")],
                    targets=[stitched_output, unjoined_output],
                    name="vsearch_fastq_mergepairs")
            else:
                workflow.add_task(
                    utilities.partial_function(merge_pairs, method="usearch", threads=threads, fastq_ascii=fastq_ascii),
                    depends=[read1, read2, TrackedExecutable("usearch")],
                    targets=[stitched_output, unjoined_output],
                    name="usearch_fastq_mergepairs")

        # merge the stitched and unjoined from the prior step
        renamed_files = utilities.name_files(sample_names,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq")
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] [depends[1]] _stitched [targets[0]]",
            depends=zip(stitched_files, unjoined_files),
            targets=renamed_files)

    else:
        # these files are not pairs and do not need to be merged
        # rename the files
        renamed_files = utilities.name_files(input_files,
                                             output_folder,
                                             subfolder="merged_renamed",
                                             tag="renamed",
                                             extension="fastq",
                                             create_folder=True)
        workflow.add_task_group(
            "merge_and_rename_fastq.py [depends[0]] '' '' [targets[0]]",
            depends=input_files,
            targets=renamed_files)

    return renamed_files
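# Hypothetical call within a 16S-style AnADAMA2 workflow; the file names,
# thread count, and fastq_ascii value below are placeholders, not values from
# the original source.
# renamed = merge_pairs_and_rename(
#     workflow,
#     method="vsearch",
#     input_files=["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz"],
#     extension="fastq.gz",
#     output_folder=args.output,
#     pair_identifier="_R1",
#     threads=4,
#     fastq_ascii="33",
# )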