def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Top-level analysis: process lanes, align, then run per-sample variant steps."""
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    # Use the copy of the configuration that lives in the config directory.
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "align": os.path.join(work_dir, "alignments"),
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir,
    }
    run_parallel = parallel_runner("bcbio.distributed", dirs, config, config_file)
    # Process each flowcell lane.
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    align_items = run_parallel("process_alignment", lane_items)
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    for task in ("merge_sample", "recalibrate_sample"):
        samples = run_parallel(task, samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    for task in ("detect_sv", "process_sample"):
        samples = run_parallel(task, samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run lane processing, alignment, and sample-level analysis (no SV detection)."""
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir,
            "galaxy": galaxy_dir,
            "align": os.path.join(work_dir, "alignments"),
            "work": work_dir,
            "flowcell": fc_dir,
            "config": config_dir}
    run_parallel = parallel_runner("bcbio.distributed", dirs, config, config_file)
    # Process each flowcell lane.
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_details = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_details)
    align_items = run_parallel("process_alignment", lane_items)
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples,
                           {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Process a flowcell from the current working directory (older serial API)."""
    work_dir = os.getcwd()
    fc_name, fc_date = get_flowcell_info(fc_dir)
    run_info = _get_run_info(fc_name, fc_date, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir,
            "galaxy": galaxy_dir,
            "align": os.path.join(work_dir, "alignments"),
            "work": work_dir,
            "flowcell": fc_dir,
            "config": config_dir}
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    # Process each flowcell lane.
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = _run_parallel("process_lane", lane_args, dirs, config)
    _run_parallel("process_alignment", lane_items, dirs, config)
    # Process samples, potentially multiplexed across multiple lanes.
    sample_files, sample_fastq, sample_info = organize_samples(
        dirs, fc_name, fc_date, run_items)
    sample_args = ((name, sample_fastq[name], sample_info[name], bam_files,
                    dirs, config, config_file)
                   for name, bam_files in sample_files)
    _run_parallel("process_sample", sample_args, dirs, config)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # Lanes are parsed serially here rather than distributed via
    # run_parallel("process_lane", ...).
    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)
    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> Realign samples")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> Variant calling")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> Postprocess variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> Combine multiple callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> Detect structural variants")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> Combine calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> Process samples")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
def run(self, config, config_file, run_parallel, dirs, lane_items):
    """Trim and align lanes, then merge and summarize transcript-count samples."""
    trimmed = run_parallel("trim_lane", lane_items)
    aligned = run_parallel("process_alignment", trimmed)
    # Samples may be multiplexed across multiple lanes; group them first.
    samples = organize_samples(aligned, dirs, config_file)
    for task in ("merge_sample", "generate_transcript_counts"):
        samples = run_parallel(task, samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    return samples
def run(self, config, config_file, run_parallel, dirs, lane_items):
    """Trim and align lanes, then produce per-sample counts and QC summaries."""
    lane_items = run_parallel("trim_lane", lane_items)
    align_items = run_parallel("process_alignment", lane_items)
    # Samples may be multiplexed across multiple lanes; group them first.
    samples = organize_samples(align_items, dirs, config_file)
    for task in ("merge_sample", "generate_transcript_counts"):
        samples = run_parallel(task, samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    return qcsummary.generate_parallel(samples, run_parallel)
def _organize_merge_samples(align_out, dirs, config_file):
    """Back compatibility handling organizing and merging samples."""
    organized = merge.organize_samples([align_out], dirs, config_file)
    # A single alignment output is expected to organize into exactly one sample.
    assert len(organized) == 1 and len(organized[0]) == 1
    sample_data = organized[0][0]
    # Point the sample's working directory at the alignment output location.
    sample_data["dirs"]["work"] = os.path.dirname(align_out["work_bam"])
    merged = sample.merge_sample(sample_data)
    assert len(merged) == 1 and len(merged[0]) == 1
    data = merged[0][0]
    # Restore the caller's directory layout without sharing the dict.
    data["dirs"] = copy.deepcopy(dirs)
    return data
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir,
    }
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # Process each flowcell lane.
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((item, fc_name, fc_date, dirs, config) for item in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    align_items = run_parallel("process_alignment", lane_items)
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run(self, config, config_file, run_parallel, dirs, lane_items):
    """Variant-calling pipeline: trim, align, recalibrate, call, and QC."""
    trimmed = run_parallel("trim_lane", lane_items)
    align_items = run_parallel("process_alignment", trimmed)
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    for task in ("merge_sample", "prep_recal"):
        samples = run_parallel(task, samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = ensemble.combine_calls_parallel(samples, run_parallel)
    samples = run_parallel("detect_sv", samples)
    samples = qcsummary.generate_parallel(samples, run_parallel)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    return samples
def run_main(config, config_file, work_dir, parallel, fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "work": work_dir,
            "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # Process each flowcell lane.
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_items = run_parallel(
        "process_lane",
        ((item, fc_name, fc_date, dirs, config) for item in run_items))
    align_items = run_parallel("process_alignment", lane_items)
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    for task in ("merge_sample", "prep_recal"):
        samples = run_parallel(task, samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    for task in ("detect_sv", "combine_calls"):
        samples = run_parallel(task, samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, run_info_yaml):
    """Run the pipeline on an already-demultiplexed flowcell.

    The working directory has to be identical to where the (demultiplexed)
    fastq files are located.
    """
    fc_dir = os.path.normpath(fc_dir)
    work_dir = os.getcwd()
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    galaxy_dir, config_dir = _get_full_paths(config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    # NOTE(review): both "flowcell" and "fc_dir" keys point at fc_dir —
    # presumably different consumers read different keys; confirm before unifying.
    dirs = dict(galaxy=galaxy_dir,
                align=align_dir,
                work=work_dir,
                config=config_dir,
                flowcell=fc_dir,
                fc_dir=fc_dir)
    # Since demultiplexing is already done, just extract run_items.
    run_items = run_info['details']
    lane_items = []
    for info in run_items:
        lane_items.extend(make_lane_items(info, fc_date, fc_name, dirs, config))
    _run_parallel("process_alignment", lane_items, dirs, config)
    # Process samples.
    sample_files, sample_fastq, sample_info = \
        organize_samples(dirs, fc_name, fc_date, run_items)
    samples = ((name, sample_fastq[name], sample_info[name], bam_files,
                dirs, config, config_file)
               for name, bam_files in sample_files)
    _run_parallel("process_sample", samples, dirs, config)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run the full analysis, recording progress and compressing fastq outputs.

    Intermediate fastq files are collected in ``to_compress`` and compressed
    at the end when ``algorithm.compress_files`` is enabled (default True).
    """
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = utils.RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    to_compress.update(lane_items[0][0:2])
    prog.progress("process_lane")
    # Uploading the sequencing report (Google Docs / statusdb) is handled by
    # an external mechanism rather than from within this pipeline.
    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    to_compress.update(lane_items[0][0:2])
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    to_compress.update(align_items[0]['fastq'])
    prog.progress("process_alignment")
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.progress("write_metrics")
    # Compress all files in to_compress.
    if config['algorithm'].get('compress_files', True):
        (before, after) = utils.compress_files(to_compress)
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    """Run the full analysis, checkpointing progress and compressing fastq files."""
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = RecordProgress(work_dir)
    to_compress = set()

    def checkpoint(step):
        # Record a progress checkpoint after a completed pipeline stage.
        prog.dummy()
        prog.progress(step)

    prog.progress("analysis_start")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir), config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "align": os.path.join(work_dir, "alignments"),
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir,
    }
    run_parallel = parallel_runner("bcbio.distributed", dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lane_args = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lane_args)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    checkpoint("process_lane")
    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    checkpoint("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    _add_to_compress(to_compress, align_items, 'align_items')
    checkpoint("process_alignment")
    # Process samples, potentially multiplexed across multiple lanes.
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    checkpoint("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("variantcall")
    samples = run_parallel("detect_sv", samples)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("detect_sv")
    samples = run_parallel("process_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    _add_to_compress(to_compress, samples, 'samples')
    checkpoint("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    checkpoint("write_metrics")
    # Compress all files in to_compress.
    if config['algorithm'].get('compress_files', True):
        sizes = run_parallel("compress_files", [[[cf]] for cf in to_compress])
        before = sum(s[0] for s in sizes)
        after = sum(s[1] for s in sizes)
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))