def run(self, config, config_file, run_parallel, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore",
                         ["process_alignment", "postprocess_alignment"],
                         samples, dirs, config,
                         multiplier=alignprep.parallel_multiplier(samples)) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("prep_align_inputs", samples)
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = alignprep.merge_split_alignments(samples, run_parallel)
        samples = disambiguate.resolve(samples, run_parallel)
        samples = run_parallel("postprocess_alignment", samples)
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)
        logger.info("Timing: coverage")
        samples = coverage.summarize_samples(samples, run_parallel)
    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full",
                         ["piped_bamprep", "variantcall_sample"],
                         samples, dirs, config,
                         multiplier=len(regions["analysis"]), max_multicore=1) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)
    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)
    ## Finalizing BAMs and population databases, handle multicore computation
    with global_parallel(parallel, "multicore2",
                         ["prep_gemini_db", "delayed_bam_merge"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: structural variation")
        samples = structural.run(samples, run_parallel)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["align_prep_full"],
                         lane_items, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("align_prep_full",
                               [list(x) + [config_file] for x in lane_items])
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)
        logger.info("Timing: coverage")
        samples = coverage.summarize_samples(samples, run_parallel)
    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full",
                         ["piped_bamprep", "variantcall_sample"],
                         samples, dirs, config,
                         multiplier=len(regions["analysis"])) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)
    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    ## Finalizing BAMs and population databases, handle multicore computation
    with global_parallel(parallel, "multicore2",
                         ["prep_gemini_db", "delayed_bam_merge"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["align_prep_full"],
                         lane_items, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("process_alignment", lane_items)
    ## Finalize (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["align_prep_full"],
                         lane_items, dirs["work"], config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("align_prep_full",
                               [list(x) + [config_file] for x in lane_items])
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)
    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full",
                         ["piped_bamprep", "variantcall_sample"],
                         samples, dirs["work"], config,
                         multiplier=len(regions["analysis"])) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)
    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs["work"], config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = validate.summarize_grading(samples)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore", ["process_alignment"],
                         lane_items, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("process_alignment", lane_items)
    ## Finalize (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
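# The five `run` variants above share one control-flow pattern: each stage
# opens a `global_parallel` context sized for its workload ("multicore",
# "full", "persample"), rebuilds `run_parallel` from the resized spec, and
# threads the `samples` list through every step. The sketch below is a
# minimal, self-contained illustration of that pattern; this `global_parallel`,
# `parallel_runner`, and the `STEPS` table are toy stand-ins, not bcbio's
# actual implementations.
import contextlib

STEPS = {"process_alignment": lambda x: dict(x, aligned=True)}

@contextlib.contextmanager
def global_parallel(parallel, stage, fn_names, items, dirs, config,
                    multiplier=1, max_multicore=None):
    # A real implementation resizes the cluster for this stage's functions;
    # the toy version just annotates a copy of the parallel spec.
    try:
        yield dict(parallel, stage=stage, multiplier=multiplier)
    finally:
        pass  # a real implementation would release the resized resources here

def parallel_runner(parallel, dirs, config):
    # Toy dispatcher: resolve the step by name and map it over the items.
    def run(fn_name, items):
        return [STEPS[fn_name](x) for x in items]
    return run

samples = [{"name": "S1"}]
with global_parallel({"type": "local", "cores": 1}, "multicore",
                     ["process_alignment"], samples, {}, {}) as parallel:
    run_parallel = parallel_runner(parallel, {}, {})
    samples = run_parallel("process_alignment", samples)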
def main(system_config_file, cur_config_file):
    config = utils.merge_config_files([system_config_file, cur_config_file])
    run_module = "bcbio.hbc.linker"
    trim_vals = config["algorithm"]["simple_trims"]
    fastq_dir = utils.add_full_path(config["dir"]["fastq"])
    cur_files = [os.path.join(fastq_dir, x["file"]) for x in config["experiments"]]
    dirs = {"config": utils.add_full_path(os.path.dirname(system_config_file)),
            "work": os.getcwd(),
            "align": utils.add_full_path(config["dir"]["align"])}
    dirs["galaxy"] = os.path.dirname(utils.add_full_path(config["galaxy_config"],
                                                         dirs["config"]))
    config["dir"]["trim"] = utils.add_full_path(config["dir"]["work_trim"])
    config["dir"]["fastq"] = fastq_dir
    config["dir"]["work_fastq"] = utils.add_full_path(config["dir"]["work_fastq"])
    run_parallel = parallel_runner(run_module, dirs, config, system_config_file)
    aligned = []
    # Iterate over trimming rounds; the trim_vals lists share a common length.
    for i in range(len(list(trim_vals.values())[0])):
        print(cur_files)
        in_args = [(f, i, trim_vals, config) for f in cur_files]
        align_trimmed_files = run_parallel("trim_with_aligner", in_args)
        cur_files = [x["unaligned"] for x in align_trimmed_files if x["unaligned"]]
        aligned.append([x["aligned"] for x in align_trimmed_files])
    trimmed_fastq = combine_aligned(aligned, config)
    align_bams = do_alignment(trimmed_fastq, config, dirs, run_parallel)
    count_files = count_targets(align_bams, config)
    combine.identify_top_ranked(count_files, config)
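# The `main` driver above trims iteratively: round i applies the i-th trim
# setting, keeps reads that aligned, and carries still-unaligned reads into
# the next round. A toy illustration of that accumulation logic, with a
# hypothetical `trim_and_align` standing in for the "trim_with_aligner"
# parallel step:
def trim_and_align(read, i):
    # Hypothetical rule: a read "aligns" once trimming brings it to <= 20 bases.
    hit = len(read) - i <= 20
    return {"aligned": read if hit else None,
            "unaligned": None if hit else read}

cur_reads = ["A" * 24, "A" * 21, "A" * 30]
aligned = []
for i in range(3):
    results = [trim_and_align(r, i) for r in cur_reads]
    cur_reads = [x["unaligned"] for x in results if x["unaligned"]]
    aligned.append([x["aligned"] for x in results if x["aligned"]])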
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs,
                                             run_parallel, parallel, config)
            for xs in pipeline.run(config, config_file, run_parallel,
                                   parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
    qcsummary.write_metrics(final, dirs)
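# `run_main` above points `tempfile.tempdir` at a directory yielded by
# `utils.curdir_tmpdir()` so all temporary files land under the working
# directory. A minimal sketch of such a context manager, assuming only that
# it yields a temp directory under the current directory and removes it on
# exit (the real bcbio helper may differ in naming and cleanup details):
import contextlib
import os
import shutil
import tempfile

@contextlib.contextmanager
def curdir_tmpdir():
    tmp_dir = os.path.join(os.getcwd(), "tmp")
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
    try:
        yield tmp_dir
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)

# Usage mirroring run_main: the whole pipeline runs inside the context.
with curdir_tmpdir() as tmpdir:
    tempfile.tempdir = tmpdir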
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)
    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)
    #for item in lane_items:
    #    utils.prettyprint_dict(item)
    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)
    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> realign sample")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> variantcall")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> postprocess_variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> combine_multiple_callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> detect_sv")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> combine_calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> process_sample")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    var_dir = os.path.join(self.data_dir, "variants")
    files = [os.path.join(var_dir, "S1-variants.vcf"),
             os.path.join(var_dir, "S2-variants.vcf")]
    out_file = os.path.join(var_dir, "S1_S2-combined.vcf")
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    run_parallel = parallel_runner({"type": "local", "cores": 1}, {}, config)
    region_dir = os.path.join(var_dir, "S1_S2-combined-regions")
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    if os.path.exists(out_file):
        os.remove(out_file)
    vcfutils.parallel_combine_variants(files, out_file, ref_file, config,
                                       run_parallel)
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    write_metrics(run_info, fc_name, fc_date, dirs)
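# Several `run_main` variants above call `_pair_lanes_with_pipelines(lane_items)`
# and then iterate over pipeline -> items. A toy sketch of that grouping,
# assuming each lane item names its analysis type; the real lookup from
# analysis name to pipeline object is more involved:
import collections

def pair_lanes_with_pipelines(lane_items, pipeline_for_analysis):
    paired = collections.defaultdict(list)
    for item in lane_items:
        paired[pipeline_for_analysis[item["analysis"]]].append(item)
    return dict(paired)

items = [{"analysis": "variant", "lane": 1}, {"analysis": "variant", "lane": 2}]
pipelines = pair_lanes_with_pipelines(items, {"variant": "Variant2Pipeline"})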
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("recalibrate_sample", samples)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("process_sample", samples)
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
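# The two `run_main` variants above build their runner from a module name
# (run_module = "bcbio.distributed") and then dispatch steps by string, e.g.
# run_parallel("process_lane", lanes). A minimal local sketch of name-based
# dispatch, assuming the runner resolves each step to a function in that
# module and maps it over argument tuples; the real runner fans work out to
# multiprocessing or a cluster instead of a list comprehension:
import importlib
import os

def parallel_runner(run_module, dirs, config, config_file=None):
    module = importlib.import_module(run_module)
    def run_fn(fn_name, items):
        fn = getattr(module, fn_name)
        return [fn(*args) for args in items]
    return run_fn

# Demo against a stdlib module to keep the sketch self-contained:
runner = parallel_runner("os.path", {}, {})
assert runner("join", [("a", "b")]) == [os.path.join("a", "b")]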
def merge_vcf_files(sample_files, cores, config):
    out_file = config["outputs"]["merge"]
    config["algorithm"] = {}
    run_parallel = parallel_runner({"type": "local", "cores": min(cores, 8)},
                                   {}, config)
    vcfutils.parallel_combine_variants(sample_files, out_file,
                                       config["ref"]["GRCh37"], config,
                                       run_parallel)
    return out_file
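# Example invocation of `merge_vcf_files` above. The paths and any config
# keys beyond the ones the function actually reads ("outputs" -> "merge",
# "ref" -> "GRCh37") are hypothetical:
config = {"outputs": {"merge": "/data/combined.vcf"},
          "ref": {"GRCh37": "/data/genomes/GRCh37/GRCh37.fa"}}
sample_files = ["/data/S1.vcf", "/data/S2.vcf"]
combined = merge_vcf_files(sample_files, cores=4, config=config)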
def _run_toplevel(config, config_file, work_dir, parallel,
                  fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    parallel = log.create_base_logger(config, parallel)
    log.setup_local_logging(config, parallel)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_items = run_info.organize(dirs, config, run_info_yaml)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    lane_items = lane.process_all_lanes(run_items, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    final = []
    with utils.curdir_tmpdir() as tmpdir:
        tempfile.tempdir = tmpdir
        for pipeline, pipeline_items in pipelines.items():
            pipeline_items = _add_provenance(pipeline_items, dirs,
                                             run_parallel, parallel, config)
            versioncheck.testall(pipeline_items)
            for xs in pipeline.run(config, config_file, run_parallel,
                                   parallel, dirs, pipeline_items):
                if len(xs) == 1:
                    upload.from_sample(xs[0])
                    final.append(xs[0])
def parallel_callable_loci(in_bam, ref_file, config):
    num_cores = config["algorithm"].get("num_cores", 1)
    data = {"work_bam": in_bam, "sam_ref": ref_file, "config": config}
    parallel = {"type": "local", "cores": num_cores, "module": "bcbio.distributed"}
    runner = parallel_runner(parallel, {}, config)
    split_fn = shared.process_bam_by_chromosome("-callable.bed", "work_bam")
    out = parallel_split_combine([[data]], split_fn, runner,
                                 "calc_callable_loci", "combine_bed",
                                 "callable_bed", ["config"])[0]
    return out[0]["callable_bed"]
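# `parallel_callable_loci` above relies on the split/apply/combine idiom:
# `process_bam_by_chromosome` fans one BAM out into per-chromosome pieces,
# the "calc_callable_loci" step runs on each piece, and "combine_bed" merges
# the per-piece outputs back into a single record under `callable_bed`. A
# self-contained toy version of that control flow (the real
# `parallel_split_combine` also threads extra arguments and dispatches
# through the parallel runner):
def split_combine(data, split_fn, process_fn, combine_fn, out_key):
    pieces = split_fn(data)
    results = [process_fn(piece) for piece in pieces]
    data[out_key] = combine_fn(results)
    return data

record = {"work_bam": "sample.bam", "regions": ["chr1", "chr2", "chr3"]}
out = split_combine(record,
                    split_fn=lambda d: d["regions"],
                    process_fn=lambda region: "%s-callable.bed" % region,
                    combine_fn=sorted,
                    out_key="callable_bed")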
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
def test_1_parallel_vcf_combine(self):
    """Parallel combination of VCF files, split by chromosome.
    """
    files = [os.path.join(self.var_dir, "S1-variants.vcf"),
             os.path.join(self.var_dir, "S2-variants.vcf")]
    ref_file = os.path.join(self.data_dir, "genomes", "hg19", "seq", "hg19.fa")
    config = load_config(os.path.join(self.data_dir, "automated",
                                      "post_process-sample.yaml"))
    run_parallel = parallel_runner({"type": "local", "cores": 1}, {}, config)
    region_dir = os.path.join(self.var_dir, "S1_S2-combined-regions")
    if os.path.exists(region_dir):
        shutil.rmtree(region_dir)
    if os.path.exists(self.combo_file):
        os.remove(self.combo_file)
    vcfutils.parallel_combine_variants(files, self.combo_file, ref_file,
                                       config, run_parallel)
    for fname in files:
        if os.path.exists(fname + ".gz"):
            subprocess.check_call(["gunzip", fname + ".gz"])
        if os.path.exists(fname + ".gz.tbi"):
            os.remove(fname + ".gz.tbi")
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir)
                                                        if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(parallel, dirs, config, config_file)
    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = lane.process_all_lanes(lanes, run_parallel)
    pipelines = _pair_lanes_with_pipelines(lane_items)
    for pipeline, pipeline_items in pipelines.items():
        pipeline_items = _add_provenance(pipeline_items, dirs, config)
        for xs in pipeline.run(config, config_file, run_parallel, dirs, pipeline_items):
            assert len(xs) == 1
            upload.from_sample(xs[0])
    qcsummary.write_metrics(run_info, fc_name, fc_date, dirs)
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("process_lane")
    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    _add_to_compress(to_compress, lane_items, 'lane_items')
    prog.dummy()
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    _add_to_compress(to_compress, align_items, 'align_items')
    prog.dummy()
    prog.progress("process_alignment")
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.dummy()
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    _add_to_compress(to_compress, samples, 'samples')
    prog.dummy()
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.dummy()
    prog.progress("write_metrics")
    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        sizes = run_parallel("compress_files", [[[cf]] for cf in to_compress])
        before = sum([s[0] for s in sizes])
        after = sum([s[1] for s in sizes])
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
def run_main(config, config_file, fc_dir, work_dir, run_info_yaml):
    _record_sw_versions(config, os.path.join(work_dir, "bcbb_software_versions.txt"))
    prog = utils.RecordProgress(work_dir)
    to_compress = set()
    prog.progress("analysis_start")
    align_dir = os.path.join(work_dir, "alignments")
    run_module = "bcbio.distributed"
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir),
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    run_parallel = parallel_runner(run_module, dirs, config, config_file)
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    for f in lane_items[0][0:2]:
        to_compress.add(f)
    prog.progress("process_lane")
    # upload the sequencing report to Google Docs
    # will skip this for now and rely on external mechanism for uploading this data
    #gdocs_indicator = os.path.join(work_dir, "gdocs_report_complete.txt")
    #if not os.path.exists(gdocs_indicator) \
    #and queue_report(fc_date, fc_name, os.path.abspath(run_info_yaml), dirs, config, config_file):
    #    utils.touch_file(gdocs_indicator)
    # Remove spiked in controls, contaminants etc.
    lane_items = run_parallel("remove_contaminants", lane_items)
    for f in lane_items[0][0:2]:
        to_compress.add(f)
    prog.progress("remove_contaminants")
    align_items = run_parallel("process_alignment", lane_items)
    for f in align_items[0]['fastq']:
        to_compress.add(f)
    prog.progress("process_alignment")
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("merge_sample")
    samples = run_parallel("mark_duplicates_sample", samples)
    to_compress.add(samples[0][0]['fastq1'])
    to_compress.add(samples[0][0]['fastq2'])
    prog.progress("mark_duplicates_sample")
    run_parallel("screen_sample_contaminants", samples)
    prog.progress("screen_sample_contaminants")
    samples = run_parallel("recalibrate_sample", samples)
    prog.progress("recalibrate_sample")
    samples = parallel_realign_sample(samples, run_parallel)
    prog.progress("realign_sample")
    samples = parallel_variantcall(samples, run_parallel)
    prog.progress("variantcall")
    samples = run_parallel("detect_sv", samples)
    prog.progress("detect_sv")
    samples = run_parallel("process_sample", samples)
    prog.progress("process_sample")
    samples = run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    prog.progress("generate_bigwig")
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
    prog.progress("write_metrics")
    # Write statusdb metrics
    # will skip this for now and rely on external mechanism for uploading this data
    #report_to_statusdb(fc_name, fc_date, run_info_yaml, dirs, config)
    # Compress all files in to_compress
    if config['algorithm'].get('compress_files', True):
        (before, after) = utils.compress_files(to_compress)
        logger.info("Space used by the files before compressing (in bytes): "
                    + str(before))
        logger.info("Space used by the files after compressing (in bytes): "
                    + str(after))
        logger.info("Saved space (in bytes): " + str(before - after))
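# Both `run_main` variants above finish by compressing the accumulated
# `to_compress` set and logging bytes saved. A minimal sketch of that
# bookkeeping, assuming a hypothetical gzip-based `compress_file` that
# returns (size_before, size_after) per file:
import gzip
import os
import shutil

def compress_file(fname):
    before = os.path.getsize(fname)
    with open(fname, "rb") as src:
        with gzip.open(fname + ".gz", "wb") as dst:
            shutil.copyfileobj(src, dst)
    os.remove(fname)
    return before, os.path.getsize(fname + ".gz")

def compress_files(to_compress):
    sizes = [compress_file(f) for f in to_compress]
    before = sum(s[0] for s in sizes)
    after = sum(s[1] for s in sizes)
    return before, after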