Example #1
    def run(self, config, config_file, parallel, dirs, samples):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                              (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                        samples, config, dirs, "multicore",
                        multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
            with profile.report("alignment preparation", dirs):
                samples = run_parallel("prep_align_inputs", samples)
                samples = disambiguate.split(samples)
            with profile.report("alignment", dirs):
                samples = run_parallel("process_alignment", samples)
                samples = alignprep.merge_split_alignments(samples, run_parallel)
                samples = disambiguate.resolve(samples, run_parallel)
            with profile.report("callable regions", dirs):
                samples = run_parallel("postprocess_alignment", samples)
                samples = run_parallel("combine_sample_regions", [samples])
                samples = region.clean_sample_data(samples)
            with profile.report("coverage", dirs):
                samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                        samples, config, dirs, "full",
                        multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
            with profile.report("alignment post-processing", dirs):
                samples = region.parallel_prep_region(samples, run_parallel)
            with profile.report("variant calling", dirs):
                samples = genotype.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"]),
                        samples, config, dirs, "persample") as run_parallel:
            with profile.report("joint squaring off/backfilling", dirs):
                samples = joint.square_off(samples, run_parallel)
            with profile.report("variant post-processing", dirs):
                samples = run_parallel("postprocess_variants", samples)
                samples = run_parallel("split_variants_by_sample", samples)
            with profile.report("validation", dirs):
                samples = run_parallel("compare_to_rm", samples)
                samples = genotype.combine_multiple_callers(samples)
        ## Finalizing BAMs and population databases, handling multicore computation
        with prun.start(_wres(parallel, ["gemini", "samtools", "fastqc", "bamtools", "bcbio_variation",
                                         "bcbio-variation-recall"]),
                        samples, config, dirs, "multicore2") as run_parallel:
            with profile.report("prepped BAM merging", dirs):
                samples = region.delayed_bamprep_merge(samples, run_parallel)
            with profile.report("ensemble calling", dirs):
                samples = ensemble.combine_calls_parallel(samples, run_parallel)
            with profile.report("validation summary", dirs):
                samples = validate.summarize_grading(samples)
            with profile.report("structural variation", dirs):
                samples = structural.run(samples, run_parallel)
            with profile.report("population database", dirs):
                samples = population.prep_db_parallel(samples, run_parallel)
            with profile.report("quality control", dirs):
                samples = qcsummary.generate_parallel(samples, run_parallel)
            with profile.report("archive", dirs):
                samples = archive.compress(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Example #2
File: main.py Project: kevyin/bcbb
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    align_dir = os.path.join(work_dir, "alignments")
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir, "align": align_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    ## process each flowcell lane
    #run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    #lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    #lane_items = run_parallel("process_lane", lanes)

    logger.info(">>> Parse lane")
    lane_items = parse_lane(run_info["details"], fc_name, fc_date, dirs, config)

    #for item in lane_items:
        #utils.prettyprint_dict(item)

    logger.info(">>> Process alignment")
    align_items = run_parallel("process_alignment", lane_items)

    ## process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    logger.info(">>> Merge samples")
    samples = run_parallel("merge_sample", samples)
    logger.info(">>> Recalibrate samples")
    samples = run_parallel("recalibrate_sample", samples)
    logger.info(">>> realign sample")
    samples = parallel_realign_sample(samples, run_parallel)
    logger.info(">>> variantcall")
    samples = parallel_variantcall(samples, run_parallel)
    logger.info(">>> postprocess_variants")
    samples = run_parallel("postprocess_variants", samples)
    logger.info(">>> combine_multiple_callers")
    samples = combine_multiple_callers(samples)
    logger.info(">>> detect_sv")
    samples = run_parallel("detect_sv", samples)
    logger.info(">>> combine_calls")
    samples = run_parallel("combine_calls", samples)
    logger.info(">>> process_sample")
    run_parallel("process_sample", samples)
    logger.info(">>> Generate bigwig")
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    logger.info(">>> Writing project summary")
    write_project_summary(samples)
    logger.info(">>> Writing metrics")
    write_metrics(run_info, fc_name, fc_date, dirs)
    logger.info(">>> Done")
Example #3
    def run(self, config, config_file, parallel, dirs, samples):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                              (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                        samples, config, dirs, "multicore",
                        multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
            with profile.report("alignment preparation", dirs):
                samples = run_parallel("prep_align_inputs", samples)
                samples = disambiguate.split(samples)
            with profile.report("alignment", dirs):
                samples = run_parallel("process_alignment", samples)
                samples = alignprep.merge_split_alignments(samples, run_parallel)
                samples = disambiguate.resolve(samples, run_parallel)
            with profile.report("callable regions", dirs):
                samples = run_parallel("postprocess_alignment", samples)
                samples = run_parallel("combine_sample_regions", [samples])
                samples = region.clean_sample_data(samples)
            with profile.report("coverage", dirs):
                samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                        samples, config, dirs, "full",
                        multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
            with profile.report("alignment post-processing", dirs):
                samples = region.parallel_prep_region(samples, run_parallel)
            with profile.report("variant calling", dirs):
                samples = genotype.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"]),
                        samples, config, dirs, "persample") as run_parallel:
            with profile.report("variant post-processing", dirs):
                samples = run_parallel("postprocess_variants", samples)
                samples = run_parallel("split_variants_by_sample", samples)
            with profile.report("validation", dirs):
                samples = run_parallel("compare_to_rm", samples)
                samples = genotype.combine_multiple_callers(samples)
        ## Finalizing BAMs and population databases, handling multicore computation
        with prun.start(_wres(parallel, ["gemini", "samtools", "fastqc", "bamtools", "bcbio_variation",
                                         "bcbio-variation-recall"]),
                        samples, config, dirs, "multicore2") as run_parallel:
            with profile.report("prepped BAM merging", dirs):
                samples = region.delayed_bamprep_merge(samples, run_parallel)
            with profile.report("ensemble calling", dirs):
                samples = ensemble.combine_calls_parallel(samples, run_parallel)
            with profile.report("validation summary", dirs):
                samples = validate.summarize_grading(samples)
            with profile.report("structural variation", dirs):
                samples = structural.run(samples, run_parallel)
            with profile.report("population database", dirs):
                samples = population.prep_db_parallel(samples, run_parallel)
            with profile.report("quality control", dirs):
                samples = qcsummary.generate_parallel(samples, run_parallel)
            with profile.report("archive", dirs):
                samples = archive.compress(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Example #4
    def run(self, config, config_file, run_parallel, dirs, lane_items):
        # Handle alignment and preparation requiring the entire input file
        samples = run_parallel("align_prep_full", (list(x) + [config_file] for x in lane_items))
        samples = callable.combine_sample_regions(samples)
        # Handle all variant calling on sub-regions of the input file
        samples = region.parallel_prep_region(samples, run_parallel)
        samples = region.parallel_variantcall_region(samples, run_parallel)
        samples = combine_multiple_callers(samples)
        samples = run_parallel("combine_calls", samples)
        return samples
Example #5
    def run(self, config, config_file, run_parallel, parallel, dirs, samples):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with global_parallel(parallel, "multicore", ["process_alignment", "postprocess_alignment"],
                             samples, dirs, config,
                             multiplier=alignprep.parallel_multiplier(samples)) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment")
            samples = run_parallel("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = run_parallel("postprocess_alignment", samples)
            regions = callable.combine_sample_regions(samples)
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)
            logger.info("Timing: coverage")
            samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with global_parallel(parallel, "full", ["piped_bamprep", "variantcall_sample"],
                             samples, dirs, config,
                             multiplier=len(regions["analysis"]), max_multicore=1) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment post-processing")
            samples = region.parallel_prep_region(samples, regions, run_parallel)
            logger.info("Timing: variant calling")
            samples = region.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with global_parallel(parallel, "persample", ["postprocess_variants"],
                             samples, dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: variant post-processing")
            samples = run_parallel("postprocess_variants", samples)
            logger.info("Timing: validation")
            samples = run_parallel("compare_to_rm", samples)
            samples = combine_multiple_callers(samples)
            logger.info("Timing: ensemble calling")
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
            samples = validate.summarize_grading(samples)
        ## Finalizing BAMs and population databases, handling multicore computation
        with global_parallel(parallel, "multicore2", ["prep_gemini_db", "delayed_bam_merge"],
                             samples, dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: prepped BAM merging")
            samples = region.delayed_bamprep_merge(samples, run_parallel)
            logger.info("Timing: structural variation")
            samples = structural.run(samples, run_parallel)
            logger.info("Timing: population database")
            samples = population.prep_db_parallel(samples, run_parallel)
            logger.info("Timing: quality control")
            samples = qcsummary.generate_parallel(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Example #6
    def run(self, config, config_file, run_parallel, parallel, dirs,
            lane_items):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with global_parallel(parallel, "multicore", ["align_prep_full"],
                             lane_items, dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment")
            samples = run_parallel(
                "align_prep_full",
                [list(x) + [config_file] for x in lane_items])
            regions = callable.combine_sample_regions(samples)
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)
            logger.info("Timing: coverage")
            samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with global_parallel(parallel,
                             "full", ["piped_bamprep", "variantcall_sample"],
                             samples,
                             dirs,
                             config,
                             multiplier=len(regions["analysis"])) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment post-processing")
            samples = region.parallel_prep_region(samples, regions,
                                                  run_parallel)
            logger.info("Timing: variant calling")
            samples = region.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with global_parallel(parallel, "persample", ["postprocess_variants"],
                             samples, dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: variant post-processing")
            samples = run_parallel("postprocess_variants", samples)
            logger.info("Timing: validation")
            samples = run_parallel("compare_to_rm", samples)
            samples = combine_multiple_callers(samples)
            logger.info("Timing: ensemble calling")
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
            samples = validate.summarize_grading(samples)
            logger.info("Timing: quality control")
            samples = qcsummary.generate_parallel(samples, run_parallel)
        ## Finalizing BAMs and population databases, handling multicore computation
        with global_parallel(parallel, "multicore2",
                             ["prep_gemini_db", "delayed_bam_merge"], samples,
                             dirs, config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: prepped BAM merging")
            samples = region.delayed_bamprep_merge(samples, run_parallel)
            logger.info("Timing: population database")
            samples = population.prep_db_parallel(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Example #7
    def run(self, config, config_file, run_parallel, parallel, dirs,
            lane_items):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with global_parallel(parallel, "multicore", ["align_prep_full"],
                             lane_items, dirs["work"], config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment")
            samples = run_parallel(
                "align_prep_full",
                [list(x) + [config_file] for x in lane_items])
            regions = callable.combine_sample_regions(samples)
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)

        ## Variant calling on sub-regions of the input file (full cluster)
        with global_parallel(
                parallel,
                "full", ["piped_bamprep", "variantcall_sample"],
                samples,
                dirs["work"],
                config,
                multiplier=len(regions["analysis"])) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: alignment post-processing")
            samples = region.parallel_prep_region(samples, regions,
                                                  run_parallel)
            logger.info("Timing: variant calling")
            samples = region.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with global_parallel(parallel, "persample", ["postprocess_variants"],
                             samples, dirs["work"], config) as parallel:
            run_parallel = parallel_runner(parallel, dirs, config)
            logger.info("Timing: variant post-processing")
            samples = run_parallel("postprocess_variants", samples)
            samples = combine_multiple_callers(samples)
            logger.info("Timing: ensemble calling")
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
            logger.info("Timing: prepped BAM merging")
            samples = region.delayed_bamprep_merge(samples, run_parallel)
            logger.info("Timing: validation")
            samples = run_parallel("compare_to_rm", samples)
            samples = validate.summarize_grading(samples)
            logger.info("Timing: population database")
            samples = population.prep_db_parallel(samples, run_parallel)
            logger.info("Timing: quality control")
            samples = qcsummary.generate_parallel(samples, run_parallel)
            logger.info("Timing: finished")
        return samples
Example #8
File: main.py Project: jme9/wabio
def run_main(config,
             config_file,
             work_dir,
             parallel,
             fc_dir=None,
             run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(
        get_fastq_dir(fc_dir) if fc_dir else None, config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {
        "fastq": fastq_dir,
        "galaxy": galaxy_dir,
        "work": work_dir,
        "flowcell": fc_dir,
        "config": config_dir
    }
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"],
                                           fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #9
    def run(self, config, config_file, run_parallel, dirs, lane_items):
        # Handle alignment and preparation requiring the entire input file
        samples = run_parallel("align_prep_full", (list(x) + [config_file] for x in lane_items))
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        # Handle all variant calling on sub-regions of the input file
        samples = region.clean_sample_data(samples)
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        samples = region.parallel_variantcall_region(samples, run_parallel)
        samples = run_parallel("postprocess_variants", samples)
        samples = combine_multiple_callers(samples)
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = population.prep_db_parallel(samples, run_parallel)
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        samples = qcsummary.generate_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)
        return samples
Example #10
    def run(self, config, config_file, run_parallel, dirs, lane_items):
        lane_items = run_parallel("trim_lane", lane_items)
        align_items = run_parallel("process_alignment", lane_items)
        # process samples, potentially multiplexed across multiple lanes
        samples = organize_samples(align_items, dirs, config_file)
        samples = run_parallel("merge_sample", samples)
        samples = run_parallel("prep_recal", samples)
        samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
        samples = parallel_realign_sample(samples, run_parallel)
        samples = parallel_variantcall(samples, run_parallel)
        samples = run_parallel("postprocess_variants", samples)
        samples = combine_multiple_callers(samples)
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = run_parallel("detect_sv", samples)
        samples = qcsummary.generate_parallel(samples, run_parallel)
        run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
        return samples
Example #11
    def run(self, config, config_file, run_parallel, dirs, lane_items):
        # Handle alignment and preparation requiring the entire input file
        samples = run_parallel("align_prep_full",
                               (list(x) + [config_file] for x in lane_items))
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        # Handle all variant calling on sub-regions of the input file
        samples = region.clean_sample_data(samples)
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        samples = region.parallel_variantcall_region(samples, run_parallel)
        samples = run_parallel("postprocess_variants", samples)
        samples = combine_multiple_callers(samples)
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = population.prep_db_parallel(samples, run_parallel)
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        samples = qcsummary.generate_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)
        return samples
Example #12
    def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
        raise NotImplementedError("`variant` processing is deprecated: please use `variant2`. "
                                  "The next version will alias `variant` to the new `variant2` pipeline")
        lane_items = run_parallel("trim_lane", lane_items)
        align_items = run_parallel("process_alignment", lane_items)
        # process samples, potentially multiplexed across multiple lanes
        samples = organize_samples(align_items, dirs, config_file)
        samples = run_parallel("merge_sample", samples)
        samples = run_parallel("prep_recal", samples)
        samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
        samples = parallel_realign_sample(samples, run_parallel)
        samples = parallel_variantcall(samples, run_parallel)
        samples = run_parallel("postprocess_variants", samples)
        samples = combine_multiple_callers(samples)
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = run_parallel("detect_sv", samples)
        samples = qcsummary.generate_parallel(samples, run_parallel)
        run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
        return samples
Example #13
def run_main(config, config_file, work_dir, parallel,
             fc_dir=None, run_info_yaml=None):
    """Run toplevel analysis, processing a set of input files.

    config_file -- Main YAML configuration file with system parameters
    fc_dir -- Directory of fastq files to process
    run_info_yaml -- YAML configuration file specifying inputs to process
    """
    setup_logging(config)
    fc_name, fc_date, run_info = get_run_info(fc_dir, config, run_info_yaml)
    fastq_dir, galaxy_dir, config_dir = _get_full_paths(get_fastq_dir(fc_dir) if fc_dir else None,
                                                        config, config_file)
    config_file = os.path.join(config_dir, os.path.basename(config_file))
    dirs = {"fastq": fastq_dir, "galaxy": galaxy_dir,
            "work": work_dir, "flowcell": fc_dir, "config": config_dir}
    config = _set_resources(parallel, config)
    run_parallel = parallel_runner(parallel, dirs, config, config_file)

    # process each flowcell lane
    run_items = add_multiplex_across_lanes(run_info["details"], dirs["fastq"], fc_name)
    lanes = ((info, fc_name, fc_date, dirs, config) for info in run_items)
    lane_items = run_parallel("process_lane", lanes)
    align_items = run_parallel("process_alignment", lane_items)
    # process samples, potentially multiplexed across multiple lanes
    samples = organize_samples(align_items, dirs, config_file)
    samples = run_parallel("merge_sample", samples)
    samples = run_parallel("prep_recal", samples)
    samples = recalibrate.parallel_write_recal_bam(samples, run_parallel)
    samples = parallel_realign_sample(samples, run_parallel)
    samples = parallel_variantcall(samples, run_parallel)
    samples = run_parallel("postprocess_variants", samples)
    samples = combine_multiple_callers(samples)
    samples = run_parallel("detect_sv", samples)
    samples = run_parallel("combine_calls", samples)
    run_parallel("process_sample", samples)
    run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    write_project_summary(samples)
    write_metrics(run_info, fc_name, fc_date, dirs)
Example #14
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(
            _wres(
                parallel, ["aligner", "samtools", "sambamba"],
                (["reference", "fasta"], ["reference", "aligner"], ["files"])),
            samples,
            config,
            dirs,
            "multicore",
            multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[
                dirs, config, run_info_yaml,
                [x[0]["description"] for x in samples]
            ]])
        ww = initialize_watcher(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_parallel("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples,
                    config,
                    dirs,
                    "full",
                    multiplier=region.get_max_counts(samples),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(
                samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, [
            "gatk", "gatk-vqsr", "snpeff", "bcbio_variation", "gemini",
            "samtools", "fastqc", "sambamba", "bcbio-variation-recall",
            "qsignature", "svcaller"
    ]),
                    samples,
                    config,
                    dirs,
                    "multicore2",
                    multiplier=structural.parallel_multiplier(
                        samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation precall", dirs):
            samples = structural.run(samples, run_parallel, "precall")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
Example #15
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                            (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                            [x[0]["description"] for x in samples]]])
        ww = WorldWatcher(dirs["work"], is_on=any([dd.get_cwl_reporting(d[0]) for d in samples]))
        ww.initialize(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_parallel("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("structural variation initial", dirs):
            samples = structural.run(samples, run_parallel, "initial")
            ww.report("sv_initial", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                                     "gemini", "samtools", "fastqc", "bamtools",
                                     "bcbio-variation-recall", "qsignature",
                                     "svcaller"]),
                    samples, config, dirs, "multicore2",
                    multiplier=structural.parallel_multiplier(samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation final", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples