Ejemplo n.º 1
0
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for d in joint.square_off(samples, run_parallel):
            out.extend(
                [[to_single_data(xs)]
                 for xs in multi.split_variants_by_sample(to_single_data(d))])
        samples = out
    if variantcaller:
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for data in (to_single_data(xs) for xs in samples):
            if "variants" not in data:
                data["variants"] = []
            data["variants"].append({
                "variantcaller": "gatk-haplotype",
                "vcf": data["vrn_file_orig"],
                "population": {
                    "vcf": data["vrn_file"]
                }
            })
            data["vrn_file"] = data.pop("vrn_file_orig")
            out.append([data])
        samples = out
    return samples
Ejemplo n.º 2
0
    def run(self, config, config_file, parallel, dirs, samples):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                              (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                        samples, config, dirs, "multicore",
                        multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
            with profile.report("alignment preparation", dirs):
                samples = run_parallel("prep_align_inputs", samples)
                samples = disambiguate.split(samples)
            with profile.report("alignment", dirs):
                samples = run_parallel("process_alignment", samples)
                samples = alignprep.merge_split_alignments(samples, run_parallel)
                samples = disambiguate.resolve(samples, run_parallel)
            with profile.report("callable regions", dirs):
                samples = run_parallel("postprocess_alignment", samples)
                samples = run_parallel("combine_sample_regions", [samples])
                samples = region.clean_sample_data(samples)
            with profile.report("coverage", dirs):
                samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                        samples, config, dirs, "full",
                        multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
            with profile.report("alignment post-processing", dirs):
                samples = region.parallel_prep_region(samples, run_parallel)
            with profile.report("variant calling", dirs):
                samples = genotype.parallel_variantcall_region(samples, run_parallel)

        ## Finalize variants (per-sample cluster)
        with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"]),
                        samples, config, dirs, "persample") as run_parallel:
            with profile.report("joint squaring off/backfilling", dirs):
                samples = joint.square_off(samples, run_parallel)
            with profile.report("variant post-processing", dirs):
                samples = run_parallel("postprocess_variants", samples)
                samples = run_parallel("split_variants_by_sample", samples)
            with profile.report("validation", dirs):
                samples = run_parallel("compare_to_rm", samples)
                samples = genotype.combine_multiple_callers(samples)
        ## Finalizing BAMs and population databases, handle multicore computation
        with prun.start(_wres(parallel, ["gemini", "samtools", "fastqc", "bamtools", "bcbio_variation",
                                         "bcbio-variation-recall"]),
                        samples, config, dirs, "multicore2") as run_parallel:
            with profile.report("prepped BAM merging", dirs):
                samples = region.delayed_bamprep_merge(samples, run_parallel)
            with profile.report("ensemble calling", dirs):
                samples = ensemble.combine_calls_parallel(samples, run_parallel)
            with profile.report("validation summary", dirs):
                samples = validate.summarize_grading(samples)
            with profile.report("structural variation", dirs):
                samples = structural.run(samples, run_parallel)
            with profile.report("population database", dirs):
                samples = population.prep_db_parallel(samples, run_parallel)
            with profile.report("quality control", dirs):
                samples = qcsummary.generate_parallel(samples, run_parallel)
            with profile.report("archive", dirs):
                samples = archive.compress(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Ejemplo n.º 3
0
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        samples = joint.square_off(samples, run_parallel)
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    return samples
Ejemplo n.º 4
0
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for d in joint.square_off(samples, run_parallel):
            out.extend([[to_single_data(xs)] for xs in multi.split_variants_by_sample(to_single_data(d))])
        samples = out
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    return samples
Ejemplo n.º 5
0
def rnaseq_variant_calling(samples, run_parallel):
    """
    run RNA-seq variant calling using GATK
    """
    samples = run_parallel("run_rnaseq_variant_calling", samples)
    variantcaller = dd.get_variantcaller(to_single_data(samples[0]))
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for d in joint.square_off(samples, run_parallel):
            out.extend([[to_single_data(xs)] for xs in multi.split_variants_by_sample(to_single_data(d))])
        samples = out
    if variantcaller:
        samples = run_parallel("run_rnaseq_ann_filter", samples)
    if variantcaller and ("gatk-haplotype" in variantcaller):
        out = []
        for data in (to_single_data(xs) for xs in samples):
            if "variants" not in data:
                data["variants"] = []
            data["variants"].append({"variantcaller": "gatk-haplotype", "vcf": data["vrn_file_orig"],
                                     "population": {"vcf": data["vrn_file"]}})
            data["vrn_file"] = data.pop("vrn_file_orig")
            out.append([data])
        samples = out
    return samples
Ejemplo n.º 6
0
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(
            _wres(
                parallel, ["aligner", "samtools", "sambamba"],
                (["reference", "fasta"], ["reference", "aligner"], ["files"])),
            samples,
            config,
            dirs,
            "multicore",
            multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[
                dirs, config, run_info_yaml,
                [x[0]["description"] for x in samples]
            ]])
        ww = initialize_watcher(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_parallel("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples,
                    config,
                    dirs,
                    "full",
                    multiplier=region.get_max_counts(samples),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(
                samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, [
            "gatk", "gatk-vqsr", "snpeff", "bcbio_variation", "gemini",
            "samtools", "fastqc", "sambamba", "bcbio-variation-recall",
            "qsignature", "svcaller"
    ]),
                    samples,
                    config,
                    dirs,
                    "multicore2",
                    multiplier=structural.parallel_multiplier(
                        samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation precall", dirs):
            samples = structural.run(samples, run_parallel, "precall")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
Ejemplo n.º 7
0
    def run(self, config, config_file, parallel, dirs, samples):
        ## Alignment and preparation requiring the entire input file (multicore cluster)
        with prun.start(_wres(
                parallel, ["aligner", "samtools", "sambamba"],
            (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                        samples,
                        config,
                        dirs,
                        "multicore",
                        multiplier=alignprep.parallel_multiplier(
                            samples)) as run_parallel:
            with profile.report("alignment preparation", dirs):
                samples = run_parallel("prep_align_inputs", samples)
                samples = disambiguate.split(samples)
            with profile.report("alignment", dirs):
                samples = run_parallel("process_alignment", samples)
                samples = alignprep.merge_split_alignments(
                    samples, run_parallel)
                samples = disambiguate.resolve(samples, run_parallel)
            with profile.report("callable regions", dirs):
                samples = run_parallel("postprocess_alignment", samples)
                samples = run_parallel("combine_sample_regions", [samples])
                samples = region.clean_sample_data(samples)
            with profile.report("coverage", dirs):
                samples = coverage.summarize_samples(samples, run_parallel)

        ## Variant calling on sub-regions of the input file (full cluster)
        with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                        samples,
                        config,
                        dirs,
                        "full",
                        multiplier=region.get_max_counts(samples),
                        max_multicore=1) as run_parallel:
            with profile.report("alignment post-processing", dirs):
                samples = region.parallel_prep_region(samples, run_parallel)
            with profile.report("variant calling", dirs):
                samples = genotype.parallel_variantcall_region(
                    samples, run_parallel)

        ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
        with prun.start(
                _wres(parallel, [
                    "gatk", "gatk-vqsr", "snpeff", "bcbio_variation", "gemini",
                    "samtools", "fastqc", "bamtools", "bcbio-variation-recall",
                    "qsignature"
                ]), samples, config, dirs, "multicore2") as run_parallel:
            with profile.report("joint squaring off/backfilling", dirs):
                samples = joint.square_off(samples, run_parallel)
            with profile.report("variant post-processing", dirs):
                samples = run_parallel("postprocess_variants", samples)
                samples = run_parallel("split_variants_by_sample", samples)
            with profile.report("prepped BAM merging", dirs):
                samples = region.delayed_bamprep_merge(samples, run_parallel)
            with profile.report("validation", dirs):
                samples = run_parallel("compare_to_rm", samples)
                samples = genotype.combine_multiple_callers(samples)
            with profile.report("ensemble calling", dirs):
                samples = ensemble.combine_calls_parallel(
                    samples, run_parallel)
            with profile.report("validation summary", dirs):
                samples = validate.summarize_grading(samples)
            with profile.report("structural variation", dirs):
                samples = structural.run(samples, run_parallel)
            with profile.report("population database", dirs):
                samples = population.prep_db_parallel(samples, run_parallel)
            with profile.report("quality control", dirs):
                samples = qcsummary.generate_parallel(samples, run_parallel)
            with profile.report("archive", dirs):
                samples = archive.compress(samples, run_parallel)
        logger.info("Timing: finished")
        return samples
Ejemplo n.º 8
0
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner", "samtools", "sambamba"],
                            (["reference", "fasta"], ["reference", "aligner"], ["files"])),
                    samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples", [[dirs, config, run_info_yaml,
                                                            [x[0]["description"] for x in samples]]])
        ww = WorldWatcher(dirs["work"], is_on=any([dd.get_cwl_reporting(d[0]) for d in samples]))
        ww.initialize(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_parallel("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_parallel)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_parallel("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("structural variation initial", dirs):
            samples = structural.run(samples, run_parallel, "initial")
            ww.report("sv_initial", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_parallel)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    with prun.start(_wres(parallel, ["gatk", "picard", "variantcaller"]),
                    samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples), max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    with prun.start(_wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                                     "gemini", "samtools", "fastqc", "bamtools",
                                     "bcbio-variation-recall", "qsignature",
                                     "svcaller"]),
                    samples, config, dirs, "multicore2",
                    multiplier=structural.parallel_multiplier(samples)) as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation final", dirs):
            samples = structural.run(samples, run_parallel, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_parallel, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_parallel("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples