def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Drive the workflow as two parallel stages and return the processed samples.

    The "multicore" stage dispatches organize_samples, prepare_sample,
    trim_sample, disambiguate_split and process_alignment. The "persample"
    stage resolves disambiguation, dispatches clean_chipseq_alignment,
    generates the QC summary and uploads every sample one at a time.
    """
    # Stage 1: organization through alignment.
    align_resources = _wres(parallel, ["aligner", "picard"])
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        samples = run_parallel("prepare_sample", samples)
        samples = run_parallel("trim_sample", samples)
        # disambiguate_split receives the whole sample collection as one argument.
        samples = run_parallel("disambiguate_split", [samples])
        samples = run_parallel("process_alignment", samples)

    # Stage 2: disambiguation, cleanup, QC and upload.
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "persample") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        samples = run_parallel("clean_chipseq_alignment", samples)
        samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            for single in samples:
                run_parallel("upload_samples", [single])
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Drive the workflow as two parallel stages and return the uploaded samples.

    The "multicore" stage dispatches organize_samples, prepare_sample,
    trim_sample, disambiguate_split and process_alignment. The "persample"
    stage resolves disambiguation, dispatches clean_chipseq_alignment,
    generates the QC summary, uploads the samples and then dispatches
    upload_samples_project once per sample.
    """
    # Stage 1: organization through alignment.
    align_resources = _wres(parallel, ["aligner", "picard"])
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        samples = run_parallel("prepare_sample", samples)
        samples = run_parallel("trim_sample", samples)
        # disambiguate_split receives the whole sample collection as one argument.
        samples = run_parallel("disambiguate_split", [samples])
        samples = run_parallel("process_alignment", samples)

    # Stage 2: disambiguation, cleanup, QC and upload.
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "persample") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        samples = run_parallel("clean_chipseq_alignment", samples)
        samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for single in samples:
                run_parallel("upload_samples_project", [single])
    return samples
def run(self, config, config_file, parallel, dirs, lane_items):
    """Align lanes, process sub-regions, then merge prepared BAMs and run QC.

    Three parallel contexts are opened in turn ("multicore", "full",
    "multicore2"). The number of entries in ``regions["analysis"]`` is used as
    the multiplier for the sub-region stage. Returns the samples produced by
    the final QC summary step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with prun.start(_wres(parallel, ["aligner"]), lane_items, config, dirs,
                    "multicore") as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", lane_items)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            # combine_sample_regions takes all samples as a single argument;
            # only the first element of its result is used as the region info.
            regions = run_parallel("combine_sample_regions", [samples])[0]
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)

    ## Processing on sub regions
    with prun.start(_wres(parallel, ["gatk", "picard", "samtools"]),
                    samples, config, dirs, "full",
                    multiplier=len(regions["analysis"]),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, regions, run_parallel)
            samples = region.parallel_variantcall_region(samples, run_parallel)
            # Was a bare Python 2 ``print`` statement; send the diagnostic
            # through the logger so it is valid on Python 3 and stays off stdout.
            logger.debug("Samples after region variant calling: %s", len(samples))

    ## Finalize BAMs and QC
    with prun.start(_wres(parallel, ["fastqc", "bamtools", "samtools"]),
                    samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
            logger.debug("Samples after prepped BAM merging: %s", len(samples))
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def standardpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Organize and align samples, then run quality control and upload.

    Uses two parallel contexts: "multicore" for organization, alignment and
    callable-region steps, and "multicore2" for QC and upload. Returns the
    samples as returned by the upload_samples step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(parallel, ["aligner", "samtools", "sambamba"])
    with prun.start(align_resources, samples, config, dirs, "multicore") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
        with profile.report("callable regions", dirs):
            # prep_samples and combine_sample_regions take all samples as one argument.
            samples = run_parallel("prep_samples", [samples])
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)

    ## Quality control
    qc_programs = ["fastqc", "qsignature", "kraken", "gatk", "samtools"]
    qc_resources = _wres(parallel, qc_programs)
    with prun.start(qc_resources, samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for single in samples:
                run_parallel("upload_samples_project", [single])
    logger.info("Timing: finished")
    return samples
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    Organize RNA-seq and small-RNAseq samples, converting from BAM if
    necessary and trimming if necessary.

    Trimming runs when the analysis is small RNA-seq or when any sample sets
    ``algorithm -> trim_reads``; in that case "atropos" is added to the
    requested resources and no multicore cap is applied.
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    # Built as a full list (not a generator) so every sample is inspected.
    trim_flags = [tz.get_in(["algorithm", "trim_reads"], data)
                  for data in dd.sample_data_iterator(samples)]
    trim_reads_set = any(trim_flags)
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    resources = ["picard", "atropos"] if needs_trimming else ["picard"]
    # Without trimming the stage is restricted to a single core per job.
    core_cap = None if needs_trimming else 1
    with prun.start(_wres(parallel, resources), samples, config, dirs,
                    "trimming", max_multicore=core_cap) as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    trim_step = "trim_srna_sample"
                else:
                    trim_step = "trim_sample"
                samples = run_parallel(trim_step, samples)
    return samples
def rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples):
    """
    Organize RNA-seq and small-RNAseq samples, converting from BAM if
    necessary and trimming if necessary.

    Trimming runs when the analysis is small RNA-seq or when trimming is set
    on the samples; in that case "cutadapt" is added to the requested
    resources. The stage is always capped at one core per job.
    """
    pipeline = dd.get_in_samples(samples, dd.get_analysis)
    trim_reads_set = _is_trim_set(samples)
    needs_trimming = (_is_smallrnaseq(pipeline) or trim_reads_set)
    resources = ["picard", "cutadapt"] if needs_trimming else ["picard"]
    prep_resources = _wres(parallel, resources)
    with prun.start(prep_resources, samples, config, dirs, "trimming",
                    max_multicore=1) as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
            samples = run_parallel("prepare_sample", samples)
        if needs_trimming:
            with profile.report("adapter trimming", dirs):
                if _is_smallrnaseq(pipeline):
                    trim_step = "trim_srna_sample"
                else:
                    trim_step = "trim_sample"
                samples = run_parallel(trim_step, samples)
    return samples
def wgbsseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run trimming, alignment, deduplication and CpG calling in sequence.

    Each stage opens its own parallel context ("trimming", "multicore",
    "deduplication", "multicore2"). Returns the samples produced by the
    cpg_calling step.
    """
    # Stage 1: organize, prepare and trim.
    trim_resources = _wres(parallel, ["fastqc", "picard"], ensure_mem={"fastqc": 4})
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_bs_sample", samples)

    # Stage 2: alignment.
    align_resources = _wres(parallel, ["aligner", "bismark", "picard", "samtools"])
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)

    # Stage 3: deduplication.
    dedup_resources = _wres(parallel, ['samtools'])
    with prun.start(dedup_resources, samples, config, dirs, 'deduplication') as run_parallel:
        with profile.report('deduplicate', dirs):
            samples = run_parallel('deduplicate_bismark', samples)

    # Stage 4: CpG calling with a fixed multiplier of 24.
    caller_resources = _wres(parallel, ["caller"], ensure_mem={"caller": 5})
    with prun.start(caller_resources, samples, config, dirs, "multicore2",
                    multiplier=24) as run_parallel:
        with profile.report("cpg calling", dirs):
            samples = run_parallel("cpg_calling", samples)

    # The quality control stage is currently disabled:
    # with prun.start(_wres(parallel, ["picard", "fastqc", "samtools"]),
    #                 samples, config, dirs, "qc") as run_parallel:
    #     with profile.report("quality control", dirs):
    #         samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Organize and align samples, then run quality control and upload each one.

    Uses two parallel contexts: "multicore" for organization, alignment and
    callable-region steps, and "multicore2" for QC and per-sample upload.
    Returns the samples produced by the QC summary step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(parallel, ["aligner"])
    with prun.start(align_resources, samples, config, dirs, "multicore") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
        with profile.report("callable regions", dirs):
            # prep_samples and combine_sample_regions take all samples as one argument.
            samples = run_parallel("prep_samples", [samples])
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)

    ## Quality control
    qc_programs = ["fastqc", "bamtools", "samtools", "qsignature", "kraken"]
    qc_resources = _wres(parallel, qc_programs)
    with prun.start(qc_resources, samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            for single in samples:
                run_parallel("upload_samples", [single])
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Trim, align and count reads, attach combined count files, then run QC.

    After expression estimation the per-sample ``count_file`` entries are
    combined; the combined file (and its annotated form, when one is produced)
    is recorded on the first item of every sample before quality control.
    """
    # Stage 1: adapter trimming.
    trim_resources = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("process_lane", samples)
            samples = run_parallel("trim_lane", samples)

    # Stage 2: alignment.
    align_memory = {"tophat": 8, "tophat2": 8, "star": 30}
    align_resources = _wres(parallel, ["aligner"], ensure_mem=align_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)

    # Stage 3: disambiguation and expression estimation.
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)

    # Combine per-sample counts; the GTF path is read from the first sample.
    count_files = [item[0].get("count_file") for item in samples]
    combined = combine_count_files(count_files)
    transcripts_key = ('genome_resources', 'rnaseq', 'transcripts')
    gtf_file = utils.get_in(samples[0][0], transcripts_key, None)
    annotated = annotate_combined_count_file(combined, gtf_file)
    for item in samples:
        data = item[0]
        data["combined_counts"] = combined
        if annotated:
            data["annotated_combined_counts"] = annotated

    # Stage 4: quality control.
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc"])
    with prun.start(qc_resources, samples, config, dirs, "persample") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Prepare and trim samples, then dispatch run_sailfish on them.

    Uses a "trimming" parallel context followed by a "sailfish" one and
    returns the samples produced by the run_sailfish step.
    """
    # Stage 1: preparation and adapter trimming.
    trim_resources = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)

    # Stage 2: sailfish.
    sailfish_resources = _wres(parallel, ["sailfish"])
    with prun.start(sailfish_resources, samples, config, dirs, "sailfish") as run_parallel:
        with profile.report("sailfish", dirs):
            samples = run_parallel("run_sailfish", samples)
    return samples
def fastrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Prepare samples, run the fast RNA-seq step and upload the results.

    Samples are first passed through rnaseq_prep_samples; the remaining work
    runs inside a single "fastrnaseq" parallel context. Returns the full
    collection of samples returned by the upload_samples step.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["samtools"]), samples, config,
                    dirs, "fastrnaseq") as run_parallel:
        with profile.report("fastrnaseq", dirs):
            samples = rnaseq.fast_rnaseq(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Use a distinct loop variable: iterating with ``for samples in
            # samples`` rebound ``samples`` to its last element, so the
            # function returned a single sample instead of the whole list.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
def singlecellrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Prepare samples, run the single-cell RNA-seq step and upload the results.

    Samples are first passed through rnaseq_prep_samples; the remaining work
    runs inside a single "singlecell-rnaseq" parallel context. Returns the full
    collection of samples returned by the upload_samples step.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["samtools"]), samples, config,
                    dirs, "singlecell-rnaseq") as run_parallel:
        with profile.report("singlecell-rnaseq", dirs):
            samples = rnaseq.singlecell_rnaseq(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Use a distinct loop variable: iterating with ``for samples in
            # samples`` rebound ``samples`` to its last element, so the
            # function returned a single sample instead of the whole list.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Run trimming, alignment, quantitation, variant calling and QC in order.

    Each stage opens its own parallel context. Per-sample files are combined
    with rnaseq.combine_files between the single-threaded expression stage and
    variant calling. Returns the samples produced by the QC summary step.
    """
    # Stage 1: organize, prepare and trim.
    trim_resources = _wres(parallel, ["picard", "cutadapt"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)

    # Stage 2: alignment.
    align_memory = {"tophat": 8, "tophat2": 8, "star": 2}
    align_resources = _wres(parallel, ["aligner", "picard"], ensure_mem=align_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)

    # Stage 3: disambiguation, assembly and threaded quantitation.
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)

    # Stage 4: single-threaded quantitation.
    single_resources = _wres(parallel, ["dexseq", "express"])
    with prun.start(single_resources, samples, config, dirs,
                    "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)

    samples = rnaseq.combine_files(samples)

    # Stage 5: variant calling.
    variant_resources = _wres(parallel, ["gatk"])
    with prun.start(variant_resources, samples, config, dirs,
                    "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    # Stage 6: quality control.
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Organize, prepare and trim samples, then dispatch run_sailfish on them.

    Uses a "trimming" parallel context followed by a "sailfish" one and
    returns the samples produced by the run_sailfish step.
    """
    # Stage 1: organize, prepare and trim.
    trim_resources = _wres(parallel, ["picard", "cutadapt"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)

    # Stage 2: sailfish.
    sailfish_resources = _wres(parallel, ["sailfish"])
    with prun.start(sailfish_resources, samples, config, dirs, "sailfish") as run_parallel:
        with profile.report("sailfish", dirs):
            samples = run_parallel("run_sailfish", samples)
    return samples
def singlecellrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Prepare samples, run single-cell RNA-seq, QC and upload the results.

    Samples are first passed through rnaseq_prep_samples; the remaining work
    runs inside a single "singlecell-rnaseq" parallel context. Returns the full
    collection of samples returned by the upload_samples step.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["samtools", "rapmap"]), samples, config,
                    dirs, "singlecell-rnaseq") as run_parallel:
        with profile.report("singlecell-rnaseq", dirs):
            samples = rnaseq.singlecell_rnaseq(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Use a distinct loop variable: iterating with ``for samples in
            # samples`` rebound ``samples`` to its last element, so the
            # function returned a single sample instead of the whole list.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
def smallrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run preparation, alignment, annotation, clustering, QC, report and upload.

    Samples are first passed through rnaseq_prep_samples. The seqcluster
    preparation/alignment and clustering stages start their parallel context
    with only the first sample and hand the whole sample collection to each
    step as one argument. Returns the samples returned by upload_samples.
    """
    # causes a circular import at the top level
    from bcbio.srna.group import report as srna_report
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)

    # Stage 1: seqcluster preparation and alignment.
    seqcluster_memory = {"bowtie": 8, "bowtie2": 8, "star": 2}
    seqcluster_resources = _wres(parallel, ["aligner", "picard", "samtools"],
                                 ensure_mem=seqcluster_memory)
    with prun.start(seqcluster_resources, [samples[0]], config, dirs,
                    "alignment") as run_parallel:
        with profile.report("prepare", dirs):
            samples = run_parallel("seqcluster_prepare", [samples])
        with profile.report("seqcluster alignment", dirs):
            samples = run_parallel("srna_alignment", [samples])

    # Stage 2: per-sample alignment.
    align_memory = {"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8}
    align_resources = _wres(parallel, ["aligner", "picard", "samtools"],
                            ensure_mem=align_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment_samples",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)

    # Stage 3: annotation.
    annotation_resources = _wres(parallel, ["picard", "miraligner"])
    with prun.start(annotation_resources, samples, config, dirs,
                    "annotation") as run_parallel:
        with profile.report("small RNA annotation", dirs):
            samples = run_parallel("srna_annotation", samples)

    # Stage 4: clustering.
    cluster_resources = _wres(parallel, ["seqcluster", "mirge"],
                              ensure_mem={"seqcluster": 8})
    with prun.start(cluster_resources, [samples[0]], config, dirs,
                    "cluster") as run_parallel:
        with profile.report("cluster", dirs):
            samples = run_parallel("seqcluster_cluster", [samples])

    # Stage 5: QC, report and upload.
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("report", dirs):
            srna_report(samples)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for single in samples:
                run_parallel("upload_samples_project", [single])
    return samples
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run alignment, quantitation, variant calling, QC and upload in order.

    Samples are first passed through rnaseq_prep_samples; each later stage
    opens its own parallel context. Per-sample files are combined with
    rnaseq.combine_files before variant calling. Returns the samples returned
    by the upload_samples step.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)

    # Stage 1: alignment.
    align_memory = {"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8}
    align_resources = _wres(parallel, ["aligner", "picard", "samtools"],
                            ensure_mem=align_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)

    # Stage 2: disambiguation, assembly and threaded quantitation.
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)

    # Stage 3: single-threaded quantitation.
    single_resources = _wres(parallel, ["dexseq", "express"])
    with prun.start(single_resources, samples, config, dirs,
                    "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)

    samples = rnaseq.combine_files(samples)

    # Stage 4: variant calling.
    variant_resources = _wres(parallel, ["gatk", "vardict"])
    with prun.start(variant_resources, samples, config, dirs,
                    "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    # Stage 5: QC and upload.
    qc_programs = ["samtools", "fastqc", "qualimap", "kraken", "gatk", "preseq"]
    qc_resources = _wres(parallel, qc_programs, ensure_mem={"qualimap": 4})
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for single in samples:
                run_parallel("upload_samples_project", [single])
    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Run trimming, alignment, quantitation, variant calling, QC and upload.

    Each stage opens its own parallel context. Per-sample files are combined
    with rnaseq.combine_files before variant calling, and every sample is
    uploaded individually at the end. Returns the samples produced by the QC
    summary step.
    """
    # Stage 1: organize, prepare and trim.
    trim_resources = _wres(parallel, ["picard", "cutadapt"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)

    # Stage 2: alignment.
    align_memory = {"tophat": 8, "tophat2": 8, "star": 2}
    align_resources = _wres(parallel, ["aligner", "picard"], ensure_mem=align_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)

    # Stage 3: disambiguation, assembly and threaded quantitation.
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)

    # Stage 4: single-threaded quantitation.
    single_resources = _wres(parallel, ["dexseq", "express"])
    with prun.start(single_resources, samples, config, dirs,
                    "rnaseqcount-singlethread", max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)

    samples = rnaseq.combine_files(samples)

    # Stage 5: variant calling.
    variant_resources = _wres(parallel, ["gatk"])
    with prun.start(variant_resources, samples, config, dirs,
                    "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    # Stage 6: QC and per-sample upload.
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            for single in samples:
                run_parallel("upload_samples", [single])
    logger.info("Timing: finished")
    return samples
def fastrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Prepare samples, run the fast RNA-seq step, QC and upload the results.

    Samples are first passed through rnaseq_prep_samples; the remaining work
    runs inside a single "fastrnaseq" parallel context. Returns the full
    collection of samples returned by the upload_samples step.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    with prun.start(_wres(parallel, ["samtools"]), samples, config,
                    dirs, "fastrnaseq") as run_parallel:
        with profile.report("fastrnaseq", dirs):
            samples = rnaseq.fast_rnaseq(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Use a distinct loop variable: iterating with ``for samples in
            # samples`` rebound ``samples`` to its last element, so the
            # function returned a single sample instead of the whole list.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Run trimming, seqcluster alignment, annotation, clustering, QC and upload.

    The seqcluster preparation/alignment and clustering stages start their
    parallel context with only the first sample and hand the whole sample
    collection to each step as one argument. Returns the samples returned by
    the upload_samples step.
    """
    # causes a circular import at the top level
    from bcbio.srna.group import report as srna_report

    # Stage 1: organize, prepare and trim.
    trim_resources = _wres(parallel, ["picard", "cutadapt"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_srna_sample", samples)

    # Stage 2: seqcluster preparation and alignment.
    align_memory = {"bowtie": 8, "bowtie2": 8, "star": 2}
    align_resources = _wres(parallel, ["aligner", "picard", "samtools"],
                            ensure_mem=align_memory)
    with prun.start(align_resources, [samples[0]], config, dirs,
                    "alignment") as run_parallel:
        with profile.report("prepare", dirs):
            samples = run_parallel("seqcluster_prepare", [samples])
        with profile.report("alignment", dirs):
            samples = run_parallel("srna_alignment", [samples])

    # Stage 3: annotation.
    annotation_resources = _wres(parallel, ["picard", "miraligner"])
    with prun.start(annotation_resources, samples, config, dirs,
                    "annotation") as run_parallel:
        with profile.report("small RNA annotation", dirs):
            samples = run_parallel("srna_annotation", samples)

    # Stage 4: clustering.
    cluster_resources = _wres(parallel, ["seqcluster"], ensure_mem={"seqcluster": 8})
    with prun.start(cluster_resources, [samples[0]], config, dirs,
                    "cluster") as run_parallel:
        with profile.report("cluster", dirs):
            samples = run_parallel("seqcluster_cluster", [samples])

    # Stage 5: QC, report and upload.
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("report", dirs):
            srna_report(samples)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for single in samples:
                run_parallel("upload_samples_project", [single])
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Trim, align and count reads, attach combined count files, then run QC.

    After expression estimation the per-sample ``count_file`` entries are
    combined; the combined file (and its annotated form, when one is produced)
    is recorded on the first item of every sample before quality control.
    """
    # Stage 1: adapter trimming.
    trim_resources = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("process_lane", samples)
            samples = run_parallel("trim_lane", samples)

    # Stage 2: alignment.
    align_memory = {"tophat": 8, "tophat2": 8, "star": 40}
    align_resources = _wres(parallel, ["aligner", "picard"], ensure_mem=align_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)

    # Stage 3: disambiguation and expression estimation.
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)

    # Combine per-sample counts; the GTF path is read from the first sample.
    count_files = [item[0].get("count_file") for item in samples]
    combined = combine_count_files(count_files)
    transcripts_key = ('genome_resources', 'rnaseq', 'transcripts')
    gtf_file = utils.get_in(samples[0][0], transcripts_key, None)
    annotated = annotate_combined_count_file(combined, gtf_file)
    for item in samples:
        data = item[0]
        data["combined_counts"] = combined
        if annotated:
            data["annotated_combined_counts"] = annotated

    # Stage 4: quality control.
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc"])
    with prun.start(qc_resources, samples, config, dirs, "persample") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def fastrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Prepare samples, run the fast RNA-seq step, QC and upload the results.

    Samples are first passed through rnaseq_prep_samples. A watcher created by
    initialize_watcher is given the samples after the "fastrnaseq" and
    "qcsummary" steps. Returns the full collection of samples returned by the
    upload_samples step.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)
    ww = initialize_watcher(samples)
    with prun.start(_wres(parallel, ["samtools"]), samples, config,
                    dirs, "fastrnaseq") as run_parallel:
        with profile.report("fastrnaseq", dirs):
            samples = rnaseq.fast_rnaseq(samples, run_parallel)
            ww.report("fastrnaseq", samples)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
            ww.report("qcsummary", samples)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            # Use a distinct loop variable: iterating with ``for samples in
            # samples`` rebound ``samples`` to its last element, so the
            # function returned a single sample instead of the whole list.
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, lane_items):
    """Align the lane items, compute callable regions and run quality control.

    Uses two parallel contexts: "multicore" for alignment and callable-region
    steps, and "multicore2" for QC. Returns the samples produced by the QC
    summary step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(parallel, ["aligner"])
    with prun.start(align_resources, lane_items, config, dirs, "multicore") as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", lane_items)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            # combine_sample_regions takes all samples as a single argument.
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)

    ## Quality control
    qc_resources = _wres(parallel, ["fastqc", "bamtools", "samtools"])
    with prun.start(qc_resources, samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Align the samples, compute callable regions and run quality control.

    Uses two parallel contexts: "multicore" for alignment and callable-region
    steps, and "multicore2" for QC. Returns the samples produced by the QC
    summary step.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(parallel, ["aligner"])
    with prun.start(align_resources, samples, config, dirs, "multicore") as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            # combine_sample_regions takes all samples as a single argument.
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)

    ## Quality control
    qc_resources = _wres(parallel, ["fastqc", "bamtools", "samtools"])
    with prun.start(qc_resources, samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Run organization, trimming, alignment, quantitation and QC in order.

    Each stage opens its own parallel context; the organization stage starts
    its context with only the first sample. Returns the samples produced by
    the QC summary step.
    """
    star_memory = {"tophat": 8, "tophat2": 8, "star": 2}

    # Stage 1: organize samples.
    organize_resources = _wres(parallel, ["aligner"], ensure_mem=star_memory)
    with prun.start(organize_resources, [samples[0]], config, dirs,
                    "organize_samples") as run_parallel:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_parallel("organize_samples", organize_args)

    # Stage 2: prepare and trim.
    trim_resources = _wres(parallel, ["picard", "cutadapt"])
    with prun.start(trim_resources, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)

    # Stage 3: alignment.
    align_resources = _wres(parallel, ["aligner", "picard"], ensure_mem=star_memory)
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=align_multiplier) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)

    # Stage 4: disambiguation, assembly and expression estimation.
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)

    # Stage 5: quality control.
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run the RNA-seq pipeline from prepared samples through upload.

    Samples first go through ``rnaseq_prep_samples``.  They are then
    threaded through alignment, counting, fusion detection,
    single-threaded quantitation, variant calling, QC and upload.
    Returns the final value of ``samples``.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)

    ## Alignment
    align_resources = _wres(
        parallel, ["aligner", "picard", "samtools"],
        ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8})
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=align_multiplier) as run_align:
        with profile.report("alignment", dirs):
            samples = run_align("disambiguate_split", [samples])
            samples = run_align("process_alignment", samples)

    ## Threaded counting
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs,
                    "rnaseqcount") as run_count:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_count)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_count, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_count)

    ## Gene fusions; the runner yielded here is not used and the return
    ## value of detect_fusions is discarded, exactly as before.
    fusion_resources = _wres(parallel, ["ericscript"])
    with prun.start(fusion_resources, samples, config, dirs,
                    "fusion-standalone-callers"):
        with profile.report("Detect gene fusions", dirs):
            rnaseq.detect_fusions(samples)

    ## Single-threaded counting
    single_resources = _wres(parallel, ["dexseq", "express"])
    with prun.start(single_resources, samples, config, dirs,
                    "rnaseqcount-singlethread", max_multicore=1) as run_single:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_single)

    samples = rnaseq.combine_files(samples)

    ## Variant calling
    variant_resources = _wres(parallel, ["gatk"])
    with prun.start(variant_resources, samples, config, dirs,
                    "rnaseq-variation") as run_variants:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_variants)

    ## Quality control and upload
    qc_resources = _wres(
        parallel,
        ["samtools", "fastqc", "qualimap", "kraken", "gatk", "preseq"],
        ensure_mem={"qualimap": 4})
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)
        with profile.report("upload", dirs):
            samples = run_qc("upload_samples", samples)
            for uploaded in samples:
                run_qc("upload_samples_project", [uploaded])

    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Trim lanes, align, count and run quality control for RNA-seq.

    ``samples`` is rebound after every step and the final value is
    returned.  ``self`` and ``config_file`` are not referenced here.
    """
    ## Adapter trimming
    trim_resources = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_resources, samples, config, dirs,
                    "trimming") as run_trim:
        with profile.report("adapter trimming", dirs):
            samples = run_trim("process_lane", samples)
            samples = run_trim("trim_lane", samples)

    ## Alignment
    align_resources = _wres(parallel, ["aligner", "picard"],
                            ensure_mem={"tophat": 8, "tophat2": 8, "star": 40})
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_align("process_alignment", samples)

    ## Disambiguation, transcript assembly and expression estimation
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs,
                    "rnaseqcount") as run_count:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_count)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_count, samples)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_count)

    ## Quality control
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc"])
    with prun.start(qc_resources, samples, config, dirs,
                    "persample") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)

    logger.info("Timing: finished")
    return samples
def chipseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run ChIP-seq: alignment, peak calling, QC and upload.

    ``samples`` is rebound after every step and the final value is
    returned.
    """
    ## Organize, align and disambiguate
    align_resources = _wres(parallel, ["aligner", "picard"])
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_align("organize_samples", organize_args)
        with profile.report("alignment", dirs):
            samples = run_align("prepare_sample", samples)
            samples = run_align("trim_sample", samples)
            samples = run_align("disambiguate_split", [samples])
            samples = run_align("process_alignment", samples)
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_align)
            samples = run_align("clean_chipseq_alignment", samples)

    ## Peak calling
    peak_resources = _wres(parallel, ["peakcaller"])
    peak_multiplier = peaks._get_multiplier(samples)
    with prun.start(peak_resources, samples, config, dirs, "peakcalling",
                    multiplier=peak_multiplier) as run_peaks:
        with profile.report("peakcalling", dirs):
            samples = peaks.peakcall_prepare(samples, run_peaks)

    ## Quality control and upload
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)
        with profile.report("upload", dirs):
            samples = run_qc("upload_samples", samples)
            for uploaded in samples:
                run_qc("upload_samples_project", [uploaded])

    return samples
def chipseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run ChIP-seq: alignment, peak calling, QC and upload.

    ``samples`` is rebound after every step; a final timing message is
    logged before the last value is returned.
    """
    ## Organize, align and disambiguate
    align_resources = _wres(parallel, ["aligner", "picard"])
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_align("organize_samples", organize_args)
        with profile.report("alignment", dirs):
            samples = run_align("prepare_sample", samples)
            samples = run_align("trim_sample", samples)
            samples = run_align("disambiguate_split", [samples])
            samples = run_align("process_alignment", samples)
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_align)
            samples = run_align("clean_chipseq_alignment", samples)

    ## Peak calling
    peak_resources = _wres(parallel, ["peakcaller"])
    peak_multiplier = peaks._get_multiplier(samples)
    with prun.start(peak_resources, samples, config, dirs, "peakcalling",
                    multiplier=peak_multiplier) as run_peaks:
        with profile.report("peakcalling", dirs):
            samples = peaks.peakcall_prepare(samples, run_peaks)

    ## Quality control and upload
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)
        with profile.report("upload", dirs):
            samples = run_qc("upload_samples", samples)
            for uploaded in samples:
                run_qc("upload_samples_project", [uploaded])

    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Run small RNA-seq: trim, align, annotate, cluster, QC and upload.

    The alignment and cluster stages are started with only the first
    sample and pass the whole collection as a single work item.
    ``samples`` is rebound after every step and the final value is
    returned.  ``self`` is not referenced in this body.
    """
    # causes a circular import at the top level
    from bcbio.srna.group import report as srna_report

    ## Organize samples and trim adapters
    trim_resources = _wres(parallel, ["picard", "cutadapt"])
    with prun.start(trim_resources, samples, config, dirs,
                    "trimming") as run_trim:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_trim("organize_samples", organize_args)
        with profile.report("adapter trimming", dirs):
            samples = run_trim("prepare_sample", samples)
            samples = run_trim("trim_srna_sample", samples)

    ## Prepare and align
    align_resources = _wres(parallel, ["aligner", "picard", "samtools"],
                            ensure_mem={"bowtie": 8, "bowtie2": 8, "star": 2})
    first_only = [samples[0]]
    with prun.start(align_resources, first_only, config, dirs,
                    "alignment") as run_align:
        with profile.report("prepare", dirs):
            samples = run_align("seqcluster_prepare", [samples])
        with profile.report("alignment", dirs):
            samples = run_align("srna_alignment", [samples])

    ## Annotation
    annotation_resources = _wres(parallel, ["picard", "miraligner"])
    with prun.start(annotation_resources, samples, config, dirs,
                    "annotation") as run_annotate:
        with profile.report("small RNA annotation", dirs):
            samples = run_annotate("srna_annotation", samples)

    ## Clustering
    cluster_resources = _wres(parallel, ["seqcluster"],
                              ensure_mem={"seqcluster": 8})
    first_only = [samples[0]]
    with prun.start(cluster_resources, first_only, config, dirs,
                    "cluster") as run_cluster:
        with profile.report("cluster", dirs):
            samples = run_cluster("seqcluster_cluster", [samples])

    ## Quality control, report and upload
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)
        with profile.report("report", dirs):
            srna_report(samples)
        with profile.report("upload", dirs):
            samples = run_qc("upload_samples", samples)
            for uploaded in samples:
                run_qc("upload_samples_project", [uploaded])

    return samples
def smallrnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run small RNA-seq from prepared samples through upload.

    Samples first go through ``rnaseq_prep_samples``.  The seqcluster
    alignment and cluster stages are started with only the first sample
    and pass the whole collection as a single work item.  Returns the
    final value of ``samples``.
    """
    # causes a circular import at the top level
    from bcbio.srna.group import report as srna_report

    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)

    ## seqcluster preparation and alignment
    seqcluster_align_resources = _wres(
        parallel, ["aligner", "picard", "samtools"],
        ensure_mem={"bowtie": 8, "bowtie2": 8, "star": 2})
    first_only = [samples[0]]
    with prun.start(seqcluster_align_resources, first_only, config, dirs,
                    "alignment") as run_seqcluster_align:
        with profile.report("prepare", dirs):
            samples = run_seqcluster_align("seqcluster_prepare", [samples])
        with profile.report("seqcluster alignment", dirs):
            samples = run_seqcluster_align("srna_alignment", [samples])

    ## Per-sample alignment
    align_resources = _wres(
        parallel, ["aligner", "picard", "samtools"],
        ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8})
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs,
                    "alignment_samples",
                    multiplier=align_multiplier) as run_align:
        with profile.report("alignment", dirs):
            samples = run_align("process_alignment", samples)

    ## Annotation
    annotation_resources = _wres(parallel, ["picard", "miraligner"])
    with prun.start(annotation_resources, samples, config, dirs,
                    "annotation") as run_annotate:
        with profile.report("small RNA annotation", dirs):
            samples = run_annotate("srna_annotation", samples)

    ## Clustering
    cluster_resources = _wres(parallel, ["seqcluster", "mirge"],
                              ensure_mem={"seqcluster": 8})
    first_only = [samples[0]]
    with prun.start(cluster_resources, first_only, config, dirs,
                    "cluster") as run_cluster:
        with profile.report("cluster", dirs):
            samples = run_cluster("seqcluster_cluster", [samples])

    ## Quality control, report and upload
    qc_resources = _wres(parallel, ["picard", "fastqc"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)
        with profile.report("report", dirs):
            srna_report(samples)
        with profile.report("upload", dirs):
            samples = run_qc("upload_samples", samples)
            for uploaded in samples:
                run_qc("upload_samples_project", [uploaded])

    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Trim samples, align, count and run quality control for RNA-seq.

    ``samples`` is rebound after every step and the final value is
    returned.  ``self`` and ``config_file`` are not referenced here.
    """
    ## Adapter trimming
    trim_resources = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_resources, samples, config, dirs,
                    "trimming") as run_trim:
        with profile.report("adapter trimming", dirs):
            samples = run_trim("prepare_sample", samples)
            samples = run_trim("trim_sample", samples)

    ## Alignment
    align_resources = _wres(parallel, ["aligner", "picard"],
                            ensure_mem={"tophat": 8, "tophat2": 8, "star": 40})
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=align_multiplier) as run_align:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_align("process_alignment", samples)

    ## Disambiguation, transcript assembly and expression estimation
    count_resources = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_resources, samples, config, dirs,
                    "rnaseqcount") as run_count:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_count)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_count, samples)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_count)

    ## Quality control
    qc_resources = _wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"])
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_qc:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_qc)

    logger.info("Timing: finished")
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """Run variant calling: align, call on sub-regions, finalize, upload.

    Three ``prun.start`` stages are used ("multicore", "full" and
    "multicore2").  ``samples`` is rebound after every step and the final
    value is returned.  ``self`` is not referenced in this body.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(
        parallel, ["aligner", "samtools", "sambamba"],
        (["reference", "fasta"], ["reference", "aligner"], ["files"]))
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_align("organize_samples", organize_args)
        with profile.report("alignment preparation", dirs):
            samples = run_align("prep_align_inputs", samples)
            samples = run_align("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_align("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_align)
            samples = alignprep.merge_split_alignments(samples, run_align)
        with profile.report("callable regions", dirs):
            samples = run_align("prep_samples", [samples])
            samples = run_align("postprocess_alignment", samples)
            samples = run_align("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_align)

    ## Variant calling on sub-regions of the input file (full cluster)
    calling_resources = _wres(parallel, ["gatk", "picard", "variantcaller"])
    region_count = region.get_max_counts(samples)
    with prun.start(calling_resources, samples, config, dirs, "full",
                    multiplier=region_count,
                    max_multicore=1) as run_regions:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_regions)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_regions)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    final_programs = ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                      "gemini", "samtools", "fastqc", "bamtools",
                      "bcbio-variation-recall", "qsignature"]
    final_resources = _wres(parallel, final_programs)
    with prun.start(final_resources, samples, config, dirs,
                    "multicore2") as run_final:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_final)
        with profile.report("variant post-processing", dirs):
            samples = run_final("postprocess_variants", samples)
            samples = run_final("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_final)
        with profile.report("validation", dirs):
            samples = run_final("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_final)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_final)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_final)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_final)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_final)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_final)
        with profile.report("upload", dirs):
            # One dispatch per sample; the results are not collected.
            for finished in samples:
                run_final("upload_samples", [finished])

    logger.info("Timing: finished")
    return samples
"bcbio_system.yaml") except ValueError as err: print(err) print( "WARNING: Attempting to read bcbio_system.yaml in the current directory." ) system_config = "bcbio_system.yaml" with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) config["log_dir"] = os.path.join(os.path.abspath(os.getcwd()), "log") parallel = clargs.to_parallel(args) parallel.update({'progs': ['samtools', 'sambamba']}) parallel = log.create_base_logger(config, parallel) log.setup_local_logging(config, parallel) dirs = {'work': os.path.abspath(os.getcwd())} system.write_info(dirs, parallel, config) sysinfo = system.machine_info()[0] samples = _get_samples_to_process(args.csv, out_dir, config, args.force_single, args.separators) parallel = resources.calculate(parallel, [samples], sysinfo, config) with prun.start(parallel, samples, config, dirs) as run_parallel: with profile.report("prepare bcbio samples", dirs): samples = run_parallel("prepare_bcbio_samples", samples) create_new_csv(samples, args)
parser.add_argument("-p", "--tag", help="Tag name to label jobs on the cluster", default="bcb-prep") parser.add_argument("-t", "--paralleltype", choices=["local", "ipython"], default="local", help="Run with iptyhon") args = parser.parse_args() out_dir = os.path.abspath(args.out) utils.safe_makedir(out_dir) system_config = os.path.join(_get_data_dir(), "galaxy", "bcbio_system.yaml") with open(system_config) as in_handle: config = yaml.load(in_handle) res = {'cores': args.cores_per_job} config["algorithm"] = {"num_cores": args.cores_per_job} config["resources"].update({'sambamba': res, 'samtools': res}) parallel = clargs.to_parallel(args) parallel.update({'progs': ['samtools', 'sambamba']}) parallel = log.create_base_logger(config, parallel) log.setup_local_logging(config, parallel) dirs = {'work': os.path.abspath(os.getcwd())} system.write_info(dirs, parallel, config) sysinfo = system.machine_info()[0] samples = _get_samples_to_process(args.csv, out_dir, config) parallel = resources.calculate(parallel, [samples], sysinfo, config) with prun.start(parallel, samples, config, dirs) as run_parallel: with profile.report("prepare bcbio samples", dirs): samples = run_parallel("prepare_bcbio_samples", samples) create_new_csv(samples, args)
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run variant calling: align, call on sub-regions, finalize, upload.

    Three ``prun.start`` stages are used ("multicore", "full" and
    "multicore2").  ``samples`` is rebound after every step and the final
    value is returned.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    # Assign GATK supplied memory if required for post-process recalibration
    align_programs = ["aligner", "samtools", "sambamba"]
    needs_gatk = any(
        tz.get_in(["algorithm", "recalibrate"],
                  utils.to_single_data(item)) in [True, "gatk"]
        for item in samples)
    if needs_gatk:
        align_programs.append("gatk")

    align_resources = _wres(
        parallel, align_programs,
        (["reference", "fasta"], ["reference", "aligner"], ["files"]))
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_align("organize_samples", organize_args)
        with profile.report("alignment preparation", dirs):
            samples = run_align("prep_align_inputs", samples)
            samples = run_align("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_align("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_align)
            samples = alignprep.merge_split_alignments(samples, run_align)
        with profile.report("callable regions", dirs):
            samples = run_align("prep_samples", [samples])
            samples = run_align("postprocess_alignment", samples)
            samples = run_align("combine_sample_regions", [samples])
            samples = run_align("calculate_sv_bins", [samples])
            samples = run_align("calculate_sv_coverage", samples)
            samples = run_align("normalize_sv_coverage", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_align)

    ## Variant calling on sub-regions of the input file (full cluster)
    calling_resources = _wres(parallel, ["gatk", "picard", "variantcaller"])
    region_count = region.get_max_counts(samples)
    with prun.start(calling_resources, samples, config, dirs, "full",
                    multiplier=region_count,
                    max_multicore=1) as run_regions:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_regions)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_regions)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    final_programs = ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                      "gemini", "samtools", "fastqc", "sambamba",
                      "bcbio-variation-recall", "qsignature", "svcaller",
                      "kraken", "preseq"]
    final_resources = _wres(parallel, final_programs)
    final_multiplier = structural.parallel_multiplier(samples)
    with prun.start(final_resources, samples, config, dirs, "multicore2",
                    multiplier=final_multiplier) as run_final:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_final)
        with profile.report("variant post-processing", dirs):
            samples = run_final("postprocess_variants", samples)
            samples = run_final("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_final)
        with profile.report("validation", dirs):
            samples = run_final("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_final)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        # The "initial" and "standard" passes share one report label.
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_final, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_final, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_final, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_final("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_final)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_final)
        with profile.report("peddy check", dirs):
            samples = peddy.run_peddy_parallel(samples, run_final)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_final)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_final)
        with profile.report("upload", dirs):
            samples = run_final("upload_samples", samples)
            for uploaded in samples:
                run_final("upload_samples_project", [uploaded])

    logger.info("Timing: finished")
    return samples
def variant2pipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run variant calling with per-step watcher reports.

    Three ``prun.start`` stages are used ("multicore", "full" and
    "multicore2").  A watcher created by ``initialize_watcher`` is
    notified after selected steps via ``report(step_name, samples)``.
    ``samples`` is rebound after every step and the final value is
    returned.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(
        parallel, ["aligner", "samtools", "sambamba"],
        (["reference", "fasta"], ["reference", "aligner"], ["files"]))
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("organize samples", dirs):
            descriptions = [item[0]["description"] for item in samples]
            organize_args = [[dirs, config, run_info_yaml, descriptions]]
            samples = run_align("organize_samples", organize_args)
            ww = initialize_watcher(samples)
        with profile.report("alignment preparation", dirs):
            samples = run_align("prep_align_inputs", samples)
            ww.report("prep_align_inputs", samples)
            samples = run_align("disambiguate_split", [samples])
        with profile.report("alignment", dirs):
            samples = run_align("process_alignment", samples)
            ww.report("process_alignment", samples)
            samples = disambiguate.resolve(samples, run_align)
            samples = alignprep.merge_split_alignments(samples, run_align)
        with profile.report("callable regions", dirs):
            samples = run_align("prep_samples", [samples])
            ww.report("prep_samples", samples)
            samples = run_align("postprocess_alignment", samples)
            ww.report("postprocess_alignment", samples)
            samples = run_align("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
            ww.report("combine_sample_regions", samples)
        with profile.report("hla typing", dirs):
            samples = hla.run(samples, run_align)
            ww.report("call_hla", samples)

    ## Variant calling on sub-regions of the input file (full cluster)
    calling_resources = _wres(parallel, ["gatk", "picard", "variantcaller"])
    region_count = region.get_max_counts(samples)
    with prun.start(calling_resources, samples, config, dirs, "full",
                    multiplier=region_count,
                    max_multicore=1) as run_regions:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_regions)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_regions)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    final_programs = ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                      "gemini", "samtools", "fastqc", "sambamba",
                      "bcbio-variation-recall", "qsignature", "svcaller"]
    final_resources = _wres(parallel, final_programs)
    final_multiplier = structural.parallel_multiplier(samples)
    with prun.start(final_resources, samples, config, dirs, "multicore2",
                    multiplier=final_multiplier) as run_final:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_final)
        with profile.report("variant post-processing", dirs):
            samples = run_final("postprocess_variants", samples)
            samples = run_final("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_final)
        with profile.report("validation", dirs):
            samples = run_final("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_final)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation precall", dirs):
            samples = structural.run(samples, run_final, "precall")
        # The "initial" and "standard" passes share one report label.
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_final, "initial")
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_final, "standard")
        with profile.report("structural variation ensemble", dirs):
            samples = structural.run(samples, run_final, "ensemble")
        with profile.report("structural variation validation", dirs):
            samples = run_final("validate_sv", samples)
        with profile.report("heterogeneity", dirs):
            samples = heterogeneity.run(samples, run_final)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_final)
        with profile.report("quality control", dirs):
            ww.report("pre_qc", samples)
            samples = qcsummary.generate_parallel(samples, run_final)
            ww.report("qc_summary", samples)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_final)
        with profile.report("upload", dirs):
            samples = run_final("upload_samples", samples)
            for uploaded in samples:
                run_final("upload_samples_project", [uploaded])

    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Run variant calling: align, call on sub-regions, then finalize.

    Three ``prun.start`` stages are used ("multicore", "full" and
    "multicore2"); there is no upload step in this variant.  ``samples``
    is rebound after every step and the final value is returned.
    ``self`` and ``config_file`` are not referenced in this body.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_resources = _wres(
        parallel, ["aligner", "samtools", "sambamba"],
        (["reference", "fasta"], ["reference", "aligner"], ["files"]))
    align_multiplier = alignprep.parallel_multiplier(samples)
    with prun.start(align_resources, samples, config, dirs, "multicore",
                    multiplier=align_multiplier) as run_align:
        with profile.report("alignment preparation", dirs):
            samples = run_align("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
        with profile.report("alignment", dirs):
            samples = run_align("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_align)
            samples = disambiguate.resolve(samples, run_align)
        with profile.report("callable regions", dirs):
            samples = run_align("postprocess_alignment", samples)
            samples = run_align("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_align)

    ## Variant calling on sub-regions of the input file (full cluster)
    calling_resources = _wres(parallel, ["gatk", "picard", "variantcaller"])
    region_count = region.get_max_counts(samples)
    with prun.start(calling_resources, samples, config, dirs, "full",
                    multiplier=region_count,
                    max_multicore=1) as run_regions:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_regions)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_regions)

    ## Finalize variants, BAMs and population databases (per-sample multicore cluster)
    final_programs = ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation",
                      "gemini", "samtools", "fastqc", "bamtools",
                      "bcbio-variation-recall", "qsignature"]
    final_resources = _wres(parallel, final_programs)
    with prun.start(final_resources, samples, config, dirs,
                    "multicore2") as run_final:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_final)
        with profile.report("variant post-processing", dirs):
            samples = run_final("postprocess_variants", samples)
            samples = run_final("split_variants_by_sample", samples)
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_final)
        with profile.report("validation", dirs):
            samples = run_final("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_final)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_final)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_final)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_final)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_final)

    logger.info("Timing: finished")
    return samples
def rnaseqpipeline(config, run_info_yaml, parallel, dirs, samples):
    """Run the RNA-seq pipeline from prepared samples through upload.

    Samples first go through ``rnaseq_prep_samples``.  They are then
    threaded through alignment, counting, single-threaded quantitation,
    variant calling, QC, SummarizedExperiment creation, upload and an
    optional bcbioRNASeq load.

    Args:
        config: passed to ``rnaseq_prep_samples`` and every ``prun.start``.
        run_info_yaml: passed to ``rnaseq_prep_samples``.
        parallel: passed to ``rnaseq_prep_samples`` and ``_wres``.
        dirs: passed to ``prun.start`` and ``profile.report``.
        samples: the sample collection; rebound after every step.

    Returns:
        The final value of ``samples``.
    """
    samples = rnaseq_prep_samples(config, run_info_yaml, parallel, dirs, samples)

    ## Alignment
    align_resources = _wres(
        parallel, ["aligner", "picard", "samtools"],
        ensure_mem={"tophat": 10, "tophat2": 10, "star": 2, "hisat2": 8})
    with prun.start(align_resources, samples, config, dirs, "alignment",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = run_parallel("disambiguate_split", [samples])
            samples = run_parallel("process_alignment", samples)

    ## Threaded counting
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression (threaded)", dirs):
            samples = rnaseq.quantitate_expression_parallel(samples, run_parallel)

    ## Single-threaded counting
    with prun.start(_wres(parallel, ["dexseq", "express"]), samples, config,
                    dirs, "rnaseqcount-singlethread",
                    max_multicore=1) as run_parallel:
        with profile.report("estimate expression (single threaded)", dirs):
            samples = rnaseq.quantitate_expression_noparallel(samples, run_parallel)

    samples = rnaseq.combine_files(samples)

    ## Variant calling
    with prun.start(_wres(parallel, ["gatk", "vardict"]), samples, config,
                    dirs, "rnaseq-variation") as run_parallel:
        with profile.report("RNA-seq variant calling", dirs):
            samples = rnaseq.rnaseq_variant_calling(samples, run_parallel)

    ## Quality control, upload and optional bcbioRNASeq loading
    qc_resources = _wres(
        parallel,
        ["samtools", "fastqc", "qualimap", "kraken", "gatk", "preseq"],
        ensure_mem={"qualimap": 4})
    with prun.start(qc_resources, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("create SummarizedExperiment object", dirs):
            samples = rnaseq.load_summarizedexperiment(samples)
        with profile.report("upload", dirs):
            samples = run_parallel("upload_samples", samples)
            for sample in samples:
                run_parallel("upload_samples_project", [sample])
        with profile.report("bcbioRNAseq loading", dirs):
            tools_on = dd.get_in_samples(samples, dd.get_tools_on)
            bcbiornaseq_on = tools_on and "bcbiornaseq" in tools_on
            if bcbiornaseq_on:
                if len(samples) < 3:
                    logger.warn(
                        "bcbioRNASeq needs at least three samples total, skipping."
                    )
                elif len(samples) > 100:
                    logger.warn("Over 100 samples, skipping bcbioRNASeq.")
                else:
                    # Previously this used the ``sample`` variable leaked
                    # from the upload loop above.  That is the last element
                    # of ``samples``, so select it explicitly to stay
                    # correct if the upload loop is ever changed or moved.
                    # This branch has 3..100 samples, so the index is safe.
                    run_parallel("run_bcbiornaseqload", [samples[-1]])

    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Run the variant calling stages in order and return the samples.

    The work is split across four ``prun.start`` contexts, named
    "multicore", "full", "persample" and "multicore2", each requesting its
    own resources through ``_wres``. Every stage rebinds ``samples`` to its
    result, and the ``regions`` computed in the first context are reused by
    the second.

    Args:
        config: Passed to every ``prun.start`` call.
        config_file: Not referenced in this method body.
        parallel: Passed to every ``_wres`` call.
        dirs: Passed to ``prun.start`` and ``profile.report``.
        samples: Input samples; rebound to the result of each stage.

    Returns:
        The samples as returned by the final quality control step.
    """
    # Stage 1: alignment and preparation requiring the entire input file
    # (multicore cluster).
    align_tools = ["aligner", "gatk"]
    align_inputs = (["reference", "fasta"], ["reference", "aligner"], ["files"])
    with prun.start(
        _wres(parallel, align_tools, align_inputs),
        samples,
        config,
        dirs,
        "multicore",
        multiplier=alignprep.parallel_multiplier(samples),
    ) as run_parallel:
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            # The combined call returns a list; only its first item is used.
            combined = run_parallel("combine_sample_regions", [samples])
            regions = combined[0]
            samples = region.add_region_info(samples, regions)
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_parallel)

    # Stage 2: variant calling on sub-regions of the input file (full cluster).
    calling_tools = ["gatk", "picard", "variantcaller"]
    with prun.start(
        _wres(parallel, calling_tools),
        samples,
        config,
        dirs,
        "full",
        multiplier=len(regions["analysis"]),
        max_multicore=1,
    ) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, regions, run_parallel)
        with profile.report("variant calling", dirs):
            samples = region.parallel_variantcall_region(samples, run_parallel)

    # Stage 3: finalize variants (per-sample cluster).
    finalize_tools = ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"]
    with prun.start(
        _wres(parallel, finalize_tools),
        samples,
        config,
        dirs,
        "persample",
    ) as run_parallel:
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = combine_multiple_callers(samples)

    # Stage 4: finalizing BAMs and population databases, handle multicore
    # computation.
    summary_tools = [
        "gemini",
        "samtools",
        "fastqc",
        "bamtools",
        "bcbio_variation",
        "bcbio-variation-recall",
    ]
    with prun.start(
        _wres(parallel, summary_tools),
        samples,
        config,
        dirs,
        "multicore2",
    ) as run_parallel:
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)

    logger.info("Timing: finished")
    return samples