def run(self, config, config_file, parallel, dirs, samples):
    """RNA-seq pipeline: adapter trimming, alignment, disambiguation,
    transcript assembly, expression estimation and per-sample QC.

    Each stage runs inside its own parallel context with the program
    resources it needs; timings are recorded via ``profile.report``.
    NOTE(review): nesting of the flattened ``with`` blocks reconstructed
    from statement order.
    """
    trim_res = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_res, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)
    # Aligners such as STAR need large fixed memory reservations.
    align_res = _wres(parallel, ["aligner", "picard"],
                      ensure_mem={"tophat": 8, "tophat2": 8, "star": 40})
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
    count_res = _wres(parallel, ["samtools", "cufflinks"])
    with prun.start(count_res, samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)
    qc_res = _wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"])
    with prun.start(qc_res, samples, config, dirs, "persample") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """RNA-seq pipeline (unprofiled variant): trim, align, disambiguate,
    estimate expression, combine per-sample counts and run QC.

    Attaches the combined (and, when available, annotated) count matrix
    to every sample. NOTE(review): ``with`` nesting reconstructed from
    statement order in the flattened source.
    """
    with prun.start(_wres(parallel, ["picard"]),
                    samples, config, dirs, "trimming") as run_parallel:
        samples = run_parallel("process_lane", samples)
        samples = run_parallel("trim_lane", samples)
    align_res = _wres(parallel, ["aligner"],
                      ensure_mem={"tophat": 8, "tophat2": 8, "star": 30})
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = disambiguate.resolve(samples, run_parallel)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        samples = rnaseq.estimate_expression(samples, run_parallel)
        #samples = rnaseq.detect_fusion(samples, run_parallel)
        combined = combine_count_files([x[0].get("count_file") for x in samples])
        gtf_file = utils.get_in(samples[0][0],
                                ('genome_resources', 'rnaseq', 'transcripts'), None)
        annotated = annotate_combined_count_file(combined, gtf_file)
        for x in samples:
            x[0]["combined_counts"] = combined
            # Annotation is optional; only attach it when it was produced.
            if annotated:
                x[0]["annotated_combined_counts"] = annotated
    with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """RNA-seq pipeline (``_wprogs`` variant): trim, align, disambiguate,
    estimate expression, combine counts with Ensembl-based annotation, QC.

    NOTE(review): ``with`` nesting reconstructed from statement order in
    the flattened source.
    """
    with prun.start(parallel, samples, config, dirs, "trimming") as run_parallel:
        samples = run_parallel("trim_lane", samples)
    align_res = _wprogs(parallel, ["aligner"],
                        {"tophat": 8, "tophat2": 8, "star": 30})
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = disambiguate.resolve(samples, run_parallel)
    count_res = _wprogs(parallel, ["samtools", "gatk", "cufflinks"])
    with prun.start(count_res, samples, config, dirs, "rnaseqcount") as run_parallel:
        samples = rnaseq.estimate_expression(samples, run_parallel)
        # samples = rnaseq.detect_fusion(samples, run_parallel)
        combined = combine_count_files([x[0].get("count_file") for x in samples])
        # Annotation keys off the Ensembl organism alias in this version.
        organism = utils.get_in(samples[0][0],
                                ("genome_resources", "aliases", "ensembl"), None)
        annotated = annotate_combined_count_file(combined, organism)
        for x in samples:
            x[0]["combined_counts"] = combined
            x[0]["annotated_combined_counts"] = annotated
    qc_res = _wprogs(parallel, ["picard", "fastqc", "rnaseqc"])
    with prun.start(qc_res, samples, config, dirs, "persample") as run_parallel:
        samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def create_combined_tx2gene(data):
    """Build a single tx2gene CSV covering every (possibly disambiguated) genome.

    Generates a per-genome ``<build>-tx2gene.csv`` from the transcriptome GTF
    (falling back to the genome GTF) when missing, then concatenates them into
    ``tx2gene.csv``. Existing files are reused; returns the combined path.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        # Prefer the explicit transcriptome GTF, fall back to the genome GTF.
        gtf_file = dd.get_transcriptome_gtf(odata) or dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if not file_exists(out_file):
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
        tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    # ``cmd.format(**locals())`` below relies on these exact variable names.
    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
def run(self, config, config_file, parallel, dirs, samples):
    """Variant-calling pipeline with joint squaring off/backfilling.

    Stages: whole-file alignment prep/alignment/callable regions/coverage
    (multicore), region-parallel variant calling (full cluster), per-sample
    finalization, then BAM merging, ensemble calling, validation, structural
    variation, population DB, QC and archiving (multicore2).
    NOTE(review): ``with`` nesting reconstructed from statement order.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_res = _wres(parallel, ["aligner", "samtools", "sambamba"],
                      (["reference", "fasta"], ["reference", "aligner"], ["files"]))
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_parallel)
    ## Variant calling on sub-regions of the input file (full cluster)
    call_res = _wres(parallel, ["gatk", "picard", "variantcaller"])
    with prun.start(call_res, samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)
    ## Finalize variants (per-sample cluster)
    finalize_res = _wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"])
    with prun.start(finalize_res, samples, config, dirs, "persample") as run_parallel:
        with profile.report("joint squaring off/backfilling", dirs):
            samples = joint.square_off(samples, run_parallel)
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
    ## Finalizing BAMs and population databases, handle multicore computation
    final_res = _wres(parallel, ["gemini", "samtools", "fastqc", "bamtools",
                                 "bcbio_variation", "bcbio-variation-recall"])
    with prun.start(final_res, samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def _create_combined_fasta(data, out_dir):
    """Create one transcript FASTA covering every genome being disambiguated.

    Extracts and cleans a per-genome transcript FASTA from each genome's GTF
    and reference, then concatenates them. Existing files are reused; the
    combined file name encodes the disambiguation genome builds.
    """
    fasta_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_gtf_file(odata)
        ref_file = dd.get_ref_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + ".fa")
        if not file_exists(out_file):
            out_file = _gtf_to_fasta(gtf_file, ref_file, out_file)
            # Cleaning rewrites the FASTA in place.
            out_file = _clean_gtf_fa(out_file, out_file)
        fasta_files.append(out_file)
    out_stem = os.path.join(out_dir, dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_stem = "-".join([out_stem] + dd.get_disambiguate(data))
    combined_file = out_stem + ".fa"
    if file_exists(combined_file):
        return combined_file
    # ``cmd.format(**locals())`` below relies on these exact variable names.
    fasta_file_string = " ".join(fasta_files)
    cmd = "cat {fasta_file_string} > {tx_out_file}"
    with file_transaction(combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining transcriptome FASTA files.")
    return combined_file
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    """ChIP-seq pipeline (legacy run_parallel signature): trim, align,
    clean alignments, then per-sample QC."""
    lane_items = run_parallel("trim_lane", lane_items)
    samples = disambiguate.split(lane_items)
    samples = run_parallel("process_alignment", samples)
    samples = run_parallel("clean_chipseq_alignment", samples)
    return qcsummary.generate_parallel(samples, run_parallel)
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """RNA-seq pipeline with up-front sample organization.

    Organizes samples from the run info YAML, then trims, aligns,
    disambiguates, assembles/quantitates transcripts and runs QC.
    NOTE(review): ``with`` nesting reconstructed from statement order.
    """
    organize_res = _wres(parallel, ["aligner"],
                         ensure_mem={"tophat": 8, "tophat2": 8, "star": 2})
    with prun.start(organize_res, [samples[0]], config, dirs,
                    "organize_samples") as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples",
                                   [[dirs, config, run_info_yaml,
                                     [x[0]["description"] for x in samples]]])
    with prun.start(_wres(parallel, ["picard", "cutadapt"]),
                    samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("prepare_sample", samples)
            samples = run_parallel("trim_sample", samples)
    align_res = _wres(parallel, ["aligner", "picard"],
                      ensure_mem={"tophat": 8, "tophat2": 8, "star": 2})
    with prun.start(align_res, samples, config, dirs, "alignment",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("transcript assembly", dirs):
            samples = rnaseq.assemble_transcripts(run_parallel, samples)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)
    qc_res = _wres(parallel, ["picard", "fastqc", "rnaseqc", "kraken"])
    with prun.start(qc_res, samples, config, dirs, "qc") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """Variant-calling pipeline (no joint squaring-off stage).

    Stages: whole-file alignment prep/alignment/callable regions/coverage
    (multicore), region-parallel variant calling (full cluster), per-sample
    variant finalization and validation, then BAM merging, ensemble calling,
    structural variation, population DB, QC and archiving (multicore2).
    NOTE(review): ``with`` nesting reconstructed from statement order.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    align_res = _wres(parallel, ["aligner", "samtools", "sambamba"],
                      (["reference", "fasta"], ["reference", "aligner"], ["files"]))
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment preparation", dirs):
            samples = run_parallel("prep_align_inputs", samples)
            samples = disambiguate.split(samples)
        with profile.report("alignment", dirs):
            samples = run_parallel("process_alignment", samples)
            samples = alignprep.merge_split_alignments(samples, run_parallel)
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("callable regions", dirs):
            samples = run_parallel("postprocess_alignment", samples)
            samples = run_parallel("combine_sample_regions", [samples])
            samples = region.clean_sample_data(samples)
        with profile.report("coverage", dirs):
            samples = coverage.summarize_samples(samples, run_parallel)
    ## Variant calling on sub-regions of the input file (full cluster)
    call_res = _wres(parallel, ["gatk", "picard", "variantcaller"])
    with prun.start(call_res, samples, config, dirs, "full",
                    multiplier=region.get_max_counts(samples),
                    max_multicore=1) as run_parallel:
        with profile.report("alignment post-processing", dirs):
            samples = region.parallel_prep_region(samples, run_parallel)
        with profile.report("variant calling", dirs):
            samples = genotype.parallel_variantcall_region(samples, run_parallel)
    ## Finalize variants (per-sample cluster)
    finalize_res = _wres(parallel, ["gatk", "gatk-vqsr", "snpeff", "bcbio_variation"])
    with prun.start(finalize_res, samples, config, dirs, "persample") as run_parallel:
        with profile.report("variant post-processing", dirs):
            samples = run_parallel("postprocess_variants", samples)
            samples = run_parallel("split_variants_by_sample", samples)
        with profile.report("validation", dirs):
            samples = run_parallel("compare_to_rm", samples)
            samples = genotype.combine_multiple_callers(samples)
    ## Finalizing BAMs and population databases, handle multicore computation
    final_res = _wres(parallel, ["gemini", "samtools", "fastqc", "bamtools",
                                 "bcbio_variation", "bcbio-variation-recall"])
    with prun.start(final_res, samples, config, dirs, "multicore2") as run_parallel:
        with profile.report("prepped BAM merging", dirs):
            samples = region.delayed_bamprep_merge(samples, run_parallel)
        with profile.report("ensemble calling", dirs):
            samples = ensemble.combine_calls_parallel(samples, run_parallel)
        with profile.report("validation summary", dirs):
            samples = validate.summarize_grading(samples)
        with profile.report("structural variation", dirs):
            samples = structural.run(samples, run_parallel)
        with profile.report("population database", dirs):
            samples = population.prep_db_parallel(samples, run_parallel)
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
        with profile.report("archive", dirs):
            samples = archive.compress(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, samples):
    """Variant-calling pipeline (legacy ``global_parallel`` scheduling).

    Rebuilds the parallel runner for each stage: multicore alignment and
    region preparation, region-parallel variant calling, per-sample variant
    post-processing/validation/ensemble, then BAM merging, structural
    variation, population DB and QC. NOTE(review): ``with`` nesting
    reconstructed from statement order in the flattened source.
    """
    ## Alignment and preparation requiring the entire input file (multicore cluster)
    with global_parallel(parallel, "multicore",
                         ["process_alignment", "postprocess_alignment"],
                         samples, dirs, config,
                         multiplier=alignprep.parallel_multiplier(samples)) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment")
        samples = run_parallel("prep_align_inputs", samples)
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
        samples = alignprep.merge_split_alignments(samples, run_parallel)
        samples = disambiguate.resolve(samples, run_parallel)
        samples = run_parallel("postprocess_alignment", samples)
        # Region info computed here also sizes the "full" stage below.
        regions = callable.combine_sample_regions(samples)
        samples = region.add_region_info(samples, regions)
        samples = region.clean_sample_data(samples)
        logger.info("Timing: coverage")
        samples = coverage.summarize_samples(samples, run_parallel)
    ## Variant calling on sub-regions of the input file (full cluster)
    with global_parallel(parallel, "full",
                         ["piped_bamprep", "variantcall_sample"],
                         samples, dirs, config,
                         multiplier=len(regions["analysis"]),
                         max_multicore=1) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: alignment post-processing")
        samples = region.parallel_prep_region(samples, regions, run_parallel)
        logger.info("Timing: variant calling")
        samples = region.parallel_variantcall_region(samples, run_parallel)
    ## Finalize variants (per-sample cluster)
    with global_parallel(parallel, "persample", ["postprocess_variants"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: variant post-processing")
        samples = run_parallel("postprocess_variants", samples)
        logger.info("Timing: validation")
        samples = run_parallel("compare_to_rm", samples)
        samples = combine_multiple_callers(samples)
        logger.info("Timing: ensemble calling")
        samples = ensemble.combine_calls_parallel(samples, run_parallel)
        samples = validate.summarize_grading(samples)
    ## Finalizing BAMs and population databases, handle multicore computation
    with global_parallel(parallel, "multicore2",
                         ["prep_gemini_db", "delayed_bam_merge"],
                         samples, dirs, config) as parallel:
        run_parallel = parallel_runner(parallel, dirs, config)
        logger.info("Timing: prepped BAM merging")
        samples = region.delayed_bamprep_merge(samples, run_parallel)
        logger.info("Timing: structural variation")
        samples = structural.run(samples, run_parallel)
        logger.info("Timing: population database")
        samples = population.prep_db_parallel(samples, run_parallel)
        logger.info("Timing: quality control")
        samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """ChIP-seq pipeline: lane processing, trimming and alignment on the
    multicore cluster, then per-sample alignment cleaning and QC."""
    align_res = _wres(parallel, ["aligner", "picard"])
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        samples = run_parallel("process_lane", samples)
        samples = run_parallel("trim_lane", samples)
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["picard", "fastqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        samples = run_parallel("clean_chipseq_alignment", samples)
        samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    """RNA-seq counts pipeline (legacy run_parallel signature): trim, align,
    disambiguate, count transcripts, attach combined counts and run QC."""
    lane_items = run_parallel("trim_lane", lane_items)
    samples = disambiguate.split(lane_items)
    samples = run_parallel("process_alignment", samples)
    samples = disambiguate.resolve(samples, run_parallel)
    samples = run_parallel("generate_transcript_counts", samples)
    combined = combine_count_files([x[0].get("count_file") for x in samples])
    # Every sample carries a reference to the shared combined count matrix.
    for x in samples:
        x[0]["combined_counts"] = combined
    samples = qcsummary.generate_parallel(samples, run_parallel)
    #run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    return samples
def run(self, config, run_info_yaml, parallel, dirs, samples):
    """ChIP-seq pipeline with up-front sample organization: organize from the
    run info YAML, prepare/trim/align on the multicore cluster, then clean
    alignments and run per-sample QC. NOTE(review): ``with`` nesting
    reconstructed from statement order in the flattened source."""
    align_res = _wres(parallel, ["aligner", "picard"])
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("organize samples", dirs):
            samples = run_parallel("organize_samples",
                                   [[dirs, config, run_info_yaml,
                                     [x[0]["description"] for x in samples]]])
        samples = run_parallel("prepare_sample", samples)
        samples = run_parallel("trim_sample", samples)
        samples = disambiguate.split(samples)
        samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["picard", "fastqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        samples = run_parallel("clean_chipseq_alignment", samples)
        samples = qcsummary.generate_parallel(samples, run_parallel)
    return samples
def run(self, config, config_file, parallel, dirs, samples):
    """RNA-seq pipeline (profiled, combined-count variant): trim, align,
    disambiguate, estimate expression, combine/annotate counts, QC.

    NOTE(review): ``with`` nesting reconstructed from statement order; the
    count-combination code is placed inside the "estimate expression" report.
    """
    trim_res = _wres(parallel, ["picard", "AlienTrimmer"])
    with prun.start(trim_res, samples, config, dirs, "trimming") as run_parallel:
        with profile.report("adapter trimming", dirs):
            samples = run_parallel("process_lane", samples)
            samples = run_parallel("trim_lane", samples)
    align_res = _wres(parallel, ["aligner", "picard"],
                      ensure_mem={"tophat": 8, "tophat2": 8, "star": 40})
    with prun.start(align_res, samples, config, dirs, "multicore",
                    multiplier=alignprep.parallel_multiplier(samples)) as run_parallel:
        with profile.report("alignment", dirs):
            samples = disambiguate.split(samples)
            samples = run_parallel("process_alignment", samples)
    with prun.start(_wres(parallel, ["samtools", "cufflinks"]),
                    samples, config, dirs, "rnaseqcount") as run_parallel:
        with profile.report("disambiguation", dirs):
            samples = disambiguate.resolve(samples, run_parallel)
        with profile.report("estimate expression", dirs):
            samples = rnaseq.estimate_expression(samples, run_parallel)
            combined = combine_count_files([x[0].get("count_file") for x in samples])
            gtf_file = utils.get_in(samples[0][0],
                                    ('genome_resources', 'rnaseq', 'transcripts'),
                                    None)
            annotated = annotate_combined_count_file(combined, gtf_file)
            for x in samples:
                x[0]["combined_counts"] = combined
                # Annotation is optional; only attach it when produced.
                if annotated:
                    x[0]["annotated_combined_counts"] = annotated
    with prun.start(_wres(parallel, ["picard", "fastqc", "rnaseqc"]),
                    samples, config, dirs, "persample") as run_parallel:
        with profile.report("quality control", dirs):
            samples = qcsummary.generate_parallel(samples, run_parallel)
    logger.info("Timing: finished")
    return samples
def run(self, config, config_file, run_parallel, parallel, dirs, lane_items):
    """RNA-seq pipeline (legacy run_parallel signature): trim, align,
    disambiguate, estimate expression, combine and annotate counts, QC."""
    lane_items = run_parallel("trim_lane", lane_items)
    samples = disambiguate.split(lane_items)
    samples = run_parallel("process_alignment", samples)
    samples = disambiguate.resolve(samples, run_parallel)
    samples = rnaseq.estimate_expression(samples, run_parallel)
    #samples = rnaseq.detect_fusion(samples, run_parallel)
    combined = combine_count_files([x[0].get("count_file") for x in samples])
    # Annotation keys off the Ensembl organism alias in this version.
    organism = utils.get_in(samples[0][0],
                            ('genome_resources', 'aliases', 'ensembl'), None)
    annotated = annotate_combined_count_file(combined, organism)
    for x in samples:
        x[0]["combined_counts"] = combined
        x[0]["annotated_combined_counts"] = annotated
    samples = qcsummary.generate_parallel(samples, run_parallel)
    #run_parallel("generate_bigwig", samples, {"programs": ["ucsc_bigwig"]})
    return samples
def create_combined_tx2gene(data):
    """Build a single tx2gene CSV covering every (possibly disambiguated) genome.

    Older variant: always derives tx2gene from the genome GTF (no
    transcriptome-GTF fallback). Existing files are reused; returns the
    combined ``tx2gene.csv`` path.
    """
    out_dir = os.path.join(dd.get_work_dir(data), "inputs", "transcriptome")
    tx2gene_files = []
    for item in disambiguate.split([data]):
        odata = item[0]
        gtf_file = dd.get_gtf_file(odata)
        out_file = os.path.join(out_dir, dd.get_genome_build(odata) + "-tx2gene.csv")
        if not file_exists(out_file):
            out_file = gtf.tx2genefile(gtf_file, out_file, tsv=False)
        tx2gene_files.append(out_file)
    combined_file = os.path.join(out_dir, "tx2gene.csv")
    if file_exists(combined_file):
        return combined_file
    # ``cmd.format(**locals())`` below relies on these exact variable names.
    tx2gene_file_string = " ".join(tx2gene_files)
    cmd = "cat {tx2gene_file_string} > {tx_out_file}"
    with file_transaction(data, combined_file) as tx_out_file:
        do.run(cmd.format(**locals()), "Combining tx2gene CSV files.")
    return combined_file
def disambiguate_split(*args):
    """Thin pass-through wrapper exposing ``disambiguate.split`` under a
    module-level name (e.g. for parallel dispatch by function name)."""
    result = disambiguate.split(*args)
    return result