def RunQC(tenx, workflow):
    workflow.transform(
        name="quality_control",
        func=Run,
        args=(
            tenx,
            pypeliner.managed.InputFile("cellranger.complete"),
            pypeliner.managed.OutputFile("qc.complete"),
        ))
    return workflow
def RunReport(sampleid, workflow):
    workflow.transform(
        name="report",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.InputFile("qc.complete"),
            pypeliner.managed.OutputFile("report.complete"),
        ))
    return workflow
def RunClustering(sampleid, workflow):
    workflow.transform(
        name="clustering",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.InputFile("qc.complete"),
            pypeliner.managed.OutputFile("clustering.complete"),
        ))
    return workflow
def RunQC(sampleid, workflow, species=None):
    if species == "mouse":
        umi = "mouse_umi.png"
        mito = "mouse_mito.png"
        ribo = "mouse_ribo.png"
        counts = "mouse_counts.png"
        sce = "mouse_raw_sce.rdata"
    else:
        umi = "umi.png"
        mito = "mito.png"
        ribo = "ribo.png"
        counts = "counts.png"
        sce = "raw_sce.rdata"
    workflow.transform(
        name="quality_control_{}".format(species),
        func=Run,
        args=(
            sampleid,
            species,
            pypeliner.managed.TempOutputFile(umi),
            pypeliner.managed.TempOutputFile(mito),
            pypeliner.managed.TempOutputFile(ribo),
            pypeliner.managed.TempOutputFile(counts),
            pypeliner.managed.TempOutputFile(sce),
        ))
    return workflow
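# Usage sketch (not part of the original module): the same workflow object can
# be threaded through both branches of RunQC to get side-by-side human and
# mouse QC jobs; the transform names stay unique because the species is folded
# into `name`. The sample id below is a placeholder.
import pypeliner.workflow

workflow = pypeliner.workflow.Workflow()
workflow = RunQC("SAMPLE_0001", workflow, species="human")
workflow = RunQC("SAMPLE_0001", workflow, species="mouse")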
def RunPseudo(sampleid, workflow, full=False):
    workflow.transform(
        name="kallisto",
        func=RunKallisto,
        args=(
            sampleid,
            pypeliner.managed.OutputFile("kallisto.complete"),
        ))
    return workflow
def RunHRD(sampleid, workflow):
    workflow.transform(
        name="hrd",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.TempInputFile("raw_sce.rdata"),
            pypeliner.managed.TempOutputFile("sce_hrd.rdata"),
            "/work/shah/reference/transcriptomes/markers/hrd_pathway.yaml",
            20,
            0.01,
        ))
    return workflow
def RunExhaustion(sampleid, workflow):
    workflow.transform(
        name="exhaustion",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.TempInputFile("raw_sce.rdata"),
            pypeliner.managed.TempOutputFile("sce_exhaustion.rdata"),
            "/work/shah/reference/transcriptomes/markers/hgsc_exhausted.yaml",
            10,
            2,
        ))
    return workflow
def RunCellranger(sampleid, workflow):
    workflow.transform(
        name="cellranger_counts",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.OutputFile("cellranger.complete"),
        ))
    return workflow
def RunStatistics(workflow):
    all_samples = open(config.samples, "r").read().splitlines()
    workflow.transform(
        name="pull_rdata",
        func=RunDownload,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile("sample_path.json", "sample"),
        ))
    return workflow
def RunCorrection(sampleid, workflow):
    workflow.transform(
        name="batch_correct",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.InputFile("qc.complete"),
            pypeliner.managed.OutputFile("correction.complete"),
        ))
    return workflow
def RunCellAssign(sampleid, workflow):
    workflow.transform(
        name="cellassign",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.InputFile("qc.complete"),
            pypeliner.managed.OutputFile("cellassign.complete"),
        ))
    workflow.transform(
        name="cellassignanalysis",
        func=Analysis,
        args=(
            sampleid,
            pypeliner.managed.InputFile("cellassign.complete"),
            pypeliner.managed.OutputFile("cellassignanalysis.complete"),
        ))
    return workflow
def create_workflow_2(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})
    workflow.transform(
        name='dofilestuff1',
        func=do_file_stuff,
        args=(
            mgd.InputFile(input_filename),
            mgd.TempOutputFile('intermediate1'),
            'a'))
    workflow.transform(
        name='dofilestuff2',
        func=do_file_stuff,
        args=(
            mgd.TempInputFile('intermediate1'),
            mgd.OutputFile(output_filename),
            'b'))
    return workflow
def RunSeuratQC(bus_path, workflow):
    # NOTE: mirrors RunQC above; the undefined sampleid/species/umi/... names in
    # the original have been replaced with the bus path and explicit file names.
    workflow.transform(
        name="sctransform",
        func=Run,
        args=(
            bus_path,
            pypeliner.managed.TempOutputFile("umi.png"),
            pypeliner.managed.TempOutputFile("mito.png"),
            pypeliner.managed.TempOutputFile("ribo.png"),
            pypeliner.managed.TempOutputFile("counts.png"),
            pypeliner.managed.TempOutputFile("raw_sce.rdata"),
        ))
    return workflow
def RunCellranger(sampleid, workflow):
    workflow.transform(
        name="download_fastqs",
        func=DownloadFastqs,
        args=(
            sampleid,
            pypeliner.managed.TempOutputFile("download_fastqs.complete"),
        ))
    workflow.transform(
        name="cellranger_counts_human",
        func=Counts,
        args=(
            sampleid,
            pypeliner.managed.TempInputFile("download_fastqs.complete"),
            pypeliner.managed.TempOutputFile("cellranger_human.complete"),
            config.reference,
        ))
    workflow.transform(
        name="cellranger_counts_mouse",
        func=Counts,
        args=(
            sampleid,
            pypeliner.managed.TempInputFile("download_fastqs.complete"),
            pypeliner.managed.TempOutputFile("cellranger_mouse.complete"),
            config.mouse_reference,
        ))
    workflow.transform(
        name="cellranger_upload_human",
        func=RunUpload,
        args=(
            sampleid,
            pypeliner.managed.TempOutputFile("human_upload.complete"),
            "human",
        ))
    workflow.transform(
        name="cellranger_upload_mouse",
        func=RunUpload,
        args=(
            sampleid + "_mouse",
            pypeliner.managed.TempOutputFile("mouse_upload.complete"),
            "mouse",
        ))
    return workflow
def RunCellAssign(sampleid, workflow):
    workflow.transform(
        name="cellassign",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.TempInputFile("raw_sce.rdata"),
            pypeliner.managed.TempOutputFile("sce_cas.rdata"),
            config.rho_matrix,
            10,
            2,
        ))
    workflow.transform(
        name="cellassignanalysis",
        func=Analysis,
        args=(
            sampleid,
            pypeliner.managed.TempInputFile("sce_cas.rdata"),
            pypeliner.managed.TempOutputFile("celltypes.png"),
            pypeliner.managed.TempOutputFile("tsne_by_celltype.png"),
            pypeliner.managed.TempOutputFile("umap_by_celltype.png"),
        ))
    return workflow
def create_workflow_2(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow()
    workflow.transform(
        name='dofilestuff1',
        func='pypeliner.tests.tasks.do_file_stuff',
        args=(
            mgd.InputFile(input_filename),
            mgd.TempOutputFile('intermediate1'),
            'a'))
    workflow.transform(
        name='dofilestuff2',
        func='pypeliner.tests.tasks.do_file_stuff',
        args=(
            mgd.TempInputFile('intermediate1'),
            mgd.OutputFile(output_filename),
            'b'))
    return workflow
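# Note (illustration only): the two create_workflow_2 variants above differ only
# in how `func` is given. It can be the callable itself, or a fully qualified
# dotted-path string that pypeliner imports when the job runs, as in
# 'pypeliner.tests.tasks.do_file_stuff'. A minimal sketch with a hypothetical
# task module `my_tasks` providing copy_file(in_path, out_path):
import pypeliner.workflow
import pypeliner.managed as mgd
import my_tasks  # hypothetical module, for illustration only

workflow = pypeliner.workflow.Workflow()
workflow.transform(
    name='copy_direct',
    func=my_tasks.copy_file,           # pass the callable directly
    args=(mgd.InputFile('in.txt'), mgd.TempOutputFile('copy1')))
workflow.transform(
    name='copy_by_name',
    func='my_tasks.copy_file',         # or pass a dotted-path string
    args=(mgd.TempInputFile('copy1'), mgd.OutputFile('out.txt')))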
def create_workflow_1(input_filename, output_filename):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Read data into a managed object
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename), ))

    # Extract a property of the managed object, modify it
    # and store the result in another managed object
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Write the object to an output file
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(
            mgd.TempInputObj('output_data'),
            mgd.TempOutputFile('output_file')))

    # Recursive workflow
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(
            mgd.TempInputFile('output_file'),
            mgd.OutputFile(output_filename)))

    return workflow
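# Driver sketch (assumption, mirroring the __main__ block of the demultiplexer
# pipeline later in this collection): build the workflow from the factory above
# and hand it to the pypeliner scheduler. The config values are placeholders.
import pypeliner.app

config = {"maxjobs": 4, "tmpdir": "./pipeline_tmp"}  # assumed scheduler options
pyp = pypeliner.app.Pypeline(config=config)
pyp.run(create_workflow_1("input.txt", "output.txt"))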
def RunSeurat(workflow):
    workflow.transform(
        name="run_convert",
        func=RunConvert,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("sce.rdata", "sample"),
            pypeliner.managed.TempOutputFile("seurat.rdata", "sample"),
        ))
    workflow.transform(
        name="run_qc",
        func=RunSeuratWorkflow,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("seurat.rdata", "sample"),
            pypeliner.managed.TempOutputFile("seurat_qcd.rdata", "sample"),
            pypeliner.managed.TempOutputFile("sce_qcd.rdata", "sample"),
        ))
    workflow.transform(
        name="visualize_sample",
        func=RunSeuratViz,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("seurat_qcd.rdata", "sample"),
            pypeliner.managed.TempOutputFile("seurat_umap.png", "sample"),
            pypeliner.managed.TempOutputFile("seurat_umap_celltype.png", "sample"),
            pypeliner.managed.TempOutputFile("seurat_ridge.png", "sample"),
            pypeliner.managed.TempOutputFile("seurat_features.png", "sample"),
        ))
    workflow.transform(
        name="find_markers",
        func=RunMarkers,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("seurat_qcd.rdata", "sample"),
            pypeliner.managed.TempOutputFile("markers.csv", "sample"),
        ))
    return workflow
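# Sketch of how the 'sample' axis consumed by RunSeurat might be instantiated
# (assumption; in this collection it is normally populated by a download step,
# as in RunCloneAlignWorkflow below). `WriteSce` is a hypothetical task that
# writes one sce.rdata chunk per sample; the sample ids are placeholders.
import pypeliner.workflow
import pypeliner.managed as mgd

samples = ["SAMPLE_A", "SAMPLE_B"]
workflow = pypeliner.workflow.Workflow()
workflow.setobj(obj=mgd.OutputChunks('sample'), value=samples)
workflow.transform(
    name="write_sce",
    func=WriteSce,  # hypothetical task
    axes=('sample', ),
    args=(
        mgd.InputInstance('sample'),
        mgd.TempOutputFile("sce.rdata", "sample"),
    ))
workflow = RunSeurat(workflow)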
def RunDifferentialAnalysis(sampleid, workflow):
    workflow.transform(
        name="clustering_de",
        func=RunClusteringDE,
        args=(
            sampleid,
            pypeliner.managed.InputFile("clustering.complete"),
            pypeliner.managed.OutputFile("clustering_de.complete"),
        ))
    workflow.transform(
        name="celltype_de",
        func=RunCellTypeDE,
        args=(
            sampleid,
            pypeliner.managed.InputFile("cellassign.complete"),
            pypeliner.managed.OutputFile("celltype_de.complete"),
        ))
    workflow.transform(
        name="clone_de",
        func=RunCloneDE,
        args=(
            sampleid,
            pypeliner.managed.InputFile("clonealign.complete"),
            pypeliner.managed.OutputFile("clone_de.complete"),
        ))
    return workflow
def RunScvis(sampleid, workflow):
    workflow.transform(
        name="scvis_input",
        func=BuildInput,
        args=(
            sampleid,
            pypeliner.managed.InputFile("qc.complete"),
            pypeliner.managed.OutputFile("scvis_input.complete"),
        ))
    workflow.transform(
        name="run_scvis",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.InputFile("scvis_input.complete"),
            pypeliner.managed.OutputFile("scvis.complete"),
        ))
    workflow.transform(
        name="scvis_analysis",
        func=Analysis,
        args=(
            sampleid,
            pypeliner.managed.InputFile("scvis.complete"),
            pypeliner.managed.OutputFile("scvis_analysis.complete"),
        ))
    return workflow
def RunCollection(workflow):
    print(config.samples)
    all_samples = json.loads(open(config.samples, "r").read())
    workflow.transform(
        name="seurat_integrate",
        func=RunSeuratIntegration,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile("integrated_seurat_seurat.rdata"),
            pypeliner.managed.TempOutputFile("integrated_seurat_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_seurat_umap.png"),
        ))
    workflow.transform(
        name="harmony_integrate",
        func=RunHarmonyIntegration,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile("integrated_harmony_seurat.rdata"),
            pypeliner.managed.TempOutputFile("integrated_harmony_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_harmony_umap.png"),
            pypeliner.managed.TempOutputFile("merged_sce.rdata"),
        ))
    workflow.transform(
        name="scanorama_integrate",
        func=RunScanoramaIntegration,
        args=(
            pypeliner.managed.TempInputFile("merged_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_scanorama_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_scanorama_umap.png"),
        ))
    return workflow
def create_workflow():
    workflow = pypeliner.workflow.Workflow()
    bcl_directory = args.get("bcl", None)
    fastq_directories = args.get("fastqs")
    aggregate = args.get("aggregate_mlibs", list())
    agg_type = args.get("agg_method", "scanorama")
    libbase = args.get("lib_base", None)
    additional = args.get("additional", [])
    prefix = config.prefix
    output = config.jobpath
    recipe = args.get("recipe", "basic")
    try:
        cellranger_folder = os.path.join(output, prefix)
        os.makedirs(cellranger_folder)
    except Exception:
        pass
    if fastq_directories is None:
        fastq_directories = []
    results = Results(output)
    runner = PrimaryRun(workflow, prefix, output)

    """ Aggregating Libraries """
    if aggregate is not None and len(aggregate) > 0:
        if agg_type == "tenx":
            runner.aggregate_libraries_tenx(aggregate, libbase)
            args["tenx"] = os.path.join(output, "run_{}/outs".format(prefix))
        if agg_type == "scanorama":
            runner.aggregate_libraries_scanorama()

    """ Setup """
    tenx_analysis = args.get("tenx", None)
    bcls = runner.set_bcl(bcl_directory)
    fastqs = runner.set_fastq(fastq_directories)
    workflow = runner.get_workflow()
    if fastqs != []:
        tenx_analysis = os.path.join(config.jobpath, prefix, "outs")
    rdata = args.get("rdata", None)
    secondary_analysis = SecondaryAnalysis(workflow, prefix, output)
    tenx = TenxAnalysis(tenx_analysis)

    """ QC """
    secondary_analysis.run_scater()
    secondary_analysis.build_sce(tenx)
    secondary_analysis.set_rdata(rdata)
    results.add_analysis(tenx_analysis)
    results.add_workflow(secondary_analysis.rscript)
    results.add_sce(secondary_analysis.sce)
    umi = os.path.join(output, "figures/umi_distribution.png")
    mito = os.path.join(output, "figures/mito_distribution.png")
    ribo = os.path.join(output, "figures/ribo_distribution.png")
    freq = os.path.join(output, "figures/highestExprs.png")
    tech = os.path.join(output, "figures/mean_variance_trend.png")
    high_var = os.path.join(output, "figures/highly_variable_genes.png")
    results.add_plot(umi, "UMI Distribution")
    results.add_plot(mito, "Mito Distribution")
    results.add_plot(ribo, "Ribo Distribution")
    results.add_plot(freq, "Highest Frequency")
    results.add_plot(tech, "Mean Variance Trend")
    results.add_plot(high_var, "Highly Variable Genes")
    results.add_cellassign_pkl(secondary_analysis.cell_assign_fit)
    results.add_cellassign_raw(secondary_analysis.cell_assign_rdata)

    """ Differential Expression """
    if config.run_de:
        other_samples = []
        # NOTE: `compare` (the samples to test against) is expected to be
        # supplied by the caller; it is not defined in this module.
        for other_sample in compare:
            secondary_analysis.run_de(other_sample)

    """ CellAssign """
    if config.run_cellassign:
        tenx = TenxAnalysis(tenx_analysis)
        if hasattr(config, "rho_matrix"):
            rho_matrix = eval(open(config.rho_matrix, "r").read())
        elif hasattr(config, "tissue"):
            sce = SingleCellExperiment.fromRData(secondary_analysis.sce)
            rho_matrix = generate_json(tenx, sce, config.organ)
        else:
            raise AssertionError("Not implemented.")
        # NOTE: `combine_assign` is expected to be defined at module scope.
        secondary_analysis.run_cell_assign(rho_matrix,
                                           tenx_analysis,
                                           additional=combine_assign)
        results.add_cellassign_pkl(secondary_analysis.cell_assign_fit)
        results.add_cellassign_raw(secondary_analysis.cell_assign_rdata)
        path = secondary_analysis.plot_cell_types()
        results.add_plot(path, "Cell Type Frequency")
        path = secondary_analysis.plot_cell_type_by_cluster(tenx_analysis)
        results.add_plot(path, "Cell Type by Cluster")
        path = secondary_analysis.plot_tsne_by_cell_type()
        results.add_plot(path, "TSNE by Cell Type")
        path = secondary_analysis.plot_pca_by_cell_type()
        results.add_plot(path, "PCA by Cell Type")
        # path = secondary_analysis.plot_umap_by_cell_type()
        # results.add_plot(path, "UMAP by Cell Type")
        path1, path2 = secondary_analysis.marker_analysis(tenx, rho_matrix)
        results.add_plot(path1, "Heat Marker Gene Matrix")
        results.add_plot(path2, "Stacked Vin Marker Gene Matrix")

    """ SCVis """
    if config.run_scvis:
        secondary_analysis.run_scviz(config.perplexity, config.components)

    """ CloneAlign """
    if config.run_clonealign and config.copy_number_data is not None and config.clone_assignments is not None:
        secondary_analysis.run_clone_align(tenx, config.copy_number_data,
                                           config.clone_assignments)

    if config.plot_scvis:
        embedding_file = "{0}_{1}/perplexity_{0}_regularizer_0.001_batch_size_512_learning_rate_0.01_latent_dimension_2_activation_ELU_seed_1_iter_3000.tsv".format(
            config.perplexity, config.components)
        path = secondary_analysis.plot_scvis_by_cluster(tenx_analysis,
                                                        embedding_file,
                                                        pcs=config.components)
        path = os.path.join(output, path)
        results.add_plot(path, "SCVis by Cluster")
        if config.run_cellassign:  # was os.path.exists(config.run_cellassign)
            path = secondary_analysis.plot_scvis_by_cell_type(
                embedding_file, pcs=config.components)
            results.add_plot(path, "SCVIS by Cell Type")

    """ Cluster Analysis """
    if config.clustering:
        path = secondary_analysis.plot_pca_by_cluster(tenx_analysis,
                                                      pcs=config.components)
        results.add_plot(path, "PCA by Cluster")
        path = secondary_analysis.plot_tsne_by_cluster(tenx_analysis,
                                                       pcs=config.components)
        results.add_plot(path, "TSNE by Cluster")
        path = secondary_analysis.plot_umap_by_cluster(tenx_analysis,
                                                       pcs=config.components)
        results.add_plot(path, "UMAP by Cluster")

        secondary_analysis.plot_cluster_markers(tenx_analysis,
                                                rep="PCA",
                                                pcs=config.components)
        pca_cluster_markers = glob.glob("figures/expression/*pca*png")
        for png in pca_cluster_markers:
            title = png.split("/")[-1].replace(".png", "").replace(
                "counts", "gene markers").upper().replace("_", "")
            results.add_plot(png, title)

        secondary_analysis.plot_cluster_markers(tenx_analysis,
                                                rep="TSNE",
                                                pcs=config.components)
        pca_cluster_markers = glob.glob("figures/expression/*tsne*png")
        for png in pca_cluster_markers:
            title = png.split("/")[-1].replace(".png", "").replace(
                "counts", "gene markers").upper().replace("_", "")
            results.add_plot(png, title)

        secondary_analysis.plot_cluster_markers(tenx_analysis,
                                                rep="UMAP",
                                                pcs=config.components)
        pca_cluster_markers = glob.glob("figures/expression/*umap*png")
        for png in pca_cluster_markers:
            title = png.split("/")[-1].replace(".png", "").replace(
                "counts", "gene markers").upper().replace("_", "")
            results.add_plot(png, title)

        embedding_file = "{0}_{1}/perplexity_{0}_regularizer_0.001_batch_size_512_learning_rate_0.01_latent_dimension_2_activation_ELU_seed_1_iter_3000.tsv".format(
            config.perplexity, config.components)
        secondary_analysis.plot_cluster_markers(tenx_analysis,
                                                rep="SCVIS",
                                                pcs=config.components,
                                                embedding_file=embedding_file)
        pca_cluster_markers = glob.glob("figures/expression/*scvis_5_50*png")
        for png in pca_cluster_markers:
            title = png.split("/")[-1].replace(".png", "").replace(
                "counts", "gene markers").upper().replace("_", "")
            results.add_plot(png, title)

    """ Gene Level """

    """ Reporting """
    if config.report:
        workflow.transform(name="{}_markdown".format(prefix),
                           func=exportMD,
                           args=(results, ))
        workflow.transform(name="{}_finalize".format(prefix),
                           func=exportFinalize,
                           args=(results, ))
    workflow = secondary_analysis.get_workflow()
    return workflow
def create_workflow(): """ Generates tasks as Pypeliner workflow based on input arguments. The workflow will start from the most raw input provided and override any downstream tasks with subsequently provided input arguments. Parellelization is performed over provided samplesheets and replication within samples. Args: None Yields: Pypeliner workflow object. """ bcl_directory = args.get("bcl", None) fastq_directory = args.get("fastq", None) tenx_analysis = args.get("tenx", None) rdata = args.get("rdata", None) bcl = BinaryBaseCall(bcl_directory) workflow = pypeliner.workflow.Workflow() if bcl_directory: workflow.transform ( name = "bcl_to_fastq", func = CellRanger.mkfastq, ret = pypeliner.managed.TempOutputObj("fastq_object"), args = ( bcl_object, ) ) if bcl_directory != None or fastq_directory != None: if fastq_directory != None: fastq = FastQDirectory(fastq_directory) # fastqs = list() # for sample_sheet in glob.iglob(os.path.join(fastq_directory, "**/*.csv")): else: fastq = pypeliner.managed.TempInputObj("fastq_object") workflow.transform ( name = "fastq_counts", func = CellRanger.count, ret = pypeliner.managed.TempOutputObj("tenx_analysis"), args = ( fastq, ) ) tenx = None if tenx_analysis != None and rdata == None: tenx = TenxAnalysis(tenx_analysis) elif tenx_analysis == None and rdata == None: tenx = pypeliner.managed.TempInputObj("tenx_analysis") if tenx != None: workflow.transform ( name = "tenx_read10xcounts", func = TenX.read10xCounts, ret = pypeliner.managed.TempOutputObj("single_cell_experiment"), args = ( tenx, ) ) if rdata != None: single_cell_experiment = TenxAnalysis.from_rdata(rdata) else: single_cell_experiment = pypeliner.managed.TempInputObj("single_cell_experiment") # # workflow.transform ( # name = "tenx_barcoderanks", # func = TenX.barcodeRanks, # args = ( # pypeliner.managed.TempInputObj("tenx_analysis"), # ) # ) # # workflow.transform ( # name = "tenx_emptydrops", # func = TenX.emptyDrops, # args = ( # pypeliner.managed.TempInputObj("tenx_analysis"), # ) # ) workflow.transform ( name = "clonealign", func = CloneAlign.run, ret = pypeliner.managed.TempOutputObj("clone_align_fit"), args = ( single_cell_experiment, ) ) # """ # workflow.transform ( # name = "cellasign", # func = CellAssign.run_em, # ret = pypeliner.managed.TempOutputObj("cell_assignments"), # args = ( # single_cell_experiment, # ) # ) # # workflow.transform ( # name = "scviz", # func = SCViz.run, # ret = pypeliner.managed.TempOutputObj("scviz_dim_reduction"), # args = ( # single_cell_experiment, # ) # ) # # workflow.transform ( # name = "html_output", # func = HTMLResults.generate, # args = ( # pypeliner.managed.TempInputObj("fastq_object") # pypeliner.managed.TempInputObj("ten_analysis"), # pypeliner.managed.TempInputObj("single_cell_experiment"), # pypeliner.managed.TempInputObj("clone_align_fit"), # pypeliner.managed.TempInputObj("cell_assignments"), # pypeliner.managed.TempInputObj("scviz_dim_reduction"), # ) # ) """ return workflow
if __name__ == "__main__": fileConfig("logging_config.ini") logging.info("Starting bam_demultiplexer pipeline...") args = vars(parse_args()) pyp = pypeliner.app.Pypeline([demultiplex_bam], config=args) #?? workflow = pypeliner.workflow.Workflow() # demultiplex workflow.transform(name='demultiplex', func=demultiplex_bam.demultiplex, args=(pypeliner.managed.InputFile(args['bam']), pypeliner.managed.TempOutputFile( 'demux', 'split'), args['barcode_csv'])) # collate demultiplexed bams workflow.transform(name='collate', axes=('split', ), func=samtools_wrapper.collate, args=(pypeliner.managed.TempInputFile('demux', 'split'), pypeliner.managed.TempOutputFile( 'collated', 'split'))) # convert collated bams to fastq workflow.transform( name='fastq', axes=('split', ),
            'tool_name'),
        mgd.TempSpace('tool_raw_data', 'tool_name', cleanup='none'),
    ),
)

workflow.transform(
    name='plot',
    axes=('tool_name', ),
    func=destruct.benchmark.destruct_test.create_roc_plot,
    args=(
        mgd.TempInputObj('simulation.params'),
        mgd.TempInputObj('tool_defs', 'tool_name'),
        mgd.InputFile(os.path.join(args['results_dir'], 'genome.fasta')),
        mgd.InputFile(os.path.join(args['results_dir'], 'simulated.tsv')),
        mgd.InputFile(
            os.path.join(args['results_dir'], 'results_{tool_name}.tsv'),
            'tool_name'),
        mgd.OutputFile(
            os.path.join(args['results_dir'], 'annotated_{tool_name}.tsv'),
            'tool_name'),
        mgd.OutputFile(
            os.path.join(args['results_dir'], 'identified_{tool_name}.tsv'),
            'tool_name'),
        mgd.OutputFile(
            os.path.join(args['results_dir'], 'plots_{tool_name}.pdf'),
            'tool_name'),
    ),
)

pyp.run(workflow)
def create_destruct_fastq_workflow(
        fastq1_filenames,
        fastq2_filenames,
        sample1_filenames,
        sample2_filenames,
        stats_filenames,
        breakpoint_table,
        breakpoint_library_table,
        breakpoint_read_table,
        config,
        ref_data_dir,
        raw_data_dir=None,
):
    workflow = pypeliner.workflow.Workflow()

    # Set the library ids
    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(fastq1_filenames.keys()),
    )

    workflow.transform(
        name='readstats',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.read_stats',
        ret=mgd.TempOutputObj('stats', 'bylibrary'),
        args=(
            mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames),
            config['fragment_length_num_stddevs'],
        ),
    )

    # Align a sample of reads and calculate alignment statistics
    workflow.transform(
        name='prepseed_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames),
            mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames),
            36,
            mgd.TempOutputFile('sample.seed', 'bylibrary'),
        ),
    )

    workflow.commandline(
        name='bwtrealign_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('sample.seed', 'bylibrary'),
            '--chunkmbs', '512',
            '-k', '1000',
            '-m', '1000',
            '--strata', '--best', '-S',
            '|',
            'destruct_aligntrue',
            '-a', '-',
            '-1', mgd.InputFile('sample1.fq.gz', 'bylibrary', fnames=sample1_filenames),
            '-2', mgd.InputFile('sample2.fq.gz', 'bylibrary', fnames=sample2_filenames),
            '-r', config['genome_fasta'],
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '-s', mgd.TempOutputFile('samples.align.true', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='scorestats',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.score_stats.create_score_stats',
        args=(
            mgd.TempInputFile('samples.align.true', 'bylibrary'),
            config['match_score'],
            mgd.TempOutputFile('score.stats', 'bylibrary'),
        ),
    )

    # Split discordant fastqs and align
    workflow.transform(
        name='splitfastq1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads1', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='splitfastq2',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads2', 'bylibrary', 'byread', axes_origin=[]),
        ),
    )

    workflow.transform(
        name='prepseed',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            36,
            mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'),
        ),
    )

    workflow.commandline(
        name='bwtrealign',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'),
            '--chunkmbs', '512',
            '-k', '1000',
            '-m', '1000',
            '--strata', '--best', '-S',
            '|',
            'destruct_realign2',
            '-l', mgd.TempInputObj('library_id', 'bylibrary'),
            '-a', '-',
            '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '-r', config['genome_fasta'],
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmin', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--tchimer', config['chimeric_threshold'],
            '--talign', config['alignment_threshold'],
            '--pchimer', config['chimeric_prior'],
            '--tvalid', config['readvalid_threshold'],
            '-z', mgd.TempInputFile('score.stats', 'bylibrary'),
            '--span', mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'),
            '--split', mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='merge_spanning_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'),
        ),
    )

    workflow.commandline(
        name='filterreads',
        axes=('bylibrary', ),
        ctx=lowmem,
        args=(
            'destruct_filterreads',
            '-n', '2',
            '-a', mgd.TempInputFile('spanning.alignments_1', 'bylibrary'),
            '-r', config['satellite_regions'],
            '>', mgd.TempOutputFile('spanning.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('split.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_spanning_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary'),
            mgd.TempOutputFile('spanning.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary'),
            mgd.TempOutputFile('split.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    # Cluster spanning reads
    workflow.setobj(
        obj=mgd.TempOutputObj('chrom.args', 'bychromarg'),
        value=destruct.tasks.generate_chromosome_args(config['chromosomes']),
    )

    workflow.transform(
        name='write_stats_table',
        ctx=lowmem,
        func='destruct.tasks.write_stats_table',
        args=(
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.TempInputObj('stats', 'bylibrary'),
            mgd.TempOutputFile('libstats.tsv'),
        ),
    )

    workflow.commandline(
        name='cluster',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            'destruct_mclustermatepairs',
            '-a', mgd.TempInputFile('spanning.alignments'),
            '-s', mgd.TempInputFile('libstats.tsv'),
            '-c', mgd.TempOutputFile('clusters', 'bychromarg'),
            mgd.TempInputObj('chrom.args', 'bychromarg'),
            '--clustmin', config['cluster_readcount_threshold'],
            '--fragmax', config['fragment_length_max'],
        ),
    )

    # Predict breakpoints from split reads
    workflow.transform(
        name='predict_breaks',
        axes=('bychromarg', ),
        ctx=medmem,
        func='destruct.predict_breaks.predict_breaks',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('spanning.alignments'),
            mgd.TempInputFile('split.alignments'),
            mgd.TempOutputFile('breakpoints_2', 'bychromarg'),
        ),
    )

    workflow.transform(
        name='merge_clusters',
        ctx=lowmem,
        func='destruct.tasks.merge_clusters',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('breakpoints_2', 'bychromarg'),
            mgd.TempOutputFile('clusters'),
            mgd.TempOutputFile('breakpoints_2'),
            mgd.TempOutputFile('merge_clusters.debug'),
        ),
    )

    # Realign reads to breakpoints
    workflow.commandline(
        name='realigntobreaks',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'destruct_realigntobreaks2',
            '-r', config['genome_fasta'],
            '-b', mgd.TempInputFile('breakpoints_2'),
            '-c', mgd.TempInputFile('clusters'),
            '-g', config['gap_score'],
            '-x', config['mismatch_score'],
            '-m', config['match_score'],
            '--flmax', mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--span', mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            '-1', mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2', mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '--realignments', mgd.TempOutputFile('realignments', 'bylibrary', 'byread'),
        ),
    )

    # Calculate likelihoods based on realignments
    workflow.transform(
        name='calculate_realignment_likelihoods',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.predict_breaks.calculate_realignment_likelihoods',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempInputFile('realignments', 'bylibrary', 'byread'),
            mgd.TempInputFile('score.stats', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'),
            config['match_score'],
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_mean'),
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_stddev'),
        ),
    )

    workflow.transform(
        name='merge_likelihoods_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary'),
            mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'),
            '1',
        ),
    )

    workflow.transform(
        name='merge_likelihoods_2',
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2'),
            mgd.TempSpace('merge_likelihoods_2_temp'),
            '1',
        ),
    )

    # Set cover for multi mapping reads
    workflow.transform(
        name='calc_weights',
        ctx=medmem,
        func='destruct.predict_breaks.calculate_cluster_weights',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('cluster_weights'),
        ),
    )

    workflow.commandline(
        name='setcover',
        ctx=medmem,
        args=(
            'destruct_setcover',
            '-c', mgd.TempInputFile('clusters'),
            '-w', mgd.TempInputFile('cluster_weights'),
            '-a', mgd.TempOutputFile('clusters_setcover'),
        ),
    )

    # Select cluster based on setcover
    workflow.transform(
        name='select_clusters',
        ctx=medmem,
        func='destruct.predict_breaks.select_clusters',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('breakpoints_1'),
            mgd.TempInputFile('likelihoods_2'),
            mgd.TempOutputFile('likelihoods_1'),
        ),
    )

    # Select prediction based on max likelihood
    workflow.transform(
        name='select_predictions',
        ctx=himem,
        func='destruct.predict_breaks.select_predictions',
        args=(
            mgd.TempInputFile('breakpoints_1'),
            mgd.TempOutputFile('breakpoints'),
            mgd.TempInputFile('likelihoods_1'),
            mgd.TempOutputFile('likelihoods'),
            config['mate_score_threshold'],
            config['template_length_min_threshold'],
            config['min_alignment_log_likelihood'],
        ),
    )

    # Optionally tabulate supporting reads
    workflow.transform(
        name='tabreads',
        ctx=medmem,
        func='destruct.tasks.tabulate_reads',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.InputFile('reads1.fq.gz', 'bylibrary', fnames=fastq1_filenames),
            mgd.InputFile('reads2.fq.gz', 'bylibrary', fnames=fastq2_filenames),
            mgd.TempOutputFile('breakreads.table.unsorted'),
        ),
    )

    workflow.commandline(
        name='sortreads',
        ctx=medmem,
        args=(
            'sort', '-n',
            mgd.TempInputFile('breakreads.table.unsorted'),
            '>', mgd.OutputFile(breakpoint_read_table),
        ),
    )

    # Tabulate results
    workflow.transform(
        name='tabulate',
        ctx=himem,
        func='destruct.tasks.tabulate_results',
        args=(
            mgd.TempInputFile('breakpoints'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            config['genome_fasta'],
            config['gtf_filename'],
            config['dgv_filename'],
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
        ),
    )

    return workflow
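# The destruct fastq workflow above pulls all of its tuning parameters from the
# `config` mapping. A sketch of the keys it references; the values shown are
# illustrative placeholders, not recommended defaults.
config = {
    'genome_fasta': '/refdata/genome.fa',
    'gtf_filename': '/refdata/genes.gtf',
    'dgv_filename': '/refdata/dgv.txt',
    'satellite_regions': '/refdata/satellites.tsv',
    'chromosomes': ['1', '2', '3'],
    'fragment_length_num_stddevs': 3,
    'fragment_length_max': 1000,
    'reads_per_split': 1000000,
    'gap_score': -4,
    'mismatch_score': -3,
    'match_score': 2,
    'chimeric_threshold': 3,
    'alignment_threshold': 2.5,
    'chimeric_prior': 0.05,
    'readvalid_threshold': 0.01,
    'cluster_readcount_threshold': 2,
    'mate_score_threshold': 0,
    'template_length_min_threshold': 5,
    'min_alignment_log_likelihood': -20,
}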
workflow.setobj(mgd.TempOutputObj('simulation.params'), sim_config['simulation'])

if sim_config['reference'].get('chromosomes', None) is not None:
    genome_fasta = os.path.join(args['results_dir'], 'genome.fa')
    source_bam = os.path.join(args['results_dir'], 'source.bam')
    workflow.setobj(mgd.TempOutputObj('chromosomes'),
                    sim_config['reference']['chromosomes'])
    workflow.transform(
        name='create_ref_bam',
        func=destruct.benchmark.destruct_test.create_ref_bam,
        args=(
            mgd.InputFile(args['ref']),
            mgd.InputFile(args['bam']),
            mgd.OutputFile(genome_fasta),
            mgd.OutputFile(source_bam),
            mgd.TempInputObj('chromosomes'),
        ),
    )
else:
    genome_fasta = args['ref']
    source_bam = args['bam']

workflow.transform(
    name='create_sim',
    func=destruct.benchmark.create_breakpoint_simulation.create_breakpoints,
    args=(
def create_resample_simulation_workflow(
        sim_defs,
        mixture_filename,
        source_filename,
        normal_filename,
        tumour_filename,
        breakpoint_filename,
        config,
        ref_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(
        obj=mgd.TempOutputObj('sim_defs'),
        value=sim_defs,
    )

    workflow.transform(
        name='simulate_germline_alleles',
        ctx={'mem': 8},
        func=remixt.simulations.pipeline.simulate_germline_alleles,
        args=(
            mgd.TempOutputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
            config,
            ref_data_dir,
        ),
    )

    workflow.transform(
        name='resample_normal_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_normal_data,
        args=(
            mgd.OutputFile(normal_filename),
            mgd.InputFile(source_filename),
            mgd.InputFile(mixture_filename),
            mgd.TempInputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
        ),
    )

    workflow.transform(
        name='resample_tumour_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_tumour_data,
        args=(
            mgd.OutputFile(tumour_filename),
            mgd.InputFile(source_filename),
            mgd.InputFile(mixture_filename),
            mgd.TempInputFile('germline_alleles'),
            mgd.TempInputObj('sim_defs'),
        ),
    )

    workflow.transform(
        name='write_breakpoints',
        func=remixt.simulations.pipeline.write_breakpoints,
        args=(
            mgd.OutputFile(breakpoint_filename),
            mgd.InputFile(mixture_filename),
        ),
    )

    return workflow
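# Composition sketch (assumption): the factory above can be nested into a parent
# pipeline with workflow.subworkflow, the same pattern used by create_workflow_1
# and generate_bam in this collection. File names, sim_defs and config values
# are placeholders.
import pypeliner.workflow
import pypeliner.managed as mgd

parent = pypeliner.workflow.Workflow()
parent.subworkflow(
    name='resample_simulation',
    func=create_resample_simulation_workflow,
    args=(
        {'num_breakpoints': 100},        # sim_defs (placeholder)
        mgd.InputFile('mixture.pickle'),
        mgd.InputFile('source.bam'),
        mgd.OutputFile('normal.bam'),
        mgd.OutputFile('tumour.bam'),
        mgd.OutputFile('breakpoints.tsv'),
        {'chromosomes': ['20', '21']},   # config (placeholder)
        '/refdata',                      # ref_data_dir (placeholder)
    ),
)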
def generate_bam(
        simulation_params,
        chromosomes,
        include_nonchromosomal,
        simulated_bam_filename,
        genome_fasta_filename,
        simulated_table_filename,
        raw_data_dir,
):
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    workflow.setobj(mgd.TempOutputObj('simulation.params'), simulation_params)
    workflow.setobj(mgd.TempOutputObj('chromosomes'), chromosomes)
    workflow.setobj(mgd.TempOutputObj('include_nonchromosomal'), include_nonchromosomal)

    workflow.transform(
        name='create_genome',
        func=destruct.benchmark.destruct_test.create_genome,
        args=(
            mgd.TempInputObj('chromosomes'),
            mgd.TempInputObj('include_nonchromosomal'),
            mgd.OutputFile(genome_fasta_filename),
        ),
    )

    workflow.transform(
        name='create_sim',
        func=destruct.benchmark.create_breakpoint_simulation.create,
        args=(
            mgd.TempInputObj('simulation.params'),
            mgd.InputFile(genome_fasta_filename),
            mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.fasta')),
            mgd.OutputFile(simulated_table_filename),
            mgd.TempOutputFile('concordant.1.fastq'),
            mgd.TempOutputFile('concordant.2.fastq'),
            mgd.TempOutputFile('discordant.1.fastq'),
            mgd.TempOutputFile('discordant.2.fastq'),
        ),
    )

    workflow.commandline(
        name='cat1',
        args=(
            'cat',
            mgd.TempInputFile('concordant.1.fastq'),
            mgd.TempInputFile('discordant.1.fastq'),
            '>', mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.1.fastq')),
        ),
    )

    workflow.commandline(
        name='cat2',
        args=(
            'cat',
            mgd.TempInputFile('concordant.2.fastq'),
            mgd.TempInputFile('discordant.2.fastq'),
            '>', mgd.OutputFile(os.path.join(raw_data_dir, 'simulated.2.fastq')),
        ),
    )

    workflow.subworkflow(
        name='bwa_align',
        func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow,
        args=(
            mgd.InputFile(genome_fasta_filename),
            mgd.InputFile(os.path.join(raw_data_dir, 'simulated.1.fastq')),
            mgd.InputFile(os.path.join(raw_data_dir, 'simulated.2.fastq')),
            mgd.TempOutputFile('simulated.unsorted.bam'),
        ),
    )

    workflow.transform(
        name='samtools_sort_index',
        func=destruct.benchmark.destruct_test.samtools_sort_index,
        args=(
            mgd.TempInputFile('simulated.unsorted.bam'),
            mgd.OutputFile(simulated_bam_filename),
        ),
    )

    return workflow
def RunCloneAlignWorkflow(workflow):
    print("Creating workflow.")
    all_samples = open(config.samples, "r").read().splitlines()
    all_samples = [
        sample.strip() for sample in all_samples if sample.strip() != ""
    ]
    workflow.transform(
        name="download_collection",
        func=RunDownload,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile("sample_path.json", "sample"),
        ))
    workflow.transform(
        name="extract_rdata",
        func=RunExtract,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("sample_path.json", "sample"),
            pypeliner.managed.TempOutputFile("sample.rdata", "sample"),
        ))
    workflow.transform(
        name="run_cellassign",
        func=RunCellAssign,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("sample.rdata", "sample"),
            pypeliner.managed.TempOutputFile("cell_annotated.rdata", "sample"),
        ))
    workflow.transform(
        name="run_modecn",
        func=RunModeCopyNumber,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempOutputFile("copy_number_data.csv", "sample"),
        ))
    workflow.transform(
        name="run_clonealigninputs",
        func=RunCloneAlignInput,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("cell_annotated.rdata", "sample"),
            pypeliner.managed.TempInputFile("copy_number_data.csv", "sample"),
            pypeliner.managed.TempOutputFile("clone.rdata", "sample"),
            pypeliner.managed.TempOutputFile("cnv.rdata", "sample"),
            pypeliner.managed.TempOutputFile("rawcnv.rdata", "sample"),
        ))
    workflow.transform(
        name="run_clonealign",
        func=RunCloneAlign,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("clone.rdata", "sample"),
            pypeliner.managed.TempInputFile("cnv.rdata", "sample"),
            pypeliner.managed.TempOutputFile("clone_annotated.rdata", "sample"),
            pypeliner.managed.TempOutputFile("cal.rdata", "sample"),
        ))
    if len(all_samples) > 1:
        workflow.transform(
            name="run_convert",
            func=RunConvert,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("clone_annotated.rdata", "sample"),
                pypeliner.managed.TempInputFile("cell_annotated.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat.rdata", "sample"),
            ))
        workflow.transform(
            name="run_qc",
            func=RunSeuratWorkflow,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("seurat.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat_qcd.rdata", "sample"),
                pypeliner.managed.TempOutputFile("sce_qcd.rdata", "sample"),
            ))
        workflow.transform(
            name="visualize_sample",
            func=RunSeuratViz,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("seurat_qcd.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat_umap.png", "sample"),
                pypeliner.managed.TempOutputFile("seurat_umap_celltype.png", "sample"),
                pypeliner.managed.TempOutputFile("seurat_umap_clone.png", "sample"),
            ))
        workflow.transform(
            name="integrate",
            func=RunIntegration,
            args=(
                pypeliner.managed.TempInputFile("seurat_qcd.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat_integrated.rdata"),
                pypeliner.managed.TempOutputFile("sce_integrated.rdata"),
            ))
        workflow.transform(
            name="run_figures",
            func=RunFigures,
            args=(
                pypeliner.managed.TempInputFile("sce_integrated.rdata"),
                pypeliner.managed.TempOutputFile("umap_cell.png"),
                pypeliner.managed.TempOutputFile("umap_clone.png"),
                pypeliner.managed.TempOutputFile("umap_sample.png"),
            ))
    else:
        workflow.transform(
            name="run_figures_single_sample",
            func=RunFigures,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("clone_annotated.rdata", "sample"),
                pypeliner.managed.TempOutputFile("umap_cell.png", "sample"),
                pypeliner.managed.TempOutputFile("umap_clone.png", "sample"),
                pypeliner.managed.TempOutputFile("umap_sample.png", "sample"),
            ))
    return workflow