Beispiel #1
0
def RunQC(tenx, workflow):
    """Attach the quality-control step to *workflow* and return it.

    The step consumes the cellranger completion flag and emits the QC
    completion flag.
    """
    managed = pypeliner.managed
    qc_args = (
        tenx,
        managed.InputFile("cellranger.complete"),
        managed.OutputFile("qc.complete"),
    )
    workflow.transform(name="quality_control", func=Run, args=qc_args)
    return workflow
Beispiel #2
0
def RunReport(sampleid, workflow):
    """Register the report-generation step on *workflow* and return it."""
    workflow.transform(
        name="report",
        func=Run,
        args=(
            sampleid,
            pypeliner.managed.InputFile("qc.complete"),
            pypeliner.managed.OutputFile("report.complete"),
        ),
    )
    return workflow
Beispiel #3
0
def RunClustering(sampleid, workflow):
    """Register the clustering step on *workflow* and return it."""
    managed = pypeliner.managed
    clustering_args = (
        sampleid,
        managed.InputFile("qc.complete"),
        managed.OutputFile("clustering.complete"),
    )
    workflow.transform(name="clustering", func=Run, args=clustering_args)
    return workflow
Beispiel #4
0
def RunQC(sampleid, workflow, species=None):
    """Attach the per-species quality-control step to *workflow*.

    Mouse runs write to ``mouse_``-prefixed temp outputs; any other
    species value (including ``None``) uses the unprefixed names.
    """
    prefix = "mouse_" if species == "mouse" else ""
    umi = prefix + "umi.png"
    mito = prefix + "mito.png"
    ribo = prefix + "ribo.png"
    counts = prefix + "counts.png"
    sce = prefix + "raw_sce.rdata"
    workflow.transform(
        name="quality_control_{}".format(species),
        func=Run,
        args=(
            sampleid,
            species,
            pypeliner.managed.TempOutputFile(umi),
            pypeliner.managed.TempOutputFile(mito),
            pypeliner.managed.TempOutputFile(ribo),
            pypeliner.managed.TempOutputFile(counts),
            pypeliner.managed.TempOutputFile(sce),
        ))
    return workflow
def RunPseudo(sampleid, workflow, full=False):
    """Attach the kallisto pseudo-alignment step to *workflow*.

    ``full`` is accepted for interface compatibility but is not
    consulted by this step.
    """
    workflow.transform(
        name="kallisto",
        func=RunKallisto,
        args=(sampleid, pypeliner.managed.OutputFile("kallisto.complete")),
    )
    return workflow
Beispiel #6
0
def RunHRD(sampleid, workflow,
           pathway_yaml="/work/shah/reference/transcriptomes/markers/hrd_pathway.yaml"):
    """Attach the HRD pathway-scoring step to *workflow*.

    Args:
        sampleid: Sample identifier forwarded to the task.
        workflow: pypeliner workflow to extend.
        pathway_yaml: Pathway/marker definition file.  Generalized from
            the previously hard-coded path; the default preserves the
            old behavior for existing callers.

    Returns:
        The workflow with the "hrd" transform registered.
    """
    workflow.transform(
        name="hrd",
        func=Run,
        args=(sampleid,
              pypeliner.managed.TempInputFile("raw_sce.rdata"),
              pypeliner.managed.TempOutputFile("sce_hrd.rdata"),
              pathway_yaml,
              # NOTE(review): the meaning of these two constants is not
              # visible from this file (presumably filter thresholds) —
              # confirm against Run's signature before parameterizing.
              20, 0.01))
    return workflow
Beispiel #7
0
def RunExhaustion(sampleid, workflow,
                  marker_yaml="/work/shah/reference/transcriptomes/markers/hgsc_exhausted.yaml"):
    """Attach the exhaustion-marker scoring step to *workflow*.

    Args:
        sampleid: Sample identifier forwarded to the task.
        workflow: pypeliner workflow to extend.
        marker_yaml: Marker definition file.  Generalized from the
            previously hard-coded path; the default preserves the old
            behavior for existing callers.

    Returns:
        The workflow with the "exhaustion" transform registered.
    """
    workflow.transform(
        name="exhaustion",
        func=Run,
        args=(sampleid,
              pypeliner.managed.TempInputFile("raw_sce.rdata"),
              pypeliner.managed.TempOutputFile("sce_exhaustion.rdata"),
              marker_yaml,
              # NOTE(review): semantics of these two constants are not
              # visible here — confirm against Run's signature.
              10, 2))
    return workflow
Beispiel #8
0
def RunCellranger(sampleid, workflow):
    """Register the cellranger counts step on *workflow* and return it."""
    workflow.transform(
        name="cellranger_counts",
        func=Run,
        args=(sampleid, pypeliner.managed.OutputFile("cellranger.complete")),
    )
    return workflow
Beispiel #9
0
def RunStatistics(workflow):
    """Register the rdata-download step for every sample in the manifest.

    Reads the newline-separated sample list named by ``config.samples``
    and wires a single transform that downloads per-sample rdata paths.

    Returns:
        The workflow with the "pull_rdata" transform registered.
    """
    # Use a context manager so the manifest handle is closed promptly
    # (the original leaked it to the garbage collector).
    with open(config.samples, "r") as handle:
        all_samples = handle.read().splitlines()
    workflow.transform(
        name="pull_rdata",
        func=RunDownload,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile("sample_path.json", "sample"),
        ),
    )
    return workflow
Beispiel #10
0
def RunCorrection(sampleid, workflow):
    """Register the batch-correction step on *workflow* and return it."""
    correction_args = (
        sampleid,
        pypeliner.managed.InputFile("qc.complete"),
        pypeliner.managed.OutputFile("correction.complete"),
    )
    workflow.transform(name="batch_correct", func=Run, args=correction_args)
    return workflow
Beispiel #11
0
def RunCellAssign(sampleid, workflow):
    """Chain the cellassign fit and its downstream analysis onto *workflow*."""
    managed = pypeliner.managed
    workflow.transform(
        name="cellassign",
        func=Run,
        args=(
            sampleid,
            managed.InputFile("qc.complete"),
            managed.OutputFile("cellassign.complete"),
        ),
    )
    workflow.transform(
        name="cellassignanalysis",
        func=Analysis,
        args=(
            sampleid,
            managed.InputFile("cellassign.complete"),
            managed.OutputFile("cellassignanalysis.complete"),
        ),
    )
    return workflow
Beispiel #12
0
def create_workflow_2(input_filename, output_filename):
    """Two-stage file workflow: input -> intermediate ('a'), then
    intermediate -> output ('b')."""
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    stage_one = (
        mgd.InputFile(input_filename),
        mgd.TempOutputFile('intermediate1'),
        'a',
    )
    workflow.transform(name='dofilestuff1', func=do_file_stuff,
                       args=stage_one)

    stage_two = (
        mgd.TempInputFile('intermediate1'),
        mgd.OutputFile(output_filename),
        'b',
    )
    workflow.transform(name='dofilestuff2', func=do_file_stuff,
                       args=stage_two)

    return workflow
Beispiel #13
0
def RunSeuratQC(bus_path, workflow):
    # NOTE(review): this block looks unfinished — `bus_path` is never
    # used, and `species`, `sampleid`, `umi`, `mito`, `ribo`, `counts`
    # and `sce` are not defined locally, so the call raises NameError
    # unless those happen to exist as module globals; confirm intent.
    # NOTE(review): "sctransform".format(species) is a no-op — the string
    # has no "{}" placeholder, so the task is always named "sctransform";
    # "sctransform_{}".format(species) was probably intended.

    workflow.transform(name="sctransform".format(species),
                       func=Run,
                       args=(
                           sampleid,
                           species,
                           pypeliner.managed.TempOutputFile(umi),
                           pypeliner.managed.TempOutputFile(mito),
                           pypeliner.managed.TempOutputFile(ribo),
                           pypeliner.managed.TempOutputFile(counts),
                           pypeliner.managed.TempOutputFile(sce),
                       ))
    return workflow
def RunCellranger(sampleid, workflow):
    """Download fastqs, run cellranger counts against both the human and
    mouse references, and upload both result sets."""
    managed = pypeliner.managed

    workflow.transform(
        name="download_fastqs",
        func=DownloadFastqs,
        args=(sampleid, managed.TempOutputFile("download_fastqs.complete")),
    )

    workflow.transform(
        name="cellranger_counts_human",
        func=Counts,
        args=(
            sampleid,
            managed.TempInputFile("download_fastqs.complete"),
            managed.TempOutputFile("cellranger_human.complete"),
            config.reference,
        ),
    )

    workflow.transform(
        name="cellranger_counts_mouse",
        func=Counts,
        args=(
            sampleid,
            managed.TempInputFile("download_fastqs.complete"),
            managed.TempOutputFile("cellranger_mouse.complete"),
            config.mouse_reference,
        ),
    )

    workflow.transform(
        name="cellranger_upload_human",
        func=RunUpload,
        args=(
            sampleid,
            managed.TempOutputFile("human_upload.complete"),
            "human",
        ),
    )

    workflow.transform(
        name="cellranger_upload_mouse",
        func=RunUpload,
        args=(
            sampleid + "_mouse",
            managed.TempOutputFile("mouse_upload.complete"),
            "mouse",
        ),
    )

    return workflow
Beispiel #15
0
def RunCellAssign(sampleid, workflow):
    """Fit CellAssign on the raw SCE, then plot the resulting cell types."""
    managed = pypeliner.managed
    workflow.transform(
        name="cellassign",
        func=Run,
        args=(
            sampleid,
            managed.TempInputFile("raw_sce.rdata"),
            managed.TempOutputFile("sce_cas.rdata"),
            config.rho_matrix,
            10,
            2,
        ),
    )
    workflow.transform(
        name="cellassignanalysis",
        func=Analysis,
        args=(
            sampleid,
            managed.TempInputFile("sce_cas.rdata"),
            managed.TempOutputFile("celltypes.png"),
            managed.TempOutputFile("tsne_by_celltype.png"),
            managed.TempOutputFile("umap_by_celltype.png"),
        ),
    )
    return workflow
Beispiel #16
0
def create_workflow_2(input_filename, output_filename):
    """Two-stage workflow using fully-qualified task names: input ->
    intermediate ('a'), intermediate -> output ('b')."""
    workflow = pypeliner.workflow.Workflow()

    workflow.transform(name='dofilestuff1',
                       func='pypeliner.tests.tasks.do_file_stuff',
                       args=(mgd.InputFile(input_filename),
                             mgd.TempOutputFile('intermediate1'),
                             'a'))

    workflow.transform(name='dofilestuff2',
                       func='pypeliner.tests.tasks.do_file_stuff',
                       args=(mgd.TempInputFile('intermediate1'),
                             mgd.OutputFile(output_filename),
                             'b'))

    return workflow
Beispiel #17
0
def create_workflow_1(input_filename, output_filename):
    """Read, transform and write data, then hand off to the sub-workflow."""
    workflow = pypeliner.workflow.Workflow(default_ctx={'mem': 1})

    # Load the input file into a managed object.
    workflow.transform(
        name='read',
        func=read_stuff,
        ret=mgd.TempOutputObj('input_data'),
        args=(mgd.InputFile(input_filename), ))

    # Derive a new managed object from one property of the first.
    workflow.transform(
        name='do',
        func=do_stuff,
        ret=mgd.TempOutputObj('output_data'),
        args=(mgd.TempInputObj('input_data').prop('some_string'), ))

    # Persist the derived object to a temp file.
    workflow.transform(
        name='write',
        func=write_stuff,
        args=(mgd.TempInputObj('output_data'),
              mgd.TempOutputFile('output_file')))

    # Delegate the final conversion to the nested workflow.
    workflow.subworkflow(
        name='sub_workflow_2',
        func=create_workflow_2,
        args=(mgd.TempInputFile('output_file'),
              mgd.OutputFile(output_filename)))

    return workflow
def RunSeurat(workflow):
    """Per-sample Seurat pipeline: convert, QC, visualize, find markers."""
    tmp_in = pypeliner.managed.TempInputFile
    tmp_out = pypeliner.managed.TempOutputFile

    workflow.transform(
        name="run_convert",
        func=RunConvert,
        axes=('sample', ),
        args=(tmp_in("sce.rdata", "sample"),
              tmp_out("seurat.rdata", "sample")))

    workflow.transform(
        name="run_qc",
        func=RunSeuratWorkflow,
        axes=('sample', ),
        args=(tmp_in("seurat.rdata", "sample"),
              tmp_out("seurat_qcd.rdata", "sample"),
              tmp_out("sce_qcd.rdata", "sample")))

    workflow.transform(
        name="visualize_sample",
        func=RunSeuratViz,
        axes=('sample', ),
        args=(tmp_in("seurat_qcd.rdata", "sample"),
              tmp_out("seurat_umap.png", "sample"),
              tmp_out("seurat_umap_celltype.png", "sample"),
              tmp_out("seurat_ridge.png", "sample"),
              tmp_out("seurat_features.png", "sample")))

    workflow.transform(
        name="find_markers",
        func=RunMarkers,
        axes=('sample', ),
        args=(tmp_in("seurat_qcd.rdata", "sample"),
              tmp_out("markers.csv", "sample")))
    return workflow
Beispiel #19
0
def RunDifferentialAnalysis(sampleid, workflow):
    """Attach the three differential-expression steps (by cluster, by
    cell type and by clone) to *workflow*."""
    managed = pypeliner.managed
    de_steps = (
        ("clustering_de", RunClusteringDE, "clustering.complete",
         "clustering_de.complete"),
        ("celltype_de", RunCellTypeDE, "cellassign.complete",
         "celltype_de.complete"),
        ("clone_de", RunCloneDE, "clonealign.complete",
         "clone_de.complete"),
    )
    for step_name, step_func, flag_in, flag_out in de_steps:
        workflow.transform(
            name=step_name,
            func=step_func,
            args=(sampleid, managed.InputFile(flag_in),
                  managed.OutputFile(flag_out)))

    return workflow
Beispiel #20
0
def RunScvis(sampleid, workflow):
    """Wire scvis input construction, the scvis run, and its analysis
    as a chain of completion-flag-linked steps."""
    stages = (
        ("scvis_input", BuildInput, "qc.complete", "scvis_input.complete"),
        ("run_scvis", Run, "scvis_input.complete", "scvis.complete"),
        ("scvis_analysis", Analysis, "scvis.complete",
         "scvis_analysis.complete"),
    )
    for stage_name, stage_func, upstream, downstream in stages:
        workflow.transform(
            name=stage_name,
            func=stage_func,
            args=(sampleid, pypeliner.managed.InputFile(upstream),
                  pypeliner.managed.OutputFile(downstream)))

    return workflow
Beispiel #21
0
def RunCollection(workflow):
    """Integrate all samples with Seurat, Harmony and Scanorama.

    Reads the JSON sample manifest named by ``config.samples`` and
    registers one integration step per method; the Scanorama step
    consumes the merged SCE produced by the Harmony step.

    Returns:
        The workflow with the three integration transforms registered.
    """
    print(config.samples)
    # Use a context manager so the manifest handle is closed promptly
    # (the original leaked it), and json.load instead of loads(read()).
    with open(config.samples, "r") as handle:
        all_samples = json.load(handle)

    workflow.transform(
        name="seurat_integrate",
        func=RunSeuratIntegration,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile("integrated_seurat_seurat.rdata"),
            pypeliner.managed.TempOutputFile("integrated_seurat_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_seurat_umap.png"),
        ))

    workflow.transform(
        name="harmony_integrate",
        func=RunHarmonyIntegration,
        args=(
            all_samples,
            pypeliner.managed.TempOutputFile(
                "integrated_harmony_seurat.rdata"),
            pypeliner.managed.TempOutputFile("integrated_harmony_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_harmony_umap.png"),
            pypeliner.managed.TempOutputFile("merged_sce.rdata"),
        ))

    workflow.transform(
        name="scanorama_integrate",
        func=RunScanoramaIntegration,
        args=(
            pypeliner.managed.TempInputFile("merged_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_scanorama_sce.rdata"),
            pypeliner.managed.TempOutputFile("integrated_scanorama_umap.png"),
        ))

    return workflow
Beispiel #22
0
def _scvis_embedding_file():
    # Relative path of the scvis embedding produced with the configured
    # perplexity/components and the pipeline's fixed training settings.
    # (Previously duplicated as a literal in two places.)
    return (
        "{0}_{1}/perplexity_{0}_regularizer_0.001_batch_size_512_"
        "learning_rate_0.01_latent_dimension_2_activation_ELU_seed_1_"
        "iter_3000.tsv"
    ).format(config.perplexity, config.components)


def _add_marker_plots(results, pattern):
    # Register every expression figure matching *pattern*, deriving the
    # title from the filename (uppercased, "counts" -> "gene markers",
    # underscores stripped).  Previously copy-pasted four times.
    for png in glob.glob(pattern):
        title = png.split("/")[-1].replace(".png", "").replace(
            "counts", "gene markers").upper().replace("_", "")
        results.add_plot(png, title)


def create_workflow():
    """Assemble the full single-sample analysis workflow.

    Run options come from the module-level ``args`` mapping and the
    ``config`` object.  Covers primary processing (bcl/fastq intake,
    optional library aggregation), QC, and the optional downstream
    stages (differential expression, CellAssign, SCVis, CloneAlign,
    cluster analysis and reporting).

    Returns:
        The assembled pypeliner workflow.
    """
    workflow = pypeliner.workflow.Workflow()

    bcl_directory = args.get("bcl", None)
    fastq_directories = args.get("fastqs")
    aggregate = args.get("aggregate_mlibs", list())
    agg_type = args.get("agg_method", "scanorama")
    libbase = args.get("lib_base", None)
    additional = args.get("additional", [])
    prefix = config.prefix
    output = config.jobpath
    recipe = args.get("recipe", "basic")

    cellranger_folder = os.path.join(output, prefix)
    try:
        os.makedirs(cellranger_folder)
    except OSError:
        # Best effort: the directory usually already exists.  Narrowed
        # from a blanket `except Exception` so programming errors still
        # surface.
        pass

    if fastq_directories is None:
        fastq_directories = []

    results = Results(output)
    runner = PrimaryRun(workflow, prefix, output)

    # --- Aggregating libraries ---------------------------------------
    if aggregate:
        if agg_type == "tenx":
            runner.aggregate_libraries_tenx(aggregate, libbase)
            args["tenx"] = os.path.join(output, "run_{}/outs".format(prefix))
        if agg_type == "scanorama":
            runner.aggregate_libraries_scanorama()

    # --- Setup -------------------------------------------------------
    bcls = runner.set_bcl(bcl_directory)
    fastqs = runner.set_fastq(fastq_directories)
    workflow = runner.get_workflow()

    # (The original fetched args["tenx"] twice; once is sufficient.)
    tenx_analysis = args.get("tenx", None)

    if fastqs != []:
        tenx_analysis = os.path.join(config.jobpath, prefix, "outs")

    rdata = args.get("rdata", None)

    secondary_analysis = SecondaryAnalysis(workflow, prefix, output)
    tenx = TenxAnalysis(tenx_analysis)

    # --- QC ----------------------------------------------------------
    secondary_analysis.run_scater()
    secondary_analysis.build_sce(tenx)
    secondary_analysis.set_rdata(rdata)

    results.add_analysis(tenx_analysis)
    results.add_workflow(secondary_analysis.rscript)
    results.add_sce(secondary_analysis.sce)

    # Register the standard QC figures produced under figures/.
    qc_plots = (
        ("figures/umi_distribution.png", "UMI Distribution"),
        ("figures/mito_distribution.png", "Mito Distribution"),
        ("figures/ribo_distribution.png", "Ribo Distribution"),
        ("figures/highestExprs.png", "Highest Frequency"),
        ("figures/mean_variance_trend.png", "Mean Variance Trend"),
        ("figures/highly_variable_genes.png", "Highly Variable Genes"),
    )
    for rel_path, plot_title in qc_plots:
        results.add_plot(os.path.join(output, rel_path), plot_title)

    results.add_cellassign_pkl(secondary_analysis.cell_assign_fit)
    results.add_cellassign_raw(secondary_analysis.cell_assign_rdata)

    # --- Differential expression -------------------------------------
    if config.run_de:
        other_samples = []
        # NOTE(review): `compare` is not defined in this module view, and
        # the loop body prints debug output then terminates the process —
        # this branch looks like unfinished scaffolding; confirm before
        # enabling config.run_de.
        for other_sample in compare:
            print("blah")
            exit(0)
            secondary_analysis.run_de(other_sample)

    # --- CellAssign --------------------------------------------------
    if config.run_cellassign:
        tenx = TenxAnalysis(tenx_analysis)
        if hasattr(config, "rho_matrix"):
            # SECURITY: eval() executes arbitrary code from the marker
            # file — only trusted files may be configured here (consider
            # ast.literal_eval if the file is a plain literal).
            with open(config.rho_matrix, "r") as handle:
                rho_matrix = eval(handle.read())
        elif hasattr(config, "tissue"):
            sce = SingleCellExperiment.fromRData(secondary_analysis.sce)
            rho_matrix = generate_json(tenx, sce, config.organ)
        else:
            raise AssertionError("Not implemented.")
        # NOTE(review): `combine_assign` is not defined in this module
        # view — presumably a module-level flag; verify before running.
        secondary_analysis.run_cell_assign(rho_matrix,
                                           tenx_analysis,
                                           additional=combine_assign)
        results.add_cellassign_pkl(secondary_analysis.cell_assign_fit)
        results.add_cellassign_raw(secondary_analysis.cell_assign_rdata)

        path = secondary_analysis.plot_cell_types()
        results.add_plot(path, "Cell Type Frequency")

        path = secondary_analysis.plot_cell_type_by_cluster(tenx_analysis)
        results.add_plot(path, "Cell Type by Cluster")

        path = secondary_analysis.plot_tsne_by_cell_type()
        results.add_plot(path, "TSNE by Cell Type")

        path = secondary_analysis.plot_pca_by_cell_type()
        results.add_plot(path, "PCA by Cell Type")

        # path = secondary_analysis.plot_umap_by_cell_type()
        # results.add_plot(path, "UMAP by Cell Type")

        path1, path2 = secondary_analysis.marker_analysis(tenx, rho_matrix)
        results.add_plot(path1, "Heat Marker Gene Matrix")
        results.add_plot(path2, "Stacked Vin Marker Gene Matrix")

    # --- SCVis -------------------------------------------------------
    if config.run_scvis:
        secondary_analysis.run_scviz(config.perplexity, config.components)

    # --- CloneAlign --------------------------------------------------
    if config.run_clonealign and config.copy_number_data is not None and config.clone_assignments is not None:
        secondary_analysis.run_clone_align(tenx, config.copy_number_data,
                                           config.clone_assignments)

    if config.plot_scvis:
        embedding_file = _scvis_embedding_file()
        path = secondary_analysis.plot_scvis_by_cluster(tenx_analysis,
                                                        embedding_file,
                                                        pcs=config.components)
        path = os.path.join(output, path)
        results.add_plot(path, "SCVis by Cluster")

        # NOTE(review): os.path.exists() on `config.run_cellassign` is
        # suspicious — elsewhere that attribute is treated as a boolean,
        # not a path; confirm the intended condition.
        if os.path.exists(config.run_cellassign):
            path = secondary_analysis.plot_scvis_by_cell_type(
                embedding_file, pcs=config.components)
            results.add_plot(path, "SCVIS by Cell Type")

    # --- Cluster analysis --------------------------------------------
    if config.clustering:
        path = secondary_analysis.plot_pca_by_cluster(tenx_analysis,
                                                      pcs=config.components)
        results.add_plot(path, "PCA by Cluster")

        path = secondary_analysis.plot_tsne_by_cluster(tenx_analysis,
                                                       pcs=config.components)
        results.add_plot(path, "TSNE by Cluster")

        path = secondary_analysis.plot_umap_by_cluster(tenx_analysis,
                                                       pcs=config.components)
        results.add_plot(path, "UMAP by Cluster")

        # Cluster-marker figures for each embedding representation.
        for rep, pattern in (("PCA", "figures/expression/*pca*png"),
                             ("TSNE", "figures/expression/*tsne*png"),
                             ("UMAP", "figures/expression/*umap*png")):
            secondary_analysis.plot_cluster_markers(tenx_analysis,
                                                    rep=rep,
                                                    pcs=config.components)
            _add_marker_plots(results, pattern)

        secondary_analysis.plot_cluster_markers(
            tenx_analysis,
            rep="SCVIS",
            pcs=config.components,
            embedding_file=_scvis_embedding_file())
        _add_marker_plots(results, "figures/expression/*scvis_5_50*png")

    # --- Reporting ---------------------------------------------------
    # (The original tested config.report twice; one guard suffices.)
    if config.report:
        workflow.transform(name="{}_markdown".format(prefix),
                           func=exportMD,
                           args=(results, ))
        workflow.transform(name="{}_finalize".format(prefix),
                           func=exportFinalize,
                           args=(results, ))

    workflow = secondary_analysis.get_workflow()
    return workflow
Beispiel #23
0
def create_workflow():
    """
    Generates tasks as Pypeliner workflow based on input arguments.
    The workflow will start from the most raw input provided and override
    any downstream tasks with subsequently provided input arguments.
    Parallelization is performed over provided samplesheets and replication
    within samples.

    Args:
        None

    Yields:
        Pypeliner workflow object.
    """
    bcl_directory = args.get("bcl", None)
    fastq_directory = args.get("fastq", None)
    tenx_analysis = args.get("tenx", None)
    rdata = args.get("rdata", None)

    bcl = BinaryBaseCall(bcl_directory)

    workflow = pypeliner.workflow.Workflow()

    if bcl_directory:
        workflow.transform(
            name="bcl_to_fastq",
            func=CellRanger.mkfastq,
            ret=pypeliner.managed.TempOutputObj("fastq_object"),
            # Fixed: the original passed an undefined name `bcl_object`;
            # the object constructed above is `bcl`.
            args=(bcl, ),
        )

    if bcl_directory is not None or fastq_directory is not None:
        if fastq_directory is not None:
            fastq = FastQDirectory(fastq_directory)
        else:
            # Fall back to the fastqs produced by the bcl_to_fastq step.
            fastq = pypeliner.managed.TempInputObj("fastq_object")
        workflow.transform(
            name="fastq_counts",
            func=CellRanger.count,
            ret=pypeliner.managed.TempOutputObj("tenx_analysis"),
            args=(fastq, ),
        )

    # Choose the tenx analysis source: an explicit path, or the managed
    # object produced upstream; skipped entirely when rdata is supplied.
    tenx = None
    if tenx_analysis is not None and rdata is None:
        tenx = TenxAnalysis(tenx_analysis)
    elif tenx_analysis is None and rdata is None:
        tenx = pypeliner.managed.TempInputObj("tenx_analysis")
    if tenx is not None:
        workflow.transform(
            name="tenx_read10xcounts",
            func=TenX.read10xCounts,
            ret=pypeliner.managed.TempOutputObj("single_cell_experiment"),
            args=(tenx, ),
        )

    if rdata is not None:
        single_cell_experiment = TenxAnalysis.from_rdata(rdata)
    else:
        single_cell_experiment = pypeliner.managed.TempInputObj(
            "single_cell_experiment")

    workflow.transform(
        name="clonealign",
        func=CloneAlign.run,
        ret=pypeliner.managed.TempOutputObj("clone_align_fit"),
        args=(single_cell_experiment, ),
    )

    # Fixed: a stray unterminated triple quote that followed the disabled
    # cellassign/scviz/html_output blocks turned the rest of the module
    # into a string literal (SyntaxError at EOF).  The dead commented-out
    # blocks (tenx_barcoderanks, tenx_emptydrops, cellassign, scviz,
    # html_output) were removed along with it; restore them from history
    # if those steps are revived.

    return workflow
if __name__ == "__main__":

    fileConfig("logging_config.ini")

    logging.info("Starting bam_demultiplexer pipeline...")

    # Command-line options as a plain dict.
    args = vars(parse_args())

    # NOTE(review): the original comment here was just "#??" — the
    # [demultiplex_bam] module list lets pypeliner locate task
    # functions; verify this is the intended usage.
    pyp = pypeliner.app.Pypeline([demultiplex_bam], config=args)

    workflow = pypeliner.workflow.Workflow()

    # demultiplex: split the input bam into per-barcode temp outputs
    # along the 'split' axis.
    workflow.transform(name='demultiplex',
                       func=demultiplex_bam.demultiplex,
                       args=(pypeliner.managed.InputFile(args['bam']),
                             pypeliner.managed.TempOutputFile(
                                 'demux', 'split'), args['barcode_csv']))

    # collate demultiplexed bams, one task per 'split' chunk
    workflow.transform(name='collate',
                       axes=('split', ),
                       func=samtools_wrapper.collate,
                       args=(pypeliner.managed.TempInputFile('demux', 'split'),
                             pypeliner.managed.TempOutputFile(
                                 'collated', 'split')))

    # convert collated bams to fastq
    # NOTE(review): the transform below is garbled — its `func=` and the
    # head of its `args=` tuple are missing, and the surviving lines
    # reference a 'tool_name' axis belonging to the destruct benchmark
    # code that follows; this section does not parse as written and must
    # be reconstructed from the original source.
    workflow.transform(
        name='fastq',
        axes=('split', ),
                'tool_name'),
            mgd.TempSpace('tool_raw_data', 'tool_name', cleanup='none'),
        ),
    )

    # Benchmark ROC plot per tool (appears to come from a different
    # script than the demultiplexer above).
    workflow.transform(
        name='plot',
        axes=('tool_name', ),
        func=destruct.benchmark.destruct_test.create_roc_plot,
        args=(
            mgd.TempInputObj('simulation.params'),
            mgd.TempInputObj('tool_defs', 'tool_name'),
            mgd.InputFile(os.path.join(args['results_dir'], 'genome.fasta')),
            mgd.InputFile(os.path.join(args['results_dir'], 'simulated.tsv')),
            mgd.InputFile(
                os.path.join(args['results_dir'], 'results_{tool_name}.tsv'),
                'tool_name'),
            mgd.OutputFile(
                os.path.join(args['results_dir'], 'annotated_{tool_name}.tsv'),
                'tool_name'),
            mgd.OutputFile(
                os.path.join(args['results_dir'],
                             'identified_{tool_name}.tsv'), 'tool_name'),
            mgd.OutputFile(
                os.path.join(args['results_dir'], 'plots_{tool_name}.pdf'),
                'tool_name'),
        ),
    )

    pyp.run(workflow)
Beispiel #26
0
def create_destruct_fastq_workflow(
    fastq1_filenames,
    fastq2_filenames,
    sample1_filenames,
    sample2_filenames,
    stats_filenames,
    breakpoint_table,
    breakpoint_library_table,
    breakpoint_read_table,
    config,
    ref_data_dir,
    raw_data_dir=None,
):
    """Build the destruct breakpoint-calling workflow starting from fastqs.

    Stages, in dependency order:
      1. Per-library read statistics from precomputed stats files.
      2. Seed-align a read sample with bowtie piped into destruct_aligntrue,
         then derive alignment score statistics.
      3. Split the full fastqs, seed-align each chunk, and realign with
         destruct_realign2 to produce spanning and split alignments.
      4. Merge and filter alignments, then cluster mate pairs per chromosome.
      5. Predict breakpoints from split reads, realign reads to breakpoints,
         and compute realignment likelihoods.
      6. Resolve multi-mapped reads via set cover and select the
         maximum-likelihood predictions.
      7. Tabulate supporting reads and final result tables.

    Args:
        fastq1_filenames, fastq2_filenames: per-library fastq paths, keyed by
            library (library ids are derived from ``fastq1_filenames.keys()``
            via ``destruct.tasks.create_library_ids``).
        sample1_filenames, sample2_filenames: per-library sampled-read fastqs.
        stats_filenames: per-library read statistics files.
        breakpoint_table, breakpoint_library_table, breakpoint_read_table:
            output table filenames.
        config: dict of parameters (genome_fasta, score/threshold settings,
            chromosomes, reads_per_split, ...).
        ref_data_dir, raw_data_dir: accepted but not referenced in this
            function body.

    Returns:
        The populated ``pypeliner.workflow.Workflow``.

    NOTE(review): ``lowmem``/``medmem``/``himem`` context dicts are defined
    elsewhere in this module.
    """
    workflow = pypeliner.workflow.Workflow()

    # Set the library ids

    workflow.setobj(
        obj=mgd.TempOutputObj('library_id', 'bylibrary'),
        value=destruct.tasks.create_library_ids(fastq1_filenames.keys()),
    )

    # Per-library fragment length statistics, returned as a managed temp obj
    # and consumed below via .prop('fragment_length_min'/'max'/...).
    workflow.transform(
        name='readstats',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.read_stats',
        ret=mgd.TempOutputObj('stats', 'bylibrary'),
        args=(
            mgd.InputFile('stats.txt', 'bylibrary', fnames=stats_filenames),
            config['fragment_length_num_stddevs'],
        ),
    )

    # Align a sample of reads and calculate alignment statistics

    workflow.transform(
        name='prepseed_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.InputFile('sample1.fq.gz',
                          'bylibrary',
                          fnames=sample1_filenames),
            mgd.InputFile('sample2.fq.gz',
                          'bylibrary',
                          fnames=sample2_filenames),
            36,
            mgd.TempOutputFile('sample.seed', 'bylibrary'),
        ),
    )

    # bowtie seed alignments are piped ('|') directly into destruct_aligntrue.
    workflow.commandline(
        name='bwtrealign_sample',
        axes=('bylibrary', ),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('sample.seed', 'bylibrary'),
            '--chunkmbs',
            '512',
            '-k',
            '1000',
            '-m',
            '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_aligntrue',
            '-a',
            '-',
            '-1',
            mgd.InputFile('sample1.fq.gz',
                          'bylibrary',
                          fnames=sample1_filenames),
            '-2',
            mgd.InputFile('sample2.fq.gz',
                          'bylibrary',
                          fnames=sample2_filenames),
            '-r',
            config['genome_fasta'],
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmin',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '-s',
            mgd.TempOutputFile('samples.align.true', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='scorestats',
        axes=('bylibrary', ),
        ctx=medmem,
        func='destruct.score_stats.create_score_stats',
        args=(
            mgd.TempInputFile('samples.align.true', 'bylibrary'),
            config['match_score'],
            mgd.TempOutputFile('score.stats', 'bylibrary'),
        ),
    )

    # Split discordant fastqs and align

    workflow.transform(
        name='splitfastq1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads1.fq.gz', 'bylibrary',
                          fnames=fastq1_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads1', 'bylibrary', 'byread'),
        ),
    )

    # axes_origin=[] so the 'byread' axis is defined once by splitfastq1
    # rather than re-created here.
    workflow.transform(
        name='splitfastq2',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.split_fastq',
        args=(
            mgd.InputFile('reads2.fq.gz', 'bylibrary',
                          fnames=fastq2_filenames),
            int(config['reads_per_split']),
            mgd.TempOutputFile('reads2', 'bylibrary', 'byread',
                               axes_origin=[]),
        ),
    )

    workflow.transform(
        name='prepseed',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.tasks.prepare_seed_fastq',
        args=(
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            36,
            mgd.TempOutputFile('reads.seed', 'bylibrary', 'byread'),
        ),
    )

    # Full realignment per read chunk: bowtie piped into destruct_realign2,
    # producing spanning and split alignment files.
    workflow.commandline(
        name='bwtrealign',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'bowtie',
            config['genome_fasta'],
            mgd.TempInputFile('reads.seed', 'bylibrary', 'byread'),
            '--chunkmbs',
            '512',
            '-k',
            '1000',
            '-m',
            '1000',
            '--strata',
            '--best',
            '-S',
            '|',
            'destruct_realign2',
            '-l',
            mgd.TempInputObj('library_id', 'bylibrary'),
            '-a',
            '-',
            '-1',
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2',
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '-r',
            config['genome_fasta'],
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmin',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_min'),
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--tchimer',
            config['chimeric_threshold'],
            '--talign',
            config['alignment_threshold'],
            '--pchimer',
            config['chimeric_prior'],
            '--tvalid',
            config['readvalid_threshold'],
            '-z',
            mgd.TempInputFile('score.stats', 'bylibrary'),
            '--span',
            mgd.TempOutputFile('spanning.alignments', 'bylibrary', 'byread'),
            '--split',
            mgd.TempOutputFile('split.alignments', 'bylibrary', 'byread'),
        ),
    )

    workflow.transform(
        name='merge_spanning_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('spanning.alignments_1', 'bylibrary'),
        ),
    )

    # Filter merged spanning alignments against satellite regions
    # (destruct_filterreads); output replaces the per-library temp file.
    workflow.commandline(
        name='filterreads',
        axes=('bylibrary', ),
        ctx=lowmem,
        args=(
            'destruct_filterreads',
            '-n',
            '2',
            '-a',
            mgd.TempInputFile('spanning.alignments_1', 'bylibrary'),
            '-r',
            config['satellite_regions'],
            '>',
            mgd.TempOutputFile('spanning.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_files_by_line',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary', 'byread'),
            mgd.TempOutputFile('split.alignments', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_spanning_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('spanning.alignments', 'bylibrary'),
            mgd.TempOutputFile('spanning.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    workflow.transform(
        name='merge_split_2',
        ctx=lowmem,
        func='destruct.tasks.merge_alignment_files',
        args=(
            mgd.TempInputFile('split.alignments', 'bylibrary'),
            mgd.TempOutputFile('split.alignments'),
            mgd.TempInputObj('library_id', 'bylibrary'),
        ),
    )

    # Cluster spanning reads

    workflow.setobj(
        obj=mgd.TempOutputObj('chrom.args', 'bychromarg'),
        value=destruct.tasks.generate_chromosome_args(config['chromosomes']),
    )

    workflow.transform(
        name='write_stats_table',
        ctx=lowmem,
        func='destruct.tasks.write_stats_table',
        args=(
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.TempInputObj('stats', 'bylibrary'),
            mgd.TempOutputFile('libstats.tsv'),
        ),
    )

    workflow.commandline(
        name='cluster',
        axes=('bychromarg', ),
        ctx=medmem,
        args=(
            'destruct_mclustermatepairs',
            '-a',
            mgd.TempInputFile('spanning.alignments'),
            '-s',
            mgd.TempInputFile('libstats.tsv'),
            '-c',
            mgd.TempOutputFile('clusters', 'bychromarg'),
            mgd.TempInputObj('chrom.args', 'bychromarg'),
            '--clustmin',
            config['cluster_readcount_threshold'],
            '--fragmax',
            config['fragment_length_max'],
        ),
    )

    # Predict breakpoints from split reads

    workflow.transform(
        name='predict_breaks',
        axes=('bychromarg', ),
        ctx=medmem,
        func='destruct.predict_breaks.predict_breaks',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('spanning.alignments'),
            mgd.TempInputFile('split.alignments'),
            mgd.TempOutputFile('breakpoints_2', 'bychromarg'),
        ),
    )

    workflow.transform(
        name='merge_clusters',
        ctx=lowmem,
        func='destruct.tasks.merge_clusters',
        args=(
            mgd.TempInputFile('clusters', 'bychromarg'),
            mgd.TempInputFile('breakpoints_2', 'bychromarg'),
            mgd.TempOutputFile('clusters'),
            mgd.TempOutputFile('breakpoints_2'),
            mgd.TempOutputFile('merge_clusters.debug'),
        ),
    )

    # Realign reads to breakpoints

    workflow.commandline(
        name='realigntobreaks',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        args=(
            'destruct_realigntobreaks2',
            '-r',
            config['genome_fasta'],
            '-b',
            mgd.TempInputFile('breakpoints_2'),
            '-c',
            mgd.TempInputFile('clusters'),
            '-g',
            config['gap_score'],
            '-x',
            config['mismatch_score'],
            '-m',
            config['match_score'],
            '--flmax',
            mgd.TempInputObj('stats', 'bylibrary').prop('fragment_length_max'),
            '--span',
            mgd.TempInputFile('spanning.alignments', 'bylibrary', 'byread'),
            '-1',
            mgd.TempInputFile('reads1', 'bylibrary', 'byread'),
            '-2',
            mgd.TempInputFile('reads2', 'bylibrary', 'byread'),
            '--realignments',
            mgd.TempOutputFile('realignments', 'bylibrary', 'byread'),
        ),
    )

    # Calculate likelihoods based on realignments

    workflow.transform(
        name='calculate_realignment_likelihoods',
        axes=('bylibrary', 'byread'),
        ctx=medmem,
        func='destruct.predict_breaks.calculate_realignment_likelihoods',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempInputFile('realignments', 'bylibrary', 'byread'),
            mgd.TempInputFile('score.stats', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary', 'byread'),
            config['match_score'],
            mgd.TempInputObj('stats',
                             'bylibrary').prop('fragment_length_mean'),
            mgd.TempInputObj('stats',
                             'bylibrary').prop('fragment_length_stddev'),
        ),
    )

    workflow.transform(
        name='merge_likelihoods_1',
        axes=('bylibrary', ),
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary', 'byread'),
            mgd.TempOutputFile('likelihoods_2', 'bylibrary'),
            mgd.TempSpace('merge_likelihoods_1_temp', 'bylibrary'),
            '1',
        ),
    )

    workflow.transform(
        name='merge_likelihoods_2',
        ctx=lowmem,
        func='destruct.tasks.merge_sorted_files_by_line',
        args=(
            mgd.TempInputFile('likelihoods_2', 'bylibrary'),
            mgd.TempOutputFile('likelihoods_2'),
            mgd.TempSpace('merge_likelihoods_2_temp'),
            '1',
        ),
    )

    # Set cover for multi mapping reads

    workflow.transform(
        name='calc_weights',
        ctx=medmem,
        func='destruct.predict_breaks.calculate_cluster_weights',
        args=(
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('cluster_weights'),
        ),
    )

    workflow.commandline(
        name='setcover',
        ctx=medmem,
        args=(
            'destruct_setcover',
            '-c',
            mgd.TempInputFile('clusters'),
            '-w',
            mgd.TempInputFile('cluster_weights'),
            '-a',
            mgd.TempOutputFile('clusters_setcover'),
        ),
    )

    # Select cluster based on setcover

    workflow.transform(
        name='select_clusters',
        ctx=medmem,
        func='destruct.predict_breaks.select_clusters',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('breakpoints_2'),
            mgd.TempOutputFile('breakpoints_1'),
            mgd.TempInputFile('likelihoods_2'),
            mgd.TempOutputFile('likelihoods_1'),
        ),
    )

    # Select prediction based on max likelihood

    workflow.transform(
        name='select_predictions',
        ctx=himem,
        func='destruct.predict_breaks.select_predictions',
        args=(
            mgd.TempInputFile('breakpoints_1'),
            mgd.TempOutputFile('breakpoints'),
            mgd.TempInputFile('likelihoods_1'),
            mgd.TempOutputFile('likelihoods'),
            config['mate_score_threshold'],
            config['template_length_min_threshold'],
            config['min_alignment_log_likelihood'],
        ),
    )

    # Optionally tabulate supporting reads

    workflow.transform(
        name='tabreads',
        ctx=medmem,
        func='destruct.tasks.tabulate_reads',
        args=(
            mgd.TempInputFile('clusters_setcover'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            mgd.InputFile('reads1.fq.gz', 'bylibrary',
                          fnames=fastq1_filenames),
            mgd.InputFile('reads2.fq.gz', 'bylibrary',
                          fnames=fastq2_filenames),
            mgd.TempOutputFile('breakreads.table.unsorted'),
        ),
    )

    workflow.commandline(
        name='sortreads',
        ctx=medmem,
        args=(
            'sort',
            '-n',
            mgd.TempInputFile('breakreads.table.unsorted'),
            '>',
            mgd.OutputFile(breakpoint_read_table),
        ),
    )

    # Tabulate results

    workflow.transform(
        name='tabulate',
        ctx=himem,
        func='destruct.tasks.tabulate_results',
        args=(
            mgd.TempInputFile('breakpoints'),
            mgd.TempInputFile('likelihoods'),
            mgd.TempInputObj('library_id', 'bylibrary'),
            config['genome_fasta'],
            config['gtf_filename'],
            config['dgv_filename'],
            mgd.OutputFile(breakpoint_table),
            mgd.OutputFile(breakpoint_library_table),
        ),
    )

    return workflow
    workflow.setobj(mgd.TempOutputObj('simulation.params'),
                    sim_config['simulation'])

    if sim_config['reference'].get('chromosomes', None) is not None:
        genome_fasta = os.path.join(args['results_dir'], 'genome.fa')
        source_bam = os.path.join(args['results_dir'], 'source.bam')

        workflow.setobj(mgd.TempOutputObj('chromosomes'),
                        sim_config['reference']['chromosomes'])

        workflow.transform(
            name='create_ref_bam',
            func=destruct.benchmark.destruct_test.create_ref_bam,
            args=(
                mgd.InputFile(args['ref']),
                mgd.InputFile(args['bam']),
                mgd.OutputFile(genome_fasta),
                mgd.OutputFile(source_bam),
                mgd.TempInputObj('chromosomes'),
            ),
        )

    else:
        genome_fasta = args['ref']
        source_bam = args['bam']

    workflow.transform(
        name='create_sim',
        func=destruct.benchmark.create_breakpoint_simulation.
        create_breakpoints,
        args=(
Beispiel #28
0
def create_resample_simulation_workflow(
    sim_defs,
    mixture_filename,
    source_filename,
    normal_filename,
    tumour_filename,
    breakpoint_filename,
    config,
    ref_data_dir,
):
    """Build a workflow that resamples normal and tumour read data from a
    source dataset according to a simulated mixture, and writes the
    mixture's breakpoints.

    Returns the populated pypeliner workflow (default ctx: 4 GB mem).
    """
    wf = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    # Publish the simulation definitions as a managed temp object.
    wf.setobj(obj=mgd.TempOutputObj('sim_defs'), value=sim_defs)

    # Simulate germline alleles shared by the normal and tumour resampling.
    wf.transform(
        name='simulate_germline_alleles',
        ctx={'mem': 8},
        func=remixt.simulations.pipeline.simulate_germline_alleles,
        args=(mgd.TempOutputFile('germline_alleles'),
              mgd.TempInputObj('sim_defs'), config, ref_data_dir),
    )

    # Resample normal read data from the source dataset.
    wf.transform(
        name='resample_normal_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_normal_data,
        args=(mgd.OutputFile(normal_filename),
              mgd.InputFile(source_filename),
              mgd.InputFile(mixture_filename),
              mgd.TempInputFile('germline_alleles'),
              mgd.TempInputObj('sim_defs')),
    )

    # Resample tumour read data using the same alleles and mixture.
    wf.transform(
        name='resample_tumour_data',
        ctx={'mem': 128},
        func=remixt.simulations.pipeline.resample_tumour_data,
        args=(mgd.OutputFile(tumour_filename),
              mgd.InputFile(source_filename),
              mgd.InputFile(mixture_filename),
              mgd.TempInputFile('germline_alleles'),
              mgd.TempInputObj('sim_defs')),
    )

    # Emit the breakpoints implied by the mixture.
    wf.transform(
        name='write_breakpoints',
        func=remixt.simulations.pipeline.write_breakpoints,
        args=(mgd.OutputFile(breakpoint_filename),
              mgd.InputFile(mixture_filename)),
    )

    return wf
Beispiel #29
0
def generate_bam(
    simulation_params,
    chromosomes,
    include_nonchromosomal,
    simulated_bam_filename,
    genome_fasta_filename,
    simulated_table_filename,
    raw_data_dir,
):
    """Build a workflow that creates a reference genome, simulates
    breakpoints and read pairs from it, aligns the reads with bwa, and
    produces a sorted, indexed bam.

    Returns the populated pypeliner workflow (default ctx: 4 GB mem).
    """
    # Intermediate raw-data paths written alongside the simulation.
    sim_fasta = os.path.join(raw_data_dir, 'simulated.fasta')
    fastq_1 = os.path.join(raw_data_dir, 'simulated.1.fastq')
    fastq_2 = os.path.join(raw_data_dir, 'simulated.2.fastq')

    wf = pypeliner.workflow.Workflow(default_ctx={'mem': 4})

    # Publish the inputs as managed temp objects.
    wf.setobj(mgd.TempOutputObj('simulation.params'), simulation_params)
    wf.setobj(mgd.TempOutputObj('chromosomes'), chromosomes)
    wf.setobj(mgd.TempOutputObj('include_nonchromosomal'),
              include_nonchromosomal)

    # Build the reference fasta from the selected chromosomes.
    wf.transform(
        name='create_genome',
        func=destruct.benchmark.destruct_test.create_genome,
        args=(mgd.TempInputObj('chromosomes'),
              mgd.TempInputObj('include_nonchromosomal'),
              mgd.OutputFile(genome_fasta_filename)),
    )

    # Simulate breakpoints plus concordant/discordant read pairs.
    wf.transform(
        name='create_sim',
        func=destruct.benchmark.create_breakpoint_simulation.create,
        args=(mgd.TempInputObj('simulation.params'),
              mgd.InputFile(genome_fasta_filename),
              mgd.OutputFile(sim_fasta),
              mgd.OutputFile(simulated_table_filename),
              mgd.TempOutputFile('concordant.1.fastq'),
              mgd.TempOutputFile('concordant.2.fastq'),
              mgd.TempOutputFile('discordant.1.fastq'),
              mgd.TempOutputFile('discordant.2.fastq')),
    )

    # Concatenate concordant and discordant reads for each end.
    wf.commandline(
        name='cat1',
        args=('cat',
              mgd.TempInputFile('concordant.1.fastq'),
              mgd.TempInputFile('discordant.1.fastq'),
              '>',
              mgd.OutputFile(fastq_1)),
    )
    wf.commandline(
        name='cat2',
        args=('cat',
              mgd.TempInputFile('concordant.2.fastq'),
              mgd.TempInputFile('discordant.2.fastq'),
              '>',
              mgd.OutputFile(fastq_2)),
    )

    # Align the simulated reads with bwa, then sort and index the bam.
    wf.subworkflow(
        name='bwa_align',
        func=destruct.benchmark.align.bwa.workflow.bwa_align_workflow,
        args=(mgd.InputFile(genome_fasta_filename),
              mgd.InputFile(fastq_1),
              mgd.InputFile(fastq_2),
              mgd.TempOutputFile('simulated.unsorted.bam')),
    )
    wf.transform(
        name='samtools_sort_index',
        func=destruct.benchmark.destruct_test.samtools_sort_index,
        args=(mgd.TempInputFile('simulated.unsorted.bam'),
              mgd.OutputFile(simulated_bam_filename)),
    )

    return wf
Beispiel #30
0
def RunCloneAlignWorkflow(workflow):
    """Assemble the clonealign pipeline on the supplied pypeliner workflow.

    Reads the sample list from ``config.samples`` (one sample id per line,
    blank lines ignored), then wires per-sample stages:
    download -> extract -> cellassign -> mode copy number -> clonealign
    inputs -> clonealign. With more than one sample the per-sample results
    are additionally converted to seurat objects, QC'd, visualized and
    integrated before figure generation; with a single sample only the
    per-sample figures are produced.

    Args:
        workflow: a pypeliner workflow to register transforms on.

    Returns:
        The same workflow object with all transforms registered.
    """
    print("Creating workflow.")
    # Fix: the original called open() without closing the handle; use a
    # context manager so the file is always released.
    with open(config.samples, "r") as samples_file:
        all_samples = [
            sample.strip() for sample in samples_file.read().splitlines()
            if sample.strip() != ""
        ]
    # Fetch each sample's data; fans out over the 'sample' axis.
    workflow.transform(name="download_collection",
                       func=RunDownload,
                       args=(all_samples,
                             pypeliner.managed.TempOutputFile(
                                 "sample_path.json", "sample")))
    workflow.transform(
        name="extract_rdata",
        func=RunExtract,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("sample_path.json", "sample"),
            pypeliner.managed.TempOutputFile("sample.rdata", "sample"),
        ))
    # Assign cell types per sample.
    workflow.transform(
        name="run_cellassign",
        func=RunCellAssign,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("sample.rdata", "sample"),
            pypeliner.managed.TempOutputFile("cell_annotated.rdata", "sample"),
        ))
    workflow.transform(name="run_modecn",
                       func=RunModeCopyNumber,
                       axes=('sample', ),
                       args=(pypeliner.managed.TempOutputFile(
                           "copy_number_data.csv", "sample"), ))
    # Combine cell annotations and copy number into clonealign inputs.
    workflow.transform(
        name="run_clonealigninputs",
        func=RunCloneAlignInput,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("cell_annotated.rdata", "sample"),
            pypeliner.managed.TempInputFile("copy_number_data.csv", "sample"),
            pypeliner.managed.TempOutputFile("clone.rdata", "sample"),
            pypeliner.managed.TempOutputFile("cnv.rdata", "sample"),
            pypeliner.managed.TempOutputFile("rawcnv.rdata", "sample"),
        ))
    workflow.transform(
        name="run_clonealign",
        func=RunCloneAlign,
        axes=('sample', ),
        args=(
            pypeliner.managed.TempInputFile("clone.rdata", "sample"),
            pypeliner.managed.TempInputFile("cnv.rdata", "sample"),
            pypeliner.managed.TempOutputFile("clone_annotated.rdata",
                                             "sample"),
            pypeliner.managed.TempOutputFile("cal.rdata", "sample"),
        ))

    if len(all_samples) > 1:
        # Multiple samples: convert, QC and visualize each, then integrate
        # across samples before producing the combined figures.
        workflow.transform(
            name="run_convert",
            func=RunConvert,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("clone_annotated.rdata",
                                                "sample"),
                pypeliner.managed.TempInputFile("cell_annotated.rdata",
                                                "sample"),
                pypeliner.managed.TempOutputFile("seurat.rdata", "sample"),
            ))

        workflow.transform(
            name="run_qc",
            func=RunSeuratWorkflow,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("seurat.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat_qcd.rdata", "sample"),
                pypeliner.managed.TempOutputFile("sce_qcd.rdata", "sample"),
            ))

        workflow.transform(
            name="visualize_sample",
            func=RunSeuratViz,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("seurat_qcd.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat_umap.png", "sample"),
                pypeliner.managed.TempOutputFile("seurat_umap_celltype.png",
                                                 "sample"),
                pypeliner.managed.TempOutputFile("seurat_umap_clone.png",
                                                 "sample"),
            ))

        # Integration collapses the 'sample' axis into single outputs.
        workflow.transform(
            name="integrate",
            func=RunIntegration,
            args=(
                pypeliner.managed.TempInputFile("seurat_qcd.rdata", "sample"),
                pypeliner.managed.TempOutputFile("seurat_integrated.rdata"),
                pypeliner.managed.TempOutputFile("sce_integrated.rdata"),
            ))

        workflow.transform(
            name="run_figures",
            func=RunFigures,
            args=(
                pypeliner.managed.TempInputFile("sce_integrated.rdata"),
                pypeliner.managed.TempOutputFile("umap_cell.png"),
                pypeliner.managed.TempOutputFile("umap_clone.png"),
                pypeliner.managed.TempOutputFile("umap_sample.png"),
            ))
    else:
        # Single sample: skip integration and generate figures directly
        # from the clone-annotated data.
        workflow.transform(
            name="run_figures_single_sample",
            func=RunFigures,
            axes=('sample', ),
            args=(
                pypeliner.managed.TempInputFile("clone_annotated.rdata",
                                                "sample"),
                pypeliner.managed.TempOutputFile("umap_cell.png", "sample"),
                pypeliner.managed.TempOutputFile("umap_clone.png", "sample"),
                pypeliner.managed.TempOutputFile("umap_sample.png", "sample"),
            ))
    return workflow