import os

import ruffus

import tasks  # project-local module providing sort_bam and bam2bed


def quick(ifold):

    # sorting bam file
    pipeline = ruffus.Pipeline('BamDNaseSeq')
    bam_file = '*.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=os.path.join(ifold, bam_file),
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')
    ## bam to bed using bam2bed
    sorted_bam_file = '*.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=os.path.join(
                                           ifold, sorted_bam_file),
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])

    full_pipe.run()
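
# The 'tasks' module used above is project-specific and not shown in this
# excerpt. A minimal sketch of what ruffus hands to a collate task, assuming
# sambamba is on PATH (a hypothetical stand-in, not the original
# tasks.sort_bam):
import subprocess

def sort_bam_sketch(input_files, output_file):
    # collate groups every input matching the regex into a single job, so
    # input_files arrives as a list even when it holds only one BAM
    subprocess.check_call(
        ['sambamba', 'sort', '-o', output_file, input_files[0]])
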
Example #2
import sys

import ruffus

# cgat-core helpers assumed by this template pipeline
from cgatcore import experiment as E
from cgatcore import pipeline as P


def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.initialize(argv,
                                 config_file="template.yml",
                                 defaults={
                                     "min_value": 0.0,
                                     "num_samples": 1000,
                                     "mu": 0.0,
                                     "sigma": 1.0
                                 })

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(task_func=compute_mean,
                                           input=task_create_files,
                                           filter=ruffus.suffix(".txt"),
                                           output=".mean")

    task_combine_means = pipeline.merge(task_func=combine_means,
                                        input=task_compute_mean,
                                        output="means.txt")

    # primary targets
    pipeline.merge(task_func=P.EmptyRunner("all"),
                   input=task_combine_means,
                   output="all")

    E.debug("starting workflow")
    return P.run_workflow(options, args)
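
# The task functions wired up above (create_files, compute_mean,
# combine_means) are defined elsewhere in the template pipeline and read
# their parameters from P.PARAMS. A minimal sketch of the call signatures
# ruffus uses for originate/transform/merge tasks (hypothetical bodies):
import random

def create_files_sketch(output_file):
    # originate: one job per output file, no input
    with open(output_file, "w") as outf:
        outf.write("\n".join(
            str(random.gauss(0.0, 1.0)) for _ in range(100)))

def compute_mean_sketch(input_file, output_file):
    # transform: one input file -> one output file
    with open(input_file) as inf:
        values = [float(line) for line in inf if line.strip()]
    with open(output_file, "w") as outf:
        outf.write("{}\n".format(sum(values) / len(values)))

def combine_means_sketch(input_files, output_file):
    # merge: all upstream outputs -> a single file
    with open(output_file, "w") as outf:
        for filename in input_files:
            with open(filename) as inf:
                outf.write(inf.read().strip() + "\n")
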
def build_pipeline():
    pipeline = ruffus.Pipeline(name="test1")
    start_tasks = pipeline.originate(
        task_func=PassThroughRunner(name="dummy_f1", f=lambda x: None),
        output=["a.1", "b.1"])

    pipeline.merge(task_func=EmptyRunner(name="all"),
                   input=start_tasks,
                   output="all")

    yield pipeline
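
# Usage sketch (hypothetical driver): build_pipeline() is a generator, so a
# test can pull the pipeline out of it, run it, and let any clean-up code
# placed after the yield execute once the generator is exhausted.
def run_test1_pipeline():
    for pipeline in build_pipeline():
        pipeline.run(multiprocess=1, verbose=0)
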
Example #4
    def build_pipeline(self, pipeline_name, **kwargs):
        # fudge: clear all previous pipelines
        ruffus.Pipeline.clear_all()
        pipeline = ruffus.Pipeline(pipeline_name)

        task_create_files = pipeline.originate(
            task_func=create_files,
            output=["sample_{:02}.txt".format(x) for x in range(10)])

        task_compute_mean = pipeline.transform(task_func=compute_mean,
                                               input=task_create_files,
                                               filter=ruffus.suffix(".txt"),
                                               output=".mean")

        task_combine_means = pipeline.merge(task_func=combine_means,
                                            input=task_compute_mean,
                                            output="means.txt")

        task_run_local_job1 = pipeline.transform(task_func=run_local_job1,
                                                 input=task_create_files,
                                                 filter=ruffus.suffix(".txt"),
                                                 output=".local1")

        # test jobs_limit with local running
        task_run_local_job2 = pipeline.transform(
            task_func=run_local_job2,
            input=task_create_files,
            filter=ruffus.suffix(".txt"),
            output=".local2").jobs_limit(NUM_CORES // 2)

        # multiprocessing and DRMAA do not work together at the moment;
        # the likely cause is the shared session object.
        if not HAVE_DRMAA or (kwargs.get("multiprocess", 1) > 1):
            return

        task_run_remote_job1 = pipeline.transform(task_func=run_remote_job1,
                                                  input=task_create_files,
                                                  filter=ruffus.suffix(".txt"),
                                                  output=".remote1")

        # test jobs_limit with remote running
        task_run_remote_job2 = pipeline.transform(
            task_func=run_remote_job2,
            input=task_create_files,
            filter=ruffus.suffix(".txt"),
            output=".remote2").jobs_limit(NUM_CORES // 2)
Example #5
    def build_pipeline(self, pipeline_name):
        # fudge: clear all previous pipelines
        ruffus.Pipeline.clear_all()
        pipeline = ruffus.Pipeline(pipeline_name)

        task_create_files = pipeline.originate(
            task_func=create_files,
            output=["sample_{:02}.txt".format(x) for x in range(10)])

        task_compute_mean = pipeline.transform(
            task_func=compute_mean,
            input=task_create_files,
            filter=ruffus.suffix(".txt"),
            output=".mean")

        task_combine_means = pipeline.merge(
            task_func=combine_means,
            input=task_compute_mean,
            output="means.txt")
def mappipe(ifold, ref_file, minlen=20, rclip=0):

    ifold = os.path.join(ifold, '')  # ensure a trailing path separator
    ifile = '*.fastq.gz'
    #ref_file = '/data/index/HG19.fasta'
    trim_regex = r'(.*)\/(SRR.+).fastq.gz$'
    pipeline = ruffus.Pipeline('FastqDNaseSeq')
    trim_task = pipeline.collate(
        tasks.trimmer,
        name='TrimGalore',
        input=ifold + ifile,
        filter=ruffus.regex(trim_regex),
        output=r'\1/\2_trimmed.fq.gz',
        # extras[0]: minimum length,
        # [1]:right end clip size
        extras=[[minlen, rclip]])
    trfile = '*_trimmed.fq.gz'
    aln_regex = r'(.*)\/(.*).fq.gz$'
    align_task = pipeline.collate(tasks.bwa_aln,
                                  name='bwa_aln',
                                  input=ifold + trfile,
                                  filter=ruffus.regex(aln_regex),
                                  output=r'\1/\2.sai',
                                  extras=[ref_file])
    align_task.follows('TrimGalore')

    ## sai to sam file using bwa samse
    sai_file = '*.sai'
    samse_regex = r'(.*)\/(.*).sai$'
    samse_task = pipeline.collate(
        tasks.bwa_samse,
        name='bwa_samse',
        input=ifold + sai_file,
        filter=ruffus.regex(samse_regex),
        output=r'\1/\2.sam',
        # extras[0]: fastq required for samse,
        # [1]: ref indexed fasta,
        # [2]: max multiple mapped reads [Default=3]
        extras=[[r'\1/\2.fq.gz', ref_file, 10]])
    samse_task.follows('bwa_aln')

    ## sam to bam using sambamba view
    sam_file = '*.sam'
    tobam_regex = r'(.*)\/(.*).sam$'
    tobam_task = pipeline.collate(tasks.sam_to_bam,
                                  name='sam_bam',
                                  input=ifold + sam_file,
                                  filter=ruffus.regex(tobam_regex),
                                  output=r'\1/\2.bam')
    tobam_task.follows('bwa_samse')

    ## sorting bam with sambamba sort
    bam_file = '*trimmed.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=ifold + bam_file,
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')
    sort_bam_task.follows('sam_bam')

    ## bam to bed using bam2bed
    sorted_bam_file = '*trimmed.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=ifold + sorted_bam_file,
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])

    full_pipe.run()
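
# Usage sketch (hypothetical paths): run the FASTQ -> BED workflow over a
# directory of SRR*.fastq.gz files against a BWA-indexed reference.
if __name__ == '__main__':
    mappipe('/path/to/fastq_dir', '/path/to/reference.fasta',
            minlen=20, rclip=0)
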
Example #7
def main(argv):

    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments is added to PARAMS and
            # passed to tasks as 'extras', as this is not implemented in
            # ruffus 2.6.3.
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")
            pipeline = ruffus.Pipeline('benchmark')
            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                         map_tool_to_runner,
                                                         config=P.PARAMS,
                                                         **kwargs)

            E.debug("added tools to workflow ")
            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline, config=P.PARAMS, **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.)
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow ")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
            if "split" in P.PARAMS["setup"]:
                split_runners = add_splits_to_pipeline(
                    pipeline,
                    map_split_to_runner,
                    tool_runners,
                    P.PARAMS["setup"]["split"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_split", False):
                    tool_runners = []
                E.debug("added splitters to workflow ")
            else:
                split_runners = []

            metric_runners = []
            for prefix, r in zip(
                ["tool", "collate", "split"],
                [tool_runners, collate_runners, split_runners]):
                if not r:
                    continue

                metrics = None

                if prefix == "collate" and "collate_metrics" in P.PARAMS[
                        "setup"]:
                    metrics = P.PARAMS["setup"]["collate_metrics"]
                elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["split_metrics"]
                elif "metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["metrics"]
                else:
                    raise KeyError(
                        "configuration file requires a 'setup:metrics' section"
                    )

                # Metric execution
                mm = add_metrics_to_pipeline(pipeline,
                                             metrics,
                                             map_metric_to_runner,
                                             r,
                                             suffix=suffix,
                                             prefix=prefix + "_",
                                             config=P.PARAMS,
                                             **kwargs)

                if len(mm) == 0:
                    raise ValueError(
                        "workflow construction error: "
                        "no metric tasks result for metrics {}".format(
                            metrics))

                metric_runners.extend(mm)
                E.debug("added {}_metrics to workflow".format(prefix))

            # add plot task
            if "aggregate" in P.PARAMS["setup"]:
                aggregate_metrics = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["aggregate"],
                    metric_runners,
                    config=P.PARAMS)

                E.debug("added metric aggregation to workflow")
            else:
                aggregate_metrics = []

            add_upload_to_pipeline(pipeline,
                                   metric_runners + aggregate_metrics,
                                   P.PARAMS)
            E.debug("added upload to workflow".format(prefix))

            # add export task
            export = P.PARAMS["setup"].get("export",
                                           ["tools", "collate", "split"])
            map_export2runner = {
                "collate": collate_runners,
                "tools": tool_runners,
                "split": split_runners
            }

            export_runners = []
            for e in export:
                try:
                    export_runners.extend(map_export2runner[e])
                except KeyError:
                    raise KeyError("unknown export section: {}".format(e))

            add_export_to_pipeline(pipeline,
                                   export_runners,
                                   suffix=suffix,
                                   config=P.PARAMS)

            E.debug("added export to workflow")

            add_all_task_to_pipeline(pipeline,
                                     metric_runners + aggregate_metrics)

            # Collate output files to facilitate analysis
            if "collation" in P.PARAMS:
                collators = add_collations_to_pipeline(pipeline,
                                                       map_collate_to_runner,
                                                       P.PARAMS["collation"],
                                                       config=P.PARAMS)

            E.debug("construction of workflow completed")

    E.stop()
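
# Entry-point sketch (an assumption about how this module is invoked; the
# actual wiring of the surrounding script is not shown in this excerpt):
if __name__ == "__main__":
    sys.exit(main(sys.argv))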