def quick(ifold):
    # sorting bam file pipeline
    pipeline = ruffus.Pipeline('BamDNaseSeq')

    bam_file = '*.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=os.path.join(ifold, bam_file),
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')

    ## bam to bed using bam2bed
    sorted_bam_file = '*.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=os.path.join(ifold, sorted_bam_file),
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])
    full_pipe.run()
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.initialize(argv,
                                 config_file="template.yml",
                                 defaults={
                                     "min_value": 0.0,
                                     "num_samples": 1000,
                                     "mu": 0.0,
                                     "sigma": 1.0})

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")

    # primary targets
    pipeline.merge(
        task_func=P.EmptyRunner("all"),
        input=task_combine_means,
        output="all")

    E.debug("starting workflow")

    return P.run_workflow(options, args)
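# Hedged usage sketch, not part of the original snippet: a cgatcore-style
# main() like the one above is typically wired up as the module entry point.
if __name__ == "__main__":
    sys.exit(main())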
def build_pipeline():
    pipeline = ruffus.Pipeline(name="test1")

    start_tasks = pipeline.originate(
        task_func=PassThroughRunner(name="dummy_f1", f=lambda x: None),
        output=["a.1", "b.1"])

    pipeline.merge(
        task_func=EmptyRunner(name="all"),
        input=start_tasks,
        output="all")

    yield pipeline
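# A minimal sketch of how the generator above could be consumed, for example
# as a pytest-style fixture yielding one freshly built pipeline per test.
# Assumptions: pytest is available; the fixture and test names are hypothetical.
import pytest
import ruffus


@pytest.fixture
def pipeline():
    yield from build_pipeline()


def test_build_pipeline_yields_a_pipeline(pipeline):
    assert isinstance(pipeline, ruffus.Pipeline)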
def build_pipeline(self, pipeline_name, **kwargs):
    # fudge: clear all previous pipelines
    ruffus.Pipeline.clear_all()

    pipeline = ruffus.Pipeline(pipeline_name)

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")

    task_run_local_job1 = pipeline.transform(
        task_func=run_local_job1,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".local1")

    # test jobs_limit with local running
    task_run_local_job2 = pipeline.transform(
        task_func=run_local_job2,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".local2").jobs_limit(NUM_CORES // 2)

    # multiprocessing and DRMAA do not work at the moment; the likely
    # cause is the shared session object.
    if not HAVE_DRMAA or (kwargs.get("multiprocess", 1) > 1):
        return

    task_run_remote_job1 = pipeline.transform(
        task_func=run_remote_job1,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".remote1")

    # test jobs_limit with remote running
    task_run_remote_job2 = pipeline.transform(
        task_func=run_remote_job2,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".remote2").jobs_limit(NUM_CORES // 2)
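# Hedged sketch of the module-level names the method above relies on; the
# exact detection logic used in the original module is an assumption here.
import multiprocessing

NUM_CORES = multiprocessing.cpu_count()

try:
    import drmaa  # noqa: F401
    HAVE_DRMAA = True
except (ImportError, RuntimeError):
    # drmaa-python raises RuntimeError when no DRMAA library can be loaded
    HAVE_DRMAA = False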
def build_pipeline(self, pipeline_name):
    # fudge: clear all previous pipelines
    ruffus.Pipeline.clear_all()

    pipeline = ruffus.Pipeline(pipeline_name)

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")
def mappipe(ifold, ref_file, minlen=20, rclip=0):
    ifold = os.path.join(ifold, '')
    ifile = '*.fastq.gz'
    # ref_file = '/data/index/HG19.fasta'
    trim_regex = r'(.*)\/(SRR.+).fastq.gz$'
    pipeline = ruffus.Pipeline('FastqDNaseSeq')

    ## adapter/quality trimming with Trim Galore
    trim_task = pipeline.collate(tasks.trimmer,
                                 name='TrimGalore',
                                 input=ifold + ifile,
                                 filter=ruffus.regex(trim_regex),
                                 output=r'\1/\2_trimmed.fq.gz',
                                 # extras[0]: minimum length,
                                 # [1]: right end clip size
                                 extras=[[minlen, rclip]])

    ## align trimmed reads with bwa aln
    trfile = '*_trimmed.fq.gz'
    aln_regex = r'(.*)\/(.*).fq.gz$'
    align_task = pipeline.collate(tasks.bwa_aln,
                                  name='bwa_aln',
                                  input=ifold + trfile,
                                  filter=ruffus.regex(aln_regex),
                                  output=r'\1/\2.sai',
                                  extras=[ref_file])
    align_task.follows('TrimGalore')

    ## sai to sam file using bwa samse
    sai_file = '*.sai'
    samse_regex = r'(.*)\/(.*).sai$'
    samse_task = pipeline.collate(tasks.bwa_samse,
                                  name='bwa_samse',
                                  input=ifold + sai_file,
                                  filter=ruffus.regex(samse_regex),
                                  output=r'\1/\2.sam',
                                  # extras[0]: fastq required for samse,
                                  # [1]: ref indexed fasta,
                                  # [2]: max multiple mapped reads [Default=3]
                                  extras=[[r'\1/\2.fq.gz', ref_file, 10]])
    samse_task.follows('bwa_aln')

    ## sam to bam using sambamba view
    sam_file = '*.sam'
    tobam_regex = r'(.*)\/(.*).sam$'
    tobam_task = pipeline.collate(tasks.sam_to_bam,
                                  name='sam_bam',
                                  input=ifold + sam_file,
                                  filter=ruffus.regex(tobam_regex),
                                  output=r'\1/\2.bam')
    tobam_task.follows('bwa_samse')

    ## sorting bam with sambamba sort
    bam_file = '*trimmed.bam'
    sort_bam_regex = r'(.*)\/(.*).bam$'
    sort_bam_task = pipeline.collate(tasks.sort_bam,
                                     name='sorting_bam',
                                     input=ifold + bam_file,
                                     filter=ruffus.regex(sort_bam_regex),
                                     output=r'\1/\2.sorted.bam')
    sort_bam_task.follows('sam_bam')

    ## bam to bed using bam2bed
    sorted_bam_file = '*trimmed.sorted.bam'
    sorted_bam_regex = r'(.*)\/(.*).sorted.bam$'
    sorted_bam_task = pipeline.collate(tasks.bam2bed,
                                       name='bam2bed',
                                       input=ifold + sorted_bam_file,
                                       filter=ruffus.regex(sorted_bam_regex),
                                       output=r'\1/\2.sorted.bed')
    sorted_bam_task.follows('sorting_bam')

    full_pipe = ruffus.Pipeline('Full pipeline', input=['bam2bed'])
    full_pipe.run()
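# Hedged sketch, not the original tasks module: a ruffus collate task such as
# tasks.sort_bam receives the grouped input file(s) plus the output file name.
# The sambamba command line below is an illustrative assumption only.
import subprocess

def sort_bam(infiles, outfile):
    # ruffus may hand over a single path or a tuple of grouped paths
    infile = infiles if isinstance(infiles, str) else infiles[0]
    subprocess.check_call(['sambamba', 'sort', '-o', outfile, infile])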
def main(argv):
    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments is added to PARAMS as
            # 'extras', as they are not implemented in ruffus 2.6.3.
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")

            pipeline = ruffus.Pipeline('benchmark')

            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                         map_tool_to_runner,
                                                         config=P.PARAMS,
                                                         **kwargs)

            E.debug("added tools to workflow")

            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline,
                    config=P.PARAMS,
                    **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.)
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
if "split" in P.PARAMS["setup"]: split_runners = add_splits_to_pipeline( pipeline, map_split_to_runner, tool_runners, P.PARAMS["setup"]["split"], tasks=tool_runners, config=P.PARAMS) if P.PARAMS["setup"].get("only_split", False): tool_runners = [] E.debug("added splitters to workflow ") else: split_runners = [] metric_runners = [] for prefix, r in zip( ["tool", "collate", "split"], [tool_runners, collate_runners, split_runners]): if not r: continue metrics = None if prefix == "collate" and "collate_metrics" in P.PARAMS[ "setup"]: metrics = P.PARAMS["setup"]["collate_metrics"] elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["split_metrics"] elif "metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["metrics"] else: raise KeyError( "configuration file requires a 'setup:metrics' section" ) # Metric execution mm = add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner, r, suffix=suffix, prefix=prefix + "_", config=P.PARAMS, **kwargs) if len(mm) == 0: raise ValueError( "workflow construction error: " "no metric tasks result for metrics {}".format( metrics)) metric_runners.extend(mm) E.debug("added {}_metrics to workflow".format(prefix)) # add plot task if "aggregate" in P.PARAMS["setup"]: aggregate_metrics = add_collations_to_pipeline( pipeline, map_collate_to_runner, P.PARAMS["setup"]["aggregate"], metric_runners, config=P.PARAMS) E.debug("added metric aggregation to workflow") else: aggregate_metrics = [] add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics, P.PARAMS) E.debug("added upload to workflow".format(prefix)) # add export task export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"]) map_export2runner = { "collate": collate_runners, "tools": tool_runners, "split": split_runners } export_runners = [] for e in export: try: export_runners.extend(map_export2runner[e]) except KeyError: raise KeyError("unknown export section: {}".format(e)) add_export_to_pipeline(pipeline, export_runners, suffix=suffix, config=P.PARAMS) E.debug("added export to workflow") add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics) # Collate output files to facilitate analysis if "collation" in P.PARAMS: collators = add_collations_to_pipeline(pipeline, map_collate_to_runner, P.PARAMS["collation"], config=P.PARAMS) E.debug("construction of workflow completed") E.stop()