def main(argv):

    def _add_input(parser):
        parser.add_option("--data-dir", default=".")
        parser.add_option("--force", default=False, action="store_true")
        parser.add_option("--min-depth", default=0, type="int")
        parser.add_option("--follow-links", default=False, action="store_true")
        parser.add_option("--limit-metrics", default=0, type="int")
        parser.add_option("--output-filename-metrics")
        parser.add_option("--input-filename-metrics")

    P.initialize(argv, callback=_add_input)
    options = E.get_args()

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options))

    if os.path.exists("results.commit"):
        if not options.force:
            raise ValueError(
                "a results.commit file already exists. Please remove "
                "before uploading.")

    data_dir = os.path.abspath(options.data_dir)

    if options.input_filename_metrics:
        with IOTools.open_file(options.input_filename_metrics) as inf:
            infiles = [x.strip() for x in inf.readlines() if x.strip()]
        if options.limit_metrics:
            infiles = infiles[:options.limit_metrics]
    else:
        E.info(f"collecting files to upload starting in {data_dir}")
        infiles = []
        for root, dirs, files in os.walk(data_dir,
                                         followlinks=options.follow_links):
            E.debug(f"working on {root}: dirs={len(dirs)}, files={len(files)}")
            # ignore first level (tools) (needs better check)
            depth = root[len(data_dir):].count(os.sep)
            if "benchmark.info" in files:
                if depth <= options.min_depth:
                    E.info(f"skipping - depth not high enough: {depth}")
                else:
                    infiles.append(os.path.join(root, "benchmark.info"))

            if options.limit_metrics and len(infiles) > options.limit_metrics:
                E.info(f"stopping collection as limit ({len(infiles)} files) reached")
                break

    E.info(f"found {len(infiles)} potential benchmark.info files to upload")

    if options.output_filename_metrics:
        with IOTools.open_file(options.output_filename_metrics, "w") as outf:
            outf.write("\n".join(infiles) + "\n")

    # upload from within the data directory so that paths are relative
    oldwd = os.getcwd()
    os.chdir(data_dir)
    upload_result(infiles, "results.commit", PARAMS)
    os.chdir(oldwd)

    E.stop()
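# Example usage (illustrative only; the script name and paths are
# hypothetical): a first run records the collected benchmark.info files,
# and a later run re-uses that list with a cap on the number of metrics.
#
#   python upload.py --data-dir /data/results \
#       --output-filename-metrics metrics.list
#   python upload.py --input-filename-metrics metrics.list \
#       --limit-metrics 10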
def test_all_cluster_parameters_can_be_set(grid_run_patch, option, field, value):
    P.initialize(argv=["mytool", "{}={}".format(option, value)])
    with patch("cgatcore.pipeline.execution.will_run_on_cluster",
               return_value=True):
        # fails with NameError if drmaa is not configured
        # and "import drmaa" has failed
        with pytest.raises(NameError):
            P.run("echo here")
        grid_run_patch.assert_called_once()
        options = get_options(grid_run_patch)
        assert options[field] == value
def test_default_queue_can_be_overridden(grid_run_patch):
    P.initialize(argv=["mytool", "--cluster-queue=test.q"])
    with patch("cgatcore.pipeline.execution.will_run_on_cluster",
               return_value=True):
        # fails with NameError if drmaa is not configured
        # and "import drmaa" has failed
        with pytest.raises(NameError):
            P.run("echo here")
        grid_run_patch.assert_called_once()
        options = get_options(grid_run_patch)
        assert options["queue"] == "test.q"
        assert options["queue_manager"] == "sge"
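# A minimal sketch of the get_options helper the two tests above rely
# on; this implementation is an assumption, not taken from the source.
# It presumes the patched grid runner receives the job options as its
# first positional argument and that they support dictionary access.
def get_options(grid_run_patch):
    # call_args holds the (args, kwargs) of the single recorded call
    args, kwargs = grid_run_patch.call_args
    return args[0]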
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.initialize(argv,
                                 config_file="template.yml",
                                 defaults={
                                     "min_value": 0.0,
                                     "num_samples": 1000,
                                     "mu": 0.0,
                                     "sigma": 1.0})

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(
        task_func=compute_mean,
        input=task_create_files,
        filter=ruffus.suffix(".txt"),
        output=".mean")

    task_combine_means = pipeline.merge(
        task_func=combine_means,
        input=task_compute_mean,
        output="means.txt")

    # primary targets
    pipeline.merge(
        task_func=P.EmptyRunner("all"),
        input=task_combine_means,
        output="all")

    E.debug("starting workflow")

    return P.run_workflow(options, args)
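# Conventional entry point so the template pipeline can be run as a
# script; a minimal sketch, assuming the module is executed directly.
if __name__ == "__main__":
    sys.exit(main(sys.argv))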
import re
import shutil
import sqlite3
import glob

# import modules from the cgat code collection
import cgatcore.experiment as E
import cgatpipelines.tasks.mapping as mapping
from cgatcore import pipeline as P
import cgatpipelines.tasks.readqc as readqc
import cgatpipelines.tasks.preprocess as preprocess
import cgatcore.iotools as iotools

# Initialize the pipeline
P.initialize()

# Define input files and preprocessing steps.
# List of acceptable input formats:
INPUT_FORMATS = ["*.fastq.1.gz", "*.fastq.gz", "*.sra", "*.csfasta.gz",
                 "*.remote"]

# Regular expression to extract a track from an input file. Does not
# preserve a directory as part of the track.
REGEX_TRACK = r"(?P<track>[^/]+)\.(?P<suffix>fastq\.1\.gz|fastq\.gz|sra|csfasta\.gz|remote)"

# Regular expression to extract a track from both processed and
# unprocessed files
REGEX_TRACK_BOTH = r"(processed\.dir/)*([^/]+)\.(fastq\.1\.gz|fastq\.gz|sra|csfasta\.gz|remote)"

SEQUENCEFILES_REGEX = r"([^/]+)\.(?P<suffix>fastq\.1\.gz|fastq\.gz|sra|csfasta\.gz|remote)"
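# Quick illustration (not part of the pipeline) of how REGEX_TRACK
# splits a filename into track and suffix; the sample filename is made up:
#
# >>> m = re.search(REGEX_TRACK, "sample1.fastq.1.gz")
# >>> m.group("track"), m.group("suffix")
# ('sample1', 'fastq.1.gz')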
def setUp(self):
    # ignore command line arguments for pytest
    P.initialize(argv=["test"])
    self.work_dir = P.get_temp_dir(shared=True)
def test_pipeline_action_state(capsys, build_pipeline):
    P.initialize(argv=["toolname", "state"])
    P.run_workflow(E.get_args(), pipeline=build_pipeline)
    captured = capsys.readouterr()
    assert captured.out.startswith("function\tactive")
def test_pipeline_action_show(capsys, build_pipeline):
    P.initialize(argv=["toolname", "show", "all"])
    P.run_workflow(E.get_args(), pipeline=build_pipeline)
    captured = capsys.readouterr()
    assert "Tasks which will be run" in captured.out
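# A minimal sketch of the build_pipeline fixture the two tests above
# assume: any ruffus pipeline with at least one task gives the "state"
# and "show" actions something to report. Task and file names here are
# hypothetical.
import pytest
import ruffus


@pytest.fixture
def build_pipeline():
    def create_file(outfile):
        # originate task: create an empty output file
        open(outfile, "w").close()

    pipeline = ruffus.Pipeline("fixture_pipeline")
    pipeline.originate(task_func=create_file, output="dummy.txt")
    return pipeline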
def main(argv):
    options = P.initialize(argv, config_file="benchmark.yml")

    # compatibility with cgatcore < 0.6.3
    if isinstance(options, tuple):
        options = options[0]

    # not sure what this does
    # if not options.config_file:
    #     P.get_parameters(options.config_file)
    # else:
    #     sys.exit(P.main(options, args))

    params = P.get_params()

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = params.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        # A selection of command line arguments are added to PARAMS as
        # 'extras'; passing extras is not implemented in ruffus 2.6.3
        kwargs = collections.defaultdict(dict)
        if options.only_info:
            kwargs["extras"].update({'only_info': True})
            P.PARAMS["only_info"] = True
        if options.is_test:
            kwargs["extras"].update({'is_test': True})
            P.PARAMS["is_test"] = True

        E.debug("construction of workflow started")

        pipeline = ruffus.Pipeline('benchmark')

        # Tool execution
        suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                     map_tool_to_runner,
                                                     config=P.PARAMS,
                                                     **kwargs)

        E.debug("added {} tools to workflow".format(len(tool_runners)))

        # Optionally, add externally computed files as
        # pseudo-tools:
        if "external" in P.PARAMS["setup"]:
            external_runners = add_external_data_to_pipeline(pipeline,
                                                             config=P.PARAMS,
                                                             **kwargs)
            tool_runners.extend(external_runners)

        # Optionally, combine tool runs into aggregate outputs. The type
        # of the output is preserved (VCF -> VCF, etc.).
        # For example, call individual members in a trio and then build
        # a combined VCF to analyse mendelian inconsistencies.
        if "collate" in P.PARAMS["setup"]:
            collate_runners = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["collate"],
                tasks=tool_runners,
                config=P.PARAMS)
            if P.PARAMS["setup"].get("only_collate", False):
                tool_runners = []
            if P.PARAMS["setup"].get("no_collate_metrics", False):
                collate_runners = []
            E.debug("added {} collators to workflow".format(
                len(collate_runners)))
        else:
            collate_runners = []

        # Optionally, split up the output before applying additional
        # analyses. The type of the output is preserved (VCF -> VCF, etc.).
        # For example, identify false positives, false negatives and
        # true positives and collect metrics individually.
if "split" in P.PARAMS["setup"]: split_runners = add_splits_to_pipeline(pipeline, map_split_to_runner, tool_runners, P.PARAMS["setup"]["split"], tasks=tool_runners, config=P.PARAMS) if P.PARAMS["setup"].get("only_split", False): tool_runners = [] E.debug("added {} splitters to workflow".format( len(split_runners))) else: split_runners = [] metric_runners = [] for prefix, r in zip(["tool", "collate", "split"], [tool_runners, collate_runners, split_runners]): if not r: continue metrics = None if prefix == "collate" and "collate_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["collate_metrics"] elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["split_metrics"] elif "metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["metrics"] else: raise KeyError( "configuration file requires a 'setup:metrics' section") # Metric execution mm = add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner, r, suffix=suffix, prefix=prefix + "_", config=P.PARAMS, **kwargs) if len(mm) == 0: raise ValueError( "workflow construction error: " "no metric tasks result for metrics {}".format(metrics)) metric_runners.extend(mm) E.debug("added {} {}_metrics to workflow".format(len(mm), prefix)) # add plot task if "aggregate" in P.PARAMS["setup"]: aggregate_metrics = add_collations_to_pipeline( pipeline, map_collate_to_runner, P.PARAMS["setup"]["aggregate"], metric_runners, config=P.PARAMS) E.debug("added metric aggregation to workflow") else: aggregate_metrics = [] add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics, P.PARAMS) E.debug("added upload to workflow".format(prefix)) # add export task export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"]) map_export2runner = { "collate": collate_runners, "tools": tool_runners, "split": split_runners } export_runners = [] for e in export: try: export_runners.extend(map_export2runner[e]) except KeyError: raise KeyError("unknown export section: {}".format(e)) add_export_to_pipeline(pipeline, export_runners, suffix=suffix, config=P.PARAMS) E.debug("added export to workflow") add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics) # Collate output files to facilitate analysis if "collation" in P.PARAMS: collators = add_collations_to_pipeline(pipeline, map_collate_to_runner, P.PARAMS["collation"], config=P.PARAMS) E.debug("construction of workflow completed") E.debug("starting workflow") P.run_workflow(options, pipeline=pipeline)