def main(argv):

    def _add_input(parser):
        parser.add_option("--data-dir", default=".")
        parser.add_option("--force", default=False, action="store_true")
        parser.add_option("--min-depth", default=0, type="int")
        parser.add_option("--follow-links", default=False, action="store_true")
        parser.add_option("--limit-metrics", default=0, type="int")
        parser.add_option("--output-filename-metrics")
        parser.add_option("--input-filename-metrics")

    P.initialize(argv, callback=_add_input)
    options = E.get_args()

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options))

    if os.path.exists("results.commit"):
        if not options.force:
            raise ValueError(
                "a results.commit file already exists. Please remove "
                "before uploading.")

    data_dir = os.path.abspath(options.data_dir)
    if options.input_filename_metrics:
        with IOTools.open_file(options.input_filename_metrics) as inf:
            infiles = [x.strip() for x in inf.readlines() if x.strip()]
        if options.limit_metrics:
            infiles = infiles[:options.limit_metrics]
    else:
        E.info(f"collecting files to upload starting in {data_dir}")
        infiles = []
        for root, dirs, files in os.walk(data_dir, followlinks=options.follow_links):
            E.debug(f"working on {root}: dirs={len(dirs)}, files={len(files)}")
            # ignore the first directory level (tools) - this check could be improved
            depth = root[len(data_dir):].count(os.sep)
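            # illustration: with data_dir="/data" and root="/data/tool_a/run_1",
            # root[len(data_dir):] == "/tool_a/run_1", so depth == 2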
            if "benchmark.info" in files:
                if depth <= options.min_depth:
                    E.info(f"skipping - depth not high enough: {depth}")
                else:
                    infiles.append(os.path.join(root, "benchmark.info"))

            if options.limit_metrics and len(infiles) > options.limit_metrics:
                E.info(f"stopping collection as {len(infiles)} reached")
                break

    E.info("found a potential {} benchmark.info files to upload".format(len(infiles)))
    if options.output_filename_metrics:
        with IOTools.open_file(options.output_filename_metrics, "w") as outf:
            outf.write("\n".join(infiles) + "\n")

    # find all files of interest
    oldwd = os.getcwd()
    os.chdir(data_dir)
    upload_result(infiles, "results.commit", PARAMS)
    os.chdir(oldwd)

    E.stop()
def test_all_cluster_parameters_can_be_set(grid_run_patch, option, field,
                                           value):
    P.initialize(argv=["mytool", "{}={}".format(option, value)])
    with patch("cgatcore.pipeline.execution.will_run_on_cluster",
               return_value=True):
        # fails with NameError if drmaa not configured
        # and import drmaa has failed
        with pytest.raises(NameError):
            P.run("echo here")
        grid_run_patch.assert_called_once()
        options = get_options(grid_run_patch)
        assert options[field] == value
def test_default_queue_can_be_overridden(grid_run_patch):
    P.initialize(argv=["mytool", "--cluster-queue=test.q"])
    with patch("cgatcore.pipeline.execution.will_run_on_cluster",
               return_value=True):
        # fails with NameError if drmaa not configured
        # and import drmaa has failed
        with pytest.raises(NameError):
            P.run("echo here")
        grid_run_patch.assert_called_once()
        options = get_options(grid_run_patch)
        assert options["queue"] == "test.q"
        assert options["queue_manager"] == "sge"
Example #4
def main(argv=None):
    if argv is None:
        argv = sys.argv

    options, args = P.initialize(argv,
                                 config_file="template.yml",
                                 defaults={
                                     "min_value": 0.0,
                                     "num_samples": 1000,
                                     "mu": 0.0,
                                     "sigma": 1.0
                                 })

    pipeline = ruffus.Pipeline("template_pipeline")

    task_create_files = pipeline.originate(
        task_func=create_files,
        output=["sample_{:02}.txt".format(x) for x in range(10)])

    task_compute_mean = pipeline.transform(task_func=compute_mean,
                                           input=task_create_files,
                                           filter=ruffus.suffix(".txt"),
                                           output=".mean")

    task_combine_means = pipeline.merge(task_func=combine_means,
                                        input=task_compute_mean,
                                        output="means.txt")

    # primary targets
    pipeline.merge(task_func=P.EmptyRunner("all"),
                   input=task_combine_means,
                   output="all")

    E.debug("starting workflow")
    return P.run_workflow(options, args)
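
The three task functions wired into this template (create_files, compute_mean, combine_means) are defined elsewhere in the module. A minimal sketch of what they could look like, following the ruffus calling conventions implied above (output-only for originate, infile/outfile for transform, list of infiles/outfile for merge); the bodies, the use of random.gauss and the assumption that the defaults passed to P.initialize are available via P.get_params() are illustrative, not the original implementation:

import random


def create_files(outfile):
    # originate task: write num_samples values drawn from a normal
    # distribution with the configured mu and sigma (assumed to be
    # available via P.get_params())
    params = P.get_params()
    with open(outfile, "w") as outf:
        for _ in range(int(params["num_samples"])):
            outf.write(f"{random.gauss(params['mu'], params['sigma'])}\n")


def compute_mean(infile, outfile):
    # transform task: compute the mean of the values in one sample file
    with open(infile) as inf:
        values = [float(x) for x in inf if x.strip()]
    with open(outfile, "w") as outf:
        outf.write(f"{sum(values) / len(values)}\n")


def combine_means(infiles, outfile):
    # merge task: collect the per-sample means into a single table
    with open(outfile, "w") as outf:
        for infile in infiles:
            with open(infile) as inf:
                outf.write(f"{infile}\t{inf.read().strip()}\n")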
Example #5
import re
import shutil
import sqlite3
import glob

# import modules from the cgat code collection
import cgatcore.experiment as E
import cgatpipelines.tasks.mapping as mapping
from cgatcore import pipeline as P
import cgatpipelines.tasks.readqc as readqc
import cgatpipelines.tasks.preprocess as preprocess
import cgatcore.iotools as iotools


# Initialize the pipeline
P.initialize()

# Define input files and preprocessing steps.
# List of acceptable input formats:
INPUT_FORMATS = ["*.fastq.1.gz", "*.fastq.gz", "*.sra", "*.csfasta.gz", "*.remote"]

# Regular expression to extract a track from an input file. Does not
# preserve a directory as part of the track.
REGEX_TRACK = r"(?P<track>[^/]+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

# Regular expression to extract a track from both processed and
# unprocessed files
REGEX_TRACK_BOTH = r"(processed.dir/)*([^/]+)\.(fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"

SEQUENCEFILES_REGEX = r"([^/]+).(?P<suffix>fastq.1.gz|fastq.gz|sra|csfasta.gz|remote)"
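
# Illustration (not part of the original pipeline): although the dots in the
# patterns above are unescaped and therefore match any character, the named
# groups still separate track and suffix as intended, and the leading
# directory is not captured as part of the track, e.g.:
#
#   >>> re.search(REGEX_TRACK, "testdata/sample2.fastq.gz").groupdict()
#   {'track': 'sample2', 'suffix': 'fastq.gz'}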
def setUp(self):
    # ignore command line arguments for pytest
    P.initialize(argv=["test"])
    self.work_dir = P.get_temp_dir(shared=True)
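
# A matching tearDown is presumably defined alongside setUp; a minimal sketch
# (an assumption, not the original test class) that removes the shared
# temporary directory created above:
def tearDown(self):
    # clean up the temporary working directory created in setUp
    import shutil
    shutil.rmtree(self.work_dir, ignore_errors=True)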
def test_pipeline_action_state(capsys, build_pipeline):
    P.initialize(argv=["toolname", "state"])
    P.run_workflow(E.get_args(), pipeline=build_pipeline)
    captured = capsys.readouterr()
    assert captured.out.startswith("function\tactive")
def test_pipeline_action_show(capsys, build_pipeline):
    P.initialize(argv=["toolname", "show", "all"])
    P.run_workflow(E.get_args(), pipeline=build_pipeline)
    captured = capsys.readouterr()
    assert "Tasks which will be run" in captured.out
def main(argv):

    options = P.initialize(argv, config_file="benchmark.yml")

    # compatibility with cgatcore < 0.6.3
    if isinstance(options, tuple):
        options = options[0]

    # not sure what this does
    # if not options.config_file:
    #     P.get_parameters(options.config_file)
    # else:
    #     sys.exit(P.main(options, args))

    params = P.get_params()

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = params.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        # A selection of command line arguments is added to PARAMS as
        # 'extras' (the ruffus 'extras' mechanism is not implemented in
        # ruffus 2.6.3)
        kwargs = collections.defaultdict(dict)
        if options.only_info:
            kwargs["extras"].update({'only_info': True})
            P.PARAMS["only_info"] = True
        if options.is_test:
            kwargs["extras"].update({'is_test': True})
            P.PARAMS["is_test"] = True

        E.debug("construction of workflow started")
        pipeline = ruffus.Pipeline('benchmark')
        # Tool execution
        suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                     map_tool_to_runner,
                                                     config=P.PARAMS,
                                                     **kwargs)

        E.debug("added {} tools to workflow".format(len(tool_runners)))
        # Optionally, add externally computed files as
        # pseudo-tools:
        if "external" in P.PARAMS["setup"]:
            external_runners = add_external_data_to_pipeline(pipeline,
                                                             config=P.PARAMS,
                                                             **kwargs)
            tool_runners.extend(external_runners)

        # Optionally, combine tool runs into aggregate
        # outputs. The type of the output is preserved
        # (VCF -> VCF, etc.)
        # For example, call individual members in a trio
        # and then build a combined VCF to analyse mendelian
        # inconsistencies.
        if "collate" in P.PARAMS["setup"]:
            collate_runners = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["collate"],
                tasks=tool_runners,
                config=P.PARAMS)
            if P.PARAMS["setup"].get("only_collate", False):
                tool_runners = []
            if P.PARAMS["setup"].get("no_collate_metrics", False):
                collate_runners = []
            E.debug("added {} collators to workflow".format(
                len(collate_runners)))
        else:
            collate_runners = []

        # Optionally, split up the output before applying
        # additional analyses. The type of the output is preserved
        # (VCF -> VCF, etc).
        # For example, identify false positives, false negatives
        # and true positives and collect metrics individually.
        if "split" in P.PARAMS["setup"]:
            split_runners = add_splits_to_pipeline(pipeline,
                                                   map_split_to_runner,
                                                   tool_runners,
                                                   P.PARAMS["setup"]["split"],
                                                   tasks=tool_runners,
                                                   config=P.PARAMS)
            if P.PARAMS["setup"].get("only_split", False):
                tool_runners = []
            E.debug("added {} splitters to workflow".format(
                len(split_runners)))
        else:
            split_runners = []

        metric_runners = []
        for prefix, r in zip(["tool", "collate", "split"],
                             [tool_runners, collate_runners, split_runners]):
            if not r:
                continue

            metrics = None

            if prefix == "collate" and "collate_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["collate_metrics"]
            elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["split_metrics"]
            elif "metrics" in P.PARAMS["setup"]:
                metrics = P.PARAMS["setup"]["metrics"]
            else:
                raise KeyError(
                    "configuration file requires a 'setup:metrics' section")

            # Metric execution
            mm = add_metrics_to_pipeline(pipeline,
                                         metrics,
                                         map_metric_to_runner,
                                         r,
                                         suffix=suffix,
                                         prefix=prefix + "_",
                                         config=P.PARAMS,
                                         **kwargs)

            if len(mm) == 0:
                raise ValueError(
                    "workflow construction error: "
                    "no metric tasks result for metrics {}".format(metrics))

            metric_runners.extend(mm)
            E.debug("added {} {}_metrics to workflow".format(len(mm), prefix))

        # add plot task
        if "aggregate" in P.PARAMS["setup"]:
            aggregate_metrics = add_collations_to_pipeline(
                pipeline,
                map_collate_to_runner,
                P.PARAMS["setup"]["aggregate"],
                metric_runners,
                config=P.PARAMS)

            E.debug("added metric aggregation to workflow")
        else:
            aggregate_metrics = []

        add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics,
                               P.PARAMS)
        E.debug("added upload to workflow".format(prefix))

        # add export task
        export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"])
        map_export2runner = {
            "collate": collate_runners,
            "tools": tool_runners,
            "split": split_runners
        }

        export_runners = []
        for e in export:
            try:
                export_runners.extend(map_export2runner[e])
            except KeyError:
                raise KeyError("unknown export section: {}".format(e))

        add_export_to_pipeline(pipeline,
                               export_runners,
                               suffix=suffix,
                               config=P.PARAMS)

        E.debug("added export to workflow")

        add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics)

        # Collate output files to facilitate analysis
        if "collation" in P.PARAMS:
            collators = add_collations_to_pipeline(pipeline,
                                                   map_collate_to_runner,
                                                   P.PARAMS["collation"],
                                                   config=P.PARAMS)

        E.debug("construction of workflow completed")

        E.debug("starting workflow")
        P.run_workflow(options, pipeline=pipeline)
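
For orientation, a hypothetical sketch of the PARAMS layout this workflow reads once benchmark.yml has been parsed. The section and key names mirror the lookups above (setup:metrics is required, the other stages are optional); the concrete names and the value types are placeholders, not taken from a real configuration:

EXAMPLE_PARAMS = {
    "setup": {
        "tools": ["my_tool"],                      # assumed: tools to benchmark
        "metrics": ["my_metric"],                  # required ('setup:metrics')
        "collate": {"my_collation": {}},           # optional; structure assumed
        "collate_metrics": ["my_collate_metric"],  # optional
        "split": {"my_split": {}},                 # optional; structure assumed
        "split_metrics": ["my_split_metric"],      # optional
        "external": {"my_external_data": {}},      # optional pseudo-tools
        "aggregate": {"my_aggregation": {}},       # optional metric aggregation
        "export": ["tools", "collate", "split"],   # default shown above
        "only_collate": False,
        "no_collate_metrics": False,
        "only_split": False,
    },
    # optional top-level collation of output files
    "collation": {"my_file_collation": {}},
    # optional arvados mount point used by redirect_defaults2mountpoint
    "mount_point": None,
}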