Code example #1
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__), "..", "src", "daisy", "tools",
                     "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(r"\.py$", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        if tool_module in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))

        counter.new += 1

    E.info(counter)
    E.stop()
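
This generator walks the tools directory and writes one reStructuredText stub per tool, counting found/ignored/skipped/new files. Below is a minimal standalone sketch of the same scan-and-skip pattern using only the standard library; the directory arguments and TEMPLATE are placeholders, not daisy's actual template.

# Minimal sketch, assuming hypothetical tools_dir/docs_dir arguments and a
# placeholder TEMPLATE; only the scan/skip/count pattern is illustrated.
import collections
import pathlib

TEMPLATE = "{tool_name}\n{underline}\n\n.. automodule:: daisy.tools.{tool_module}\n"

def build_tool_docs(tools_dir, docs_dir, force=False):
    counter = collections.Counter()
    docs_dir = pathlib.Path(docs_dir)
    docs_dir.mkdir(parents=True, exist_ok=True)
    for path in pathlib.Path(tools_dir).glob("*.py"):
        counter["found"] += 1
        tool_module = path.stem
        if tool_module in ("__init__", "cli"):
            counter["ignored"] += 1
            continue
        tool_name = tool_module.replace("_", "-")
        dest = docs_dir / f"{tool_name}.rst"
        if dest.exists() and not force:
            counter["skipped"] += 1
            continue
        dest.write_text(TEMPLATE.format(tool_name=tool_name,
                                        underline="=" * len(tool_name),
                                        tool_module=tool_module))
        counter["new"] += 1
    return counter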
Code example #2
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [("tool", map_tool_to_runner),
                                     ("metric", map_metric_to_runner),
                                     ("split", map_split_to_runner),
                                     ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()

        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", counter))
    E.stop()
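
Example #2 builds a version report for every registered tool, metric, split and collate runner, tolerating runners whose get_version() fails. A condensed sketch of that pattern with plain dictionaries follows; the shape of the runner mappings is an assumption.

# Sketch only: sections is assumed to be an iterable of
# (section_name, {task_name: runner_factory}) pairs.
import sys
import collections

def write_version_table(sections, outfile=sys.stdout):
    total = collections.Counter()
    outfile.write("section\ttask\tversion\tcomments\n")
    for section, task_map in sections:
        counter = collections.Counter()
        for task, factory in sorted(task_map.items()):
            counter["ntasks"] += 1
            try:
                version = factory().get_version()
                counter["version_ok"] += 1
                comments = ""
            except Exception:
                version, comments = "", "unavailable"
                counter["version_fail"] += 1
            outfile.write(f"{section}\t{task}\t{version}\t{comments}\n")
        total += counter
    return total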
Code example #3
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-r",
                      "--run-id",
                      dest="run_id",
                      type="int",
                      help="numerical identifier of a run [%default]")

    parser.add_option("-d",
                      "--database-url",
                      dest="database_url",
                      type="string",
                      help="database url [%default]")

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="only show statements to be executed [%default]")

    parser.set_defaults(
        run_id=None,
        database_url="sqlite:///./csvdb",
        dry_run=False,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    Storage.purge_run_id(options.run_id,
                         options.database_url,
                         dry_run=options.dry_run)

    E.stop()
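
Example #3 is a thin wrapper around Storage.purge_run_id with a --dry-run switch. Below is a hypothetical illustration of the dry-run pattern using the standard sqlite3 module; the table and column names are assumptions, not daisy's actual schema.

# Hypothetical illustration of the dry-run pattern: collect statements first,
# print them when dry_run is set, execute them otherwise.
import sqlite3

def purge_run(database_path, run_id, dry_run=False):
    statements = [
        ("DELETE FROM instance WHERE run_id = ?", (run_id,)),  # assumed schema
        ("DELETE FROM run WHERE id = ?", (run_id,)),
    ]
    conn = sqlite3.connect(database_path)
    try:
        for sql, params in statements:
            if dry_run:
                print(sql, params)
            else:
                conn.execute(sql, params)
        if not dry_run:
            conn.commit()
    finally:
        conn.close()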
Code example #4
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r",
        "--restrict-regex",
        dest="restrict_regex",
        action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory",
        dest="data_directory",
        help="directory with sample data sets. This will override the default "
        "datadir in the configuration file and the environment variable "
        "DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory",
        dest="library_directory",
        action="append",
        help="directory TaskLibrary functions. Will be added to the built-in "
        "and the one specified in DAISY_TASKLIBRARY environment variable "
        "[%default]")

    parser.add_option("--always-mount",
                      dest="always_mount",
                      action="store_true",
                      help="force mounting of arvados keep [%default]")

    parser.add_option("--keep-failed-temp",
                      dest="keep_failed_temp",
                      action="store_true",
                      help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)), "TaskLibrary",
                     "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.safe_load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get("DAISY_TEST_DATADIR",
                                            test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.safe_load(
                re.sub("DATADIR", data_directory, raw_txt))
            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
        ("tool", TestTool, map_tool_to_runner),
        ("metric", TestMetric, map_metric_to_runner)
    ]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
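
Example #4 selects test tasks by first applying the ignore patterns from the task-library configuration and then the --restrict-regex patterns from the command line. The same selection logic, pulled out as a standalone sketch:

# Standalone sketch of the ignore/restrict selection rule: a task is dropped
# if any ignore pattern matches it, and kept only if it matches at least one
# restrict pattern (when restrict patterns are given).
import re

def select_tasks(tasks, ignore_patterns=(), restrict_patterns=()):
    selected = []
    for task in tasks:
        if any(re.match(p, task) for p in ignore_patterns):
            continue
        if restrict_patterns and not any(
                re.search(p, task) for p in restrict_patterns):
            continue
        selected.append(task)
    return selected

# e.g. select_tasks(["bwa", "bwa_mem", "samtools"], ignore_patterns=["sam"],
#                   restrict_patterns=["bwa"]) -> ["bwa", "bwa_mem"]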
Code example #5
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n",
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-l",
                      "--link",
                      dest="link",
                      action="store_true",
                      help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner, config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions, input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = taskf.__name__ + ".dir"
        new_data.append((result_dir, taskf))
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        max_count = max(counts.values())
        max_count_items = [
            x for x, y in list(counts.items()) if y == max_count
        ]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(map(str, (old_dir, new_dir,
                                                 max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
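
Example #5 matches existing result directories to the directories a new configuration would produce by counting how many (property, value) pairs they share; ambiguous ties are skipped. A condensed sketch of that matching strategy:

# Sketch of the matching strategy: index candidate directories by their
# (property, value) pairs, then pick, for each old directory, the candidate
# sharing the most properties. Ambiguous ties are skipped, as in the script.
import collections

def match_directories(old_properties, new_properties):
    """Both arguments map directory name -> set of (key, value) pairs."""
    index = collections.defaultdict(list)
    for new_dir, props in new_properties.items():
        for prop in props:
            index[prop].append(new_dir)

    matches = {}
    for old_dir, props in old_properties.items():
        hits = collections.Counter()
        for prop in props:
            hits.update(index.get(prop, []))
        if not hits:
            continue
        best, count = hits.most_common(1)[0]
        if sum(1 for v in hits.values() if v == count) > 1:
            continue  # multiple equally good matches: treat as ambiguous
        matches[old_dir] = best
    return matches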
Code example #6
def main(argv):

    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments is added to PARAMS
            # as 'extras'; this is not implemented in ruffus 2.6.3
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")
            pipeline = ruffus.Pipeline('benchmark')
            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(pipeline,
                                                         map_tool_to_runner,
                                                         config=P.PARAMS,
                                                         **kwargs)

            E.debug("added tools to workflow ")
            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline, config=P.PARAMS, **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.)
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow ")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
            if "split" in P.PARAMS["setup"]:
                split_runners = add_splits_to_pipeline(
                    pipeline,
                    map_split_to_runner,
                    tool_runners,
                    P.PARAMS["setup"]["split"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_split", False):
                    tool_runners = []
                E.debug("added splitters to workflow ")
            else:
                split_runners = []

            metric_runners = []
            for prefix, r in zip(
                ["tool", "collate", "split"],
                [tool_runners, collate_runners, split_runners]):
                if not r:
                    continue

                metrics = None

                if prefix == "collate" and "collate_metrics" in P.PARAMS[
                        "setup"]:
                    metrics = P.PARAMS["setup"]["collate_metrics"]
                elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["split_metrics"]
                elif "metrics" in P.PARAMS["setup"]:
                    metrics = P.PARAMS["setup"]["metrics"]
                else:
                    raise KeyError(
                        "configuration file requires a 'setup:metrics' section"
                    )

                # Metric execution
                mm = add_metrics_to_pipeline(pipeline,
                                             metrics,
                                             map_metric_to_runner,
                                             r,
                                             suffix=suffix,
                                             prefix=prefix + "_",
                                             config=P.PARAMS,
                                             **kwargs)

                if len(mm) == 0:
                    raise ValueError(
                        "workflow construction error: "
                        "no metric tasks result for metrics {}".format(
                            metrics))

                metric_runners.extend(mm)
                E.debug("added {}_metrics to workflow".format(prefix))

            # add plot task
            if "aggregate" in P.PARAMS["setup"]:
                aggregate_metrics = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["aggregate"],
                    metric_runners,
                    config=P.PARAMS)

                E.debug("added metric aggregation to workflow")
            else:
                aggregate_metrics = []

            add_upload_to_pipeline(pipeline,
                                   metric_runners + aggregate_metrics,
                                   P.PARAMS)
            E.debug("added upload to workflow".format(prefix))

            # add export task
            export = P.PARAMS["setup"].get("export",
                                           ["tools", "collate", "split"])
            map_export2runner = {
                "collate": collate_runners,
                "tools": tool_runners,
                "split": split_runners
            }

            export_runners = []
            for e in export:
                try:
                    export_runners.extend(map_export2runner[e])
                except KeyError:
                    raise KeyError("unknown export section: {}".format(e))

            add_export_to_pipeline(pipeline,
                                   export_runners,
                                   suffix=suffix,
                                   config=P.PARAMS)

            E.debug("added export to workflow")

            add_all_task_to_pipeline(pipeline,
                                     metric_runners + aggregate_metrics)

            # Collate output files to facilitate analysis
            if "collation" in P.PARAMS:
                collators = add_collations_to_pipeline(pipeline,
                                                       map_collate_to_runner,
                                                       P.PARAMS["collation"],
                                                       config=P.PARAMS)

            E.debug("construction of workflow completed")

    E.stop()
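
Within the metric loop above, each stage (tool, collate, split) picks its own metric list if one is configured and otherwise falls back to the shared setup:metrics entry. The same selection rule, extracted here as a small standalone sketch for clarity (select_metrics is not part of daisy):

# Sketch of the metric-selection rule applied per stage prefix.
def select_metrics(setup, prefix):
    specific = "{}_metrics".format(prefix)
    if prefix in ("collate", "split") and specific in setup:
        return setup[specific]
    if "metrics" in setup:
        return setup["metrics"]
    raise KeyError("configuration file requires a 'setup:metrics' section")

# e.g. select_metrics({"metrics": ["vcf_stats"]}, "tool") -> ["vcf_stats"]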
Code example #7
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d",
        "--database",
        dest="databases",
        action="append",
        help="only show what will be done, don't do it [%default]")

    parser.add_option("-f",
                      "--filter",
                      dest="filter_method",
                      type="choice",
                      choices=("first", "last"),
                      help="only input a selection of results")

    parser.add_option("-t",
                      "--target",
                      dest="target_database",
                      type="string",
                      help="the target database [%default]")

    parser.set_defaults(filter_method=None, databases=[])

    (options, args) = E.start(parser, argv)

    run_id_offset = 0
    instance_id_offset = 0

    for database in options.databases:
        source_db = sqlite3.connect(database)
        is_instance = False
        is_run = False

        cc = source_db.cursor()
        min_run_id = cc.execute("SELECT MIN (id) FROM run").fetchall()[0][0]
        max_run_id = cc.execute("SELECT MAX (id) FROM run").fetchall()[0][0]
        max_instance_id = cc.execute(
            "SELECT MAX (id) FROM instance").fetchall()[0][0]

        E.info("{}: min_run_id={}, max_run_id={}, max_instance_id={}".format(
            database, min_run_id, max_run_id, max_instance_id))

        for line in source_db.iterdump():

            if line.startswith("CREATE TABLE"):
                try:
                    tablename = re.search("CREATE TABLE \"(\S+)\"",
                                          line).groups()[0]
                except AttributeError:
                    tablename = re.search("CREATE TABLE (\S+)",
                                          line).groups()[0]

                is_instance = False
                is_run = False
                if tablename == "run":
                    offset = run_id_offset
                    pos = "first"
                    is_run = True
                elif tablename == "tags":
                    offset = run_id_offset
                    pos = "first"
                elif tablename == "instance":
                    is_instance = True
                elif tablename == "tool_timings":
                    offset = instance_id_offset
                    pos = "last"
                elif tablename == "metric_timings":
                    offset = instance_id_offset
                    pos = "last"
                else:
                    # metric table
                    offset = instance_id_offset
                    pos = "last"

            elif line.startswith("INSERT INTO"):

                if is_instance:
                    i, n = re.search("VALUES\((\d+),(\d+),", line).groups()
                    if apply_run_filter(n, options.filter_method, min_run_id,
                                        max_run_id):
                        line = None
                    else:
                        line = re.sub(
                            "VALUES\({},{},".format(i, n),
                            "VALUES({},{},".format(
                                int(i) + instance_id_offset,
                                int(n) + run_id_offset), line)
                else:
                    if pos == "last":
                        n = re.search(",(\d+)\)", line).groups()[0]
                        line = re.sub(",{}\)".format(n),
                                      ",{})".format(int(n) + offset), line)
                    elif pos == "first":
                        n = re.search("VALUES\((\d+),", line).groups()[0]
                        line = re.sub("VALUES\({},".format(n),
                                      "VALUES({},".format(int(n) + offset),
                                      line)
                        if is_run:
                            if apply_run_filter(n, options.filter_method,
                                                min_run_id, max_run_id):
                                line = None

            if line is not None:
                print(line)

        cc = source_db.cursor()
        run_id_offset += max_run_id
        instance_id_offset += max_instance_id

        E.info("{}: updated offsets to run_id={}, instance_id={}".format(
            database, run_id_offset, instance_id_offset))

    E.stop()
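
Example #7 concatenates the SQL dumps of several result databases, shifting the run and instance ids in each INSERT statement so that rows from different databases do not collide. A toy illustration of the id-offset rewrite follows; it only shifts the leading id column and assumes a run table, so it is a sketch rather than a faithful reimplementation.

# Toy sketch: shift the first id in every INSERT by the largest id seen so far.
import re
import sqlite3

def dump_with_offset(databases):
    offset = 0
    for path in databases:
        conn = sqlite3.connect(path)
        max_id = conn.execute("SELECT MAX(id) FROM run").fetchone()[0] or 0
        for line in conn.iterdump():
            if line.startswith("INSERT INTO"):
                match = re.search(r"VALUES\((\d+),", line)
                if match:
                    new_id = int(match.group(1)) + offset
                    line = re.sub(r"VALUES\(\d+,",
                                  "VALUES({},".format(new_id), line, count=1)
            yield line
        offset += max_id
        conn.close()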
Code example #8
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-l", "--logfile", dest="logfile", type="string",
                      help="name of logfile [default=%default]")

    parser.add_option("-t", "--time", dest="time", type="choice",
                      choices=("seconds", "milliseconds"),
                      help="time to show [default=%default]")

    parser.add_option(
        "--no-reset", dest="reset", action="store_false",
        help="do not reset counters when a new pipeline run started "
        "The default is to reset so that only the counts from the latest "
        "pipeline execution are show "
        "[default=%default]")

    parser.add_option(
        "-f", "--filter-method", dest="filter", type="choice",
        choices=("unfinished", "running", "completed", "all"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-s", "--sort-order", dest="sort_order", type="choice",
        choices=("object", "ncalls", "duration", "percall", "running"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-i", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default=%default]")

    parser.set_defaults(sections=[],
                        logfile="pipeline.log",
                        filter="all",
                        reset=True,
                        sort_order="duration",
                        time="seconds")

    (options, args) = E.start(parser, argv)

    if options.sections:
        profile_sections = options.sections
    else:
        profile_sections = ("task", "job")

    counts = {}
    for section in profile_sections:
        counts[section] = collections.defaultdict(Counter)

    def line_grouper(filename):
        rx = re.compile("\d{4}-\d{2}-\d{2} ")
        with IOTools.open_file(filename) as infile:
            last_line = None
            for line in infile:
                line = line.strip()
                if not rx.match(line):
                    last_line = " ".join((last_line, line))
                else:
                    if last_line:
                        yield last_line
                    last_line = line
            yield last_line

    for line in line_grouper(options.logfile):

        data = line.split()
        if len(data) < 5:
            continue
        date, time, level, pipeline, source = data[:5]

        if re.search("output generated by", line):
            if options.reset:
                E.info("resetting counts at line=%s" % line[:-1])
                for section in profile_sections:
                    counts[section] = collections.defaultdict(Counter)
            continue

        # filter for log messages from task module
        if source != "task":
            continue

        dt = datetime.datetime.strptime(
            " ".join((date, time)), "%Y-%m-%d %H:%M:%S,%f")

        msg = "".join(data[5:])

        started_task, completed_task, started_job, completed_job = \
            (None, None, None, None)

        if re.search("task.log_at_level.\d+Task=(\S+)", msg):
            checked_task = re.search(
                "task.log_at_level.\d+Task=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg):
            started_infiles, started_job, missing = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg).groups()
        elif re.search("Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg):
            started_infiles, started_job = re.search(
                "Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg).groups()
        elif re.search("Taskentersqueue=(\S+)", msg):
            started_task = re.search("Taskentersqueue=(\S+)", msg).groups()[0]
        elif re.search("Job=\[(\S+)->(\S+)\]completed", msg):
            completed_infiles, completed_job = re.search(
                "Job=\[(\S+)->(\S+)\]completed", msg).groups()
        elif re.search("CompletedTask=(\S+)", msg):
            completed_task = re.search("CompletedTask=(\S+)", msg).groups()[0]
        elif re.search("UptodateTask=(\S+)", msg):
            completed_task = re.search("UptodateTask=(\S+)", msg).groups()[0]
        else:
            continue

        try:
            if started_task:
                counts["task"][(pipeline, started_task)].add(True, dt, started_task)
            elif completed_task:
                counts["task"][(pipeline, completed_task)].add(False, dt, completed_task)
            elif started_job:
                counts["job"][(pipeline, started_job)].add(True, dt, started_job)
            elif completed_job:
                counts["job"][(pipeline, completed_job)].add(False, dt, completed_job)
            else:
                raise ValueError("unknown action")
        except ValueError as msg:
            if not options.ignore_errors:
                raise ValueError(str(msg) + "\nat line %s" % line)

    def to_milliseconds(d):
        return d.seconds * 1000 + d.microseconds / 1000

    def to_seconds(d):
        return d.seconds + d.microseconds / 1000000

    if options.time == "milliseconds":
        f = to_milliseconds
    elif options.time == "seconds":
        f = to_seconds

    for section in profile_sections:
        running = []
        rows = []
        for objct, c in list(counts[section].items()):

            # apply filters
            if options.filter in ("unfinished", "running") and c.running == 0:
                continue

            d = f(c.duration)
            if c.calls > 0:
                percall = "%6.3f" % (d / float(c.calls))
            else:
                percall = "na"

            rows.append((section,
                         objct[0],
                         objct[1],
                         c.calls,
                         d,
                         percall,
                         c.running))
            running.extend([x for x, y in c._started.items() if y != 0])

        header = ("section", "pipeline", "object", "ncalls",
                  "duration", "percall", "running")

        options.stdout.write("\t".join((header)) + "\n")
        idx = header.index(options.sort_order)
        rows = sorted(rows, key=lambda x: x[idx])

        options.stdout.write("\n".join(
            ["\t".join(map(str, x)) for x in rows]) + "\n")

        options.stdout.write("#//\n\n")

        if running:
            options.stdout.write("# running %ss\n" % section)
            options.stdout.write("\n".join(map(str, running)) + "\n")
            options.stdout.write("#//\n\n")

    E.stop()
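
The profiling loop in example #8 relies on a Counter class that is not shown in the snippet: it pairs start and completion events per task or job and accumulates durations. A minimal sketch of what such a class might look like, inferred from the attributes used above (calls, duration, running, _started); it is an assumption, not the original implementation.

# Minimal sketch, assuming .add(started, dt, name) pairs start/complete events.
import datetime

class Counter:
    def __init__(self):
        self.calls = 0
        self.duration = datetime.timedelta()
        self._started = {}

    def add(self, started, dt, name):
        if started:
            self._started[name] = dt
        else:
            if name not in self._started:
                raise ValueError("completion without a start event")
            self.duration += dt - self._started[name]
            self.calls += 1
            self._started[name] = 0

    @property
    def running(self):
        # objects that started but never completed
        return sum(1 for v in self._started.values() if v != 0)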