def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    tools = glob.glob(
        os.path.join(os.path.dirname(__file__),
                     "..", "src", "daisy", "tools", "*.py"))

    counter = E.Counter()
    for tool in tools:
        counter.found += 1
        tool_module = re.sub(r"\.py$", "", os.path.basename(tool))
        tool_name = re.sub("_", "-", tool_module)
        if tool_name in ("__init__", "cli"):
            counter.ignored += 1
            continue

        dest = os.path.join("tools", "{}.rst".format(tool_name))

        if os.path.exists(dest) and not options.output_force:
            counter.skipped += 1
            continue

        with IOTools.openFile(dest, "w") as outf:
            outf.write(TEMPLATE_TOOL.format(**locals()))
        counter.new += 1

    E.info(counter)
    E.stop()
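
# TEMPLATE_TOOL is referenced above but not defined in this excerpt. Since
# it is expanded with format(**locals()), it can reference any local such
# as tool_name or tool_module. A minimal sketch of what it might look like
# (an assumption, not the repository's actual template):
TEMPLATE_TOOL = """\
.. _{tool_name}:

{tool_name}
=====================================================================

.. automodule:: daisy.tools.{tool_module}
"""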
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    total_counter = E.Counter()
    table = []

    for section, map_task2runner in [
            ("tool", map_tool_to_runner),
            ("metric", map_metric_to_runner),
            ("split", map_split_to_runner),
            ("collate", map_collate_to_runner)]:
        E.debug("processing section: {}".format(section))
        counter = E.Counter()
        for task, taskf in sorted(map_task2runner.items()):
            counter.ntasks += 1
            comments = []
            try:
                version = taskf().get_version()
                counter.version_ok += 1
            except Exception:
                version = ""
                comments.append("unavailable")
                counter.version_fail += 1

            comments = "; ".join(comments)
            table.append((section, task, version, comments))

        E.info("{}: {}".format(section, counter))
        total_counter += counter

    options.stdout.write("section\ttask\tversion\tcomments\n")
    for row in table:
        options.stdout.write("\t".join(map(str, row)) + "\n")

    E.info("{}: {}".format("total", total_counter))
    E.stop()
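
# Illustrative output of the version report above (values are made up):
# one row per registered runner, with "unavailable" in the comments column
# whenever get_version() raised an exception.
#
# section   task        version     comments
# tool      bwa         0.7.17
# metric    fastqc                  unavailable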
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-r", "--run-id", dest="run_id", type="int",
                      help="numerical identifier of a run [%default]")

    parser.add_option("-d", "--database-url", dest="database_url",
                      type="string",
                      help="database url [%default]")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="only show statements to be executed [%default]")

    parser.set_defaults(
        run_id=None,
        database_url="sqlite:///./csvdb",
        dry_run=False,
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    Storage.purge_run_id(options.run_id,
                         options.database_url,
                         dry_run=options.dry_run)
    E.stop()
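
# Example invocations (the "daisy purge-run-id" command name is
# hypothetical; substitute whatever entry point wraps this main()):
#
#   # preview the statements for run 42 without executing them
#   daisy purge-run-id --run-id=42 --dry-run
#
#   # actually purge run 42 from a non-default database
#   daisy purge-run-id --run-id=42 --database-url=sqlite:///./csvdb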
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--restrict-regex", dest="restrict_regex", action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory", dest="data_directory",
        help="directory with sample data sets. This will override the "
        "default datadir in the configuration file and the environment "
        "variable DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory", dest="library_directories", action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option(
        "--always-mount", dest="always_mount", action="store_true",
        help="force mounting of arvados keep [%default]")

    parser.add_option(
        "--keep-failed-temp", dest="keep_failed_temp", action="store_true",
        help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)),
                     "TaskLibrary", "test_task_library.yml")
    ]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    filenames.extend(options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.safe_load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get(
                "DAISY_TEST_DATADIR",
                test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            test_config = yaml.safe_load(
                re.sub("DATADIR", data_directory, raw_txt))

            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
            ("tool", TestTool, map_tool_to_runner),
            ("metric", TestMetric, map_metric_to_runner)]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            if any(re.match(to_ignore, task) for to_ignore in ignore):
                continue
            if options.restrict_regex and not any(
                    re.search(x, task) for x in options.restrict_regex):
                continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in test class - necessary if function is
            # called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
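
# A minimal sketch of the test_task_library.yml layout this runner expects,
# inferred from the keys read above: per-section tool/metric entries, an
# optional ignore list per section, a data_directory, and DATADIR
# placeholders substituted before the second yaml parse. All entries here
# are illustrative:
#
# data_directory: /data/daisy-tests
# tool:
#   ignore:
#     - ^experimental_.*
#   my_tool:
#     files:
#       - DATADIR/sample.bam
# metric:
#   my_metric:
#     files:
#       - DATADIR/sample.vcf.gz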
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option(
        "-l", "--link", dest="link", action="store_true",
        help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False,
                        link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner,
                                                   config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):

        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = taskf.__name__ + ".dir"
        new_data.append((result_dir, taskf))

        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                pass

        counts = collections.Counter(targets)
        if not counts:
            E.warn("no matches for {}, ignored".format(old_dir))
            continue

        max_count = max(counts.values())
        max_count_items = [x for x, y in list(counts.items())
                           if y == max_count]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write(
            "\t".join(map(str, (old_dir, new_dir, max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
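
# Illustrative stdout from the matcher above (values are made up): each old
# result directory is paired with the new task directory sharing the most
# input files, name and option values; ties are skipped with a warning.
#
# old               new                         matching
# tool_bwa_1.dir    bwa_sample1_default.dir     5
# tool_bwa_2.dir    bwa_sample2_default.dir     5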
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-d", "--database", dest="databases", action="append",
        help="database to merge. Can be specified multiple times "
        "[%default]")

    parser.add_option(
        "-f", "--filter", dest="filter_method", type="choice",
        choices=("first", "last"),
        help="only input a selection of results [%default]")

    parser.add_option(
        "-t", "--target", dest="target_database", type="string",
        help="the target database [%default]")

    parser.set_defaults(filter_method=None,
                        databases=[])

    (options, args) = E.start(parser, argv)

    run_id_offset = 0
    instance_id_offset = 0

    for database in options.databases:
        source_db = sqlite3.connect(database)

        is_instance = False
        is_run = False

        cc = source_db.cursor()
        min_run_id = cc.execute(
            "SELECT MIN (id) FROM run").fetchall()[0][0]
        max_run_id = cc.execute(
            "SELECT MAX (id) FROM run").fetchall()[0][0]
        max_instance_id = cc.execute(
            "SELECT MAX (id) FROM instance").fetchall()[0][0]

        E.info("{}: min_run_id={}, max_run_id={}, max_instance_id={}".format(
            database, min_run_id, max_run_id, max_instance_id))

        for line in source_db.iterdump():
            if line.startswith("CREATE TABLE"):
                try:
                    tablename = re.search(
                        r"CREATE TABLE \"(\S+)\"", line).groups()[0]
                except AttributeError:
                    tablename = re.search(
                        r"CREATE TABLE (\S+)", line).groups()[0]

                is_instance = False
                is_run = False
                if tablename == "run":
                    offset = run_id_offset
                    pos = "first"
                    is_run = True
                elif tablename == "tags":
                    offset = run_id_offset
                    pos = "first"
                elif tablename == "instance":
                    is_instance = True
                elif tablename == "tool_timings":
                    offset = instance_id_offset
                    pos = "last"
                elif tablename == "metric_timings":
                    offset = instance_id_offset
                    pos = "last"
                else:
                    # metric table
                    offset = instance_id_offset
                    pos = "last"

            elif line.startswith("INSERT INTO"):
                if is_instance:
                    i, n = re.search(
                        r"VALUES\((\d+),(\d+),", line).groups()
                    if apply_run_filter(n, options.filter_method,
                                        min_run_id, max_run_id):
                        line = None
                    else:
                        line = re.sub(
                            r"VALUES\({},{},".format(i, n),
                            "VALUES({},{},".format(
                                int(i) + instance_id_offset,
                                int(n) + run_id_offset),
                            line)
                else:
                    if pos == "last":
                        n = re.search(r",(\d+)\)", line).groups()[0]
                        line = re.sub(r",{}\)".format(n),
                                      ",{})".format(int(n) + offset),
                                      line)
                    elif pos == "first":
                        n = re.search(r"VALUES\((\d+),", line).groups()[0]
                        line = re.sub(r"VALUES\({},".format(n),
                                      "VALUES({},".format(int(n) + offset),
                                      line)
                    if is_run:
                        if apply_run_filter(n, options.filter_method,
                                            min_run_id, max_run_id):
                            line = None

            if line is not None:
                print(line)

        run_id_offset += max_run_id
        instance_id_offset += max_instance_id

        E.info("{}: updated offsets to run_id={}, instance_id={}".format(
            database, run_id_offset, instance_id_offset))

    E.stop()
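
# apply_run_filter is called above but defined elsewhere in this module;
# a minimal sketch consistent with the call sites (return True to drop a
# row whose run id falls outside the requested selection). This is an
# assumption about its behaviour, not the actual implementation:
def apply_run_filter(run_id, filter_method, min_run_id, max_run_id):
    # keep everything when no filter is requested
    if filter_method is None:
        return False
    run_id = int(run_id)
    if filter_method == "first":
        # keep only rows belonging to the first run
        return run_id != min_run_id
    elif filter_method == "last":
        # keep only rows belonging to the last run
        return run_id != max_run_id
    return False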
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-l", "--logfile", dest="logfile", type="string",
                      help="name of logfile [default=%default]")

    parser.add_option("-t", "--time", dest="time", type="choice",
                      choices=("seconds", "milliseconds"),
                      help="time to show [default=%default]")

    parser.add_option(
        "--no-reset", dest="reset", action="store_false",
        help="do not reset counters when a new pipeline run started. "
        "The default is to reset so that only the counts from the latest "
        "pipeline execution are shown "
        "[default=%default]")

    parser.add_option(
        "-f", "--filter-method", dest="filter", type="choice",
        choices=("unfinished", "running", "completed", "all"),
        help="apply filter to output [default=%default]")

    parser.add_option(
        "-s", "--sort-order", dest="sort_order", type="choice",
        choices=("object", "ncalls", "duration", "percall", "running"),
        help="sort order for output [default=%default]")

    parser.add_option(
        "-i", "--ignore-errors", dest="ignore_errors", action="store_true",
        help="ignore errors [default=%default]")

    parser.set_defaults(sections=[],
                        logfile="pipeline.log",
                        filter="all",
                        reset=True,
                        sort_order="duration",
                        time="seconds")

    (options, args) = E.start(parser, argv)

    if options.sections:
        profile_sections = options.sections
    else:
        profile_sections = ("task", "job")

    counts = {}
    for section in profile_sections:
        counts[section] = collections.defaultdict(Counter)

    def line_grouper(filename):
        # join continuation lines onto the preceding time-stamped line
        rx = re.compile(r"\d{4}-\d{2}-\d{2} ")
        with IOTools.open_file(filename) as infile:
            last_line = None
            for line in infile:
                line = line.strip()
                if not rx.match(line):
                    last_line = " ".join((last_line, line))
                else:
                    if last_line:
                        yield last_line
                    last_line = line
            if last_line is not None:
                yield last_line

    for line in line_grouper(options.logfile):

        data = line.split()
        if len(data) < 5:
            continue

        date, time, level, pipeline, source = data[:5]

        if re.search("output generated by", line):
            if options.reset:
                E.info("resetting counts at line=%s" % line[:-1])
                for section in profile_sections:
                    counts[section] = collections.defaultdict(Counter)
            continue

        # filter for log messages from task module
        if source != "task":
            continue

        dt = datetime.datetime.strptime(
            " ".join((date, time)), "%Y-%m-%d %H:%M:%S,%f")

        msg = "".join(data[5:])

        started_task, completed_task, started_job, completed_job = \
            (None, None, None, None)

        if re.search(r"task.log_at_level.\d+Task=(\S+)", msg):
            checked_task = re.search(
                r"task.log_at_level.\d+Task=(\S+)", msg).groups()[0]
        elif re.search(r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]",
                       msg):
            started_infiles, started_job, missing = re.search(
                r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*\[(\S+)\]",
                msg).groups()
        elif re.search(r"Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg):
            started_infiles, started_job = re.search(
                r"Job=\[(\S+)->(\S+)\].*Missingoutputfile", msg).groups()
        elif re.search(r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg):
            started_infiles, started_job = re.search(
                r"Job=\[(\S+)->(\S+)\].*Missingfile[s]*", msg).groups()
        elif re.search(r"Taskentersqueue=(\S+)", msg):
            started_task = re.search(
                r"Taskentersqueue=(\S+)", msg).groups()[0]
        elif re.search(r"Job=\[(\S+)->(\S+)\]completed", msg):
            completed_infiles, completed_job = re.search(
                r"Job=\[(\S+)->(\S+)\]completed", msg).groups()
        elif re.search(r"CompletedTask=(\S+)", msg):
            completed_task = re.search(
                r"CompletedTask=(\S+)", msg).groups()[0]
        elif re.search(r"UptodateTask=(\S+)", msg):
            completed_task = re.search(
                r"UptodateTask=(\S+)", msg).groups()[0]
        else:
            continue

        try:
            if started_task:
                counts["task"][(pipeline, started_task)].add(
                    True, dt, started_task)
            elif completed_task:
                counts["task"][(pipeline, completed_task)].add(
                    False, dt, completed_task)
            elif started_job:
                counts["job"][(pipeline, started_job)].add(
                    True, dt, started_job)
            elif completed_job:
                counts["job"][(pipeline, completed_job)].add(
                    False, dt, completed_job)
            else:
                raise ValueError("unknown action")
        except ValueError as msg:
            if not options.ignore_errors:
                raise ValueError(str(msg) + "\nat line %s" % line)

    def to_milliseconds(d):
        return d.seconds * 1000.0 + d.microseconds / 1000.0

    def to_seconds(d):
        return d.seconds + d.microseconds / 1000000.0

    if options.time == "milliseconds":
        f = to_milliseconds
    elif options.time == "seconds":
        f = to_seconds

    for section in profile_sections:
        running = []
        rows = []
        for objct, c in list(counts[section].items()):

            # apply filters
            if (options.filter in ("unfinished", "running") and
                    c.running == 0):
                continue

            d = f(c.duration)
            if c.calls > 0:
                percall = "%6.3f" % (d / float(c.calls))
            else:
                percall = "na"

            rows.append((section, objct[0], objct[1],
                         c.calls, d, percall, c.running))
            running.extend([x for x, y in c._started.items() if y != 0])

        header = ("section", "pipeline", "object",
                  "ncalls", "duration", "percall", "running")
        options.stdout.write("\t".join(header) + "\n")

        idx = header.index(options.sort_order)
        rows = sorted(rows, key=lambda x: x[idx])

        options.stdout.write("\n".join(
            ["\t".join(map(str, x)) for x in rows]) + "\n")
        options.stdout.write("#//\n\n")

        if running:
            options.stdout.write("# running %ss\n" % section)
            options.stdout.write("\n".join(map(str, running)) + "\n")
            options.stdout.write("#//\n\n")

    E.stop()