def main(argv=None):
    # note: the default must be None, not sys.argv - a sys.argv default is
    # captured once at definition time and ignores later modifications
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-n", "--dry-run", dest="dry_run", action="store_true",
        help="only show what will be done, don't do it [%default]")

    parser.add_option(
        "-l", "--link", dest="link", action="store_true",
        help="link instead of rename [%default]")

    parser.set_defaults(dry_run=False, link=False)

    (options, args) = E.start(parser, argv)

    config = P.get_parameters("benchmark.yml")

    old_data, new_data = [], []

    # collect existing result directories and their tool metadata
    for old_info in glob.glob("*.dir/tool.info"):
        old_dir, old_file = os.path.split(old_info)
        old_info = Toolkit.read_data(old_info)
        old_data.append((old_dir, old_info))

    tool_functions = Workflow.build_tool_functions(map_tool_to_runner,
                                                   config)

    config_files = Workflow.expand_globs(config["input"])
    input_combos = Workflow.build_combinations(config_files)

    map_property_to_dir = collections.defaultdict(list)

    for toolf, input_files in itertools.product(tool_functions,
                                                input_combos):
        # create a copy of the task function and give it its unique name
        # by mangling it with the input_files
        taskf = copy.copy(toolf)
        taskf.register_input(input_files)
        result_dir = taskf.__name__ + ".dir"
        new_data.append((result_dir, taskf))
        # index the new directory under every property it exposes so
        # that old directories can vote for their best match below
        for a, x, y in IOTools.nested_iter(taskf.input_files):
            map_property_to_dir[(x, y)].append(result_dir)
        map_property_to_dir[("name", taskf.name)].append(result_dir)
        for x, y in list(taskf._option_dict.items()):
            map_property_to_dir[(x, y)].append(result_dir)

    # match by input_files
    options.stdout.write("\t".join(("old", "new", "matching")) + "\n")

    for old_dir, old_info in old_data:
        targets = []
        for a, x, y in IOTools.nested_iter(old_info["input_files"]):
            if (x, y) in map_property_to_dir:
                targets.extend(map_property_to_dir[(x, y)])
        for x, y in list(old_info.items()):
            try:
                targets.extend(map_property_to_dir[(x, y)])
            except TypeError:
                # unhashable values can not be used as lookup keys
                pass

        counts = collections.Counter(targets)
        if not counts:
            # guard against max() on an empty sequence
            E.warn("no match for {}, ignored".format(old_dir))
            continue
        max_count = max(counts.values())
        max_count_items = [x for x, y in list(counts.items())
                           if y == max_count]

        if len(max_count_items) > 1:
            E.warn("multiple matches for {}, ignored".format(old_dir))
            continue

        new_dir = max_count_items[0]

        options.stdout.write("\t".join(
            map(str, (old_dir, new_dir, max_count))) + "\n")

        if os.path.exists(new_dir):
            raise ValueError("directory {} already exists".format(new_dir))

        if options.dry_run:
            continue

        if options.link:
            os.symlink(old_dir, new_dir)
        else:
            os.rename(old_dir, new_dir)

    E.stop()
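
# Illustrative sketch, not part of the original module: the directory
# matching above is a majority vote over shared properties. Every shared
# (key, value) pair counts as one vote for a candidate directory; the
# unique top scorer wins, and ties or empty tallies are rejected as
# ambiguous. The helper name and doctest values below are hypothetical.
def _example_best_match(votes):
    """Return the single best-scoring candidate from an iterable of
    candidate names, or None if there is no unique winner.

    >>> _example_best_match(["a.dir", "a.dir", "b.dir"])
    'a.dir'
    >>> _example_best_match(["a.dir", "b.dir"]) is None
    True
    """
    import collections
    counts = collections.Counter(votes)
    if not counts:
        return None
    max_count = max(counts.values())
    best = [name for name, c in counts.items() if c == max_count]
    return best[0] if len(best) == 1 else None
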
def main(argv=None):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-r", "--restrict-regex", dest="restrict_regex", action="append",
        help="pattern to restrict tests to certain tools/metrics. "
        "Can be specified multiple times [%default]")

    parser.add_option(
        "--data-directory", dest="data_directory",
        help="directory with sample data sets. This will override the "
        "default datadir in the configuration file and the environment "
        "variable DAISY_TEST_DATADIR [%default]")

    parser.add_option(
        "--library-directory", dest="library_directories", action="append",
        help="directory with TaskLibrary functions. Will be added to the "
        "built-in library and the one specified in the DAISY_TASKLIBRARY "
        "environment variable [%default]")

    parser.add_option(
        "--always-mount", dest="always_mount", action="store_true",
        help="force mounting of arvados keep [%default]")

    parser.add_option(
        "--keep-failed-temp", dest="keep_failed_temp", action="store_true",
        help="keep temporary files of failed tests [%default]")

    parser.set_defaults(
        restrict_regex=[],
        always_mount=False,
        data_directory=None,
        keep_failed_temp=False,
        library_directories=[],
    )

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    P.get_parameters()

    # load the built-in tests
    filenames = [
        os.path.join(os.path.dirname(os.path.dirname(__file__)),
                     "TaskLibrary", "test_task_library.yml")]
    if "DAISY_TASKLIBRARY" in os.environ:
        filenames.append(
            os.path.join(os.environ["DAISY_TASKLIBRARY"],
                         "test_task_library.yml"))
    # user-supplied libraries are expected to follow the same layout
    filenames.extend(
        os.path.join(x, "test_task_library.yml")
        for x in options.library_directories)

    master_config = None
    for fn in filenames:
        if not os.path.exists(fn):
            E.warn("file {} does not exist".format(fn))
            continue
        with IOTools.open_file(fn) as inf:
            raw_txt = inf.read()
            test_config = yaml.safe_load(raw_txt)
            if test_config is None:
                E.warn("file {} is empty".format(fn))
                continue

            data_directory = os.environ.get(
                "DAISY_TEST_DATADIR",
                test_config.get("data_directory"))

            if options.data_directory:
                data_directory = options.data_directory

            # reload config with placeholders replaced
            if data_directory:
                test_config = yaml.safe_load(
                    re.sub("DATADIR", data_directory, raw_txt))

            if master_config is None:
                master_config = test_config
            else:
                # add additional tool/test metrics
                master_config["tool"].update(test_config.get("tool", {}))
                master_config["metric"].update(test_config.get("metric", {}))

    for test_section, testclass, map_name_to_runner in [
            ("tool", TestTool, map_tool_to_runner),
            ("metric", TestMetric, map_metric_to_runner)]:

        ignore = master_config[test_section].get("ignore", [])
        # propagate config variables
        testclass.test_config = master_config

        for task, taskf in sorted(map_name_to_runner.items()):
            found = False
            for to_ignore in ignore:
                if re.match(to_ignore, task):
                    found = True
            if found:
                continue
            if options.restrict_regex:
                take = False
                for x in options.restrict_regex:
                    if re.search(x, task):
                        take = True
                if not take:
                    continue
            add_tests(task, taskf, testclass)

    failed = False
    with arvados_enabled(always_mount=options.always_mount):
        for testclass in [TestTool, TestMetric]:
            suite = unittest.TestLoader().loadTestsFromTestCase(testclass)
            result = unittest.TextTestRunner(verbosity=2).run(suite)
            failed |= not result.wasSuccessful()

            # remove all tests in the test class - necessary if this
            # function is called repeatedly
            clear_tests(testclass)

    E.stop()
    return failed
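
# Illustrative sketch (assumption, not the original helper): the two-pass
# YAML load above exists because the data directory is only known after the
# first parse. The file is parsed once to discover ``data_directory``, then
# re-parsed with every literal ``DATADIR`` token substituted. The function
# name and sample values below are hypothetical.
def _example_load_with_datadir(raw_txt, override=None):
    """Parse ``raw_txt`` twice, replacing DATADIR placeholders.

    >>> _example_load_with_datadir(
    ...     "reference: DATADIR/hg38.fa\\ndata_directory: /data/tests\\n")
    {'reference': '/data/tests/hg38.fa', 'data_directory': '/data/tests'}
    """
    import re
    import yaml
    config = yaml.safe_load(raw_txt) or {}
    data_directory = override or config.get("data_directory", ".")
    return yaml.safe_load(re.sub("DATADIR", data_directory, raw_txt))
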
def main(argv):
    options, args = P.parse_commandline(argv)

    if options.config_file:
        PARAMS = P.get_parameters(options.config_file)
    else:
        sys.exit(P.main(options, args))

    with arvados_enabled(always_mount=options.always_mount):
        mountpoint = PARAMS.get("mount_point", None)
        if mountpoint:
            redirect_defaults2mountpoint(mountpoint)

        with LibraryContext(PARAMS, options, args, argv, "daisy"):
            # A selection of command line arguments are added to PARAMS
            # as 'extras', as they are not implemented in ruffus 2.6.3.
            kwargs = collections.defaultdict(dict)
            if options.only_info:
                kwargs["extras"].update({'only_info': True})
                P.PARAMS["only_info"] = True
            if options.is_test:
                kwargs["extras"].update({'is_test': True})
                P.PARAMS["is_test"] = True

            E.debug("construction of workflow started")

            pipeline = ruffus.Pipeline('benchmark')

            # Tool execution
            suffix, tool_runners = add_tools_to_pipeline(
                pipeline,
                map_tool_to_runner,
                config=P.PARAMS,
                **kwargs)

            E.debug("added tools to workflow")

            # Optionally, add externally computed files as
            # pseudo-tools:
            if "external" in P.PARAMS["setup"]:
                external_runners = add_external_data_to_pipeline(
                    pipeline,
                    config=P.PARAMS,
                    **kwargs)
                tool_runners.extend(external_runners)

            # Optionally, combine tool runs into aggregate
            # outputs. The type of the output is preserved
            # (VCF -> VCF, etc.).
            # For example, call individual members in a trio
            # and then build a combined VCF to analyse mendelian
            # inconsistencies.
            if "collate" in P.PARAMS["setup"]:
                collate_runners = add_collations_to_pipeline(
                    pipeline,
                    map_collate_to_runner,
                    P.PARAMS["setup"]["collate"],
                    tasks=tool_runners,
                    config=P.PARAMS)
                if P.PARAMS["setup"].get("only_collate", False):
                    tool_runners = []
                if P.PARAMS["setup"].get("no_collate_metrics", False):
                    collate_runners = []
                E.debug("added collators to workflow")
            else:
                collate_runners = []

            # Optionally, split up the output before applying
            # additional analyses. The type of the output is preserved
            # (VCF -> VCF, etc.).
            # For example, identify false positives, false negatives
            # and true positives and collect metrics individually.
if "split" in P.PARAMS["setup"]: split_runners = add_splits_to_pipeline( pipeline, map_split_to_runner, tool_runners, P.PARAMS["setup"]["split"], tasks=tool_runners, config=P.PARAMS) if P.PARAMS["setup"].get("only_split", False): tool_runners = [] E.debug("added splitters to workflow ") else: split_runners = [] metric_runners = [] for prefix, r in zip( ["tool", "collate", "split"], [tool_runners, collate_runners, split_runners]): if not r: continue metrics = None if prefix == "collate" and "collate_metrics" in P.PARAMS[ "setup"]: metrics = P.PARAMS["setup"]["collate_metrics"] elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["split_metrics"] elif "metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["metrics"] else: raise KeyError( "configuration file requires a 'setup:metrics' section" ) # Metric execution mm = add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner, r, suffix=suffix, prefix=prefix + "_", config=P.PARAMS, **kwargs) if len(mm) == 0: raise ValueError( "workflow construction error: " "no metric tasks result for metrics {}".format( metrics)) metric_runners.extend(mm) E.debug("added {}_metrics to workflow".format(prefix)) # add plot task if "aggregate" in P.PARAMS["setup"]: aggregate_metrics = add_collations_to_pipeline( pipeline, map_collate_to_runner, P.PARAMS["setup"]["aggregate"], metric_runners, config=P.PARAMS) E.debug("added metric aggregation to workflow") else: aggregate_metrics = [] add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics, P.PARAMS) E.debug("added upload to workflow".format(prefix)) # add export task export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"]) map_export2runner = { "collate": collate_runners, "tools": tool_runners, "split": split_runners } export_runners = [] for e in export: try: export_runners.extend(map_export2runner[e]) except KeyError: raise KeyError("unknown export section: {}".format(e)) add_export_to_pipeline(pipeline, export_runners, suffix=suffix, config=P.PARAMS) E.debug("added export to workflow") add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics) # Collate output files to facilitate analysis if "collation" in P.PARAMS: collators = add_collations_to_pipeline(pipeline, map_collate_to_runner, P.PARAMS["collation"], config=P.PARAMS) E.debug("construction of workflow completed") E.stop()