    def save_benchmark(self, outfile, benchmark):

        if not isinstance(benchmark, list):
            benchmark = [benchmark]

        # flatten if nested list and remove None
        benchmark = [x for x in IOTools.flatten(benchmark, ltypes=(list,))
                     if x is not None]

        filename = self.build_meta_filename(outfile, "benchmark.bench")

        if not benchmark:
            E.warn("could not save benchmark info to {}".format(filename))
            return

        try:
            header = benchmark[0]._fields
        except AttributeError as ex:
            E.warn("could not save benchmark timings for {}:"
                   " {} from {}".format(outfile, str(ex), str(benchmark[0])))
            return

        with open(filename, "w") as outf:
            outf.write("\t".join(header) + "\n")
            for b in benchmark:
                outf.write("\t".join(map(str, b)) + "\n")
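    # A minimal sketch of the kind of record save_benchmark() expects:
    # any namedtuple-like object exposing ``_fields``, such as the
    # per-job resource-usage tuples collected by the pipeline. The
    # field names and values below are illustrative only.
    #
    #     from collections import namedtuple
    #     BenchmarkData = namedtuple(
    #         "BenchmarkData", ("task", "wall_t", "user_t", "sys_t"))
    #     record = BenchmarkData("align", 12.3, 10.1, 0.7)
    #     self.save_benchmark("align.bam", record)
    #     # -> writes a tab-separated file with a header line built from
    #     #    _fields and one row per record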
def upload_result(infiles, outfile, *extras):
    """upload results into database.

    Connection details for the database are taken from the
    configuration dictionary given as the first element in
    `extras`. The configuration dictionary should have an element
    'database' with the required field ``url`` and the optional
    field ``schema``.

    For example, to upload to an sqlite database in the current
    directory called csvdb, use::

        config = {"database": {"url": "sqlite:///./csvdb"}}

    Arguments
    ---------
    infiles: list
       List of files to upload. These should be the output
       of metric tasks in a benchmarking workflow.
    outfile: output file
       On success, an empty output file is created.
    extras: list
       List of one element containing a configuration dictionary
       (see above).
    """

    logger = P.get_logger()

    if len(extras) != 1:
        raise ValueError("expecting only one extra argument "
                         "(configuration dictionary)")

    config = extras[0]

    url = config["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")

    if is_sqlite3:
        connect_args = {'check_same_thread': False}
    else:
        connect_args = {}

    schema = config["database"].get("schema", None)
    # TODO: check if schema exists to avoid incomplete
    # transaction.

    engine = sqlalchemy.create_engine(url, connect_args=connect_args)

    # Catch exceptions in case the database is not available
    try:
        create_database(engine)
    except OperationalError as msg:
        logger.warn("could not connect to database at {}. "
                    "The data will not be uploaded. Msg={}".format(
                        url, str(msg)))
        return

    # Create schema if it does not exist
    if schema is not None:
        engine.execute(
            text("CREATE SCHEMA IF NOT EXISTS {}".format(schema)))

    pipeline_name = os.path.basename(sys.argv[0])

    logger.debug("uploading data to {}, schema={}".format(url, schema))
    # TODO: add dependencies
    # dependencies = infiles[1:]
    # meta_data = dict([("dependency{}".format(x), y)
    #                   for x, y in enumerate(dependencies)])

    # need to set the created date explicitly, important when re-loading
    # as otherwise all times would be the same.
    if os.path.exists("benchmark.yml"):
        s = os.stat("benchmark.yml")
        created = datetime.datetime.fromtimestamp(s.st_mtime)
    else:
        created = datetime.datetime.now()

    benchmark_run = BenchmarkRun(
        author=os.environ.get("USER", "unknown"),
        # needs refactoring, should be: uploaded_at, created_at, run_at
        # uploaded_at=datetime.datetime.now(),
        created=created,
        pipeline_name=pipeline_name,
        pipeline_version=P.get_version(),
        pipeline_dir=os.getcwd(),
        title=config["title"],
        description=config["description"],
        config=json.dumps(config),
        config_hash=hash(json.dumps(config)),
        status="incomplete")

    Session = sessionmaker(bind=engine)
    session = Session()
    session.add(benchmark_run)
    session.commit()

    for tag in config["tags"]:
        benchmark_tag = BenchmarkTag(run_id=benchmark_run.id, tag=tag)
        session.add(benchmark_tag)

    session.commit()

    tool_dirs = set()

    table_cache = TableCache(engine, schema, is_sqlite3)

    for infile in infiles:

        path, name = os.path.split(infile)

        # walk up the path to find "benchmark.info" as it might be
        # located on a higher level if the tool output multiple files.
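        #
        # Illustrative (hypothetical) layouts relative to the working
        # directory and the number of benchmark.info files found on the
        # way up:
        #
        #   tool.dir/benchmark.info                       -> 1 level (aggregation)
        #   tool.dir/metric.dir/benchmark.info            -> 2 levels (tool + metric)
        #   tool.dir/split.dir/metric.dir/benchmark.info  -> 3 levels (tool + split + metric)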
        parts = path.split(os.sep)

        info_paths = []
        rootdir = os.getcwd()
        while len(parts):
            p = os.path.join(*parts)
            if p == rootdir:
                break
            if os.path.exists(os.path.join(p, "benchmark.info")):
                info_paths.append(p)
            parts.pop()
        info_paths = info_paths[::-1]

        # the level of nesting determines the layout:
        # 1 level: aggregation: tool == metric
        # 2 levels: tool + metric
        # 3 levels: tool + split + metric
        if len(info_paths) not in (1, 2, 3):
            raise ValueError(
                "for {}, expected one, two or three paths with "
                "benchmark.info, got {}".format(infile, len(info_paths)))

        meta_data = {}

        if len(info_paths) == 1:
            tool_dir = metric_dir = info_paths[0]
            split_dir = None
        elif len(info_paths) == 2:
            tool_dir, metric_dir = info_paths
            split_dir = None
            # If there are multiple output files in aggregation, use
            # intermediate paths as split_subset factors.
            td = len(tool_dir.split(os.sep))
            tm = len(metric_dir.split(os.sep))
            d = tm - td
            if d > 1:
                meta_data["split_subset"] = re.sub(
                    ".dir", "",
                    os.sep.join(metric_dir.split(os.sep)[td:-1]))
        elif len(info_paths) == 3:
            tool_dir, split_dir, metric_dir = info_paths

        if tool_dir:
            d = read_data(os.path.join(tool_dir, "benchmark.info"),
                          prefix="tool_")
            if "tool_action" in d:
                assert d["tool_action"] == "tool"
            meta_data.update(d)

        if metric_dir:
            d = read_data(os.path.join(metric_dir, "benchmark.info"),
                          prefix="metric_")
            if "metric_action" in d:
                # ignore splits, they will be added through metrics
                if d["metric_action"] == "split":
                    continue
                assert d["metric_action"] == "metric", \
                    "action for metric info {} is not 'metric', but '{}'".format(
                        os.path.join(metric_dir, "benchmark.info"),
                        d["metric_action"])
            meta_data.update(d)

        if split_dir:
            d = read_data(os.path.join(split_dir, "benchmark.info"),
                          prefix="split_")
            if "split_action" in d:
                assert d["split_action"] == "split"
            meta_data.update(d)
            subset = os.path.basename(os.path.dirname(info_paths[-1]))
            if subset.endswith(".dir"):
                subset = subset[:-len(".dir")]
            meta_data["split_subset"] = subset

        # tool_input_files can either be a dictionary if a tool
        # or a simple list if aggregation.
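        #
        # Hypothetical examples consistent with the try/except below
        # (the exact structure depends on how benchmark.info was written):
        #
        #   tool run:     [{"path": "/data/sample1.bam"}, {"path": "/data/sample2.bam"}]
        #   aggregation:  ["/data/sample1.bam", "/data/sample2.bam"]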
        try:
            tool_input_files = [x["path"]
                                for x in meta_data["tool_input_files"]]
        except TypeError:
            tool_input_files = meta_data["tool_input_files"]

        try:
            instance = BenchmarkInstance(
                run_id=benchmark_run.id,
                completed=datetime.datetime.fromtimestamp(
                    os.path.getmtime(infile)),
                input=",".join(tool_input_files),
                input_alias=meta_data["tool_input_alias"],
                tool_name=meta_data["tool_name"],
                tool_version=meta_data["tool_version"],
                tool_options=meta_data["tool_options"],
                tool_hash=meta_data["tool_option_hash"],
                tool_alias=meta_data.get("tool_alias", ""),
                metric_name=meta_data["metric_name"],
                metric_version=meta_data["metric_version"],
                metric_options=meta_data["metric_options"],
                metric_hash=meta_data["metric_option_hash"],
                metric_alias=meta_data.get("metric_alias", ""),
                split_name=meta_data.get("split_name", ""),
                split_version=meta_data.get("split_version", ""),
                split_options=meta_data.get("split_options", ""),
                split_hash=meta_data.get("split_option_hash", ""),
                split_alias=meta_data.get("split_alias", ""),
                split_subset=meta_data.get("split_subset", "all"),
                meta_data=json.dumps(meta_data))
        except KeyError as e:
            raise KeyError("missing required attribute {} in {}".format(
                str(e), str(meta_data)))

        session.add(instance)
        session.commit()

        # avoid multiple upload of tool data
        if tool_dir and tool_dir not in tool_dirs:
            tool_dirs.add(tool_dir)
            save_benchmark_timings(tool_dir,
                                   "tool_timings",
                                   engine, instance, schema, is_sqlite3)

        save_benchmark_timings(metric_dir,
                               "metric_timings",
                               engine, instance, schema, is_sqlite3)

        metric_table_filter = None
        if "metric_no_upload" in meta_data:
            if meta_data["metric_no_upload"] == "*":
                logger.warn("upload turned off for metric {}".format(
                    meta_data["metric_name"]))
                continue
            else:
                metric_table_filter = re.compile(
                    meta_data["metric_no_upload"])

        # multiple tablenames for multiple metric output
        #
        # Tables are added into schemas to avoid cluttering
        # the public namespace.
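        #
        # Hypothetical example of the parallel lists consumed below
        # (file and table names are made up for illustration):
        #
        #   metric_output_files = ["metric.dir/counts.tsv", "metric.dir/summary.tsv"]
        #   metric_tablenames   = ["counts", "summary"]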
        # (if only blobs, no metric output file)
        if "metric_output_files" in meta_data:
            assert len(meta_data["metric_output_files"]) == \
                len(meta_data["metric_tablenames"])

            for output_file, tablename in zip(
                    meta_data["metric_output_files"],
                    meta_data["metric_tablenames"]):

                if metric_table_filter and \
                   metric_table_filter.search(tablename):
                    logger.warn(
                        "upload for table {} turned off".format(tablename))
                    continue

                if not os.path.exists(output_file):
                    logger.warn(
                        "output file {} does not exist - ignored".format(
                            output_file))
                    continue

                if IOTools.is_empty(output_file):
                    logger.warn("output file {} is empty - ignored".format(
                        output_file))
                    continue

                try:
                    table = pandas.read_csv(output_file,
                                            sep="\t",
                                            comment="#",
                                            skip_blank_lines=True)
                except ValueError as e:
                    logger.warn("table {} can not be read: {}".format(
                        output_file, str(e)))
                    continue
                except pandas.parser.CParserError as e:
                    logger.warn(
                        "malformatted table {} can not be read: {}".format(
                            output_file, str(e)))
                    continue

                if len(table) == 0:
                    logger.warn(
                        "table {} is empty - ignored".format(output_file))
                    continue

                tablename, table, dtypes = transform_table_before_upload(
                    tablename, table, instance, meta_data, table_cache)

                if schema is None:
                    tn = tablename
                else:
                    tn = "{}.{}".format(schema, tablename)

                logger.debug("saving data from {} to table {}".format(
                    output_file, tn))
                # add foreign key
                table["instance_id"] = instance.id
                table_cache.add_table(table, tablename, dtypes)

        if "metric_blob_globs" in meta_data:
            metric_dir = meta_data["metric_outdir"]
            files = [glob.glob(os.path.join(metric_dir, x))
                     for x in meta_data["metric_blob_globs"]]
            files = IOTools.flatten(files)
            logger.debug("uploading binary data in {} files from {} to "
                         "table binary_data".format(len(files), metric_dir))
            table = []
            for fn in files:
                with IOTools.open_file(fn, "rb") as inf:
                    data_row = BenchmarkBinaryData(
                        instance_id=instance.id,
                        filename=os.path.basename(fn),
                        path=fn,
                        data=inf.read())
                    session.add(data_row)
                    session.commit()

    table_cache.close()
    touch(outfile)

    # upload table sizes
    df_sizes = pandas.DataFrame.from_records(
        list(table_cache.uploaded_sizes.items()),
        columns=["tablename", "bytes_uploaded"])
    df_sizes["bytes_resident"] = df_sizes.bytes_uploaded
    df_sizes["run_id"] = benchmark_run.id
    df_sizes["schema"] = schema
    save_table(df_sizes,
               engine,
               "metric_storage",
               schema=None,
               is_sqlite3=is_sqlite3)

    # check if arvados job
    if Arvados.have_arvados():
        try:
            arv_job_info = arvados.current_job()
        except KeyError:
            arv_job_info = None

        if arv_job_info is not None:
            arv_job = BenchmarkArvadosJob(
                run_id=benchmark_run.id,
                job_uuid=arv_job_info["uuid"],
                owner_uuid=arv_job_info["owner_uuid"])
            session.add(arv_job)
            session.commit()

    benchmark_run.status = "complete"
    session.commit()

    engine.dispose()
    del engine

    logger.info("uploaded results under run_id {}".format(benchmark_run.id))
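# A minimal usage sketch, assuming this function is wired into a
# pipeline task; the file names and the PARAMS dictionary below are
# hypothetical, but the keys match what upload_result() reads from
# the configuration ("database", "title", "description", "tags"):
#
#     PARAMS = {
#         "database": {"url": "sqlite:///./csvdb"},
#         "title": "test run",
#         "description": "example upload",
#         "tags": ["test"],
#     }
#     upload_result(["tool.dir/metric.dir/results.tsv"],
#                   "upload.done",
#                   PARAMS)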