Example 1
    def save_benchmark(self, outfile, benchmark):

        if not isinstance(benchmark, list):
            benchmark = [benchmark]

        # flatten nested lists and remove None values
        benchmark = [
            x for x in IOTools.flatten(benchmark, ltypes=(list, ))
            if x is not None
        ]

        filename = self.build_meta_filename(outfile, "benchmark.bench")

        if not benchmark:
            E.warn("could not save benchmark info to {}".format(filename))
            return

        try:
            header = benchmark[0]._fields
        except AttributeError as ex:
            E.warn("could not save benchmark timings for {}:"
                   " {} from {}".format(outfile, str(ex), str(benchmark[0])))
            return

        with open(filename, "w") as outf:
            outf.write("\t".join(header) + "\n")
            for b in benchmark:
                outf.write("\t".join(map(str, b)) + "\n")
Example 2
def upload_result(infiles, outfile, *extras):
    """upload results into database.

    Connection details for the database are taken from the
    configuration dictionary given as the first argument in extras.
    The configuration dictionary should have an element 'database'
    with the required field ``url`` and the optional field
    ``schema``.  For example, to upload to an sqlite database in the
    current directory called csvdb, use::

        config = {"database": {"url": "sqlite:///./csvdb"}}

    Arguments
    ---------
    infiles: list
       List of files to upload. These should be the output
       of metric tasks in a benchmarking workflow.
    outfile: string
       Filename of the output file. On success, an empty output file
       is created.
    extras: list
       List of one element containing a configuration dictionary
       (see above).

    """

    logger = P.get_logger()

    if len(extras) != 1:
        raise ValueError("expecting only one extra argument "
                         "(configuration dictionary)")

    config = extras[0]

    url = config["database"]["url"]
    is_sqlite3 = url.startswith("sqlite")

    if is_sqlite3:
        connect_args = {'check_same_thread': False}
    else:
        connect_args = {}

    schema = config["database"].get("schema", None)
    # TODO: check if schema exists to avoid incomplete
    # transaction.

    engine = sqlalchemy.create_engine(url, connect_args=connect_args)

    # Catch exceptions in case the database is not available
    try:
        create_database(engine)
    except OperationalError as msg:
        logger.warn("could not connect to database at {}. "
                    "The data will not be uploaded. Msg={}".format(
                        url, str(msg)))
        return

    # Create schema if not exists
    if schema is not None:
        engine.execute(text("CREATE SCHEMA IF NOT EXISTS {}".format(schema)))

    pipeline_name = os.path.basename(sys.argv[0])
    logger.debug("uploading data to {}, schema={}".format(url, schema))
    # TODO: add dependencies
    # dependencies = infiles[1:]
    # meta_data = dict([("dependency{}".format(x), y) \
    #                  for x, y in enumerate(dependencies)])

    # need to derive the creation date somehow; this is important when
    # re-loading, as otherwise all times will be the same.
    if os.path.exists("benchmark.yml"):
        s = os.stat("benchmark.yml")
        created = datetime.datetime.fromtimestamp(s.st_mtime)
    else:
        created = datetime.datetime.now()

    benchmark_run = BenchmarkRun(
        author=os.environ.get("USER", "unknown"),
        # needs refactoring, should be: uploaded_at, created_at, run_at
        # uploaded_at=datetime.datetime.now(),
        created=created,
        pipeline_name=pipeline_name,
        pipeline_version=P.get_version(),
        pipeline_dir=os.getcwd(),
        title=config["title"],
        description=config["description"],
        config=json.dumps(config),
        config_hash=hash(json.dumps(config)),
        status="incomplete")

    Session = sessionmaker(bind=engine)
    session = Session()
    session.add(benchmark_run)
    session.commit()

    for tag in config["tags"]:
        benchmark_tag = BenchmarkTag(run_id=benchmark_run.id, tag=tag)
        session.add(benchmark_tag)
    session.commit()

    tool_dirs = set()

    table_cache = TableCache(engine, schema, is_sqlite3)

    for infile in infiles:

        path, name = os.path.split(infile)

        # walk up the path to find "benchmark.info" as it might be
        # located on a higher level if the tool output multiple files.
        parts = path.split(os.sep)

        info_paths = []
        rootdir = os.getcwd()
        while len(parts):
            p = os.path.join(*parts)
            if p == rootdir:
                break
            if os.path.exists(os.path.join(p, "benchmark.info")):
                info_paths.append(p)
            parts.pop()
        info_paths = info_paths[::-1]

        # the level of nesting determines the layout:
        # 1 level: aggregation: tool == metric
        # 2 levels: tool + metric
        # 3 levels: tool + split + metric
        if len(info_paths) not in (1, 2, 3):
            raise ValueError("for {}, expected two or three paths with info, "
                             "got {}".format(infile, len(info_paths)))

        meta_data = {}

        if len(info_paths) == 1:
            tool_dir = metric_dir = info_paths[0]
            split_dir = None
        elif len(info_paths) == 2:
            tool_dir, metric_dir = info_paths
            split_dir = None
            # If there are multiple output files in aggregation, use
            # intermediate paths as split_subset factors.
            td = len(tool_dir.split(os.sep))
            tm = len(metric_dir.split(os.sep))
            d = tm - td
            if d > 1:
                meta_data["split_subset"] = re.sub(
                    ".dir", "", os.sep.join(metric_dir.split(os.sep)[td:-1]))
        elif len(info_paths) == 3:
            tool_dir, split_dir, metric_dir = info_paths

        if tool_dir:
            d = read_data(os.path.join(tool_dir, "benchmark.info"),
                          prefix="tool_")
            if "tool_action" in d:
                assert d["tool_action"] == "tool"
            meta_data.update(d)

        if metric_dir:
            d = read_data(os.path.join(metric_dir, "benchmark.info"),
                          prefix="metric_")
            if "metric_action" in d:
                # ignore splits, they will be added through metrics
                if d["metric_action"] == "split":
                    continue
                assert d["metric_action"] == "metric", \
                    "action for metric info {} is not 'metric', but '{}'" \
                    .format(os.path.join(metric_dir, "benchmark.info"),
                            d["metric_action"])
            meta_data.update(d)

        if split_dir:
            d = read_data(os.path.join(split_dir, "benchmark.info"),
                          prefix="split_")
            if "split_action" in d:
                assert d["split_action"] == "split"
            meta_data.update(d)
            subset = os.path.basename(os.path.dirname(info_paths[-1]))
            if subset.endswith(".dir"):
                subset = subset[:-len(".dir")]
            meta_data["split_subset"] = subset

        # tool_input_files is either a list of dictionaries with a
        # "path" entry (tool output) or a simple list of filenames
        # (aggregation).
        try:
            tool_input_files = [
                x["path"] for x in meta_data["tool_input_files"]
            ]
        except TypeError:
            tool_input_files = meta_data["tool_input_files"]

        try:
            instance = BenchmarkInstance(
                run_id=benchmark_run.id,
                completed=datetime.datetime.fromtimestamp(
                    os.path.getmtime(infile)),
                input=",".join(tool_input_files),
                input_alias=meta_data["tool_input_alias"],
                tool_name=meta_data["tool_name"],
                tool_version=meta_data["tool_version"],
                tool_options=meta_data["tool_options"],
                tool_hash=meta_data["tool_option_hash"],
                tool_alias=meta_data.get("tool_alias", ""),
                metric_name=meta_data["metric_name"],
                metric_version=meta_data["metric_version"],
                metric_options=meta_data["metric_options"],
                metric_hash=meta_data["metric_option_hash"],
                metric_alias=meta_data.get("metric_alias", ""),
                split_name=meta_data.get("split_name", ""),
                split_version=meta_data.get("split_version", ""),
                split_options=meta_data.get("split_options", ""),
                split_hash=meta_data.get("split_option_hash", ""),
                split_alias=meta_data.get("split_alias", ""),
                split_subset=meta_data.get("split_subset", "all"),
                meta_data=json.dumps(meta_data))
        except KeyError as e:
            raise KeyError("missing required attribute {} in {}".format(
                str(e), str(meta_data)))

        session.add(instance)
        session.commit()

        # avoid uploading tool timings multiple times
        if tool_dir and tool_dir not in tool_dirs:
            tool_dirs.add(tool_dir)
            save_benchmark_timings(tool_dir, "tool_timings", engine, instance,
                                   schema, is_sqlite3)

        save_benchmark_timings(metric_dir, "metric_timings", engine, instance,
                               schema, is_sqlite3)

        metric_table_filter = None
        if "metric_no_upload" in meta_data:
            if meta_data["metric_no_upload"] == "*":
                logger.warn("upload turned off for metric {}".format(
                    meta_data["metric_name"]))
                continue
            else:
                metric_table_filter = re.compile(meta_data["metric_no_upload"])

        # A metric can produce multiple output files, each with its
        # own table name. Tables are added into schemas to avoid
        # cluttering the public namespace.
        # (if the metric only produces blobs, there is no metric
        # output file)
        if "metric_output_files" in meta_data:
            assert len(meta_data["metric_output_files"]) == \
                len(meta_data["metric_tablenames"])

            for output_file, tablename in zip(meta_data["metric_output_files"],
                                              meta_data["metric_tablenames"]):

                if metric_table_filter and metric_table_filter.search(
                        tablename):
                    logger.warn(
                        "upload for table {} turned off".format(tablename))
                    continue

                if not os.path.exists(output_file):
                    logger.warn(
                        "output file {} does not exist - ignored".format(
                            output_file))
                    continue

                if IOTools.is_empty(output_file):
                    logger.warn("output file {} is empty - ignored".format(
                        output_file))
                    continue

                try:
                    table = pandas.read_csv(output_file,
                                            sep="\t",
                                            comment="#",
                                            skip_blank_lines=True)
                except ValueError as e:
                    logger.warn("table {} can not be read: {}".format(
                        output_file, str(e)))
                    continue
                except pandas.parser.CParserError as e:
                    logger.warn(
                        "malformatted table {} can not be read: {}".format(
                            output_file, str(e)))
                    continue

                if len(table) == 0:
                    logger.warn(
                        "table {} is empty - ignored".format(output_file))
                    continue

                tablename, table, dtypes = transform_table_before_upload(
                    tablename, table, instance, meta_data, table_cache)

                if schema is None:
                    tn = tablename
                else:
                    tn = "{}.{}".format(schema, tablename)

                logger.debug("saving data from {} to table {}".format(
                    output_file, tn))
                # add foreign key
                table["instance_id"] = instance.id
                table_cache.add_table(table, tablename, dtypes)

        if "metric_blob_globs" in meta_data:
            metric_dir = meta_data["metric_outdir"]
            files = [
                glob.glob(os.path.join(metric_dir, x))
                for x in meta_data["metric_blob_globs"]
            ]
            files = IOTools.flatten(files)
            logger.debug("uploading binary data in {} files from {} to "
                         "table binary_data".format(len(files), metric_dir))
            table = []
            for fn in files:
                with IOTools.open_file(fn, "rb") as inf:
                    data_row = BenchmarkBinaryData(
                        instance_id=instance.id,
                        filename=os.path.basename(fn),
                        path=fn,
                        data=inf.read())
                    session.add(data_row)
                session.commit()

    table_cache.close()
    touch(outfile)

    # upload table sizes
    df_sizes = pandas.DataFrame.from_records(
        list(table_cache.uploaded_sizes.items()),
        columns=["tablename", "bytes_uploaded"])
    df_sizes["bytes_resident"] = df_sizes.bytes_uploaded
    df_sizes["run_id"] = benchmark_run.id
    df_sizes["schema"] = schema
    save_table(df_sizes,
               engine,
               "metric_storage",
               schema=None,
               is_sqlite3=is_sqlite3)

    # check if arvados job
    if Arvados.have_arvados():
        try:
            arv_job_info = arvados.current_job()
        except KeyError:
            arv_job_info = None

        if arv_job_info is not None:
            arv_job = BenchmarkArvadosJob(
                run_id=benchmark_run.id,
                job_uuid=arv_job_info["uuid"],
                owner_uuid=arv_job_info["owner_uuid"])
            session.add(arv_job)
            session.commit()

    benchmark_run.status = "complete"
    session.commit()

    engine.dispose()
    del engine

    logger.info("uploaded results under run_id {}".format(benchmark_run.id))