Example #1
def _get_artifact_run_info_map(store: metadata_store.MetadataStore,
                               artifact_ids: List[int]) -> Dict[int, _RunInfo]:
    """Returns a dictionary mapping artifact_id to its MyOrchestrator run_id.

  Args:
    store: MetaDataStore object to connect to MLMD instance.
    artifact_ids: A list of artifact ids to load.

  Returns:
    A dictionary containing artifact_id as a key and MyOrchestrator run_id as value.
  """
    # Get events of artifacts.
    events = store.get_events_by_artifact_ids(artifact_ids)
    exec_to_artifact = {}
    for event in events:
        exec_to_artifact[event.execution_id] = event.artifact_id

    # Get the executions that produced these artifacts.
    executions = store.get_executions_by_id(list(exec_to_artifact.keys()))
    artifact_to_run_info = {}
    for execution in executions:
        run_id = execution.properties[RUN_ID_KEY].string_value
        component = execution.properties[_COMPONENT_ID].string_value
        artifact_id = exec_to_artifact[execution.id]
        artifact_to_run_info[artifact_id] = _RunInfo(
            run_id=run_id,
            component_name=component,
            started_at=execution.create_time_since_epoch)

    return artifact_to_run_info
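A minimal usage sketch for the helper above, assuming a local SQLite-backed MLMD database and that the module-level names (RUN_ID_KEY, _COMPONENT_ID, _RunInfo) are defined as in the surrounding module; the database path and artifact ids are hypothetical:

from ml_metadata.metadata_store import metadata_store
from ml_metadata.proto import metadata_store_pb2

# Connect to a local SQLite-backed MLMD database (path is hypothetical).
connection_config = metadata_store_pb2.ConnectionConfig()
connection_config.sqlite.filename_uri = '/tmp/metadata.sqlite'
connection_config.sqlite.connection_mode = (
    metadata_store_pb2.SqliteMetadataSourceConfig.READWRITE_OPENCREATE)
store = metadata_store.MetadataStore(connection_config)

# Look up run info for a couple of known artifact ids.
run_info_by_artifact = _get_artifact_run_info_map(store, artifact_ids=[1, 2])
for artifact_id, run_info in run_info_by_artifact.items():
    print(artifact_id, run_info.run_id, run_info.component_name)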
Example #2
def get_statisticsgen_dir_list(
        store: metadata_store.MetadataStore) -> List[str]:
    """Obtains a list of statisticsgen_dir from the store."""

    stats_artifacts = store.get_artifacts_by_type(_STATS)
    stat_dirs_list = [artifact.uri for artifact in stats_artifacts]
    return stat_dirs_list
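A small sketch of how the returned URIs might be inspected, assuming `store` is an already-connected MetadataStore (see the sketch after Example #1) and that each URI is a directory readable through tf.io.gfile:

import tensorflow as tf

stats_dirs = get_statisticsgen_dir_list(store)
for stats_dir in stats_dirs:
    # Each entry is the output URI recorded for a StatisticsGen artifact.
    if tf.io.gfile.exists(stats_dir):
        print(stats_dir, tf.io.gfile.listdir(stats_dir))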
Example #3
def get_model_dir_map(store: metadata_store.MetadataStore) -> Dict[str, str]:
    """Obtains a map of run_id to model_dir from the store."""

    evaluator_execs = store.get_executions_by_type(_EVALUATOR)

    def _go_up_2_levels(eval_model_dirs):
        model_dir_set = set()
        for eval_model_dir in eval_model_dirs:
            model_dir_set.add(os.sep.join(eval_model_dir.split(os.sep)[:-2]))
        return list(model_dir_set)

    def _eval_execs_to_model_dir_map(eval_execs):
        model_dir_map = {}
        for eval_exec in eval_execs:
            run_id = eval_exec.properties[_RUN_ID].string_value
            pipeline_root = eval_exec.properties[_PIPELINE_ROOT].string_value
            eval_component_id = eval_exec.properties[
                _COMPONENT_ID].string_value
            eval_config_path = os.path.join(pipeline_root, eval_component_id,
                                            'evaluation', str(eval_exec.id),
                                            'eval_config.json')

            with tf.io.gfile.GFile(eval_config_path, 'r') as f:
                eval_config = json.load(f)

            model_dir_map[run_id] = _go_up_2_levels(
                eval_config['modelLocations'].values())
        return model_dir_map

    return _eval_execs_to_model_dir_map(evaluator_execs)
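The nested `_go_up_2_levels` helper simply strips the last two path components from each eval model location; a standalone illustration of that step, using a hypothetical path:

import os

eval_model_dir = os.sep.join(
    ['', 'pipeline_root', 'Trainer', 'model', '7', 'Format-Serving'])
# Dropping the trailing two components yields the model directory itself.
model_dir = os.sep.join(eval_model_dir.split(os.sep)[:-2])
print(model_dir)  # /pipeline_root/Trainer/model (on POSIX)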
Example #4
def _get_kaggle_results(store: metadata_store.MetadataStore) -> _Result:
    """Returns the kaggle score detail from the KagglePublisher component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result objects with properties containing kaggle results.
  """
    results = {}
    property_names = set()
    kaggle_artifacts = store.get_artifacts_by_type(_KAGGLE_RESULT)
    for artifact in kaggle_artifacts:
        submit_info = {}
        for key, val in artifact.custom_properties.items():
            if key not in _DEFAULT_CUSTOM_PROPERTIES:
                name = _KAGGLE + '_' + key
                submit_info[name] = _parse_value(val)
        property_names = property_names.union(submit_info.keys())
        results[artifact.id] = submit_info

    artifact_to_run_info = _get_artifact_run_info_map(store,
                                                      list(results.keys()))

    properties = {}
    for artifact_id, submit_info in results.items():
        run_info = artifact_to_run_info[artifact_id]
        result_key = run_info.run_id + run_info.component_name.replace(
            _KAGGLE_PUBLISHER_PREFIX, '')
        properties[result_key] = submit_info

    property_names = property_names.difference(
        {_NAME, _PRODUCER_COMPONENT, _STATE, *_DEFAULT_COLUMNS})
    return _Result(properties=properties,
                   property_names=sorted(property_names))
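`_parse_value` is defined elsewhere in the module; a minimal sketch of what such a helper could look like, assuming it only needs to unwrap an MLMD `Value` proto into a plain Python value (an illustration, not the module's actual implementation):

from ml_metadata.proto import metadata_store_pb2


def _parse_value_sketch(value: metadata_store_pb2.Value):
    """Returns whichever of int_value/double_value/string_value is set."""
    field = value.WhichOneof('value')
    return getattr(value, field) if field else None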
Example #5
def _get_hparams(store: metadata_store.MetadataStore) -> _Result:
    """Returns the hparams of the EstimatorTrainer component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result objects with properties containing hparams.
  """
    results = {}
    hparam_names = set()

    trainer_execs = store.get_executions_by_type(_TRAINER)
    for ex in trainer_execs:
        run_id = ex.properties[RUN_ID_KEY].string_value
        hparams = _parse_hparams(ex.properties[_HPARAMS].string_value)
        hparam_names.update(hparams.keys())
        hparams[RUN_ID_KEY] = run_id
        trainer_id = ex.properties[_COMPONENT_ID].string_value.replace(
            _TRAINER_PREFIX, '')
        result_key = run_id + trainer_id
        hparams[BENCHMARK_KEY] = trainer_id[1:]  # Removing '.' prefix
        # BeamDagRunner uses iso format timestamp. See for details:
        # http://google3/third_party/py/tfx/orchestration/beam/beam_dag_runner.py
        try:
            hparams[STARTED_AT] = datetime.datetime.fromtimestamp(int(run_id))
        except ValueError:
            hparams[STARTED_AT] = run_id
        results[result_key] = hparams
    return _Result(properties=results, property_names=sorted(hparam_names))
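The try/except above accepts both epoch-seconds run ids and the ISO-format run ids produced by BeamDagRunner; a quick illustration with hypothetical values:

import datetime

for run_id in ('1615000000', '2021-03-06T01:46:40'):
    try:
        started_at = datetime.datetime.fromtimestamp(int(run_id))
    except ValueError:
        # Non-numeric run ids (e.g. ISO timestamps) are kept verbatim.
        started_at = run_id
    print(run_id, '->', started_at)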
Example #6
def _get_benchmark_results(store: metadata_store.MetadataStore) -> _Result:
    """Returns the benchmark results of the BenchmarkResultPublisher component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result objects with properties containing benchmark results.
  """
    metrics = {}
    property_names = set()
    publisher_artifacts = store.get_artifacts_by_type(
        br.BenchmarkResult.TYPE_NAME)
    for artifact in publisher_artifacts:
        evals = {}
        for key, val in artifact.custom_properties.items():
            evals[key] = _parse_value(val)
            # IR-based runners encode the run_id as the second ':'-separated
            # segment of the 'name' property.
            if key == 'name':
                new_id = _parse_value(val).split(':')
                if len(new_id) > 2:
                    evals[RUN_ID_KEY] = new_id[1]
        property_names = property_names.union(evals.keys())
        metrics[artifact.id] = evals

    artifact_to_run_info = _get_artifact_run_info_map(store,
                                                      list(metrics.keys()))

    properties = {}
    for artifact_id, evals in metrics.items():
        run_info = artifact_to_run_info[artifact_id]
        # create_time_since_epoch is in milliseconds; convert to seconds.
        started_at = run_info.started_at // 1000
        evals[STARTED_AT] = datetime.datetime.fromtimestamp(started_at)
        if RUN_ID_KEY not in metrics[artifact_id]:
            # Non-IR based runner.
            continue
        run_id = metrics[artifact_id][RUN_ID_KEY]

        result_key = run_id + '.' + evals[
            br.BenchmarkResult.BENCHMARK_NAME_KEY]
        if result_key in properties:
            properties[result_key].update(evals)
        else:
            properties[result_key] = {**evals}

    property_names = property_names.difference(
        {_NAME, _PRODUCER_COMPONENT, _STATE, *_DEFAULT_COLUMNS, _IS_IR_KEY})
    return _Result(properties=properties,
                   property_names=sorted(property_names))
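The 'name' handling above treats the run id as the second ':'-separated segment of the artifact name; a tiny illustration with a hypothetical IR-style name:

name = 'my_pipeline:1615000000:MyBenchmark.benchmark'
parts = name.split(':')
if len(parts) > 2:
    run_id = parts[1]
print(run_id)  # 1615000000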
Example #7
def _get_benchmark_results(store: metadata_store.MetadataStore) -> _Result:
    """Returns the benchmark results of the BenchmarkResultPublisher component.

  Args:
    store: MetaDataStore object to connect to MLMD instance.

  Returns:
    A _Result objects with properties containing benchmark results.
  """
    metrics = {}
    property_names = set()
    publisher_artifacts = store.get_artifacts_by_type(_BENCHMARK_RESULT)
    for artifact in publisher_artifacts:
        evals = {}
        for key, val in artifact.custom_properties.items():
            evals[key] = _parse_value(val)
        property_names = property_names.union(evals.keys())
        metrics[artifact.id] = evals

    artifact_to_run_info = _get_artifact_run_info_map(store,
                                                      list(metrics.keys()))

    properties = {}
    for artifact_id, evals in metrics.items():
        run_info = artifact_to_run_info[artifact_id]
        evals[RUN_ID_KEY] = run_info.run_id
        # BeamDagRunner uses iso format timestamp. See for details:
        # http://google3/third_party/py/tfx/orchestration/beam/beam_dag_runner.py
        try:
            evals[STARTED_AT] = datetime.datetime.fromtimestamp(
                int(run_info.run_id))
        except ValueError:
            evals[STARTED_AT] = run_info.run_id
        result_key = run_info.run_id + '.' + evals[BENCHMARK_KEY]
        properties[result_key] = evals

    property_names = property_names.difference(
        {_NAME, _PRODUCER_COMPONENT, _STATE, *_DEFAULT_COLUMNS})
    return _Result(properties=properties,
                   property_names=sorted(property_names))
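A hedged sketch of how the returned _Result might be tabulated, assuming _Result.properties is a dict keyed by '<run_id>.<benchmark>' with per-run metric dicts as values, `store` is a connected MetadataStore, and pandas is available:

import pandas as pd

result = _get_benchmark_results(store)
# One row per '<run_id>.<benchmark>' key, one column per recorded property.
df = pd.DataFrame.from_dict(result.properties, orient='index')
print(df.head())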