Example 1
  def testParseExecutionPropertiesMapsInputBaseUri(self):
    properties_pb = pipeline_pb2.ExecutorInput()
    properties_pb.inputs.parameters[
        'input_base_uri'].string_value = 'gs://input/base'
    self.assertDictEqual(
        {'input_base': 'gs://input/base'},
        kubeflow_v2_entrypoint_utils.parse_execution_properties(
            properties_pb.inputs.parameters))
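
A minimal usage sketch, assuming the same pipeline_pb2 and kubeflow_v2_entrypoint_utils modules as the test above; the 'num_examples' key and the expected result are hypothetical illustrations, while the 'input_base_uri' to 'input_base' rename is what the test asserts.

# Hypothetical sketch (not from the original source): parameters of mixed
# types are flattened into a plain Python dict of exec_properties.
properties_pb = pipeline_pb2.ExecutorInput()
properties_pb.inputs.parameters['input_base_uri'].string_value = 'gs://input/base'
properties_pb.inputs.parameters['num_examples'].int_value = 1000  # assumed key
exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
    properties_pb.inputs.parameters)
# Expected, per the test above plus the assumed pass-through of other keys:
# {'input_base': 'gs://input/base', 'num_examples': 1000}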
Example 2
def main(args):
    executor_input = pipeline_spec_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

    name_from_id = {}

    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        executor_input.inputs.parameters)
    outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        executor_input.outputs.artifacts, name_from_id)

    _run_driver(exec_properties, outputs_dict,
                executor_input.outputs.output_file, name_from_id)
Example 3
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--json_serialized_invocation_args',
                        type=str,
                        required=True,
                        help='JSON-serialized metadata for this execution.')
    args, _ = parser.parse_known_args(argv)

    executor_input = pipeline_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

    name_from_id = {}

    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        executor_input.inputs.parameters)
    outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        executor_input.outputs.artifacts, name_from_id)

    _run_driver(exec_properties, outputs_dict,
                executor_input.outputs.output_file, name_from_id)
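
Example 3's main takes raw argv, so a module-level bootstrap is implied but not shown; a minimal sketch, assuming main is defined as above:

if __name__ == '__main__':
    import sys

    # Forward the command-line flags (minus the program name) to main() above.
    main(sys.argv[1:])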
Example 4
  def testParseExecutionProperties(self):
    self.assertDictEqual(
        _EXEC_PROPERTIES,
        kubeflow_v2_entrypoint_utils.parse_execution_properties(
            self._properties))
Example 5
def _run_executor(args: argparse.Namespace, beam_args: List[str]) -> None:
  """Selects a particular executor and run it based on name.

  Args:
    args:
      --executor_class_path: The import path of the executor class.
      --json_serialized_invocation_args: Full JSON-serialized parameters for
        this execution.
    beam_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration options
      for apache-beam and tensorflow.logging.
    For more information about the Beam arguments, please refer to:
    https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  """
  logging.set_verbosity(logging.INFO)

  # Rehydrate inputs/outputs/exec_properties from the serialized metadata.
  executor_input = pipeline_spec_pb2.ExecutorInput()
  json_format.Parse(
      args.json_serialized_invocation_args,
      executor_input,
      ignore_unknown_fields=True)

  inputs_dict = executor_input.inputs.artifacts
  outputs_dict = executor_input.outputs.artifacts
  inputs_parameter = executor_input.inputs.parameters

  if fileio.exists(executor_input.outputs.output_file):
    # It has a driver that outputs the updated exec_properties in this file.
    with fileio.open(executor_input.outputs.output_file,
                     'rb') as output_meta_json:
      output_metadata = pipeline_spec_pb2.ExecutorOutput()
      json_format.Parse(
          output_meta_json.read(), output_metadata, ignore_unknown_fields=True)
      # Append/Overwrite exec_properties.
      for k, v in output_metadata.parameters.items():
        inputs_parameter[k].CopyFrom(v)

  name_from_id = {}

  inputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
      inputs_dict, name_from_id)
  outputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
      outputs_dict, name_from_id)
  exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
      inputs_parameter)
  logging.info('Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
               args.executor_class_path, inputs, outputs, exec_properties)
  executor_cls = import_utils.import_class_by_path(args.executor_class_path)
  if issubclass(executor_cls, base_beam_executor.BaseBeamExecutor):
    executor_context = base_beam_executor.BaseBeamExecutor.Context(
        beam_pipeline_args=beam_args, unique_id='', tmp_dir='/tmp')
  else:
    executor_context = base_executor.BaseExecutor.Context(
        extra_flags=beam_args, unique_id='', tmp_dir='/tmp')
  executor = executor_cls(executor_context)
  logging.info('Starting executor')
  executor.Do(inputs, outputs, exec_properties)

  # TODO(b/182316162): Unify publisher handling so that post-execution artifact
  # logic is more cleanly handled.
  outputs_utils.tag_output_artifacts_with_version(outputs)  # pylint: disable=protected-access

  # TODO(b/169583143): Remove this workaround when TFX migrates to use str-typed
  # id/name to identify artifacts.
  # Convert ModelBlessing artifact to use managed MLMD resource name.
  if (issubclass(executor_cls, evaluator_executor.Executor) and
      standard_component_specs.BLESSING_KEY in outputs):
    # Parse the parent prefix for managed MLMD resource name.
    kubeflow_v2_entrypoint_utils.refactor_model_blessing(
        artifact_utils.get_single_instance(
            outputs[standard_component_specs.BLESSING_KEY]), name_from_id)

  # Log the output metadata to a file so that it can be picked up by MP.
  metadata_uri = executor_input.outputs.output_file
  executor_output = pipeline_spec_pb2.ExecutorOutput()
  for k, v in kubeflow_v2_entrypoint_utils.translate_executor_output(
      outputs, name_from_id).items():
    executor_output.artifacts[k].CopyFrom(v)

  fileio.makedirs(os.path.dirname(metadata_uri))
  with fileio.open(metadata_uri, 'wb') as f:
    f.write(json_format.MessageToJson(executor_output))
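
_run_executor consumes an already-parsed argparse.Namespace, so a thin command-line wrapper is implied; the sketch below is an assumption about its shape (the flag names come from the docstring above), using parse_known_args so that unrecognized flags pass through as Beam arguments.

def main(argv):
  # Hypothetical wrapper, not the original entrypoint: parse the two flags
  # _run_executor expects and forward any remaining flags to Beam.
  parser = argparse.ArgumentParser()
  parser.add_argument('--executor_class_path', type=str, required=True)
  parser.add_argument('--json_serialized_invocation_args', type=str, required=True)
  args, beam_args = parser.parse_known_args(argv)
  _run_executor(args, beam_args)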
Example 6
def _run_driver(executor_input: pipeline_spec_pb2.ExecutorInput) -> None:
    """Runs the driver, writing its output as an ExecutorOutput proto.

    The main goal of this driver is to calculate the span and fingerprint of
    the input data, allowing the executor invocation to be skipped if the
    ExampleGen component has previously been run on the same data with the same
    configuration. The span and fingerprint are added as new custom execution
    properties to an ExecutorOutput proto and written to a GCS path. The CAIP
    Pipelines system reads this file and updates MLMD with the new execution
    properties.

    Args:
      executor_input: pipeline_spec_pb2.ExecutorInput that contains TFX
        artifacts and exec_properties information.
    """

    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        executor_input.inputs.parameters)
    name_from_id = {}
    outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        executor_input.outputs.artifacts, name_from_id)
    # A path at which an ExecutorOutput message will be
    # written with updated execution properties and output artifacts. The CAIP
    # Pipelines service will update the task's properties and artifacts prior to
    # running the executor.
    output_metadata_uri = executor_input.outputs.output_file

    logging.set_verbosity(logging.INFO)
    logging.info('exec_properties = %s\noutput_metadata_uri = %s',
                 exec_properties, output_metadata_uri)

    input_base_uri = exec_properties.get(
        standard_component_specs.INPUT_BASE_KEY)

    input_config = example_gen_pb2.Input()
    proto_utils.json_to_proto(
        exec_properties[standard_component_specs.INPUT_CONFIG_KEY],
        input_config)

    range_config = None
    range_config_entry = exec_properties.get(
        standard_component_specs.RANGE_CONFIG_KEY)
    if range_config_entry:
        range_config = range_config_pb2.RangeConfig()
        proto_utils.json_to_proto(range_config_entry, range_config)

    processor = input_processor.FileBasedInputProcessor(
        input_base_uri, input_config.splits, range_config)
    span, version = processor.resolve_span_and_version()
    fingerprint = processor.get_input_fingerprint(span, version)

    logging.info('Calculated span: %s', span)
    logging.info('Calculated fingerprint: %s', fingerprint)

    exec_properties[utils.SPAN_PROPERTY_NAME] = span
    exec_properties[utils.FINGERPRINT_PROPERTY_NAME] = fingerprint
    exec_properties[utils.VERSION_PROPERTY_NAME] = version

    # Updates the input_config.splits.pattern.
    for split in input_config.splits:
        split.pattern = processor.get_pattern_for_span_version(
            split.pattern, span, version)
    exec_properties[standard_component_specs.
                    INPUT_CONFIG_KEY] = proto_utils.proto_to_json(input_config)

    if standard_component_specs.EXAMPLES_KEY not in outputs_dict:
        raise ValueError(
            'Example artifact was missing in the ExampleGen outputs.')
    example_artifact = artifact_utils.get_single_instance(
        outputs_dict[standard_component_specs.EXAMPLES_KEY])

    driver.update_output_artifact(
        exec_properties=exec_properties,
        output_artifact=example_artifact.mlmd_artifact)

    # Log the output metadata file
    output_metadata = pipeline_spec_pb2.ExecutorOutput()
    output_metadata.parameters[utils.SPAN_PROPERTY_NAME].int_value = span
    output_metadata.parameters[
        utils.FINGERPRINT_PROPERTY_NAME].string_value = fingerprint
    if version is not None:
        output_metadata.parameters[
            utils.VERSION_PROPERTY_NAME].int_value = version
    output_metadata.parameters[
        standard_component_specs.
        INPUT_CONFIG_KEY].string_value = proto_utils.proto_to_json(
            input_config)
    output_metadata.artifacts[
        standard_component_specs.EXAMPLES_KEY].artifacts.add().CopyFrom(
            kubeflow_v2_entrypoint_utils.to_runtime_artifact(
                example_artifact, name_from_id))

    fileio.makedirs(os.path.dirname(output_metadata_uri))
    with fileio.open(output_metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(output_metadata, sort_keys=True))
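
The ExecutorOutput written at output_metadata_uri is the same file the executor entrypoint in Example 5 checks with fileio.exists; a small, hypothetical helper mirroring that read-back branch:

def _read_driver_output(output_metadata_uri):
    # Hypothetical helper (mirrors the fileio.exists(...) branch in Example 5):
    # rehydrate the ExecutorOutput proto that the driver wrote above.
    output_metadata = pipeline_spec_pb2.ExecutorOutput()
    with fileio.open(output_metadata_uri, 'rb') as f:
        json_format.Parse(f.read(), output_metadata, ignore_unknown_fields=True)
    return output_metadata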
Example 7
def _run_executor(args: argparse.Namespace, beam_args: List[str]) -> None:
    """Selects a particular executor and runs it based on name.

    Args:
      args:
        --executor_class_path: The import path of the executor class.
        --json_serialized_invocation_args: Full JSON-serialized parameters for
          this execution. See go/mp-alpha-placeholder for details.
      beam_args: Optional parameter that maps to the optional_pipeline_args
        parameter in the pipeline, which provides additional configuration
        options for apache-beam and tensorflow.logging.
      For more information about the Beam arguments, please refer to:
      https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
    """
    logging.set_verbosity(logging.INFO)

    # Rehydrate inputs/outputs/exec_properties from the serialized metadata.
    executor_input = pipeline_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

    inputs_dict = executor_input.inputs.artifacts
    outputs_dict = executor_input.outputs.artifacts
    inputs_parameter = executor_input.inputs.parameters

    name_from_id = {}

    inputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        inputs_dict, name_from_id)
    outputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        outputs_dict, name_from_id)
    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        inputs_parameter)
    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=beam_args, unique_id='')
    executor = executor_cls(executor_context)
    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # TODO(b/169583143): Remove this workaround when TFX migrates to use str-typed
    # id/name to identify artifacts.
    # Convert ModelBlessing artifact to use managed MLMD resource name.
    if (issubclass(executor_cls, evaluator_executor.Executor)
            and constants.BLESSING_KEY in outputs):
        # Parse the parent prefix for managed MLMD resource name.
        kubeflow_v2_entrypoint_utils.refactor_model_blessing(
            artifact_utils.get_single_instance(
                outputs[constants.BLESSING_KEY]), name_from_id)

    # Log the output metadata to a file so that it can be picked up by MP.
    metadata_uri = executor_input.outputs.output_file
    executor_output = pipeline_pb2.ExecutorOutput()
    for k, v in kubeflow_v2_entrypoint_utils.translate_executor_output(
            outputs, name_from_id).items():
        executor_output.artifacts[k].CopyFrom(v)

    # Ensure the file is flushed and closed before the container exits.
    with fileio.open(metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(executor_output))