Example #1
  def setUp(self):
    super(KubeflowV2EntrypointUtilsTest, self).setUp()
    _ARTIFACT_1.uri = 'gs://root/string/'
    # Hash value of
    # 'projects/123456789/locations/us-central1/metadataStores/default/artifacts/11111'
    _ARTIFACT_1.id = 9171918664759481579
    _ARTIFACT_1.set_string_custom_property(key='my_property_1',
                                           value='Test string.')
    _ARTIFACT_2.uri = 'gs://root/model/'
    # Hash value of
    # 'projects/123456789/locations/us-central1/metadataStores/default/artifacts/22222'
    _ARTIFACT_2.id = 6826273797600318744
    _ARTIFACT_2.set_int_custom_property(key='my_property_2', value=42)
    _ARTIFACT_3.uri = 'gs://root/examples/'
    # Hash value of
    # 'projects/123456789/locations/us-central1/metadataStores/default/artifacts/33333'
    _ARTIFACT_3.id = 27709763105391302
    self._expected_dict = {
        _KEY_1: [_ARTIFACT_1],
        _KEY_2: [_ARTIFACT_2],
        _KEY_3: [_ARTIFACT_3],
    }
    source_data_dir = os.path.join(os.path.dirname(__file__), 'testdata')
    # Use two protos to store the testdata.
    artifacts_pb = pipeline_pb2.ExecutorInput()
    io_utils.parse_json_file(
        os.path.join(source_data_dir, 'artifacts.json'), artifacts_pb)
    self._artifacts = artifacts_pb.inputs.artifacts
    properties_pb = pipeline_pb2.ExecutorInput()
    io_utils.parse_json_file(
        os.path.join(source_data_dir, 'exec_properties.json'),
        properties_pb)
    self._properties = properties_pb.inputs.parameters
Example #2
    def setUp(self):
        super().setUp()
        self._test_dir = tempfile.mkdtemp()

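        # Build the ExecutorInput proto under test: the output metadata path,
        # the input parameters, and the expected 'examples' output artifact.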
        self._executor_invocation = pipeline_pb2.ExecutorInput()
        self._executor_invocation.outputs.output_file = _TEST_OUTPUT_METADATA_JSON
        self._executor_invocation.inputs.parameters[
            'input_base_uri'].string_value = _TEST_INPUT_DIR
        self._executor_invocation.inputs.parameters[
            'input_config'].string_value = json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(name='s1',
                                                pattern='span{SPAN}/split1/*'),
                    example_gen_pb2.Input.Split(name='s2',
                                                pattern='span{SPAN}/split2/*')
                ]))
        self._executor_invocation.outputs.artifacts[
            'examples'].artifacts.append(
                pipeline_pb2.RuntimeArtifact(
                    type=pipeline_pb2.ArtifactTypeSchema(
                        instance_schema=compiler_utils.get_artifact_schema(
                            standard_artifacts.Examples()))))

        self._executor_invocation_from_file = fileio.open(
            os.path.join(os.path.dirname(__file__), 'testdata',
                         'executor_invocation.json'), 'r').read()
        self._expected_result_from_file = fileio.open(
            os.path.join(os.path.dirname(__file__), 'testdata',
                         'expected_output_metadata.json'), 'r').read()

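        # Switch to the temp test dir before creating the output metadata and
        # input directories.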
        self._olddir = os.getcwd()
        os.chdir(self._test_dir)
        fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
        fileio.makedirs(os.path.dirname(_TEST_INPUT_DIR))
Example #3
  def testParseExecutionPropertiesMapsInputBaseUri(self):
    properties_pb = pipeline_pb2.ExecutorInput()
    properties_pb.inputs.parameters[
        'input_base_uri'].string_value = 'gs://input/base'
    self.assertDictEqual(
        {'input_base': 'gs://input/base'},
        kubeflow_v2_entrypoint_utils.parse_execution_properties(
            properties_pb.inputs.parameters))
Example #4
def main(args):
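    # Deserialize the JSON invocation args into an ExecutorInput proto.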
    executor_input = pipeline_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

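    # Mapping of artifact id to name, shared by the parsing step and the driver.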
    name_from_id = {}

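    # Translate the proto parameter and artifact maps into the dicts the
    # driver expects.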
    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        executor_input.inputs.parameters)
    outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        executor_input.outputs.artifacts, name_from_id)

    _run_driver(exec_properties, outputs_dict,
                executor_input.outputs.output_file, name_from_id)
Example #5
  def setUp(self):
    self._executor_invocation = pipeline_pb2.ExecutorInput()
    self._executor_invocation.outputs.output_file = _TEST_OUTPUT_METADATA_JSON
    self._executor_invocation.inputs.parameters[
        'input_base_uri'].string_value = _TEST_INPUT_DIR
    self._executor_invocation.inputs.parameters[
        'input_config'].string_value = json_format.MessageToJson(
            example_gen_pb2.Input(splits=[
                example_gen_pb2.Input.Split(
                    name='s1', pattern='span{SPAN}/split1/*'),
                example_gen_pb2.Input.Split(
                    name='s2', pattern='span{SPAN}/split2/*')
            ]))
    self._executor_invocation.outputs.artifacts['examples'].artifacts.append(
        pipeline_pb2.RuntimeArtifact(
            type=pipeline_pb2.ArtifactTypeSchema(
                instance_schema=compiler_utils.get_artifact_schema(
                    standard_artifacts.Examples()))))

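    # Read the golden testdata: the serialized executor invocation and the
    # expected output metadata JSON.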
    self._executor_invocation_from_file = fileio.open(
        os.path.join(
            os.path.dirname(__file__), 'testdata', 'executor_invocation.json'),
        'r').read()

    logging.debug('Executor invocation under test: %s',
                  self._executor_invocation_from_file)
    self._expected_result_from_file = fileio.open(
        os.path.join(
            os.path.dirname(__file__), 'testdata',
            'expected_output_metadata.json'), 'r').read()
    logging.debug('Expecting output metadata JSON: %s',
                  self._expected_result_from_file)

    # The initialization of TempWorkingDirTestCase has to be called after all
    # the testdata files have been read. Otherwise the original testdata files
    # are not accessible after cwd is changed.
    super().setUp()

    fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
    fileio.makedirs(os.path.dirname(_TEST_INPUT_DIR))
Example #6
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('--json_serialized_invocation_args',
                        type=str,
                        required=True,
                        help='JSON-serialized metadata for this execution.')
    args, _ = parser.parse_known_args(argv)

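    # Rehydrate the ExecutorInput proto from the serialized invocation args.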
    executor_input = pipeline_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

    name_from_id = {}

    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        executor_input.inputs.parameters)
    outputs_dict = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        executor_input.outputs.artifacts, name_from_id)

    _run_driver(exec_properties, outputs_dict,
                executor_input.outputs.output_file, name_from_id)
Example #7
  def setUp(self):
    super().setUp()

    self._executor_invocation = pipeline_pb2.ExecutorInput()
    self._executor_invocation.outputs.output_file = _TEST_OUTPUT_METADATA_JSON
    self._executor_invocation.inputs.parameters[
        'input_base_uri'].string_value = _TEST_INPUT_DIR
    self._executor_invocation.inputs.parameters[
        'input_config'].string_value = json_format.MessageToJson(
            example_gen_pb2.Input(splits=[
                example_gen_pb2.Input.Split(
                    name='s1', pattern='span{SPAN}/split1/*'),
                example_gen_pb2.Input.Split(
                    name='s2', pattern='span{SPAN}/split2/*')
            ]))
    self._executor_invocation.outputs.artifacts['examples'].artifacts.append(
        pipeline_pb2.RuntimeArtifact(
            type=pipeline_pb2.ArtifactTypeSchema(
                instance_schema=compiler_utils.get_artifact_schema(
                    standard_artifacts.Examples()))))

    self._executor_invocation_from_file = fileio.open(
        os.path.join(
            os.path.dirname(__file__), 'testdata', 'executor_invocation.json'),
        'r').read()

    logging.debug('Executor invocation under test: %s',
                  self._executor_invocation_from_file)
    self._expected_result_from_file = fileio.open(
        os.path.join(
            os.path.dirname(__file__), 'testdata',
            'expected_output_metadata.json'), 'r').read()
    logging.debug('Expecting output metadata JSON: %s',
                  self._expected_result_from_file)

    # Change working directory after all the testdata files have been read.
    self.enter_context(test_case_utils.change_working_dir(self.tmp_dir))

    fileio.makedirs(os.path.dirname(_TEST_INPUT_DIR))
Example #8
def _run_executor(args: argparse.Namespace, beam_args: List[str]) -> None:
    """Selects a particular executor and run it based on name.

  Args:
    args:
      --executor_class_path: The import path of the executor class.
      --json_serialized_invocation_args: Full JSON-serialized parameters for
        this execution.
    beam_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration options
      for apache-beam and tensorflow.logging.
    For more about the beam arguments please refer to:
    https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  """
    logging.set_verbosity(logging.INFO)

    # Rehydrate inputs/outputs/exec_properties from the serialized metadata.
    executor_input = pipeline_pb2.ExecutorInput()
    json_format.Parse(args.json_serialized_invocation_args,
                      executor_input,
                      ignore_unknown_fields=True)

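    # Raw artifact and parameter maps carried in the ExecutorInput proto.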
    inputs_dict = executor_input.inputs.artifacts
    outputs_dict = executor_input.outputs.artifacts
    inputs_parameter = executor_input.inputs.parameters

    name_from_id = {}

    inputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        inputs_dict, name_from_id)
    outputs = kubeflow_v2_entrypoint_utils.parse_raw_artifact_dict(
        outputs_dict, name_from_id)
    exec_properties = kubeflow_v2_entrypoint_utils.parse_execution_properties(
        inputs_parameter)
    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=beam_args, unique_id='')
    executor = executor_cls(executor_context)
    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # TODO(b/169583143): Remove this workaround when TFX migrates to use str-typed
    # id/name to identify artifacts.
    # Convert ModelBlessing artifact to use managed MLMD resource name.
    if (issubclass(executor_cls, evaluator_executor.Executor)
            and BLESSING_KEY in outputs):
        # Parse the parent prefix for managed MLMD resource name.
        kubeflow_v2_entrypoint_utils.refactor_model_blessing(
            artifact_utils.get_single_instance(outputs[BLESSING_KEY]),
            name_from_id)

    # Log the output metadata to a file so that it can be picked up by MP.
    metadata_uri = executor_input.outputs.output_file
    executor_output = pipeline_pb2.ExecutorOutput()
    for k, v in kubeflow_v2_entrypoint_utils.translate_executor_output(
            outputs, name_from_id).items():
        executor_output.artifacts[k].CopyFrom(v)

    fileio.makedirs(os.path.dirname(metadata_uri))
    with fileio.open(metadata_uri, 'wb') as f:
        f.write(json_format.MessageToJson(executor_output))