Example #1
    def pre_execution(
        self,
        input_dict: Dict[Text, types.Channel],
        output_dict: Dict[Text, types.Channel],
        exec_properties: Dict[Text, Any],
        driver_args: data_types.DriverArgs,
        pipeline_info: data_types.PipelineInfo,
        component_info: data_types.ComponentInfo,
    ) -> data_types.ExecutionDecision:
        input_artifacts = channel_utils.unwrap_channel_dict(input_dict)
        output_artifacts = channel_utils.unwrap_channel_dict(output_dict)

        # Generating missing output artifact URIs
        for name, artifacts in output_artifacts.items():
            for idx, artifact in enumerate(artifacts):
                if not artifact.uri:
                    suffix = str(idx + 1) if idx > 0 else ''
                    artifact.uri = os.path.join(
                        pipeline_info.pipeline_root,
                        'artifacts',
                        name + suffix,
                        'data',
                    )
                    fileio.makedirs(os.path.dirname(artifact.uri))

        return data_types.ExecutionDecision(input_artifacts, output_artifacts,
                                            exec_properties, 123, False)
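
The driver above lays out output URIs as <pipeline_root>/artifacts/<output name>/data and pre-creates the parent directories. Below is a minimal standalone sketch of that layout pattern, assuming the tfx.dsl.io.fileio module used throughout these examples; the pipeline root and output keys are hypothetical:

import os
from tfx.dsl.io import fileio

pipeline_root = '/tmp/my_pipeline'  # hypothetical
for name in ['examples', 'statistics']:  # hypothetical output keys
    uri = os.path.join(pipeline_root, 'artifacts', name, 'data')
    # fileio.makedirs creates the parent directory and any missing intermediates.
    fileio.makedirs(os.path.dirname(uri))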
Example #2
  def testRunStatisticsGen(self):
    # Prepare the paths
    test_data_dir = os.path.join(
        os.path.dirname(os.path.dirname(__file__)), 'components', 'testdata')
    output_data_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
        self._testMethodName)
    statistics_split_names_path = os.path.join(output_data_dir,
                                               'statistics.properties',
                                               'split_names')
    fileio.makedirs(output_data_dir)

    # Run StatisticsGen
    run_component.run_component(
        full_component_class_name='tfx.components.StatisticsGen',
        examples_uri=os.path.join(test_data_dir, 'csv_example_gen'),
        examples_split_names=artifact_utils.encode_split_names(
            ['train', 'eval']),
        statistics_path=output_data_dir,
        statistics_split_names_path=statistics_split_names_path,
    )

    # Check the statistics_gen outputs
    self.assertTrue(
        fileio.exists(
            os.path.join(output_data_dir, 'Split-train', 'FeatureStats.pb')))
    self.assertTrue(
        fileio.exists(
            os.path.join(output_data_dir, 'Split-eval', 'FeatureStats.pb')))
    self.assertTrue(os.path.exists(statistics_split_names_path))
    self.assertEqual(
        pathlib.Path(statistics_split_names_path).read_text(),
        '["train", "eval"]')
Example #3
    def setUp(self):
        super().setUp()
        self._test_dir = tempfile.mkdtemp()

        self._executor_invocation = pipeline_pb2.ExecutorInput()
        self._executor_invocation.outputs.output_file = _TEST_OUTPUT_METADATA_JSON
        self._executor_invocation.inputs.parameters[
            'input_base_uri'].string_value = _TEST_INPUT_DIR
        self._executor_invocation.inputs.parameters[
            'input_config'].string_value = json_format.MessageToJson(
                example_gen_pb2.Input(splits=[
                    example_gen_pb2.Input.Split(name='s1',
                                                pattern='span{SPAN}/split1/*'),
                    example_gen_pb2.Input.Split(name='s2',
                                                pattern='span{SPAN}/split2/*')
                ]))
        self._executor_invocation.outputs.artifacts[
            'examples'].artifacts.append(
                pipeline_pb2.RuntimeArtifact(
                    type=pipeline_pb2.ArtifactTypeSchema(
                        instance_schema=compiler_utils.get_artifact_schema(
                            standard_artifacts.Examples()))))

        self._executor_invocation_from_file = fileio.open(
            os.path.join(os.path.dirname(__file__), 'testdata',
                         'executor_invocation.json'), 'r').read()
        self._expected_result_from_file = fileio.open(
            os.path.join(os.path.dirname(__file__), 'testdata',
                         'expected_output_metadata.json'), 'r').read()

        self._olddir = os.getcwd()
        os.chdir(self._test_dir)
        fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
        fileio.makedirs(os.path.dirname(_TEST_INPUT_DIR))
Example #4
 def testDumpUiMetadata(self):
     trainer = Trainer(examples=Channel(type=standard_artifacts.Examples),
                       module_file='module_file',
                       train_args=trainer_pb2.TrainArgs(splits=['train'],
                                                        num_steps=100),
                       eval_args=trainer_pb2.EvalArgs(splits=['eval'],
                                                      num_steps=50))
     model_run = standard_artifacts.ModelRun()
     model_run.uri = 'model_run_uri'
     exec_info = data_types.ExecutionInfo(
         input_dict={},
         output_dict={'model_run': [model_run]},
         exec_properties={},
         execution_id='id')
     ui_metadata_path = os.path.join(
         os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
         self._testMethodName, 'json')
     fileio.makedirs(os.path.dirname(ui_metadata_path))
     container_entrypoint._dump_ui_metadata(trainer, exec_info,
                                            ui_metadata_path)
     with open(ui_metadata_path) as f:
         ui_metadata = json.load(f)
         self.assertEqual('tensorboard', ui_metadata['outputs'][-1]['type'])
         self.assertEqual('model_run_uri',
                          ui_metadata['outputs'][-1]['source'])
Example #5
def _copy_and_replace_placeholder_dir(
        src: Text, dst: Text, ignore_paths: Set[Text],
        replace_dict: Dict[Pattern[Text], Text]) -> None:
    """Copy a directory to destination path and replace the placeholders."""
    if not os.path.isdir(dst):
        if os.path.exists(dst):
            raise RuntimeError(
                'Cannot copy template directory {}: a file already exists at '
                'the destination.'.format(src))
        fileio.makedirs(dst)
    for f in os.listdir(src):
        src_path = os.path.join(src, f)
        dst_path = os.path.join(dst, f)
        if src_path in ignore_paths:
            continue

        if os.path.isdir(src_path):
            if f.startswith(
                    '_'):  # Excludes __pycache__ and other private folders.
                continue
            _copy_and_replace_placeholder_dir(src_path, dst_path, ignore_paths,
                                              replace_dict)
        else:  # a file.
            if f.endswith('.pyc'):  # Excludes .pyc
                continue
            _copy_and_replace_placeholder_file(src_path, dst_path,
                                               replace_dict)
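
A hedged usage sketch for _copy_and_replace_placeholder_dir above; it assumes the companion _copy_and_replace_placeholder_file helper is in scope, and the template path, destination, and placeholder pattern are hypothetical:

import re

replace_dict = {
    re.compile(r'my_template_pipeline'): 'my_pipeline',  # hypothetical placeholder
}
_copy_and_replace_placeholder_dir(
    src='/path/to/template_dir',   # hypothetical source template
    dst='/path/to/project_dir',    # created via fileio.makedirs when missing
    ignore_paths=set(),
    replace_dict=replace_dict)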
Example #6
    def _save_pipeline(self, pipeline_args: Dict[Text, Any]) -> None:
        """Creates/updates pipeline folder in the handler directory.

    Args:
      pipeline_args: Pipeline details obtained from DSL.
    """
        # Path to pipeline folder in Airflow.
        handler_pipeline_path = os.path.join(
            self._handler_home_dir, pipeline_args[labels.PIPELINE_NAME], '')

        # If updating pipeline, first delete pipeline directory.
        if fileio.exists(handler_pipeline_path):
            io_utils.delete_dir(handler_pipeline_path)

        # Dump pipeline_args to handler pipeline folder as json.
        fileio.makedirs(handler_pipeline_path)
        with open(os.path.join(handler_pipeline_path, 'pipeline_args.json'),
                  'w') as f:
            json.dump(pipeline_args, f)

        # Copy dsl to pipeline folder
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]
        io_utils.copy_file(
            pipeline_dsl_path,
            os.path.join(handler_pipeline_path,
                         os.path.basename(pipeline_dsl_path)))
Example #7
def copy_template(flags_dict: Dict[str, Any]) -> None:
    """Copy template flags_dict["model"] to flags_dict["dest_dir"].

  Copies all *.py and README files in specified template, and replace
  the content of the files.

  Args:
    flags_dict: Should have pipeline_name, model and dest_dir.
  """
    model = flags_dict[labels.MODEL]
    pipeline_name = _sanitize_pipeline_name(flags_dict[labels.PIPELINE_NAME])
    template_dir = os.path.join(_templates_src_dir(), model)
    if not os.path.isdir(template_dir):
        raise ValueError('Model {} does not exist.'.format(model))
    destination_dir = flags_dict[labels.DESTINATION_PATH]

    ignore_paths = {
        os.path.join(template_dir, x)
        for x in _IGNORE_FILE_PATHS.get(model, [])
    }
    replace_dict = {
        _IMPORT_FROM_PACKAGE: _IMPORT_FROM_LOCAL_DIR,
        _PLACEHOLDER_PIPELINE_NAME: pipeline_name,
    }
    _copy_and_replace_placeholder_dir(template_dir, destination_dir,
                                      ignore_paths, replace_dict)
    for additional_file in _ADDITIONAL_FILE_PATHS.get(model, []):
        dst_path = os.path.join(destination_dir, additional_file.dst)
        fileio.makedirs(os.path.dirname(dst_path))

        if additional_file.src.startswith(('http://', 'https://')):
            urllib.request.urlretrieve(additional_file.src, dst_path)
        else:
            src_path = os.path.join(_tfx_src_dir(), additional_file.src)
            fileio.copy(src_path, dst_path)
Example #8
def _prepare_output_paths(artifact: types.Artifact):
    """Create output directories for output artifact."""
    if fileio.exists(artifact.uri):
        msg = 'Output artifact uri %s already exists' % artifact.uri
        absl.logging.warning(msg)
        # TODO(b/158689199): We currently simply return as a short-term workaround
        # to unblock execution retries. A comprehensive solution to guarantee
        # idempotent executions is needed.
        return

    # TODO(b/147242148): Introduce principled artifact structure (directory
    # or file) definition.
    if isinstance(artifact, types.ValueArtifact):
        artifact_dir = os.path.dirname(artifact.uri)
    else:
        artifact_dir = artifact.uri

    # TODO(zhitaoli): Consider refactoring this out into something
    # which can handle permission bits.
    absl.logging.debug('Creating output artifact uri %s as directory',
                       artifact_dir)
    fileio.makedirs(artifact_dir)
    # TODO(b/147242148): Avoid special-casing the "split_names" property.
    if artifact.type.PROPERTIES and 'split_names' in artifact.type.PROPERTIES:
        split_names = artifact_utils.decode_split_names(artifact.split_names)
        for split in split_names:
            split_dir = os.path.join(artifact.uri, split)
            absl.logging.debug('Creating output split %s as directory',
                               split_dir)
            fileio.makedirs(split_dir)
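
A usage sketch for _prepare_output_paths above, using the standard_artifacts and artifact_utils modules that appear in other examples on this page; the artifact URI is hypothetical:

from tfx.types import artifact_utils, standard_artifacts

examples = standard_artifacts.Examples()
examples.uri = '/tmp/pipeline/examples/1'  # hypothetical output location
examples.split_names = artifact_utils.encode_split_names(['train', 'eval'])
# Creates the artifact directory plus one sub-directory per split, because the
# Examples artifact type declares a 'split_names' property.
_prepare_output_paths(examples)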
Example #9
    def testDumpUiMetadata(self):
        trainer = pipeline_pb2.PipelineNode()
        trainer.node_info.type.name = 'tfx.components.trainer.component.Trainer'
        model_run_out_spec = pipeline_pb2.OutputSpec(
            artifact_spec=pipeline_pb2.OutputSpec.ArtifactSpec(
                type=metadata_store_pb2.ArtifactType(
                    name=standard_artifacts.ModelRun.TYPE_NAME)))
        trainer.outputs.outputs['model_run'].CopyFrom(model_run_out_spec)

        model_run = standard_artifacts.ModelRun()
        model_run.uri = 'model_run_uri'
        exec_info = data_types.ExecutionInfo(
            input_dict={},
            output_dict={'model_run': [model_run]},
            exec_properties={},
            execution_id='id')
        ui_metadata_path = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName, 'json')
        fileio.makedirs(os.path.dirname(ui_metadata_path))
        container_entrypoint._dump_ui_metadata(trainer, exec_info,
                                               ui_metadata_path)
        with open(ui_metadata_path) as f:
            ui_metadata = json.load(f)
            self.assertEqual('tensorboard', ui_metadata['outputs'][-1]['type'])
            self.assertEqual('model_run_uri',
                             ui_metadata['outputs'][-1]['source'])
Example #10
    def testPipelineSchemaSuccessfulRun(self):
        # First create a pipeline.
        flags_dict = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: self.pipeline_path
        }
        handler = beam_handler.BeamHandler(flags_dict)
        handler.create_pipeline()

        flags_dict = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name,
        }
        handler = beam_handler.BeamHandler(flags_dict)
        # Create fake schema in pipeline root.
        component_output_dir = os.path.join(self.pipeline_root, 'SchemaGen')
        schema_path = base_driver._generate_output_uri(  # pylint: disable=protected-access
            component_output_dir, 'schema', 3)

        fileio.makedirs(schema_path)
        with open(os.path.join(schema_path, 'schema.pbtxt'), 'w') as f:
            f.write('SCHEMA')
        with self.captureWritesToStream(sys.stdout) as captured:
            handler.get_schema()
            curr_dir_path = os.path.abspath('schema.pbtxt')
            self.assertIn('Path to schema: {}'.format(curr_dir_path),
                          captured.contents())
            self.assertIn(
                '*********SCHEMA FOR {}**********'.format(
                    self.pipeline_name.upper()), captured.contents())
            self.assertTrue(fileio.exists(curr_dir_path))
Example #11
    def setUp(self):
        super(CliKubeflowEndToEndTest, self).setUp()

        # List of packages installed.
        self._pip_list = pip_utils.get_package_names()

        # Check if Kubeflow is installed before running E2E tests.
        if labels.KUBEFLOW_PACKAGE_NAME not in self._pip_list:
            sys.exit('Kubeflow not installed.')

        # Change the encoding for Click since Python 3 is configured to use ASCII as
        # encoding for the environment.
        if codecs.lookup(locale.getpreferredencoding()).name == 'ascii':
            os.environ['LANG'] = 'en_US.utf-8'

        # Initialize CLI runner.
        self.runner = click_testing.CliRunner()

        # Testdata path.
        self._testdata_dir = os.path.join(
            os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
            'testdata')
        self._testdata_dir_updated = self.tmp_dir
        fileio.makedirs(self._testdata_dir_updated)

        self._pipeline_name = ('cli-kubeflow-e2e-test-' +
                               test_utils.generate_random_id())
        absl.logging.info('Pipeline name is %s' % self._pipeline_name)
        self._pipeline_name_v2 = self._pipeline_name + '_v2'

        orig_pipeline_path = os.path.join(self._testdata_dir,
                                          'test_pipeline_kubeflow_1.py')
        self._pipeline_path = os.path.join(self._testdata_dir_updated,
                                           'test_pipeline_kubeflow_1.py')
        self._pipeline_path_v2 = os.path.join(self._testdata_dir_updated,
                                              'test_pipeline_kubeflow_2.py')

        test_utils.copy_and_change_pipeline_name(
            orig_pipeline_path, self._pipeline_path,
            'chicago_taxi_pipeline_kubeflow', self._pipeline_name)
        self.assertTrue(fileio.exists(self._pipeline_path))
        test_utils.copy_and_change_pipeline_name(
            orig_pipeline_path, self._pipeline_path_v2,
            'chicago_taxi_pipeline_kubeflow', self._pipeline_name_v2)
        self.assertTrue(fileio.exists(self._pipeline_path_v2))

        # Endpoint URL
        self._endpoint = self._get_endpoint(
            subprocess.check_output(
                'kubectl describe configmap inverse-proxy-config -n kubeflow'.
                split()))
        absl.logging.info('ENDPOINT: ' + self._endpoint)

        self._pipeline_package_path = '{}.tar.gz'.format(self._pipeline_name)

        try:
            # Create a kfp client for cleanup after running commands.
            self._client = kfp.Client(host=self._endpoint)
        except kfp_server_api.rest.ApiException as err:
            absl.logging.info(err)
Example #12
    def testRunSchemaGen(self):
        # Prepare the paths
        test_data_dir = os.path.join(
            os.path.dirname(os.path.dirname(__file__)), 'components',
            'testdata')
        output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', tempfile.mkdtemp()),
            self._testMethodName)
        fileio.makedirs(output_data_dir)

        # Run SchemaGen
        run_component.run_component(
            full_component_class_name='tfx.components.SchemaGen',
            # Testing that we can specify input artifact paths
            statistics_path=os.path.join(test_data_dir, 'statistics_gen'),
            # Testing that we can specify artifact properties
            statistics_split_names=artifact_utils.encode_split_names(
                ['train', 'eval']),
            # Testing that we can pass arguments for non-string properties
            infer_feature_shape='1',
            # Testing that we can specify output artifact paths
            schema_path=os.path.join(output_data_dir),
        )

        # Checking the schema_gen outputs
        self.assertTrue(
            fileio.exists(os.path.join(output_data_dir, 'schema.pbtxt')))
Example #13
 def setUp(self):
     super(UtilsTest, self).setUp()
     # Create input splits.
     test_dir = os.path.join(
         os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
         self._testMethodName)
     self._input_base_path = os.path.join(test_dir, 'input_base')
     fileio.makedirs(self._input_base_path)
Example #14
def copy_file(src: Text, dst: Text, overwrite: bool = False):
    """Copies a single file from source to destination."""

    if overwrite and fileio.exists(dst):
        fileio.remove(dst)
    dst_dir = os.path.dirname(dst)
    fileio.makedirs(dst_dir)
    fileio.copy(src, dst, overwrite=overwrite)
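
A brief usage sketch for copy_file above; both paths are hypothetical:

# Missing parent directories of the destination are created automatically,
# and overwrite=True removes an existing destination before copying.
copy_file('/tmp/src/config.yaml', '/tmp/project/nested/config.yaml', overwrite=True)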
Example #15
    def run(self,
            pipeline: tfx_pipeline.Pipeline,
            parameter_values: Optional[Dict[Text, Any]] = None,
            write_out: Optional[bool] = True) -> Dict[Text, Any]:
        """Compiles a pipeline DSL object into pipeline file.

    Args:
      pipeline: TFX pipeline object.
      parameter_values: mapping from runtime parameter names to its values.
      write_out: set to True to actually write out the file to the place
        designated by output_dir and output_filename. Otherwise return the
        JSON-serialized pipeline job spec.

    Returns:
      Returns the JSON pipeline job spec.

    Raises:
      RuntimeError: if trying to write out to a place occupied by an existing
      file.
    """
        # TODO(b/166343606): Support user-provided labels.
        # TODO(b/169095387): Deprecate .run() method in favor of the unified API
        # client.
        display_name = (self._config.display_name
                        or pipeline.pipeline_info.pipeline_name)
        pipeline_spec = pipeline_builder.PipelineBuilder(
            tfx_pipeline=pipeline,
            default_image=self._config.default_image,
            default_commands=self._config.default_commands).build()
        pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__)
        pipeline_spec.schema_version = _SCHEMA_VERSION
        runtime_config = pipeline_builder.RuntimeConfigBuilder(
            pipeline_info=pipeline.pipeline_info,
            parameter_values=parameter_values).build()
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}):
            result = pipeline_spec_pb2.PipelineJob(
                display_name=display_name
                or pipeline.pipeline_info.pipeline_name,
                labels=telemetry_utils.get_labels_dict(),
                runtime_config=runtime_config)
        result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
        pipeline_json_dict = json_format.MessageToDict(result)
        if write_out:
            if fileio.exists(
                    self._output_dir) and not fileio.isdir(self._output_dir):
                raise RuntimeError('Output path %s points to a file.' %
                                   self._output_dir)
            if not fileio.exists(self._output_dir):
                fileio.makedirs(self._output_dir)

            with fileio.open(
                    os.path.join(self._output_dir, self._output_filename),
                    'wb') as f:
                f.write(json.dumps(pipeline_json_dict, sort_keys=True))

        return pipeline_json_dict
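
The write-out branch above follows a common pattern: reject an output path that is an existing file, create the directory if needed, then write the serialized spec. A minimal, hedged version of just that pattern, with hypothetical paths and a placeholder payload:

import json
import os
from tfx.dsl.io import fileio

output_dir = '/tmp/pipeline_output'  # hypothetical
if fileio.exists(output_dir) and not fileio.isdir(output_dir):
    raise RuntimeError('Output path %s points to a file.' % output_dir)
if not fileio.exists(output_dir):
    fileio.makedirs(output_dir)
with fileio.open(os.path.join(output_dir, 'pipeline.json'), 'w') as f:
    f.write(json.dumps({'displayName': 'placeholder'}, sort_keys=True))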
Example #16
 def _get_tmp_dir(self) -> Text:
     """Get the temporary directory path."""
     if not self._context:
         raise RuntimeError('No context for the executor')
     tmp_path = self._context.get_tmp_path()
     if not fileio.exists(tmp_path):
         absl.logging.info('Creating temp directory at %s', tmp_path)
         fileio.makedirs(tmp_path)
     return tmp_path
Example #17
  def _save_pipeline(self,
                     pipeline_args: Dict[Text, Any],
                     update: bool = False) -> None:
    """Creates/updates pipeline folder in the handler directory."""
    pipeline_name = pipeline_args[labels.PIPELINE_NAME]

    # Path to pipeline folder.
    handler_pipeline_path = os.path.join(self._handler_home_dir, pipeline_name)

    pipeline_package_path = self.flags_dict[labels.PIPELINE_PACKAGE_PATH]

    if update:
      pipeline_id = self._get_pipeline_id(pipeline_name)
      # A timestamp will be appended for the uniqueness of `version_name`.
      version_name = '{}_{}'.format(pipeline_name,
                                    time.strftime('%Y%m%d%H%M%S'))
      upload_response = self._client.pipeline_uploads.upload_pipeline_version(
          uploadfile=pipeline_package_path,
          name=version_name,
          pipelineid=pipeline_id)
      pipeline_version_id = upload_response.id

      experiment_id = self._get_experiment_id(pipeline_name)
    else:  # creating a new pipeline.
      upload_response = self._client.upload_pipeline(
          pipeline_package_path=pipeline_package_path,
          pipeline_name=pipeline_name)
      pipeline_id = upload_response.id
      pipeline_version_id = upload_response.default_version.id

      # Create experiment with pipeline name as experiment name.
      experiment_name = pipeline_name
      experiment_id = self._client.create_experiment(experiment_name).id

    # Display the link to the pipeline detail page in KFP UI.
    click.echo(upload_response)
    click.echo('Please access the pipeline detail page at '
               '{prefix}/#/pipelines/details/{pipeline_id}'.format(
                   prefix=self._client._get_url_prefix(),  # pylint: disable=protected-access
                   pipeline_id=pipeline_id))

    # Add pipeline details to pipeline_args.
    pipeline_args[labels.PIPELINE_NAME] = pipeline_name
    pipeline_args[labels.PIPELINE_ID] = pipeline_id
    pipeline_args[labels.PIPELINE_VERSION_ID] = pipeline_version_id
    pipeline_args[labels.PIPELINE_PACKAGE_PATH] = pipeline_package_path
    pipeline_args[labels.EXPERIMENT_ID] = experiment_id

    # Path to pipeline_args.json .
    pipeline_args_path = os.path.join(handler_pipeline_path,
                                      'pipeline_args.json')

    # Copy pipeline_args to pipeline folder.
    fileio.makedirs(handler_pipeline_path)
    with open(pipeline_args_path, 'w') as f:
      json.dump(pipeline_args, f)
Example #18
 def side_effect(cmd, stdout, stderr):
     self.assertLen(cmd, 3)
     self.assertEqual(sys.executable, cmd[0])
     self.assertEqual('sdist', cmd[2])
     self.assertEqual(stdout, stderr)
     setup_file = cmd[1]
     dist_dir = os.path.join(os.path.dirname(setup_file), 'dist')
     fileio.makedirs(dist_dir)
     dest_file = os.path.join(dist_dir, expected_package)
     fileio.copy(test_file, dest_file)
Example #19
def write_tfrecord_file(file_name: Text, proto: Message) -> None:
    """Writes a serialized tfrecord to file."""
    try:
        import tensorflow as tf  # pylint: disable=g-import-not-at-top
    except ModuleNotFoundError as e:
        raise Exception(
            'TensorFlow must be installed to use this functionality.') from e
    fileio.makedirs(os.path.dirname(file_name))
    with tf.io.TFRecordWriter(file_name) as writer:
        writer.write(proto.SerializeToString())
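
A usage sketch for write_tfrecord_file above; it assumes TensorFlow is installed, and the output path is hypothetical:

import tensorflow as tf

# Any protobuf Message works; an empty tf.train.Example is used purely for
# illustration. Parent directories are created by fileio.makedirs before writing.
write_tfrecord_file('/tmp/records/data.tfrecord', tf.train.Example())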
Example #20
 def setUp(self):
     super(BaseDriverTest, self).setUp()
     self._mock_metadata = tf.compat.v1.test.mock.Mock()
     self._input_dict = {
         'input_data':
         types.Channel(type=_InputArtifact,
                       artifacts=[_InputArtifact()],
                       producer_component_id='c',
                       output_key='k'),
         'input_string':
         types.Channel(type=standard_artifacts.String,
                       artifacts=[
                           standard_artifacts.String(),
                           standard_artifacts.String()
                       ],
                       producer_component_id='c2',
                       output_key='k2'),
     }
     input_dir = os.path.join(
         os.environ.get('TEST_TMP_DIR', self.get_temp_dir()),
         self._testMethodName, 'input_dir')
     # valid input artifacts must have a uri pointing to an existing directory.
     for key, input_channel in self._input_dict.items():
         for index, artifact in enumerate(input_channel.get()):
             artifact.id = index + 1
             uri = os.path.join(input_dir, key, str(artifact.id))
             artifact.uri = uri
             fileio.makedirs(uri)
     self._output_dict = {
         'output_data':
         types.Channel(type=_OutputArtifact, artifacts=[_OutputArtifact()]),
         'output_multi_data':
         types.Channel(type=_OutputArtifact,
                       matching_channel_name='input_string')
     }
     self._input_artifacts = channel_utils.unwrap_channel_dict(
         self._input_dict)
     self._output_artifacts = channel_utils.unwrap_channel_dict(
         self._output_dict)
     self._exec_properties = {
         'key': 'value',
     }
     self._execution_id = 100
     self._execution = metadata_store_pb2.Execution()
     self._execution.id = self._execution_id
     self._context_id = 123
     self._driver_args = data_types.DriverArgs(enable_cache=True)
     self._pipeline_info = data_types.PipelineInfo(
         pipeline_name='my_pipeline_name',
         pipeline_root=os.environ.get('TEST_TMP_DIR', self.get_temp_dir()),
         run_id='my_run_id')
     self._component_info = data_types.ComponentInfo(
         component_type='a.b.c',
         component_id='my_component_id',
         pipeline_info=self._pipeline_info)
Example #21
    def setUp(self):
        super().setUp()
        self._source_data_dir = os.path.join(
            os.path.dirname(
                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))),
            'components', 'testdata')
        self._output_data_dir = os.path.join(
            os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
            self._testMethodName)
        fileio.makedirs(self._output_data_dir)
        self._model_export = standard_artifacts.Model()
        self._model_export.uri = os.path.join(self._source_data_dir,
                                              'trainer/current')
        self._model_blessing = standard_artifacts.ModelBlessing()
        self._input_dict = {
            standard_component_specs.MODEL_KEY: [self._model_export],
            standard_component_specs.MODEL_BLESSING_KEY:
            [self._model_blessing],
        }

        self._model_push = standard_artifacts.PushedModel()
        self._model_push.uri = os.path.join(self._output_data_dir,
                                            'model_push')
        fileio.makedirs(self._model_push.uri)
        self._output_dict = {
            standard_component_specs.PUSHED_MODEL_KEY: [self._model_push],
        }
        # Dict format of exec_properties. custom_config needs to be serialized
        # before being passed into Do function.
        self._exec_properties = {
            'custom_config': {
                constants.SERVING_ARGS_KEY: {
                    'model_name': 'model_name',
                    'project_id': 'project_id'
                },
            },
            'push_destination': None,
        }
        self._container_image_uri_vertex = 'gcr.io/path/to/container'
        # Dict format of exec_properties for Vertex. custom_config needs to be
        # serialized before being passed into Do function.
        self._exec_properties_vertex = {
            'custom_config': {
                constants.SERVING_ARGS_KEY: {
                    'endpoint_name': 'endpoint_name',
                    'project_id': 'project_id',
                },
                constants.VERTEX_CONTAINER_IMAGE_URI_KEY:
                self._container_image_uri_vertex,
                constants.VERTEX_REGION_KEY: 'us-central1',
                constants.ENABLE_VERTEX_KEY: True,
            },
            'push_destination': None,
        }
        self._executor = executor.Executor()
Example #22
 def get_stateful_working_directory(self):
     """Generates stateful working directory given execution id."""
     # TODO(b/150979622): We should introduce an id that is not changed across
     # retries of the same component run to provide better isolation between
     # "retry" and "new execution". When it is available, introduce it into the
     # stateful working directory.
     stateful_working_dir = os.path.join(self._node_dir,
                                         self._pipeline_run_id,
                                         _STATEFUL_WORKING_DIR)
     fileio.makedirs(stateful_working_dir)
     return stateful_working_dir
Example #23
 def createFiles(self, dir_spec, base_dir=None):
   if base_dir is None:
     base_dir = self._base_dir
   for key, value in dir_spec.items():
     full_path = os.path.join(base_dir, key)
     if isinstance(value, str):
       io_utils.write_string_file(full_path, value)
     elif isinstance(value, dict):
       fileio.makedirs(full_path)
       self.createFiles(value, base_dir=full_path)
     else:
       raise TypeError(f'Invalid directory spec: {dir_spec}')
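
A hedged usage sketch for the createFiles test helper above, meant to run inside the same test case (it assumes self._base_dir is prepared by the fixture); the directory spec is hypothetical:

dir_spec = {
    'README.md': 'sample project\n',  # string value -> file with this content
    'data': {                         # dict value -> sub-directory
        'train.csv': 'a,b,c\n',
    },
}
self.createFiles(dir_spec)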
Example #24
  def testGetRun(self):
    # Create a pipeline in beam home.
    handler_pipeline_path = os.path.join(
        os.environ['BEAM_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
    fileio.makedirs(handler_pipeline_path)

    # Now run the pipeline
    flags_dict = {labels.ENGINE_FLAG: self.engine, labels.RUN_ID: self.run_id}
    handler = beam_handler.BeamHandler(flags_dict)
    with self.captureWritesToStream(sys.stdout) as captured:
      handler.get_run()
    self.assertIn('Not supported for beam orchestrator.', captured.contents())
Example #25
 def setUp(self):
     super().setUp()
     self.tmp_dir = os.path.join(
         os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
         self._testMethodName)
     fileio.makedirs(self.tmp_dir)
     # TODO(b/176196624): Delete following block when we drop support for TF<2.4.
     # Manually set up exit_stack because absltest.TestCase.setUp() is not called
     # in TF<2.4.
     if self._exit_stack is None:
         self._exit_stack = contextlib.ExitStack()
         self.addCleanup(self._exit_stack.close)
Example #26
  def testTerminateRun(self):
    # Create a pipeline in local home.
    handler_pipeline_path = os.path.join(
        os.environ['LOCAL_HOME'], self.pipeline_args[labels.PIPELINE_NAME])
    fileio.makedirs(handler_pipeline_path)

    # Now run the pipeline
    flags_dict = {labels.ENGINE_FLAG: self.engine, labels.RUN_ID: self.run_id}
    handler = local_handler.LocalHandler(flags_dict)
    with self.captureWritesToStream(sys.stdout) as captured:
      handler.terminate_run()
    self.assertIn('Not supported for local orchestrator.', captured.contents())
Example #27
    def testGetSchema(self):
        flags_dict = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_DSL_PATH: self.pipeline_path,
            labels.ENDPOINT: self.endpoint,
            labels.IAP_CLIENT_ID: self.iap_client_id,
            labels.NAMESPACE: self.namespace,
            labels.PIPELINE_PACKAGE_PATH: self.pipeline_package_path
        }
        handler = kubeflow_handler.KubeflowHandler(flags_dict)
        handler.create_pipeline()

        flags_dict = {
            labels.ENGINE_FLAG: self.engine,
            labels.PIPELINE_NAME: self.pipeline_name,
        }

        # No pipeline root
        handler = kubeflow_handler.KubeflowHandler(flags_dict)
        with self.assertRaises(SystemExit) as err:
            handler.get_schema()
        self.assertEqual(
            str(err.exception),
            'Create a run before inferring schema. If pipeline is already running, then wait for it to successfully finish.'
        )

        # No SchemaGen output.
        fileio.makedirs(self.pipeline_root)
        with self.assertRaises(SystemExit) as err:
            handler.get_schema()
        self.assertEqual(
            str(err.exception),
            'Either SchemaGen component does not exist or pipeline is still running. If pipeline is running, then wait for it to successfully finish.'
        )

        # Successful pipeline run.
        # Create fake schema in pipeline root.
        component_output_dir = os.path.join(self.pipeline_root, 'SchemaGen')
        schema_path = base_driver._generate_output_uri(  # pylint: disable=protected-access
            component_output_dir, 'schema', 3)
        fileio.makedirs(schema_path)
        with open(os.path.join(schema_path, 'schema.pbtxt'), 'w') as f:
            f.write('SCHEMA')
        with self.captureWritesToStream(sys.stdout) as captured:
            handler.get_schema()
            curr_dir_path = os.path.join(os.getcwd(), 'schema.pbtxt')
            self.assertIn('Path to schema: {}'.format(curr_dir_path),
                          captured.contents())
            self.assertIn(
                '*********SCHEMA FOR {}**********'.format(
                    self.pipeline_name.upper()), captured.contents())
            self.assertTrue(fileio.exists(curr_dir_path))
Example #28
    def setUp(self):
        # Prepare executor input.
        serialized_metadata = self._get_text_from_test_data(
            "executor_invocation.json")
        metadata_json = json.loads(serialized_metadata)
        # Mutate the outputFile field.
        metadata_json["outputs"]["outputFile"] = _TEST_OUTPUT_METADATA_JSON
        self._serialized_metadata = json.dumps(metadata_json)

        self._expected_output = json.loads(
            self._get_text_from_test_data("expected_output_metadata.json"))

        super(KubeflowV2RunExecutorTest, self).setUp()
        fileio.makedirs(os.path.dirname(_TEST_OUTPUT_METADATA_JSON))
Example #29
def _export_fn(estimator, export_path, checkpoint_path, eval_result,
               is_the_final_export):
    del estimator, checkpoint_path, eval_result, is_the_final_export
    path = os.path.join(export_path, BASE_EXPORT_SUBDIR)
    fileio.makedirs(path)
    with fileio.open(os.path.join(path, ORIGINAL_SAVED_MODEL), 'w') as f:
        f.write(str(ORIGINAL_SAVED_MODEL))

    assets_path = os.path.join(path, tf.saved_model.ASSETS_DIRECTORY)
    fileio.makedirs(assets_path)
    with fileio.open(os.path.join(assets_path, ORIGINAL_VOCAB), 'w') as f:
        f.write(str(ORIGINAL_VOCAB))

    return path
Example #30
 def testCheckPipelinExistenceNotRequired(self):
     flags_dict = {
         labels.ENGINE_FLAG: 'beam',
         labels.PIPELINE_NAME: 'pipeline'
     }
     handler = FakeHandler(flags_dict)
     fileio.makedirs(
         os.path.join(os.environ['HOME'], 'tfx', 'beam', 'pipeline', ''))
     with self.assertRaises(SystemExit) as err:
         handler._check_pipeline_existence(flags_dict[labels.PIPELINE_NAME],
                                           required=False)
     self.assertEqual(
         str(err.exception),
         'Pipeline "{}" already exists.'.format(
             flags_dict[labels.PIPELINE_NAME]))