Esempio n. 1
0
    def test_new_execution(self, mock_metadata_class, mock_driver_class,
                           mock_executor_class, mock_get_logger):
        """Exercise the path where the driver returns an execution id.

        The mocked driver yields an ExecutionDecision with execution_id=12345;
        the adapter is then expected to push the decision to XCom and return
        the uncached branch name.
        """
        self._setup_mocks(mock_metadata_class, mock_driver_class,
                          mock_executor_class, mock_get_logger)
        (adapter, input_dict, output_dict, exec_properties,
         _) = self._setup_adapter_and_args()

        self.mock_task_instance.xcom_pull.side_effect = [self.input_one_json]

        self.mock_driver.prepare_execution.return_value = (
            data_types.ExecutionDecision(input_dict,
                                         output_dict,
                                         exec_properties,
                                         execution_id=12345))

        check_result = adapter.check_cache_and_maybe_prepare_execution(
            'cached_branch', 'uncached_branch', ti=self.mock_task_instance)

        mock_driver_class.assert_called_with(
            metadata_handler=self.mock_metadata)
        # The previous `prepare_execution.called_with(...)` was not an
        # assertion: on a Mock it only creates a child attribute (and newer
        # Python versions reject it with AttributeError). Use a real assertion.
        # NOTE(review): argument equality is not checked here because the
        # adapter may replace its input dict from XCom before calling the
        # driver -- tighten to assert_called_with once that is confirmed.
        self.mock_driver.prepare_execution.assert_called_once()
        self.mock_task_instance.xcom_pull.assert_called_with(
            dag_id='input_one_component_id', key='input_one_key')

        calls = [
            mock.call(key='_exec_inputs',
                      value=types.jsonify_tfx_type_dict(input_dict)),
            mock.call(key='_exec_outputs',
                      value=types.jsonify_tfx_type_dict(output_dict)),
            mock.call(key='_exec_properties',
                      value=json.dumps(exec_properties)),
            mock.call(key='_execution_id', value=12345)
        ]
        self.mock_task_instance.xcom_push.assert_has_calls(calls)

        self.assertEqual(check_result, 'uncached_branch')
Esempio n. 2
0
  def check_cache_and_maybe_prepare_execution(self, cached_branch,
                                              uncached_branch, **kwargs):
    """Ask the driver whether to execute and return the matching branch.

    The input dict is first refreshed from XCom, then the driver's
    `prepare_execution` is called. A falsy `execution_id` on the returned
    decision is treated as a cache hit: the decision's outputs are published
    to the pipeline. Otherwise the decision's inputs, outputs, exec properties
    and execution id are pushed to XCom for later tasks.

    Args:
      cached_branch: Value returned when the decision has no execution id.
      uncached_branch: Value returned when the decision has an execution id.
      **kwargs: Must contain 'ti', the task instance used for XCom access.

    Returns:
      Either `cached_branch` or `uncached_branch`.
    """
    ti = kwargs['ti']
    self._update_input_dict_from_xcom(ti)

    with metadata.Metadata(self._metadata_connection_config) as handler:
      driver = self._driver_class(
          log_root=self._exec_properties['log_root'],
          metadata_handler=handler)
      decision = driver.prepare_execution(self._input_dict, self._output_dict,
                                          self._exec_properties,
                                          self._driver_options)

      if decision.execution_id:
        # An execution was registered: hand its arguments to downstream tasks.
        inputs_json = jsonify_tfx_type_dict(decision.input_dict)
        ti.xcom_push(key='_exec_inputs', value=inputs_json)
        outputs_json = jsonify_tfx_type_dict(decision.output_dict)
        ti.xcom_push(key='_exec_outputs', value=outputs_json)
        properties_json = json.dumps(decision.exec_properties)
        ti.xcom_push(key='_exec_properties', value=properties_json)
        ti.xcom_push(key='_execution_id', value=decision.execution_id)

        self._logger.info('No cached execution found. Starting executor.')
        return uncached_branch

      self._logger.info(
          'All artifacts found. Publishing to pipeline and skipping executor.'
      )
      self._publish_outputs_to_pipeline(ti, decision.output_dict)
      return cached_branch
Esempio n. 3
0
    def test_publish_exec(self, mock_metadata_class, mock_driver_class,
                          mock_executor_class, mock_get_logger):
        """Check the XCom pulls and metadata publish done by publish_exec."""
        self._setup_mocks(mock_metadata_class, mock_driver_class,
                          mock_executor_class, mock_get_logger)
        (adapter, input_dict, output_dict, exec_properties,
         _) = self._setup_adapter_and_args()

        # Values handed back by successive xcom_pull calls, in call order.
        pulled_values = [
            types.jsonify_tfx_type_dict(input_dict),
            types.jsonify_tfx_type_dict(output_dict),
            json.dumps(exec_properties),
            12345,
            types.jsonify_tfx_type_dict(output_dict),
        ]
        self.mock_task_instance.xcom_pull.side_effect = pulled_values

        published_artifact = types.TfxArtifact('O')
        published_artifact.source = self.output_one.source
        self.mock_metadata.publish_execution.return_value = {
            u'output_one': [published_artifact]
        }

        adapter.publish_exec('cache_task_name', 'exec_task_name',
                             ti=self.mock_task_instance)

        # Four pulls from the cache task, then one from the exec task.
        expected_pulls = [
            mock.call(key=xcom_key, task_ids='cache_task_name')
            for xcom_key in ('_exec_inputs', '_exec_outputs',
                             '_exec_properties', '_execution_id')
        ]
        expected_pulls.append(
            mock.call(key='return_value', task_ids='exec_task_name'))

        self.mock_metadata.publish_execution.assert_called_with(
            12345, adapter._input_dict, adapter._output_dict)
        self.mock_task_instance.xcom_pull.assert_has_calls(expected_pulls)
        self.mock_task_instance.xcom_push.assert_called_once()
Esempio n. 4
0
  def test_python_exec(self, mock_metadata_class, mock_driver_class,
                       mock_executor_class, mock_docker_operator_class,
                       mock_get_logger):
    """Check that python_exec pulls its arguments and runs the executor."""
    self._setup_mocks(mock_metadata_class, mock_driver_class,
                      mock_executor_class, mock_docker_operator_class,
                      mock_get_logger)
    (adapter, input_dict, output_dict, exec_properties,
     _) = self._setup_adapter_and_args()

    # Values handed back by successive xcom_pull calls, in call order.
    pulled_values = [
        types.jsonify_tfx_type_dict(input_dict),
        types.jsonify_tfx_type_dict(output_dict),
        json.dumps(exec_properties),
        12345,
    ]
    self.mock_task_instance.xcom_pull.side_effect = pulled_values

    adapter.python_exec('cache_task_name', ti=self.mock_task_instance)

    expected_pulls = [
        mock.call(key=xcom_key, task_ids='cache_task_name')
        for xcom_key in ('_exec_inputs', '_exec_outputs', '_exec_properties',
                         '_execution_id')
    ]

    expected_properties_json = json.dumps(exec_properties)
    self.assertEqual(expected_properties_json,
                     json.dumps(adapter._exec_properties))
    mock_executor_class.assert_called_once()
    self.mock_executor.Do.assert_called_with(
        adapter._input_dict, adapter._output_dict, adapter._exec_properties)
    self.mock_task_instance.xcom_pull.assert_has_calls(expected_pulls)
    self.mock_task_instance.xcom_push.assert_called_once()
Esempio n. 5
0
    def test_python_exec(self, mock_metadata_class, mock_driver_class,
                         mock_executor_class, mock_docker_operator_class,
                         mock_get_logger):
        """Check that python_exec pulls its arguments and runs the executor."""
        self._setup_mocks(mock_metadata_class, mock_driver_class,
                          mock_executor_class, mock_docker_operator_class,
                          mock_get_logger)
        (adapter, input_dict, output_dict, exec_properties,
         _) = self._setup_adapter_and_args()

        # Values handed back by successive xcom_pull calls, in call order.
        pulled_values = [
            types.jsonify_tfx_type_dict(input_dict),
            types.jsonify_tfx_type_dict(output_dict),
            json.dumps(exec_properties),
            12345,
        ]
        self.mock_task_instance.xcom_pull.side_effect = pulled_values

        adapter.python_exec('cache_task_name', ti=self.mock_task_instance)

        expected_pulls = [
            mock.call(key=xcom_key, task_ids='cache_task_name')
            for xcom_key in ('_exec_inputs', '_exec_outputs',
                             '_exec_properties', '_execution_id')
        ]

        expected_properties_json = json.dumps(exec_properties)
        self.assertEqual(expected_properties_json,
                         json.dumps(adapter._exec_properties))
        mock_executor_class.assert_called_once()
        self.mock_executor.Do.assert_called_with(adapter._input_dict,
                                                 adapter._output_dict,
                                                 adapter._exec_properties)
        self.mock_task_instance.xcom_pull.assert_has_calls(expected_pulls)
        self.mock_task_instance.xcom_push.assert_called_once()
Esempio n. 6
0
 def _log_startup(self, inputs, outputs, exec_properties):
     """Emit four INFO lines describing this executor's invocation.

     Args:
       inputs: Serialized with types.jsonify_tfx_type_dict before logging.
       outputs: Serialized with types.jsonify_tfx_type_dict before logging.
       exec_properties: Serialized with json.dumps before logging.
     """
     executor_name = self.__class__.__name__
     tf.logging.info('Starting {} execution.'.format(executor_name))

     inputs_message = 'Inputs for {} is: {}'.format(
         executor_name, types.jsonify_tfx_type_dict(inputs))
     tf.logging.info(inputs_message)

     outputs_message = 'Outputs for {} is: {}'.format(
         executor_name, types.jsonify_tfx_type_dict(outputs))
     tf.logging.info(outputs_message)

     properties_message = 'Execution properties for {} is: {}'.format(
         executor_name, json.dumps(exec_properties))
     tf.logging.info(properties_message)
Esempio n. 7
0
def _run_executor(args, pipeline_args):
    """Import the executor class named by the arguments and run it.

    Each of inputs, outputs and exec_properties is read from its plain-text
    argument when that is truthy; otherwise the matching `*_base64` argument
    is base64-decoded.

    Args:
      args: Parsed arguments. Reads `inputs`/`inputs_base64`,
        `outputs`/`outputs_base64`, `exec_properties`/`exec_properties_base64`,
        `executor_class_path` and `write_outputs_stdout`.
      pipeline_args: Passed to the executor as `beam_pipeline_args`.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    inputs_str = args.inputs or base64.b64decode(args.inputs_base64)
    outputs_str = args.outputs or base64.b64decode(args.outputs_base64)
    exec_properties_str = (args.exec_properties
                           or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(inputs_str)
    outputs = types.parse_tfx_type_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)
    # Log the class path that is actually imported below. The previous code
    # read `args.executor`, which nothing else in this function uses.
    tf.logging.info(
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor_class_path, inputs, outputs, exec_properties))

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor = executor_cls(beam_pipeline_args=pipeline_args)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(types.jsonify_tfx_type_dict(outputs))
Esempio n. 8
0
    def testCsvExampleGenWrapper(self):
        """Run the wrapper with a patched Executor and check its metadata file."""
        input_base = types.TfxArtifact(type_name='ExternalPath', split='')
        input_base.uri = '/path/to/dataset'

        with patch.object(executor, 'Executor', autospec=True):
            wrapper_args = argparse.Namespace(
                exec_properties=json.dumps(self.exec_properties),
                outputs=types.jsonify_tfx_type_dict(
                    {'examples': self.examples}),
                executor_class_path=(
                    'tfx.components.example_gen.csv_example_gen.executor.Executor'
                ),
                input_base=json.dumps([input_base.json_dict()]))
            wrapper = executor_wrappers.CsvExampleGenWrapper(wrapper_args)
            wrapper.run(output_basedir=self.output_basedir)

            # TODO(b/133011207): Validate arguments for executor and Do() method.

            # Expect that span and path are resolved.
            expected_examples = types.TfxArtifact(type_name='ExamplesPath',
                                                  split='dummy')
            expected_examples.span = 1
            expected_examples.uri = (
                '/path/to/output/csv_example_gen/examples/mock_workflow_id/dummy/'
            )

            metadata_path = os.path.join(self.output_basedir,
                                         'output/ml_metadata/examples')
            with tf.gfile.GFile(metadata_path) as metadata_file:
                written_artifacts = json.loads(metadata_file.read())
                self.assertEqual([expected_examples.json_dict()],
                                 written_artifacts)
Esempio n. 9
0
def _run_executor(args, pipeline_args):
  """Look up an executor by `args.executor` and run it.

  Each of inputs, outputs and exec_properties is read from its plain-text
  argument when that is truthy; otherwise the matching `*_base64` argument is
  base64-decoded.

  Args:
    args: Parsed arguments. Reads `inputs`/`inputs_base64`,
      `outputs`/`outputs_base64`, `exec_properties`/`exec_properties_base64`,
      `executor` and `write_outputs_stdout`.
    pipeline_args: Passed to the executor as `beam_pipeline_args`.
  """
  tf.logging.set_verbosity(tf.logging.INFO)

  inputs_str = args.inputs or base64.b64decode(args.inputs_base64)
  outputs_str = args.outputs or base64.b64decode(args.outputs_base64)
  exec_properties_str = (
      args.exec_properties or base64.b64decode(args.exec_properties_base64))

  inputs = parse_tfx_type_dict(inputs_str)
  outputs = parse_tfx_type_dict(outputs_str)
  exec_properties = json.loads(exec_properties_str)

  summary = (
      'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
          args.executor, inputs, outputs, exec_properties))
  tf.logging.info(summary)

  executor_cls = _get_executor_class(args.executor)
  executor = executor_cls(beam_pipeline_args=pipeline_args)
  tf.logging.info('Starting executor')
  executor.Do(inputs, outputs, exec_properties)

  # The last line of stdout will be pushed to xcom by Airflow.
  if args.write_outputs_stdout:
    print(jsonify_tfx_type_dict(outputs))
Esempio n. 10
0
 def testJsonifyTfxTypeDictDeprecated(self):
     """Check that jsonify_tfx_type_dict still works and warns exactly once."""
     # patch.object swaps tf_logging.warning for a MagicMock and restores the
     # original when the block exits.
     with mock.patch.object(tf_logging, 'warning') as warn_mock:
         result = types.jsonify_tfx_type_dict({})
         self.assertEqual('{}', result)
         warn_mock.assert_called_once()
         # NOTE(review): index 5 presumably selects the rename instructions
         # among the positional args of the warning call -- confirm against
         # the deprecation helper in use.
         positional_args = warn_mock.call_args[0]
         self.assertIn(
             'tfx.utils.types.jsonify_tfx_type_dict has been renamed to',
             positional_args[5])
Esempio n. 11
0
 def testMainEmptyInputs(self):
   """Run run_executor.main with FakeExecutor and compare captured arguments."""
   inputs = {'x': [types.TfxType(type_name='X'), types.TfxType(type_name='X')]}
   outputs = {'y': [types.TfxType(type_name='Y')]}
   exec_properties = {'a': 'b'}

   executor_flag = '--executor_class_path={}.{}'.format(
       FakeExecutor.__module__, FakeExecutor.__name__)
   inputs_flag = '--inputs={}'.format(types.jsonify_tfx_type_dict(inputs))
   outputs_flag = '--outputs={}'.format(types.jsonify_tfx_type_dict(outputs))
   properties_flag = '--exec-properties={}'.format(json.dumps(exec_properties))
   args = [executor_flag, inputs_flag, outputs_flag, properties_flag]

   with ArgsCapture() as args_capture:
     run_executor.main(args)
     # TODO(b/131417512): Add equal comparison to TfxType class so we can
     # use asserters.
     # Until then only the dictionary keys are compared for inputs/outputs.
     captured_input_keys = set(args_capture.input_dict.keys())
     self.assertSetEqual(captured_input_keys, set(inputs))
     captured_output_keys = set(args_capture.output_dict.keys())
     self.assertSetEqual(captured_output_keys, set(outputs))
     self.assertDictEqual(args_capture.exec_properties, exec_properties)
Esempio n. 12
0
  def test_new_execution(self, mock_metadata_class, mock_driver_class,
                         mock_executor_class, mock_docker_operator_class,
                         mock_get_logger):
    """Exercise the path where the driver returns an execution id.

    The mocked driver yields an ExecutionDecision with execution_id=12345; the
    adapter is then expected to push the decision to XCom and return the
    uncached branch name.
    """
    self._setup_mocks(mock_metadata_class, mock_driver_class,
                      mock_executor_class, mock_docker_operator_class,
                      mock_get_logger)
    (adapter, input_dict, output_dict, exec_properties,
     _) = self._setup_adapter_and_args()

    self.mock_task_instance.xcom_pull.side_effect = [self.input_one_json]

    self.mock_driver.prepare_execution.return_value = (
        base_driver.ExecutionDecision(
            input_dict, output_dict, exec_properties, execution_id=12345))

    check_result = adapter.check_cache_and_maybe_prepare_execution(
        'cached_branch',
        'uncached_branch',
        ti=self.mock_task_instance)

    mock_driver_class.assert_called_with(
        log_root='log_root', metadata_handler=self.mock_metadata)
    # The previous `prepare_execution.called_with(...)` was not an assertion:
    # on a Mock it only creates a child attribute (and newer Python versions
    # reject it with AttributeError). Assert against the adapter's own fields,
    # as the sibling tests do for the executor and publish_execution calls.
    self.mock_driver.prepare_execution.assert_called_with(
        adapter._input_dict, adapter._output_dict, adapter._exec_properties,
        adapter._driver_options)
    self.mock_task_instance.xcom_pull.assert_called_with(
        dag_id='input_one_component_id', key='input_one_key')

    calls = [
        mock.call(
            key='_exec_inputs', value=types.jsonify_tfx_type_dict(input_dict)),
        mock.call(
            key='_exec_outputs',
            value=types.jsonify_tfx_type_dict(output_dict)),
        mock.call(key='_exec_properties', value=json.dumps(exec_properties)),
        mock.call(key='_execution_id', value=12345)
    ]
    self.mock_task_instance.xcom_push.assert_has_calls(calls)

    self.assertEqual(check_result, 'uncached_branch')
Esempio n. 13
0
 def python_exec(self, cache_task_name, **kwargs):
   """PythonOperator callable that runs the executor and pushes its outputs.

   Args:
     cache_task_name: Forwarded to _refresh_execution_args_from_xcom together
       with the task instance.
     **kwargs: Must contain 'ti', the task instance used for XCom access.
   """
   # This is executed in worker-space not runtime-space (i.e. with distributed
   # workers, this runs on the worker node not the controller node).
   ti = kwargs['ti']
   self._refresh_execution_args_from_xcom(ti, cache_task_name)

   beam_args = self._additional_pipeline_args.get('beam_pipeline_args')
   executor = self._executor_class(beam_pipeline_args=beam_args)
   executor.Do(self._input_dict, self._output_dict, self._exec_properties)

   # Docker operator chooses 'return_value' so we try to be consistent.
   outputs_json = jsonify_tfx_type_dict(self._output_dict)
   ti.xcom_push(key='return_value', value=outputs_json)
Esempio n. 14
0
    def testCsvExampleGenWrapper(self):
        """Construct CsvExampleGenWrapper from a hand-built Namespace."""
        input_base = types.TfxArtifact(type_name='ExternalPath', split='')
        input_base.uri = '/path/to/dataset'

        wrapper_args = argparse.Namespace(
            exec_properties=self.exec_properties,
            outputs=types.jsonify_tfx_type_dict({'examples': self.examples}),
            executor_class_path=(
                'tfx.components.example_gen.csv_example_gen.executor.Executor'
            ),
            input_base=json.dumps([input_base.json_dict()]))

        # It tests instantiation of component only. Does not test execution.
        executor_wrappers.CsvExampleGenWrapper(wrapper_args)
Esempio n. 15
0
  def test_publish_exec(self, mock_metadata_class, mock_driver_class,
                        mock_executor_class, mock_docker_operator_class,
                        mock_get_logger):
    """Check the XCom pulls and metadata publish done by publish_exec."""
    self._setup_mocks(mock_metadata_class, mock_driver_class,
                      mock_executor_class, mock_docker_operator_class,
                      mock_get_logger)
    (adapter, input_dict, output_dict, exec_properties,
     _) = self._setup_adapter_and_args()

    # Values handed back by successive xcom_pull calls, in call order.
    pulled_values = [
        types.jsonify_tfx_type_dict(input_dict),
        types.jsonify_tfx_type_dict(output_dict),
        json.dumps(exec_properties),
        12345,
        types.jsonify_tfx_type_dict(output_dict),
    ]
    self.mock_task_instance.xcom_pull.side_effect = pulled_values

    published_artifact = types.TfxType('O')
    published_artifact.source = self.output_one.source
    self.mock_metadata.publish_execution.return_value = {
        u'output_one': [published_artifact]
    }

    adapter.publish_exec(
        'cache_task_name', 'exec_task_name', ti=self.mock_task_instance)

    # Four pulls from the cache task, then one from the exec task.
    expected_pulls = [
        mock.call(key=xcom_key, task_ids='cache_task_name')
        for xcom_key in ('_exec_inputs', '_exec_outputs', '_exec_properties',
                         '_execution_id')
    ]
    expected_pulls.append(
        mock.call(key='return_value', task_ids='exec_task_name'))

    self.mock_metadata.publish_execution.assert_called_with(
        12345, adapter._input_dict, adapter._output_dict)
    self.mock_task_instance.xcom_pull.assert_has_calls(expected_pulls)
    self.mock_task_instance.xcom_push.assert_called_once()
Esempio n. 16
0
  def python_exec(self, cache_task_name, **kwargs):
    """PythonOperator callable that runs the executor and pushes its outputs.

    Args:
      cache_task_name: Forwarded to _refresh_execution_args_from_xcom together
        with the task instance.
      **kwargs: Must contain 'ti', the task instance used for XCom access.
    """
    # This is executed in worker-space not runtime-space (i.e. with distributed
    # workers, this runs on the worker node not the controller node).
    ti = kwargs['ti']
    self._refresh_execution_args_from_xcom(ti, cache_task_name)

    pipeline_args = self._additional_pipeline_args
    context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args.get('beam_pipeline_args'),
        tmp_dir=pipeline_args.get('tmp_dir'),
        unique_id=self._execution_id)
    executor = self._executor_class(context)

    executor.Do(self._input_dict, self._output_dict, self._exec_properties)

    # Airflow's docker_operator chooses 'return_value' so we try to be
    # consistent in case one day we want to add it back.
    outputs_json = jsonify_tfx_type_dict(self._output_dict)
    ti.xcom_push(key='return_value', value=outputs_json)
  def __init__(self, args, component, input_dict=None):
    """Build the container entrypoint command line for `component`.

    The result is stored on `self._command`.

    Args:
      args: Object providing `beam_runner`, `project_id`, `output_dir` and
        `gcp_region`.
      component: Object providing `executor`, `outputs`, `exec_properties` and
        `component_name`.
      input_dict: Optional mapping of flag name to value; each entry is
        appended as `--<name> <value>`. int and float values are stringified.
    """
    executor = component.executor
    executor_class_path = '.'.join((executor.__module__, executor.__name__))

    output_dict = {
        name: channel.get()
        for name, channel in component.outputs.get_all().items()
    }

    beam_pipeline_args = [
        '--experiments=shuffle_mode=auto',
        '--runner={}'.format(args.beam_runner),
        '--project={}'.format(args.project_id),
        '--temp_location={}'.format(os.path.join(args.output_dir, 'tmp')),
        '--region={}'.format(args.gcp_region),
    ]

    # Component-level properties are applied last, so they win on key clashes.
    exec_properties = {
        'beam_pipeline_args': beam_pipeline_args,
        'output_dir': args.output_dir,
    }
    exec_properties.update(component.exec_properties)

    exec_properties_json = json.dumps(exec_properties)
    outputs_json = types.jsonify_tfx_type_dict(output_dict)
    self._command = [
        'python',
        '/tfx-src/tfx/orchestration/kubeflow/container_entrypoint.py',
        '--exec_properties',
        exec_properties_json,
        '--outputs',
        outputs_json,
        '--executor_class_path',
        executor_class_path,
        component.component_name,
    ]

    for flag_name, flag_value in (input_dict or {}).items():
      if isinstance(flag_value, (float, int)):
        flag_value = str(flag_value)
      self._command.extend(['--{}'.format(flag_name), flag_value])
Esempio n. 18
0
    def testContainerOpArguments(self):
        """Check the argument list handed to the component's ContainerOp."""
        arguments = self.component.container_op.arguments

        self.assertEqual(arguments[0], '--exec_properties')

        expected_exec_properties = {
            'output_dir': 'output_dir',
            'log_root': 'log_root',
            'module_file': '/path/to/module.py'
        }
        self.assertDictEqual(expected_exec_properties,
                             json.loads(arguments[1]))

        # Everything after the exec-properties flag/value pair.
        expected_tail = [
            '--outputs',
            types.jsonify_tfx_type_dict(self._output_dict),
            '--executor_class_path',
            'some.executor.Class',
            'TFXComponent',
            '--input_data',
            'input-data-contents',
            '--train_steps',
            '300',
            '--accuracy_threshold',
            '0.3',
        ]
        self.assertEqual(arguments[2:], expected_tail)
Esempio n. 19
0
    def __new__(
        cls,
        component_name: Text,
        input_dict: Dict[Text, Any],
        output_dict: Dict[Text, List[types.TfxArtifact]],
        exec_properties: Dict[Text, Any],
        executor_class_path: Text,
        pipeline_properties: PipelineProperties,
    ):
        """Creates a new component.

        Note that `exec_properties` is updated in place with the entries of
        `pipeline_properties.exec_properties`.

        Args:
          component_name: TFX component name.
          input_dict: Dictionary of input names to TFX types, or
            kfp.dsl.PipelineParam representing input parameters.
          output_dict: Dictionary of output names to List of TFX types.
          exec_properties: Execution properties.
          executor_class_path: <module>.<class> for Python class of executor.
          pipeline_properties: Pipeline level properties shared by all
            components.

        Returns:
          Newly constructed TFX Kubeflow component instance.
        """
        output_names = output_dict.keys()
        file_outputs = {
            name: '/output/ml_metadata/{}'.format(name)
            for name in output_names
        }

        # Pipeline-level properties overwrite same-named component entries.
        exec_properties.update(pipeline_properties.exec_properties.items())

        arguments = [
            '--exec_properties',
            json.dumps(exec_properties),
            '--outputs',
            types.jsonify_tfx_type_dict(output_dict),
            '--executor_class_path',
            executor_class_path,
            component_name,
        ]

        for input_name, input_value in input_dict.items():
            # int and float values are passed as strings.
            if isinstance(input_value, (float, int)):
                input_value = str(input_value)
            arguments.extend(['--{}'.format(input_name), input_value])

        container_op = dsl.ContainerOp(
            name=component_name,
            command=_COMMAND,
            image=pipeline_properties.tfx_image,
            arguments=arguments,
            file_outputs=file_outputs,
        )

        # Add the Argo workflow ID to the container's environment variable so it
        # can be used to uniquely place pipeline outputs under the pipeline_root.
        workflow_label = k8s_client.V1ObjectFieldSelector(
            field_path="metadata.labels['workflows.argoproj.io/workflow']")
        workflow_id_var = k8s_client.V1EnvVar(
            name='WORKFLOW_ID',
            value_from=k8s_client.V1EnvVarSource(field_ref=workflow_label))
        container_op.add_env_variable(workflow_id_var)

        # This allows user code to refer to the ContainerOp 'op' output named 'x'
        # as op.outputs.x
        named_outputs = {
            name: container_op.outputs[name]
            for name in output_names
        }
        component_outputs = type('Output', (), named_outputs)

        class_attributes = {
            'container_op': container_op,
            'outputs': component_outputs
        }
        return type(component_name, (BaseComponent, ), class_attributes)
Esempio n. 20
0
  def __new__(cls, component_name, input_dict,
              output_dict,
              exec_properties):
    """Creates a new component.

    Note that `exec_properties` is updated in place with the entries of
    `ExecutionProperties.exec_properties`.

    Args:
      component_name: TFX component name.
      input_dict: Dictionary of input names to TFX types, or
        kfp.dsl.PipelineParam representing input parameters.
      output_dict: Dictionary of output names to List of TFX types.
      exec_properties: Execution properties.

    Returns:
      Newly constructed TFX Kubeflow component instance.
    """
    output_names = output_dict.keys()
    file_outputs = {
        name: '/output/ml_metadata/{}'.format(name) for name in output_names
    }

    # Shared execution properties overwrite same-named component entries.
    exec_properties.update(ExecutionProperties.exec_properties.items())

    arguments = [
        '--exec_properties',
        json.dumps(exec_properties),
        '--outputs',
        types.jsonify_tfx_type_dict(output_dict),
        component_name,
    ]

    for input_name, input_value in input_dict.items():
      # int and float values are passed as strings.
      if isinstance(input_value, (float, int)):
        input_value = str(input_value)
      arguments.extend(['--{}'.format(input_name), input_value])

    base_op = dsl.ContainerOp(
        name=component_name,
        command=_COMMAND,
        image=_KUBEFLOW_TFX_IMAGE,
        arguments=arguments,
        file_outputs=file_outputs,
    )
    # Adds GCP authentication.
    container_op = base_op.apply(gcp.use_gcp_secret('user-gcp-sa'))

    # Add the Argo workflow ID to the container's environment variable so it
    # can be used to uniquely place pipeline outputs under the pipeline_root.
    workflow_label = k8s_client.V1ObjectFieldSelector(
        field_path="metadata.labels['workflows.argoproj.io/workflow']")
    workflow_id_var = k8s_client.V1EnvVar(
        name='WORKFLOW_ID',
        value_from=k8s_client.V1EnvVarSource(field_ref=workflow_label))
    container_op.add_env_variable(workflow_id_var)

    # This allows user code to refer to the ContainerOp 'op' output named 'x'
    # as op.outputs.x
    named_outputs = {name: container_op.outputs[name] for name in output_names}
    component_outputs = type('Output', (), named_outputs)

    class_attributes = {
        'container_op': container_op,
        'outputs': component_outputs
    }
    return type(component_name, (BaseComponent,), class_attributes)
Esempio n. 21
0
def start_cmle_training(input_dict: Dict[Text, List[types.TfxArtifact]],
                        output_dict: Dict[Text, List[types.TfxArtifact]],
                        exec_properties: Dict[Text, Any],
                        executor_class_path: Text, training_inputs: Dict[Text,
                                                                         Any]):
    """Start a trainer job on CMLE and block until it finishes.

    This is done by forwarding the inputs/outputs/exec_properties to the
    tfx.scripts.run_executor module on a CMLE training job interpreter.

    Note that the 'cmle_training_args' and 'gaip_training_args' entries are
    removed in place from `exec_properties['custom_config']`. The
    `training_inputs` dict itself is copied and not modified.

    Args:
      input_dict: Input artifacts, serialized and forwarded as `--inputs`.
      output_dict: Output artifacts, serialized and forwarded as `--outputs`.
      exec_properties: Execution properties, serialized with json.dumps and
        forwarded as `--exec-properties`.
      executor_class_path: class path for TFX core default trainer.
      training_inputs: Training input for CMLE training job. Must contain
        'project', and 'jobDir' when no 'packageUris' are given.
        'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred
        by the runner. For the full set of parameters supported, refer to
          https://cloud.google.com/ml-engine/docs/tensorflow/deploying-models#creating_a_model_version.

    Returns:
      None
    Raises:
      RuntimeError: if the Google Cloud AI Platform training job finished in
        any state other than 'SUCCEEDED' (i.e. it failed or was cancelled).
    """
    training_inputs = training_inputs.copy()
    # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself.
    # A missing or None 'custom_config' is tolerated instead of raising
    # TypeError on the membership test.
    custom_config = exec_properties.get('custom_config') or {}
    for gaip_training_key in ('cmle_training_args', 'gaip_training_args'):
        custom_config.pop(gaip_training_key, None)

    json_inputs = types.jsonify_tfx_type_dict(input_dict)
    tf.logging.info('json_inputs=\'%s\'.', json_inputs)
    json_outputs = types.jsonify_tfx_type_dict(output_dict)
    tf.logging.info('json_outputs=\'%s\'.', json_outputs)
    json_exec_properties = json.dumps(exec_properties)
    tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

    # Configure CMLE job
    api_client = discovery.build('ml', 'v1')
    job_args = [
        '--executor_class_path', executor_class_path, '--inputs', json_inputs,
        '--outputs', json_outputs, '--exec-properties', json_exec_properties
    ]
    training_inputs['args'] = job_args
    training_inputs['pythonModule'] = 'tfx.scripts.run_executor'
    training_inputs['pythonVersion'] = _get_caip_python_version()
    # runtimeVersion should be same as <major>.<minor> of currently
    # installed tensorflow version.
    training_inputs['runtimeVersion'] = _get_tf_runtime_version()

    # Pop project_id so CMLE doesn't complain about an unexpected parameter.
    # It's been a stowaway in cmle_args and has finally reached its destination.
    project = training_inputs.pop('project')
    project_id = 'projects/{}'.format(project)

    package_uris = training_inputs.get('packageUris', [])
    if package_uris:
        tf.logging.info('Following packageUris \'%s\' are provided by user.',
                        package_uris)
    else:
        local_package = deps_utils.build_ephemeral_package()
        # TODO(b/125451545): Use a safe temp dir instead of jobDir.
        cloud_package = os.path.join(training_inputs['jobDir'],
                                     os.path.basename(local_package))
        io_utils.copy_file(local_package, cloud_package, True)
        training_inputs['packageUris'] = [cloud_package]
        tf.logging.info('Package %s will be used',
                        training_inputs['packageUris'])

    job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

    # Submit job to CMLE
    tf.logging.info('Submitting job=\'{}\', project=\'{}\' to CMLE.'.format(
        job_name, project))
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for CMLE job to finish. 'CANCELLED' is a terminal state too; without
    # it a cancelled job would keep this loop polling forever.
    terminal_states = ('SUCCEEDED', 'FAILED', 'CANCELLED')
    job_id = '{}/jobs/{}'.format(project_id, job_name)
    request = api_client.projects().jobs().get(name=job_id)
    response = request.execute()
    while response['state'] not in terminal_states:
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        response = request.execute()

    if response['state'] != 'SUCCEEDED':
        err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
            job_name, response)
        tf.logging.error(err_msg)
        raise RuntimeError(err_msg)

    # CMLE training complete
    tf.logging.info('Job \'{}\' successful.'.format(job_name))
Esempio n. 22
0
def _run_executor(args, pipeline_args) -> None:
    r"""Load an executor class by its import path and run it once.

    # pylint: disable=line-too-long
    The class named by args.executor_class_path is expected to subclass
    tfx.components.base.base_executor.BaseExecutor.  The same entry point
    serves both remote invocation of an executor and unit testing of one.

    How to invoke an executor as standalone:
    # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
    Prepare the input data first.  The easy way is to run the whole pipeline
    once: that produces the data to test against and logs the artifacts that
    become the input parameters.  Every executed component writes log entries
    like these:
    ```
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```
    Each of these maps directly onto a command-line parameter:
    ```
    python scripts/run_executor.py \
        --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
        --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
        --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
        --exec-properties={"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```
    # pylint: enable=line-too-long

    Args:
      args: Parsed command-line namespace.  Attributes read here:
        - executor_class_path: Import path of the executor class to run.
        - inputs / inputs_base64: The input artifacts for this execution,
          serialized as JSON; the base64 form is decoded only when the plain
          one is empty.
        - outputs / outputs_base64: The output artifacts to be generated by
          this execution, serialized as JSON (same fallback rule).
        - exec_properties / exec_properties_base64: The execution properties
          for this execution, serialized as JSON (same fallback rule).
        - temp_directory_path: Passed to the executor context as tmp_dir.
        - write_outputs_stdout: When truthy, the serialized outputs are
          printed after the executor has run.
      pipeline_args: Optional parameter that maps to the
        optional_pipeline_args parameter in the pipeline, which provides
        additional configuration options for apache-beam and
        tensorflow.logging.  Forwarded as beam_pipeline_args.

    Returns:
      None

    Raises:
      Nothing is caught here: errors from decoding, parsing, importing the
      class, or running the executor propagate to the caller.
    """

    tf.logging.set_verbosity(tf.logging.INFO)

    # Resolve all three payloads before parsing any of them; base64 decoding
    # is attempted only when the plain-text flag is empty.
    raw_inputs = args.inputs or base64.b64decode(args.inputs_base64)
    raw_outputs = args.outputs or base64.b64decode(args.outputs_base64)
    raw_exec_properties = (
        args.exec_properties
        or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(raw_inputs)
    outputs = types.parse_tfx_type_dict(raw_outputs)
    exec_properties = json.loads(raw_exec_properties)

    run_summary = (
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor_class_path, inputs, outputs, exec_properties))
    tf.logging.info(run_summary)

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor = executor_cls(
        base_executor.BaseExecutor.Context(
            beam_pipeline_args=pipeline_args,
            tmp_dir=args.temp_directory_path,
            unique_id=''))
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    if not args.write_outputs_stdout:
        return

    # The last line of stdout will be pushed to xcom by Airflow.
    print(types.jsonify_tfx_type_dict(outputs))
Esempio n. 23
0
def start_cmle_training(input_dict, output_dict, exec_properties,
                        training_inputs):
    """Start a trainer job on CMLE and block until it reaches a final state.

    The inputs, outputs and execution properties are serialized to JSON and
    handed to the 'tfx.scripts.run_executor' module as the job's arguments.
    The job is then submitted and polled every _POLLING_INTERVAL_IN_SECONDS
    seconds until it is SUCCEEDED, FAILED or CANCELLED.

    Side effect: 'cmle_training_args' is removed in place from
    exec_properties['custom_config'], i.e. from the caller's dict.

    Args:
      input_dict: Executor inputs, serialized with
        types.jsonify_tfx_type_dict.
      output_dict: Executor outputs, serialized the same way.
      exec_properties: Execution properties; must hold a 'custom_config'
        mapping containing 'cmle_training_args'.
      training_inputs: Mapping sent as the job's 'trainingInput'.  Must
        contain 'project'; must contain 'jobDir' when no non-empty
        'packageUris' is supplied.  Only a shallow copy is modified.

    Raises:
      RuntimeError: The job finished in a state other than SUCCEEDED.
      KeyError: A required key listed above is missing.
    """
    training_inputs = training_inputs.copy()
    # TODO(khaas): This file goes away when cl/236428692 lands
    # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself
    exec_properties['custom_config'].pop('cmle_training_args')

    json_inputs = types.jsonify_tfx_type_dict(input_dict)
    tf.logging.info('json_inputs=\'%s\'.', json_inputs)
    json_outputs = types.jsonify_tfx_type_dict(output_dict)
    tf.logging.info('json_outputs=\'%s\'.', json_outputs)
    json_exec_properties = json.dumps(exec_properties)
    tf.logging.info('json_exec_properties=\'%s\'.', json_exec_properties)

    # Configure CMLE job
    api_client = discovery.build('ml', 'v1')
    training_inputs['args'] = [
        '--executor', 'Trainer', '--inputs', json_inputs, '--outputs',
        json_outputs, '--exec-properties', json_exec_properties
    ]
    training_inputs['pythonModule'] = 'tfx.scripts.run_executor'

    # Pop project_id so CMLE doesn't complain about an unexpected parameter.
    # It's been a stowaway in cmle_args and has finally reached its destination.
    project = training_inputs.pop('project')
    project_id = 'projects/{}'.format(project)

    # A missing, None or empty 'packageUris' all mean "nothing supplied":
    # build the TFX dist and stage it next to the job output.
    if not training_inputs.get('packageUris'):
        local_package = io_utils.build_package()
        cloud_package = os.path.join(training_inputs['jobDir'],
                                     os.path.basename(local_package))
        io_utils.copy_file(local_package, cloud_package, True)
        training_inputs['packageUris'] = [cloud_package]

    job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

    # Submit job to CMLE
    tf.logging.info('Submitting job=\'%s\', project=\'%s\' to CMLE.',
                    job_name, project)
    request = api_client.projects().jobs().create(body=job_spec,
                                                  parent=project_id)
    request.execute()

    # Wait for CMLE job to finish.  CANCELLED is terminal as well; leaving it
    # out would keep polling a cancelled job forever.
    terminal_states = ('SUCCEEDED', 'FAILED', 'CANCELLED')
    job_id = '{}/jobs/{}'.format(project_id, job_name)
    request = api_client.projects().jobs().get(name=job_id)
    response = request.execute()
    while response['state'] not in terminal_states:
        time.sleep(_POLLING_INTERVAL_IN_SECONDS)
        response = request.execute()

    if response['state'] != 'SUCCEEDED':
        err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
            job_name, response)
        tf.logging.error(err_msg)
        raise RuntimeError(err_msg)

    # CMLE training complete
    tf.logging.info('Job \'%s\' successful.', job_name)
Esempio n. 24
0
def start_cmle_training(input_dict,
                        output_dict,
                        exec_properties,
                        training_inputs):
  """Start a trainer job on CMLE and block until it reaches a final state.

  The inputs, outputs and execution properties are serialized to JSON and
  handed to the 'tfx.scripts.run_executor' module as the job's arguments.
  The job is then submitted and polled every _POLLING_INTERVAL_IN_SECONDS
  seconds until it is SUCCEEDED, FAILED or CANCELLED.  Progress is written to
  the logger obtained from exec_properties['log_root'].

  Side effect: 'cmle_training_args' is removed in place from
  exec_properties['custom_config'], i.e. from the caller's dict.

  Args:
    input_dict: Executor inputs, serialized with types.jsonify_tfx_type_dict.
    output_dict: Executor outputs, serialized the same way.
    exec_properties: Execution properties; must hold 'log_root' and a
      'custom_config' mapping containing 'cmle_training_args'.
    training_inputs: Mapping sent as the job's 'trainingInput'.  Must contain
      'project'; must contain 'jobDir' when no non-empty 'packageUris' is
      supplied.  Only a shallow copy is modified.

  Raises:
    RuntimeError: The job finished in a state other than SUCCEEDED.
    KeyError: A required key listed above is missing.
  """
  training_inputs = training_inputs.copy()
  logger = logging_utils.get_logger(exec_properties['log_root'], 'exec')
  # Remove cmle_args from exec_properties so CMLE trainer doesn't call itself
  exec_properties['custom_config'].pop('cmle_training_args')

  json_inputs = types.jsonify_tfx_type_dict(input_dict)
  logger.info('json_inputs=\'%s\'.', json_inputs)
  json_outputs = types.jsonify_tfx_type_dict(output_dict)
  logger.info('json_outputs=\'%s\'.', json_outputs)
  json_exec_properties = json.dumps(exec_properties)
  logger.info('json_exec_properties=\'%s\'.', json_exec_properties)

  # Configure CMLE job
  api_client = discovery.build('ml', 'v1')
  training_inputs['args'] = [
      '--executor', 'Trainer', '--inputs', json_inputs, '--outputs',
      json_outputs, '--exec-properties', json_exec_properties
  ]
  training_inputs['pythonModule'] = 'tfx.scripts.run_executor'

  # Pop project_id so CMLE doesn't complain about an unexpected parameter.
  # It's been a stowaway in cmle_args and has finally reached its destination.
  project = training_inputs.pop('project')
  project_id = 'projects/{}'.format(project)

  # A missing, None or empty 'packageUris' all mean "nothing supplied":
  # build the TFX dist and stage it next to the job output.
  if not training_inputs.get('packageUris'):
    local_package = io_utils.build_package()
    cloud_package = os.path.join(training_inputs['jobDir'],
                                 os.path.basename(local_package))
    io_utils.copy_file(local_package, cloud_package, True)
    training_inputs['packageUris'] = [cloud_package]

  job_name = 'tfx_' + datetime.datetime.now().strftime('%Y%m%d%H%M%S')
  job_spec = {'jobId': job_name, 'trainingInput': training_inputs}

  # Submit job to CMLE
  logger.info('Submitting job=\'%s\', project=\'%s\' to CMLE.', job_name,
              project)
  request = api_client.projects().jobs().create(
      body=job_spec, parent=project_id)
  request.execute()

  # Wait for CMLE job to finish.  CANCELLED is terminal as well; leaving it
  # out would keep polling a cancelled job forever.
  terminal_states = ('SUCCEEDED', 'FAILED', 'CANCELLED')
  job_id = '{}/jobs/{}'.format(project_id, job_name)
  request = api_client.projects().jobs().get(name=job_id)
  response = request.execute()
  while response['state'] not in terminal_states:
    time.sleep(_POLLING_INTERVAL_IN_SECONDS)
    response = request.execute()

  if response['state'] != 'SUCCEEDED':
    err_msg = 'Job \'{}\' did not succeed.  Detailed response {}.'.format(
        job_name, response)
    logger.error(err_msg)
    raise RuntimeError(err_msg)

  # CMLE training complete
  logger.info('Job \'%s\' successful.', job_name)
    def __init__(self,
                 component: base_component.BaseComponent,
                 input_dict: Optional[Dict] = None):
        """Build the container step that runs one TFX component.

        Collects the component's executor class path, outputs and execution
        properties into a command-line argument list, then hands that list
        (together with the image, command and output-file mapping) to the
        parent constructor.  Afterwards the 'user-gcp-sa' secret is applied
        and a WORKFLOW_ID environment variable is attached.

        Args:
          component: The TFX component to wrap.  Two extra entries
            ('output_dir' and 'beam_pipeline_args') are written into the
            object returned by its exec_properties attribute.
          input_dict: Optional mapping of extra arguments; each item is
            appended as '--<key>' followed by its value.
        """
        self.component = component

        executor_class_path = '.'.join(
            getattr(component.executor, part)
            for part in ('__module__', '__name__'))

        output_dict = {
            key: channel.get()
            for key, channel in component.outputs.get_all().items()
        }
        # One metadata file per declared output.
        file_outputs = {}
        for output_name in output_dict:
            file_outputs[output_name] = '/output/ml_metadata/{}'.format(
                output_name)

        # Extra exec properties required by KubeflowExecutorWrapper.
        # NOTE(review): these are added to the object returned by
        # component.exec_properties, while the serialization below reads
        # component.exec_properties again -- this presumably relies on both
        # accesses yielding the same dict; confirm.
        exec_properties = component.exec_properties
        exec_properties['output_dir'] = os.path.join(_PIPELINE_ROOT,
                                                     _PIPELINE_NAME)
        dataflow_flags = [
            '--runner=DataflowRunner',
            '--experiments=shuffle_mode=auto',
            '--project=' + _PROJECT_ID,
            '--temp_location=' + os.path.join(_PIPELINE_ROOT, 'tmp'),
            '--region=' + _GCP_REGION,
        ]
        exec_properties['beam_pipeline_args'] = dataflow_flags

        arguments = ['--exec_properties']
        arguments.append(json.dumps(component.exec_properties))
        arguments.append('--outputs')
        arguments.append(types.jsonify_tfx_type_dict(output_dict))
        arguments.append('--executor_class_path')
        arguments.append(executor_class_path)
        arguments.append(component.component_name)

        # Extra inputs are forwarded as flag/value pairs; values are passed
        # through unchanged (no str() coercion).
        if input_dict:
            for flag_name, flag_value in input_dict.items():
                arguments.extend(('--{}'.format(flag_name), flag_value))

        # TODO(muchida): each component could take different child image,
        # while maintaining the common entry point. It is nice because it could
        # cleanly embeds user code and/or configuration.
        super().__init__(
            name=component.component_name,
            image=_IMAGE,
            command=_COMMAND,
            arguments=arguments,
            file_outputs=file_outputs,
        )
        self.apply(gcp.use_gcp_secret('user-gcp-sa'))

        # WORKFLOW_ID is sourced from the pod's Argo workflow label.
        workflow_label = "metadata.labels['workflows.argoproj.io/workflow']"
        label_selector = k8s_client.V1ObjectFieldSelector(
            field_path=workflow_label)
        label_source = k8s_client.V1EnvVarSource(field_ref=label_selector)
        self.add_env_variable(
            k8s_client.V1EnvVar(name='WORKFLOW_ID', value_from=label_source))