Example 1
def _run_executor(args, pipeline_args):
    """Select a particular executor and run it based on name."""
    tf.logging.set_verbosity(tf.logging.INFO)

    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(inputs_str)
    outputs = types.parse_tfx_type_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)
    tf.logging.info(
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor, inputs, outputs, exec_properties))

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor = executor_cls(beam_pipeline_args=pipeline_args)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(types.jsonify_tfx_type_dict(outputs))
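
The snippet reads its flags from an argparse-style namespace. The wiring below is a minimal, hypothetical sketch inferred from the attributes it accesses (`--inputs`/`--inputs_base64`, `--executor_class_path`, `--write_outputs_stdout`, and so on); it is not the actual entry point shipped with TFX.

```python
import argparse

def _build_parser():
    # Hypothetical flag wiring: each value can be passed either as plain JSON
    # or base64-encoded; _run_executor falls back to the *_base64 variant.
    parser = argparse.ArgumentParser()
    parser.add_argument('--executor', default='')
    parser.add_argument('--executor_class_path', required=True)
    parser.add_argument('--inputs', default=None)
    parser.add_argument('--inputs_base64', default=None)
    parser.add_argument('--outputs', default=None)
    parser.add_argument('--outputs_base64', default=None)
    parser.add_argument('--exec_properties', default=None)
    parser.add_argument('--exec_properties_base64', default=None)
    parser.add_argument('--write_outputs_stdout', action='store_true')
    return parser

if __name__ == '__main__':
    # Unknown flags are forwarded to the executor as Beam pipeline args.
    args, pipeline_args = _build_parser().parse_known_args()
    _run_executor(args, pipeline_args)
```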
Example 2
def _run_executor(args, pipeline_args):
  """Select a particular executor and run it based on name."""
  tf.logging.set_verbosity(tf.logging.INFO)

  (inputs_str, outputs_str,
   exec_properties_str) = (args.inputs or base64.b64decode(args.inputs_base64),
                           args.outputs or
                           base64.b64decode(args.outputs_base64),
                           args.exec_properties or
                           base64.b64decode(args.exec_properties_base64))

  inputs = parse_tfx_type_dict(inputs_str)
  outputs = parse_tfx_type_dict(outputs_str)
  exec_properties = json.loads(exec_properties_str)
  tf.logging.info(
      'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
          args.executor, inputs, outputs, exec_properties))

  executor = _get_executor_class(args.executor)(
      beam_pipeline_args=pipeline_args)
  tf.logging.info('Starting executor')
  executor.Do(inputs, outputs, exec_properties)

  # The last line of stdout will be pushed to xcom by Airflow.
  if args.write_outputs_stdout:
    print(jsonify_tfx_type_dict(outputs))
Example 3
  def _refresh_execution_args_from_xcom(self, task_instance, pushing_task_name):
    """Refresh inputs, outputs and exec_properties from xcom."""
    inputs_str = task_instance.xcom_pull(
        key='_exec_inputs', task_ids=pushing_task_name)
    self._input_dict = parse_tfx_type_dict(inputs_str)

    outputs_str = task_instance.xcom_pull(
        key='_exec_outputs', task_ids=pushing_task_name)
    self._output_dict = parse_tfx_type_dict(outputs_str)

    exec_properties_str = task_instance.xcom_pull(
        key='_exec_properties', task_ids=pushing_task_name)
    self._exec_properties = json.loads(exec_properties_str)

    self._execution_id = task_instance.xcom_pull(
        key='_execution_id', task_ids=pushing_task_name)
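
Example 3 only shows the pull side of the xcom round trip. A hedged sketch of what the matching push side might look like, assuming the same keys and `jsonify_tfx_type_dict` as the serializer (the method name `_push_execution_args_to_xcom` is hypothetical):

```python
  def _push_execution_args_to_xcom(self, task_instance):
    """Hypothetical counterpart: push inputs, outputs and exec_properties to xcom."""
    # Artifact dicts are serialized to JSON strings, mirroring the
    # parse_tfx_type_dict calls on the pull side.
    task_instance.xcom_push(key='_exec_inputs',
                            value=jsonify_tfx_type_dict(self._input_dict))
    task_instance.xcom_push(key='_exec_outputs',
                            value=jsonify_tfx_type_dict(self._output_dict))
    task_instance.xcom_push(key='_exec_properties',
                            value=json.dumps(self._exec_properties))
    task_instance.xcom_push(key='_execution_id', value=self._execution_id)
```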
Example 4
    def __init__(
        self,
        executor_class_path,
        name,
        input_dict,
        outputs,
        exec_properties,
    ):
        raw_args = exec_properties.get('beam_pipeline_args', [])

        # Beam expects str types for its pipeline args. Ensure unicode type is
        # converted to str if required.
        beam_pipeline_args = []
        for arg in raw_args:
            if isinstance(arg, unicode):
                arg = arg.encode('ascii', 'ignore')
            beam_pipeline_args.append(arg)

        # TODO(zhitaoli): Revisit usage of setup_file here.
        module_dir = os.path.dirname(os.path.dirname(tfx.__file__))
        setup_file = os.path.join(module_dir, 'setup.py')
        beam_pipeline_args.append('--setup_file={}'.format(setup_file))

        executor_cls = import_utils.import_class_by_path(executor_class_path)
        self._executor = executor_cls(beam_pipeline_args=beam_pipeline_args)

        self._input_dict = input_dict
        self._output_dict = types.parse_tfx_type_dict(outputs)
        self._exec_properties = exec_properties
        self._component_name = to_snake_case(name)
Example 5
  def __init__(self, executor_cls, name,
               input_dict, outputs,
               exec_properties):
    raw_args = exec_properties.get('beam_pipeline_args', [])

    # Beam expects str types for its pipeline args. Ensure unicode type is
    # converted to str if required.
    beam_pipeline_args = []
    for arg in raw_args:
      if isinstance(arg, unicode):
        arg = arg.encode('ascii', 'ignore')
      beam_pipeline_args.append(arg)

    module_dir = os.path.dirname(
        os.path.dirname(tfx.__file__))
    setup_file = os.path.join(module_dir, 'setup.py')
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    self._executor = executor_cls(beam_pipeline_args)
    self._input_dict = input_dict
    self._output_dict = types.parse_tfx_type_dict(outputs)
    self._exec_properties = exec_properties
    self._component_name = to_snake_case(name)

    self._logger = logging_utils.get_logger(
        exec_properties['log_root'], self._component_name + '_driver.logs')
Example 6
  def testParseTfxTypeDictDeprecated(self):
    with mock.patch.object(tf_logging, 'warning'):
      warn_mock = mock.MagicMock()
      tf_logging.warning = warn_mock
      self.assertEqual({}, types.parse_tfx_type_dict('{}'))
      warn_mock.assert_called_once()
      self.assertIn(
          'tfx.utils.types.parse_tfx_type_dict has been renamed to',
          warn_mock.call_args[0][5])
Example 7
  def publish_exec(self, cache_task_name, exec_task_name, **kwargs):
    """Publish artifacts produced in this execution to the pipeline."""
    task_instance = kwargs['ti']
    self._refresh_execution_args_from_xcom(task_instance, cache_task_name)

    # Overwrite outputs from cache with outputs produced by exec operator.
    outputs_str = task_instance.xcom_pull(
        key='return_value', task_ids=exec_task_name)
    self._output_dict = parse_tfx_type_dict(outputs_str)
    final_output = self._publish_execution_to_metadata()
    self._publish_outputs_to_pipeline(task_instance, final_output)
Example 8
    def __init__(
        self,
        executor_class_path: Text,
        name: Text,
        input_dict: Dict[Text, List[types.TfxArtifact]],
        outputs: Text,
        exec_properties: Dict[Text, Any],
    ):
        self._input_dict = input_dict
        self._output_dict = types.parse_tfx_type_dict(outputs)
        self._component_name = to_snake_case(name)
        self._exec_properties = exec_properties
        self._output_dir = self._exec_properties['output_dir']
        self._workflow_id = os.environ['WORKFLOW_ID']

        raw_args = self._exec_properties.get('beam_pipeline_args', [])

        # Beam expects str types for its pipeline args. Ensure unicode type is
        # converted to str if required.
        beam_pipeline_args = []
        for arg in raw_args:
            # In order to support both Py2 and Py3: Py3 doesn't have `unicode` type.
            if six.PY2 and isinstance(arg, unicode):
                arg = arg.encode('ascii', 'ignore')

            beam_pipeline_args.append(arg)

        # TODO(zhitaoli): Revisit usage of setup_file here.
        module_dir = os.path.dirname(os.path.dirname(version.__file__))
        setup_file = os.path.join(module_dir, 'setup.py')
        tf.logging.info('Using setup_file \'%s\' to capture TFX dependencies',
                        setup_file)
        beam_pipeline_args.append('--setup_file={}'.format(setup_file))

        executor_cls = import_utils.import_class_by_path(executor_class_path)
        # TODO(swoonna): Switch to execution_id when available
        unique_id = '{}_{}'.format(self._component_name, self._workflow_id)
        # TODO(swoonna): Add tmp_dir to additional_pipeline_args
        executor_context = base_executor.BaseExecutor.Context(
            beam_pipeline_args=beam_pipeline_args,
            tmp_dir=os.path.join(self._output_dir, '.temp', ''),
            unique_id=unique_id)
        self._executor = executor_cls(executor_context)
Example 9
    def __init__(self, executor_cls, name, input_dict, outputs,
                 exec_properties):
        raw_args = exec_properties.get('beam_pipeline_args', [])

        # Beam expects str types for its pipeline args. Ensure unicode type is
        # converted to str if required.
        beam_pipeline_args = []
        for arg in raw_args:
            if isinstance(arg, unicode):
                arg = arg.encode('ascii', 'ignore')
            beam_pipeline_args.append(arg)

        module_dir = os.path.dirname(os.path.dirname(tfx.__file__))
        setup_file = os.path.join(module_dir, 'setup.py')
        beam_pipeline_args.append('--setup_file={}'.format(setup_file))

        self._executor = executor_cls(beam_pipeline_args)
        self._input_dict = input_dict
        self._output_dict = types.parse_tfx_type_dict(outputs)
        self._exec_properties = exec_properties
        self._component_name = to_snake_case(name)

        self._logger = logging_utils.get_logger(
            exec_properties['log_root'], self._component_name + '_driver.logs')
Example 10
def _run_executor(args, pipeline_args) -> None:
    r"""Select a particular executor and run it based on name.

  # pylint: disable=line-too-long
  _run_executor() is used to invoke a class subclassing
  tfx.components.base.base_executor.BaseExecutor.  This function can be used
  both to invoke the executor in remote environments and to unit test
  executors.

  How to invoke an executor as standalone:
  # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
  First, the input data needs to be prepared.  An easy way to generate the test
  data is to fully run the pipeline once.  This will generate the data to be
  used for testing as well as log the artifacts to be used as input parameters.
  In each executed component, three log entries will be generated similar to the
  below:
  ```
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  Each of these maps directly to the input parameters expected by run_executor():
  ```
  python scripts/run_executor.py \
      --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
      --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
      --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
      --exec-properties={"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  # pylint: enable=line-too-long

  Args:
    args:
      - inputs: The input artifacts for this execution, serialized as JSON.
      - outputs: The output artifacts to be generated by this execution,
        serialized as JSON.
      - exec_properties: The execution properties to be used by this execution,
        serialized as JSON.
    pipeline_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration options
      for apache-beam and tensorflow.logging.

  Returns:
    None

  Raises:
    None
  """

    tf.logging.set_verbosity(tf.logging.INFO)

    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(inputs_str)
    outputs = types.parse_tfx_type_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)
    tf.logging.info(
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor_class_path, inputs, outputs, exec_properties))
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args,
        tmp_dir=args.temp_directory_path,
        unique_id='')
    executor = executor_cls(executor_context)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(types.jsonify_tfx_type_dict(outputs))
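
The fallback branch in the snippet decodes `*_base64` flags with `base64.b64decode`, which avoids heavy shell escaping of the JSON payloads. A small sketch of how such values could be produced; the flag name and the sample payload are illustrative, taken loosely from the docstring above:

```python
import base64
import json

def encode_for_flag(json_str):
    # Produce the value expected by the *_base64 fallback in _run_executor.
    return base64.b64encode(json_str.encode('utf-8')).decode('ascii')

exec_properties = {
    'output': json.dumps({
        'splitConfig': {'splits': [{'name': 'train', 'hashBuckets': 2},
                                   {'name': 'eval', 'hashBuckets': 1}]}})}
print('--exec_properties_base64=' + encode_for_flag(json.dumps(exec_properties)))
```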