Code Example #1
  def __init__(
      self,
      executor_class_path: Text,
      name: Text,
      input_dict: Dict[Text, List[types.Artifact]],
      outputs: Text,
      exec_properties: Dict[Text, Any],
  ):
    self._input_dict = input_dict
    self._output_dict = artifact_utils.parse_artifact_dict(outputs)
    self._component_name = to_snake_case(name)
    self._exec_properties = exec_properties
    self._output_dir = self._exec_properties['output_dir']
    self._workflow_id = os.environ['WORKFLOW_ID']

    raw_args = self._exec_properties.get('beam_pipeline_args', [])

    # Beam expects str types for its pipeline args. Ensure unicode type is
    # converted to str if required.
    beam_pipeline_args = []
    for arg in raw_args:
      # In order to support both Py2 and Py3: Py3 doesn't have `unicode` type.
      if six.PY2 and isinstance(arg, unicode):
        arg = arg.encode('ascii', 'ignore')

      beam_pipeline_args.append(arg)

    module_dir = os.environ['TFX_SRC_DIR']
    setup_file = os.path.join(module_dir, 'setup.py')
    tf.logging.info('Using setup_file \'%s\' to capture TFX dependencies',
                    setup_file)
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    executor_cls = import_utils.import_class_by_path(executor_class_path)
    # TODO(swoonna): Switch to execution_id when available
    unique_id = '{}_{}'.format(self._component_name, self._workflow_id)
    # TODO(swoonna): Add tmp_dir to additional_pipeline_args
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=beam_pipeline_args,
        tmp_dir=os.path.join(self._output_dir, '.temp', ''),
        unique_id=unique_id)
    self._executor = executor_cls(executor_context)
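The constructor above mostly normalizes the Beam pipeline arguments it receives and appends a `--setup_file` flag so that Beam workers can install the TFX dependencies. Below is a standalone sketch of that normalization; the sample arguments and the `/opt/tfx/src` path are illustrative stand-ins for what would normally come from exec_properties and the TFX_SRC_DIR environment variable.
```
# Standalone sketch of the beam_pipeline_args handling in the constructor above.
# The sample args and module_dir value are placeholders.
import os

raw_args = ['--runner=DataflowRunner', u'--project=my-gcp-project']  # hypothetical
module_dir = '/opt/tfx/src'  # normally os.environ['TFX_SRC_DIR']

# Coerce every arg to a plain str; on Python 3 this is a no-op, while the
# original code only re-encodes Py2 `unicode` args.
beam_pipeline_args = [str(arg) for arg in raw_args]
# Point Beam at the setup.py that captures the TFX dependencies.
beam_pipeline_args.append(
    '--setup_file={}'.format(os.path.join(module_dir, 'setup.py')))
print(beam_pipeline_args)
```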
Code Example #2
File: run_executor.py  Project: anitameh/tfx-1
def _run_executor(args, pipeline_args) -> None:
    r"""Select a particular executor and run it based on name.

  # pylint: disable=line-too-long
  _run_executor() is used to invoke a class subclassing
  tfx.components.base.base_executor.BaseExecutor.  This function can be used
  both for invoking the executor in remote environments and for unit testing
  executors.

  How to invoke an executor as standalone:
  # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
  First, the input data needs to be prepared.  An easy way to generate the test
  data is to fully run the pipeline once.  This will generate the data to be
  used for testing as well as log the artifacts to be used as input parameters.
  In each executed component, three log entries will be generated similar to the
  below:
  ```
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  Each of these maps directly to the input parameters expected by run_executor():
  ```
  python scripts/run_executor.py \
      --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
      --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
      --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
      --exec-properties={"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  # pylint: disable=line-too-long

  Args:
    args:
      - inputs: The input artifacts for this execution, serialized as JSON.
      - outputs: The output artifacts to be generated by this execution,
        serialized as JSON.
      - exec_properties: The execution properties to be used by this execution,
        serialized as JSON.
    pipeline_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration
      options for apache-beam and tensorflow.logging.

  Returns:
    None

  Raises:
    None
  """

    tf.logging.set_verbosity(tf.logging.INFO)

    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))

    inputs = artifact_utils.parse_artifact_dict(inputs_str)
    outputs = artifact_utils.parse_artifact_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)
    tf.logging.info(
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor_class_path, inputs, outputs, exec_properties))
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args,
        tmp_dir=args.temp_directory_path,
        unique_id='')
    executor = executor_cls(executor_context)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(artifact_utils.jsonify_artifact_dict(outputs))
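The JSON payloads shown in the docstring are awkward to quote on a shell command line, which is presumably why the function also accepts base64-encoded variants (args.inputs_base64, args.outputs_base64, args.exec_properties_base64). A small sketch of preparing such a value, assuming the corresponding flag is named --inputs_base64 and using a placeholder artifact dict, might look like this:
```
# Sketch: preparing a base64-encoded value for the assumed --inputs_base64 flag.
# The inputs dict below is a placeholder, not a real artifact description.
import base64
import json

inputs = {'input_base': []}  # placeholder; normally copied from the component logs
inputs_b64 = base64.b64encode(json.dumps(inputs).encode('utf-8')).decode('ascii')
print('--inputs_base64=' + inputs_b64)
```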
Code Example #3
File: run_executor.py  Project: shizukanaskytree/tfx
def _run_executor(args, pipeline_args) -> None:
    r"""Select a particular executor and run it based on name.

  # pylint: disable=line-too-long
  _run_executor() is used to invoke a class subclassing
  tfx.components.base.base_executor.BaseExecutor.  This function can be used
  both for invoking the executor in remote environments and for unit testing
  executors.

  How to invoke an executor as standalone:
  # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
  First, the input data needs to be prepared.  An easy way to generate the test
  data is to fully run the pipeline once.  This will generate the data to be
  used for testing as well as log the artifacts to be used as input parameters.
  In each executed component, three log entries will be generated similar to the
  below:
  ```
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
  [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  Each of these maps directly to the input parameters expected by run_executor():
  ```
  python scripts/run_executor.py \
      --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
      --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
      --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
      --exec-properties={"output": "{  \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
  ```
  # pylint: disable=line-too-long

  Args:
    args:
      - inputs: The input artifacts for this execution, serialized as JSON.
      - outputs: The output artifacts to be generated by this execution,
        serialized as JSON.
      - exec_properties: The execution properties to be used by this execution,
        serialized as JSON. Technically each exec_properties value should be a
        primitive, and nested exec_properties need to be JSON-encoded as
        strings. As a convenience, however, the script allows you to feed in
        non-serialized exec_properties values, which are then automatically
        serialized.
    pipeline_args: Optional parameter that maps to the optional_pipeline_args
      parameter in the pipeline, which provides additional configuration
      options for apache-beam and tensorflow.logging.

  Returns:
    None

  Raises:
    None
  """
    (inputs_str, outputs_str,
     exec_properties_str) = (args.inputs
                             or base64.b64decode(args.inputs_base64),
                             args.outputs
                             or base64.b64decode(args.outputs_base64),
                             args.exec_properties
                             or base64.b64decode(args.exec_properties_base64))
    inputs = artifact_utils.parse_artifact_dict(inputs_str)
    outputs = artifact_utils.parse_artifact_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)

    # Technically each exec_properties value can only be a primitive (e.g. a
    # string), and one of our conventions is to pass proto objects by
    # JSON-serializing them. Unfortunately, the run_executor.py script accepts
    # already-serialized exec_properties as input, so a proto value would end
    # up serialized twice. That is inconvenient when constructing
    # exec_properties by hand, so we also accept non-serialized values and
    # serialize them here.
    for key, value in exec_properties.items():
        if isinstance(value, (dict, list)):
            exec_properties[key] = json.dumps(value)

    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)
    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor_context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args,
        tmp_dir=args.temp_directory_path,
        unique_id='')
    executor = executor_cls(executor_context)
    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # The last line of stdout will be pushed to xcom by Airflow.
    if args.write_outputs_stdout:
        print(artifact_utils.jsonify_artifact_dict(outputs))
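The normalization loop above can be illustrated in isolation. The sample property below reuses the "output" split configuration from the docstring and shows how a nested dict ends up JSON-encoded as a string:
```
# Isolated sketch of the exec_properties normalization above: dict and list
# values are JSON-encoded so that every property ends up as a primitive.
import json

exec_properties = {
    'output': {'splitConfig': {'splits': [{'name': 'train', 'hashBuckets': 2},
                                          {'name': 'eval', 'hashBuckets': 1}]}},
}
for key, value in exec_properties.items():
    if isinstance(value, (dict, list)):
        exec_properties[key] = json.dumps(value)

print(exec_properties['output'])
# {"splitConfig": {"splits": [{"name": "train", "hashBuckets": 2}, {"name": "eval", "hashBuckets": 1}]}}
```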
Code Example #4
def main():
    # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
    # the user.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--pipeline_name', type=str, required=True)
    parser.add_argument('--pipeline_root', type=str, required=True)
    parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
    parser.add_argument('--additional_pipeline_args', type=str, required=True)
    parser.add_argument('--component_id', type=str, required=True)
    parser.add_argument('--component_type', type=str, required=True)
    parser.add_argument('--driver_class_path', type=str, required=True)
    parser.add_argument('--executor_spec', type=str, required=True)
    parser.add_argument('--component_launcher_class_path',
                        type=str,
                        required=True)
    parser.add_argument('--inputs', type=str, required=True)
    parser.add_argument('--outputs', type=str, required=True)
    parser.add_argument('--exec_properties', type=str, required=True)
    parser.add_argument('--enable_cache', action='store_true')

    args = parser.parse_args()

    inputs = artifact_utils.parse_artifact_dict(args.inputs)
    input_dict = _make_channel_dict(inputs)

    outputs = artifact_utils.parse_artifact_dict(args.outputs)
    output_dict = _make_channel_dict(outputs)

    exec_properties = json.loads(args.exec_properties)

    driver_class = import_utils.import_class_by_path(args.driver_class_path)
    executor_spec = json_utils.loads(args.executor_spec)

    component_launcher_class = import_utils.import_class_by_path(
        args.component_launcher_class_path)
    if not issubclass(component_launcher_class,
                      base_component_launcher.BaseComponentLauncher):
        raise TypeError(
            'component_launcher_class "%s" is not a subclass of base_component_launcher.BaseComponentLauncher'
            % component_launcher_class)

    kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
    connection_config = _get_metadata_connection_config(
        kubeflow_metadata_config)

    component_info = data_types.ComponentInfo(
        component_type=args.component_type, component_id=args.component_id)

    driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)

    additional_pipeline_args = _make_additional_pipeline_args(
        args.additional_pipeline_args)

    # TODO(hongyes): create a classmethod to create launcher from a deserialized
    # component.
    launcher = component_launcher_class(
        component_info=component_info,
        driver_class=driver_class,
        component_executor_spec=executor_spec,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        pipeline_info=data_types.PipelineInfo(
            pipeline_name=args.pipeline_name,
            pipeline_root=args.pipeline_root,
            run_id=os.environ['WORKFLOW_ID']),
        driver_args=driver_args,
        metadata_connection_config=connection_config,
        additional_pipeline_args=additional_pipeline_args)

    launcher.launch()
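None of the flags parsed above are meant to be written by hand; the Kubeflow DAG runner generates them when the pipeline is compiled. Purely as an illustration of the flag surface, a hypothetical argument list could look like the sketch below. Every value is a placeholder, and the driver and launcher class paths are assumptions rather than values taken from this snippet.
```
# Hypothetical argument list for the container entrypoint above.  All values
# are placeholders; in practice the Kubeflow DAG runner generates them.
import json

entrypoint_args = [
    '--pipeline_name', 'chicago_taxi_simple',
    '--pipeline_root', '/tmp/pipeline_root',
    '--kubeflow_metadata_config', '{}',   # serialized KubeflowMetadataConfig (placeholder)
    '--additional_pipeline_args', json.dumps({}),
    '--component_id', 'CsvExampleGen',
    '--component_type', 'CsvExampleGen',  # placeholder component type
    '--driver_class_path', 'tfx.components.base.base_driver.BaseDriver',  # assumed path
    '--executor_spec', '{}',              # serialized executor spec (placeholder)
    '--component_launcher_class_path',
    'tfx.orchestration.launcher.in_process_component_launcher.InProcessComponentLauncher',  # assumed
    '--inputs', json.dumps({}),
    '--outputs', json.dumps({}),
    '--exec_properties', json.dumps({}),
]
```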
Code Example #5
File: types.py  Project: anitameh/tfx-1
def parse_tfx_type_dict(json_str: Text) -> Dict[Text, List[Artifact]]:
    return artifact_utils.parse_artifact_dict(json_str)
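Example #5 is just a thin wrapper that forwards to artifact_utils.parse_artifact_dict. As a rough round-trip sketch, assuming a TFX release in which tfx.types.artifact_utils and tfx.types.standard_artifacts exist under those names, the dict-to-JSON-and-back flow could be exercised like this:
```
# Round-trip sketch (assumed TFX APIs): serialize an artifact dict to JSON and
# parse it back, as the helpers above do for executor inputs and outputs.
from tfx.types import artifact_utils, standard_artifacts

examples = standard_artifacts.Examples()
examples.uri = '/tmp/pipelines/my_pipeline/CsvExampleGen/examples/1'  # hypothetical URI

json_str = artifact_utils.jsonify_artifact_dict({'examples': [examples]})
restored = artifact_utils.parse_artifact_dict(json_str)
print(restored['examples'][0].uri)
```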