def __init__(
    self,
    executor_class_path: Text,
    name: Text,
    input_dict: Dict[Text, List[types.Artifact]],
    outputs: Text,
    exec_properties: Dict[Text, Any],
):
    """Record the component's execution state and instantiate its executor.

    Args:
        executor_class_path: Import path handed to
            import_utils.import_class_by_path to locate the executor class.
        name: Component name; stored after conversion with to_snake_case.
        input_dict: Input artifacts keyed by name, stored as given.
        outputs: String form of the output artifacts, parsed with
            artifact_utils.parse_artifact_dict.
        exec_properties: Execution properties. Must contain 'output_dir';
            'beam_pipeline_args' is optional and defaults to an empty list.

    Raises:
        KeyError: If 'output_dir' is absent from exec_properties, or if the
            WORKFLOW_ID or TFX_SRC_DIR environment variable is not set.
    """
    self._input_dict = input_dict
    self._output_dict = artifact_utils.parse_artifact_dict(outputs)
    self._component_name = to_snake_case(name)
    self._exec_properties = exec_properties
    self._output_dir = self._exec_properties['output_dir']
    self._workflow_id = os.environ['WORKFLOW_ID']

    def _as_native_str(value):
        # Beam expects str types for its pipeline args. `unicode` only exists
        # on Py2; the six.PY2 check short-circuits so the name is never
        # looked up on Py3.
        if six.PY2 and isinstance(value, unicode):
            return value.encode('ascii', 'ignore')
        return value

    beam_pipeline_args = [
        _as_native_str(raw_arg)
        for raw_arg in self._exec_properties.get('beam_pipeline_args', [])
    ]

    # Ship the TFX source tree's setup.py to Beam so workers can install the
    # TFX dependencies.
    setup_file = os.path.join(os.environ['TFX_SRC_DIR'], 'setup.py')
    tf.logging.info('Using setup_file \'%s\' to capture TFX dependencies',
                    setup_file)
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    executor_cls = import_utils.import_class_by_path(executor_class_path)

    # TODO(swoonna): Switch to execution_id when available
    # TODO(swoonna): Add tmp_dir to additional_pipeline_args
    context_kwargs = dict(
        beam_pipeline_args=beam_pipeline_args,
        # The trailing '' makes the joined path end with a path separator.
        tmp_dir=os.path.join(self._output_dir, '.temp', ''),
        unique_id='{}_{}'.format(self._component_name, self._workflow_id),
    )
    self._executor = executor_cls(
        base_executor.BaseExecutor.Context(**context_kwargs))
def _run_executor(args, pipeline_args) -> None:
    r"""Run the executor class named by `args.executor_class_path`.

    # pylint: disable=line-too-long
    _run_executor() invokes a class that subclasses
    tfx.components.base.base_executor.BaseExecutor. It can be used both to run
    an executor in a remote environment and to unit test executors.

    How to invoke an executor as standalone:
    # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
    The input data has to be prepared first. An easy way to produce test data
    is to run the full pipeline once: that generates the data to test with and
    logs the artifacts that serve as input parameters. Every executed component
    emits log entries similar to the ones below:
    ```
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{ \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```

    Each of these map directly to the input parameters expected by
    run_executor():
    ```
    python scripts/run_executor.py \
        --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
        --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
        --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
        --exec-properties={"output": "{ \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```
    # pylint: disable=line-too-long

    Args:
      args:
        - inputs: The input artifacts for this execution, serialized as JSON.
        - outputs: The output artifacts to be generated by this execution,
          serialized as JSON.
        - exec_properties: The execution properties to be used by this
          execution, serialized as JSON.
        For each of the three, the matching `*_base64` attribute is
        base64-decoded and used instead when the plain attribute is falsy.
        `executor_class_path`, `temp_directory_path` and
        `write_outputs_stdout` are read as well.
      pipeline_args: Optional parameter that maps to the optional_pipeline_args
        parameter in the pipeline, which provides additional configuration
        options for apache-beam and tensorflow.logging. Passed to the executor
        context as `beam_pipeline_args`.

    Returns:
      None

    Raises:
      None
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    def _plain_or_base64(field):
        # Prefer the plain-text attribute; the base64 twin is only looked up
        # (and decoded) when the plain one is falsy.
        return getattr(args, field) or base64.b64decode(
            getattr(args, field + '_base64'))

    inputs_str = _plain_or_base64('inputs')
    outputs_str = _plain_or_base64('outputs')
    exec_properties_str = _plain_or_base64('exec_properties')

    inputs = artifact_utils.parse_artifact_dict(inputs_str)
    outputs = artifact_utils.parse_artifact_dict(outputs_str)
    exec_properties = json.loads(exec_properties_str)

    summary = 'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
        args.executor_class_path, inputs, outputs, exec_properties)
    tf.logging.info(summary)

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor = executor_cls(
        base_executor.BaseExecutor.Context(
            beam_pipeline_args=pipeline_args,
            tmp_dir=args.temp_directory_path,
            unique_id=''))

    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    if not args.write_outputs_stdout:
        return
    # The last line of stdout will be pushed to xcom by Airflow.
    print(artifact_utils.jsonify_artifact_dict(outputs))
def _run_executor(args, pipeline_args) -> None:
    r"""Run the executor class named by `args.executor_class_path`.

    # pylint: disable=line-too-long
    _run_executor() invokes a class that subclasses
    tfx.components.base.base_executor.BaseExecutor. It can be used both to run
    an executor in a remote environment and to unit test executors.

    How to invoke an executor as standalone:
    # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
    The input data has to be prepared first. An easy way to produce test data
    is to run the full pipeline once: that generates the data to test with and
    logs the artifacts that serve as input parameters. Every executed component
    emits log entries similar to the ones below:
    ```
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{ \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```

    Each of these map directly to the input parameters expected by
    run_executor():
    ```
    python scripts/run_executor.py \
        --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
        --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
        --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
        --exec-properties={"output": "{ \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```
    # pylint: disable=line-too-long

    Args:
      args:
        - inputs: The input artifacts for this execution, serialized as JSON.
        - outputs: The output artifacts to be generated by this execution,
          serialized as JSON.
        - exec_properties: The execution properties to be used by this
          execution, serialized as JSON. Technically all the exec_properties
          values should be a primitive, and nested exec_properties needs to be
          JSON-encoded as a string. But as a convenience, the script allows you
          to feed in non-serialized values of exec_properties, which is then
          automatically serialized.
        For each of the three, the matching `*_base64` attribute is
        base64-decoded and used instead when the plain attribute is falsy.
        `executor_class_path`, `temp_directory_path` and
        `write_outputs_stdout` are read as well.
      pipeline_args: Optional parameter that maps to the optional_pipeline_args
        parameter in the pipeline, which provides additional configuration
        options for apache-beam and tensorflow.logging. Passed to the executor
        context as `beam_pipeline_args`.

    Returns:
      None

    Raises:
      None
    """

    def _plain_or_base64(field):
        # Prefer the plain-text attribute; the base64 twin is only looked up
        # (and decoded) when the plain one is falsy.
        return getattr(args, field) or base64.b64decode(
            getattr(args, field + '_base64'))

    inputs_str = _plain_or_base64('inputs')
    outputs_str = _plain_or_base64('outputs')
    exec_properties_str = _plain_or_base64('exec_properties')

    inputs = artifact_utils.parse_artifact_dict(inputs_str)
    outputs = artifact_utils.parse_artifact_dict(outputs_str)

    # Technically an exec_properties value can only be a primitive (e.g. a
    # string), and by convention proto objects are passed JSON-serialized.
    # Because this script itself takes exec_properties in serialized form, a
    # proto value would have to be serialized twice, which is really
    # inconvenient when constructing exec_properties by hand. So nested
    # dict/list values are accepted as-is and serialized here.
    exec_properties = {
        prop_name: (json.dumps(prop_value)
                    if isinstance(prop_value, (dict, list)) else prop_value)
        for prop_name, prop_value in json.loads(exec_properties_str).items()
    }

    logging.info(
        'Executor %s do: inputs: %s, outputs: %s, exec_properties: %s',
        args.executor_class_path, inputs, outputs, exec_properties)

    executor_cls = import_utils.import_class_by_path(args.executor_class_path)
    executor = executor_cls(
        base_executor.BaseExecutor.Context(
            beam_pipeline_args=pipeline_args,
            tmp_dir=args.temp_directory_path,
            unique_id=''))

    logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    if not args.write_outputs_stdout:
        return
    # The last line of stdout will be pushed to xcom by Airflow.
    print(artifact_utils.jsonify_artifact_dict(outputs))
def main():
    """Parse the command-line flags, build a component launcher and launch it.

    Every flag except `--enable_cache` is a required string. The launcher
    class is imported from `--component_launcher_class_path` and must be a
    subclass of base_component_launcher.BaseComponentLauncher. The pipeline
    run id is read from the WORKFLOW_ID environment variable.

    Raises:
        TypeError: If the imported launcher class is not a subclass of
            base_component_launcher.BaseComponentLauncher.
        KeyError: If the WORKFLOW_ID environment variable is not set.
    """
    # Log to the container's stdout so Kubeflow Pipelines UI can display logs
    # to the user.
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    # All of these share the same declaration: a mandatory string flag.
    required_string_flags = (
        '--pipeline_name',
        '--pipeline_root',
        '--kubeflow_metadata_config',
        '--additional_pipeline_args',
        '--component_id',
        '--component_type',
        '--driver_class_path',
        '--executor_spec',
        '--component_launcher_class_path',
        '--inputs',
        '--outputs',
        '--exec_properties',
    )
    for flag in required_string_flags:
        parser.add_argument(flag, type=str, required=True)
    parser.add_argument('--enable_cache', action='store_true')
    args = parser.parse_args()

    input_dict = _make_channel_dict(
        artifact_utils.parse_artifact_dict(args.inputs))
    output_dict = _make_channel_dict(
        artifact_utils.parse_artifact_dict(args.outputs))
    exec_properties = json.loads(args.exec_properties)

    driver_class = import_utils.import_class_by_path(args.driver_class_path)
    executor_spec = json_utils.loads(args.executor_spec)

    component_launcher_class = import_utils.import_class_by_path(
        args.component_launcher_class_path)
    is_launcher = issubclass(component_launcher_class,
                             base_component_launcher.BaseComponentLauncher)
    if not is_launcher:
        raise TypeError(
            ('component_launcher_class "%s" is not subclass of '
             'base_component_launcher.BaseComponentLauncher')
            % component_launcher_class)

    kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
    connection_config = _get_metadata_connection_config(
        kubeflow_metadata_config)

    component_info = data_types.ComponentInfo(
        component_type=args.component_type, component_id=args.component_id)
    driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)
    additional_pipeline_args = _make_additional_pipeline_args(
        args.additional_pipeline_args)

    pipeline_info = data_types.PipelineInfo(
        pipeline_name=args.pipeline_name,
        pipeline_root=args.pipeline_root,
        run_id=os.environ['WORKFLOW_ID'])

    # TODO(hongyes): create a classmethod to create launcher from a
    # deserialized component.
    launcher = component_launcher_class(
        component_info=component_info,
        driver_class=driver_class,
        component_executor_spec=executor_spec,
        input_dict=input_dict,
        output_dict=output_dict,
        exec_properties=exec_properties,
        pipeline_info=pipeline_info,
        driver_args=driver_args,
        metadata_connection_config=connection_config,
        additional_pipeline_args=additional_pipeline_args)
    launcher.launch()
def parse_tfx_type_dict(json_str: Text) -> Dict[Text, List[Artifact]]:
    """Parse the string form of an artifact dict.

    Thin forwarder to artifact_utils.parse_artifact_dict.

    Args:
        json_str: String to parse; passed through unchanged.

    Returns:
        Whatever artifact_utils.parse_artifact_dict returns for `json_str`.
    """
    artifact_dict = artifact_utils.parse_artifact_dict(json_str)
    return artifact_dict