def _run_executor(args, pipeline_args):
    """Load the executor class named on the command line and invoke it.

    Args:
        args: Parsed command-line namespace. For each of inputs, outputs and
            exec_properties the plain attribute is used when truthy;
            otherwise the matching ``*_base64`` attribute is base64-decoded.
            ``executor_class_path`` names the class to import and
            ``write_outputs_stdout`` controls the final print.
        pipeline_args: Forwarded to the executor constructor as
            ``beam_pipeline_args``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Resolve the three serialized payloads, preferring the plain-text flags.
    serialized_inputs = args.inputs or base64.b64decode(args.inputs_base64)
    serialized_outputs = args.outputs or base64.b64decode(args.outputs_base64)
    serialized_properties = (
        args.exec_properties
        or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(serialized_inputs)
    outputs = types.parse_tfx_type_dict(serialized_outputs)
    exec_properties = json.loads(serialized_properties)

    # NOTE(review): the log line reads args.executor while the class below is
    # loaded from args.executor_class_path -- confirm both attributes exist.
    summary = (
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor, inputs, outputs, exec_properties))
    tf.logging.info(summary)

    executor_factory = import_utils.import_class_by_path(
        args.executor_class_path)
    executor = executor_factory(beam_pipeline_args=pipeline_args)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # Airflow pushes the last line written to stdout to xcom.
    if args.write_outputs_stdout:
        print(types.jsonify_tfx_type_dict(outputs))
def _run_executor(args, pipeline_args):
    """Look up an executor by name and run it with the decoded arguments.

    Args:
        args: Parsed command-line namespace. For each of inputs, outputs and
            exec_properties the plain attribute is used when truthy;
            otherwise the matching ``*_base64`` attribute is base64-decoded.
            ``executor`` is handed to ``_get_executor_class`` and
            ``write_outputs_stdout`` controls the final print.
        pipeline_args: Forwarded to the executor constructor as
            ``beam_pipeline_args``.
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Resolve the three serialized payloads, preferring the plain-text flags.
    raw_inputs = args.inputs or base64.b64decode(args.inputs_base64)
    raw_outputs = args.outputs or base64.b64decode(args.outputs_base64)
    raw_properties = (
        args.exec_properties
        or base64.b64decode(args.exec_properties_base64))

    inputs = parse_tfx_type_dict(raw_inputs)
    outputs = parse_tfx_type_dict(raw_outputs)
    exec_properties = json.loads(raw_properties)

    message = (
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor, inputs, outputs, exec_properties))
    tf.logging.info(message)

    executor_cls = _get_executor_class(args.executor)
    executor = executor_cls(beam_pipeline_args=pipeline_args)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # Airflow pushes the last line written to stdout to xcom.
    if args.write_outputs_stdout:
        print(jsonify_tfx_type_dict(outputs))
def _refresh_execution_args_from_xcom(self, task_instance, pushing_task_name):
    """Reload inputs, outputs, exec properties and execution id from xcom.

    Args:
        task_instance: Object exposing ``xcom_pull(key=..., task_ids=...)``.
        pushing_task_name: Task id whose xcom values are read.
    """

    def pull(key):
        # Every value is published by the same upstream task.
        return task_instance.xcom_pull(key=key, task_ids=pushing_task_name)

    self._input_dict = parse_tfx_type_dict(pull('_exec_inputs'))
    self._output_dict = parse_tfx_type_dict(pull('_exec_outputs'))
    self._exec_properties = json.loads(pull('_exec_properties'))
    self._execution_id = pull('_execution_id')
def __init__(
    self,
    executor_class_path,
    name,
    input_dict,
    outputs,
    exec_properties,
):
    """Build the executor and record the arguments of this execution.

    Args:
        executor_class_path: Path of the executor class, resolved with
            ``import_utils.import_class_by_path``.
        name: Component name; stored in snake case.
        input_dict: Input dictionary, stored as given.
        outputs: Serialized outputs, parsed with
            ``types.parse_tfx_type_dict``.
        exec_properties: Execution properties; the optional
            ``'beam_pipeline_args'`` entry supplies extra Beam arguments.
    """
    raw_args = exec_properties.get('beam_pipeline_args', [])

    # Beam expects str types for its pipeline args, so Python 2 `unicode`
    # values are encoded to ASCII str. `str is bytes` is true only on
    # Python 2; the short-circuit keeps Python 3, which has no `unicode`
    # builtin, from evaluating that name and raising NameError.
    beam_pipeline_args = []
    for arg in raw_args:
        if str is bytes and isinstance(arg, unicode):
            arg = arg.encode('ascii', 'ignore')
        beam_pipeline_args.append(arg)

    # TODO(zhitaoli): Revisit usage of setup_file here.
    # setup.py is looked up one directory above the tfx package.
    module_dir = os.path.dirname(os.path.dirname(tfx.__file__))
    setup_file = os.path.join(module_dir, 'setup.py')
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    executor_cls = import_utils.import_class_by_path(executor_class_path)
    self._executor = executor_cls(beam_pipeline_args=beam_pipeline_args)
    self._input_dict = input_dict
    self._output_dict = types.parse_tfx_type_dict(outputs)
    self._exec_properties = exec_properties
    self._component_name = to_snake_case(name)
def __init__(self, executor_cls, name, input_dict, outputs, exec_properties):
    """Build the executor, record the execution arguments, set up logging.

    Args:
        executor_cls: Executor class; called with the Beam pipeline args as
            its single positional argument.
        name: Component name; stored in snake case.
        input_dict: Input dictionary, stored as given.
        outputs: Serialized outputs, parsed with
            ``types.parse_tfx_type_dict``.
        exec_properties: Execution properties. Must contain ``'log_root'``;
            the optional ``'beam_pipeline_args'`` entry supplies extra Beam
            arguments.

    Raises:
        KeyError: If ``exec_properties`` has no ``'log_root'`` entry.
    """
    raw_args = exec_properties.get('beam_pipeline_args', [])

    # Beam expects str types for its pipeline args, so Python 2 `unicode`
    # values are encoded to ASCII str. `str is bytes` is true only on
    # Python 2; the short-circuit keeps Python 3, which has no `unicode`
    # builtin, from evaluating that name and raising NameError.
    beam_pipeline_args = []
    for arg in raw_args:
        if str is bytes and isinstance(arg, unicode):
            arg = arg.encode('ascii', 'ignore')
        beam_pipeline_args.append(arg)

    # setup.py is looked up one directory above the tfx package.
    module_dir = os.path.dirname(os.path.dirname(tfx.__file__))
    setup_file = os.path.join(module_dir, 'setup.py')
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    self._executor = executor_cls(beam_pipeline_args)
    self._input_dict = input_dict
    self._output_dict = types.parse_tfx_type_dict(outputs)
    self._exec_properties = exec_properties
    self._component_name = to_snake_case(name)
    self._logger = logging_utils.get_logger(
        exec_properties['log_root'], self._component_name + '_driver.logs')
def testParseTfxTypeDictDeprecated(self):
    """The legacy parse_tfx_type_dict still parses and warns exactly once."""
    with mock.patch.object(tf_logging, 'warning') as warning_spy:
        parsed = types.parse_tfx_type_dict('{}')
        self.assertEqual({}, parsed)

        warning_spy.assert_called_once()
        positional_args = warning_spy.call_args[0]
        # NOTE(review): the rename notice is expected at positional index 5
        # of the warning call -- presumably set by the deprecation helper's
        # message format; confirm if that helper changes.
        self.assertIn(
            'tfx.utils.types.parse_tfx_type_dict has been renamed to',
            positional_args[5])
def publish_exec(self, cache_task_name, exec_task_name, **kwargs):
    """Publish the artifacts produced by this execution to the pipeline.

    Args:
        cache_task_name: Task id whose xcom values seed the execution args.
        exec_task_name: Task id whose xcom ``return_value`` holds the
            outputs produced by the exec operator.
        **kwargs: Must contain ``'ti'``, the current task instance.
    """
    ti = kwargs['ti']
    self._refresh_execution_args_from_xcom(ti, cache_task_name)

    # The exec operator's outputs replace the ones loaded from the cache task.
    produced_outputs = ti.xcom_pull(
        key='return_value', task_ids=exec_task_name)
    self._output_dict = parse_tfx_type_dict(produced_outputs)

    published = self._publish_execution_to_metadata()
    self._publish_outputs_to_pipeline(ti, published)
def __init__(
    self,
    executor_class_path: Text,
    name: Text,
    input_dict: Dict[Text, List[types.TfxArtifact]],
    outputs: Text,
    exec_properties: Dict[Text, Any],
):
    """Record the execution arguments and build the executor with a Context.

    Args:
        executor_class_path: Path of the executor class, resolved with
            ``import_utils.import_class_by_path``.
        name: Component name; stored in snake case.
        input_dict: Input artifacts, stored as given.
        outputs: Serialized outputs, parsed with
            ``types.parse_tfx_type_dict``.
        exec_properties: Execution properties. Must contain
            ``'output_dir'``; the optional ``'beam_pipeline_args'`` entry
            supplies extra Beam arguments.

    The ``WORKFLOW_ID`` environment variable must be set.
    """
    self._input_dict = input_dict
    self._output_dict = types.parse_tfx_type_dict(outputs)
    self._component_name = to_snake_case(name)
    self._exec_properties = exec_properties
    self._output_dir = exec_properties['output_dir']
    self._workflow_id = os.environ['WORKFLOW_ID']

    # Beam expects str types for its pipeline args, so unicode values are
    # encoded on Python 2. The six.PY2 guard keeps Python 3, which has no
    # `unicode` type, from ever evaluating that name.
    beam_pipeline_args = [
        arg.encode('ascii', 'ignore')
        if six.PY2 and isinstance(arg, unicode) else arg
        for arg in exec_properties.get('beam_pipeline_args', [])
    ]

    # TODO(zhitaoli): Revisit usage of setup_file here.
    package_parent = os.path.dirname(os.path.dirname(version.__file__))
    setup_file = os.path.join(package_parent, 'setup.py')
    tf.logging.info('Using setup_file \'%s\' to capture TFX dependencies',
                    setup_file)
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    executor_cls = import_utils.import_class_by_path(executor_class_path)

    # TODO(swoonna): Switch to execution_id when available
    unique_id = '{}_{}'.format(self._component_name, self._workflow_id)
    # The trailing '' makes the joined temp path end with a separator.
    tmp_dir = os.path.join(self._output_dir, '.temp', '')

    # TODO(swoonna): Add tmp_dir to additional_pipeline_args
    context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=beam_pipeline_args,
        tmp_dir=tmp_dir,
        unique_id=unique_id)
    self._executor = executor_cls(context)
def __init__(self, executor_cls, name, input_dict, outputs, exec_properties):
    """Build the executor, record the execution arguments, set up logging.

    Args:
        executor_cls: Executor class; called with the Beam pipeline args as
            its single positional argument.
        name: Component name; stored in snake case.
        input_dict: Input dictionary, stored as given.
        outputs: Serialized outputs, parsed with
            ``types.parse_tfx_type_dict``.
        exec_properties: Execution properties. Must contain ``'log_root'``;
            the optional ``'beam_pipeline_args'`` entry supplies extra Beam
            arguments.

    Raises:
        KeyError: If ``exec_properties`` has no ``'log_root'`` entry.
    """
    raw_args = exec_properties.get('beam_pipeline_args', [])

    # Beam expects str types for its pipeline args, so Python 2 `unicode`
    # values are encoded to ASCII str. `str is bytes` is true only on
    # Python 2; the short-circuit keeps Python 3, which has no `unicode`
    # builtin, from evaluating that name and raising NameError.
    beam_pipeline_args = []
    for arg in raw_args:
        if str is bytes and isinstance(arg, unicode):
            arg = arg.encode('ascii', 'ignore')
        beam_pipeline_args.append(arg)

    # setup.py is looked up one directory above the tfx package.
    module_dir = os.path.dirname(os.path.dirname(tfx.__file__))
    setup_file = os.path.join(module_dir, 'setup.py')
    beam_pipeline_args.append('--setup_file={}'.format(setup_file))

    self._executor = executor_cls(beam_pipeline_args)
    self._input_dict = input_dict
    self._output_dict = types.parse_tfx_type_dict(outputs)
    self._exec_properties = exec_properties
    self._component_name = to_snake_case(name)
    self._logger = logging_utils.get_logger(
        exec_properties['log_root'], self._component_name + '_driver.logs')
def _run_executor(args, pipeline_args) -> None:
    r"""Select a particular executor and run it based on its class path.

    # pylint: disable=line-too-long
    _run_executor() invokes a class subclassing
    tfx.components.base.base_executor.BaseExecutor. It can be used both for
    invoking the executor on remote environments and for unit testing of
    executors.

    How to invoke an executor as standalone:
    # TODO(b/132958430): Create utility script to generate arguments for run_executor.py
    First, the input data needs to be prepared. An easy way to generate the
    test data is to fully run the pipeline once. This generates the data to be
    used for testing and also logs the artifacts to be used as input
    parameters. Each executed component writes log entries similar to the
    following:
    ```
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,116] {base_executor.py:72} INFO - Starting Executor execution.
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:74} INFO - Inputs for Executor is: {"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:76} INFO - Outputs for Executor is: {"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]}
    [2019-05-16 08:59:27,117] {logging_mixin.py:95} INFO - [2019-05-16 08:59:27,117] {base_executor.py:78} INFO - Execution properties for Executor is: {"output": "{ \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```
    Each of these maps directly to an input parameter expected by
    run_executor():
    ```
    python scripts/run_executor.py \
        --executor_class_path=tfx.components.example_gen.big_query_example_gen.executor.Executor \
        --inputs={"input_base": [{"artifact": {"id": "1", "typeId": "1", "uri": "/usr/local/google/home/khaas/taxi/data/simple", "properties": {"split": {"stringValue": ""}, "state": {"stringValue": "published"}, "span": {"intValue": "1"}, "type_name": {"stringValue": "ExternalPath"}}}, "artifact_type": {"id": "1", "name": "ExternalPath", "properties": {"span": "INT", "name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING"}}}]} \
        --outputs={"examples": [{"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/train/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "train"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}, {"artifact": {"uri": "/usr/local/google/home/khaas/tfx/pipelines/chicago_taxi_simple/CsvExampleGen/examples/1/eval/", "properties": {"type_name": {"stringValue": "ExamplesPath"}, "split": {"stringValue": "eval"}, "span": {"intValue": "1"}}}, "artifact_type": {"name": "ExamplesPath", "properties": {"name": "STRING", "type_name": "STRING", "split": "STRING", "state": "STRING", "span": "INT"}}}]} \
        --exec-properties={"output": "{ \"splitConfig\": {\"splits\": [{\"name\": \"train\", \"hashBuckets\": 2}, {\"name\": \"eval\",\"hashBuckets\": 1}]}}"}
    ```
    # pylint: disable=line-too-long

    Args:
        args: Parsed command-line namespace with the attributes:
            - inputs / inputs_base64: The input artifacts for this
              execution, serialized as JSON. The plain value is used when
              truthy; otherwise the base64 variant is decoded.
            - outputs / outputs_base64: The output artifacts to be generated
              by this execution, serialized as JSON. Same fallback rule.
            - exec_properties / exec_properties_base64: The execution
              properties to be used by this execution, serialized as JSON.
              Same fallback rule.
            - executor_class_path: Path of the executor class to import.
            - temp_directory_path: Passed to the executor context as
              tmp_dir.
            - write_outputs_stdout: When truthy, the serialized outputs are
              printed after the executor finishes.
        pipeline_args: Optional parameter that maps to the
            optional_pipeline_args parameter in the pipeline, which provides
            additional configuration options for apache-beam and
            tensorflow.logging. Passed to the executor context as
            beam_pipeline_args.

    Returns:
        None
    """
    tf.logging.set_verbosity(tf.logging.INFO)

    # Resolve the three serialized payloads, preferring the plain-text flags.
    serialized_inputs = args.inputs or base64.b64decode(args.inputs_base64)
    serialized_outputs = args.outputs or base64.b64decode(args.outputs_base64)
    serialized_properties = (
        args.exec_properties
        or base64.b64decode(args.exec_properties_base64))

    inputs = types.parse_tfx_type_dict(serialized_inputs)
    outputs = types.parse_tfx_type_dict(serialized_outputs)
    exec_properties = json.loads(serialized_properties)

    summary = (
        'Executor {} do: inputs: {}, outputs: {}, exec_properties: {}'.format(
            args.executor_class_path, inputs, outputs, exec_properties))
    tf.logging.info(summary)

    executor_factory = import_utils.import_class_by_path(
        args.executor_class_path)
    context = base_executor.BaseExecutor.Context(
        beam_pipeline_args=pipeline_args,
        tmp_dir=args.temp_directory_path,
        unique_id='')
    executor = executor_factory(context)
    tf.logging.info('Starting executor')
    executor.Do(inputs, outputs, exec_properties)

    # Airflow pushes the last line written to stdout to xcom.
    if args.write_outputs_stdout:
        print(types.jsonify_tfx_type_dict(outputs))