def testScopedLabels(self):
  """Test for scoped_labels."""
  orig_labels = telemetry_utils.get_labels_dict()
  with telemetry_utils.scoped_labels({'foo': 'bar'}):
    self.assertDictEqual(telemetry_utils.get_labels_dict(),
                         dict({'foo': 'bar'}, **orig_labels))
    with telemetry_utils.scoped_labels({'inner': 'baz'}):
      self.assertDictEqual(
          telemetry_utils.get_labels_dict(),
          dict({
              'foo': 'bar',
              'inner': 'baz'
          }, **orig_labels))
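# --- Illustrative sketch (not the actual tfx.utils.telemetry_utils source) ---
# The test above exercises a scoped-label helper. Assuming the helper is built
# around a module-level dict plus a contextmanager, a minimal implementation
# with the same observable behavior could look roughly like this. All names
# below are hypothetical stand-ins for whatever telemetry_utils really does.
import contextlib
import copy

_LABELS = {}


def get_labels_dict():
  """Returns a copy of the labels currently in scope."""
  return copy.deepcopy(_LABELS)


@contextlib.contextmanager
def scoped_labels(labels):
  """Overlays `labels` for the duration of the `with` block, then restores."""
  global _LABELS
  saved = copy.deepcopy(_LABELS)
  _LABELS.update(labels)
  try:
    yield
  finally:
    _LABELS = saved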
def run(self, pipeline: pipeline_py.Pipeline) -> None: """Runs given logical pipeline locally. Args: pipeline: Logical pipeline containing pipeline args and components. """ # For CLI, while creating or updating pipeline, pipeline_args are extracted # and hence we avoid executing the pipeline. if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ: return for component in pipeline.components: # TODO(b/187122662): Pass through pip dependencies as a first-class # component flag. if isinstance(component, base_component.BaseComponent): component._resolve_pip_dependencies( # pylint: disable=protected-access pipeline.pipeline_info.pipeline_root) c = compiler.Compiler() pipeline = c.compile(pipeline) # Substitute the runtime parameter to be a concrete run_id runtime_parameter_utils.substitute_runtime_parameter( pipeline, { constants.PIPELINE_RUN_ID_PARAMETER_NAME: datetime.datetime.now().isoformat(), }) deployment_config = runner_utils.extract_local_deployment_config(pipeline) connection_config = deployment_config.metadata_connection_config logging.info('Running pipeline:\n %s', pipeline) logging.info('Using deployment config:\n %s', deployment_config) logging.info('Using connection config:\n %s', connection_config) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'local'}): # Run each component. Note that the pipeline.components list is in # topological order. # # TODO(b/171319478): After IR-based execution is used, used multi-threaded # execution so that independent components can be run in parallel. for node in pipeline.nodes: pipeline_node = node.pipeline_node node_id = pipeline_node.node_info.id executor_spec = runner_utils.extract_executor_spec( deployment_config, node_id) custom_driver_spec = runner_utils.extract_custom_driver_spec( deployment_config, node_id) component_launcher = launcher.Launcher( pipeline_node=pipeline_node, mlmd_connection=metadata.Metadata(connection_config), pipeline_info=pipeline.pipeline_info, pipeline_runtime_spec=pipeline.runtime_spec, executor_spec=executor_spec, custom_driver_spec=custom_driver_spec) logging.info('Component %s is running.', node_id) component_launcher.launch() logging.info('Component %s is finished.', node_id)
def testDoBlessed(self, mock_runner, _): self._model_blessing.uri = os.path.join(self._source_data_dir, 'model_validator/blessed') self._model_blessing.set_int_custom_property('blessed', 1) mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1') self._executor.Do(self._input_dict, self._output_dict, self._serialize_custom_config_under_test()) executor_class_path = '%s.%s' % (self._executor.__class__.__module__, self._executor.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( mock.ANY, self._model_push.uri, mock.ANY, mock.ANY, job_labels, ) self.assertPushed() version = self._model_push.get_string_custom_property('pushed_version') self.assertEqual( self._model_push.get_string_custom_property('pushed_destination'), 'projects/project_id/models/model_name/versions/{}'.format( version))
def testDoSkippedModelCreation(self, mock_runner, mock_run_model_inference, _): input_dict = { 'examples': [self._examples], 'model': [self._model], 'model_blessing': [self._model_blessing], } output_dict = { 'inference_result': [self._inference_result], } ai_platform_serving_args = { 'model_name': 'model_name', 'project_id': 'project_id' } # Create exe properties. exec_properties = { 'data_spec': proto_utils.proto_to_json(bulk_inferrer_pb2.DataSpec()), 'custom_config': json_utils.dumps( {executor.SERVING_ARGS_KEY: ai_platform_serving_args}), } mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1') mock_runner.create_model_for_aip_prediction_if_not_exist.return_value = False # Run executor. bulk_inferrer = executor.Executor(self._context) bulk_inferrer.Do(input_dict, output_dict, exec_properties) ai_platform_prediction_model_spec = ( model_spec_pb2.AIPlatformPredictionModelSpec( project_id='project_id', model_name='model_name', version_name=self._model_version)) ai_platform_prediction_model_spec.use_serialization_config = True inference_endpoint = model_spec_pb2.InferenceSpecType() inference_endpoint.ai_platform_prediction_model_spec.CopyFrom( ai_platform_prediction_model_spec) mock_run_model_inference.assert_called_once_with(mock.ANY, mock.ANY, mock.ANY, mock.ANY, mock.ANY, inference_endpoint) executor_class_path = '%s.%s' % (bulk_inferrer.__class__.__module__, bulk_inferrer.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.make_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( serving_path=path_utils.serving_model_path(self._model.uri), model_version_name=mock.ANY, ai_platform_serving_args=ai_platform_serving_args, labels=job_labels, api=mock.ANY, skip_model_endpoint_creation=True, set_default=False) mock_runner.delete_model_from_aip_if_exists.assert_called_once_with( model_version_name=mock.ANY, ai_platform_serving_args=ai_platform_serving_args, api=mock.ANY, delete_model_endpoint=False)
def testDoBlessed_Vertex(self, mock_runner): endpoint_uri = 'projects/project_id/locations/us-central1/endpoints/12345' mock_runner.deploy_model_for_aip_prediction.return_value = endpoint_uri self._model_blessing.uri = os.path.join(self._source_data_dir, 'model_validator/blessed') self._model_blessing.set_int_custom_property('blessed', 1) self._executor.Do(self._input_dict, self._output_dict, self._serialize_custom_config_under_test_vertex()) executor_class_path = '%s.%s' % (self._executor.__class__.__module__, self._executor.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.make_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( serving_container_image_uri=self._container_image_uri_vertex, model_version_name=mock.ANY, ai_platform_serving_args=mock.ANY, labels=job_labels, serving_path=self._model_push.uri, endpoint_region='us-central1', enable_vertex=True, ) self.assertPushed() self.assertEqual( self._model_push.get_string_custom_property('pushed_destination'), endpoint_uri)
def __init__(self, context: Optional[Context] = None):
  """Constructs a beam based executor."""
  super().__init__(context)
  self._beam_pipeline_args = None
  if context:
    if isinstance(context, BaseBeamExecutor.Context):
      self._beam_pipeline_args = context.beam_pipeline_args
    else:
      raise ValueError(
          'BaseBeamExecutor found initialized with '
          'BaseExecutorSpec. Please use BeamExecutorSpec for '
          'Beam Components instead.')
  if self._beam_pipeline_args:
    self._beam_pipeline_args = dependency_utils.make_beam_dependency_flags(
        self._beam_pipeline_args)
    executor_class_path = '%s.%s' % (self.__class__.__module__,
                                     self.__class__.__name__)
    # TODO(zhitaoli): Rethink how we can add labels and only normalize them
    # if the job is submitted against GCP.
    with telemetry_utils.scoped_labels(
        {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
      self._beam_pipeline_args.extend(
          telemetry_utils.make_beam_labels_args())

    # TODO(b/174174381): Don't use beam_pipeline_args to set ABSL flags.
    flags.FLAGS(sys.argv + self._beam_pipeline_args, known_only=True)
def setUp(self): super(RunnerTest, self).setUp() self._output_data_dir = os.path.join( os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()), self._testMethodName) self._project_id = '12345' self._mock_api_client = mock.Mock() self._inputs = {} self._outputs = {} self._training_inputs = { 'project': self._project_id, } self._job_id = 'my_jobid' # Dict format of exec_properties. custom_config needs to be serialized # before being passed into start_aip_training function. self._exec_properties = { 'custom_config': { executor.TRAINING_ARGS_KEY: self._training_inputs, }, } self._model_name = 'model_name' self._ai_platform_serving_args = { 'model_name': self._model_name, 'project_id': self._project_id, } self._executor_class_path = 'my.executor.Executor' with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}): self._job_labels = telemetry_utils.get_labels_dict()
def __init__(self, context: Optional[Context] = None):
  """Constructs a beam based executor."""
  self._context = context
  self._beam_pipeline_args = context.beam_pipeline_args if context else None

  if self._beam_pipeline_args:
    if beam:
      self._beam_pipeline_args = dependency_utils.make_beam_dependency_flags(
          self._beam_pipeline_args)
      executor_class_path = '%s.%s' % (self.__class__.__module__,
                                       self.__class__.__name__)
      # TODO(zhitaoli): Rethink how we can add labels and only normalize them
      # if the job is submitted against GCP.
      with telemetry_utils.scoped_labels(
          {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}):
        self._beam_pipeline_args.extend(
            telemetry_utils.make_beam_labels_args())

      # TODO(b/174174381): Don't use beam_pipeline_args to set ABSL flags.
      flags.FLAGS(sys.argv + self._beam_pipeline_args, known_only=True)
    else:
      # TODO(b/156000550): We should not specialize `Context` to embed beam
      # pipeline args. Instead, the `Context` should consist of generic
      # purpose `extra_flags` which can be interpreted differently by
      # different implementations of executors.
      absl.logging.warning(
          'Executor context\'s beam_pipeline_args is being ignored because '
          'Apache Beam is not installed.')
def testScopedLabels(self): """Test for scoped_labels.""" orig_labels = telemetry_utils.make_labels_dict() with telemetry_utils.scoped_labels({'foo': 'bar'}): self.assertDictEqual(telemetry_utils.make_labels_dict(), dict({'foo': 'bar'}, **orig_labels)) with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_EXECUTOR: 'custom_component.custom_executor' }): self.assertDictEqual( telemetry_utils.make_labels_dict(), dict( { 'foo': 'bar', telemetry_utils.LABEL_TFX_EXECUTOR: 'third_party_executor' }, **orig_labels)) with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_EXECUTOR: 'tfx.components.example_gen.import_example_gen.executor.Executor' }): self.assertDictEqual( telemetry_utils.make_labels_dict(), dict( { 'foo': 'bar', telemetry_utils.LABEL_TFX_EXECUTOR: # Label is normalized. 'tfx-components-example_gen-import_example_gen-executor-executor' }, **orig_labels)) with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_EXECUTOR: 'tfx.extensions.google_cloud_big_query.example_gen.executor.Executor' }): self.assertDictEqual( telemetry_utils.make_labels_dict(), dict( { 'foo': 'bar', telemetry_utils.LABEL_TFX_EXECUTOR: # Label is normalized. 'tfx-extensions-google_cloud_big_query-example_gen-executor-exec' }, **orig_labels))
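# --- Illustrative sketch (not the actual normalization code) ---
# The test above shows that executor labels are normalized before use. Working
# backwards from the expected values, a normalizer with this shape would
# reproduce that behavior: first-party ('tfx.'-prefixed) class paths are
# lowercased, dots become dashes, and the value is truncated to the 63-char
# GCP label-value limit; anything else collapses to 'third_party_executor'.
# The function name and constant below are hypothetical.
_MAX_LABEL_VALUE_LEN = 63


def _normalize_executor_label(executor_class_path: str) -> str:
  if not executor_class_path.startswith('tfx.'):
    return 'third_party_executor'
  return executor_class_path.lower().replace('.', '-')[:_MAX_LABEL_VALUE_LEN]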
def create_training_job(self, input_dict: Dict[str, List[types.Artifact]], output_dict: Dict[str, List[types.Artifact]], exec_properties: Dict[str, Any], executor_class_path: str, job_args: Dict[str, Any], job_id: Optional[str]) -> Dict[str, Any]: """Get training args for runner._launch_aip_training. The training args contain the inputs/outputs/exec_properties to the tfx.scripts.run_executor module. Args: input_dict: Passthrough input dict for tfx.components.Trainer.executor. output_dict: Passthrough input dict for tfx.components.Trainer.executor. exec_properties: Passthrough input dict for tfx.components.Trainer.executor. executor_class_path: class path for TFX core default trainer. job_args: Training input argument for AI Platform training job. 'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput job_id: Job ID for AI Platform Training job. If not supplied, system-determined unique ID is given. Refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job Returns: A dict containing the training arguments """ training_inputs = job_args.copy() container_command = self.generate_container_command(input_dict, output_dict, exec_properties, executor_class_path) if not training_inputs.get('masterConfig'): training_inputs['masterConfig'] = { 'imageUri': _TFX_IMAGE, } # Always use our own entrypoint instead of relying on container default. if 'containerCommand' in training_inputs['masterConfig']: logging.warn('Overriding custom value of containerCommand') training_inputs['masterConfig']['containerCommand'] = container_command with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.make_labels_dict() # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified. job_id = job_id or 'tfx_{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S')) caip_job = { 'job_id': job_id, 'training_input': training_inputs, 'labels': job_labels } return caip_job
def main(): # Log to the container's stdout so it can be streamed by the orchestrator. logging.basicConfig(stream=sys.stdout, level=logging.INFO) logging.getLogger().setLevel(logging.INFO) parser = argparse.ArgumentParser() parser.add_argument('--pipeline_name', type=str, required=True) parser.add_argument('--pipeline_root', type=str, required=True) parser.add_argument('--run_id', type=str, required=True) parser.add_argument('--metadata_config', type=str, required=True) parser.add_argument('--beam_pipeline_args', type=str, required=True) parser.add_argument('--additional_pipeline_args', type=str, required=True) parser.add_argument('--component_launcher_class_path', type=str, required=True) parser.add_argument('--enable_cache', action='store_true') parser.add_argument('--serialized_component', type=str, required=True) parser.add_argument('--component_config', type=str, required=True) args = parser.parse_args() component = json_utils.loads(args.serialized_component) component_config = json_utils.loads(args.component_config) component_launcher_class = import_utils.import_class_by_path( args.component_launcher_class_path) if not issubclass(component_launcher_class, base_component_launcher.BaseComponentLauncher): raise TypeError( 'component_launcher_class "%s" is not subclass of base_component_launcher.BaseComponentLauncher' % component_launcher_class) metadata_config = metadata_store_pb2.ConnectionConfig() json_format.Parse(args.metadata_config, metadata_config) driver_args = data_types.DriverArgs(enable_cache=args.enable_cache) beam_pipeline_args = json.loads(args.beam_pipeline_args) additional_pipeline_args = json.loads(args.additional_pipeline_args) launcher = component_launcher_class.create( component=component, pipeline_info=data_types.PipelineInfo( pipeline_name=args.pipeline_name, pipeline_root=args.pipeline_root, run_id=args.run_id, ), driver_args=driver_args, metadata_connection=metadata.Metadata( connection_config=metadata_config), beam_pipeline_args=beam_pipeline_args, additional_pipeline_args=additional_pipeline_args, component_config=component_config) # Attach necessary labels to distinguish different runner and DSL. with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_RUNNER: 'kubernetes', }): launcher.launch()
def run( self, component: base_node.BaseNode, enable_cache: bool = True, beam_pipeline_args: Optional[List[Text]] = None ) -> execution_result.ExecutionResult: """Run a given TFX component in the interactive context. Args: component: Component instance to be run. enable_cache: whether caching logic should be enabled in the driver. beam_pipeline_args: Optional Beam pipeline args for beam jobs within executor. Executor will use beam DirectRunner as Default. If provided, will override beam_pipeline_args specified in constructor. Returns: execution_result.ExecutionResult object. """ run_id = datetime.datetime.now().isoformat() pipeline_info = data_types.PipelineInfo( pipeline_name=self.pipeline_name, pipeline_root=self.pipeline_root, run_id=run_id) driver_args = data_types.DriverArgs(enable_cache=enable_cache, interactive_resolution=True) metadata_connection = metadata.Metadata( self.metadata_connection_config) beam_pipeline_args = beam_pipeline_args or self.beam_pipeline_args additional_pipeline_args = {} for name, output in component.outputs.items(): for artifact in output.get(): artifact.pipeline_name = self.pipeline_name artifact.producer_component = component.id artifact.name = name # Special treatment for pip dependencies. # TODO(b/187122662): Pass through pip dependencies as a first-class # component flag. if isinstance(component, base_component.BaseComponent): component._resolve_pip_dependencies(self.pipeline_root) # pylint: disable=protected-access # TODO(hongyes): figure out how to resolve launcher class in the interactive # context. launcher = in_process_component_launcher.InProcessComponentLauncher.create( component, pipeline_info, driver_args, metadata_connection, beam_pipeline_args, additional_pipeline_args) try: import colab # pytype: disable=import-error # pylint: disable=g-import-not-at-top, unused-import, unused-variable runner_label = 'interactivecontext-colab' except ImportError: runner_label = 'interactivecontext' with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_RUNNER: runner_label, }): execution_id = launcher.launch().execution_id return execution_result.ExecutionResult(component=component, execution_id=execution_id)
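# A hedged usage sketch for the run() method above, as it might be called from
# a notebook. Import paths and the CsvExampleGen argument name can differ
# across TFX versions, and '/path/to/csv_dir' is a placeholder.
from tfx.components import CsvExampleGen
from tfx.orchestration.experimental.interactive.interactive_context import (
    InteractiveContext)

context = InteractiveContext()
example_gen = CsvExampleGen(input_base='/path/to/csv_dir')
result = context.run(example_gen, enable_cache=False)
print(result.execution_id)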
def run(self, pipeline: tfx_pipeline.Pipeline, parameter_values: Optional[Dict[Text, Any]] = None, write_out: Optional[bool] = True) -> Dict[Text, Any]: """Compiles a pipeline DSL object into pipeline file. Args: pipeline: TFX pipeline object. parameter_values: mapping from runtime parameter names to its values. write_out: set to True to actually write out the file to the place designated by output_dir and output_filename. Otherwise return the JSON-serialized pipeline job spec. Returns: Returns the JSON pipeline job spec. Raises: RuntimeError: if trying to write out to a place occupied by an existing file. """ # TODO(b/166343606): Support user-provided labels. # TODO(b/169095387): Deprecate .run() method in favor of the unified API # client. display_name = (self._config.display_name or pipeline.pipeline_info.pipeline_name) pipeline_spec = pipeline_builder.PipelineBuilder( tfx_pipeline=pipeline, default_image=self._config.default_image, default_commands=self._config.default_commands).build() pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__) pipeline_spec.schema_version = _SCHEMA_VERSION runtime_config = pipeline_builder.RuntimeConfigBuilder( pipeline_info=pipeline.pipeline_info, parameter_values=parameter_values).build() with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}): result = pipeline_spec_pb2.PipelineJob( display_name=display_name or pipeline.pipeline_info.pipeline_name, labels=telemetry_utils.get_labels_dict(), runtime_config=runtime_config) result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec)) pipeline_json_dict = json_format.MessageToDict(result) if write_out: if fileio.exists( self._output_dir) and not fileio.isdir(self._output_dir): raise RuntimeError('Output path: %s is pointed to a file.' % self._output_dir) if not fileio.exists(self._output_dir): fileio.makedirs(self._output_dir) with fileio.open( os.path.join(self._output_dir, self._output_filename), 'wb') as f: f.write(json.dumps(pipeline_json_dict, sort_keys=True)) return pipeline_json_dict
def run(self, tfx_pipeline: pipeline.Pipeline) -> None: """Deploys given logical pipeline on Beam. Args: tfx_pipeline: Logical pipeline containing pipeline args and components. """ # For CLI, while creating or updating pipeline, pipeline_args are extracted # and hence we avoid executing the pipeline. if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ: return tfx_pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat() with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}): with beam.Pipeline(argv=self._beam_orchestrator_args) as p: # Uses for triggering the component DoFns. root = p | 'CreateRoot' >> beam.Create([None]) # Stores mapping of component to its signal. signal_map = {} # pipeline.components are in topological order. for component in tfx_pipeline.components: # TODO(b/187122662): Pass through pip dependencies as a first-class # component flag. if isinstance(component, base_component.BaseComponent): component._resolve_pip_dependencies( # pylint: disable=protected-access tfx_pipeline.pipeline_info.pipeline_root) component_id = component.id # Signals from upstream components. signals_to_wait = [] if component.upstream_nodes: for upstream_node in component.upstream_nodes: assert upstream_node in signal_map, ( 'Components is not in ' 'topological order') signals_to_wait.append(signal_map[upstream_node]) absl.logging.info( 'Component %s depends on %s.', component_id, [s.producer.full_label for s in signals_to_wait]) (component_launcher_class, component_config ) = config_utils.find_component_launch_info( self._config, component) # Each signal is an empty PCollection. AsIter ensures component will # be triggered after upstream components are finished. signal_map[component] = ( root | 'Run[%s]' % component_id >> beam.ParDo( _ComponentAsDoFn(component, component_launcher_class, component_config, tfx_pipeline), * [beam.pvalue.AsIter(s) for s in signals_to_wait])) absl.logging.info('Component %s is scheduled.', component_id)
def _assertDeployModelMockCalls(self, expected_models_create_body=None, expected_versions_create_body=None, expect_set_default=True): if not expected_models_create_body: expected_models_create_body = { 'name': self._model_name, 'regions': [], } if not expected_versions_create_body: with telemetry_utils.scoped_labels({ telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path }): labels = telemetry_utils.get_labels_dict() expected_versions_create_body = { 'name': self._model_version, 'deployment_uri': self._serving_path, 'runtime_version': runner._get_tf_runtime_version(tf.__version__), 'python_version': runner._get_caip_python_version( runner._get_tf_runtime_version(tf.__version__)), 'labels': labels } self._mock_models_create.assert_called_with( body=mock.ANY, parent='projects/{}'.format(self._project_id), ) (_, models_create_kwargs) = self._mock_models_create.call_args self.assertDictEqual(expected_models_create_body, models_create_kwargs['body']) self._mock_versions_create.assert_called_with( body=mock.ANY, parent='projects/{}/models/{}'.format(self._project_id, self._model_name)) (_, versions_create_kwargs) = self._mock_versions_create.call_args self.assertDictEqual(expected_versions_create_body, versions_create_kwargs['body']) if not expect_set_default: return self._mock_set_default.assert_called_with( name='projects/{}/models/{}/versions/{}'.format( self._project_id, self._model_name, self._model_version)) self._mock_set_default_execute.assert_called_with()
def run(self, pipeline: pipeline_pb2.Pipeline) -> None: """Deploys given logical pipeline on Beam. Args: pipeline: Logical pipeline in IR format. """ # For CLI, while creating or updating pipeline, pipeline_args are extracted # and hence we avoid deploying the pipeline. if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ: return # TODO(b/163003901): Support beam DAG runner args through IR. # TODO(b/163003901): MLMD connection config should be passed in via IR. connection_config = metadata_store_pb2.ConnectionConfig() connection_config.sqlite.SetInParent() mlmd_connection = metadata.Metadata( connection_config=connection_config) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}): with beam.Pipeline() as p: # Uses for triggering the component DoFns. root = p | 'CreateRoot' >> beam.Create([None]) # Stores mapping of component to its signal. signal_map = {} # pipeline.components are in topological order. for node in pipeline.nodes: # TODO(b/160882349): Support subpipeline pipeline_node = node.pipeline_node component_id = pipeline_node.node_info.id # Signals from upstream components. signals_to_wait = [] for upstream_node in pipeline_node.upstream_nodes: assert upstream_node in signal_map, ('Components is not in ' 'topological order') signals_to_wait.append(signal_map[upstream_node]) logging.info('Component %s depends on %s.', component_id, [s.producer.full_label for s in signals_to_wait]) # Each signal is an empty PCollection. AsIter ensures component will # be triggered after upstream components are finished. # LINT.IfChange signal_map[component_id] = ( root | 'Run[%s]' % component_id >> beam.ParDo( _PipelineNodeAsDoFn(pipeline_node, mlmd_connection, pipeline.pipeline_info, pipeline.runtime_spec), * [beam.pvalue.AsIter(s) for s in signals_to_wait])) # LINT.ThenChange(../beam/beam_dag_runner.py) logging.info('Component %s is scheduled.', component_id)
def __init__(self, context: Optional[Context] = None): """Constructs a beam based executor.""" self._context = context self._beam_pipeline_args = context.beam_pipeline_args if context else None if self._beam_pipeline_args: self._beam_pipeline_args = dependency_utils.make_beam_dependency_flags( self._beam_pipeline_args) executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) # TODO(zhitaoli): Rethink how we can add labels and only normalize them # if the job is submitted against GCP. with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): self._beam_pipeline_args.extend(telemetry_utils.make_beam_labels_args())
def _airflow_component_launcher( component: base_node.BaseNode, component_launcher_class: Type[ base_component_launcher.BaseComponentLauncher], pipeline_info: data_types.PipelineInfo, driver_args: data_types.DriverArgs, metadata_connection_config: metadata_store_pb2.ConnectionConfig, beam_pipeline_args: List[Text], additional_pipeline_args: Dict[Text, Any], component_config: base_component_config.BaseComponentConfig, exec_properties: Dict[Text, Any], **kwargs) -> None: """Helper function to launch TFX component execution. This helper function will be called with Airflow env objects which contains run_id that we need to pass into TFX ComponentLauncher. Args: component: TFX BaseComponent instance. This instance holds all inputs and outputs placeholders as well as component properties. component_launcher_class: The class of the launcher to launch the component. pipeline_info: A data_types.PipelineInfo instance that holds pipeline properties driver_args: Component specific args for driver. metadata_connection_config: Configuration for how to connect to metadata. beam_pipeline_args: Pipeline arguments for Beam powered Components. additional_pipeline_args: A dict of additional pipeline args. component_config: Component config to launch the component. exec_properties: Execution properties from the ComponentSpec. **kwargs: Context arguments that will be passed in by Airflow, including: - ti: TaskInstance object from which we can get run_id of the running pipeline. For more details, please refer to the code: https://github.com/apache/airflow/blob/master/airflow/operators/python_operator.py """ component.exec_properties.update(exec_properties) # Populate run id from Airflow task instance. pipeline_info.run_id = kwargs['ti'].get_dagrun().run_id launcher = component_launcher_class.create( component=component, pipeline_info=pipeline_info, driver_args=driver_args, metadata_connection=metadata.Metadata(metadata_connection_config), beam_pipeline_args=beam_pipeline_args, additional_pipeline_args=additional_pipeline_args, component_config=component_config) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'airflow'}): launcher.launch()
def testDeployModelForAIPPredictionWithCustomRegion(self, mock_discovery):
  mock_discovery.build.return_value = self._mock_api_client
  self._setUpPredictionMocks()

  self._ai_platform_serving_args['regions'] = ['custom-region']
  runner.deploy_model_for_aip_prediction(self._serving_path,
                                         self._model_version,
                                         self._ai_platform_serving_args,
                                         self._executor_class_path)

  self._mock_models_create.assert_called_with(
      body=mock.ANY,
      parent='projects/{}'.format(self._project_id),
  )
  (_, models_create_kwargs) = self._mock_models_create.call_args
  models_create_body = models_create_kwargs['body']
  self.assertDictEqual(
      {
          'name': 'model_name',
          'regions': ['custom-region']
      }, models_create_body)

  self._mock_versions_create.assert_called_with(
      body=mock.ANY,
      parent='projects/{}/models/{}'.format(self._project_id, 'model_name'))
  (_, versions_create_kwargs) = self._mock_versions_create.call_args
  versions_create_body = versions_create_kwargs['body']
  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}):
    labels = telemetry_utils.get_labels_dict()
  runtime_version = runner._get_tf_runtime_version(tf.__version__)
  self.assertDictEqual(
      {
          'name': self._model_version,
          'deployment_uri': self._serving_path,
          'runtime_version': runtime_version,
          'python_version': runner._get_caip_python_version(runtime_version),
          'labels': labels,
      }, versions_create_body)

  self._mock_get.assert_called_with(name='op_name')
  self._mock_set_default.assert_called_with(
      name='projects/{}/models/{}/versions/{}'.format(
          self._project_id, 'model_name', self._model_version))
  self._mock_set_default_execute.assert_called_with()
def run(self, tfx_pipeline: pipeline.Pipeline) -> None: """Runs given logical pipeline locally. Args: tfx_pipeline: Logical pipeline containing pipeline args and components. """ # For CLI, while creating or updating pipeline, pipeline_args are extracted # and hence we avoid executing the pipeline. if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ: return tfx_pipeline.pipeline_info.run_id = datetime.datetime.now().isoformat() with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'local'}): # Run each component. Note that the pipeline.components list is in # topological order. # # TODO(b/171319478): After IR-based execution is used, used multi-threaded # execution so that independent components can be run in parallel. for component in tfx_pipeline.components: # TODO(b/187122662): Pass through pip dependencies as a first-class # component flag. if isinstance(component, base_component.BaseComponent): component._resolve_pip_dependencies( # pylint: disable=protected-access tfx_pipeline.pipeline_info.pipeline_root) (component_launcher_class, component_config) = ( config_utils.find_component_launch_info(self._config, component)) driver_args = data_types.DriverArgs( enable_cache=tfx_pipeline.enable_cache) metadata_connection = metadata.Metadata( tfx_pipeline.metadata_connection_config) node_launcher = component_launcher_class.create( component=component, pipeline_info=tfx_pipeline.pipeline_info, driver_args=driver_args, metadata_connection=metadata_connection, beam_pipeline_args=tfx_pipeline.beam_pipeline_args, additional_pipeline_args=tfx_pipeline.additional_pipeline_args, component_config=component_config) logging.info('Component %s is running.', component.id) node_launcher.launch() logging.info('Component %s is finished.', component.id)
def testMakeBeamLabelsArgs(self):
  """Test for make_beam_labels_args."""
  beam_pipeline_args = telemetry_utils.make_beam_labels_args()
  expected_beam_pipeline_args = [
      '--labels',
      'tfx_py_version=%d-%d' %
      (sys.version_info.major, sys.version_info.minor),
      '--labels',
      'tfx_version=%s' % version.__version__.replace('.', '-'),
  ]
  self.assertListEqual(expected_beam_pipeline_args, beam_pipeline_args)

  with telemetry_utils.scoped_labels(
      {telemetry_utils.LABEL_TFX_EXECUTOR: 'TestExecutor'}):
    beam_pipeline_args = telemetry_utils.make_beam_labels_args()
    expected_beam_pipeline_args = [
        '--labels',
        'tfx_executor=third_party_executor',
    ] + expected_beam_pipeline_args
    self.assertListEqual(expected_beam_pipeline_args, beam_pipeline_args)
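# A hedged sketch of how the '--labels key=value' pairs asserted above could be
# folded into Beam pipeline options. PipelineOptions is the real Apache Beam
# class; options_with_telemetry_labels is a hypothetical helper written here
# only for illustration.
from apache_beam.options.pipeline_options import PipelineOptions

from tfx.utils import telemetry_utils


def options_with_telemetry_labels(base_args):
  """Appends the telemetry label flags to an existing list of Beam args."""
  return PipelineOptions(base_args + telemetry_utils.make_beam_labels_args())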
def testDoBlessedOnRegionalEndpoint(self, mock_runner, _): self._exec_properties = { 'custom_config': { constants.SERVING_ARGS_KEY: { 'model_name': 'model_name', 'project_id': 'project_id' }, constants.ENDPOINT_ARGS_KEY: 'https://ml-us-west1.googleapis.com', }, } self._model_blessing.uri = os.path.join(self._source_data_dir, 'model_validator/blessed') self._model_blessing.set_int_custom_property('blessed', 1) mock_runner.get_service_name_and_api_version.return_value = ('ml', 'v1') version = self._model_push.get_string_custom_property('pushed_version') mock_runner.deploy_model_for_aip_prediction.return_value = ( 'projects/project_id/models/model_name/versions/{}'.format(version) ) self._executor.Do(self._input_dict, self._output_dict, self._serialize_custom_config_under_test()) executor_class_path = '%s.%s' % (self._executor.__class__.__module__, self._executor.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.make_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( serving_path=self._model_push.uri, model_version_name=mock.ANY, ai_platform_serving_args=mock.ANY, api=mock.ANY, labels=job_labels, ) self.assertPushed() self.assertEqual( self._model_push.get_string_custom_property('pushed_destination'), 'projects/project_id/models/model_name/versions/{}'.format( version))
def testDoBlessedOnRegionalEndpoint_Vertex(self, mock_runner): endpoint_uri = 'projects/project_id/locations/us-west1/endpoints/12345' mock_runner.deploy_model_for_aip_prediction.return_value = endpoint_uri self._exec_properties_vertex = { 'custom_config': { constants.SERVING_ARGS_KEY: { 'model_name': 'model_name', 'project_id': 'project_id' }, constants.VERTEX_CONTAINER_IMAGE_URI_KEY: self._container_image_uri_vertex, constants.ENABLE_VERTEX_KEY: True, constants.VERTEX_REGION_KEY: 'us-west1', }, } self._model_blessing.uri = os.path.join(self._source_data_dir, 'model_validator/blessed') self._model_blessing.set_int_custom_property('blessed', 1) self._executor.Do(self._input_dict, self._output_dict, self._serialize_custom_config_under_test_vertex()) executor_class_path = '%s.%s' % (self._executor.__class__.__module__, self._executor.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.make_labels_dict() mock_runner.deploy_model_for_aip_prediction.assert_called_once_with( serving_path=self._model_push.uri, model_version_name=mock.ANY, ai_platform_serving_args=mock.ANY, labels=job_labels, serving_container_image_uri=self._container_image_uri_vertex, endpoint_region='us-west1', enable_vertex=True, ) self.assertPushed() self.assertEqual( self._model_push.get_string_custom_property('pushed_destination'), endpoint_uri)
def testDeployModelForAIPPredictionWithCustomRuntime(self, mock_discovery): mock_discovery.build.return_value = self._mock_api_client self._setUpPredictionMocks() self._ai_platform_serving_args['runtime_version'] = '1.23.45' runner.deploy_model_for_aip_prediction(self._serving_path, self._model_version, self._ai_platform_serving_args, self._executor_class_path) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: self._executor_class_path}): labels = telemetry_utils.get_labels_dict() expected_versions_create_body = { 'name': self._model_version, 'deployment_uri': self._serving_path, 'runtime_version': '1.23.45', 'python_version': runner._get_caip_python_version('1.23.45'), 'labels': labels, } self._assertDeployModelMockCalls( expected_versions_create_body=expected_versions_create_body)
def run(self, pipeline: pipeline_pb2.Pipeline) -> None: """Deploys given logical pipeline on Beam. Args: pipeline: Logical pipeline in IR format. """ # For CLI, while creating or updating pipeline, pipeline_args are extracted # and hence we avoid deploying the pipeline. if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ: return run_id = datetime.datetime.now().isoformat() # Substitute the runtime parameter to be a concrete run_id runtime_parameter_utils.substitute_runtime_parameter( pipeline, { constants.PIPELINE_RUN_ID_PARAMETER_NAME: run_id, }) # TODO(b/163003901): Support beam DAG runner args through IR. deployment_config = self._extract_deployment_config(pipeline) connection_config = deployment_config.metadata_connection_config mlmd_connection = metadata.Metadata( connection_config=connection_config) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}): with beam.Pipeline() as p: # Uses for triggering the component DoFns. root = p | 'CreateRoot' >> beam.Create([None]) # Stores mapping of component to its signal. signal_map = {} # pipeline.components are in topological order. for node in pipeline.nodes: # TODO(b/160882349): Support subpipeline pipeline_node = node.pipeline_node component_id = pipeline_node.node_info.id executor_spec = self._extract_executor_spec( deployment_config, component_id) custom_driver_spec = self._extract_custom_driver_spec( deployment_config, component_id) # Signals from upstream components. signals_to_wait = [] for upstream_node in pipeline_node.upstream_nodes: assert upstream_node in signal_map, ( 'Components is not in ' 'topological order') signals_to_wait.append(signal_map[upstream_node]) logging.info( 'Component %s depends on %s.', component_id, [s.producer.full_label for s in signals_to_wait]) # Each signal is an empty PCollection. AsIter ensures component will # be triggered after upstream components are finished. # LINT.IfChange signal_map[component_id] = ( root | 'Run[%s]' % component_id >> beam.ParDo( self._PIPELINE_NODE_DO_FN_CLS( pipeline_node=pipeline_node, mlmd_connection=mlmd_connection, pipeline_info=pipeline.pipeline_info, pipeline_runtime_spec=pipeline.runtime_spec, executor_spec=executor_spec, custom_driver_spec=custom_driver_spec), * [beam.pvalue.AsIter(s) for s in signals_to_wait])) # LINT.ThenChange(../beam/beam_dag_runner.py) logging.info('Component %s is scheduled.', component_id)
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]): """Overrides the tfx_pusher_executor. Args: input_dict: Input dict from input key to a list of artifacts, including: - model_export: exported model from trainer. - model_blessing: model blessing path from model_validator. output_dict: Output dict from key to a list of artifacts, including: - model_push: A list of 'ModelPushPath' artifact of size one. It will include the model in this push execution if the model was pushed. exec_properties: Mostly a passthrough input dict for tfx.components.Pusher.executor. custom_config.bigquery_serving_args is consumed by this class. For the full set of parameters supported by Big Query ML, refer to https://cloud.google.com/bigquery-ml/ Returns: None Raises: ValueError: If bigquery_serving_args is not in exec_properties.custom_config. If pipeline_root is not 'gs://...' RuntimeError: if the Big Query job failed. """ self._log_startup(input_dict, output_dict, exec_properties) model_push = artifact_utils.get_single_instance( output_dict[tfx_pusher_executor.PUSHED_MODEL_KEY]) if not self.CheckBlessing(input_dict): self._MarkNotPushed(model_push) return model_export = artifact_utils.get_single_instance( input_dict[tfx_pusher_executor.MODEL_KEY]) model_export_uri = model_export.uri custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError('custom_config in execution properties needs to be a ' 'dict.') bigquery_serving_args = custom_config.get(SERVING_ARGS_KEY) # if configuration is missing error out if bigquery_serving_args is None: raise ValueError('Big Query ML configuration was not provided') bq_model_uri = '.'.join([ bigquery_serving_args[_PROJECT_ID_KEY], bigquery_serving_args[_BQ_DATASET_ID_KEY], bigquery_serving_args[_MODEL_NAME_KEY], ]) # Deploy the model. io_utils.copy_dir( src=path_utils.serving_model_path(model_export_uri), dst=model_push.uri) model_path = model_push.uri if not model_path.startswith(_GCS_PREFIX): raise ValueError('pipeline_root must be gs:// for BigQuery ML Pusher.') logging.info('Deploying the model to BigQuery ML for serving: %s from %s', bigquery_serving_args, model_path) query = _BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE.format( model_uri=bq_model_uri, model_path=model_path) # TODO(zhitaoli): Refactor the executor_class_path creation into a common # utility function. executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): default_query_job_config = bigquery.job.QueryJobConfig( labels=telemetry_utils.get_labels_dict()) client = bigquery.Client(default_query_job_config=default_query_job_config) try: query_job = client.query(query) query_job.result() # Waits for the query to finish except Exception as e: raise RuntimeError('BigQuery ML Push failed: {}'.format(e)) logging.info('Successfully deployed model %s serving from %s', bq_model_uri, model_path) # Setting the push_destination to bigquery uri self._MarkPushed(model_push, pushed_destination=bq_model_uri)
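# The query template referenced above is not shown in this snippet. A plausible
# shape, based on BigQuery ML's documented syntax for importing a TensorFlow
# SavedModel, is sketched below; the exact template used by the executor may
# differ (for example, the model_path may need a trailing '/*' wildcard).
_BQML_CREATE_OR_REPLACE_MODEL_QUERY_TEMPLATE_SKETCH = """
CREATE OR REPLACE MODEL `{model_uri}`
OPTIONS (model_type='tensorflow',
         model_path='{model_path}')
"""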
def Do(self, input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any]) -> None: """Runs batch inference on a given model with given input examples. This function creates a new model (if necessary) and a new model version before inference, and cleans up resources after inference. It provides re-executability as it cleans up (only) the model resources that are created during the process even inference job failed. Args: input_dict: Input dict from input key to a list of Artifacts. - examples: examples for inference. - model: exported model. - model_blessing: model blessing result output_dict: Output dict from output key to a list of Artifacts. - output: bulk inference results. exec_properties: A dict of execution properties. - data_spec: JSON string of bulk_inferrer_pb2.DataSpec instance. - custom_config: custom_config.ai_platform_serving_args need to contain the serving job parameters sent to Google Cloud AI Platform. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.models Returns: None """ self._log_startup(input_dict, output_dict, exec_properties) if 'examples' not in input_dict: raise ValueError('\'examples\' is missing in input dict.') if 'inference_result' not in output_dict: raise ValueError('\'inference_result\' is missing in output dict.') output = artifact_utils.get_single_instance(output_dict['inference_result']) if 'model' not in input_dict: raise ValueError('Input models are not valid, model ' 'need to be specified.') if 'model_blessing' in input_dict: model_blessing = artifact_utils.get_single_instance( input_dict['model_blessing']) if not model_utils.is_model_blessed(model_blessing): output.set_int_custom_property('inferred', 0) logging.info('Model on %s was not blessed', model_blessing.uri) return else: logging.info('Model blessing is not provided, exported model will be ' 'used.') if _CUSTOM_CONFIG_KEY not in exec_properties: raise ValueError('Input exec properties are not valid, {} ' 'need to be specified.'.format(_CUSTOM_CONFIG_KEY)) custom_config = json_utils.loads( exec_properties.get(_CUSTOM_CONFIG_KEY, 'null')) if custom_config is not None and not isinstance(custom_config, Dict): raise ValueError('custom_config in execution properties needs to be a ' 'dict.') ai_platform_serving_args = custom_config.get(SERVING_ARGS_KEY) if not ai_platform_serving_args: raise ValueError( '\'ai_platform_serving_args\' is missing in \'custom_config\'') service_name, api_version = runner.get_service_name_and_api_version( ai_platform_serving_args) executor_class_path = '%s.%s' % (self.__class__.__module__, self.__class__.__name__) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() model = artifact_utils.get_single_instance(input_dict['model']) model_path = path_utils.serving_model_path(model.uri) logging.info('Use exported model from %s.', model_path) # Use model artifact uri to generate model version to guarantee the # 1:1 mapping from model version to model. 
  model_version = 'version_' + hashlib.sha256(model.uri.encode()).hexdigest()
  inference_spec = self._get_inference_spec(model_path, model_version,
                                            ai_platform_serving_args)
  data_spec = bulk_inferrer_pb2.DataSpec()
  json_format.Parse(exec_properties['data_spec'], data_spec)
  api = discovery.build(service_name, api_version)
  new_model_created = False
  try:
    new_model_created = runner.create_model_for_aip_prediction_if_not_exist(
        api, job_labels, ai_platform_serving_args)
    runner.deploy_model_for_aip_prediction(
        api,
        model_path,
        model_version,
        ai_platform_serving_args,
        job_labels,
        skip_model_creation=True,
        set_default_version=False,
    )
    self._run_model_inference(data_spec, input_dict['examples'], output.uri,
                              inference_spec)
  except Exception as e:
    logging.error('Error in executing CloudAIBulkInferrerComponent: %s',
                  str(e))
    output.set_int_custom_property('inferred', 0)
    raise
  finally:
    # Guarantee newly created resources are cleaned up even if the inference
    # job failed.

    # Clean up the newly deployed model.
    runner.delete_model_version_from_aip_if_exists(api, model_version,
                                                   ai_platform_serving_args)
    if new_model_created:
      runner.delete_model_from_aip_if_exists(api, ai_platform_serving_args)

  # Mark the inference as successful after resources are cleaned up.
  output.set_int_custom_property('inferred', 1)
def main():
  # Log to the container's stdout so Kubeflow Pipelines UI can display logs to
  # the user.
  logging.basicConfig(stream=sys.stdout, level=logging.INFO)
  logging.getLogger().setLevel(logging.INFO)

  parser = argparse.ArgumentParser()
  parser.add_argument('--pipeline_name', type=str, required=True)
  parser.add_argument('--pipeline_root', type=str, required=True)
  parser.add_argument('--kubeflow_metadata_config', type=str, required=True)
  parser.add_argument('--beam_pipeline_args', type=str, required=True)
  parser.add_argument('--additional_pipeline_args', type=str, required=True)
  parser.add_argument('--component_launcher_class_path', type=str, required=True)
  parser.add_argument('--enable_cache', action='store_true')
  parser.add_argument('--serialized_component', type=str, required=True)
  parser.add_argument('--component_config', type=str, required=True)

  args = parser.parse_args()

  component = json_utils.loads(args.serialized_component)
  component_config = json_utils.loads(args.component_config)
  component_launcher_class = import_utils.import_class_by_path(
      args.component_launcher_class_path)
  if not issubclass(component_launcher_class,
                    base_component_launcher.BaseComponentLauncher):
    raise TypeError(
        'component_launcher_class "%s" is not subclass of '
        'base_component_launcher.BaseComponentLauncher' %
        component_launcher_class)

  kubeflow_metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  json_format.Parse(args.kubeflow_metadata_config, kubeflow_metadata_config)
  metadata_connection = kubeflow_metadata_adapter.KubeflowMetadataAdapter(
      _get_metadata_connection_config(kubeflow_metadata_config))
  driver_args = data_types.DriverArgs(enable_cache=args.enable_cache)
  beam_pipeline_args = _make_beam_pipeline_args(args.beam_pipeline_args)
  additional_pipeline_args = json.loads(args.additional_pipeline_args)

  launcher = component_launcher_class.create(
      component=component,
      pipeline_info=data_types.PipelineInfo(
          pipeline_name=args.pipeline_name,
          pipeline_root=args.pipeline_root,
          run_id=os.environ['WORKFLOW_ID']),
      driver_args=driver_args,
      metadata_connection=metadata_connection,
      beam_pipeline_args=beam_pipeline_args,
      additional_pipeline_args=additional_pipeline_args,
      component_config=component_config)

  # Attach necessary labels to distinguish different runner and DSL.
  # TODO(zhitaoli): Pass this from KFP runner side when the same container
  # entrypoint can be used by a different runner.
  with telemetry_utils.scoped_labels({
      telemetry_utils.LABEL_TFX_RUNNER: 'kfp',
  }):
    execution_info = launcher.launch()

  # Dump the UI metadata.
  _dump_ui_metadata(component, execution_info)
def run(self, pipeline: Union[pipeline_pb2.Pipeline, pipeline_py.Pipeline]) -> None: """Deploys given logical pipeline on Beam. Args: pipeline: Logical pipeline in IR format. """ # For CLI, while creating or updating pipeline, pipeline_args are extracted # and hence we avoid deploying the pipeline. if 'TFX_JSON_EXPORT_PIPELINE_ARGS_PATH' in os.environ: return if isinstance(pipeline, pipeline_py.Pipeline): c = compiler.Compiler() pipeline = c.compile(pipeline) run_id = datetime.datetime.now().strftime('%Y%m%d-%H%M%S.%f') # Substitute the runtime parameter to be a concrete run_id runtime_parameter_utils.substitute_runtime_parameter( pipeline, { constants.PIPELINE_RUN_ID_PARAMETER_NAME: run_id, }) deployment_config = self._extract_deployment_config(pipeline) connection_config = self._connection_config_from_deployment_config( deployment_config) logging.info('Running pipeline:\n %s', pipeline) logging.info('Using deployment config:\n %s', deployment_config) logging.info('Using connection config:\n %s', connection_config) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_RUNNER: 'beam'}): with beam.Pipeline() as p: # Uses for triggering the node DoFns. root = p | 'CreateRoot' >> beam.Create([None]) # Stores mapping of node to its signal. signal_map = {} # pipeline.nodes are in topological order. for node in pipeline.nodes: # TODO(b/160882349): Support subpipeline pipeline_node = node.pipeline_node node_id = pipeline_node.node_info.id executor_spec = self._extract_executor_spec( deployment_config, node_id) custom_driver_spec = self._extract_custom_driver_spec( deployment_config, node_id) # Signals from upstream nodes. signals_to_wait = [] for upstream_node in pipeline_node.upstream_nodes: assert upstream_node in signal_map, ('Nodes are not in ' 'topological order') signals_to_wait.append(signal_map[upstream_node]) logging.info('Node %s depends on %s.', node_id, [s.producer.full_label for s in signals_to_wait]) # Each signal is an empty PCollection. AsIter ensures a node will # be triggered after upstream nodes are finished. signal_map[node_id] = ( root | 'Run[%s]' % node_id >> beam.ParDo( self._PIPELINE_NODE_DO_FN_CLS( pipeline_node=pipeline_node, mlmd_connection_config=connection_config, pipeline_info=pipeline.pipeline_info, pipeline_runtime_spec=pipeline.runtime_spec, executor_spec=executor_spec, custom_driver_spec=custom_driver_spec, deployment_config=deployment_config), *[beam.pvalue.AsIter(s) for s in signals_to_wait])) logging.info('Node %s is scheduled.', node_id)
def start_aip_training(input_dict: Dict[Text, List[types.Artifact]], output_dict: Dict[Text, List[types.Artifact]], exec_properties: Dict[Text, Any], executor_class_path: Text, training_inputs: Dict[Text, Any], job_id: Optional[Text]): """Start a trainer job on AI Platform (AIP). This is done by forwarding the inputs/outputs/exec_properties to the tfx.scripts.run_executor module on a AI Platform training job interpreter. Args: input_dict: Passthrough input dict for tfx.components.Trainer.executor. output_dict: Passthrough input dict for tfx.components.Trainer.executor. exec_properties: Passthrough input dict for tfx.components.Trainer.executor. executor_class_path: class path for TFX core default trainer. training_inputs: Training input argument for AI Platform training job. 'pythonModule', 'pythonVersion' and 'runtimeVersion' will be inferred. For the full set of parameters, refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#TrainingInput job_id: Job ID for AI Platform Training job. If not supplied, system-determined unique ID is given. Refer to https://cloud.google.com/ml-engine/reference/rest/v1/projects.jobs#resource-job Returns: None Raises: RuntimeError: if the Google Cloud AI Platform training job failed/cancelled. """ training_inputs = training_inputs.copy() json_inputs = artifact_utils.jsonify_artifact_dict(input_dict) logging.info('json_inputs=\'%s\'.', json_inputs) json_outputs = artifact_utils.jsonify_artifact_dict(output_dict) logging.info('json_outputs=\'%s\'.', json_outputs) json_exec_properties = json.dumps(exec_properties, sort_keys=True) logging.info('json_exec_properties=\'%s\'.', json_exec_properties) # Configure AI Platform training job api_client = discovery.build('ml', 'v1') # We use custom containers to launch training on AI Platform, which invokes # the specified image using the container's entrypoint. The default # entrypoint for TFX containers is to call scripts/run_executor.py. The # arguments below are passed to this run_executor entry to run the executor # specified in `executor_class_path`. job_args = [ '--executor_class_path', executor_class_path, '--inputs', json_inputs, '--outputs', json_outputs, '--exec-properties', json_exec_properties ] if not training_inputs.get('masterConfig'): training_inputs['masterConfig'] = { 'imageUri': _TFX_IMAGE, } training_inputs['args'] = job_args # Pop project_id so AIP doesn't complain about an unexpected parameter. # It's been a stowaway in aip_args and has finally reached its destination. project = training_inputs.pop('project') project_id = 'projects/{}'.format(project) with telemetry_utils.scoped_labels( {telemetry_utils.LABEL_TFX_EXECUTOR: executor_class_path}): job_labels = telemetry_utils.get_labels_dict() # 'tfx_YYYYmmddHHMMSS' is the default job ID if not explicitly specified. 
job_id = job_id or 'tfx_{}'.format( datetime.datetime.now().strftime('%Y%m%d%H%M%S')) job_spec = { 'jobId': job_id, 'trainingInput': training_inputs, 'labels': job_labels, } # Submit job to AIP Training logging.info('Submitting job=\'%s\', project=\'%s\' to AI Platform.', job_id, project) request = api_client.projects().jobs().create( body=job_spec, parent=project_id) request.execute() # Wait for AIP Training job to finish job_name = '{}/jobs/{}'.format(project_id, job_id) request = api_client.projects().jobs().get(name=job_name) response = request.execute() retry_count = 0 # Monitors the long-running operation by polling the job state periodically, # and retries the polling when a transient connectivity issue is encountered. # # Long-running operation monitoring: # The possible states of "get job" response can be found at # https://cloud.google.com/ai-platform/training/docs/reference/rest/v1/projects.jobs#State # where SUCCEEDED/FAILED/CANCELLED are considered to be final states. # The following logic will keep polling the state of the job until the job # enters a final state. # # During the polling, if a connection error was encountered, the GET request # will be retried by recreating the Python API client to refresh the lifecycle # of the connection being used. See # https://github.com/googleapis/google-api-python-client/issues/218 # for a detailed description of the problem. If the error persists for # _CONNECTION_ERROR_RETRY_LIMIT consecutive attempts, the function will exit # with code 1. while response['state'] not in ('SUCCEEDED', 'FAILED', 'CANCELLED'): time.sleep(_POLLING_INTERVAL_IN_SECONDS) try: response = request.execute() retry_count = 0 # Handle transient connection error. except ConnectionError as err: if retry_count < _CONNECTION_ERROR_RETRY_LIMIT: retry_count += 1 logging.warning( 'ConnectionError (%s) encountered when polling job: %s. Trying to ' 'recreate the API client.', err, job_id) # Recreate the Python API client. api_client = discovery.build('ml', 'v1') request = api_client.projects().jobs().get(name=job_name) else: # TODO(b/158433873): Consider raising the error instead of exit with # code 1 after CMLE supports configurable retry policy. # Currently CMLE will automatically retry the job unless return code # 1-128 is returned. logging.error('Request failed after %s retries.', _CONNECTION_ERROR_RETRY_LIMIT) sys.exit(1) if response['state'] in ('FAILED', 'CANCELLED'): err_msg = 'Job \'{}\' did not succeed. Detailed response {}.'.format( job_name, response) logging.error(err_msg) raise RuntimeError(err_msg) # AIP training complete logging.info('Job \'%s\' successful.', job_name)