def _run_pipeline(
    self,
    pipeline: tfx_pipeline.Pipeline,
    exit_handler: Optional[base_node.BaseNode] = None,
) -> None:
    """Write the pipeline spec to 'pipeline.json', submit it, and poll the job.

    The pipeline name is used as both the job ID and the display name of the
    submitted `PipelineJob`.

    Args:
        pipeline: The TFX pipeline to run.
        exit_handler: Optional node; when truthy it is registered on the
            runner through `set_exit_handler` before the runner is invoked.
    """
    # Register the cleanup callback up front so it fires whether the run
    # below succeeds or fails.
    self.addCleanup(
        self._delete_pipeline_output, pipeline.pipeline_info.pipeline_name)

    dag_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
            default_image=self.container_image),
        output_filename='pipeline.json')
    if exit_handler:
        dag_runner.set_exit_handler(exit_handler)
    dag_runner.run(pipeline, write_out=True)

    job_id = pipeline.pipeline_info.pipeline_name
    pipeline_job = pipeline_jobs.PipelineJob(
        template_path='pipeline.json', job_id=job_id, display_name=job_id)
    # Submit without blocking, wait only until the job resource exists, then
    # hand over to the polling helper.
    pipeline_job.run(sync=False)
    pipeline_job.wait_for_resource_creation()
    vertex_client_utils.poll_job_status(
        job_id, _MAX_JOB_EXECUTION_TIME, _POLLING_INTERVAL_IN_SECONDS)
def run():
    """Build the template pipeline and pass it to a Kubeflow V2 DAG runner.

    The runner is configured with `configs.PIPELINE_IMAGE` as its default
    image; every pipeline parameter comes from `configs` or from this
    module's path constants.
    """
    v2_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        default_image=configs.PIPELINE_IMAGE)

    template_pipeline = pipeline.create_pipeline(
        pipeline_name=configs.PIPELINE_NAME,
        pipeline_root=_PIPELINE_ROOT,
        data_path=_DATA_PATH,
        # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen.
        # query=configs.BIG_QUERY_QUERY,
        preprocessing_fn=configs.PREPROCESSING_FN,
        run_fn=configs.RUN_FN,
        train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS),
        eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS),
        eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
        serving_model_dir=_SERVING_MODEL_DIR,
        # TODO(step 7): (Optional) Uncomment here to provide GCP related
        #               config for BigQuery with Beam DirectRunner.
        # beam_pipeline_args=configs.
        # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
        # TODO(step 8): (Optional) Uncomment below to use Dataflow.
        # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS,
        # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
        # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS,
        # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
        # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS,
    )

    kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=v2_config).run(
        pipeline=template_pipeline)
def main(_):
    """Create the pipeline and run it with a default-configured V2 runner.

    Args:
        _: Ignored positional argument.
    """
    absl.logging.set_verbosity(absl.logging.INFO)

    built_pipeline = _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=_data_root,
    )

    # The runner config is created with no arguments, i.e. library defaults.
    default_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig()
    kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=default_config).run(
        pipeline=built_pipeline)
def _run_pipeline(self, pipeline: tfx_pipeline.Pipeline, job_id: str) -> None:
    """Write the pipeline spec to 'pipeline.json' and create a run from it.

    Args:
        pipeline: The TFX pipeline to run.
        job_id: Job ID forwarded to the client's `create_run_from_job_spec`.
    """
    # Register the cleanup callback before anything else so it fires
    # whether the steps below succeed or fail.
    self.addCleanup(
        self._delete_pipeline_output, pipeline.pipeline_info.pipeline_name)

    runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        default_image=self.container_image)
    dag_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=runner_config, output_filename='pipeline.json')
    dag_runner.run(pipeline, write_out=True)

    self._client.create_run_from_job_spec(
        job_spec_path='pipeline.json', job_id=job_id)
def testCompileFullTaxiPipeline(self, fake_now, fake_sys_version):
    """Compare the full taxi pipeline's runner output with its golden file."""
    # fake_now / fake_sys_version are presumably mocks injected by patch
    # decorators outside this view; pin them to fixed values.
    fake_sys_version.major = 3
    fake_sys_version.minor = 7
    fake_now.return_value = datetime.date(2020, 1, 1)

    taxi_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        display_name='my-pipeline', default_image='tensorflow/tfx:latest')
    taxi_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        output_dir=_TEST_DIR,
        output_filename=_TEST_FILE_NAME,
        config=taxi_config,
    )

    self._compare_against_testdata(
        runner=taxi_runner,
        pipeline=test_utils.full_taxi_pipeline(),
        golden_file='expected_full_taxi_pipeline_job.json',
    )
def _compile_pipeline(pipeline_def, project_id, pipeline_name, pipeline_image,
                      pipeline_spec_path):
    """Run a Kubeflow V2 DAG runner on `pipeline_def`.

    Args:
        pipeline_def: The pipeline object handed to the runner.
        project_id: Passed to the runner config as `project_id`.
        pipeline_name: Passed to the runner config as `display_name`.
        pipeline_image: Passed to the runner config as `default_image`.
        pipeline_spec_path: Passed to the runner as `output_filename`.
    """
    v2_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
            project_id=project_id,
            display_name=pipeline_name,
            default_image=pipeline_image,
        ),
        output_filename=pipeline_spec_path,
    )
    v2_runner.run(pipeline_def)
def testCompileTwoStepPipeline(self, fake_now, fake_sys_version):
    """Compare the two-step pipeline's runner output with its golden file."""
    # fake_now / fake_sys_version are presumably mocks injected by patch
    # decorators outside this view; pin them to fixed values.
    fake_sys_version.major = 3
    fake_sys_version.minor = 7
    fake_now.return_value = datetime.date(2020, 1, 1)

    two_step_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        project_id='my-project',
        display_name='my-pipeline',
        default_image='gcr.io/my-tfx:latest',
    )
    two_step_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        output_dir=_TEST_DIR,
        output_filename=_TEST_FILE_NAME,
        config=two_step_config,
    )

    self._compare_against_testdata(
        runner=two_step_runner,
        pipeline=test_utils.two_step_pipeline(),
        golden_file='expected_two_step_pipeline_job.json',
    )
def testPatcherSavePipelineFn(self):
    """Check `prepare_dir_fn` is called and determines the output file path."""
    pipeline_name = 'dummy'
    pipeline_dir = '/foo/pipeline'
    prepare_dir_mock = mock.MagicMock(return_value=pipeline_dir)

    runner_patcher = kubeflow_v2_dag_runner_patcher.KubeflowV2DagRunnerPatcher(
        call_real_run=False, prepare_dir_fn=prepare_dir_mock)
    v2_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig())
    dummy_pipeline = tfx_pipeline.Pipeline(pipeline_name, 'dummy_root')

    with runner_patcher.patch() as patch_context:
        v2_runner.run(dummy_pipeline)

    # The directory callback receives the pipeline name exactly once, and
    # the recorded output path sits inside the directory it returned.
    prepare_dir_mock.assert_called_once_with(pipeline_name)
    expected_path = os.path.join(
        pipeline_dir, kubeflow_v2_dag_runner_patcher._OUTPUT_FILENAME)
    self.assertEqual(
        patch_context[runner_patcher.OUTPUT_FILE_PATH], expected_path)
def run():
    """Build the template pipeline, then run or compile it.

    The image, GCP project ID, API key and run flag are read from
    environment variables named by constants in `labels`. When the run-flag
    variable holds a truthy value the pipeline is run with the API key;
    otherwise it is only compiled with `write_out=True`.
    """
    # TODO(b/157598477) Find a better way to pass parameters from CLI handler
    # to pipeline DSL file, instead of using environment vars.
    env = os.environ
    tfx_image = env.get(labels.TFX_IMAGE_ENV)
    project_id = env.get(labels.GCP_PROJECT_ID_ENV)
    api_key = env.get(labels.API_KEY_ENV)

    display_name = 'tfx-kubeflow-v2-pipeline-{}'.format(configs.PIPELINE_NAME)
    v2_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        project_id=project_id,
        display_name=display_name,
        default_image=tfx_image,
    )

    template_pipeline = pipeline.create_pipeline(
        pipeline_name=configs.PIPELINE_NAME,
        pipeline_root=_PIPELINE_ROOT,
        data_path=_DATA_PATH,
        # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen.
        # query=configs.BIG_QUERY_QUERY,
        preprocessing_fn=configs.PREPROCESSING_FN,
        run_fn=configs.RUN_FN,
        train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS),
        eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS),
        eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
        serving_model_dir=_SERVING_MODEL_DIR,
        # TODO(step 7): (Optional) Uncomment here to provide GCP related
        #               config for BigQuery with Beam DirectRunner.
        # beam_pipeline_args=configs.
        # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
        # TODO(step 8): (Optional) Uncomment below to use Dataflow.
        # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS,
        # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
        # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS,
        # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
        # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS,
    )

    v2_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=v2_config)

    if not os.environ.get(labels.RUN_FLAG_ENV, False):
        # Not invoked by the 'run' command: compile only.
        v2_runner.compile(pipeline=template_pipeline, write_out=True)
    else:
        # Only trigger the execution when invoked by 'run' command.
        v2_runner.run(pipeline=template_pipeline, api_key=api_key)
def testPatcherBuildImageFn(self):
    """Check `build_image_fn` is called and its result replaces the image."""
    given_image_name = 'foo/bar'
    built_image_name = 'foo/bar@sha256:1234567890'
    build_image_mock = mock.MagicMock(return_value=built_image_name)

    runner_patcher = kubeflow_v2_dag_runner_patcher.KubeflowV2DagRunnerPatcher(
        call_real_run=True, build_image_fn=build_image_mock)
    v2_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        default_image=given_image_name)
    v2_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=v2_config)
    dummy_pipeline = tfx_pipeline.Pipeline('dummy', 'dummy_root')

    with runner_patcher.patch() as patch_context:
        v2_runner.run(dummy_pipeline)

    self.assertIn(runner_patcher.OUTPUT_FILE_PATH, patch_context)
    # The build callback sees the image name that was configured, and the
    # config's default image ends up being the name the callback returned.
    build_image_mock.assert_called_once_with(given_image_name)
    self.assertEqual(v2_config.default_image, built_image_name)
def main():
    """Create the pipeline, then run or compile it based on the environment.

    The image, GCP project ID and API key are read from environment
    variables named by constants in `labels`. When the run-flag variable
    holds a truthy value the pipeline is run with the API key; otherwise it
    is only compiled with `write_out=True`.
    """
    absl.logging.set_verbosity(absl.logging.INFO)

    env = os.environ
    tfx_image = env.get(labels.TFX_IMAGE_ENV)
    project_id = env.get(labels.GCP_PROJECT_ID_ENV)
    api_key = env.get(labels.API_KEY_ENV)

    built_pipeline = _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=_data_root,
    )
    v2_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
            project_id=project_id, default_image=tfx_image))

    if not os.environ.get(labels.RUN_FLAG_ENV, False):
        # Run flag absent or empty: compile only.
        v2_runner.compile(pipeline=built_pipeline, write_out=True)
    else:
        v2_runner.run(pipeline=built_pipeline, api_key=api_key)
def main():
    """Deliberately broken entry point: the runner is built without a config.

    A runner config is constructed but never handed to the runner, so (per
    the original author's note below) this pipeline is expected not to
    compile. Do not "fix" the omission.
    """
    absl.logging.set_verbosity(absl.logging.INFO)

    env = os.environ
    tfx_image = env.get(labels.TFX_IMAGE_ENV)
    project_id = env.get(labels.GCP_PROJECT_ID_ENV)
    api_key = env.get(labels.API_KEY_ENV)

    built_pipeline = _create_pipeline(
        pipeline_name=_pipeline_name,
        pipeline_root=_pipeline_root,
        data_root=_data_root,
    )

    runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(  # pylint: disable=unused-variable
        project_id=project_id, default_image=tfx_image)

    # runner_config is intentionally missed here so this pipeline won't
    # compile.
    broken_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner()

    if not os.environ.get(labels.RUN_FLAG_ENV, False):
        broken_runner.compile(pipeline=built_pipeline, write_out=True)
    else:
        broken_runner.run(pipeline=built_pipeline, api_key=api_key)
push_destination=pusher_pb2.PushDestination( filesystem=pusher_pb2.PushDestination.Filesystem( base_directory=SERVING_MODEL_DIR))) components = [ example_gen, statistics_gen, schema_gen, example_validator, transform, trainer, pusher ] return tfx_pipeline.Pipeline(pipeline_name=pipeline_name, pipeline_root=PIPELINE_ROOT, components=components) # Compile and run the pipeline print('TensorFlow version: {}'.format(tf.__version__)) print('TFX version: {}'.format(__import__('tfx.version').__version__)) #absl.logging.set_verbosity(absl.logging.INFO) tfx_pipeline = create_tfx_pipeline(pipeline_name=PIPELINE_NAME, input_dir=RAW_DATA) client = client.Client(project_id=PROJECT_ID, region=REGION, api_key=API_KEY) config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig( project_id=PROJECT_ID, display_name=PIPELINE_NAME) runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner( config=config, output_filename='pipeline.json') runner.run(tfx_pipeline, write_out=True) client.create_run_from_job_spec('pipeline.json')