Example #1
    def _run_pipeline(
            self,
            pipeline: tfx_pipeline.Pipeline,
            exit_handler: Optional[base_node.BaseNode] = None) -> None:
        """Trigger the pipeline execution with a specific job ID."""
        # Ensure cleanup regardless of whether pipeline succeeds or fails.
        self.addCleanup(self._delete_pipeline_output,
                        pipeline.pipeline_info.pipeline_name)

        config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
            default_image=self.container_image)

        executing_kubeflow_v2_dag_runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
            config=config, output_filename='pipeline.json')
        if exit_handler:
            executing_kubeflow_v2_dag_runner.set_exit_handler(exit_handler)

        _ = executing_kubeflow_v2_dag_runner.run(pipeline, write_out=True)

        job_id = pipeline.pipeline_info.pipeline_name
        job = pipeline_jobs.PipelineJob(
            template_path='pipeline.json',
            job_id=job_id,
            display_name=pipeline.pipeline_info.pipeline_name)
        job.run(sync=False)
        job.wait_for_resource_creation()

        vertex_client_utils.poll_job_status(job_id, _MAX_JOB_EXECUTION_TIME,
                                            _POLLING_INTERVAL_IN_SECONDS)
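The snippet above relies on imports, the test's `container_image` attribute, and two polling constants defined elsewhere in the original test module. A minimal sketch of those pieces follows; the module paths are the usual TFX/Vertex locations, and the constant values are purely illustrative.

# Assumed imports/constants for Example #1 (illustrative only; the real
# definitions live elsewhere in the original test module).
from typing import Optional

from google.cloud.aiplatform import pipeline_jobs
from tfx.dsl.components.base import base_node
from tfx.orchestration import pipeline as tfx_pipeline
from tfx.orchestration.kubeflow.v2 import kubeflow_v2_dag_runner

# vertex_client_utils (used for polling) is a test helper that is not shown here.
_MAX_JOB_EXECUTION_TIME = 60 * 60      # hypothetical timeout, in seconds
_POLLING_INTERVAL_IN_SECONDS = 60      # hypothetical polling interval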
Example #2
def run():
    """Define a pipeline to be executed using Kubeflow V2 runner."""

    runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        default_image=configs.PIPELINE_IMAGE)

    dsl_pipeline = pipeline.create_pipeline(
        pipeline_name=configs.PIPELINE_NAME,
        pipeline_root=_PIPELINE_ROOT,
        data_path=_DATA_PATH,
        # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen.
        # query=configs.BIG_QUERY_QUERY,
        preprocessing_fn=configs.PREPROCESSING_FN,
        run_fn=configs.RUN_FN,
        train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS),
        eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS),
        eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
        serving_model_dir=_SERVING_MODEL_DIR,
        # TODO(step 7): (Optional) Uncomment here to provide GCP-related
        #               config for BigQuery with Beam DirectRunner.
        # beam_pipeline_args=configs.
        # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
        # TODO(step 8): (Optional) Uncomment below to use Dataflow.
        # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS,
        # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
        # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS,
        # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
        # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS,
    )

    runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=runner_config)

    runner.run(pipeline=dsl_pipeline)
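In current TFX releases `KubeflowV2DagRunner.run()` compiles the pipeline into a job-spec JSON rather than submitting it, so a compiled pipeline like the one above is typically launched separately with the Vertex AI SDK. Below is a minimal sketch of that step, assuming `google-cloud-aiplatform` is installed, the runner wrote its output to `pipeline.json`, and placeholder project/region values.

# Illustrative submission of the compiled spec to Vertex AI Pipelines;
# not part of the original example.
from google.cloud import aiplatform

aiplatform.init(project='my-gcp-project', location='us-central1')  # placeholders
job = aiplatform.PipelineJob(
    display_name=configs.PIPELINE_NAME,
    template_path='pipeline.json')
job.submit()  # or job.run(sync=True) to block until the run finishes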
Example #3
def main(_):
    absl.logging.set_verbosity(absl.logging.INFO)
    dsl_pipeline = _create_pipeline(pipeline_name=_pipeline_name,
                                    pipeline_root=_pipeline_root,
                                    data_root=_data_root)

    runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig()

    runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=runner_config)

    runner.run(pipeline=dsl_pipeline)
Example #4
  def _run_pipeline(self, pipeline: tfx_pipeline.Pipeline, job_id: str) -> None:
    """Trigger the pipeline execution with a specific job ID."""
    # Ensure cleanup regardless of whether pipeline succeeds or fails.
    self.addCleanup(self._delete_pipeline_output,
                    pipeline.pipeline_info.pipeline_name)

    config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        default_image=self.container_image)

    _ = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=config, output_filename='pipeline.json').run(
            pipeline, write_out=True)

    self._client.create_run_from_job_spec(
        job_spec_path='pipeline.json', job_id=job_id)
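Example #4 assumes `self._client` was created in the test fixture and exposes `create_run_from_job_spec`. One client with that interface is the (since-deprecated) `AIPlatformClient` from the KFP SDK; the construction below is a hypothetical sketch, not taken from the test, with placeholder project and region values.

# Hypothetical client setup (kfp < 2.0); the fixture's actual client is not shown.
from kfp.v2.google.client import AIPlatformClient

self._client = AIPlatformClient(
    project_id='my-gcp-project',  # placeholder
    region='us-central1')         # placeholder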
Example #5
    def testCompileFullTaxiPipeline(self, fake_now, fake_sys_version):
        fake_now.return_value = datetime.date(2020, 1, 1)
        fake_sys_version.major = 3
        fake_sys_version.minor = 7
        runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
            output_dir=_TEST_DIR,
            output_filename=_TEST_FILE_NAME,
            config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
                display_name='my-pipeline',
                default_image='tensorflow/tfx:latest'))

        self._compare_against_testdata(
            runner=runner,
            pipeline=test_utils.full_taxi_pipeline(),
            golden_file='expected_full_taxi_pipeline_job.json')
Example #6
def _compile_pipeline(pipeline_def, project_id, pipeline_name, pipeline_image,
                      pipeline_spec_path):
    """Compiles the pipeline."""

    # Create Kubeflow V2 runner
    runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        project_id=project_id,
        display_name=pipeline_name,
        default_image=pipeline_image)

    runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
        config=runner_config, output_filename=pipeline_spec_path)

    # Compile the pipeline
    runner.run(pipeline_def)
Example #7
    def testCompileTwoStepPipeline(self, fake_now, fake_sys_version):
        fake_now.return_value = datetime.date(2020, 1, 1)
        fake_sys_version.major = 3
        fake_sys_version.minor = 7
        runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
            output_dir=_TEST_DIR,
            output_filename=_TEST_FILE_NAME,
            config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
                project_id='my-project',
                display_name='my-pipeline',
                default_image='gcr.io/my-tfx:latest'))

        self._compare_against_testdata(
            runner=runner,
            pipeline=test_utils.two_step_pipeline(),
            golden_file='expected_two_step_pipeline_job.json')
Example #8
    def testPatcherSavePipelineFn(self):
        pipeline_name = 'dummy'
        pipeline_dir = '/foo/pipeline'
        mock_prepare_dir_fn = mock.MagicMock(return_value=pipeline_dir)
        patcher = kubeflow_v2_dag_runner_patcher.KubeflowV2DagRunnerPatcher(
            call_real_run=False, prepare_dir_fn=mock_prepare_dir_fn)
        runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
            config=kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig())
        pipeline = tfx_pipeline.Pipeline(pipeline_name, 'dummy_root')
        with patcher.patch() as context:
            runner.run(pipeline)

        mock_prepare_dir_fn.assert_called_once_with(pipeline_name)
        self.assertEqual(
            context[patcher.OUTPUT_FILE_PATH],
            os.path.join(pipeline_dir,
                         kubeflow_v2_dag_runner_patcher._OUTPUT_FILENAME))
Example #9
def run():
  """Define a pipeline to be executed using Kubeflow V2 runner."""
  # TODO(b/157598477) Find a better way to pass parameters from CLI handler to
  # pipeline DSL file, instead of using environment vars.
  tfx_image = os.environ.get(labels.TFX_IMAGE_ENV)
  project_id = os.environ.get(labels.GCP_PROJECT_ID_ENV)
  api_key = os.environ.get(labels.API_KEY_ENV)

  runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
      project_id=project_id,
      display_name='tfx-kubeflow-v2-pipeline-{}'.format(configs.PIPELINE_NAME),
      default_image=tfx_image)

  dsl_pipeline = pipeline.create_pipeline(
      pipeline_name=configs.PIPELINE_NAME,
      pipeline_root=_PIPELINE_ROOT,
      data_path=_DATA_PATH,
      # TODO(step 7): (Optional) Uncomment here to use BigQueryExampleGen.
      # query=configs.BIG_QUERY_QUERY,
      preprocessing_fn=configs.PREPROCESSING_FN,
      run_fn=configs.RUN_FN,
      train_args=trainer_pb2.TrainArgs(num_steps=configs.TRAIN_NUM_STEPS),
      eval_args=trainer_pb2.EvalArgs(num_steps=configs.EVAL_NUM_STEPS),
      eval_accuracy_threshold=configs.EVAL_ACCURACY_THRESHOLD,
      serving_model_dir=_SERVING_MODEL_DIR,
      # TODO(step 7): (Optional) Uncomment here to provide GCP-related
      #               config for BigQuery with Beam DirectRunner.
      # beam_pipeline_args=configs.
      # BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS,
      # TODO(step 8): (Optional) Uncomment below to use Dataflow.
      # beam_pipeline_args=configs.DATAFLOW_BEAM_PIPELINE_ARGS,
      # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
      # ai_platform_training_args=configs.GCP_AI_PLATFORM_TRAINING_ARGS,
      # TODO(step 9): (Optional) Uncomment below to use Cloud AI Platform.
      # ai_platform_serving_args=configs.GCP_AI_PLATFORM_SERVING_ARGS,
  )

  runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
      config=runner_config)

  if os.environ.get(labels.RUN_FLAG_ENV, False):
    # Only trigger the execution when invoked by 'run' command.
    runner.run(
        pipeline=dsl_pipeline, api_key=api_key)
  else:
    runner.compile(pipeline=dsl_pipeline, write_out=True)
Example #10
    def testPatcherBuildImageFn(self):
        given_image_name = 'foo/bar'
        built_image_name = 'foo/bar@sha256:1234567890'

        mock_build_image_fn = mock.MagicMock(return_value=built_image_name)
        patcher = kubeflow_v2_dag_runner_patcher.KubeflowV2DagRunnerPatcher(
            call_real_run=True, build_image_fn=mock_build_image_fn)
        runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
            default_image=given_image_name)
        runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
            config=runner_config)
        pipeline = tfx_pipeline.Pipeline('dummy', 'dummy_root')
        with patcher.patch() as context:
            runner.run(pipeline)
        self.assertIn(patcher.OUTPUT_FILE_PATH, context)

        mock_build_image_fn.assert_called_once_with(given_image_name)
        self.assertEqual(runner_config.default_image, built_image_name)
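The assertions above pin down the contract of `build_image_fn`: the patcher calls it once with the image name taken from the runner config and writes whatever it returns back to `default_image`. A minimal sketch of such a callable follows; the build/push mechanics and the digest are placeholders.

def build_image_fn(target_image: str) -> str:
    # Build and push `target_image` here (e.g. with docker), then return the
    # name the compiled pipeline should reference, typically pinned by digest.
    pushed_digest = 'sha256:<digest>'  # placeholder
    return f'{target_image}@{pushed_digest}'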
Example #11
def main():
    absl.logging.set_verbosity(absl.logging.INFO)
    tfx_image = os.environ.get(labels.TFX_IMAGE_ENV)
    project_id = os.environ.get(labels.GCP_PROJECT_ID_ENV)
    api_key = os.environ.get(labels.API_KEY_ENV)

    dsl_pipeline = _create_pipeline(pipeline_name=_pipeline_name,
                                    pipeline_root=_pipeline_root,
                                    data_root=_data_root)

    runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
        project_id=project_id, default_image=tfx_image)

    runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(config=runner_config)
    if os.environ.get(labels.RUN_FLAG_ENV, False):
        runner.run(pipeline=dsl_pipeline, api_key=api_key)
    else:
        runner.compile(pipeline=dsl_pipeline, write_out=True)
Example #12
def main():
  absl.logging.set_verbosity(absl.logging.INFO)
  tfx_image = os.environ.get(labels.TFX_IMAGE_ENV)
  project_id = os.environ.get(labels.GCP_PROJECT_ID_ENV)
  api_key = os.environ.get(labels.API_KEY_ENV)

  dsl_pipeline = _create_pipeline(
      pipeline_name=_pipeline_name,
      pipeline_root=_pipeline_root,
      data_root=_data_root)

  runner_config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(  # pylint: disable=unused-variable
      project_id=project_id, default_image=tfx_image)
  # runner_config is intentionally omitted here so this pipeline won't compile.
  runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner()
  if os.environ.get(labels.RUN_FLAG_ENV, False):
    runner.run(pipeline=dsl_pipeline, api_key=api_key)
  else:
    runner.compile(pipeline=dsl_pipeline, write_out=True)
Example #13
                    push_destination=pusher_pb2.PushDestination(
                        filesystem=pusher_pb2.PushDestination.Filesystem(
                            base_directory=SERVING_MODEL_DIR)))

    components = [
        example_gen, statistics_gen, schema_gen, example_validator, transform,
        trainer, pusher
    ]

    return tfx_pipeline.Pipeline(pipeline_name=pipeline_name,
                                 pipeline_root=PIPELINE_ROOT,
                                 components=components)


# Compile and run the pipeline
print('TensorFlow version: {}'.format(tf.__version__))
print('TFX version: {}'.format(__import__('tfx.version').__version__))
#absl.logging.set_verbosity(absl.logging.INFO)

# Use distinct variable names so the `tfx_pipeline` and `client` module aliases
# are not shadowed.
dsl_pipeline = create_tfx_pipeline(pipeline_name=PIPELINE_NAME,
                                   input_dir=RAW_DATA)
pipelines_client = client.Client(project_id=PROJECT_ID, region=REGION, api_key=API_KEY)

config = kubeflow_v2_dag_runner.KubeflowV2DagRunnerConfig(
    project_id=PROJECT_ID, display_name=PIPELINE_NAME)

runner = kubeflow_v2_dag_runner.KubeflowV2DagRunner(
    config=config, output_filename='pipeline.json')

runner.run(dsl_pipeline, write_out=True)
pipelines_client.create_run_from_job_spec('pipeline.json')