Example #1
    def testPipeline(self):
        self._copyTemplate()

        # Uncomment all variables in config.
        self._uncommentMultiLineVariables(
            os.path.join('pipeline', 'configs.py'), [
                'GOOGLE_CLOUD_REGION',
                'BIG_QUERY_WITH_DIRECT_RUNNER_BEAM_PIPELINE_ARGS',
                'BIG_QUERY_QUERY', 'DATAFLOW_BEAM_PIPELINE_ARGS',
                'GCP_AI_PLATFORM_TRAINING_ARGS', 'GCP_AI_PLATFORM_SERVING_ARGS'
            ])

        # Prepare data.
        self._prepare_data()
        self._replaceFileContent('kubeflow_v2_dag_runner.py', [
            ('_DATA_PATH = \'gs://{}/tfx-template/data/\'.'
             'format(configs.GCS_BUCKET_NAME)',
             '_DATA_PATH = \'gs://{{}}/{}/{}\'.format(configs.GCS_BUCKET_NAME)'
             .format(self._DATA_DIRECTORY_NAME, self._pipeline_name)),
        ])

        # Create a pipeline with only one component.
        self._create_pipeline()

        # Extract the compiled pipeline spec.
        kubeflow_v2_pb = pipeline_spec_pb2.PipelineJob()
        io_utils.parse_json_file(file_name=os.path.join(
            os.getcwd(), 'pipeline.json'),
                                 message=kubeflow_v2_pb)
        # There should be one step in the compiled pipeline.
        self.assertLen(kubeflow_v2_pb.pipeline_spec['tasks'], 1)
Example #2
    def _create_pipeline_job(
        self,
        pipeline_spec: pipeline_spec_pb2.PipelineSpec,
        pipeline_root: str,
        pipeline_parameters: Optional[Mapping[str, Any]] = None,
    ) -> pipeline_spec_pb2.PipelineJob:
        """Creates the pipeline job spec object.

    Args:
      pipeline_spec: The pipeline spec object.
      pipeline_root: The root of the pipeline outputs.
      pipeline_parameters: The mapping from parameter names to values. Optional.

    Returns:
      A PipelineJob proto representing the compiled pipeline.
    """
        runtime_config = compiler_utils.build_runtime_config_spec(
            pipeline_root=pipeline_root,
            pipeline_parameters=pipeline_parameters)
        pipeline_job = pipeline_spec_pb2.PipelineJob(
            runtime_config=runtime_config)
        pipeline_job.pipeline_spec.update(
            json_format.MessageToDict(pipeline_spec))

        return pipeline_job
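A minimal usage sketch for the helper above, assuming a compiler instance that owns _create_pipeline_job; the spec contents, GCS path, and parameter values are illustrative placeholders, not taken from the source.

# Hedged sketch: 'compiler' stands in for the owning compiler instance.
spec = pipeline_spec_pb2.PipelineSpec()
spec.pipeline_info.name = 'example-pipeline'  # field usage mirrors Example #3

job = compiler._create_pipeline_job(
    pipeline_spec=spec,
    pipeline_root='gs://my-bucket/pipeline-root',
    pipeline_parameters={'learning_rate': 0.01})

# Both the runtime config and the embedded pipeline spec are now set.
print(json_format.MessageToJson(job, sort_keys=True))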
Example #3
def _mock_subprocess_call(cmd: Sequence[Optional[Text]],
                          env: Mapping[Text, Text]) -> int:
  """Mocks the subprocess call."""
  assert len(cmd) == 2, 'Unexpected number of commands: {}'.format(cmd)
  del env
  dsl_path = cmd[1]

  if dsl_path.endswith('test_pipeline_bad.py'):
    sys.exit(1)
  if not dsl_path.endswith(
      'test_pipeline_1.py') and not dsl_path.endswith(
          'test_pipeline_2.py'):
    raise ValueError('Unexpected dsl path: {}'.format(dsl_path))

  spec_pb = pipeline_pb2.PipelineSpec(
      pipeline_info=pipeline_pb2.PipelineInfo(name='chicago_taxi_kubeflow'))
  runtime_pb = pipeline_pb2.PipelineJob.RuntimeConfig(
      gcs_output_directory=os.path.join(os.environ['HOME'], 'tfx', 'pipelines',
                                        'chicago_taxi_kubeflow'))
  job_pb = pipeline_pb2.PipelineJob(runtime_config=runtime_pb)
  job_pb.pipeline_spec.update(json_format.MessageToDict(spec_pb))
  io_utils.write_string_file(
      file_name='pipeline.json',
      string_value=json_format.MessageToJson(message=job_pb, sort_keys=True))
  return 0
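For reference, a hedged sketch of the mock's contract, grounded only in the branches above; the paths and environment are illustrative.

# Illustrative calls (paths are placeholders):
_mock_subprocess_call(['python', '/tmp/test_pipeline_1.py'], env={})
# -> writes a stub 'chicago_taxi_kubeflow' job to pipeline.json and returns 0.

_mock_subprocess_call(['python', '/tmp/test_pipeline_bad.py'], env={})
# -> exits the process via sys.exit(1).

_mock_subprocess_call(['python', '/tmp/unknown.py'], env={})
# -> raises ValueError for an unexpected DSL path.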
Example #4
    def run(self,
            pipeline: tfx_pipeline.Pipeline,
            parameter_values: Optional[Dict[Text, Any]] = None,
            write_out: Optional[bool] = True) -> Dict[Text, Any]:
        """Compiles a pipeline DSL object into pipeline file.

    Args:
      pipeline: TFX pipeline object.
      parameter_values: mapping from runtime parameter names to its values.
      write_out: set to True to actually write out the file to the place
        designated by output_dir and output_filename. Otherwise return the
        JSON-serialized pipeline job spec.

    Returns:
      Returns the JSON pipeline job spec.

    Raises:
      RuntimeError: if trying to write out to a place occupied by an existing
      file.
    """
        # TODO(b/166343606): Support user-provided labels.
        # TODO(b/169095387): Deprecate .run() method in favor of the unified API
        # client.
        display_name = (self._config.display_name
                        or pipeline.pipeline_info.pipeline_name)
        pipeline_spec = pipeline_builder.PipelineBuilder(
            tfx_pipeline=pipeline,
            default_image=self._config.default_image,
            default_commands=self._config.default_commands).build()
        pipeline_spec.sdk_version = 'tfx-{}'.format(version.__version__)
        pipeline_spec.schema_version = _SCHEMA_VERSION
        runtime_config = pipeline_builder.RuntimeConfigBuilder(
            pipeline_info=pipeline.pipeline_info,
            parameter_values=parameter_values).build()
        with telemetry_utils.scoped_labels(
            {telemetry_utils.LABEL_TFX_RUNNER: 'kubeflow_v2'}):
            result = pipeline_spec_pb2.PipelineJob(
                display_name=display_name
                or pipeline.pipeline_info.pipeline_name,
                labels=telemetry_utils.get_labels_dict(),
                runtime_config=runtime_config)
        result.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))
        pipeline_json_dict = json_format.MessageToDict(result)
        if write_out:
            if fileio.exists(
                    self._output_dir) and not fileio.isdir(self._output_dir):
                raise RuntimeError('Output path: %s points to a file.' %
                                   self._output_dir)
            if not fileio.exists(self._output_dir):
                fileio.makedirs(self._output_dir)

            with fileio.open(
                    os.path.join(self._output_dir, self._output_filename),
                    'wb') as f:
                f.write(json.dumps(pipeline_json_dict, sort_keys=True))

        return pipeline_json_dict
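A hedged usage sketch for run(). The runner and config class names and their constructor arguments are assumptions inferred from the attributes the method reads (self._config, self._output_dir, self._output_filename); my_tfx_pipeline is a placeholder tfx_pipeline.Pipeline.

# Hedged sketch; constructor arguments are assumed, not shown in the snippet.
runner = KubeflowV2DagRunner(
    config=KubeflowV2DagRunnerConfig(
        default_image='gcr.io/my-project/my-tfx-image'),
    output_dir='/tmp/pipeline_output',
    output_filename='pipeline.json')

# Compiles the pipeline, writes <output_dir>/<output_filename>, and returns
# the same JSON-serializable job spec as a dictionary.
job_dict = runner.run(my_tfx_pipeline, write_out=True)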
Example #5
    def _extract_pipeline_args(self) -> Dict[Text, Any]:
        """Get pipeline args from the DSL by compiling the pipeline.

    Returns:
      Python dictionary with pipeline details extracted from the DSL.

    Raises:
      RuntimeError: when the given pipeline arg file location is occupied.
    """
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]

        if os.path.isdir(pipeline_dsl_path):
            sys.exit('Provide a valid dsl file path.')

        # Create an environment for subprocess.
        temp_env = os.environ.copy()

        # We don't need image name and project ID for extracting pipeline info,
        # so they can be optional.
        runner_env = {
            kubeflow_labels.TFX_IMAGE_ENV:
            self.flags_dict.get(kubeflow_labels.TFX_IMAGE_ENV, ''),
            kubeflow_labels.GCP_PROJECT_ID_ENV:
            self.flags_dict.get(kubeflow_labels.GCP_PROJECT_ID_ENV, ''),
        }

        temp_env.update(runner_env)

        # Run the pipeline DSL. Because RUN_FLAG_ENV is not set here, actual
        # execution is not triggered; instead the DSL outputs a compiled
        # pipeline spec.
        self._subprocess_call(command=[sys.executable, pipeline_dsl_path],
                              env=temp_env)

        # Only import pipeline_spec_pb2 when needed to guard CLI dependency.
        from kfp.pipeline_spec import pipeline_spec_pb2  # pylint: disable=g-import-not-at-top

        # Extract the needed information from compiled pipeline spec.
        job_message = pipeline_spec_pb2.PipelineJob()
        io_utils.parse_json_file(file_name=os.path.join(
            os.getcwd(), _PIPELINE_SPEC_FILE),
                                 message=job_message)

        pipeline_spec_pb = json_format.ParseDict(
            job_message.pipeline_spec, pipeline_spec_pb2.PipelineSpec())

        pipeline_name = pipeline_spec_pb.pipeline_info.name
        pipeline_args = {
            'pipeline_name': pipeline_name,
            'pipeline_root': job_message.runtime_config.gcs_output_directory
        }

        return pipeline_args
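For reference, the shape of the mapping this helper returns; the values below simply mirror the mock in Example #3 and are illustrative.

# Illustrative result when the compiled spec comes from Example #3's mock.
pipeline_args = {
    'pipeline_name': 'chicago_taxi_kubeflow',
    'pipeline_root': os.path.join(os.environ['HOME'], 'tfx', 'pipelines',
                                  'chicago_taxi_kubeflow'),
}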
Example #6
  def _create_pipeline_v2(
      self,
      pipeline_func: Callable[..., Any],
      pipeline_root: Optional[str] = None,
      pipeline_name: Optional[str] = None,
      pipeline_parameters_override: Optional[Mapping[str, Any]] = None,
  ) -> pipeline_spec_pb2.PipelineJob:
    """Creates a pipeline instance and constructs the pipeline spec from it.

    Args:
      pipeline_func: Pipeline function with @dsl.pipeline decorator.
      pipeline_root: The root of the pipeline outputs. Optional.
      pipeline_name: The name of the pipeline. Optional.
      pipeline_parameters_override: The mapping from parameter names to values.
        Optional.

    Returns:
      A PipelineJob proto representing the compiled pipeline.
    """

    # Create the arg list with no default values and call pipeline function.
    # Assign type information to the PipelineParam
    pipeline_meta = _python_op._extract_component_interface(pipeline_func)
    pipeline_name = pipeline_name or pipeline_meta.name

    pipeline_root = pipeline_root or getattr(pipeline_func, 'output_directory',
                                             None)
    if not pipeline_root:
      warnings.warn('pipeline_root is None or empty. A valid pipeline_root '
                    'must be provided at job submission.')

    args_list = []
    signature = inspect.signature(pipeline_func)
    for arg_name in signature.parameters:
      arg_type = None
      for pipeline_input in pipeline_meta.inputs or []:
        if arg_name == pipeline_input.name:
          arg_type = pipeline_input.type
          break
      args_list.append(
          dsl.PipelineParam(
              sanitize_k8s_name(arg_name, True), param_type=arg_type))

    with dsl.Pipeline(pipeline_name) as dsl_pipeline:
      pipeline_func(*args_list)

    self._sanitize_and_inject_artifact(dsl_pipeline)

    # Fill in the default values.
    args_list_with_defaults = []
    if pipeline_meta.inputs:
      args_list_with_defaults = [
          dsl.PipelineParam(
              sanitize_k8s_name(input_spec.name, True),
              param_type=input_spec.type,
              value=input_spec.default) for input_spec in pipeline_meta.inputs
      ]

    # Make the pipeline group name unique to prevent name clashes with templates.
    pipeline_group = dsl_pipeline.groups[0]
    temp_pipeline_group_name = uuid.uuid4().hex
    pipeline_group.name = temp_pipeline_group_name

    pipeline_spec = self._create_pipeline_spec(
        args_list_with_defaults,
        dsl_pipeline,
    )

    pipeline_parameters = {
        param.name: param for param in args_list_with_defaults
    }
    # Update pipeline parameters override if there were any.
    pipeline_parameters_override = pipeline_parameters_override or {}
    for k, v in pipeline_parameters_override.items():
      if k not in pipeline_parameters:
        raise ValueError('Pipeline parameter {} does not match any known '
                         'pipeline argument.'.format(k))
      pipeline_parameters[k].value = v

    runtime_config = compiler_utils.build_runtime_config_spec(
        output_directory=pipeline_root, pipeline_parameters=pipeline_parameters)
    pipeline_job = pipeline_spec_pb2.PipelineJob(runtime_config=runtime_config)
    pipeline_job.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))

    return pipeline_job
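A hedged usage sketch for _create_pipeline_v2. The compiler instance and the echo_op component are placeholders; only the call pattern and the @dsl.pipeline decorator requirement come from the code and docstring above.

# Hedged sketch: 'echo_op' is a placeholder component and 'compiler' stands in
# for the compiler instance that owns _create_pipeline_v2.
@dsl.pipeline(name='echo-pipeline')
def echo_pipeline(text: str = 'hello world'):
  echo_op(text)

job = compiler._create_pipeline_v2(
    pipeline_func=echo_pipeline,
    pipeline_root='gs://my-bucket/pipeline-root',
    pipeline_parameters_override={'text': 'hi'})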
Example #7
0
    def _create_pipeline(
        self,
        pipeline_func: Callable[..., Any],
        output_directory: str,
        pipeline_name: Optional[str] = None,
        pipeline_parameters_override: Optional[Mapping[str, Any]] = None,
    ) -> pipeline_spec_pb2.PipelineJob:
        """Creates a pipeline instance and constructs the pipeline spec from it.

    Args:
      pipeline_func: Pipeline function with @dsl.pipeline decorator.
      output_directory: The root of the pipeline outputs.
      pipeline_name: The name of the pipeline. Optional.
      pipeline_parameters_override: The mapping from parameter names to values.
        Optional.

    Returns:
      A PipelineJob proto representing the compiled pipeline.
    """

        # Create the arg list with no default values and call pipeline function.
        # Assign type information to the PipelineParam
        pipeline_meta = _python_op._extract_component_interface(pipeline_func)
        pipeline_name = pipeline_name or pipeline_meta.name

        args_list = []
        signature = inspect.signature(pipeline_func)
        for arg_name in signature.parameters:
            arg_type = None
            for pipeline_input in pipeline_meta.inputs or []:
                if arg_name == pipeline_input.name:
                    arg_type = pipeline_input.type
                    break
            args_list.append(
                dsl.PipelineParam(sanitize_k8s_name(arg_name, True),
                                  param_type=arg_type))

        with dsl.Pipeline(pipeline_name) as dsl_pipeline:
            pipeline_func(*args_list)

        # Fill in the default values.
        args_list_with_defaults = []
        if pipeline_meta.inputs:
            args_list_with_defaults = [
                dsl.PipelineParam(sanitize_k8s_name(input_spec.name, True),
                                  param_type=input_spec.type,
                                  value=input_spec.default)
                for input_spec in pipeline_meta.inputs
            ]

        pipeline_spec = self._create_pipeline_spec(
            args_list_with_defaults,
            dsl_pipeline,
        )

        pipeline_parameters = {
            arg.name: arg.value
            for arg in args_list_with_defaults
        }
        # Update pipeline parameters override if there were any.
        pipeline_parameters.update(pipeline_parameters_override or {})
        runtime_config = compiler_utils.build_runtime_config_spec(
            output_directory=output_directory,
            pipeline_parameters=pipeline_parameters)
        pipeline_job = pipeline_spec_pb2.PipelineJob(
            runtime_config=runtime_config)
        pipeline_job.pipeline_spec.update(
            json_format.MessageToDict(pipeline_spec))

        return pipeline_job
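A hedged call sketch for this variant, reusing the placeholders from the Example #6 sketch. Unlike Example #6, output_directory is required here and parameter overrides are merged onto raw default values rather than PipelineParam objects.

# Hedged sketch; 'compiler' and 'echo_pipeline' are the placeholders used above.
job = compiler._create_pipeline(
    pipeline_func=echo_pipeline,
    output_directory='gs://my-bucket/pipeline-root')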