Example 1
def _mock_subprocess_call(cmd: Sequence[Optional[Text]],
                          env: Mapping[Text, Text]) -> int:
  """Mocks the subprocess call."""
  assert len(cmd) == 2, 'Unexpected number of commands: {}'.format(cmd)
  del env
  dsl_path = cmd[1]

  if dsl_path.endswith('test_pipeline_bad.py'):
    sys.exit(1)
  if not dsl_path.endswith(
      'test_pipeline_1.py') and not dsl_path.endswith(
          'test_pipeline_2.py'):
    raise ValueError('Unexpected dsl path: {}'.format(dsl_path))

  spec_pb = pipeline_pb2.PipelineSpec(
      pipeline_info=pipeline_pb2.PipelineInfo(name='chicago_taxi_kubeflow'))
  runtime_pb = pipeline_pb2.PipelineJob.RuntimeConfig(
      gcs_output_directory=os.path.join(os.environ['HOME'], 'tfx', 'pipelines',
                                        'chicago_taxi_kubeflow'))
  job_pb = pipeline_pb2.PipelineJob(runtime_config=runtime_pb)
  job_pb.pipeline_spec.update(json_format.MessageToDict(spec_pb))
  io_utils.write_string_file(
      file_name='pipeline.json',
      string_value=json_format.MessageToJson(message=job_pb, sort_keys=True))
  return 0
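A minimal sketch of how this mock might be wired into a test; patching subprocess.call is purely illustrative here (the real tests presumably patch the CLI handler's own _subprocess_call method):

import subprocess
import sys
from unittest import mock

with mock.patch.object(subprocess, 'call', new=_mock_subprocess_call):
  # Any code under test calling subprocess.call(cmd, env=...) now runs
  # the mock instead of spawning a real process.
  returncode = subprocess.call([sys.executable, 'test_pipeline_1.py'], env={})
assert returncode == 0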
Example 2
  def testBuildPipelineWithRuntimeParameter(self):
   my_builder = pipeline_builder.PipelineBuilder(
       tfx_pipeline=test_utils.pipeline_with_runtime_parameter(),
       default_image='gcr.io/my-tfx:latest')
   actual_pipeline_spec = my_builder.build()
   self.assertProtoEquals(
       test_utils.get_proto_from_test_data(
           'expected_pipeline_with_runtime_parameter.pbtxt',
           pipeline_pb2.PipelineSpec()), actual_pipeline_spec)
Example 3
  def testBuildPipelineWithPrimitiveValuePassing(self):
   my_builder = pipeline_builder.PipelineBuilder(
       tfx_pipeline=test_utils.consume_primitive_artifacts_by_value_pipeline(),
       default_image='gcr.io/my-tfx:latest')
   actual_pipeline_spec = my_builder.build()
   self.assertProtoEquals(
       test_utils.get_proto_from_test_data(
           'expected_consume_primitive_artifacts_by_value_pipeline.pbtxt',
           pipeline_pb2.PipelineSpec()), actual_pipeline_spec)
Example 4
  def testBuildTwoStepPipeline(self):
   my_builder = pipeline_builder.PipelineBuilder(
       tfx_pipeline=test_utils.two_step_pipeline(),
       default_image='gcr.io/my-tfx:latest')
   actual_pipeline_spec = my_builder.build()
   self.assertProtoEquals(
       test_utils.get_proto_from_test_data('expected_two_step_pipeline.pbtxt',
                                           pipeline_pb2.PipelineSpec()),
       actual_pipeline_spec)
Example 5
    def test_build_runtime_parameter_spec(self):
        pipeline_params = [
            dsl.PipelineParam(name='input1', param_type='Integer', value=99),
            dsl.PipelineParam(name='input2',
                              param_type='String',
                              value='hello'),
            dsl.PipelineParam(name='input3',
                              param_type='Float',
                              value=3.1415926),
            dsl.PipelineParam(name='input4', param_type=None, value=None),
        ]
        expected_dict = {
            'runtimeParameters': {
                'input1': {
                    'type': 'INT',
                    'defaultValue': {
                        'intValue': '99'
                    }
                },
                'input2': {
                    'type': 'STRING',
                    'defaultValue': {
                        'stringValue': 'hello'
                    }
                },
                'input3': {
                    'type': 'DOUBLE',
                    'defaultValue': {
                        'doubleValue': '3.1415926'
                    }
                },
                'input4': {
                    'type': 'STRING'
                }
            }
        }
        expected_spec = pipeline_spec_pb2.PipelineSpec()
        json_format.ParseDict(expected_dict, expected_spec)

        pipeline_spec = pipeline_spec_pb2.PipelineSpec(
            runtime_parameters=compiler_utils.build_runtime_parameter_spec(
                pipeline_params))
        self.maxDiff = None
        self.assertEqual(expected_spec, pipeline_spec)
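For debugging a mismatch, the built proto can be dumped back to a plain dict with json_format.MessageToDict; a minimal sketch (note that int64 defaults come back as strings while doubles come back as floats, so the result is not byte-identical to the dict fed to ParseDict):

actual_dict = json_format.MessageToDict(pipeline_spec)
# Enum fields are rendered by name, int64 values as strings.
print(actual_dict['runtimeParameters']['input1'])
# {'type': 'INT', 'defaultValue': {'intValue': '99'}}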
Example 6
  def testTwoStepPipelineWithTaskOnlyDependency(self):
    builder = pipeline_builder.PipelineBuilder(
        tfx_pipeline=test_utils.two_step_pipeline_with_task_only_dependency(),
        default_image='unused-image')

    pipeline_spec = builder.build()
    self.assertProtoEquals(
        test_utils.get_proto_from_test_data(
            'expected_two_step_pipeline_with_task_only_dependency.pbtxt',
            pipeline_pb2.PipelineSpec()), pipeline_spec)
Example 7
  def testBuildPipelineWithTwoContainerSpecComponents(self):
    my_builder = pipeline_builder.PipelineBuilder(
        tfx_pipeline=test_utils.pipeline_with_two_container_spec_components(),
        default_image='gcr.io/my-tfx:latest')
    actual_pipeline_spec = my_builder.build()

    self.assertProtoEquals(
        test_utils.get_proto_from_test_data(
            'expected_pipeline_with_two_container_spec_components.pbtxt',
            pipeline_pb2.PipelineSpec()), actual_pipeline_spec)
Example 8
  def testBuildTwoStepPipelineWithCacheEnabled(self):
    pipeline = test_utils.two_step_pipeline()
    pipeline.enable_cache = True

    builder = pipeline_builder.PipelineBuilder(
        tfx_pipeline=pipeline, default_image='gcr.io/my-tfx:latest')
    pipeline_spec = builder.build()
    self.assertProtoEquals(
        test_utils.get_proto_from_test_data(
            'expected_two_step_pipeline_with_cache_enabled.pbtxt',
            pipeline_pb2.PipelineSpec()), pipeline_spec)
Example 9
    def _extract_pipeline_args(self) -> Dict[Text, Any]:
        """Get pipeline args from the DSL by compiling the pipeline.

    Returns:
      Python dictionary with pipeline details extracted from DSL.

    Raises:
      RuntimeError: when the given pipeline arg file location is occupied.
    """
        pipeline_dsl_path = self.flags_dict[labels.PIPELINE_DSL_PATH]

        if os.path.isdir(pipeline_dsl_path):
            sys.exit('Provide a valid dsl file path.')

        # Create an environment for subprocess.
        temp_env = os.environ.copy()

        # We don't need the image name or project ID for extracting pipeline
        # info, so they can be optional.
        runner_env = {
            kubeflow_labels.TFX_IMAGE_ENV:
            self.flags_dict.get(kubeflow_labels.TFX_IMAGE_ENV, ''),
            kubeflow_labels.GCP_PROJECT_ID_ENV:
            self.flags_dict.get(kubeflow_labels.GCP_PROJECT_ID_ENV, ''),
        }

        temp_env.update(runner_env)

        # Run the pipeline DSL. Because RUN_FLAG_ENV is not set, the actual
        # execution won't be triggered; instead the DSL will output a compiled
        # pipeline spec.
        self._subprocess_call(command=[sys.executable, pipeline_dsl_path],
                              env=temp_env)

        # Only import pipeline_spec_pb2 when needed to guard CLI dependency.
        from kfp.pipeline_spec import pipeline_spec_pb2  # pylint: disable=g-import-not-at-top

        # Extract the needed information from compiled pipeline spec.
        job_message = pipeline_spec_pb2.PipelineJob()
        io_utils.parse_json_file(file_name=os.path.join(
            os.getcwd(), _PIPELINE_SPEC_FILE),
                                 message=job_message)

        pipeline_spec_pb = json_format.ParseDict(
            job_message.pipeline_spec, pipeline_spec_pb2.PipelineSpec())

        pipeline_name = pipeline_spec_pb.pipeline_info.name
        pipeline_args = {
            'pipeline_name': pipeline_name,
            'pipeline_root': job_message.runtime_config.gcs_output_directory
        }

        return pipeline_args
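For reference, the io_utils.parse_json_file call above presumably reduces to reading the file and handing the text to json_format.Parse; a minimal sketch without the tfx helper:

# Minimal sketch of the same parse step using only protobuf's json_format
# (assumes _PIPELINE_SPEC_FILE as defined in the surrounding module).
from google.protobuf import json_format
from kfp.pipeline_spec import pipeline_spec_pb2

job_message = pipeline_spec_pb2.PipelineJob()
with open(_PIPELINE_SPEC_FILE) as f:
    json_format.Parse(f.read(), job_message)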
Example 10
  def testPipelineWithExitHandler(self):
    pipeline = test_utils.two_step_pipeline()
    # define exit handler
    exit_handler = test_utils.dummy_exit_handler(
        param1=decorators.FinalStatusStr())

    builder = pipeline_builder.PipelineBuilder(
        tfx_pipeline=pipeline,
        default_image='gcr.io/my-tfx:latest',
        exit_handler=exit_handler)
    pipeline_spec = builder.build()
    self.assertProtoEquals(
        test_utils.get_proto_from_test_data(
            'expected_two_step_pipeline_with_exit_handler.pbtxt',
            pipeline_pb2.PipelineSpec()), pipeline_spec)
Example 11
  def _create_pipeline_spec(
      self,
      args: List[dsl.PipelineParam],
      pipeline: dsl.Pipeline,
  ) -> pipeline_spec_pb2.PipelineSpec:
    """Creates the pipeline spec object.

    Args:
      args: The list of pipeline arguments.
      pipeline: The instantiated pipeline object.

    Returns:
      A PipelineSpec proto representing the compiled pipeline.

    Raises:
      NotImplementedError: If an argument is of an unsupported type.
    """
    compiler_utils.validate_pipeline_name(pipeline.name)

    deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
    pipeline_spec = pipeline_spec_pb2.PipelineSpec()

    pipeline_spec.pipeline_info.name = pipeline.name
    pipeline_spec.sdk_version = 'kfp-{}'.format(kfp.__version__)
    # Schema version 2.0.0 is required for kfp-pipeline-spec>0.1.3.1
    pipeline_spec.schema_version = '2.0.0'

    dsl_component_spec.build_component_inputs_spec(
        component_spec=pipeline_spec.root,
        pipeline_params=args,
        is_root_component=True)

    root_group = pipeline.groups[0]
    opsgroups = self._get_groups(root_group)
    op_name_to_parent_groups = self._get_groups_for_ops(root_group)
    opgroup_name_to_parent_groups = self._get_groups_for_opsgroups(root_group)

    condition_params = self._get_condition_params_for_ops(root_group)
    op_name_to_for_loop_op = self._get_for_loop_ops(root_group)
    inputs, outputs = self._get_inputs_outputs(
        pipeline,
        args,
        root_group,
        op_name_to_parent_groups,
        opgroup_name_to_parent_groups,
        condition_params,
        op_name_to_for_loop_op,
    )
    dependencies = self._get_dependencies(
        pipeline,
        root_group,
        op_name_to_parent_groups,
        opgroup_name_to_parent_groups,
        opsgroups,
        condition_params,
    )

    for opsgroup_name in opsgroups.keys():
      self._group_to_dag_spec(
          opsgroups[opsgroup_name],
          inputs,
          outputs,
          dependencies,
          pipeline_spec,
          deployment_config,
          root_group.name,
      )

    return pipeline_spec
Example 12
  @classmethod
  def setUpClass(cls) -> None:
    pipeline_spec = pipeline_spec_pb2.PipelineSpec()
    pipeline_spec.pipeline_info.name = 'pipeline-name'
    cls.pipeline_spec = pipeline_spec
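Tests in the class can then read the shared spec; a minimal sketch of such a test (the method name is hypothetical):

  def test_pipeline_name(self):
    # The spec built once in setUpClass is shared by all tests in the class.
    self.assertEqual('pipeline-name', self.pipeline_spec.pipeline_info.name)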
Example 13
    def build(self) -> pipeline_pb2.PipelineSpec:
        """Build a pipeline PipelineSpec."""

        _check_name(self._pipeline_info.pipeline_name)

        deployment_config = pipeline_pb2.PipelineDeploymentConfig()
        pipeline_info = pipeline_pb2.PipelineInfo(
            name=self._pipeline_info.pipeline_name)

        tfx_tasks = {}
        component_defs = {}
        # Map from (producer component id, output key) to (new producer component
        # id, output key)
        channel_redirect_map = {}
        with parameter_utils.ParameterContext() as pc:
            for component in self._pipeline.components:
                if self._exit_handler and component.id == compiler_utils.TFX_DAG_NAME:
                    component.with_id(component.id +
                                      _generate_component_name_suffix())
                    logging.warning(
                        '_tfx_dag is a system-reserved name for pipelines '
                        'with an exit handler; added a suffix to your '
                        'component name: %s', component.id)
                # Here the topological order of components is required.
                # If a channel redirection is needed, the redirect mapping is
                # expected to be available because the upstream node (the cause
                # of the redirect) is processed before its downstream consumers.
                built_tasks = step_builder.StepBuilder(
                    node=component,
                    deployment_config=deployment_config,
                    component_defs=component_defs,
                    image=self._default_image,
                    image_cmds=self._default_commands,
                    beam_pipeline_args=self._pipeline.beam_pipeline_args,
                    enable_cache=self._pipeline.enable_cache,
                    pipeline_info=self._pipeline_info,
                    channel_redirect_map=channel_redirect_map).build()
                tfx_tasks.update(built_tasks)

        result = pipeline_pb2.PipelineSpec(pipeline_info=pipeline_info)

        # If an exit handler is defined, put all the TFX tasks under tfx_dag;
        # the exit handler is a separate component triggered by tfx_dag.
        if self._exit_handler:
            for name, task_spec in tfx_tasks.items():
                result.components[compiler_utils.TFX_DAG_NAME].dag.tasks[
                    name].CopyFrom(task_spec)
            # construct root with exit handler
            exit_handler_task = step_builder.StepBuilder(
                node=self._exit_handler,
                deployment_config=deployment_config,
                component_defs=component_defs,
                image=self._default_image,
                image_cmds=self._default_commands,
                beam_pipeline_args=self._pipeline.beam_pipeline_args,
                enable_cache=False,
                pipeline_info=self._pipeline_info,
                channel_redirect_map=channel_redirect_map,
                is_exit_handler=True).build()
            tfx_dag_task = result.root.dag.tasks[compiler_utils.TFX_DAG_NAME]
            tfx_dag_task.component_ref.name = compiler_utils.TFX_DAG_NAME
            tfx_dag_task.task_info.name = compiler_utils.TFX_DAG_NAME
            result.root.dag.tasks[self._exit_handler.id].CopyFrom(
                exit_handler_task[self._exit_handler.id])
        else:
            for name, task_spec in tfx_tasks.items():
                result.root.dag.tasks[name].CopyFrom(task_spec)

        result.deployment_spec.update(
            json_format.MessageToDict(deployment_config))
        for name, component_def in component_defs.items():
            result.components[name].CopyFrom(component_def)

        # Attach runtime parameters to the root's input parameters.
        for param in pc.parameters:
            result.root.input_definitions.parameters[param.name].CopyFrom(
                compiler_utils.build_parameter_type_spec(param))

        return result
Example 14
    def to_pipeline_spec(self) -> pipeline_spec_pb2.PipelineSpec:
        """Creates a pipeline instance and constructs the pipeline spec for a
        single component.

        Args:
            component_spec: The ComponentSpec to convert to PipelineSpec.

        Returns:
            A PipelineSpec proto representing the compiled component.
        """
        # Import here to avoid a circular module dependency.
        from kfp.compiler import pipeline_spec_builder as builder
        from kfp.components import pipeline_task
        from kfp.components import tasks_group
        from kfp.components.types import type_utils

        args_dict = {}
        pipeline_inputs = self.inputs or {}

        for arg_name, input_spec in pipeline_inputs.items():
            arg_type = input_spec.type
            if (not type_utils.is_parameter_type(arg_type) or
                    type_utils.is_task_final_status_type(arg_type)):
                raise TypeError(
                    builder.make_invalid_input_type_error_msg(
                        arg_name, arg_type))
            args_dict[arg_name] = dsl.PipelineParameterChannel(
                name=arg_name, channel_type=arg_type)

        task = pipeline_task.PipelineTask(self, args_dict)

        # instead of constructing a pipeline with pipeline_context.Pipeline,
        # just build the single task group
        group = tasks_group.TasksGroup(
            group_type=tasks_group.TasksGroupType.PIPELINE)
        group.tasks.append(task)

        # Fill in the default values.
        args_list_with_defaults = [
            dsl.PipelineParameterChannel(
                name=input_name,
                channel_type=input_spec.type,
                value=input_spec.default,
            ) for input_name, input_spec in pipeline_inputs.items()
        ]
        group.name = uuid.uuid4().hex

        pipeline_name = self.name
        pipeline_args = args_list_with_defaults
        task_group = group

        builder.validate_pipeline_name(pipeline_name)

        pipeline_spec = pipeline_spec_pb2.PipelineSpec()
        pipeline_spec.pipeline_info.name = pipeline_name
        pipeline_spec.sdk_version = f'kfp-{kfp.__version__}'
        # Schema version 2.1.0 is required for kfp-pipeline-spec>0.1.13
        pipeline_spec.schema_version = '2.1.0'
        pipeline_spec.root.CopyFrom(
            builder.build_component_spec_for_group(
                pipeline_channels=pipeline_args,
                is_root_group=True,
            ))

        deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
        root_group = task_group

        task_name_to_parent_groups, group_name_to_parent_groups = builder.get_parent_groups(
            root_group)

        def get_inputs(task_group: tasks_group.TasksGroup,
                       task_name_to_parent_groups):
            inputs = collections.defaultdict(set)
            if len(task_group.tasks) != 1:
                raise ValueError(
                    f'Error compiling component. Expected one task in task group, got {len(task_group.tasks)}.'
                )
            only_task = task_group.tasks[0]
            if only_task.channel_inputs:
                for group_name in task_name_to_parent_groups[only_task.name]:
                    inputs[group_name].add(
                        (only_task.channel_inputs[-1], None))
            return inputs

        inputs = get_inputs(task_group, task_name_to_parent_groups)

        builder.build_spec_by_group(
            pipeline_spec=pipeline_spec,
            deployment_config=deployment_config,
            group=root_group,
            inputs=inputs,
            dependencies={},  # no dependencies for single-component pipeline
            rootgroup_name=root_group.name,
            task_name_to_parent_groups=task_name_to_parent_groups,
            group_name_to_parent_groups=group_name_to_parent_groups,
            # No for-loop groups for a single-component pipeline.
            name_to_for_loop_group={},
        )

        return pipeline_spec
Example 15
  def _create_pipeline_spec(
      self,
      args: List[dsl.PipelineParam],
      pipeline: dsl.Pipeline,
  ) -> pipeline_spec_pb2.PipelineSpec:
    """Creates the pipeline spec object.

    Args:
      args: The list of pipeline arguments.
      pipeline: The instantiated pipeline object.

    Returns:
      A PipelineSpec proto representing the compiled pipeline.

    Raises:
      NotImplementedError: If an argument is of an unsupported type.
    """
    compiler_utils.validate_pipeline_name(pipeline.name)

    deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
    pipeline_spec = pipeline_spec_pb2.PipelineSpec()

    pipeline_spec.pipeline_info.name = pipeline.name
    pipeline_spec.sdk_version = 'kfp-{}'.format(kfp.__version__)
    # Schema version 2.0.0 is required for kfp-pipeline-spec>0.1.3.1
    pipeline_spec.schema_version = '2.0.0'

    dsl_component_spec.build_component_inputs_spec(
        component_spec=pipeline_spec.root,
        pipeline_params=args,
        is_root_component=True)

    root_group = pipeline.groups[0]
    opsgroups = self._get_groups(root_group)
    op_name_to_parent_groups = self._get_groups_for_ops(root_group)
    opgroup_name_to_parent_groups = self._get_groups_for_opsgroups(root_group)

    condition_params = self._get_condition_params_for_ops(root_group)
    op_name_to_for_loop_op = self._get_for_loop_ops(root_group)
    inputs, outputs = self._get_inputs_outputs(
        pipeline,
        args,
        root_group,
        op_name_to_parent_groups,
        opgroup_name_to_parent_groups,
        condition_params,
        op_name_to_for_loop_op,
    )
    dependencies = self._get_dependencies(
        pipeline,
        root_group,
        op_name_to_parent_groups,
        opgroup_name_to_parent_groups,
        opsgroups,
        condition_params,
    )

    for opsgroup_name in opsgroups.keys():
      self._group_to_dag_spec(
          opsgroups[opsgroup_name],
          inputs,
          outputs,
          dependencies,
          pipeline_spec,
          deployment_config,
          root_group.name,
          op_name_to_parent_groups,
      )

    # Exit Handler
    if pipeline.groups[0].groups:
      first_group = pipeline.groups[0].groups[0]
      if first_group.type == 'exit_handler':
        exit_handler_op = first_group.exit_op

        # Add exit op task spec
        task_name = exit_handler_op.task_spec.task_info.name
        exit_handler_op.task_spec.dependent_tasks.extend(
            pipeline_spec.root.dag.tasks.keys())
        exit_handler_op.task_spec.trigger_policy.strategy = (
            pipeline_spec_pb2.PipelineTaskSpec.TriggerPolicy.TriggerStrategy
            .ALL_UPSTREAM_TASKS_COMPLETED)
        pipeline_spec.root.dag.tasks[task_name].CopyFrom(
            exit_handler_op.task_spec)

        # Add exit op component spec if it does not exist.
        component_name = exit_handler_op.task_spec.component_ref.name
        if component_name not in pipeline_spec.components:
          pipeline_spec.components[component_name].CopyFrom(
              exit_handler_op.component_spec)

        # Add exit op executor spec if it does not exist.
        executor_label = exit_handler_op.component_spec.executor_label
        if executor_label not in deployment_config.executors:
          deployment_config.executors[executor_label].container.CopyFrom(
              exit_handler_op.container_spec)
          pipeline_spec.deployment_spec.update(
              json_format.MessageToDict(deployment_config))

    return pipeline_spec
Example 16
    def _create_pipeline_spec(
        self,
        args: List[dsl.PipelineParam],
        pipeline: dsl.Pipeline,
    ) -> pipeline_spec_pb2.PipelineSpec:
        """Creates the pipeline spec object.

    Args:
      args: The list of pipeline arguments.
      pipeline: The instantiated pipeline object.

    Returns:
      A PipelineSpec proto representing the compiled pipeline.

    Raises:
      NotImplementedError if the argument is of unsupported types.
    """
        compiler_utils.validate_pipeline_name(pipeline.name)

        pipeline_spec = pipeline_spec_pb2.PipelineSpec()

        pipeline_spec.pipeline_info.name = pipeline.name
        pipeline_spec.sdk_version = 'kfp-{}'.format(kfp.__version__)
        # Schema version 2.0.0 is required for kfp-pipeline-spec>0.1.3.1
        pipeline_spec.schema_version = '2.0.0'

        pipeline_spec.root.CopyFrom(
            dsl_component_spec.build_root_spec_from_pipeline_params(args))

        deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()

        for op in pipeline.ops.values():
            task_name = op.task_spec.task_info.name
            component_name = op.task_spec.component_ref.name
            executor_label = op.component_spec.executor_label

            pipeline_spec.root.dag.tasks[task_name].CopyFrom(op.task_spec)
            pipeline_spec.components[component_name].CopyFrom(
                op.component_spec)
            deployment_config.executors[executor_label].container.CopyFrom(
                op.container_spec)

            task = pipeline_spec.root.dag.tasks[task_name]
            # A task may have an explicit dependency on other tasks even though
            # they have no inputs/outputs dependency, e.g. op2.after(op1).
            if op.dependent_names:
                op.dependent_names = [
                    dsl_utils.sanitize_task_name(name)
                    for name in op.dependent_names
                ]
                task.dependent_tasks.extend(op.dependent_names)

            # Check whether we need to insert an importer node.
            for input_name in task.inputs.artifacts:
                if not task.inputs.artifacts[
                        input_name].task_output_artifact.producer_task:
                    type_schema = type_utils.get_input_artifact_type_schema(
                        input_name, op._metadata.inputs)

                    importer_name = importer_node.generate_importer_base_name(
                        dependent_task_name=task_name, input_name=input_name)
                    importer_task_spec = importer_node.build_importer_task_spec(
                        importer_name)
                    importer_comp_spec = importer_node.build_importer_component_spec(
                        importer_base_name=importer_name,
                        input_name=input_name,
                        input_type_schema=type_schema)
                    importer_task_name = importer_task_spec.task_info.name
                    importer_comp_name = importer_task_spec.component_ref.name
                    importer_exec_label = importer_comp_spec.executor_label
                    pipeline_spec.root.dag.tasks[importer_task_name].CopyFrom(
                        importer_task_spec)
                    pipeline_spec.components[importer_comp_name].CopyFrom(
                        importer_comp_spec)

                    task.inputs.artifacts[
                        input_name].task_output_artifact.producer_task = (
                            importer_task_name)
                    task.inputs.artifacts[
                        input_name].task_output_artifact.output_artifact_key = (
                            importer_node.OUTPUT_KEY)

                    # Retrieve the pre-built importer spec
                    importer_spec = op.importer_specs[input_name]
                    deployment_config.executors[
                        importer_exec_label].importer.CopyFrom(importer_spec)

        pipeline_spec.deployment_spec.update(
            json_format.MessageToDict(deployment_config))

        return pipeline_spec
Example 17
    def _create_pipeline_spec(
        self,
        args: List[dsl.PipelineParam],
        pipeline: dsl.Pipeline,
    ) -> pipeline_spec_pb2.PipelineSpec:
        """Creates the pipeline spec object.

    Args:
      args: The list of pipeline arguments.
      pipeline: The instantiated pipeline object.

    Returns:
      A PipelineSpec proto representing the compiled pipeline.

    Raises:
      NotImplementedError if the argument is of unsupported types.
    """
        compiler_utils.validate_pipeline_name(pipeline.name)

        pipeline_spec = pipeline_spec_pb2.PipelineSpec(
            runtime_parameters=compiler_utils.build_runtime_parameter_spec(
                args))

        pipeline_spec.pipeline_info.name = pipeline.name
        pipeline_spec.sdk_version = 'kfp-{}'.format(kfp.__version__)
        pipeline_spec.schema_version = 'v2alpha1'

        deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
        importer_tasks = []

        for op in pipeline.ops.values():
            component_spec = op._metadata
            task = pipeline_spec.tasks.add()
            task.CopyFrom(op.task_spec)
            deployment_config.executors[
                task.executor_label].container.CopyFrom(op.container_spec)

            # A task may have an explicit dependency on other tasks even though
            # they have no inputs/outputs dependency, e.g. op2.after(op1).
            if op.dependent_names:
                task.dependent_tasks.extend(op.dependent_names)

            # Check whether we need to insert an importer node.
            for input_name in task.inputs.artifacts:
                if not task.inputs.artifacts[input_name].producer_task:
                    type_schema = type_utils.get_input_artifact_type_schema(
                        input_name, component_spec.inputs)

                    importer_task = importer_node.build_importer_task_spec(
                        dependent_task=task,
                        input_name=input_name,
                        input_type_schema=type_schema)
                    importer_tasks.append(importer_task)

                    task.inputs.artifacts[
                        input_name].producer_task = importer_task.task_info.name
                    task.inputs.artifacts[
                        input_name].output_artifact_key = importer_node.OUTPUT_KEY

                    # Retrieve the pre-built importer spec
                    importer_spec = op.importer_spec[input_name]
                    deployment_config.executors[
                        importer_task.executor_label].importer.CopyFrom(
                            importer_spec)

        pipeline_spec.deployment_config.Pack(deployment_config)
        pipeline_spec.tasks.extend(importer_tasks)

        return pipeline_spec
Example 18
    def _create_pipeline_spec(
        self,
        pipeline_args: List[dsl.PipelineChannel],
        pipeline: pipeline_context.Pipeline,
    ) -> pipeline_spec_pb2.PipelineSpec:
        """Creates a pipeline spec object.

        Args:
            pipeline_args: The list of pipeline input parameters.
            pipeline: The instantiated pipeline object.

        Returns:
            A PipelineSpec proto representing the compiled pipeline.

        Raises:
            ValueError: If an argument is of an unsupported type.
        """
        builder.validate_pipeline_name(pipeline.name)

        deployment_config = pipeline_spec_pb2.PipelineDeploymentConfig()
        pipeline_spec = pipeline_spec_pb2.PipelineSpec()

        pipeline_spec.pipeline_info.name = pipeline.name
        pipeline_spec.sdk_version = f'kfp-{kfp.__version__}'
        # Schema version 2.1.0 is required for kfp-pipeline-spec>0.1.13
        pipeline_spec.schema_version = '2.1.0'

        pipeline_spec.root.CopyFrom(
            builder.build_component_spec_for_group(
                pipeline_channels=pipeline_args,
                is_root_group=True,
            ))

        root_group = pipeline.groups[0]

        all_groups = self._get_all_groups(root_group)
        group_name_to_group = {group.name: group for group in all_groups}
        task_name_to_parent_groups, group_name_to_parent_groups = (
            builder.get_parent_groups(root_group))
        condition_channels = self._get_condition_channels_for_tasks(root_group)
        name_to_for_loop_group = {
            group_name: group
            for group_name, group in group_name_to_group.items()
            if isinstance(group, dsl.ParallelFor)
        }
        inputs = self._get_inputs_for_all_groups(
            pipeline=pipeline,
            pipeline_args=pipeline_args,
            root_group=root_group,
            task_name_to_parent_groups=task_name_to_parent_groups,
            group_name_to_parent_groups=group_name_to_parent_groups,
            condition_channels=condition_channels,
            name_to_for_loop_group=name_to_for_loop_group,
        )
        dependencies = self._get_dependencies(
            pipeline=pipeline,
            root_group=root_group,
            task_name_to_parent_groups=task_name_to_parent_groups,
            group_name_to_parent_groups=group_name_to_parent_groups,
            group_name_to_group=group_name_to_group,
            condition_channels=condition_channels,
        )

        for group in all_groups:
            builder.build_spec_by_group(
                pipeline_spec=pipeline_spec,
                deployment_config=deployment_config,
                group=group,
                inputs=inputs,
                dependencies=dependencies,
                rootgroup_name=root_group.name,
                task_name_to_parent_groups=task_name_to_parent_groups,
                group_name_to_parent_groups=group_name_to_parent_groups,
                name_to_for_loop_group=name_to_for_loop_group,
            )

        # TODO: refactor to support multiple exit handlers per pipeline.
        if pipeline.groups[0].groups:
            first_group = pipeline.groups[0].groups[0]
            if isinstance(first_group, dsl.ExitHandler):
                exit_task = first_group.exit_task
                exit_task_name = component_utils.sanitize_task_name(
                    exit_task.name)
                exit_handler_group_task_name = component_utils.sanitize_task_name(
                    first_group.name)
                input_parameters_in_current_dag = [
                    input_name for input_name in
                    pipeline_spec.root.input_definitions.parameters
                ]
                exit_task_task_spec = builder.build_task_spec_for_exit_task(
                    task=exit_task,
                    dependent_task=exit_handler_group_task_name,
                    pipeline_inputs=pipeline_spec.root.input_definitions,
                )

                exit_task_component_spec = builder.build_component_spec_for_exit_task(
                    task=exit_task)

                exit_task_container_spec = builder.build_container_spec_for_task(
                    task=exit_task)

                # Add exit task task spec
                pipeline_spec.root.dag.tasks[exit_task_name].CopyFrom(
                    exit_task_task_spec)

                # Add exit task component spec if it does not exist.
                component_name = exit_task_task_spec.component_ref.name
                if component_name not in pipeline_spec.components:
                    pipeline_spec.components[component_name].CopyFrom(
                        exit_task_component_spec)

                # Add exit task container spec if it does not exist.
                executor_label = exit_task_component_spec.executor_label
                if executor_label not in deployment_config.executors:
                    deployment_config.executors[
                        executor_label].container.CopyFrom(
                            exit_task_container_spec)
                    pipeline_spec.deployment_spec.update(
                        json_format.MessageToDict(deployment_config))

        return pipeline_spec
Example 19
def pipeline_spec_from_file(filepath: str) -> pipeline_spec_pb2.PipelineSpec:
    with open(filepath, 'r') as f:
        dictionary = yaml.safe_load(f)
    return json_format.ParseDict(dictionary, pipeline_spec_pb2.PipelineSpec())
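The inverse direction is symmetric; a minimal sketch of writing a PipelineSpec back to YAML with the same libraries (the function name is hypothetical):

def pipeline_spec_to_file(spec: pipeline_spec_pb2.PipelineSpec,
                          filepath: str) -> None:
    # Convert the proto to a plain dict, then let yaml handle serialization.
    dictionary = json_format.MessageToDict(spec)
    with open(filepath, 'w') as f:
        yaml.safe_dump(dictionary, f)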