Example #1
def _create_task_object(name: str,
                        container_image: str,
                        command=None,
                        arguments=None,
                        file_inputs=None,
                        file_outputs=None):
    import kfp.dsl as dsl
    global _dummy_pipeline
    need_dummy = dsl.Pipeline._default_pipeline is None
    if need_dummy:
        if _dummy_pipeline is None:
            _dummy_pipeline = dsl.Pipeline('dummy pipeline')
        _dummy_pipeline.__enter__()

    task = dsl.ContainerOp(
        name=name,
        image=container_image,
        command=command,
        arguments=arguments,
        file_inputs=file_inputs,
        file_outputs=file_outputs,
    )

    if need_dummy:
        _dummy_pipeline.__exit__()

    return task
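A minimal usage sketch for the helper above, assuming the older kfp v1 SDK this snippet was written against (where dsl.ContainerOp still accepts file_inputs); the image, command, and output path are placeholders:

_dummy_pipeline = None  # module-level slot expected by _create_task_object

task = _create_task_object(
    name='echo-task',
    container_image='alpine:3.12',  # placeholder image
    command=['sh', '-c'],
    arguments=['echo hello | tee /tmp/out.txt'],
    file_outputs={'out': '/tmp/out.txt'},
)
print(task.name)  # the returned ContainerOp can be wired up like any other op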
Example #2
  def setUp(self):
    super(BaseComponentTest, self).setUp()
    examples = standard_artifacts.ExternalArtifact()
    example_gen = csv_example_gen_component.CsvExampleGen(
        input_base=channel_utils.as_channel([examples]))
    statistics_gen = statistics_gen_component.StatisticsGen(
        input_data=example_gen.outputs.examples, instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name='test_pipeline',
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
    with dsl.Pipeline('test_pipeline'):
      self.component = base_component.BaseComponent(
          component=statistics_gen,
          depends_on=set(),
          pipeline=pipeline,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
      )
    self.tfx_component = statistics_gen
Example #3
  def setUp(self):
    super(BaseComponentWithPipelineParamTest, self).setUp()

    test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
    example_gen_buckets = data_types.RuntimeParameter(
        name='example-gen-buckets', ptype=int, default=10)

    examples = standard_artifacts.ExternalArtifact()
    example_gen = csv_example_gen_component.CsvExampleGen(
        input=channel_utils.as_channel([examples]),
        output_config={
            'split_config': {
                'splits': [{
                    'name': 'examples',
                    'hash_buckets': example_gen_buckets
                }]
            }
        })
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'], instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name=self._test_pipeline_name,
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
    self._tfx_ir = pipeline_pb2.Pipeline()
    with dsl.Pipeline('test_pipeline'):
      self.example_gen = base_component.BaseComponent(
          component=example_gen,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=test_pipeline_root,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
          tfx_ir=self._tfx_ir)
      self.statistics_gen = base_component.BaseComponent(
          component=statistics_gen,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=test_pipeline_root,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
          tfx_ir=self._tfx_ir
      )

    self.tfx_example_gen = example_gen
    self.tfx_statistics_gen = statistics_gen
Example #4
  def setUp(self):
    super(BaseComponentTest, self).setUp()
    examples = standard_artifacts.ExternalArtifact()
    example_gen = csv_example_gen_component.CsvExampleGen(
        input=channel_utils.as_channel([examples]))
    statistics_gen = statistics_gen_component.StatisticsGen(
        examples=example_gen.outputs['examples'], instance_name='foo')

    pipeline = tfx_pipeline.Pipeline(
        pipeline_name=self._test_pipeline_name,
        pipeline_root='test_pipeline_root',
        metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
        components=[example_gen, statistics_gen],
    )

    test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')

    self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
    self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
    with dsl.Pipeline('test_pipeline'):
      self.component = base_component.BaseComponent(
          component=statistics_gen,
          component_launcher_class=in_process_component_launcher
          .InProcessComponentLauncher,
          depends_on=set(),
          pipeline=pipeline,
          pipeline_name=self._test_pipeline_name,
          pipeline_root=test_pipeline_root,
          tfx_image='container_image',
          kubeflow_metadata_config=self._metadata_config,
          component_config=None,
      )
    self.tfx_component = statistics_gen
Example #5
    def setUp(self):
        super(BaseComponentTest, self).setUp()
        example_gen = csv_example_gen_component.CsvExampleGen(
            input_base='data_input')
        statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples']).with_id('foo')

        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')

        self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
        self._tfx_ir = pipeline_pb2.Pipeline()
        with dsl.Pipeline('test_pipeline'):
            self.component = base_component.BaseComponent(
                component=statistics_gen,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
            )
        self.tfx_component = statistics_gen
Example #6
    def setUp(self):
        super(BaseComponentWithPipelineParamTest, self).setUp()

        test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
        example_gen_output_name = runtime_string_parameter.RuntimeStringParameter(
            name='example-gen-output-name', default='default-to-be-discarded')

        examples = standard_artifacts.ExternalArtifact()
        example_gen = csv_example_gen_component.CsvExampleGen(
            input=channel_utils.as_channel([examples]),
            output_config=example_gen_pb2.Output(
                split_config=example_gen_pb2.SplitConfig(splits=[
                    example_gen_pb2.SplitConfig.Split(
                        name=example_gen_output_name, hash_buckets=10)
                ])))
        statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples'], instance_name='foo')

        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
        with dsl.Pipeline('test_pipeline'):
            self.example_gen = base_component.BaseComponent(
                component=example_gen,
                component_launcher_class=in_process_component_launcher.
                InProcessComponentLauncher,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_name=self._test_pipeline_name,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                component_config=None)
            self.statistics_gen = base_component.BaseComponent(
                component=statistics_gen,
                component_launcher_class=in_process_component_launcher.
                InProcessComponentLauncher,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_name=self._test_pipeline_name,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                component_config=None,
            )

        self.tfx_example_gen = example_gen
        self.tfx_statistics_gen = statistics_gen
Example #7
    def setUp(self):
        output_dict = {
            'output_name': [types.TfxType(type_name='ExamplesPath')]
        }

        with dsl.Pipeline('test_pipeline'):
            self.component = base_component.BaseComponent(
                component_name='TFXComponent',
                input_dict={
                    'input_data': 'input-data-contents',
                    'train_steps': 300,
                    'accuracy_threshold': 0.3,
                },
                output_dict=output_dict,
                exec_properties={'module_file': '/path/to/module.py'},
            )
Example #8
    def setUp(self):
        super().setUp()

        example_gen_output_config = data_types.RuntimeParameter(
            name='example-gen-output-config', ptype=str)

        example_gen = csv_example_gen_component.CsvExampleGen(
            input_base='data_root', output_config=example_gen_output_config)
        statistics_gen = statistics_gen_component.StatisticsGen(
            examples=example_gen.outputs['examples']).with_id('foo')

        test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
        pipeline = tfx_pipeline.Pipeline(
            pipeline_name=self._test_pipeline_name,
            pipeline_root='test_pipeline_root',
            metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
            components=[example_gen, statistics_gen],
        )

        self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
        self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
        self._tfx_ir = pipeline_pb2.Pipeline()
        with dsl.Pipeline('test_pipeline'):
            self.example_gen = base_component.BaseComponent(
                component=example_gen,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
                pod_labels_to_attach={},
                runtime_parameters=[example_gen_output_config])
            self.statistics_gen = base_component.BaseComponent(
                component=statistics_gen,
                depends_on=set(),
                pipeline=pipeline,
                pipeline_root=test_pipeline_root,
                tfx_image='container_image',
                kubeflow_metadata_config=self._metadata_config,
                tfx_ir=self._tfx_ir,
                pod_labels_to_attach={},
                runtime_parameters=[])

        self.tfx_example_gen = example_gen
        self.tfx_statistics_gen = statistics_gen
Example #9
    def setUp(self):
        self._output_dict = {'output_name': [standard_artifacts.Examples()]}
        self._pipeline_properties = base_component.PipelineProperties(
            output_dir='output_dir',
            log_root='log_root',
        )

        with dsl.Pipeline('test_pipeline'):
            self.component = base_component.BaseComponent(
                component_name='TFXComponent',
                input_dict=collections.OrderedDict([
                    ('input_data', 'input-data-contents'),
                    ('train_steps', 300),
                    ('accuracy_threshold', 0.3),
                ]),
                output_dict=self._output_dict,
                exec_properties=collections.OrderedDict([
                    ('module_file', '/path/to/module.py')
                ]),
                executor_class_path='some.executor.Class',
                pipeline_properties=self._pipeline_properties,
            )
Example #10
    def _compile(self, pipeline_func):
        """Compile the given pipeline function into workflow."""

        argspec = inspect.getfullargspec(pipeline_func)
        self._validate_args(argspec)

        registered_pipeline_functions = dsl.Pipeline.get_pipeline_functions()
        if pipeline_func not in registered_pipeline_functions:
            raise ValueError(
                'Please use a function with @dsl.pipeline decorator.')

        pipeline_name, _ = registered_pipeline_functions[pipeline_func]
        pipeline_name = self._sanitize_name(pipeline_name)

        # Create the arg list with no default values and call pipeline function.
        args_list = [
            dsl.PipelineParam(self._sanitize_name(arg_name))
            for arg_name in argspec.args
        ]
        with dsl.Pipeline(pipeline_name) as p:
            pipeline_func(*args_list)

        # Remove when argo supports local exit handler.
        self._validate_exit_handler(p)

        # Fill in the default values.
        args_list_with_defaults = [
            dsl.PipelineParam(self._sanitize_name(arg_name))
            for arg_name in argspec.args
        ]
        if argspec.defaults:
            for arg, default in zip(reversed(args_list_with_defaults),
                                    reversed(argspec.defaults)):
                arg.value = default.value

        workflow = self._create_pipeline_workflow(args_list_with_defaults, p)
        return workflow
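For context, _compile is the internal half of the usual flow: a function decorated with @dsl.pipeline gets registered and is then turned into an Argo workflow. A sketch of the public side, assuming an early kfp SDK of the same vintage as this snippet (where pipeline defaults are given as dsl.PipelineParam values); the image and output file name are illustrative:

import kfp.compiler as compiler
import kfp.dsl as dsl


@dsl.pipeline(name='hello-pipeline', description='Minimal example pipeline.')
def hello_pipeline(message=dsl.PipelineParam(name='message', value='hello')):
    # Each pipeline argument is a dsl.PipelineParam, matching the args_list
    # that _compile builds before calling the function.
    dsl.ContainerOp(
        name='echo',
        image='alpine:3.12',
        command=['sh', '-c'],
        arguments=['echo %s' % message],
    )


# Compiler.compile() drives a _compile-style helper and writes the package.
compiler.Compiler().compile(hello_pipeline, 'hello_pipeline.tar.gz')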
Example #11
    def test_operator_to_template(self):
        """Test converting operator to template"""

        with dsl.Pipeline('somename') as p:
            msg1 = dsl.PipelineParam('msg1')
            msg2 = dsl.PipelineParam('msg2', value='value2')
            op = dsl.ContainerOp(
                name='echo',
                image='image',
                command=['sh', '-c'],
                arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
                file_outputs={'merged': '/tmp/message.txt'})
        golden_output = {
            'container': {
                'image':
                'image',
                'args': [
                    'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
                ],
                'command': ['sh', '-c'],
            },
            'inputs': {
                'parameters': [
                    {
                        'name': 'msg1'
                    },
                    {
                        'name': 'msg2',
                        'value': 'value2'
                    },
                ]
            },
            'name': 'echo',
            'outputs': {
                'parameters': [{
                    'name': 'echo-merged',
                    'valueFrom': {
                        'path': '/tmp/message.txt'
                    }
                }],
                'artifacts': [{
                    'name': 'mlpipeline-ui-metadata',
                    'path': '/mlpipeline-ui-metadata.json',
                    's3': {
                        'accessKeySecret': {
                            'key': 'accesskey',
                            'name': 'mlpipeline-minio-artifact',
                        },
                        'bucket': 'mlpipeline',
                        'endpoint': 'minio-service.kubeflow:9000',
                        'insecure': True,
                        'key':
                        'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
                        'secretKeySecret': {
                            'key': 'secretkey',
                            'name': 'mlpipeline-minio-artifact',
                        }
                    }
                }, {
                    'name': 'mlpipeline-metrics',
                    'path': '/mlpipeline-metrics.json',
                    's3': {
                        'accessKeySecret': {
                            'key': 'accesskey',
                            'name': 'mlpipeline-minio-artifact',
                        },
                        'bucket': 'mlpipeline',
                        'endpoint': 'minio-service.kubeflow:9000',
                        'insecure': True,
                        'key':
                        'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
                        'secretKeySecret': {
                            'key': 'secretkey',
                            'name': 'mlpipeline-minio-artifact',
                        }
                    }
                }]
            }
        }

        self.maxDiff = None
        self.assertEqual(golden_output,
                         compiler.Compiler()._op_to_template(op))
Example #12
  def test_operator_to_template(self):
    """Test converting operator to template"""

    from kubernetes import client as k8s_client

    with dsl.Pipeline('somename') as p:
      msg1 = dsl.PipelineParam('msg1')
      msg2 = dsl.PipelineParam('msg2', value='value2')
      op = dsl.ContainerOp(name='echo', image='image', command=['sh', '-c'],
                           arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
                           file_outputs={'merged': '/tmp/message.txt'}) \
        .add_volume_mount(k8s_client.V1VolumeMount(
          mount_path='/secret/gcp-credentials',
          name='gcp-credentials')) \
        .add_env_variable(k8s_client.V1EnvVar(
          name='GOOGLE_APPLICATION_CREDENTIALS',
          value='/secret/gcp-credentials/user-gcp-sa.json'))
    golden_output = {
      'container': {
        'image': 'image',
        'args': [
          'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
        ],
        'command': ['sh', '-c'],
        'env': [
          {
            'name': 'GOOGLE_APPLICATION_CREDENTIALS',
            'value': '/secret/gcp-credentials/user-gcp-sa.json'
          }
        ],
        'volumeMounts':[
          {
            'mountPath': '/secret/gcp-credentials',
            'name': 'gcp-credentials',
          }
        ]
      },
      'inputs': {'parameters':
        [
          {'name': 'msg1'},
          {'name': 'msg2', 'value': 'value2'},
        ]},
      'name': 'echo',
      'outputs': {
        'parameters': [
          {'name': 'echo-merged',
           'valueFrom': {'path': '/tmp/message.txt'}
          }],
        'artifacts': [{
          'name': 'mlpipeline-ui-metadata',
          'path': '/mlpipeline-ui-metadata.json',
          's3': {
            'accessKeySecret': {
              'key': 'accesskey',
              'name': 'mlpipeline-minio-artifact',
            },
            'bucket': 'mlpipeline',
            'endpoint': 'minio-service.kubeflow:9000',
            'insecure': True,
            'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
            'secretKeySecret': {
              'key': 'secretkey',
              'name': 'mlpipeline-minio-artifact',
            }
          }
        },{
          'name': 'mlpipeline-metrics',
          'path': '/mlpipeline-metrics.json',
          's3': {
            'accessKeySecret': {
              'key': 'accesskey',
              'name': 'mlpipeline-minio-artifact',
            },
            'bucket': 'mlpipeline',
            'endpoint': 'minio-service.kubeflow:9000',
            'insecure': True,
            'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
            'secretKeySecret': {
              'key': 'secretkey',
              'name': 'mlpipeline-minio-artifact',
            }
          }
        }]
      }
    }

    self.maxDiff = None
    self.assertEqual(golden_output, compiler.Compiler()._op_to_template(op))
Example #13
    def test_operator_to_template(self):
        """Test converting operator to template"""

        from kubernetes import client as k8s_client

        with dsl.Pipeline('somename') as p:
            msg1 = dsl.PipelineParam('msg1')
            msg2 = dsl.PipelineParam('msg2', value='value2')
            json = dsl.PipelineParam('json')
            kind = dsl.PipelineParam('kind')
            op = dsl.ContainerOp(name='echo', image='image', command=['sh', '-c'],
                                 arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
                                 file_outputs={'merged': '/tmp/message.txt'}) \
              .add_volume_mount(k8s_client.V1VolumeMount(
                mount_path='/secret/gcp-credentials',
                name='gcp-credentials')) \
              .add_env_variable(k8s_client.V1EnvVar(
                name='GOOGLE_APPLICATION_CREDENTIALS',
                value='/secret/gcp-credentials/user-gcp-sa.json'))
            res = dsl.ResourceOp(
                name="test-resource",
                k8s_resource=k8s_client.V1PersistentVolumeClaim(
                    api_version="v1",
                    kind=kind,
                    metadata=k8s_client.V1ObjectMeta(name="resource")),
                attribute_outputs={"out": json})
        golden_output = {
            'container': {
                'image':
                'image',
                'args': [
                    'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
                ],
                'command': ['sh', '-c'],
                'env': [{
                    'name': 'GOOGLE_APPLICATION_CREDENTIALS',
                    'value': '/secret/gcp-credentials/user-gcp-sa.json'
                }],
                'volumeMounts': [{
                    'mountPath': '/secret/gcp-credentials',
                    'name': 'gcp-credentials',
                }]
            },
            'inputs': {
                'parameters': [
                    {
                        'name': 'msg1'
                    },
                    {
                        'name': 'msg2',
                        'value': 'value2'
                    },
                ]
            },
            'name': 'echo',
            'outputs': {
                'parameters': [{
                    'name': 'echo-merged',
                    'valueFrom': {
                        'path': '/tmp/message.txt'
                    }
                }],
                'artifacts': [{
                    'name': 'mlpipeline-ui-metadata',
                    'path': '/mlpipeline-ui-metadata.json',
                    's3': {
                        'accessKeySecret': {
                            'key': 'accesskey',
                            'name': 'mlpipeline-minio-artifact',
                        },
                        'bucket': 'mlpipeline',
                        'endpoint': 'minio-service.kubeflow:9000',
                        'insecure': True,
                        'key':
                        'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
                        'secretKeySecret': {
                            'key': 'secretkey',
                            'name': 'mlpipeline-minio-artifact',
                        }
                    }
                }, {
                    'name': 'mlpipeline-metrics',
                    'path': '/mlpipeline-metrics.json',
                    's3': {
                        'accessKeySecret': {
                            'key': 'accesskey',
                            'name': 'mlpipeline-minio-artifact',
                        },
                        'bucket': 'mlpipeline',
                        'endpoint': 'minio-service.kubeflow:9000',
                        'insecure': True,
                        'key':
                        'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
                        'secretKeySecret': {
                            'key': 'secretkey',
                            'name': 'mlpipeline-minio-artifact',
                        }
                    }
                }]
            }
        }
        res_output = {
            'inputs': {
                'parameters': [{
                    'name': 'json'
                }, {
                    'name': 'kind'
                }]
            },
            'name': 'test-resource',
            'outputs': {
                'parameters': [{
                    'name': 'test-resource-manifest',
                    'valueFrom': {
                        'jsonPath': '{}'
                    }
                }, {
                    'name': 'test-resource-name',
                    'valueFrom': {
                        'jsonPath': '{.metadata.name}'
                    }
                }, {
                    'name': 'test-resource-out',
                    'valueFrom': {
                        'jsonPath': '{{inputs.parameters.json}}'
                    }
                }]
            },
            'resource': {
                'action':
                'create',
                'manifest': ("apiVersion: v1\n"
                             "kind: '{{inputs.parameters.kind}}'\n"
                             "metadata:\n"
                             "  name: resource\n")
            }
        }

        self.maxDiff = None
        self.assertEqual(golden_output,
                         compiler.Compiler()._op_to_template(op))
        self.assertEqual(res_output, compiler.Compiler()._op_to_template(res))
Example #14
  def _create_workflow(self,
      pipeline_func: Callable,
      pipeline_name: Text=None,
      pipeline_description: Text=None,
      params_list: List[dsl.PipelineParam]=None,
      pipeline_conf: dsl.PipelineConf = None,
      ) -> List[Dict[Text, Any]]:  # Tekton change, signature
    """ Internal implementation of create_workflow."""
    params_list = params_list or []
    argspec = inspect.getfullargspec(pipeline_func)

    # Create the arg list with no default values and call pipeline function.
    # Assign type information to the PipelineParam
    pipeline_meta = _extract_pipeline_metadata(pipeline_func)
    pipeline_meta.name = pipeline_name or pipeline_meta.name
    pipeline_meta.description = pipeline_description or pipeline_meta.description
    pipeline_name = sanitize_k8s_name(pipeline_meta.name)

    # Need to first clear the default value of dsl.PipelineParams. Otherwise, it
    # will be resolved immediately in place when being passed to each component.
    default_param_values = {}
    for param in params_list:
      default_param_values[param.name] = param.value
      param.value = None

    # Currently only allow specifying pipeline params at one place.
    if params_list and pipeline_meta.inputs:
      raise ValueError('Either specify pipeline params in the pipeline function, or in "params_list", but not both.')

    args_list = []
    for arg_name in argspec.args:
      arg_type = None
      for input in pipeline_meta.inputs or []:
        if arg_name == input.name:
          arg_type = input.type
          break
      args_list.append(dsl.PipelineParam(sanitize_k8s_name(arg_name, True), param_type=arg_type))

    with dsl.Pipeline(pipeline_name) as dsl_pipeline:
      pipeline_func(*args_list)

    # Configuration passed to the compiler takes precedence. Unfortunately, it is
    # not trivial to detect whether dsl_pipeline.conf was ever modified.
    pipeline_conf = pipeline_conf or dsl_pipeline.conf

    self._validate_exit_handler(dsl_pipeline)
    self._sanitize_and_inject_artifact(dsl_pipeline, pipeline_conf)

    # Fill in the default values.
    args_list_with_defaults = []
    if pipeline_meta.inputs:
      args_list_with_defaults = [dsl.PipelineParam(sanitize_k8s_name(arg_name, True))
                                 for arg_name in argspec.args]
      if argspec.defaults:
        for arg, default in zip(reversed(args_list_with_defaults), reversed(argspec.defaults)):
          arg.value = default.value if isinstance(default, dsl.PipelineParam) else default
    elif params_list:
      # Or, if args are provided by params_list, fill in pipeline_meta.
      for param in params_list:
        param.value = default_param_values[param.name]

      args_list_with_defaults = params_list
      pipeline_meta.inputs = [
        InputSpec(
            name=param.name,
            type=param.param_type,
            default=param.value) for param in params_list]

    op_transformers = [add_pod_env]
    op_transformers.extend(pipeline_conf.op_transformers)

    workflow = self._create_pipeline_workflow(
        args_list_with_defaults,
        dsl_pipeline,
        op_transformers,
        pipeline_conf,
    )

    from ._data_passing_rewriter import fix_big_data_passing
    workflow = fix_big_data_passing(workflow)

    import json
    pipeline = [item for item in workflow if item["kind"] == "Pipeline"][0]  # Tekton change
    pipeline.setdefault('metadata', {}).setdefault('annotations', {})['pipelines.kubeflow.org/pipeline_spec'] = json.dumps(pipeline_meta.to_dict(), sort_keys=True)

    return workflow
Example #15
  def _create_pipeline_v2(
      self,
      pipeline_func: Callable[..., Any],
      pipeline_root: Optional[str] = None,
      pipeline_name: Optional[str] = None,
      pipeline_parameters_override: Optional[Mapping[str, Any]] = None,
  ) -> pipeline_spec_pb2.PipelineJob:
    """Creates a pipeline instance and constructs the pipeline spec from it.

    Args:
      pipeline_func: Pipeline function with @dsl.pipeline decorator.
      pipeline_root: The root of the pipeline outputs. Optional.
      pipeline_name: The name of the pipeline. Optional.
      pipeline_parameters_override: The mapping from parameter names to values.
        Optional.

    Returns:
      A PipelineJob proto representing the compiled pipeline.
    """

    # Create the arg list with no default values and call pipeline function.
    # Assign type information to the PipelineParam
    pipeline_meta = _python_op._extract_component_interface(pipeline_func)
    pipeline_name = pipeline_name or pipeline_meta.name

    pipeline_root = pipeline_root or getattr(pipeline_func, 'output_directory',
                                             None)
    if not pipeline_root:
      warnings.warn('pipeline_root is None or empty. A valid pipeline_root '
                    'must be provided at job submission.')

    args_list = []
    signature = inspect.signature(pipeline_func)
    for arg_name in signature.parameters:
      arg_type = None
      for pipeline_input in pipeline_meta.inputs or []:
        if arg_name == pipeline_input.name:
          arg_type = pipeline_input.type
          break
      args_list.append(
          dsl.PipelineParam(
              sanitize_k8s_name(arg_name, True), param_type=arg_type))

    with dsl.Pipeline(pipeline_name) as dsl_pipeline:
      pipeline_func(*args_list)

    self._sanitize_and_inject_artifact(dsl_pipeline)

    # Fill in the default values.
    args_list_with_defaults = []
    if pipeline_meta.inputs:
      args_list_with_defaults = [
          dsl.PipelineParam(
              sanitize_k8s_name(input_spec.name, True),
              param_type=input_spec.type,
              value=input_spec.default) for input_spec in pipeline_meta.inputs
      ]

    # Making the pipeline group name unique to prevent name clashes with templates
    pipeline_group = dsl_pipeline.groups[0]
    temp_pipeline_group_name = uuid.uuid4().hex
    pipeline_group.name = temp_pipeline_group_name

    pipeline_spec = self._create_pipeline_spec(
        args_list_with_defaults,
        dsl_pipeline,
    )

    pipeline_parameters = {
        param.name: param for param in args_list_with_defaults
    }
    # Update pipeline parameters override if there were any.
    pipeline_parameters_override = pipeline_parameters_override or {}
    for k, v in pipeline_parameters_override.items():
      if k not in pipeline_parameters:
        raise ValueError('Pipeline parameter {} does not match any known '
                         'pipeline argument.'.format(k))
      pipeline_parameters[k].value = v

    runtime_config = compiler_utils.build_runtime_config_spec(
        output_directory=pipeline_root, pipeline_parameters=pipeline_parameters)
    pipeline_job = pipeline_spec_pb2.PipelineJob(runtime_config=runtime_config)
    pipeline_job.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))

    return pipeline_job
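A usage sketch of the public v2 entry points that _create_pipeline_v2 sits behind, assuming a KFP SDK release (roughly 1.6 to 1.8) that ships the kfp.v2 namespace; the bucket path and names are placeholders:

from kfp.v2 import compiler, dsl


@dsl.component
def echo(message: str) -> str:
    # Lightweight Python component; runs in its own container at execution time.
    print(message)
    return message


# pipeline_root set here populates pipeline_func.output_directory, which the
# helper above falls back to when no explicit pipeline_root is passed.
@dsl.pipeline(name='hello-v2-pipeline', pipeline_root='gs://example-bucket/root')
def hello_v2_pipeline(message: str = 'hello'):
    echo(message=message)


compiler.Compiler().compile(
    pipeline_func=hello_v2_pipeline, package_path='hello_v2_pipeline.json')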
Example #16
    def _create_workflow(
        self,
        pipeline_func: Callable,
        pipeline_name: Text = None,
        pipeline_description: Text = None,
        params_list: List[dsl.PipelineParam] = None,
        pipeline_conf: dsl.PipelineConf = None,
    ) -> Dict[Text, Any]:
        """ Internal implementation of create_workflow."""
        params_list = params_list or []
        argspec = inspect.getfullargspec(pipeline_func)

        # Create the arg list with no default values and call pipeline function.
        # Assign type information to the PipelineParam
        pipeline_meta = _extract_pipeline_metadata(pipeline_func)
        pipeline_meta.name = pipeline_name or pipeline_meta.name
        pipeline_meta.description = pipeline_description or pipeline_meta.description
        pipeline_name = sanitize_k8s_name(pipeline_meta.name)

        # Need to first clear the default value of dsl.PipelineParams. Otherwise, it
        # will be resolved immediately in place when being passed to each component.
        default_param_values = {}
        for param in params_list:
            default_param_values[param.name] = param.value
            param.value = None

        # Currently only allow specifying pipeline params at one place.
        if params_list and pipeline_meta.inputs:
            raise ValueError(
                'Either specify pipeline params in the pipeline function, or in "params_list", but not both.'
            )

        args_list = []
        for arg_name in argspec.args:
            arg_type = None
            for input in pipeline_meta.inputs or []:
                if arg_name == input.name:
                    arg_type = input.type
                    break
            args_list.append(
                dsl.PipelineParam(sanitize_k8s_name(arg_name, True),
                                  param_type=arg_type))

        with dsl.Pipeline(pipeline_name) as dsl_pipeline:
            pipeline_func(*args_list)

        # Configuration passed to the compiler is overriding. Unfortunately, it is
        # not trivial to detect whether the dsl_pipeline.conf was ever modified.
        pipeline_conf = pipeline_conf or dsl_pipeline.conf

        self._validate_exit_handler(dsl_pipeline)
        self._sanitize_and_inject_artifact(dsl_pipeline, pipeline_conf)

        # Fill in the default values.
        args_list_with_defaults = []
        if pipeline_meta.inputs:
            args_list_with_defaults = [
                dsl.PipelineParam(sanitize_k8s_name(arg_name, True))
                for arg_name in argspec.args
            ]
            if argspec.defaults:
                for arg, default in zip(reversed(args_list_with_defaults),
                                        reversed(argspec.defaults)):
                    arg.value = default.value if isinstance(
                        default, dsl.PipelineParam) else default
        elif params_list:
            # Or, if args are provided by params_list, fill in pipeline_meta.
            for param in params_list:
                param.value = default_param_values[param.name]

            args_list_with_defaults = params_list
            pipeline_meta.inputs = [
                InputSpec(name=param.name,
                          type=param.param_type,
                          default=param.value) for param in params_list
            ]

        op_transformers = [add_pod_env]

        # # By default adds telemetry instruments. Users can opt out toggling
        # # allow_telemetry.
        # # Also, TFX pipelines will be bypassed for pipeline compiled by tfx>0.21.4.
        # if allow_telemetry:
        #   pod_labels = get_default_telemetry_labels()
        #   op_transformers.append(add_pod_labels(pod_labels))

        op_transformers.extend(pipeline_conf.op_transformers)

        workflow = self._create_pipeline_workflow(
            args_list_with_defaults,
            dsl_pipeline,
            op_transformers,
            pipeline_conf,
        )

        workflow = fix_big_data_passing(workflow)

        workflow.setdefault('metadata', {}).setdefault('annotations', {})['pipelines.kubeflow.org/pipeline_spec'] = \
          json.dumps(pipeline_meta.to_dict(), sort_keys=True)

        # recursively strip empty structures, DANGER: this may remove necessary empty elements ?!
        def remove_empty_elements(obj) -> dict:
            if not isinstance(obj, (dict, list)):
                return obj
            if isinstance(obj, list):
                return [remove_empty_elements(o) for o in obj if o != []]
            return {
                k: remove_empty_elements(v)
                for k, v in obj.items() if v != []
            }

        workflow = remove_empty_elements(workflow)

        return workflow
Example #17
  def _create_workflow(
      self,
      pipeline_func: Callable,
      pipeline_name: Optional[Text] = None,
      pipeline_description: Optional[Text] = None,
      params_list: Optional[List[dsl.PipelineParam]] = None,
      pipeline_conf: Optional[dsl.PipelineConf] = None,
  ) -> Dict[Text, Any]:
    """ Internal implementation of create_workflow."""
    params_list = params_list or []

    # Create the arg list with no default values and call pipeline function.
    # Assign type information to the PipelineParam
    pipeline_meta = _extract_pipeline_metadata(pipeline_func)
    pipeline_meta.name = pipeline_name or pipeline_meta.name
    pipeline_meta.description = pipeline_description or pipeline_meta.description
    pipeline_name = sanitize_k8s_name(pipeline_meta.name)

    # Need to first clear the default value of dsl.PipelineParams. Otherwise, it
    # will be resolved immediately in place when being passed to each component.
    default_param_values = OrderedDict()

    if self._pipeline_root_param:
      params_list.append(self._pipeline_root_param)
    if self._pipeline_name_param:
      params_list.append(self._pipeline_name_param)

    for param in params_list:
      default_param_values[param.name] = param.value
      param.value = None

    args_list = []
    kwargs_dict = dict()
    signature = inspect.signature(pipeline_func)
    for arg_name, arg in signature.parameters.items():
      arg_type = None
      for input in pipeline_meta.inputs or []:
        if arg_name == input.name:
          arg_type = input.type
          break
      param = dsl.PipelineParam(sanitize_k8s_name(arg_name, True), param_type=arg_type)
      if arg.kind == inspect.Parameter.KEYWORD_ONLY:
        kwargs_dict[arg_name] = param
      else:
        args_list.append(param)

    with dsl.Pipeline(pipeline_name) as dsl_pipeline:
      pipeline_func(*args_list, **kwargs_dict)

    # Configuration passed to the compiler takes precedence. Unfortunately, it is
    # not trivial to detect whether dsl_pipeline.conf was ever modified.
    pipeline_conf = pipeline_conf or dsl_pipeline.conf

    self._validate_exit_handler(dsl_pipeline)
    self._sanitize_and_inject_artifact(dsl_pipeline, pipeline_conf)

    # Fill in the default values by merging two param lists.
    args_list_with_defaults = OrderedDict()
    if pipeline_meta.inputs:
      args_list_with_defaults = OrderedDict([
        (sanitize_k8s_name(input_spec.name, True), input_spec.default)
        for input_spec in pipeline_meta.inputs
      ])

    if params_list:
      # Or, if args are provided by params_list, fill in pipeline_meta.
      for k, v in default_param_values.items():
        args_list_with_defaults[k] = v

      pipeline_meta.inputs = pipeline_meta.inputs or []
      for param in params_list:
        pipeline_meta.inputs.append(
            InputSpec(
                name=param.name,
                type=param.param_type,
                default=default_param_values[param.name]))

    op_transformers = [add_pod_env]
    pod_labels = {_SDK_VERSION_LABEL: kfp.__version__, _SDK_ENV_LABEL: _SDK_ENV_DEFAULT}
    op_transformers.append(add_pod_labels(pod_labels))
    op_transformers.extend(pipeline_conf.op_transformers)

    if self._mode == dsl.PipelineExecutionMode.V2_COMPATIBLE:
      # Add self._pipeline_name_param and self._pipeline_root_param to ops inputs
      # if they don't exist already.
      for op in dsl_pipeline.ops.values():
        insert_pipeline_name_param = True
        insert_pipeline_root_param = True
        for param in op.inputs:
          if param.name == self._pipeline_name_param.name:
            insert_pipeline_name_param = False
          elif param.name == self._pipeline_root_param.name:
            insert_pipeline_root_param = False

        if insert_pipeline_name_param:
          op.inputs.append(self._pipeline_name_param)
        if insert_pipeline_root_param:
          op.inputs.append(self._pipeline_root_param)

    workflow = self._create_pipeline_workflow(
        args_list_with_defaults,
        dsl_pipeline,
        op_transformers,
        pipeline_conf,
    )

    from ._data_passing_rewriter import fix_big_data_passing
    workflow = fix_big_data_passing(workflow)

    workflow = _data_passing_rewriter.add_pod_name_passing(
        workflow, str(self._pipeline_root_param or None))

    if pipeline_conf and pipeline_conf.data_passing_method is not None:
      workflow = pipeline_conf.data_passing_method(workflow)

    metadata = workflow.setdefault('metadata', {})
    annotations = metadata.setdefault('annotations', {})
    labels = metadata.setdefault('labels', {})

    annotations[_SDK_VERSION_LABEL] = kfp.__version__
    annotations['pipelines.kubeflow.org/pipeline_compilation_time'] = datetime.datetime.now().isoformat()
    annotations['pipelines.kubeflow.org/pipeline_spec'] = json.dumps(pipeline_meta.to_dict(), sort_keys=True)

    if self._mode == dsl.PipelineExecutionMode.V2_COMPATIBLE:
      annotations['pipelines.kubeflow.org/v2_pipeline'] = "true"
      labels['pipelines.kubeflow.org/v2_pipeline'] = "true"


    # Labels might be logged better than annotations so adding some information here as well
    labels[_SDK_VERSION_LABEL] = kfp.__version__

    return workflow
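The V2_COMPATIBLE branch above only runs when the compiler is constructed in v2-compatible mode. A sketch of how that mode is typically selected, assuming a KFP SDK around 1.7/1.8; the component and file names are illustrative:

from kfp import compiler, components, dsl


def echo(message: str) -> str:
    print(message)
    return message


# v2-compatible mode expects real components rather than raw ContainerOps.
echo_op = components.create_component_from_func(echo)


@dsl.pipeline(name='hello-v2-compat')
def hello_v2_compat(message: str = 'hello'):
    echo_op(message=message)


# Selecting V2_COMPATIBLE adds the pipelines.kubeflow.org/v2_pipeline
# annotation and label emitted by the code above.
compiler.Compiler(
    mode=dsl.PipelineExecutionMode.V2_COMPATIBLE).compile(
        hello_v2_compat, 'hello_v2_compat.yaml')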