def _create_task_object(name: str,
                        container_image: str,
                        command=None,
                        arguments=None,
                        file_inputs=None,
                        file_outputs=None):
  import kfp.dsl as dsl
  global _dummy_pipeline

  # Enter a throwaway pipeline context if no pipeline is currently active, so
  # the ContainerOp can be constructed outside a real @dsl.pipeline function.
  need_dummy = dsl.Pipeline._default_pipeline is None
  if need_dummy:
    if _dummy_pipeline is None:
      _dummy_pipeline = dsl.Pipeline('dummy pipeline')
    _dummy_pipeline.__enter__()

  task = dsl.ContainerOp(
      name=name,
      image=container_image,
      command=command,
      arguments=arguments,
      file_inputs=file_inputs,
      file_outputs=file_outputs,
  )

  if need_dummy:
    _dummy_pipeline.__exit__()

  return task
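A minimal usage sketch of the helper above, assuming the KFP SDK is importable; the module-level `_dummy_pipeline = None` initializer and the task name, image, command, and output path are illustrative assumptions, not taken from the source.

# Assumed module-level state required by the helper's `global _dummy_pipeline`.
_dummy_pipeline = None

def _example_usage():
  # Builds a ContainerOp outside of any @dsl.pipeline context; the helper
  # enters a dummy dsl.Pipeline so the construction does not fail.
  task = _create_task_object(
      name='echo-hello',                        # hypothetical task name
      container_image='alpine:3.12',            # hypothetical image
      command=['sh', '-c'],
      arguments=['echo hello > /tmp/out.txt'],
      file_outputs={'out': '/tmp/out.txt'},
  )
  return task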
def setUp(self):
  super(BaseComponentTest, self).setUp()
  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input_base=channel_utils.as_channel([examples]))
  statistics_gen = statistics_gen_component.StatisticsGen(
      input_data=example_gen.outputs.examples, instance_name='foo')
  pipeline = tfx_pipeline.Pipeline(
      pipeline_name='test_pipeline',
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component=statistics_gen,
        depends_on=set(),
        pipeline=pipeline,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
    )
  self.tfx_component = statistics_gen
def setUp(self):
  super(BaseComponentWithPipelineParamTest, self).setUp()

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
  example_gen_buckets = data_types.RuntimeParameter(
      name='example-gen-buckets', ptype=int, default=10)

  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input=channel_utils.as_channel([examples]),
      output_config={
          'split_config': {
              'splits': [{
                  'name': 'examples',
                  'hash_buckets': example_gen_buckets
              }]
          }
      })
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'], instance_name='foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
  self._tfx_ir = pipeline_pb2.Pipeline()

  with dsl.Pipeline('test_pipeline'):
    self.example_gen = base_component.BaseComponent(
        component=example_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
        tfx_ir=self._tfx_ir)
    self.statistics_gen = base_component.BaseComponent(
        component=statistics_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
        tfx_ir=self._tfx_ir)

  self.tfx_example_gen = example_gen
  self.tfx_statistics_gen = statistics_gen
def setUp(self):
  super(BaseComponentTest, self).setUp()
  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input=channel_utils.as_channel([examples]))
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'], instance_name='foo')
  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component=statistics_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
    )
  self.tfx_component = statistics_gen
def setUp(self):
  super(BaseComponentTest, self).setUp()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input_base='data_input')
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples']).with_id('foo')
  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
  self._tfx_ir = pipeline_pb2.Pipeline()

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component=statistics_gen,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        tfx_ir=self._tfx_ir,
    )
  self.tfx_component = statistics_gen
def setUp(self):
  super(BaseComponentWithPipelineParamTest, self).setUp()

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
  example_gen_output_name = runtime_string_parameter.RuntimeStringParameter(
      name='example-gen-output-name', default='default-to-be-discarded')

  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input=channel_utils.as_channel([examples]),
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(
                  name=example_gen_output_name, hash_buckets=10)
          ])))
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'], instance_name='foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'

  with dsl.Pipeline('test_pipeline'):
    self.example_gen = base_component.BaseComponent(
        component=example_gen,
        component_launcher_class=in_process_component_launcher.
        InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None)
    self.statistics_gen = base_component.BaseComponent(
        component=statistics_gen,
        component_launcher_class=in_process_component_launcher.
        InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
    )

  self.tfx_example_gen = example_gen
  self.tfx_statistics_gen = statistics_gen
def setUp(self):
  output_dict = {'output_name': [types.TfxType(type_name='ExamplesPath')]}

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component_name='TFXComponent',
        input_dict={
            'input_data': 'input-data-contents',
            'train_steps': 300,
            'accuracy_threshold': 0.3,
        },
        output_dict=output_dict,
        exec_properties={'module_file': '/path/to/module.py'},
    )
def setUp(self):
  super().setUp()
  example_gen_output_config = data_types.RuntimeParameter(
      name='example-gen-output-config', ptype=str)

  example_gen = csv_example_gen_component.CsvExampleGen(
      input_base='data_root', output_config=example_gen_output_config)
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples']).with_id('foo')

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = 'MYSQL_SERVICE_HOST'
  self._tfx_ir = pipeline_pb2.Pipeline()

  with dsl.Pipeline('test_pipeline'):
    self.example_gen = base_component.BaseComponent(
        component=example_gen,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        tfx_ir=self._tfx_ir,
        pod_labels_to_attach={},
        runtime_parameters=[example_gen_output_config])
    self.statistics_gen = base_component.BaseComponent(
        component=statistics_gen,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        tfx_ir=self._tfx_ir,
        pod_labels_to_attach={},
        runtime_parameters=[])

  self.tfx_example_gen = example_gen
  self.tfx_statistics_gen = statistics_gen
def setUp(self):
  self._output_dict = {'output_name': [standard_artifacts.Examples()]}
  self._pipeline_properties = base_component.PipelineProperties(
      output_dir='output_dir',
      log_root='log_root',
  )

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component_name='TFXComponent',
        input_dict=collections.OrderedDict([
            ('input_data', 'input-data-contents'),
            ('train_steps', 300),
            ('accuracy_threshold', 0.3),
        ]),
        output_dict=self._output_dict,
        exec_properties=collections.OrderedDict([
            ('module_file', '/path/to/module.py')
        ]),
        executor_class_path='some.executor.Class',
        pipeline_properties=self._pipeline_properties,
    )
def _compile(self, pipeline_func):
  """Compile the given pipeline function into workflow."""

  argspec = inspect.getfullargspec(pipeline_func)
  self._validate_args(argspec)

  registered_pipeline_functions = dsl.Pipeline.get_pipeline_functions()
  if pipeline_func not in registered_pipeline_functions:
    raise ValueError(
        'Please use a function with @dsl.pipeline decorator.')

  pipeline_name, _ = dsl.Pipeline.get_pipeline_functions()[pipeline_func]
  pipeline_name = self._sanitize_name(pipeline_name)

  # Create the arg list with no default values and call pipeline function.
  args_list = [
      dsl.PipelineParam(self._sanitize_name(arg_name))
      for arg_name in argspec.args
  ]
  with dsl.Pipeline(pipeline_name) as p:
    pipeline_func(*args_list)

  # Remove when argo supports local exit handler.
  self._validate_exit_handler(p)

  # Fill in the default values.
  args_list_with_defaults = [
      dsl.PipelineParam(self._sanitize_name(arg_name))
      for arg_name in argspec.args
  ]
  if argspec.defaults:
    for arg, default in zip(reversed(args_list_with_defaults),
                            reversed(argspec.defaults)):
      arg.value = default.value

  workflow = self._create_pipeline_workflow(args_list_with_defaults, p)
  return workflow
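For context, a sketch of the kind of decorated function `_compile` expects, under the assumption of the legacy KFP SDK used above (where pipeline defaults are `dsl.PipelineParam` objects and pipeline functions are registered via the `@dsl.pipeline` decorator); the pipeline name, step name, image, and the commented-out invocation are hypothetical.

import kfp.dsl as dsl

@dsl.pipeline(name='echo-pipeline', description='Hypothetical example pipeline.')
def echo_pipeline(message=dsl.PipelineParam(name='message', value='hello')):
  # Single step that echoes the pipeline parameter; the image is an assumption.
  dsl.ContainerOp(
      name='echo',
      image='alpine:3.12',
      command=['sh', '-c'],
      arguments=['echo %s' % message])

# The compiler then resolves the registered function and builds the workflow,
# e.g. (private API, shown only to connect with the method above):
# workflow = compiler.Compiler()._compile(echo_pipeline)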
def test_operator_to_template(self):
  """Test converting operator to template"""

  with dsl.Pipeline('somename') as p:
    msg1 = dsl.PipelineParam('msg1')
    msg2 = dsl.PipelineParam('msg2', value='value2')
    op = dsl.ContainerOp(
        name='echo',
        image='image',
        command=['sh', '-c'],
        arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
        file_outputs={'merged': '/tmp/message.txt'})

  golden_output = {
      'container': {
          'image': 'image',
          'args': [
              'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
          ],
          'command': ['sh', '-c'],
      },
      'inputs': {
          'parameters': [
              {'name': 'msg1'},
              {'name': 'msg2', 'value': 'value2'},
          ]
      },
      'name': 'echo',
      'outputs': {
          'parameters': [{
              'name': 'echo-merged',
              'valueFrom': {'path': '/tmp/message.txt'}
          }],
          'artifacts': [{
              'name': 'mlpipeline-ui-metadata',
              'path': '/mlpipeline-ui-metadata.json',
              's3': {
                  'accessKeySecret': {
                      'key': 'accesskey',
                      'name': 'mlpipeline-minio-artifact',
                  },
                  'bucket': 'mlpipeline',
                  'endpoint': 'minio-service.kubeflow:9000',
                  'insecure': True,
                  'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
                  'secretKeySecret': {
                      'key': 'secretkey',
                      'name': 'mlpipeline-minio-artifact',
                  }
              }
          }, {
              'name': 'mlpipeline-metrics',
              'path': '/mlpipeline-metrics.json',
              's3': {
                  'accessKeySecret': {
                      'key': 'accesskey',
                      'name': 'mlpipeline-minio-artifact',
                  },
                  'bucket': 'mlpipeline',
                  'endpoint': 'minio-service.kubeflow:9000',
                  'insecure': True,
                  'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
                  'secretKeySecret': {
                      'key': 'secretkey',
                      'name': 'mlpipeline-minio-artifact',
                  }
              }
          }]
      }
  }

  self.maxDiff = None
  self.assertEqual(golden_output, compiler.Compiler()._op_to_template(op))
def test_operator_to_template(self):
  """Test converting operator to template"""

  from kubernetes import client as k8s_client
  with dsl.Pipeline('somename') as p:
    msg1 = dsl.PipelineParam('msg1')
    msg2 = dsl.PipelineParam('msg2', value='value2')
    op = dsl.ContainerOp(
        name='echo',
        image='image',
        command=['sh', '-c'],
        arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
        file_outputs={'merged': '/tmp/message.txt'}) \
      .add_volume_mount(k8s_client.V1VolumeMount(
          mount_path='/secret/gcp-credentials',
          name='gcp-credentials')) \
      .add_env_variable(k8s_client.V1EnvVar(
          name='GOOGLE_APPLICATION_CREDENTIALS',
          value='/secret/gcp-credentials/user-gcp-sa.json'))

  golden_output = {
      'container': {
          'image': 'image',
          'args': [
              'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
          ],
          'command': ['sh', '-c'],
          'env': [{
              'name': 'GOOGLE_APPLICATION_CREDENTIALS',
              'value': '/secret/gcp-credentials/user-gcp-sa.json'
          }],
          'volumeMounts': [{
              'mountPath': '/secret/gcp-credentials',
              'name': 'gcp-credentials',
          }]
      },
      'inputs': {
          'parameters': [
              {'name': 'msg1'},
              {'name': 'msg2', 'value': 'value2'},
          ]
      },
      'name': 'echo',
      'outputs': {
          'parameters': [{
              'name': 'echo-merged',
              'valueFrom': {'path': '/tmp/message.txt'}
          }],
          'artifacts': [{
              'name': 'mlpipeline-ui-metadata',
              'path': '/mlpipeline-ui-metadata.json',
              's3': {
                  'accessKeySecret': {
                      'key': 'accesskey',
                      'name': 'mlpipeline-minio-artifact',
                  },
                  'bucket': 'mlpipeline',
                  'endpoint': 'minio-service.kubeflow:9000',
                  'insecure': True,
                  'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
                  'secretKeySecret': {
                      'key': 'secretkey',
                      'name': 'mlpipeline-minio-artifact',
                  }
              }
          }, {
              'name': 'mlpipeline-metrics',
              'path': '/mlpipeline-metrics.json',
              's3': {
                  'accessKeySecret': {
                      'key': 'accesskey',
                      'name': 'mlpipeline-minio-artifact',
                  },
                  'bucket': 'mlpipeline',
                  'endpoint': 'minio-service.kubeflow:9000',
                  'insecure': True,
                  'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
                  'secretKeySecret': {
                      'key': 'secretkey',
                      'name': 'mlpipeline-minio-artifact',
                  }
              }
          }]
      }
  }

  self.maxDiff = None
  self.assertEqual(golden_output, compiler.Compiler()._op_to_template(op))
def test_operator_to_template(self):
  """Test converting operator to template"""

  from kubernetes import client as k8s_client
  with dsl.Pipeline('somename') as p:
    msg1 = dsl.PipelineParam('msg1')
    msg2 = dsl.PipelineParam('msg2', value='value2')
    json = dsl.PipelineParam('json')
    kind = dsl.PipelineParam('kind')
    op = dsl.ContainerOp(
        name='echo',
        image='image',
        command=['sh', '-c'],
        arguments=['echo %s %s | tee /tmp/message.txt' % (msg1, msg2)],
        file_outputs={'merged': '/tmp/message.txt'}) \
      .add_volume_mount(k8s_client.V1VolumeMount(
          mount_path='/secret/gcp-credentials',
          name='gcp-credentials')) \
      .add_env_variable(k8s_client.V1EnvVar(
          name='GOOGLE_APPLICATION_CREDENTIALS',
          value='/secret/gcp-credentials/user-gcp-sa.json'))
    res = dsl.ResourceOp(
        name="test-resource",
        k8s_resource=k8s_client.V1PersistentVolumeClaim(
            api_version="v1",
            kind=kind,
            metadata=k8s_client.V1ObjectMeta(name="resource")),
        attribute_outputs={"out": json})

  golden_output = {
      'container': {
          'image': 'image',
          'args': [
              'echo {{inputs.parameters.msg1}} {{inputs.parameters.msg2}} | tee /tmp/message.txt'
          ],
          'command': ['sh', '-c'],
          'env': [{
              'name': 'GOOGLE_APPLICATION_CREDENTIALS',
              'value': '/secret/gcp-credentials/user-gcp-sa.json'
          }],
          'volumeMounts': [{
              'mountPath': '/secret/gcp-credentials',
              'name': 'gcp-credentials',
          }]
      },
      'inputs': {
          'parameters': [
              {'name': 'msg1'},
              {'name': 'msg2', 'value': 'value2'},
          ]
      },
      'name': 'echo',
      'outputs': {
          'parameters': [{
              'name': 'echo-merged',
              'valueFrom': {'path': '/tmp/message.txt'}
          }],
          'artifacts': [{
              'name': 'mlpipeline-ui-metadata',
              'path': '/mlpipeline-ui-metadata.json',
              's3': {
                  'accessKeySecret': {
                      'key': 'accesskey',
                      'name': 'mlpipeline-minio-artifact',
                  },
                  'bucket': 'mlpipeline',
                  'endpoint': 'minio-service.kubeflow:9000',
                  'insecure': True,
                  'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-ui-metadata.tgz',
                  'secretKeySecret': {
                      'key': 'secretkey',
                      'name': 'mlpipeline-minio-artifact',
                  }
              }
          }, {
              'name': 'mlpipeline-metrics',
              'path': '/mlpipeline-metrics.json',
              's3': {
                  'accessKeySecret': {
                      'key': 'accesskey',
                      'name': 'mlpipeline-minio-artifact',
                  },
                  'bucket': 'mlpipeline',
                  'endpoint': 'minio-service.kubeflow:9000',
                  'insecure': True,
                  'key': 'runs/{{workflow.uid}}/{{pod.name}}/mlpipeline-metrics.tgz',
                  'secretKeySecret': {
                      'key': 'secretkey',
                      'name': 'mlpipeline-minio-artifact',
                  }
              }
          }]
      }
  }
  res_output = {
      'inputs': {
          'parameters': [{'name': 'json'}, {'name': 'kind'}]
      },
      'name': 'test-resource',
      'outputs': {
          'parameters': [{
              'name': 'test-resource-manifest',
              'valueFrom': {'jsonPath': '{}'}
          }, {
              'name': 'test-resource-name',
              'valueFrom': {'jsonPath': '{.metadata.name}'}
          }, {
              'name': 'test-resource-out',
              'valueFrom': {'jsonPath': '{{inputs.parameters.json}}'}
          }]
      },
      'resource': {
          'action': 'create',
          'manifest': ("apiVersion: v1\n"
                       "kind: '{{inputs.parameters.kind}}'\n"
                       "metadata:\n"
                       "  name: resource\n")
      }
  }

  self.maxDiff = None
  self.assertEqual(golden_output, compiler.Compiler()._op_to_template(op))
  self.assertEqual(res_output, compiler.Compiler()._op_to_template(res))
def _create_workflow(self,
                     pipeline_func: Callable,
                     pipeline_name: Text = None,
                     pipeline_description: Text = None,
                     params_list: List[dsl.PipelineParam] = None,
                     pipeline_conf: dsl.PipelineConf = None,
                     ) -> List[Dict[Text, Any]]:  # Tekton change, signature
  """ Internal implementation of create_workflow."""
  params_list = params_list or []
  argspec = inspect.getfullargspec(pipeline_func)

  # Create the arg list with no default values and call pipeline function.
  # Assign type information to the PipelineParam
  pipeline_meta = _extract_pipeline_metadata(pipeline_func)
  pipeline_meta.name = pipeline_name or pipeline_meta.name
  pipeline_meta.description = pipeline_description or pipeline_meta.description
  pipeline_name = sanitize_k8s_name(pipeline_meta.name)

  # Need to first clear the default value of dsl.PipelineParams. Otherwise, it
  # will be resolved immediately in place when being passed to each component.
  default_param_values = {}
  for param in params_list:
    default_param_values[param.name] = param.value
    param.value = None

  # Currently only allow specifying pipeline params at one place.
  if params_list and pipeline_meta.inputs:
    raise ValueError('Either specify pipeline params in the pipeline function, or in "params_list", but not both.')

  args_list = []
  for arg_name in argspec.args:
    arg_type = None
    for input in pipeline_meta.inputs or []:
      if arg_name == input.name:
        arg_type = input.type
        break
    args_list.append(
        dsl.PipelineParam(sanitize_k8s_name(arg_name, True),
                          param_type=arg_type))

  with dsl.Pipeline(pipeline_name) as dsl_pipeline:
    pipeline_func(*args_list)

  # Configuration passed to the compiler is overriding. Unfortunately, it's
  # not trivial to detect whether the dsl_pipeline.conf was ever modified.
  pipeline_conf = pipeline_conf or dsl_pipeline.conf

  self._validate_exit_handler(dsl_pipeline)
  self._sanitize_and_inject_artifact(dsl_pipeline, pipeline_conf)

  # Fill in the default values.
  args_list_with_defaults = []
  if pipeline_meta.inputs:
    args_list_with_defaults = [
        dsl.PipelineParam(sanitize_k8s_name(arg_name, True))
        for arg_name in argspec.args
    ]
    if argspec.defaults:
      for arg, default in zip(reversed(args_list_with_defaults),
                              reversed(argspec.defaults)):
        arg.value = default.value if isinstance(default, dsl.PipelineParam) else default
  elif params_list:
    # Or, if args are provided by params_list, fill in pipeline_meta.
    for param in params_list:
      param.value = default_param_values[param.name]

    args_list_with_defaults = params_list
    pipeline_meta.inputs = [
        InputSpec(
            name=param.name,
            type=param.param_type,
            default=param.value) for param in params_list]

  op_transformers = [add_pod_env]
  op_transformers.extend(pipeline_conf.op_transformers)

  workflow = self._create_pipeline_workflow(
      args_list_with_defaults,
      dsl_pipeline,
      op_transformers,
      pipeline_conf,
  )

  from ._data_passing_rewriter import fix_big_data_passing
  workflow = fix_big_data_passing(workflow)

  import json
  pipeline = [item for item in workflow if item["kind"] == "Pipeline"][0]  # Tekton change
  pipeline.setdefault('metadata', {}).setdefault('annotations', {})['pipelines.kubeflow.org/pipeline_spec'] = \
      json.dumps(pipeline_meta.to_dict(), sort_keys=True)

  return workflow
def _create_pipeline_v2(
    self,
    pipeline_func: Callable[..., Any],
    pipeline_root: Optional[str] = None,
    pipeline_name: Optional[str] = None,
    pipeline_parameters_override: Optional[Mapping[str, Any]] = None,
) -> pipeline_spec_pb2.PipelineJob:
  """Creates a pipeline instance and constructs the pipeline spec from it.

  Args:
    pipeline_func: Pipeline function with @dsl.pipeline decorator.
    pipeline_root: The root of the pipeline outputs. Optional.
    pipeline_name: The name of the pipeline. Optional.
    pipeline_parameters_override: The mapping from parameter names to values.
      Optional.

  Returns:
    A PipelineJob proto representing the compiled pipeline.
  """

  # Create the arg list with no default values and call pipeline function.
  # Assign type information to the PipelineParam
  pipeline_meta = _python_op._extract_component_interface(pipeline_func)
  pipeline_name = pipeline_name or pipeline_meta.name

  pipeline_root = pipeline_root or getattr(pipeline_func, 'output_directory',
                                           None)
  if not pipeline_root:
    warnings.warn('pipeline_root is None or empty. A valid pipeline_root '
                  'must be provided at job submission.')

  args_list = []
  signature = inspect.signature(pipeline_func)
  for arg_name in signature.parameters:
    arg_type = None
    for pipeline_input in pipeline_meta.inputs or []:
      if arg_name == pipeline_input.name:
        arg_type = pipeline_input.type
        break
    args_list.append(
        dsl.PipelineParam(
            sanitize_k8s_name(arg_name, True), param_type=arg_type))

  with dsl.Pipeline(pipeline_name) as dsl_pipeline:
    pipeline_func(*args_list)

  self._sanitize_and_inject_artifact(dsl_pipeline)

  # Fill in the default values.
  args_list_with_defaults = []
  if pipeline_meta.inputs:
    args_list_with_defaults = [
        dsl.PipelineParam(
            sanitize_k8s_name(input_spec.name, True),
            param_type=input_spec.type,
            value=input_spec.default) for input_spec in pipeline_meta.inputs
    ]

  # Making the pipeline group name unique to prevent name clashes with
  # templates.
  pipeline_group = dsl_pipeline.groups[0]
  temp_pipeline_group_name = uuid.uuid4().hex
  pipeline_group.name = temp_pipeline_group_name

  pipeline_spec = self._create_pipeline_spec(
      args_list_with_defaults,
      dsl_pipeline,
  )

  pipeline_parameters = {
      param.name: param for param in args_list_with_defaults
  }
  # Update pipeline parameters override if there were any.
  pipeline_parameters_override = pipeline_parameters_override or {}
  for k, v in pipeline_parameters_override.items():
    if k not in pipeline_parameters:
      raise ValueError('Pipeline parameter {} does not match any known '
                       'pipeline argument.'.format(k))
    pipeline_parameters[k].value = v

  runtime_config = compiler_utils.build_runtime_config_spec(
      output_directory=pipeline_root, pipeline_parameters=pipeline_parameters)
  pipeline_job = pipeline_spec_pb2.PipelineJob(runtime_config=runtime_config)
  pipeline_job.pipeline_spec.update(json_format.MessageToDict(pipeline_spec))

  return pipeline_job
def _create_workflow(
    self,
    pipeline_func: Callable,
    pipeline_name: Text = None,
    pipeline_description: Text = None,
    params_list: List[dsl.PipelineParam] = None,
    pipeline_conf: dsl.PipelineConf = None,
) -> Dict[Text, Any]:
  """ Internal implementation of create_workflow."""
  params_list = params_list or []
  argspec = inspect.getfullargspec(pipeline_func)

  # Create the arg list with no default values and call pipeline function.
  # Assign type information to the PipelineParam
  pipeline_meta = _extract_pipeline_metadata(pipeline_func)
  pipeline_meta.name = pipeline_name or pipeline_meta.name
  pipeline_meta.description = pipeline_description or pipeline_meta.description
  pipeline_name = sanitize_k8s_name(pipeline_meta.name)

  # Need to first clear the default value of dsl.PipelineParams. Otherwise, it
  # will be resolved immediately in place when being passed to each component.
  default_param_values = {}
  for param in params_list:
    default_param_values[param.name] = param.value
    param.value = None

  # Currently only allow specifying pipeline params at one place.
  if params_list and pipeline_meta.inputs:
    raise ValueError(
        'Either specify pipeline params in the pipeline function, or in "params_list", but not both.'
    )

  args_list = []
  for arg_name in argspec.args:
    arg_type = None
    for input in pipeline_meta.inputs or []:
      if arg_name == input.name:
        arg_type = input.type
        break
    args_list.append(
        dsl.PipelineParam(sanitize_k8s_name(arg_name, True),
                          param_type=arg_type))

  with dsl.Pipeline(pipeline_name) as dsl_pipeline:
    pipeline_func(*args_list)

  # Configuration passed to the compiler is overriding. Unfortunately, it is
  # not trivial to detect whether the dsl_pipeline.conf was ever modified.
  pipeline_conf = pipeline_conf or dsl_pipeline.conf

  self._validate_exit_handler(dsl_pipeline)
  self._sanitize_and_inject_artifact(dsl_pipeline, pipeline_conf)

  # Fill in the default values.
  args_list_with_defaults = []
  if pipeline_meta.inputs:
    args_list_with_defaults = [
        dsl.PipelineParam(sanitize_k8s_name(arg_name, True))
        for arg_name in argspec.args
    ]
    if argspec.defaults:
      for arg, default in zip(reversed(args_list_with_defaults),
                              reversed(argspec.defaults)):
        arg.value = default.value if isinstance(
            default, dsl.PipelineParam) else default
  elif params_list:
    # Or, if args are provided by params_list, fill in pipeline_meta.
    for param in params_list:
      param.value = default_param_values[param.name]

    args_list_with_defaults = params_list
    pipeline_meta.inputs = [
        InputSpec(name=param.name,
                  type=param.param_type,
                  default=param.value) for param in params_list
    ]

  op_transformers = [add_pod_env]
  # # By default adds telemetry instruments. Users can opt out toggling
  # # allow_telemetry.
  # # Also, TFX pipelines will be bypassed for pipeline compiled by tfx>0.21.4.
  # if allow_telemetry:
  #   pod_labels = get_default_telemetry_labels()
  #   op_transformers.append(add_pod_labels(pod_labels))
  op_transformers.extend(pipeline_conf.op_transformers)

  workflow = self._create_pipeline_workflow(
      args_list_with_defaults,
      dsl_pipeline,
      op_transformers,
      pipeline_conf,
  )

  workflow = fix_big_data_passing(workflow)

  workflow.setdefault('metadata', {}).setdefault('annotations', {})['pipelines.kubeflow.org/pipeline_spec'] = \
      json.dumps(pipeline_meta.to_dict(), sort_keys=True)

  # Recursively strip empty structures.
  # DANGER: this may remove necessary empty elements ?!
  def remove_empty_elements(obj) -> dict:
    if not isinstance(obj, (dict, list)):
      return obj
    if isinstance(obj, list):
      return [remove_empty_elements(o) for o in obj if o != []]
    return {k: remove_empty_elements(v) for k, v in obj.items() if v != []}

  workflow = remove_empty_elements(workflow)

  return workflow
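A small standalone check of the stripping behavior of `remove_empty_elements`, using made-up data; the helper is repeated here (it is nested inside `_create_workflow` above) so the snippet runs on its own. It illustrates that empty lists are dropped at any nesting depth while other falsy values such as empty strings are kept.

def remove_empty_elements(obj) -> dict:
  # Same logic as the nested helper above, duplicated for a self-contained demo.
  if not isinstance(obj, (dict, list)):
    return obj
  if isinstance(obj, list):
    return [remove_empty_elements(o) for o in obj if o != []]
  return {k: remove_empty_elements(v) for k, v in obj.items() if v != []}

# Hypothetical input: empty lists disappear, the empty string survives.
sample = {
    'metadata': {'labels': []},
    'spec': {'steps': [{'args': []}], 'name': ''},
}
assert remove_empty_elements(sample) == {
    'metadata': {},
    'spec': {'steps': [{}], 'name': ''},
}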
def _create_workflow(
    self,
    pipeline_func: Callable,
    pipeline_name: Optional[Text] = None,
    pipeline_description: Optional[Text] = None,
    params_list: Optional[List[dsl.PipelineParam]] = None,
    pipeline_conf: Optional[dsl.PipelineConf] = None,
) -> Dict[Text, Any]:
  """ Internal implementation of create_workflow."""
  params_list = params_list or []

  # Create the arg list with no default values and call pipeline function.
  # Assign type information to the PipelineParam
  pipeline_meta = _extract_pipeline_metadata(pipeline_func)
  pipeline_meta.name = pipeline_name or pipeline_meta.name
  pipeline_meta.description = pipeline_description or pipeline_meta.description
  pipeline_name = sanitize_k8s_name(pipeline_meta.name)

  # Need to first clear the default value of dsl.PipelineParams. Otherwise, it
  # will be resolved immediately in place when being passed to each component.
  default_param_values = OrderedDict()

  if self._pipeline_root_param:
    params_list.append(self._pipeline_root_param)
  if self._pipeline_name_param:
    params_list.append(self._pipeline_name_param)

  for param in params_list:
    default_param_values[param.name] = param.value
    param.value = None

  args_list = []
  kwargs_dict = dict()
  signature = inspect.signature(pipeline_func)
  for arg_name, arg in signature.parameters.items():
    arg_type = None
    for input in pipeline_meta.inputs or []:
      if arg_name == input.name:
        arg_type = input.type
        break
    param = dsl.PipelineParam(
        sanitize_k8s_name(arg_name, True), param_type=arg_type)
    if arg.kind == inspect.Parameter.KEYWORD_ONLY:
      kwargs_dict[arg_name] = param
    else:
      args_list.append(param)

  with dsl.Pipeline(pipeline_name) as dsl_pipeline:
    pipeline_func(*args_list, **kwargs_dict)

  # Configuration passed to the compiler is overriding. Unfortunately, it's
  # not trivial to detect whether the dsl_pipeline.conf was ever modified.
  pipeline_conf = pipeline_conf or dsl_pipeline.conf

  self._validate_exit_handler(dsl_pipeline)
  self._sanitize_and_inject_artifact(dsl_pipeline, pipeline_conf)

  # Fill in the default values by merging two param lists.
  args_list_with_defaults = OrderedDict()
  if pipeline_meta.inputs:
    args_list_with_defaults = OrderedDict([
        (sanitize_k8s_name(input_spec.name, True), input_spec.default)
        for input_spec in pipeline_meta.inputs
    ])

  if params_list:
    # Or, if args are provided by params_list, fill in pipeline_meta.
    for k, v in default_param_values.items():
      args_list_with_defaults[k] = v

    pipeline_meta.inputs = pipeline_meta.inputs or []
    for param in params_list:
      pipeline_meta.inputs.append(
          InputSpec(
              name=param.name,
              type=param.param_type,
              default=default_param_values[param.name]))

  op_transformers = [add_pod_env]
  pod_labels = {
      _SDK_VERSION_LABEL: kfp.__version__,
      _SDK_ENV_LABEL: _SDK_ENV_DEFAULT
  }
  op_transformers.append(add_pod_labels(pod_labels))
  op_transformers.extend(pipeline_conf.op_transformers)

  if self._mode == dsl.PipelineExecutionMode.V2_COMPATIBLE:
    # Add self._pipeline_name_param and self._pipeline_root_param to ops inputs
    # if they don't exist already.
    for op in dsl_pipeline.ops.values():
      insert_pipeline_name_param = True
      insert_pipeline_root_param = True
      for param in op.inputs:
        if param.name == self._pipeline_name_param.name:
          insert_pipeline_name_param = False
        elif param.name == self._pipeline_root_param.name:
          insert_pipeline_root_param = False

      if insert_pipeline_name_param:
        op.inputs.append(self._pipeline_name_param)
      if insert_pipeline_root_param:
        op.inputs.append(self._pipeline_root_param)

  workflow = self._create_pipeline_workflow(
      args_list_with_defaults,
      dsl_pipeline,
      op_transformers,
      pipeline_conf,
  )

  from ._data_passing_rewriter import fix_big_data_passing
  workflow = fix_big_data_passing(workflow)

  workflow = _data_passing_rewriter.add_pod_name_passing(
      workflow, str(self._pipeline_root_param or None))

  if pipeline_conf and pipeline_conf.data_passing_method is not None:
    workflow = pipeline_conf.data_passing_method(workflow)

  metadata = workflow.setdefault('metadata', {})
  annotations = metadata.setdefault('annotations', {})
  labels = metadata.setdefault('labels', {})

  annotations[_SDK_VERSION_LABEL] = kfp.__version__
  annotations['pipelines.kubeflow.org/pipeline_compilation_time'] = datetime.datetime.now().isoformat()
  annotations['pipelines.kubeflow.org/pipeline_spec'] = json.dumps(
      pipeline_meta.to_dict(), sort_keys=True)

  if self._mode == dsl.PipelineExecutionMode.V2_COMPATIBLE:
    annotations['pipelines.kubeflow.org/v2_pipeline'] = "true"
    labels['pipelines.kubeflow.org/v2_pipeline'] = "true"

  # Labels might be logged better than annotations so adding some information
  # here as well.
  labels[_SDK_VERSION_LABEL] = kfp.__version__

  return workflow