def setUp(self):
  super(BaseComponentWithPipelineParamTest, self).setUp()

  # Builds a two-component pipeline whose ExampleGen hash-bucket count is a
  # RuntimeParameter, then wraps both TFX components as Kubeflow
  # BaseComponents under a dsl.Pipeline context.
  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
  example_gen_buckets = data_types.RuntimeParameter(
      name='example-gen-buckets', ptype=int, default=10)

  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input=channel_utils.as_channel([examples]),
      output_config={
          'split_config': {
              'splits': [{
                  'name': 'examples',
                  'hash_buckets': example_gen_buckets
              }]
          }
      })
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'], instance_name='foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = (
      'MYSQL_SERVICE_HOST')
  self._tfx_ir = pipeline_pb2.Pipeline()
  with dsl.Pipeline('test_pipeline'):
    self.example_gen = base_component.BaseComponent(
        component=example_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
        tfx_ir=self._tfx_ir)
    self.statistics_gen = base_component.BaseComponent(
        component=statistics_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
        tfx_ir=self._tfx_ir)

  self.tfx_example_gen = example_gen
  self.tfx_statistics_gen = statistics_gen

def setUp(self):
  super(BaseComponentWithPipelineParamTest, self).setUp()

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
  example_gen_output_name = runtime_string_parameter.RuntimeStringParameter(
      name='example-gen-output-name', default='default-to-be-discarded')

  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input=channel_utils.as_channel([examples]),
      output_config=example_gen_pb2.Output(
          split_config=example_gen_pb2.SplitConfig(splits=[
              example_gen_pb2.SplitConfig.Split(
                  name=example_gen_output_name, hash_buckets=10)
          ])))
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'], instance_name='foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = (
      'MYSQL_SERVICE_HOST')
  with dsl.Pipeline('test_pipeline'):
    self.example_gen = base_component.BaseComponent(
        component=example_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None)
    self.statistics_gen = base_component.BaseComponent(
        component=statistics_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
    )

  self.tfx_example_gen = example_gen
  self.tfx_statistics_gen = statistics_gen

def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline):
  """Constructs a Kubeflow Pipeline graph.

  Args:
    pipeline: The logical TFX pipeline to base the construction on.
  """
  component_to_kfp_op = {}

  # Assumption: There is a partial ordering of components in the list, i.e.,
  # if component A depends on component B and C, then A appears after B and C
  # in the list.
  for component in pipeline.components:
    # Keep track of the set of upstream dsl.ContainerOps for this component.
    depends_on = set()

    for upstream_component in component.upstream_nodes:
      depends_on.add(component_to_kfp_op[upstream_component])

    kfp_component = base_component.BaseComponent(
        component=component,
        depends_on=depends_on,
        pipeline=pipeline,
        tfx_image=self._config.tfx_image,
        kubeflow_metadata_config=self._config.kubeflow_metadata_config)

    for operator in self._config.pipeline_operator_funcs:
      kfp_component.container_op.apply(operator)

    component_to_kfp_op[component] = kfp_component.container_op

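# A minimal sketch of the partial-ordering assumption documented in
# _construct_pipeline_graph above: every upstream node must already have been
# visited (and mapped to a ContainerOp) by the time a component is reached.
# The helper name `_check_topologically_sorted` is hypothetical, not part of
# the runner.
def _check_topologically_sorted(components):
  seen = set()
  for component in components:
    for upstream_component in component.upstream_nodes:
      if upstream_component not in seen:
        raise ValueError('{} appears before its upstream node {}.'.format(
            component, upstream_component))
    seen.add(component)
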
def setUp(self):
  super(BaseComponentTest, self).setUp()

  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input_base=channel_utils.as_channel([examples]))
  statistics_gen = statistics_gen_component.StatisticsGen(
      input_data=example_gen.outputs.examples, instance_name='foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name='test_pipeline',
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = (
      'MYSQL_SERVICE_HOST')
  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component=statistics_gen,
        depends_on=set(),
        pipeline=pipeline,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
    )
  self.tfx_component = statistics_gen

def setUp(self):
  super(BaseComponentTest, self).setUp()

  examples = standard_artifacts.ExternalArtifact()
  example_gen = csv_example_gen_component.CsvExampleGen(
      input=channel_utils.as_channel([examples]))
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples'], instance_name='foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = (
      'MYSQL_SERVICE_HOST')
  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component=statistics_gen,
        component_launcher_class=in_process_component_launcher
        .InProcessComponentLauncher,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_name=self._test_pipeline_name,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        component_config=None,
    )
  self.tfx_component = statistics_gen

def setUp(self):
  super(BaseComponentTest, self).setUp()

  example_gen = csv_example_gen_component.CsvExampleGen(
      input_base='data_input')
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples']).with_id('foo')

  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = (
      'MYSQL_SERVICE_HOST')
  self._tfx_ir = pipeline_pb2.Pipeline()
  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component=statistics_gen,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        tfx_ir=self._tfx_ir,
    )
  self.tfx_component = statistics_gen

def setUp(self):
  super().setUp()

  # Parameterizes ExampleGen's output_config as a string RuntimeParameter and
  # threads it through to the Kubeflow wrapper via `runtime_parameters`.
  example_gen_output_config = data_types.RuntimeParameter(
      name='example-gen-output-config', ptype=str)
  example_gen = csv_example_gen_component.CsvExampleGen(
      input_base='data_root', output_config=example_gen_output_config)
  statistics_gen = statistics_gen_component.StatisticsGen(
      examples=example_gen.outputs['examples']).with_id('foo')

  test_pipeline_root = dsl.PipelineParam(name='pipeline-root-param')
  pipeline = tfx_pipeline.Pipeline(
      pipeline_name=self._test_pipeline_name,
      pipeline_root='test_pipeline_root',
      metadata_connection_config=metadata_store_pb2.ConnectionConfig(),
      components=[example_gen, statistics_gen],
  )

  self._metadata_config = kubeflow_pb2.KubeflowMetadataConfig()
  self._metadata_config.mysql_db_service_host.environment_variable = (
      'MYSQL_SERVICE_HOST')
  self._tfx_ir = pipeline_pb2.Pipeline()
  with dsl.Pipeline('test_pipeline'):
    self.example_gen = base_component.BaseComponent(
        component=example_gen,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        tfx_ir=self._tfx_ir,
        pod_labels_to_attach={},
        runtime_parameters=[example_gen_output_config])
    self.statistics_gen = base_component.BaseComponent(
        component=statistics_gen,
        depends_on=set(),
        pipeline=pipeline,
        pipeline_root=test_pipeline_root,
        tfx_image='container_image',
        kubeflow_metadata_config=self._metadata_config,
        tfx_ir=self._tfx_ir,
        pod_labels_to_attach={},
        runtime_parameters=[])

  self.tfx_example_gen = example_gen
  self.tfx_statistics_gen = statistics_gen

def setUp(self):
  output_dict = {'output_name': [types.TfxType(type_name='ExamplesPath')]}

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component_name='TFXComponent',
        input_dict={
            'input_data': 'input-data-contents',
            'train_steps': 300,
            'accuracy_threshold': 0.3,
        },
        output_dict=output_dict,
        exec_properties={'module_file': '/path/to/module.py'},
    )

def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline,
                              pipeline_root: dsl.PipelineParam):
  """Constructs a Kubeflow Pipeline graph.

  Args:
    pipeline: The logical TFX pipeline to base the construction on.
    pipeline_root: dsl.PipelineParam representing the pipeline root.
  """
  component_to_kfp_op = {}

  # Assumption: There is a partial ordering of components in the list, i.e.,
  # if component A depends on component B and C, then A appears after B and C
  # in the list.
  for component in pipeline.components:
    # Keep track of the set of upstream dsl.ContainerOps for this component.
    depends_on = set()

    for upstream_component in component.upstream_nodes:
      depends_on.add(component_to_kfp_op[upstream_component])

    (component_launcher_class,
     component_config) = config_utils.find_component_launch_info(
         self._config, component)

    kfp_component = base_component.BaseComponent(
        component=component,
        component_launcher_class=component_launcher_class,
        depends_on=depends_on,
        pipeline=pipeline,
        pipeline_name=pipeline.pipeline_info.pipeline_name,
        pipeline_root=pipeline_root,
        tfx_image=self._config.tfx_image,
        kubeflow_metadata_config=self._config.kubeflow_metadata_config,
        component_config=component_config)

    for operator in self._config.pipeline_operator_funcs:
      kfp_component.container_op.apply(operator)

    kfp_component.container_op.add_pod_label(SDK_ENV_LABEL, self._sdk_env)

    assert self._pipeline_id, 'Failed to generate pipeline ID.'
    kfp_component.container_op.add_pod_label(PIPELINE_UUID_LABEL,
                                             self._pipeline_id)

    component_to_kfp_op[component] = kfp_component.container_op

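# A hedged usage sketch for `pipeline_operator_funcs` above: each entry is a
# callable that `container_op.apply(...)` invokes with the component's
# dsl.ContainerOp. The operator below is illustrative (the label key/value
# are made up); `add_pod_label` is the same ContainerOp method the runner
# itself calls.
def _attach_team_label(container_op):
  container_op.add_pod_label('team', 'ml-infra')
  return container_op

# e.g. passing pipeline_operator_funcs=[_attach_team_label] in the runner
# config would attach the label to every component's pod.
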
def _construct_pipeline_graph(self, pipeline: tfx_pipeline.Pipeline,
                              pipeline_root: dsl.PipelineParam):
  """Constructs a Kubeflow Pipeline graph.

  Args:
    pipeline: The logical TFX pipeline to base the construction on.
    pipeline_root: dsl.PipelineParam representing the pipeline root.
  """
  component_to_kfp_op = {}
  tfx_ir = self._generate_tfx_ir(pipeline)

  # Assumption: There is a partial ordering of components in the list, i.e.,
  # if component A depends on component B and C, then A appears after B and C
  # in the list.
  for component in pipeline.components:
    # Keep track of the set of upstream dsl.ContainerOps for this component.
    depends_on = set()

    for upstream_component in component.upstream_nodes:
      depends_on.add(component_to_kfp_op[upstream_component])

    # Remove the extra pipeline node information.
    tfx_node_ir = self._dehydrate_tfx_ir(tfx_ir, component.id)

    kfp_component = base_component.BaseComponent(
        component=component,
        depends_on=depends_on,
        pipeline=pipeline,
        pipeline_root=pipeline_root,
        tfx_image=self._config.tfx_image,
        kubeflow_metadata_config=self._config.kubeflow_metadata_config,
        pod_labels_to_attach=self._pod_labels_to_attach,
        tfx_ir=tfx_node_ir,
        metadata_ui_path=self._config.metadata_ui_path,
        runtime_parameters=(self._params_by_component_id[component.id] +
                            [tfx_pipeline.ROOT_PARAMETER]))

    for operator in self._config.pipeline_operator_funcs:
      kfp_component.container_op.apply(operator)

    component_to_kfp_op[component] = kfp_component.container_op

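# `_dehydrate_tfx_ir` above is not shown in this section; here is a plausible
# sketch, under the assumption that it keeps only the IR node matching
# `component.id` so each KFP step ships a single-node pipeline proto. Field
# names follow the TFX IR (pipeline_pb2.Pipeline); treat this as an
# illustration, not the actual implementation.
def _dehydrate_tfx_ir(self, original_pipeline, node_id):
  pipeline = pipeline_pb2.Pipeline()
  pipeline.CopyFrom(original_pipeline)
  del pipeline.nodes[:]
  for node in original_pipeline.nodes:
    if (node.WhichOneof('node') == 'pipeline_node' and
        node.pipeline_node.node_info.id == node_id):
      pipeline.nodes.add().CopyFrom(node)
      break
  return pipeline
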
def _construct_pipeline_graph(self, pipeline):
  """Constructs a Kubeflow Pipeline graph.

  Args:
    pipeline: The logical TFX pipeline to base the construction on.
  """
  # producers is a map from an output Channel, to a Kubeflow component that
  # is responsible for the named output represented by the Channel.
  # Assumption: Channels are unique in a pipeline.
  producers = {}

  # Assumption: There is a partial ordering of components in the list, i.e.,
  # if component A depends on component B and C, then A appears after B and C
  # in the list.
  for component in pipeline.components:
    input_dict = {}
    for input_name, input_channel in component.input_dict.items():
      if input_channel in producers:
        output = getattr(producers[input_channel]['component'].outputs,
                         producers[input_channel]['channel_name'])
        if not isinstance(output, dsl.PipelineParam):
          raise ValueError(
              'Component outputs should be of type dsl.PipelineParam.'
              ' Got type {} for output {}'.format(type(output), output))
        input_dict[input_name] = output
      else:
        input_dict[input_name] = json.dumps(
            [x.json_dict() for x in input_channel.get()])

    kfp_component = base_component.BaseComponent(
        component_name=component.component_name,
        input_dict=input_dict,
        output_dict=self._prepare_output_dict(component.outputs),
        exec_properties=component.exec_properties)

    for channel_name, channel in component.outputs.get_all().items():
      producers[channel] = {}
      producers[channel]['component'] = kfp_component
      producers[channel]['channel_name'] = channel_name

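# `_prepare_output_dict` is referenced above but not shown; a plausible
# sketch (an assumption, not the actual TFX code) that serializes each named
# output channel the same way unsatisfied inputs are serialized above:
def _prepare_output_dict(self, outputs):
  return {
      name: [x.json_dict() for x in channel.get()]
      for name, channel in outputs.get_all().items()
  }
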
def setUp(self):
  self._output_dict = {'output_name': [standard_artifacts.Examples()]}
  self._pipeline_properties = base_component.PipelineProperties(
      output_dir='output_dir',
      log_root='log_root',
  )

  with dsl.Pipeline('test_pipeline'):
    self.component = base_component.BaseComponent(
        component_name='TFXComponent',
        input_dict=collections.OrderedDict([
            ('input_data', 'input-data-contents'),
            ('train_steps', 300),
            ('accuracy_threshold', 0.3),
        ]),
        output_dict=self._output_dict,
        exec_properties=collections.OrderedDict([
            ('module_file', '/path/to/module.py')
        ]),
        executor_class_path='some.executor.Class',
        pipeline_properties=self._pipeline_properties,
    )

def _construct_pipeline_graph(self, pipeline):
  """Constructs a Kubeflow Pipeline graph.

  Args:
    pipeline: The logical TFX pipeline to base the construction on.
  """
  output_dir = os.path.join(pipeline.pipeline_args['pipeline_root'],
                            pipeline.pipeline_args['pipeline_name'])
  beam_pipeline_args = []
  tfx_image = None

  if 'additional_pipeline_args' in pipeline.pipeline_args:
    additional_pipeline_args = pipeline.pipeline_args[
        'additional_pipeline_args']
    beam_pipeline_args = additional_pipeline_args.get('beam_pipeline_args', [])
    tfx_image = additional_pipeline_args.get('tfx_image')

  pipeline_properties = base_component.PipelineProperties(
      output_dir=output_dir,
      log_root=pipeline.pipeline_args['log_root'],
      beam_pipeline_args=beam_pipeline_args,
      tfx_image=tfx_image,
  )

  # producers is a map from an output Channel, to a Kubeflow component that
  # is responsible for the named output represented by the Channel.
  # Assumption: Channels are unique in a pipeline.
  producers = {}

  # Assumption: There is a partial ordering of components in the list, i.e.,
  # if component A depends on component B and C, then A appears after B and C
  # in the list.
  for component in pipeline.components:
    input_dict = {}
    for input_name, input_channel in component.input_dict.items():
      if input_channel in producers:
        output = getattr(producers[input_channel]['component'].outputs,
                         producers[input_channel]['channel_name'])
        if not isinstance(output, dsl.PipelineParam):
          raise ValueError(
              'Component outputs should be of type dsl.PipelineParam.'
              ' Got type {} for output {}'.format(type(output), output))
        input_dict[input_name] = output
      else:
        input_dict[input_name] = json.dumps(
            [x.json_dict() for x in input_channel.get()])

    executor_class_path = '.'.join(
        [component.executor.__module__, component.executor.__name__])

    kfp_component = base_component.BaseComponent(
        component_name=component.component_name,
        input_dict=input_dict,
        output_dict=self._prepare_output_dict(component.outputs),
        exec_properties=component.exec_properties,
        executor_class_path=executor_class_path,
        pipeline_properties=pipeline_properties)

    for channel_name, channel in component.outputs.get_all().items():
      producers[channel] = {}
      producers[channel]['component'] = kfp_component
      producers[channel]['channel_name'] = channel_name

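# A small, self-contained illustration of how the `executor_class_path`
# computed above can be re-imported inside the container at run time. The
# helper name `_import_executor_class` is hypothetical; TFX's actual
# entrypoint uses its own import utilities.
import importlib


def _import_executor_class(executor_class_path):
  module_name, class_name = executor_class_path.rsplit('.', 1)
  return getattr(importlib.import_module(module_name), class_name)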