class StratifiedSampler(base_component.BaseComponent):
  """A TFX component to do stratified sampling.

  StratifiedSampler consumes examples data, and produces examples data

  ## Example
  ```
  # Uses StratifiedSampler to inference on examples.
  stratified_sampler = StratifiedSampler(
      key='trip_miles',
      examples=example_gen.outputs['examples'])
  ```
  """

  SPEC_CLASS = StratifiedSamplerSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel,
               to_key_fn: Optional[Text] = None,
               to_key_fn_key: Optional[Text] = 'to_key_fn',
               pipeline_configuration: Optional[types.Channel] = None,
               stratified_examples: Optional[types.Channel] = None,
               splits_to_transform: Optional[List[Text]] = None,
               splits_to_copy: Optional[List[Text]] = None,
               samples_per_key: Optional[int] = None):
    """Construct a StratifiedSampler component.

    Args:
      examples: A Channel of 'Examples' type, usually produced by ExampleGen
        component. _required_
      pipeline_configuration: A Channel of 'PipelineConfiguration' type,
        usually produced by FromCustomConfig component.
      stratified_examples: Channel of `Examples` to store the inference
        results.
      splits_to_transform: Optional list of split names to transform.
      splits_to_copy: Optional list of split names to copy.
      samples_per_key: Number of samples per key.
      to_key_fn_key: the name of the key that contains the to_key_fn -
        default is 'to_key_fn'.
      to_key_fn: To key function, the function that will extract the key -
        must be 'to_key: Example -> key
        For example something like:
        >>> def to_key(m):
        >>>   return m.features.feature['trip_miles'].float_list.value[0] > 42.
    """
    # BUGFIX: the original assigned a default channel with `or` first, which
    # made its following `if stratified_examples is None:` branch unreachable
    # dead code. Keep the live behavior (plain default channel) and drop the
    # dead branch.
    if stratified_examples is None:
      stratified_examples = types.Channel(type=standard_artifacts.Examples)
    spec = StratifiedSamplerSpec(
        examples=examples,
        pipeline_configuration=pipeline_configuration,
        stratified_examples=stratified_examples,
        splits_to_transform=json_utils.dumps(splits_to_transform),
        splits_to_copy=json_utils.dumps(splits_to_copy),
        to_key_fn=to_key_fn,
        to_key_fn_key=to_key_fn_key,
        samples_per_key=samples_per_key)
    super(StratifiedSampler, self).__init__(spec=spec)
class StatisticsGen(base_beam_component.BaseBeamComponent):
  """Official TFX StatisticsGen component.

  The StatisticsGen component generates features statistics and random samples
  over training data, which can be used for visualization and validation.
  StatisticsGen uses Apache Beam and approximate algorithms to scale to large
  datasets.

  Please see https://www.tensorflow.org/tfx/data_validation for more details.

  ## Example
  ```
  # Computes statistics over data for visualization and example validation.
  statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  ```

  Component `outputs` contains:
   - `statistics`: Channel of type `standard_artifacts.ExampleStatistics` for
     statistics of each split provided in the input examples.
  """

  SPEC_CLASS = StatisticsGenSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel = None,
               schema: Optional[types.Channel] = None,
               stats_options: Optional[tfdv.StatsOptions] = None,
               exclude_splits: Optional[List[Text]] = None):
    """Construct a StatisticsGen component.

    Args:
      examples: A Channel of `ExamplesPath` type, likely generated by the
        [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
        This needs to contain two splits labeled `train` and `eval`. _required_
      schema: A `Schema` channel to use for automatically configuring the value
        of stats options passed to TFDV.
      stats_options: The StatsOptions instance to configure optional TFDV
        behavior. When stats_options.schema is set, it will be used instead of
        the `schema` channel input. Due to the requirement that stats_options be
        serialized, the slicer functions and custom stats generators are dropped
        and are therefore not usable.
      exclude_splits: Names of splits where statistics and sample should not
        be generated. Default behavior (when exclude_splits is set to None) is
        excluding no splits.
    """
    if exclude_splits is None:
      logging.info('Excluding no splits because exclude_splits is not set.')
      exclude_splits = []
    # TODO(b/150802589): Move jsonable interface to tfx_bsl and use json_utils.
    serialized_stats_options = None
    if stats_options:
      serialized_stats_options = stats_options.to_json()
    output_statistics = types.Channel(
        type=standard_artifacts.ExampleStatistics)
    spec = StatisticsGenSpec(
        examples=examples,
        schema=schema,
        stats_options_json=serialized_stats_options,
        exclude_splits=json_utils.dumps(exclude_splits),
        statistics=output_statistics)
    super(StatisticsGen, self).__init__(spec=spec)
class TestFileBasedExampleGenComponent(component.FileBasedExampleGen):
  """File-based ExampleGen test double wired to the test executor."""

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(TestExampleGenExecutor)

  def __init__(self, input_base, input_config=None, output_config=None):
    super().__init__(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config)
def testConstructCustomExecutor(self):
  """A custom executor spec must not change the driver or output types."""
  example_gen = component.FileBasedExampleGen(
      input_base='path',
      custom_executor_spec=executor_spec.BeamExecutorSpec(
          TestExampleGenExecutor))
  # Driver stays the file-based default even with a custom executor.
  self.assertEqual(driver.FileBasedDriver, example_gen.driver_class)
  output_type = example_gen.outputs[
      standard_component_specs.EXAMPLES_KEY].type_name
  self.assertEqual(standard_artifacts.Examples.TYPE_NAME, output_type)
class BigQueryToElwcExampleGen(component.QueryBasedExampleGen):
  """Official TFX BigQueryToElwcExampleGen component.

  The BigQueryToElwcExampleGen component takes a query, and generates train
  and eval ExampleListWithContext(ELWC) for downstream components.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               query: Optional[Text] = None,
               elwc_config: Optional[elwc_config_pb2.ElwcConfig] = None,
               input_config: Optional[example_gen_pb2.Input] = None,
               output_config: Optional[example_gen_pb2.Output] = None,
               example_artifacts: Optional[types.Channel] = None):
    """Constructs a BigQueryElwcExampleGen component.

    Args:
      query: BigQuery sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      elwc_config: The elwc config contains a list of context feature fields.
        The fields are used to build context feature. Examples with the same
        context feature will be converted to an ELWC(ExampleListWithContext)
        instance. For example, when there are two examples with the same
        context field, the two examples will be intergrated to a ELWC instance.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        BigQuery sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with the
        same field names as Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval' with
        size 2:1. If any field is provided as a RuntimeParameter, input_config
        should be constructed as a dict with the same field names as Output
        proto message.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.

    Raises:
      RuntimeError: Only one of query and input_config should be set and
        elwc_config is required.
    """
    # Exactly one of the two input sources may be provided.
    if bool(query) == bool(input_config):
      raise RuntimeError('Exactly one of query and input_config should be set.')
    if not elwc_config:
      raise RuntimeError(
          'elwc_config is required for BigQueryToElwcExampleGen.')
    if not input_config:
      input_config = utils.make_default_input_config(query)
    # The ELWC config rides along inside the generic CustomConfig wrapper.
    packed_custom_config = example_gen_pb2.CustomConfig()
    packed_custom_config.custom_config.Pack(elwc_config)
    super(BigQueryToElwcExampleGen, self).__init__(
        input_config=input_config,
        output_config=output_config,
        output_data_format=example_gen_pb2.FORMAT_PROTO,
        custom_config=packed_custom_config,
        example_artifacts=example_artifacts)
class ImportExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
  """Official TFX ImportExampleGen component.

  The ImportExampleGen component takes TFRecord files with TF Example data
  format, and generates train and eval examples for downstream components.
  This component provides consistent and configurable partition, and it also
  shuffle the dataset for ML best practice.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      input_base: Optional[Text] = None,
      input_config: Optional[Union[example_gen_pb2.Input, Dict[Text,
                                                               Any]]] = None,
      output_config: Optional[Union[example_gen_pb2.Output, Dict[Text,
                                                                 Any]]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   Dict[Text, Any]]] = None,
      payload_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
      example_artifacts: Optional[types.Channel] = None,
      instance_name: Optional[Text] = None):
    """Construct an ImportExampleGen component.

    Args:
      input_base: an external directory containing the TFRecord files.
      input_config: An example_gen_pb2.Input instance, providing input
        configuration. If unset, the files under input_base will be treated as
        a single split. If any field is provided as a RuntimeParameter,
        input_config should be constructed as a dict with the same field names
        as Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        output_config should be constructed as a dict with the same field
        names as Output proto message.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider. If unset, driver will
        default to searching for latest span with no restrictions.
      payload_format: Payload format of input data. Should be one of
        example_gen_pb2.PayloadFormat enum. Note that payload format of output
        data is the same as input.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      instance_name: Optional unique instance name. Necessary if multiple
        ImportExampleGen components are declared in the same pipeline.
    """
    # Pure pass-through: the payload format of the input doubles as the
    # output data format, everything else is forwarded unchanged.
    super(ImportExampleGen, self).__init__(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        range_config=range_config,
        example_artifacts=example_artifacts,
        output_data_format=payload_format,
        instance_name=instance_name)
class PrestoExampleGen(component.QueryBasedExampleGen):  # pylint: disable=protected-access
  """Official TFX PrestoExampleGen component.

  The Presto examplegen component takes a query, connection client
  configuration, and generates train and eval examples for downstream
  components.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               conn_config: presto_config_pb2.PrestoConnConfig,
               query: Optional[Text] = None,
               input_config: Optional[example_gen_pb2.Input] = None,
               output_config: Optional[example_gen_pb2.Output] = None,
               example_artifacts: Optional[types.Channel] = None):
    """Constructs a PrestoExampleGen component.

    Args:
      conn_config: Parameters for Presto connection client.
      query: Presto sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        Presto sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.

    Raises:
      RuntimeError: Only one of query and input_config should be set. Or
        required host field in connection_config should be set.
    """
    # Validate mutually exclusive input sources and mandatory connection host.
    if bool(query) == bool(input_config):
      raise RuntimeError(
          'Exactly one of query and input_config should be set.')
    if not bool(conn_config.host):
      raise RuntimeError(
          'Required host field in connection config should be set.')

    if input_config is None:
      input_config = utils.make_default_input_config(query)
    if output_config is None:
      output_config = utils.make_default_output_config(input_config)

    # Ship the connection parameters to the executor via CustomConfig.
    packed_custom_config = example_gen_pb2.CustomConfig()
    packed_custom_config.custom_config.Pack(conn_config)

    super(PrestoExampleGen, self).__init__(
        input_config=input_config,
        output_config=output_config,
        custom_config=packed_custom_config,
        example_artifacts=example_artifacts)
class Filter(base_component.BaseComponent):
  """A TFX component to do filtering.

  Filter consumes examples data, and produces examples data

  ## Example
    # Uses Filter to inference on examples.
    >>> filter = Filter(
    >>>     predicate_fn="def predicate(m):...",
    >>>     examples=example_gen.outputs['examples'])
  """

  SPEC_CLASS = FilterSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel,
               predicate_fn: Optional[Text] = None,
               predicate_fn_key: Optional[Text] = 'predicate_fn',
               pipeline_configuration: Optional[types.Channel] = None,
               filtered_examples: Optional[types.Channel] = None,
               splits_to_transform: Optional[List[Text]] = None,
               splits_to_copy: Optional[List[Text]] = None):
    """Construct a Filter component.

    Args:
      examples: A Channel of 'Examples' type, usually produced by ExampleGen
        component. _required_
      pipeline_configuration: A Channel of 'PipelineConfiguration' type,
        usually produced by FromCustomConfig component.
      filtered_examples: Channel of `Examples` to store the inference results.
      splits_to_transform: Optional list of split names to transform.
      splits_to_copy: Optional list of split names to copy.
      predicate_fn_key: the name of the key that contains the predicate_fn -
        default is 'predicate_fn'.
      predicate_fn: Predicate function, the function that will tell if an
        example must be kept. Must be 'predicate: Example -> bool.
        For example something like:
        >>> def predicate(m):
        >>>   return m.features.feature['trip_miles'].float_list.value[0] > 42.
    """
    # BUGFIX: the original assigned a default channel with `or` first, which
    # made its following `if filtered_examples is None:` branch unreachable
    # dead code. Keep the live behavior (plain default channel) and drop the
    # dead branch.
    if filtered_examples is None:
      filtered_examples = types.Channel(type=standard_artifacts.Examples)
    spec = FilterSpec(
        examples=examples,
        pipeline_configuration=pipeline_configuration,
        filtered_examples=filtered_examples,
        splits_to_transform=json_utils.dumps(splits_to_transform),
        splits_to_copy=json_utils.dumps(splits_to_copy),
        predicate_fn=predicate_fn,
        predicate_fn_key=predicate_fn_key)
    super(Filter, self).__init__(spec=spec)
class _FakeBeamComponent(base_beam_component.BaseBeamComponent):
  # Minimal Beam component stub used for testing: wraps arbitrary spec kwargs
  # into a _FakeComponentSpec with a single typed output channel.

  SPEC_CLASS = _FakeComponentSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(base_executor.BaseExecutor)

  def __init__(
      self,
      type: Type[types.Artifact],  # pylint: disable=redefined-builtin
      spec_kwargs: Dict[Text, Any]):
    spec = _FakeComponentSpec(output=types.Channel(type=type), **spec_kwargs)
    # NOTE(review): `name` is not defined in this scope — presumably a closure
    # variable of an enclosing test function (this class appears to be nested).
    # TODO confirm; if this class is ever module-level, this is a NameError.
    super(_FakeBeamComponent, self).__init__(spec=spec, instance_name=name)
class TestQueryBasedExampleGenComponent(component.QueryBasedExampleGen):
  """Query-based ExampleGen test double wired to the test executor."""

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(TestExampleGenExecutor)

  def __init__(self,
               input_config,
               output_config=None,
               output_data_format=example_gen_pb2.FORMAT_TF_EXAMPLE):
    super().__init__(
        input_config=input_config,
        output_config=output_config,
        output_data_format=output_data_format)
def testConstructWithCustomConfig(self):
  """The custom_config proto must round-trip through exec_properties."""
  original_config = example_gen_pb2.CustomConfig(custom_config=any_pb2.Any())
  example_gen = component.FileBasedExampleGen(
      input_base='path',
      custom_config=original_config,
      custom_executor_spec=executor_spec.BeamExecutorSpec(
          TestExampleGenExecutor))

  # Deserialize what the component stored and compare against the input.
  restored_config = example_gen_pb2.CustomConfig()
  serialized = example_gen.exec_properties[
      standard_component_specs.CUSTOM_CONFIG_KEY]
  proto_utils.json_to_proto(serialized, restored_config)
  self.assertEqual(original_config, restored_config)
class BigQueryExampleGen(component.QueryBasedExampleGen):
  """Cloud BigQueryExampleGen component.

  The BigQuery examplegen component takes a query, and generates train
  and eval examples for downstream components.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output
     train and eval examples.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      query: Optional[str] = None,
      input_config: Optional[Union[example_gen_pb2.Input,
                                   data_types.RuntimeParameter]] = None,
      output_config: Optional[Union[example_gen_pb2.Output,
                                    data_types.RuntimeParameter]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   data_types.RuntimeParameter]] = None):
    """Constructs a BigQueryExampleGen component.

    Args:
      query: BigQuery sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        BigQuery sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        input_config should be constructed as a dict with the same field names
        as Output proto message.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider.

    Raises:
      RuntimeError: Only one of query and input_config should be set.
    """
    # `query` and `input_config` are mutually exclusive — exactly one is
    # required.
    if bool(query) == bool(input_config):
      raise RuntimeError(
          'Exactly one of query and input_config should be set.')
    if input_config is None:
      input_config = utils.make_default_input_config(query)
    super().__init__(
        input_config=input_config,
        output_config=output_config,
        range_config=range_config)
def testConstructWithStaticRangeConfig(self):
  """A static range_config must round-trip through exec_properties."""
  original_range = range_config_pb2.RangeConfig(
      static_range=range_config_pb2.StaticRange(
          start_span_number=1, end_span_number=1))
  example_gen = component.FileBasedExampleGen(
      input_base='path',
      range_config=original_range,
      custom_executor_spec=executor_spec.BeamExecutorSpec(
          TestExampleGenExecutor))

  # Parse the stored JSON back into a proto and compare with the input.
  restored_range = range_config_pb2.RangeConfig()
  serialized = example_gen.exec_properties[
      standard_component_specs.RANGE_CONFIG_KEY]
  proto_utils.json_to_proto(serialized, restored_range)
  self.assertEqual(original_range, restored_range)
class ImportExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
  """Official TFX ImportExampleGen component.

  The ImportExampleGen component takes TFRecord files with TF Example data
  format, and generates train and eval examples for downstream components.
  This component provides consistent and configurable partition, and it also
  shuffle the dataset for ML best practice.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output
     train and eval examples.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      input_base: Optional[str] = None,
      input_config: Optional[Union[example_gen_pb2.Input,
                                   data_types.RuntimeParameter]] = None,
      output_config: Optional[Union[example_gen_pb2.Output,
                                    data_types.RuntimeParameter]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   data_types.RuntimeParameter]] = None,
      payload_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE):
    """Construct an ImportExampleGen component.

    Args:
      input_base: an external directory containing the TFRecord files.
      input_config: An example_gen_pb2.Input instance, providing input
        configuration. If unset, the files under input_base will be treated as
        a single split.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider. If unset, driver will
        default to searching for latest span with no restrictions.
      payload_format: Payload format of input data. Should be one of
        example_gen_pb2.PayloadFormat enum. Note that payload format of output
        data is the same as input.
    """
    # Pure pass-through constructor: the input payload format is forwarded
    # as the output data format.
    super().__init__(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        range_config=range_config,
        output_data_format=payload_format)
class _FakeBeamComponent(base_beam_component.BaseBeamComponent):
  # Minimal Beam component stub for tests: wraps arbitrary spec kwargs into a
  # _FakeComponentSpec with a single typed output channel, then overrides the
  # component id and optionally injects a dynamic exec property.

  SPEC_CLASS = _FakeComponentSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_executor.BaseExecutor)

  def __init__(
      self,
      type: Type[types.Artifact],  # pylint: disable=redefined-builtin
      spec_kwargs: Dict[str, Any]):
    spec = _FakeComponentSpec(output=types.Channel(type=type), **spec_kwargs)
    super().__init__(spec=spec)
    # NOTE(review): `name` and `dynamic_exec_property` are not defined in this
    # scope — presumably closure variables of an enclosing test function (this
    # class appears to be nested). TODO confirm; at module level these would
    # raise NameError.
    self._id = name
    if dynamic_exec_property is not None:
      self.exec_properties['exec_prop'] = dynamic_exec_property
class BigQueryExampleGen(component.QueryBasedExampleGen):
  """Official TFX BigQueryExampleGen component.

  The BigQuery examplegen component takes a query, and generates train
  and eval examples for downstream components.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               query: Optional[Text] = None,
               input_config: Optional[example_gen_pb2.Input] = None,
               output_config: Optional[example_gen_pb2.Output] = None,
               example_artifacts: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Constructs a BigQueryExampleGen component.

    Args:
      query: BigQuery sql string, query result will be treated as a single
        split, can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as
        BigQuery sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        input_config should be constructed as a dict with the same field names
        as Output proto message.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      instance_name: Optional unique instance name. Necessary if multiple
        BigQueryExampleGen components are declared in the same pipeline.

    Raises:
      RuntimeError: Only one of query and input_config should be set.
    """
    # Exactly one of `query` / `input_config` must be provided.
    if bool(query) == bool(input_config):
      raise RuntimeError(
          'Exactly one of query and input_config should be set.')
    if input_config is None:
      input_config = utils.make_default_input_config(query)
    super(BigQueryExampleGen, self).__init__(
        input_config=input_config,
        output_config=output_config,
        example_artifacts=example_artifacts,
        instance_name=instance_name)
def testBeamExecutorSpecCopy(self):
  """A copied BeamExecutorSpec must survive deletion of the original."""

  class _NestedExecutor(base_executor.BaseExecutor):
    pass

  original = executor_spec.BeamExecutorSpec(_NestedExecutor)
  original.add_extra_flags('a')
  original.add_beam_pipeline_args('b')
  clone = original.copy()
  # Drop the source spec to prove the copy is fully independent.
  del original
  self.assertProtoEquals(
      """
        python_executor_spec: {
          class_path: "__main__._NestedExecutor"
          extra_flags: "a"
        }
        beam_pipeline_args: "b"
      """, clone.encode())
class ModelValidator(base_beam_component.BaseBeamComponent):
  """DEPRECATED: Please use `Evaluator` instead.

  The model validator component can be used to check model metrics threshold
  and validate current model against a previously validated model. If there
  isn't a prior validated model, model validator will just make sure the
  threshold passed.  Otherwise, ModelValidator compares a newly trained models
  against a known good model, specifically the last model "blessed" by this
  component.  A model is "blessed" if the exported model's metrics are within
  predefined thresholds around the prior model's metrics.

  *Note:* This component includes a driver to resolve last blessed model.

  ## Possible causes why model validation fails
  Model validation can fail for many reasons, but these are the most common:

  - problems with training data.  For example, negative examples are dropped
    or features are missing.
  - problems with the test or evaluation data.  For example, skew exists
    between the training and evaluation data.
  - changes in data distribution.  This indicates the user behavior may have
    changed over time.
  - problems with the trainer.  For example, the trainer was stopped before
    model is converged or the model is unstable.

  ## Example
  ```
  # Performs quality validation of a candidate model (compared to a baseline).
  model_validator = ModelValidator(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])
  ```
  """

  SPEC_CLASS = ModelValidatorSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)
  DRIVER_CLASS = driver.Driver

  @deprecation_utils.deprecated(
      None, 'ModelValidator is deprecated, use Evaluator instead.')
  def __init__(self,
               examples: types.Channel,
               model: types.Channel,
               blessing: Optional[types.Channel] = None):
    """Construct a ModelValidator component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an
        [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen)
        component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced by
        a [Trainer](https://www.tensorflow.org/tfx/guide/trainer) component.
        _required_
      blessing: Output channel of type `standard_artifacts.ModelBlessing`
        that contains the validation result.
    """
    # Allocate a default blessing channel when the caller did not supply one.
    if blessing is None:
      blessing = types.Channel(type=standard_artifacts.ModelBlessing)
    spec = ModelValidatorSpec(
        examples=examples, model=model, blessing=blessing)
    super(ModelValidator, self).__init__(spec=spec)
class FileBasedExampleGen(base_beam_component.BaseBeamComponent):
  """A TFX component to ingest examples from a file system.

  The FileBasedExampleGen component is an API for getting file-based records
  into TFX pipelines. It consumes external files to generate examples which
  will be used by other internal components like StatisticsGen or Trainers.
  The component will also convert the input data into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records)
  and generate train and eval example splits for downstream components.

  ## Example
  ```
  _taxi_root = os.path.join(os.environ['HOME'], 'taxi')
  _data_root = os.path.join(_taxi_root, 'data', 'simple')
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = FileBasedExampleGen(input_base=_data_root)
  ```
  """

  SPEC_CLASS = FileBasedExampleGenSpec
  # EXECUTOR_SPEC should be overridden by subclasses.
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_beam_executor.BaseBeamExecutor)
  DRIVER_CLASS = driver.FileBasedDriver

  def __init__(
      self,
      # TODO(b/159467778): deprecate this, use input_base instead.
      input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
      input_base: Optional[Text] = None,
      input_config: Optional[Union[example_gen_pb2.Input, Dict[Text,
                                                               Any]]] = None,
      output_config: Optional[Union[example_gen_pb2.Output, Dict[Text,
                                                                 Any]]] = None,
      custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                    Dict[Text, Any]]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   Dict[Text, Any]]] = None,
      output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
      example_artifacts: Optional[types.Channel] = None,
      custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
      instance_name: Optional[Text] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input: A Channel of type `standard_artifacts.ExternalArtifact`, which
        includes one artifact whose uri is an external directory containing
        the data files. (Deprecated by input_base)
      input_base: an external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, input files will be
        treated as a single split.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider. If unset, driver will
        default to searching for latest span with no restrictions.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
    if input:
      # Deprecated path: resolve the external artifact's uri into input_base.
      logging.warning(
          'The "input" argument to the ExampleGen component has been '
          'deprecated by "input_base". Please update your usage as support for '
          'this argument will be removed soon.')
      input_base = artifact_utils.get_single_uri(list(input.get()))
    # Configure inputs and outputs.
    if input_config is None:
      input_config = utils.make_default_input_config()
    if output_config is None:
      output_config = utils.make_default_output_config(input_config)
    example_artifacts = example_artifacts or types.Channel(
        type=standard_artifacts.Examples)
    spec = FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        custom_config=custom_config,
        range_config=range_config,
        output_data_format=output_data_format,
        examples=example_artifacts)
    super(FileBasedExampleGen, self).__init__(
        spec=spec,
        custom_executor_spec=custom_executor_spec,
        instance_name=instance_name)
class Transform(base_beam_component.BaseBeamComponent):
  """A TFX component that applies TensorFlow Transform to input examples.

  Transform wraps TensorFlow Transform (tf.Transform) to preprocess data in
  a TFX pipeline. It loads the `preprocessing_fn` from the supplied module
  file (or function path), preprocesses the configured splits of the input
  examples, and emits the `tf.Transform` output: the transform graph and,
  optionally, materialized transformed examples. Unless disabled, it also
  invokes TFDV to compute pre-transform and post-transform statistics,
  schemas, and anomalies; the TFDV StatsOptions may be customized via an
  optional `stats_options_updater_fn` in the same module file.

  ## Providing a preprocessing function

  The Transform executor looks for a `preprocessing_fn()` function in the
  `module_file`. An example can be found in the user-supplied code of the
  TFX Chicago Taxi pipeline example:
  https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py

  ## Updating StatsOptions

  The Transform executor also looks for an optional
  `stats_options_updater_fn()` in the module file. An example can be found
  in the TFX BERT MRPC pipeline example:
  https://github.com/tensorflow/tfx/blob/master/tfx/examples/bert/mrpc/bert_mrpc_utils.py

  ## Example

  ```
  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)
  ```

  Component `outputs` contains:
   - `transform_graph`: Channel of type `standard_artifacts.TransformGraph`,
     which includes an exported Tensorflow graph suitable for both training
     and serving.
   - `transformed_examples`: Channel of type `standard_artifacts.Examples`
     for materialized transformed examples, covering the transform splits
     given in `splits_config`. Only produced when `materialize` is True.

  Please see [the Transform
  guide](https://www.tensorflow.org/tfx/guide/transform) for more details.
  """

  SPEC_CLASS = standard_component_specs.TransformSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      examples: types.Channel,
      schema: types.Channel,
      module_file: Optional[Union[Text, data_types.RuntimeParameter]] = None,
      preprocessing_fn: Optional[Union[
          Text, data_types.RuntimeParameter]] = None,
      splits_config: Optional[transform_pb2.SplitsConfig] = None,
      analyzer_cache: Optional[types.Channel] = None,
      materialize: bool = True,
      disable_analyzer_cache: bool = False,
      force_tf_compat_v1: bool = False,
      custom_config: Optional[Dict[Text, Any]] = None,
      disable_statistics: bool = False):
    """Construct a Transform component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples` (required).
        Should contain the custom splits named in `splits_config`; if no
        custom splits are configured, it should contain 'train' and 'eval'.
      schema: A Channel of type `standard_artifacts.Schema` holding a single
        schema artifact.
      module_file: Path to a python module from which `preprocessing_fn` is
        loaded. Exactly one of `module_file` or `preprocessing_fn` must be
        supplied. The expected signature is
        `def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]`
        where values are tf.Tensor or tf.SparseTensor; an optional second
        `custom_config` dict parameter is also supported. Use of a
        RuntimeParameter here is experimental.
      preprocessing_fn: Dotted path to a python function implementing
        'preprocessing_fn' (see `module_file` for the expected signature).
        Exactly one of `module_file` or `preprocessing_fn` must be supplied.
        Use of a RuntimeParameter here is experimental.
      splits_config: A transform_pb2.SplitsConfig naming the splits to
        analyze and the splits to transform (the two sets may overlap).
        Default (unset): analyze 'train', transform all splits. When set,
        analyze must not be empty.
      analyzer_cache: Optional input 'TransformCache' channel with cached
        results from previous Transform runs, reused when possible.
      materialize: If True, write transformed examples as an output.
      disable_analyzer_cache: If True, no cache output is written and
        `analyzer_cache` must not be provided.
      force_tf_compat_v1: (Optional) If True and/or TF2 behaviors are
        disabled, Transform runs Tensorflow in compat.v1 mode regardless of
        the installed Tensorflow version. Defaults to `False`.
      custom_config: A dict of additional parameters passed through to
        preprocessing_fn.
      disable_statistics: If True, skip TFDV pre/post-transform statistics.
        When computed, statistics are stored in the
        `pre_transform_feature_stats/` and `post_transform_feature_stats/`
        subfolders of the `transform_graph` export.

    Raises:
      ValueError: When both or neither of 'module_file' and
        'preprocessing_fn' is supplied, or when `analyzer_cache` is given
        together with `disable_analyzer_cache=True`.
    """
    # Exactly one source for the preprocessing function may be given.
    if (not module_file) == (not preprocessing_fn):
      raise ValueError(
          "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
      )

    transform_graph = types.Channel(type=standard_artifacts.TransformGraph)

    if materialize:
      transformed_examples = types.Channel(type=standard_artifacts.Examples)
      # Mirror the split names of the incoming examples channel.
      transformed_examples.matching_channel_name = 'examples'
    else:
      transformed_examples = None

    if disable_statistics:
      pre_transform_schema = None
      pre_transform_stats = None
      post_transform_schema = None
      post_transform_stats = None
      post_transform_anomalies = None
    else:
      pre_transform_schema = types.Channel(type=standard_artifacts.Schema)
      post_transform_schema = types.Channel(type=standard_artifacts.Schema)
      pre_transform_stats = types.Channel(
          type=standard_artifacts.ExampleStatistics)
      post_transform_stats = types.Channel(
          type=standard_artifacts.ExampleStatistics)
      post_transform_anomalies = types.Channel(
          type=standard_artifacts.ExampleAnomalies)

    if not disable_analyzer_cache:
      updated_analyzer_cache = types.Channel(
          type=standard_artifacts.TransformCache)
    else:
      if analyzer_cache:
        raise ValueError(
            '`analyzer_cache` is set when disable_analyzer_cache is True.'
        )
      updated_analyzer_cache = None

    spec = standard_component_specs.TransformSpec(
        examples=examples,
        schema=schema,
        module_file=module_file,
        preprocessing_fn=preprocessing_fn,
        force_tf_compat_v1=int(force_tf_compat_v1),
        splits_config=splits_config,
        transform_graph=transform_graph,
        transformed_examples=transformed_examples,
        analyzer_cache=analyzer_cache,
        updated_analyzer_cache=updated_analyzer_cache,
        custom_config=json_utils.dumps(custom_config),
        disable_statistics=int(disable_statistics),
        pre_transform_schema=pre_transform_schema,
        pre_transform_stats=pre_transform_stats,
        post_transform_schema=post_transform_schema,
        post_transform_stats=post_transform_stats,
        post_transform_anomalies=post_transform_anomalies)
    super().__init__(spec=spec)

    if udf_utils.should_package_user_modules():
      # After user-module packaging, the `MODULE_PATH_KEY` execution property
      # will reference the packaged module and the `MODULE_FILE_KEY`
      # execution property will be removed.
      udf_utils.add_user_module_dependency(
          self, standard_component_specs.MODULE_FILE_KEY,
          standard_component_specs.MODULE_PATH_KEY)
class BulkInferrer(base_beam_component.BaseBeamComponent):
  """A TFX component for batch inference on a model with unlabelled examples.

  BulkInferrer consumes examples data and a model, and writes the inference
  results as PredictionLog protos to an external location. It infers on a
  validated model.

  ## Example

  ```
  # Uses BulkInferrer to inference on examples.
  bulk_inferrer = BulkInferrer(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])
  ```
  """

  SPEC_CLASS = BulkInferrerSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel = None,
               model: Optional[types.Channel] = None,
               model_blessing: Optional[types.Channel] = None,
               data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                         Dict[Text, Any]]] = None,
               model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                          Dict[Text, Any]]] = None,
               output_example_spec: Optional[Union[
                   bulk_inferrer_pb2.OutputExampleSpec,
                   Dict[Text, Any]]] = None,
               inference_result: Optional[types.Channel] = None,
               output_examples: Optional[types.Channel] = None,
               instance_name: Optional[Text] = None):
    """Construct a BulkInferrer component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced
        by a Trainer component.
      model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
        usually produced by a ModelValidator component.
      data_spec: bulk_inferrer_pb2.DataSpec describing data selection. If any
        field is a RuntimeParameter, pass a dict with the same field names
        as the DataSpec proto message.
      model_spec: bulk_inferrer_pb2.ModelSpec describing the model
        specification. If any field is a RuntimeParameter, pass a dict with
        the same field names as the ModelSpec proto message.
      output_example_spec: bulk_inferrer_pb2.OutputExampleSpec; set it to
        have BulkInferrer output examples instead of inference results. If
        any field is a RuntimeParameter, pass a dict with the same field
        names as the OutputExampleSpec proto message.
      inference_result: Channel of type `standard_artifacts.InferenceResult`
        for the inference results; must not be given when
        output_example_spec is set.
      output_examples: Channel of type `standard_artifacts.Examples` for the
        output examples; must not be given when output_example_spec is
        unset. See output_example_spec for details.
      instance_name: Optional name for this specific instance. Required only
        if multiple BulkInferrer components are declared in one pipeline.

    Raises:
      ValueError: If inference_result or output_examples is supplied in a
        way inconsistent with output_example_spec.
    """
    # The two output channels are mutually exclusive, selected by whether
    # output_example_spec is provided.
    if output_example_spec and inference_result:
      raise ValueError(
          'Must not specify inference_result when output_example_spec is set.'
      )
    if not output_example_spec and output_examples:
      raise ValueError(
          'Must not specify output_examples when output_example_spec is unset.'
      )
    if output_example_spec:
      output_examples = output_examples or types.Channel(
          type=standard_artifacts.Examples)
    else:
      inference_result = inference_result or types.Channel(
          type=standard_artifacts.InferenceResult)

    spec = BulkInferrerSpec(
        examples=examples,
        model=model,
        model_blessing=model_blessing,
        data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
        model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
        output_example_spec=output_example_spec,
        inference_result=inference_result,
        output_examples=output_examples)
    super().__init__(spec=spec, instance_name=instance_name)
class _MyComponent(base_component.BaseComponent):
  """Minimal component wiring `_MyComponentSpec` to the base Beam executor."""

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(base_executor.BaseExecutor)
  SPEC_CLASS = _MyComponentSpec
class BulkInferrer(base_beam_component.BaseBeamComponent):
  """A TFX component for batch inference on a model with unlabelled examples.

  BulkInferrer consumes examples data and a model, and writes the inference
  results as PredictionLog protos to an external location. It infers on a
  validated model.

  ## Example

  ```
  # Uses BulkInferrer to inference on examples.
  bulk_inferrer = BulkInferrer(
      examples=example_gen.outputs['examples'],
      model=trainer.outputs['model'])
  ```

  Component `outputs` contains:
   - `inference_result`: Channel of type `standard_artifacts.InferenceResult`
     for the inference results.
   - `output_examples`: Channel of type `standard_artifacts.Examples` for the
     output examples. Optional, controlled by `output_example_spec`.

  See [the BulkInferrer
  guide](https://www.tensorflow.org/tfx/guide/bulkinferrer) for more details.
  """

  SPEC_CLASS = BulkInferrerSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel = None,
               model: Optional[types.Channel] = None,
               model_blessing: Optional[types.Channel] = None,
               data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                         Dict[Text, Any]]] = None,
               model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                          Dict[Text, Any]]] = None,
               output_example_spec: Optional[Union[
                   bulk_inferrer_pb2.OutputExampleSpec,
                   Dict[Text, Any]]] = None):
    """Construct a BulkInferrer component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced
        by a Trainer component.
      model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
        usually produced by a ModelValidator component.
      data_spec: bulk_inferrer_pb2.DataSpec describing data selection. If any
        field is a RuntimeParameter, pass a dict with the same field names
        as the DataSpec proto message.
      model_spec: bulk_inferrer_pb2.ModelSpec describing the model
        specification. If any field is a RuntimeParameter, pass a dict with
        the same field names as the ModelSpec proto message.
      output_example_spec: bulk_inferrer_pb2.OutputExampleSpec; set it to
        have BulkInferrer output examples instead of inference results. If
        any field is a RuntimeParameter, pass a dict with the same field
        names as the OutputExampleSpec proto message.
    """
    # Exactly one of the two output channels is created, depending on
    # whether example output was requested.
    output_examples = (
        types.Channel(type=standard_artifacts.Examples)
        if output_example_spec else None)
    inference_result = (
        None if output_example_spec else types.Channel(
            type=standard_artifacts.InferenceResult))

    spec = BulkInferrerSpec(
        examples=examples,
        model=model,
        model_blessing=model_blessing,
        data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
        model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
        output_example_spec=output_example_spec,
        inference_result=inference_result,
        output_examples=output_examples)
    super().__init__(spec=spec)
class Transform(base_beam_component.BaseBeamComponent):
  """A TFX component that applies TensorFlow Transform to input examples.

  Transform wraps TensorFlow Transform (tf.Transform) to preprocess data in
  a TFX pipeline. It loads the `preprocessing_fn` from the supplied module
  file, preprocesses the 'train' and 'eval' splits of the input examples,
  generates the `tf.Transform` output, and saves both the transform
  function and the transformed examples to the locations the orchestrator
  chooses.

  ## Providing a preprocessing function

  The Transform executor looks for a `preprocessing_fn()` function in the
  `module_file`. An example can be found in the user-supplied code of the
  TFX Chicago Taxi pipeline example:
  https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py

  ## Example

  ```
  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)
  ```

  Please see https://www.tensorflow.org/tfx/transform for more details.
  """

  SPEC_CLASS = TransformSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      examples: types.Channel = None,
      schema: types.Channel = None,
      module_file: Optional[Union[Text, data_types.RuntimeParameter]] = None,
      preprocessing_fn: Optional[Union[
          Text, data_types.RuntimeParameter]] = None,
      splits_config: transform_pb2.SplitsConfig = None,
      transform_graph: Optional[types.Channel] = None,
      transformed_examples: Optional[types.Channel] = None,
      analyzer_cache: Optional[types.Channel] = None,
      materialize: bool = True,
      disable_analyzer_cache: bool = False,
      force_tf_compat_v1: bool = True,
      custom_config: Optional[Dict[Text, Any]] = None):
    """Construct a Transform component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples` (required).
        Should contain the custom splits named in `splits_config`; if no
        custom splits are configured, it should contain 'train' and 'eval'.
      schema: A Channel of type `standard_artifacts.Schema` holding a single
        schema artifact.
      module_file: Path to a python module from which `preprocessing_fn` is
        loaded. Exactly one of `module_file` or `preprocessing_fn` must be
        supplied. The expected signature is
        `def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]`
        where values are tf.Tensor or tf.SparseTensor; an optional second
        `custom_config` dict parameter is also supported.
      preprocessing_fn: Dotted path to a python function implementing
        'preprocessing_fn' (see `module_file` for the expected signature).
        Exactly one of `module_file` or `preprocessing_fn` must be supplied.
      splits_config: A transform_pb2.SplitsConfig naming the splits to
        analyze and the splits to transform (the two sets may overlap).
        Default (unset): analyze 'train', transform all splits. When set,
        analyze must not be empty.
      transform_graph: Optional output 'TransformPath' channel for the
        'tf.Transform' output, including an exported Tensorflow graph
        suitable for both training and serving.
      transformed_examples: Optional output 'ExamplesPath' channel for
        materialized transformed examples, covering the transform splits in
        `splits_config` (or 'train' and 'eval' when unset).
      analyzer_cache: Optional input 'TransformCache' channel with cached
        results from previous Transform runs, reused when possible.
      materialize: If True, write transformed examples as an output. If
        False, `transformed_examples` must not be provided.
      disable_analyzer_cache: If True, no cache output is written and
        `analyzer_cache` must not be provided.
      force_tf_compat_v1: (Optional) If True, Transform runs Tensorflow in
        compat.v1 mode regardless of the installed Tensorflow version.
        Defaults to `True`. Note: the default will switch to `False` in a
        future release.
      custom_config: A dict of additional parameters passed through to
        preprocessing_fn.

    Raises:
      ValueError: When both or neither of 'module_file' and
        'preprocessing_fn' is supplied, or when output channels conflict
        with `materialize` / `disable_analyzer_cache`.
    """
    # Exactly one source for the preprocessing function may be given.
    if (not module_file) == (not preprocessing_fn):
      raise ValueError(
          "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
      )

    transform_graph = transform_graph or types.Channel(
        type=standard_artifacts.TransformGraph)

    if materialize:
      if transformed_examples is None:
        transformed_examples = types.Channel(
            type=standard_artifacts.Examples)
        # Mirror the split names of the incoming examples channel.
        transformed_examples.matching_channel_name = 'examples'
    elif transformed_examples is not None:
      raise ValueError(
          'Must not specify transformed_examples when materialize is False.'
      )

    if not disable_analyzer_cache:
      updated_analyzer_cache = types.Channel(
          type=standard_artifacts.TransformCache)
    else:
      if analyzer_cache:
        raise ValueError(
            '`analyzer_cache` is set when disable_analyzer_cache is True.'
        )
      updated_analyzer_cache = None

    spec = TransformSpec(
        examples=examples,
        schema=schema,
        module_file=module_file,
        preprocessing_fn=preprocessing_fn,
        force_tf_compat_v1=int(force_tf_compat_v1),
        splits_config=splits_config,
        transform_graph=transform_graph,
        transformed_examples=transformed_examples,
        analyzer_cache=analyzer_cache,
        updated_analyzer_cache=updated_analyzer_cache,
        custom_config=json_utils.dumps(custom_config))
    super().__init__(spec=spec)
class CsvExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
  """Official TFX CsvExampleGen component.

  The csv examplegen component takes csv data and generates train and eval
  examples for downstream components. Column values are encoded into
  tf.Example int/float/byte features. For missing cells it emits:
  -- tf.train.Feature(`type`_list=tf.train.`type`List(value=[])), when the
     `type` can be inferred.
  -- tf.train.Feature() when it cannot infer the `type` from the column.

  Type inference happens per input split; if the input is not a single
  split, users must ensure column types align across the pre-splits.

  For example, given the following csv rows of a split:

    header:A,B,C,D
    row1:  1,,x,0.1
    row2:  2,,y,0.2
    row3:  3,,,0.3
    row4:

  The output examples will be:

    example1: 1(int), empty feature(no type), x(string), 0.1(float)
    example2: 2(int), empty feature(no type), x(string), 0.2(float)
    example3: 3(int), empty feature(no type), empty list(string), 0.3(float)

  Here "empty feature" is `tf.train.Feature()` while the "empty list" string
  feature is `tf.train.Feature(bytes_list=tf.train.BytesList(value=[]))`.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output
     train and eval examples.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      input_base: Optional[str] = None,
      input_config: Optional[Union[example_gen_pb2.Input,
                                   data_types.RuntimeParameter]] = None,
      output_config: Optional[Union[example_gen_pb2.Output,
                                    data_types.RuntimeParameter]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   data_types.RuntimeParameter]] = None):
    """Construct a CsvExampleGen component.

    Args:
      input_base: an external directory containing the CSV files.
      input_config: An example_gen_pb2.Input instance providing the input
        configuration. If unset, the files under input_base are treated as a
        single split.
      output_config: An example_gen_pb2.Output instance providing the output
        configuration. If unset, default splits are 'train' and 'eval' with
        size 2:1.
      range_config: An optional range_config_pb2.RangeConfig instance
        restricting the span values to consider. If unset, the driver
        searches for the latest span with no restrictions.
    """
    # All configuration is delegated to the file-based base class.
    super().__init__(
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        range_config=range_config)
class QueryBasedExampleGen(base_beam_component.BaseBeamComponent):
  """A TFX component to ingest examples from query based systems.

  The QueryBasedExampleGen component can be extended to ingest examples from
  query based systems such as Presto or Bigquery. The component will also
  convert the input data into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records)
  and generate train and eval example splits for downstream components.

  ## Example
  ```
  _query = "SELECT * FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`"
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=_query)
  ```

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output
     train and eval examples.
  """

  SPEC_CLASS = QueryBasedExampleGenSpec
  # EXECUTOR_SPEC should be overridden by subclasses.
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_beam_executor.BaseBeamExecutor)
  DRIVER_CLASS = driver.QueryBasedDriver

  def __init__(
      self,
      input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
      output_config: Optional[Union[example_gen_pb2.Output,
                                    Dict[Text, Any]]] = None,
      custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                    Dict[Text, Any]]] = None,
      output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If any field is provided as
        a RuntimeParameter, input_config should be constructed as a dict
        with the same field names as the Input proto message. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing output configuration. If unset, the default
        splits are 'train' and 'eval' with a distribution ratio of 2:1. If
        any field is provided as a RuntimeParameter, output_config should be
        constructed as a dict with the same field names as the Output proto
        message.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing custom configuration for ExampleGen. If any
        field is provided as a RuntimeParameter, it should be constructed as
        a dict.
      output_data_format: Payload format of generated data in output
        artifact, one of example_gen_pb2.PayloadFormat enum.

    Raises:
      ValueError: The output_data_format value must be defined in the
        example_gen_pb2.PayloadFormat proto.
    """
    # Configure outputs.
    output_config = output_config or utils.make_default_output_config(
        input_config)
    example_artifacts = types.Channel(type=standard_artifacts.Examples)
    if output_data_format not in example_gen_pb2.PayloadFormat.values():
      # Fixed message: the adjacent literals previously concatenated to
      # "...defined inthe..." (missing separating space).
      raise ValueError('The value of output_data_format must be defined in '
                       'the example_gen_pb2.PayloadFormat proto.')
    spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        output_data_format=output_data_format,
        custom_config=custom_config,
        examples=example_artifacts)
    super().__init__(spec=spec)
class FileBasedExampleGen(base_beam_component.BaseBeamComponent):
  """A TFX component to ingest examples from a file system.

  FileBasedExampleGen is an API for getting file-based records into TFX
  pipelines. It consumes external files to generate examples that other
  internal components, such as StatisticsGen or Trainers, consume. The
  component also converts the input data into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records)
  and generates train and eval example splits for downstream components.

  ## Example
  ```
  _taxi_root = os.path.join(os.environ['HOME'], 'taxi')
  _data_root = os.path.join(_taxi_root, 'data', 'simple')
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = FileBasedExampleGen(input_base=_data_root)
  ```

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output
     train and eval examples.
  """

  SPEC_CLASS = FileBasedExampleGenSpec
  # EXECUTOR_SPEC should be overridden by subclasses.
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_beam_executor.BaseBeamExecutor)
  DRIVER_CLASS = driver.FileBasedDriver

  def __init__(
      self,
      input_base: Optional[Text] = None,
      input_config: Optional[Union[example_gen_pb2.Input,
                                   Dict[Text, Any]]] = None,
      output_config: Optional[Union[example_gen_pb2.Output,
                                    Dict[Text, Any]]] = None,
      custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                    Dict[Text, Any]]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   Dict[Text, Any]]] = None,
      output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
      custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: an external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
        instance, providing input configuration. If unset, input files are
        treated as a single split.
      output_config: An example_gen_pb2.Output instance, providing the
        output configuration. If unset, default splits are 'train' and
        'eval' with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for the executor.
      range_config: An optional range_config_pb2.RangeConfig instance
        restricting the span values to consider. If unset, the driver
        searches for the latest span with no restrictions.
      output_data_format: Payload format of generated data in output
        artifact, one of example_gen_pb2.PayloadFormat enum.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
    """
    # Fill in default input/output configuration where none was supplied.
    resolved_input_config = (
        input_config or utils.make_default_input_config())
    resolved_output_config = (
        output_config or
        utils.make_default_output_config(resolved_input_config))
    example_artifacts = types.Channel(type=standard_artifacts.Examples)
    spec = FileBasedExampleGenSpec(
        input_base=input_base,
        input_config=resolved_input_config,
        output_config=resolved_output_config,
        custom_config=custom_config,
        range_config=range_config,
        output_data_format=output_data_format,
        examples=example_artifacts)
    super().__init__(spec=spec, custom_executor_spec=custom_executor_spec)
class BeamComponent(base_beam_component.BaseBeamComponent):
  """Beam-based component with an empty spec and the base Beam executor."""

  SPEC_CLASS = _EmptyComponentSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_beam_executor.BaseBeamExecutor)
def replace_executor_with_stub(pipeline: pipeline_pb2.Pipeline,
                               test_data_dir: str,
                               test_component_ids: List[str]):
  """Replace executors in pipeline IR with the stub executor.

  This function will replace the IR inplace.
  For example,
    pipeline_mock.replace_executor_with_stub(
        pipeline_ir, test_data_dir, test_component_ids = ['Trainer',
        'Transform'])
  Then you can pass the modified `pipeline_ir` into a dag runner to execute
  the stubbed pipeline.

  Args:
    pipeline: The pipeline to alter.
    test_data_dir: The directory where pipeline outputs are recorded
      (pipeline_recorder.py).
    test_component_ids: List of ids of components that are to be tested. In
      other words, executors of components other than those specified by this
      list will be replaced with a BaseStubExecutor.

  Returns:
    None

  Raises:
    NotImplementedError: If the deployment config or an executor spec is of
      an unsupported type.
  """
  deployment_config = pipeline_pb2.IntermediateDeploymentConfig()
  if not pipeline.deployment_config.Unpack(deployment_config):
    raise NotImplementedError(
        'Unexpected pipeline.deployment_config type "{}". Currently only '
        'IntermediateDeploymentConfig is supported.'.format(
            pipeline.deployment_config.type_url))

  for component_id in deployment_config.executor_specs:
    if component_id in test_component_ids:
      # Components under test keep their real executors.
      continue
    executable_spec = deployment_config.executor_specs[component_id]
    # Both stub variants receive the same flags; build them once.
    extra_flags = (
        base_stub_executor.TEST_DATA_DIR_FLAG + '=' + test_data_dir,
        base_stub_executor.COMPONENT_ID_FLAG + '=' + component_id)
    if executable_spec.Is(
        executable_spec_pb2.PythonClassExecutableSpec.DESCRIPTOR):
      stub_spec = executor_spec.ExecutorClassSpec(
          base_stub_executor.BaseStubExecutor)
    elif executable_spec.Is(
        executable_spec_pb2.BeamExecutableSpec.DESCRIPTOR):
      stub_spec = executor_spec.BeamExecutorSpec(
          base_stub_executor.BaseStubExecutor)
    else:
      # Fixed: the message previously named "BeamExecutorSpec" although the
      # branch above checks for BeamExecutableSpec.
      raise NotImplementedError(
          'Unexpected executable_spec type "{}". Currently only '
          'PythonClassExecutableSpec and BeamExecutableSpec are supported.'
          .format(executable_spec.type_url))
    stub_spec.add_extra_flags(extra_flags)
    executable_spec.Pack(stub_spec.encode())
  pipeline.deployment_config.Pack(deployment_config)
class Evaluator(base_beam_component.BaseBeamComponent):
  """A TFX component to evaluate models trained by a TFX Trainer component.

  Component `outputs` contains:
   - `evaluation`: Channel of type `standard_artifacts.ModelEvaluation` to
     store the evaluation results.
   - `blessing`: Channel of type `standard_artifacts.ModelBlessing' that
     contains the blessing result.

  See [the Evaluator guide](https://www.tensorflow.org/tfx/guide/evaluator)
  for more details.
  """

  SPEC_CLASS = standard_component_specs.EvaluatorSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      examples: Optional[types.Channel] = None,
      model: Optional[types.Channel] = None,
      baseline_model: Optional[types.Channel] = None,
      # TODO(b/148618405): deprecate feature_slicing_spec.
      feature_slicing_spec: Optional[Union[
          evaluator_pb2.FeatureSlicingSpec, Dict[Text, Any]]] = None,
      fairness_indicator_thresholds: Optional[List[Union[
          float, data_types.RuntimeParameter]]] = None,
      example_splits: Optional[List[Text]] = None,
      eval_config: Optional[tfma.EvalConfig] = None,
      schema: Optional[types.Channel] = None,
      module_file: Optional[Text] = None,
      module_path: Optional[Text] = None):
    """Construct an Evaluator component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced by
        a Trainer component.
      baseline_model: An optional channel of type 'standard_artifacts.Model' as
        the baseline model for model diff and model validation purpose.
      feature_slicing_spec: Deprecated, please use eval_config instead. Only
        support estimator.
        [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto)
        instance that describes how Evaluator should slice the data. If any
        field is provided as a RuntimeParameter, feature_slicing_spec should be
        constructed as a dict with the same field names as FeatureSlicingSpec
        proto message.
      fairness_indicator_thresholds: Optional list of float (or
        RuntimeParameter) threshold values for use with TFMA fairness
        indicators. Experimental functionality: this interface and
        functionality may change at any time. TODO(b/142653905): add a link to
        additional documentation for TFMA fairness indicators here.
      example_splits: Names of splits on which the metrics are computed.
        Default behavior (when example_splits is set to None or Empty) is using
        the 'eval' split.
      eval_config: Instance of tfma.EvalConfig containing configuration
        settings for running the evaluation. This config has options for both
        estimator and Keras.
      schema: A `Schema` channel to use for TFXIO.
      module_file: A path to python module file containing UDFs for Evaluator
        customization. This functionality is experimental and may change at any
        time. The module_file can implement following functions at its top
        level.
          def custom_eval_shared_model(
             eval_saved_model_path, model_name, eval_config, **kwargs,
          ) -> tfma.EvalSharedModel:
          def custom_extractors(
            eval_shared_model, eval_config, tensor_adapter_config,
          ) -> List[tfma.extractors.Extractor]:
      module_path: A python path to the custom module that contains the UDFs.
        See 'module_file' for the required signature of UDFs. This
        functionality is experimental and this API may change at any time. Note
        this can not be set together with module_file.

    Raises:
      ValueError: If both module_file and module_path are set, or if both
        eval_config and feature_slicing_spec are supplied.
    """
    # Module customization may come from a file path or a python path, but
    # not both at once.
    if module_file and module_path:
      raise ValueError(
          'Python module path can not be set together with module file path.'
      )

    if eval_config is not None and feature_slicing_spec is not None:
      raise ValueError(
          "Exactly one of 'eval_config' or 'feature_slicing_spec' "
          "must be supplied.")
    if eval_config is None and feature_slicing_spec is None:
      # Neither config given: fall back to the (deprecated) estimator path
      # with an empty slicing spec.
      feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
      logging.info(
          'Neither eval_config nor feature_slicing_spec is passed, '
          'the model is treated as estimator.')

    if feature_slicing_spec:
      logging.warning('feature_slicing_spec is deprecated, please use '
                      'eval_config instead.')

    blessing = types.Channel(type=standard_artifacts.ModelBlessing)
    evaluation = types.Channel(type=standard_artifacts.ModelEvaluation)
    spec = standard_component_specs.EvaluatorSpec(
        examples=examples,
        model=model,
        baseline_model=baseline_model,
        feature_slicing_spec=feature_slicing_spec,
        fairness_indicator_thresholds=fairness_indicator_thresholds,
        example_splits=json_utils.dumps(example_splits),
        evaluation=evaluation,
        eval_config=eval_config,
        blessing=blessing,
        schema=schema,
        module_file=module_file,
        module_path=module_path)
    super(Evaluator, self).__init__(spec=spec)

    if udf_utils.should_package_user_modules():
      # In this case, the `MODULE_PATH_KEY` execution property will be injected
      # as a reference to the given user module file after packaging, at which
      # point the `MODULE_FILE_KEY` execution property will be removed.
      udf_utils.add_user_module_dependency(
          self, standard_component_specs.MODULE_FILE_KEY,
          standard_component_specs.MODULE_PATH_KEY)