Ejemplo n.º 1
0
class StratifiedSampler(base_component.BaseComponent):
    """A TFX component to do stratified sampling.

  StratifiedSampler consumes examples data, and produces examples data.

  ## Example
  ```
    # Uses StratifiedSampler to sample examples.
    stratified_sampler = StratifiedSampler(
        key='trip_miles',
        examples=example_gen.outputs['examples'])
  ```
  """

    SPEC_CLASS = StratifiedSamplerSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(self,
                 examples: types.Channel,
                 to_key_fn: Optional[Text] = None,
                 to_key_fn_key: Optional[Text] = 'to_key_fn',
                 pipeline_configuration: Optional[types.Channel] = None,
                 stratified_examples: Optional[types.Channel] = None,
                 splits_to_transform: Optional[List[Text]] = None,
                 splits_to_copy: Optional[List[Text]] = None,
                 samples_per_key: Optional[int] = None):
        """Construct a StratifiedSampler component.

    Args:
      examples: A Channel of 'Examples' type, usually produced by an ExampleGen
        component. _required_
      to_key_fn: To-key function, the function that will extract the key -
        must be 'to_key: Example -> key'. For example something like:
        >>> def to_key(m):
        >>>   return m.features.feature['trip_miles'].float_list.value[0] > 42.
      to_key_fn_key: the name of the key that contains the to_key_fn -
        default is 'to_key_fn'.
      pipeline_configuration: A Channel of 'PipelineConfiguration' type,
        usually produced by a FromCustomConfig component.
      stratified_examples: Channel of `Examples` to store the sampling
        results.
      splits_to_transform: Optional list of split names to transform.
      splits_to_copy: Optional list of split names to copy.
      samples_per_key: Number of samples per key.
    """
        # Default to a fresh Examples channel when the caller did not supply
        # one.  A previous revision first did an `or`-assignment and then
        # checked `if stratified_examples is None:` — that second branch was
        # unreachable dead code and has been removed; this single check is
        # the reachable behavior.
        if stratified_examples is None:
            stratified_examples = types.Channel(
                type=standard_artifacts.Examples)

        spec = StratifiedSamplerSpec(
            examples=examples,
            pipeline_configuration=pipeline_configuration,
            stratified_examples=stratified_examples,
            # splits lists are JSON-serialized into the component spec.
            splits_to_transform=json_utils.dumps(splits_to_transform),
            splits_to_copy=json_utils.dumps(splits_to_copy),
            to_key_fn=to_key_fn,
            to_key_fn_key=to_key_fn_key,
            samples_per_key=samples_per_key)
        super(StratifiedSampler, self).__init__(spec=spec)
Ejemplo n.º 2
0
class StatisticsGen(base_beam_component.BaseBeamComponent):
    """Official TFX StatisticsGen component.

  Generates feature statistics and random samples over training data using
  Apache Beam and approximate algorithms, so it scales to large datasets.
  The output is used for visualization and example validation.

  Please see https://www.tensorflow.org/tfx/data_validation for more details.

  ## Example
  ```
    # Computes statistics over data for visualization and example validation.
    statistics_gen = StatisticsGen(examples=example_gen.outputs['examples'])
  ```

  Component `outputs` contains:
   - `statistics`: Channel of type `standard_artifacts.ExampleStatistics` for
                   statistics of each split provided in the input examples.
  """

    SPEC_CLASS = StatisticsGenSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(self,
                 examples: types.Channel = None,
                 schema: Optional[types.Channel] = None,
                 stats_options: Optional[tfdv.StatsOptions] = None,
                 exclude_splits: Optional[List[Text]] = None):
        """Construct a StatisticsGen component.

    Args:
      examples: A Channel of `ExamplesPath` type, likely generated by the
        [ExampleGen component](https://www.tensorflow.org/tfx/guide/examplegen).
        Must contain two splits labeled `train` and `eval`. _required_
      schema: A `Schema` channel used to automatically configure the stats
        options passed to TFDV.
      stats_options: StatsOptions instance configuring optional TFDV behavior.
        When stats_options.schema is set, it takes precedence over the
        `schema` channel input. Because stats_options must be serialized,
        slicer functions and custom stats generators are dropped and are
        therefore not usable.
      exclude_splits: Names of splits for which statistics and samples should
        not be generated. Defaults (None) to excluding no splits.
    """
        if exclude_splits is None:
            logging.info(
                'Excluding no splits because exclude_splits is not set.')
            exclude_splits = []
        output_statistics = types.Channel(
            type=standard_artifacts.ExampleStatistics)
        # TODO(b/150802589): Move jsonable interface to tfx_bsl and use
        # json_utils.
        serialized_stats_options = (
            stats_options.to_json() if stats_options else None)
        super(StatisticsGen, self).__init__(spec=StatisticsGenSpec(
            examples=examples,
            schema=schema,
            stats_options_json=serialized_stats_options,
            exclude_splits=json_utils.dumps(exclude_splits),
            statistics=output_statistics))
Ejemplo n.º 3
0
class TestFileBasedExampleGenComponent(component.FileBasedExampleGen):
    """FileBasedExampleGen subclass wired to the test executor."""

    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(TestExampleGenExecutor)

    def __init__(self, input_base, input_config=None, output_config=None):
        """Forward all construction arguments to FileBasedExampleGen."""
        super().__init__(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config)
Ejemplo n.º 4
0
 def testConstructCustomExecutor(self):
   """A custom Beam executor spec leaves the file-based driver in place."""
   gen = component.FileBasedExampleGen(
       input_base='path',
       custom_executor_spec=executor_spec.BeamExecutorSpec(
           TestExampleGenExecutor))
   # Driver stays the file-based default even with a custom executor.
   self.assertEqual(driver.FileBasedDriver, gen.driver_class)
   # The examples output channel keeps the standard Examples artifact type.
   self.assertEqual(
       standard_artifacts.Examples.TYPE_NAME,
       gen.outputs[standard_component_specs.EXAMPLES_KEY].type_name)
Ejemplo n.º 5
0
class BigQueryToElwcExampleGen(component.QueryBasedExampleGen):
  """Official TFX BigQueryToElwcExampleGen component.

  Takes a BigQuery query and generates train and eval
  ExampleListWithContext (ELWC) records for downstream components.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               query: Optional[Text] = None,
               elwc_config: Optional[elwc_config_pb2.ElwcConfig] = None,
               input_config: Optional[example_gen_pb2.Input] = None,
               output_config: Optional[example_gen_pb2.Output] = None,
               example_artifacts: Optional[types.Channel] = None):
    """Constructs a BigQueryElwcExampleGen component.

    Args:
      query: BigQuery sql string; the query result is treated as a single
        split. Can be overwritten by input_config.
      elwc_config: Config holding the list of context feature fields used to
        build the context feature. Examples with the same context feature are
        merged into one ELWC (ExampleListWithContext) instance; e.g. two
        examples sharing the same context field are integrated into one ELWC.
      input_config: An example_gen_pb2.Input instance with Split.pattern as a
        BigQuery sql string. If set, it overwrites the 'query' arg, and allows
        different queries per split. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as the Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        output_config should be constructed as a dict with the same field
        names as the Output proto message.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.

    Raises:
      RuntimeError: Only one of query and input_config should be set and
        elwc_config is required.
    """

    if bool(query) == bool(input_config):
      raise RuntimeError('Exactly one of query and input_config should be set.')
    if not elwc_config:
      raise RuntimeError(
          'elwc_config is required for BigQueryToElwcExampleGen.')
    if not input_config:
      input_config = utils.make_default_input_config(query)
    # Pack the ELWC config into the generic CustomConfig envelope that the
    # executor expects.
    packed = example_gen_pb2.CustomConfig()
    packed.custom_config.Pack(elwc_config)
    super(BigQueryToElwcExampleGen, self).__init__(
        input_config=input_config,
        output_config=output_config,
        output_data_format=example_gen_pb2.FORMAT_PROTO,
        custom_config=packed,
        example_artifacts=example_artifacts)
Ejemplo n.º 6
0
class ImportExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
  """Official TFX ImportExampleGen component.

  Ingests TFRecord files containing TF Example data and produces train and
  eval examples for downstream components, providing consistent and
  configurable partitioning plus dataset shuffling per ML best practice.
  """

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(
      self,
      input_base: Optional[Text] = None,
      input_config: Optional[Union[example_gen_pb2.Input, Dict[Text,
                                                               Any]]] = None,
      output_config: Optional[Union[example_gen_pb2.Output, Dict[Text,
                                                                 Any]]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   Dict[Text, Any]]] = None,
      payload_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
      example_artifacts: Optional[types.Channel] = None,
      instance_name: Optional[Text] = None):
    """Construct an ImportExampleGen component.

    Args:
      input_base: an external directory containing the TFRecord files.
      input_config: An example_gen_pb2.Input instance providing input
        configuration. If unset, files under input_base are treated as a
        single split. If any field is provided as a RuntimeParameter,
        input_config should be constructed as a dict with the same field
        names as the Input proto message.
      output_config: An example_gen_pb2.Output instance providing output
        configuration. If unset, default splits are 'train' and 'eval' with
        size 2:1. If any field is provided as a RuntimeParameter,
        output_config should be constructed as a dict with the same field
        names as the Output proto message.
      range_config: An optional range_config_pb2.RangeConfig instance
        restricting the span values to consider. If unset, the driver
        defaults to searching for the latest span with no restrictions.
      payload_format: Payload format of input data; one of the
        example_gen_pb2.PayloadFormat enum values. Output payload format is
        the same as input.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      instance_name: Optional unique instance name. Necessary if multiple
        ImportExampleGen components are declared in the same pipeline.
    """
    # Delegate to FileBasedExampleGen; payload_format is forwarded as the
    # output data format (output mirrors the input format).
    super(ImportExampleGen, self).__init__(
        output_data_format=payload_format,
        input_base=input_base,
        input_config=input_config,
        output_config=output_config,
        range_config=range_config,
        example_artifacts=example_artifacts,
        instance_name=instance_name)
Ejemplo n.º 7
0
class PrestoExampleGen(component.QueryBasedExampleGen):  # pylint: disable=protected-access
    """Official TFX PrestoExampleGen component.

  Takes a query plus a Presto connection client configuration, and generates
  train and eval examples for downstream components.
  """
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(self,
                 conn_config: presto_config_pb2.PrestoConnConfig,
                 query: Optional[Text] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[types.Channel] = None):
        """Constructs a PrestoExampleGen component.

    Args:
      conn_config: Parameters for the Presto connection client.
      query: Presto sql string; the query result is treated as a single
        split. Can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as a
        Presto sql string. If set, it overwrites the 'query' arg and allows
        different queries per split.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.

    Raises:
      RuntimeError: Only one of query and input_config should be set. Or
      required host field in connection_config should be set.
    """
        if bool(query) == bool(input_config):
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')
        if not bool(conn_config.host):
            raise RuntimeError(
                'Required host field in connection config should be set.')

        if not input_config:
            input_config = utils.make_default_input_config(query)
        if not output_config:
            output_config = utils.make_default_output_config(input_config)

        # Pack the Presto connection config into the generic CustomConfig
        # envelope the executor expects.
        packed = example_gen_pb2.CustomConfig()
        packed.custom_config.Pack(conn_config)

        super(PrestoExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            custom_config=packed,
            example_artifacts=example_artifacts)
Ejemplo n.º 8
0
class Filter(base_component.BaseComponent):
  """A TFX component to do filtering.

  Filter consumes examples data, and produces examples data.

  ## Example
    # Uses Filter to keep only matching examples.
    >>> filter = Filter(
    >>>    predicate_fn="def predicate(m):...",
    >>>    examples=example_gen.outputs['examples'])

  """

  SPEC_CLASS = FilterSpec
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

  def __init__(self,
               examples: types.Channel,
               predicate_fn: Optional[Text] = None,
               predicate_fn_key: Optional[Text] = 'predicate_fn',
               pipeline_configuration: Optional[types.Channel] = None,
               filtered_examples: Optional[types.Channel] = None,
               splits_to_transform: Optional[List[Text]] = None,
               splits_to_copy: Optional[List[Text]] = None):
    """Construct a Filter component.

    Args:
      examples: A Channel of 'Examples' type, usually produced by an
        ExampleGen component. _required_
      predicate_fn: the function that tells whether an example must be kept.
        Must be 'predicate: Example -> bool'. For example something like:
        >>> def predicate(m):
              return m.features.feature['trip_miles'].float_list.value[0] > 42.
      predicate_fn_key: the name of the key that contains the predicate_fn -
        default is 'predicate_fn'.
      pipeline_configuration: A Channel of 'PipelineConfiguration' type,
        usually produced by a FromCustomConfig component.
      filtered_examples: Channel of `Examples` to store the filtering results.
      splits_to_transform: Optional list of split names to transform.
      splits_to_copy: Optional list of split names to copy.
    """
    # Default to a fresh Examples channel when the caller did not supply one.
    # A previous revision first did an `or`-assignment and then checked
    # `if filtered_examples is None:` — that second branch was unreachable
    # dead code and has been removed; this single check is the reachable
    # behavior.
    if filtered_examples is None:
      filtered_examples = types.Channel(type=standard_artifacts.Examples)

    spec = FilterSpec(
      examples=examples,
      pipeline_configuration=pipeline_configuration,
      filtered_examples=filtered_examples,
      # splits lists are JSON-serialized into the component spec.
      splits_to_transform=json_utils.dumps(splits_to_transform),
      splits_to_copy=json_utils.dumps(splits_to_copy),
      predicate_fn=predicate_fn,
      predicate_fn_key=predicate_fn_key)
    super(Filter, self).__init__(spec=spec)
Ejemplo n.º 9
0
  class _FakeBeamComponent(base_beam_component.BaseBeamComponent):
    """Minimal Beam component stub (no-op base executor) for tests."""

    SPEC_CLASS = _FakeComponentSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(base_executor.BaseExecutor)

    def __init__(
        self,
        type: Type[types.Artifact],  # pylint: disable=redefined-builtin
        spec_kwargs: Dict[Text, Any]):
      """Build a fake spec with a single `output` channel of the given type.

      Args:
        type: artifact type used for the `output` channel.
        spec_kwargs: extra keyword args forwarded to _FakeComponentSpec.
      """
      spec = _FakeComponentSpec(output=types.Channel(type=type), **spec_kwargs)
      # NOTE(review): `name` is not defined in this class -- presumably
      # captured from an enclosing test-function scope; confirm before
      # reusing this class elsewhere.
      super(_FakeBeamComponent, self).__init__(spec=spec, instance_name=name)
Ejemplo n.º 10
0
class TestQueryBasedExampleGenComponent(component.QueryBasedExampleGen):
  """QueryBasedExampleGen subclass wired to the test executor."""

  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(TestExampleGenExecutor)

  def __init__(self,
               input_config,
               output_config=None,
               output_data_format=example_gen_pb2.FORMAT_TF_EXAMPLE):
    """Forward all construction arguments to QueryBasedExampleGen."""
    super(TestQueryBasedExampleGenComponent,
          self).__init__(input_config=input_config,
                         output_config=output_config,
                         output_data_format=output_data_format)
Ejemplo n.º 11
0
  def testConstructWithCustomConfig(self):
    """custom_config round-trips through exec_properties as serialized proto."""
    config_in = example_gen_pb2.CustomConfig(custom_config=any_pb2.Any())
    gen = component.FileBasedExampleGen(
        input_base='path',
        custom_config=config_in,
        custom_executor_spec=executor_spec.BeamExecutorSpec(
            TestExampleGenExecutor))

    # Deserialize the stored exec property and compare with the original.
    config_out = example_gen_pb2.CustomConfig()
    proto_utils.json_to_proto(
        gen.exec_properties[standard_component_specs.CUSTOM_CONFIG_KEY],
        config_out)
    self.assertEqual(config_in, config_out)
Ejemplo n.º 12
0
class BigQueryExampleGen(component.QueryBasedExampleGen):
    """Cloud BigQueryExampleGen component.

  Takes a BigQuery query and generates train and eval examples for
  downstream components.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output train
                 and eval examples.
  """

    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(
        self,
        query: Optional[str] = None,
        input_config: Optional[Union[example_gen_pb2.Input,
                                     data_types.RuntimeParameter]] = None,
        output_config: Optional[Union[example_gen_pb2.Output,
                                      data_types.RuntimeParameter]] = None,
        range_config: Optional[Union[range_config_pb2.RangeConfig,
                                     data_types.RuntimeParameter]] = None):
        """Constructs a BigQueryExampleGen component.

    Args:
      query: BigQuery sql string; the query result is treated as a single
        split. Can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as a
        BigQuery sql string. If set, it overwrites the 'query' arg and allows
        different queries per split. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as the Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        output_config should be constructed as a dict with the same field
        names as the Output proto message.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider.

    Raises:
      RuntimeError: Only one of query and input_config should be set.
    """
        if bool(query) == bool(input_config):
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')
        if not input_config:
            input_config = utils.make_default_input_config(query)
        super().__init__(range_config=range_config,
                         input_config=input_config,
                         output_config=output_config)
Ejemplo n.º 13
0
 def testConstructWithStaticRangeConfig(self):
   """range_config round-trips through exec_properties as serialized proto."""
   cfg_in = range_config_pb2.RangeConfig(
       static_range=range_config_pb2.StaticRange(
           start_span_number=1, end_span_number=1))
   gen = component.FileBasedExampleGen(
       input_base='path',
       range_config=cfg_in,
       custom_executor_spec=executor_spec.BeamExecutorSpec(
           TestExampleGenExecutor))
   # Deserialize the stored exec property and compare with the original.
   cfg_out = range_config_pb2.RangeConfig()
   proto_utils.json_to_proto(
       gen.exec_properties[standard_component_specs.RANGE_CONFIG_KEY],
       cfg_out)
   self.assertEqual(cfg_in, cfg_out)
Ejemplo n.º 14
0
class ImportExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
    """Official TFX ImportExampleGen component.

  Ingests TFRecord files with TF Example data and produces train and eval
  examples for downstream components, providing consistent and configurable
  partitioning plus dataset shuffling per ML best practice.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output
                 train and eval examples.
  """

    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(
            self,
            input_base: Optional[str] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         data_types.RuntimeParameter]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          data_types.RuntimeParameter]] = None,
            range_config: Optional[Union[range_config_pb2.RangeConfig,
                                         data_types.RuntimeParameter]] = None,
            payload_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE):
        """Construct an ImportExampleGen component.

    Args:
      input_base: an external directory containing the TFRecord files.
      input_config: An example_gen_pb2.Input instance providing input
        configuration. If unset, the files under input_base are treated as a
        single split.
      output_config: An example_gen_pb2.Output instance providing output
        configuration. If unset, default splits are 'train' and 'eval' with
        size 2:1.
      range_config: An optional range_config_pb2.RangeConfig instance
        restricting the span values to consider. If unset, the driver
        defaults to searching for the latest span with no restrictions.
      payload_format: Payload format of input data; one of the
        example_gen_pb2.PayloadFormat enum values. Output payload format is
        the same as input.
    """
        # payload_format is forwarded as the output data format.
        super().__init__(output_data_format=payload_format,
                         input_base=input_base,
                         input_config=input_config,
                         output_config=output_config,
                         range_config=range_config)
Ejemplo n.º 15
0
    class _FakeBeamComponent(base_beam_component.BaseBeamComponent):
        """Minimal Beam component stub (no-op base executor) for tests."""

        SPEC_CLASS = _FakeComponentSpec
        EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
            base_executor.BaseExecutor)

        def __init__(
                self,
                type: Type[types.Artifact],  # pylint: disable=redefined-builtin
                spec_kwargs: Dict[str, Any]):
            """Build a fake spec with a single `output` channel of the given type.

            Args:
              type: artifact type used for the `output` channel.
              spec_kwargs: extra keyword args forwarded to _FakeComponentSpec.
            """
            spec = _FakeComponentSpec(output=types.Channel(type=type),
                                      **spec_kwargs)
            super().__init__(spec=spec)
            # NOTE(review): `name` and `dynamic_exec_property` are not defined
            # in this class -- presumably captured from an enclosing
            # test-function scope; confirm before reusing this class elsewhere.
            self._id = name
            if dynamic_exec_property is not None:
                self.exec_properties['exec_prop'] = dynamic_exec_property
Ejemplo n.º 16
0
class BigQueryExampleGen(component.QueryBasedExampleGen):
    """Official TFX BigQueryExampleGen component.

  Takes a BigQuery query and generates train and eval examples for
  downstream components.
  """

    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(self,
                 query: Optional[Text] = None,
                 input_config: Optional[example_gen_pb2.Input] = None,
                 output_config: Optional[example_gen_pb2.Output] = None,
                 example_artifacts: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Constructs a BigQueryExampleGen component.

    Args:
      query: BigQuery sql string; the query result is treated as a single
        split. Can be overwritten by input_config.
      input_config: An example_gen_pb2.Input instance with Split.pattern as a
        BigQuery sql string. If set, it overwrites the 'query' arg and allows
        different queries per split. If any field is provided as a
        RuntimeParameter, input_config should be constructed as a dict with
        the same field names as the Input proto message.
      output_config: An example_gen_pb2.Output instance, providing output
        configuration. If unset, default splits will be 'train' and 'eval'
        with size 2:1. If any field is provided as a RuntimeParameter,
        output_config should be constructed as a dict with the same field
        names as the Output proto message.
      example_artifacts: Optional channel of 'ExamplesPath' for output train
        and eval examples.
      instance_name: Optional unique instance name. Necessary if multiple
        BigQueryExampleGen components are declared in the same pipeline.

    Raises:
      RuntimeError: Only one of query and input_config should be set.
    """
        if bool(query) == bool(input_config):
            raise RuntimeError(
                'Exactly one of query and input_config should be set.')
        if not input_config:
            input_config = utils.make_default_input_config(query)
        super(BigQueryExampleGen, self).__init__(
            input_config=input_config,
            output_config=output_config,
            example_artifacts=example_artifacts,
            instance_name=instance_name)
Ejemplo n.º 17
0
    def testBeamExecutorSpecCopy(self):
        """copy() yields an independent spec that outlives the original."""
        class _NestedExecutor(base_executor.BaseExecutor):
            pass

        spec = executor_spec.BeamExecutorSpec(_NestedExecutor)
        spec.add_extra_flags('a')
        spec.add_beam_pipeline_args('b')
        spec_copy = spec.copy()
        # Deleting the source spec must not invalidate the copy.
        del spec
        # NOTE: the expected class_path below is coupled to this test running
        # under __main__ and to the nested class's name; renaming either
        # breaks the assertion.
        self.assertProtoEquals(
            """
        python_executor_spec: {
            class_path: "__main__._NestedExecutor"
            extra_flags: "a"
        }
        beam_pipeline_args: "b"
        """, spec_copy.encode())
Ejemplo n.º 18
0
class ModelValidator(base_beam_component.BaseBeamComponent):
    """DEPRECATED: Please use `Evaluator` instead.

  The model validator component checks model metrics thresholds and validates
  the current model against a previously validated one. With no prior
  validated model, it only verifies that the thresholds pass. Otherwise it
  compares a newly trained model against a known good model — specifically
  the last model "blessed" by this component. A model is "blessed" when the
  exported model's metrics fall within predefined thresholds around the prior
  model's metrics.

  *Note:* This component includes a driver to resolve the last blessed model.

  ## Possible causes why model validation fails
  Model validation can fail for many reasons; the most common are:

  - problems with training data, e.g. negative examples are dropped or
    features are missing.
  - problems with the test or evaluation data, e.g. skew between training
    and evaluation data.
  - changes in data distribution, indicating user behavior may have changed
    over time.
  - problems with the trainer, e.g. it was stopped before the model
    converged, or the model is unstable.

  ## Example
  ```
    # Performs quality validation of a candidate model (compared to a baseline).
    model_validator = ModelValidator(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])
  ```
  """

    SPEC_CLASS = ModelValidatorSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)
    DRIVER_CLASS = driver.Driver

    @deprecation_utils.deprecated(
        None, 'ModelValidator is deprecated, use Evaluator instead.')
    def __init__(self,
                 examples: types.Channel,
                 model: types.Channel,
                 blessing: Optional[types.Channel] = None):
        """Construct a ModelValidator component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an
        [ExampleGen](https://www.tensorflow.org/tfx/guide/examplegen)
        component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced
        by a [Trainer](https://www.tensorflow.org/tfx/guide/trainer)
        component. _required_
      blessing: Output channel of type `standard_artifacts.ModelBlessing`
        that contains the validation result.
    """
        # Default to a fresh ModelBlessing channel when none was provided.
        if not blessing:
            blessing = types.Channel(type=standard_artifacts.ModelBlessing)
        super(ModelValidator, self).__init__(spec=ModelValidatorSpec(
            examples=examples, model=model, blessing=blessing))
Ejemplo n.º 19
0
class FileBasedExampleGen(base_beam_component.BaseBeamComponent):
    """A TFX component to ingest examples from a file system.

  FileBasedExampleGen is the API for bringing file-based records into TFX
  pipelines. It reads external files, converts the data into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records)
  encoded examples, and emits train and eval splits that downstream
  components (e.g. StatisticsGen, Trainer) consume.

  ## Example
  ```
  _taxi_root = os.path.join(os.environ['HOME'], 'taxi')
  _data_root = os.path.join(_taxi_root, 'data', 'simple')
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = FileBasedExampleGen(input_base=_data_root)
  ```
  """

    SPEC_CLASS = FileBasedExampleGenSpec
    # Subclasses are expected to override EXECUTOR_SPEC with a
    # format-specific executor; the base executor is only a placeholder.
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
        base_beam_executor.BaseBeamExecutor)
    DRIVER_CLASS = driver.FileBasedDriver

    def __init__(
            self,
            # TODO(b/159467778): deprecate this, use input_base instead.
            input: Optional[types.Channel] = None,  # pylint: disable=redefined-builtin
            input_base: Optional[Text] = None,
            input_config: Optional[Union[example_gen_pb2.Input,
                                         Dict[Text, Any]]] = None,
            output_config: Optional[Union[example_gen_pb2.Output,
                                          Dict[Text, Any]]] = None,
            custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                          Dict[Text, Any]]] = None,
            range_config: Optional[Union[range_config_pb2.RangeConfig,
                                         Dict[Text, Any]]] = None,
            output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
            example_artifacts: Optional[types.Channel] = None,
            custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None,
            instance_name: Optional[Text] = None):
        """Construct a FileBasedExampleGen component.

    Args:
      input: (Deprecated by input_base) A Channel of type
        `standard_artifacts.ExternalArtifact` with one artifact whose uri is
        an external directory containing the data files.
      input_base: an external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
          instance, providing input configuration. If unset, input files will be
          treated as a single split.
      output_config: An example_gen_pb2.Output instance, providing the output
        configuration. If unset, default splits will be 'train' and
        'eval' with size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance,
        providing custom configuration for executor.
      range_config: An optional range_config_pb2.RangeConfig instance,
        specifying the range of span values to consider. If unset, driver will
        default to searching for latest span with no restrictions.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      example_artifacts: Channel of 'ExamplesPath' for output train and eval
        examples; created here when not supplied.
      custom_executor_spec: Optional custom executor spec overriding the default
        executor spec specified in the component attribute.
      instance_name: Optional unique instance name. Required only if multiple
        ExampleGen components are declared in the same pipeline.
    """
        if input:
            # Legacy `input` channel is still accepted, but is folded into
            # `input_base` so the rest of the constructor has a single path.
            logging.warning(
                'The "input" argument to the ExampleGen component has been '
                'deprecated by "input_base". Please update your usage as support for '
                'this argument will be removed soon.')
            input_base = artifact_utils.get_single_uri(list(input.get()))

        # Fall back to default input/output configurations when unset.
        input_config = input_config or utils.make_default_input_config()
        output_config = output_config or utils.make_default_output_config(
            input_config)
        example_artifacts = example_artifacts or types.Channel(
            type=standard_artifacts.Examples)

        spec = FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=input_config,
            output_config=output_config,
            custom_config=custom_config,
            range_config=range_config,
            output_data_format=output_data_format,
            examples=example_artifacts)
        super(FileBasedExampleGen, self).__init__(
            spec=spec,
            custom_executor_spec=custom_executor_spec,
            instance_name=instance_name)
Ejemplo n.º 20
0
class Transform(base_beam_component.BaseBeamComponent):
    """A TFX component to transform the input examples.

  The Transform component wraps TensorFlow Transform (tf.Transform) to
  preprocess data in a TFX pipeline. This component will load the
  preprocessing_fn from input module file, preprocess both 'train' and 'eval'
  splits of input examples, generate the `tf.Transform` output, and save both
  transform function and transformed examples to orchestrator desired locations.

  The Transform component can also invoke TFDV to compute statistics on the
  pre-transform and post-transform data. Invocations of TFDV take an optional
  [StatsOptions](https://github.com/tensorflow/data-validation/blob/master/tensorflow_data_validation/statistics/stats_options.py)
  object. To configure the StatsOptions object that is passed to TFDV for both
  pre-transform and post-transform statistics, users
  can define the optional `stats_options_updater_fn` within the module file.

  ## Providing a preprocessing function
  The TFX executor will use the estimator provided in the `module_file` file
  to train the model.  The Transform executor will look specifically for the
  `preprocessing_fn()` function within that file.

  An example of `preprocessing_fn()` can be found in the [user-supplied
  code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)
  of the TFX Chicago Taxi pipeline example.

  ## Updating StatsOptions
  The Transform executor will look specifically for the
  `stats_options_updater_fn()` within the module file specified above.

  An example of `stats_options_updater_fn()` can be found in the [user-supplied
  code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/bert/mrpc/bert_mrpc_utils.py)
  of the TFX BERT MRPC pipeline example.

  ## Example
  ```
  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)
  ```

  Component `outputs` contains:
   - `transform_graph`: Channel of type `standard_artifacts.TransformGraph`,
                        which includes an exported Tensorflow graph suitable
                        for both training and serving.
   - `transformed_examples`: Channel of type `standard_artifacts.Examples` for
                             materialized transformed examples, which includes
                             transform splits as specified in splits_config.
                             This is optional controlled by `materialize`.

  Please see [the Transform
  guide](https://www.tensorflow.org/tfx/guide/transform) for more details.
  """

    SPEC_CLASS = standard_component_specs.TransformSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(
            self,
            examples: types.Channel,
            schema: types.Channel,
            module_file: Optional[Union[Text,
                                        data_types.RuntimeParameter]] = None,
            preprocessing_fn: Optional[Union[
                Text, data_types.RuntimeParameter]] = None,
            splits_config: Optional[transform_pb2.SplitsConfig] = None,
            analyzer_cache: Optional[types.Channel] = None,
            materialize: bool = True,
            disable_analyzer_cache: bool = False,
            force_tf_compat_v1: bool = False,
            custom_config: Optional[Dict[Text, Any]] = None,
            disable_statistics: bool = False):
        """Construct a Transform component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples` (required).
        This should contain custom splits specified in splits_config. If
        custom split is not provided, this should contain two splits 'train'
        and 'eval'.
      schema: A Channel of type `standard_artifacts.Schema`. This should
        contain a single schema artifact.
      module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.
        Exactly one of 'module_file' or 'preprocessing_fn' must be supplied.

        The function needs to have the following signature:
        ```
        def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
          ...
        ```
        where the values of input and returned Dict are either tf.Tensor or
        tf.SparseTensor.

        If additional inputs are needed for preprocessing_fn, they can be passed
        in custom_config:

        ```
        def preprocessing_fn(inputs: Dict[Text, Any], custom_config:
                             Dict[Text, Any]) -> Dict[Text, Any]:
          ...
        ```
        Use of a RuntimeParameter for this argument is experimental.
      preprocessing_fn: The path to python function that implements a
        'preprocessing_fn'. See 'module_file' for expected signature of the
        function. Exactly one of 'module_file' or 'preprocessing_fn' must be
        supplied. Use of a RuntimeParameter for this argument is experimental.
      splits_config: A transform_pb2.SplitsConfig instance, providing splits
        that should be analyzed and splits that should be transformed. Note
        analyze and transform splits can have overlap. Default behavior (when
        splits_config is not set) is analyze the 'train' split and transform
        all splits. If splits_config is set, analyze cannot be empty.
      analyzer_cache: Optional input 'TransformCache' channel containing
        cached information from previous Transform runs. When provided,
        Transform will try use the cached calculation if possible.
      materialize: If True, write transformed examples as an output.
      disable_analyzer_cache: If False, Transform will use input cache if
        provided and write cache output. If True, `analyzer_cache` must not be
        provided.
      force_tf_compat_v1: (Optional) If True and/or TF2 behaviors are disabled
        Transform will use Tensorflow in compat.v1 mode irrespective of
        installed version of Tensorflow. Defaults to `False`.
      custom_config: A dict which contains additional parameters that will be
        passed to preprocessing_fn.
      disable_statistics: If True, do not invoke TFDV to compute pre-transform
        and post-transform statistics. When statistics are computed, they
        will be stored in the `pre_transform_feature_stats/` and
        `post_transform_feature_stats/` subfolders of the `transform_graph`
        export.

    Raises:
      ValueError: When both or neither of 'module_file' and 'preprocessing_fn'
        is supplied.
    """
        # bool equality acts as XOR: exactly one of the two user-code entry
        # points must be given (both set, or both unset, is an error).
        if bool(module_file) == bool(preprocessing_fn):
            raise ValueError(
                "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
            )

        # The transform graph output is always produced; transformed examples
        # only when materialization is requested.
        transform_graph = types.Channel(type=standard_artifacts.TransformGraph)
        transformed_examples = None
        if materialize:
            transformed_examples = types.Channel(
                type=standard_artifacts.Examples)
            # NOTE(review): this appears to link the output channel's artifact
            # properties to the input 'examples' channel — confirm against
            # the Channel implementation.
            transformed_examples.matching_channel_name = 'examples'

        # Statistics outputs default to None and are only wired up when
        # statistics computation is enabled.
        (pre_transform_schema, pre_transform_stats, post_transform_schema,
         post_transform_stats, post_transform_anomalies) = (None, ) * 5
        if not disable_statistics:
            pre_transform_schema = types.Channel(
                type=standard_artifacts.Schema)
            post_transform_schema = types.Channel(
                type=standard_artifacts.Schema)
            pre_transform_stats = types.Channel(
                type=standard_artifacts.ExampleStatistics)
            post_transform_stats = types.Channel(
                type=standard_artifacts.ExampleStatistics)
            post_transform_anomalies = types.Channel(
                type=standard_artifacts.ExampleAnomalies)

        # The cache output channel exists only when caching is enabled, and
        # supplying an input cache while caching is disabled is rejected.
        if disable_analyzer_cache:
            updated_analyzer_cache = None
            if analyzer_cache:
                raise ValueError(
                    '`analyzer_cache` is set when disable_analyzer_cache is True.'
                )
        else:
            updated_analyzer_cache = types.Channel(
                type=standard_artifacts.TransformCache)

        spec = standard_component_specs.TransformSpec(
            examples=examples,
            schema=schema,
            module_file=module_file,
            preprocessing_fn=preprocessing_fn,
            # Booleans are int-cast — presumably the spec's execution
            # properties are int-typed; see TransformSpec.
            force_tf_compat_v1=int(force_tf_compat_v1),
            splits_config=splits_config,
            transform_graph=transform_graph,
            transformed_examples=transformed_examples,
            analyzer_cache=analyzer_cache,
            updated_analyzer_cache=updated_analyzer_cache,
            custom_config=json_utils.dumps(custom_config),
            disable_statistics=int(disable_statistics),
            pre_transform_schema=pre_transform_schema,
            pre_transform_stats=pre_transform_stats,
            post_transform_schema=post_transform_schema,
            post_transform_stats=post_transform_stats,
            post_transform_anomalies=post_transform_anomalies)
        super(Transform, self).__init__(spec=spec)

        if udf_utils.should_package_user_modules():
            # In this case, the `MODULE_PATH_KEY` execution property will be injected
            # as a reference to the given user module file after packaging, at which
            # point the `MODULE_FILE_KEY` execution property will be removed.
            udf_utils.add_user_module_dependency(
                self, standard_component_specs.MODULE_FILE_KEY,
                standard_component_specs.MODULE_PATH_KEY)
Ejemplo n.º 21
0
class BulkInferrer(base_beam_component.BaseBeamComponent):
    """A TFX component to do batch inference on a model with unlabelled examples.

  BulkInferrer consumes examples data and a model, and produces the inference
  results to an external location as PredictionLog proto.

  BulkInferrer will infer on validated model.

  ## Example
  ```
    # Uses BulkInferrer to inference on examples.
    bulk_inferrer = BulkInferrer(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])
  ```
  """

    SPEC_CLASS = BulkInferrerSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(self,
                 examples: Optional[types.Channel] = None,
                 model: Optional[types.Channel] = None,
                 model_blessing: Optional[types.Channel] = None,
                 data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                           Dict[Text, Any]]] = None,
                 model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                            Dict[Text, Any]]] = None,
                 output_example_spec: Optional[Union[
                     bulk_inferrer_pb2.OutputExampleSpec, Dict[Text,
                                                               Any]]] = None,
                 inference_result: Optional[types.Channel] = None,
                 output_examples: Optional[types.Channel] = None,
                 instance_name: Optional[Text] = None):
        """Construct an BulkInferrer component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced by
        a Trainer component.
      model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
        usually produced by a ModelValidator component.
      data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
        selection. If any field is provided as a RuntimeParameter, data_spec
        should be constructed as a dict with the same field names as DataSpec
        proto message.
      model_spec: bulk_inferrer_pb2.ModelSpec instance that describes model
        specification. If any field is provided as a RuntimeParameter,
        model_spec should be constructed as a dict with the same field names as
        ModelSpec proto message.
      output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance, specify
        if you want BulkInferrer to output examples instead of inference result.
        If any field is provided as a RuntimeParameter, output_example_spec
        should be constructed as a dict with the same field names as
        OutputExampleSpec proto message.
      inference_result: Channel of type `standard_artifacts.InferenceResult`
        to store the inference results, must not be specified when
        output_example_spec is set.
      output_examples: Channel of type `standard_artifacts.Examples`
        to store the output examples, must not be specified when
        output_example_spec is unset. Check output_example_spec for details.
      instance_name: Optional name assigned to this specific instance of
        BulkInferrer. Required only if multiple BulkInferrer components are
        declared in the same pipeline.

    Raises:
      ValueError: Must not specify inference_result or output_examples depends
        on whether output_example_spec is set or not.
    """
        # Exactly one of the two output channels is active, selected by
        # whether output_example_spec was provided; reject the mismatched one.
        if output_example_spec:
            if inference_result:
                raise ValueError(
                    'Must not specify inference_result when output_example_spec is set.'
                )
            output_examples = output_examples or types.Channel(
                type=standard_artifacts.Examples)
        else:
            if output_examples:
                raise ValueError(
                    'Must not specify output_examples when output_example_spec is unset.'
                )
            inference_result = inference_result or types.Channel(
                type=standard_artifacts.InferenceResult)

        spec = BulkInferrerSpec(
            examples=examples,
            model=model,
            model_blessing=model_blessing,
            # Empty protos stand in for unset specs so the executor always
            # receives a valid message.
            data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
            model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
            output_example_spec=output_example_spec,
            inference_result=inference_result,
            output_examples=output_examples)
        super(BulkInferrer, self).__init__(spec=spec,
                                           instance_name=instance_name)
Ejemplo n.º 22
0
class _MyComponent(base_component.BaseComponent):
    """Minimal component pairing _MyComponentSpec with the generic BaseExecutor.

  Presumably used for tests or as a skeleton, given the private name; it adds
  no inputs, outputs, or logic of its own.
  """

    # ComponentSpec class that declares this component's inputs/outputs/params.
    SPEC_CLASS = _MyComponentSpec
    # Runs the generic base_executor.BaseExecutor via the Beam executor wrapper.
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(base_executor.BaseExecutor)
Ejemplo n.º 23
0
class BulkInferrer(base_beam_component.BaseBeamComponent):
    """A TFX component to do batch inference on a model with unlabelled examples.

  BulkInferrer consumes examples data and a model, and produces the inference
  results to an external location as PredictionLog proto.

  BulkInferrer will infer on validated model.

  ## Example
  ```
    # Uses BulkInferrer to inference on examples.
    bulk_inferrer = BulkInferrer(
        examples=example_gen.outputs['examples'],
        model=trainer.outputs['model'])
  ```

  Component `outputs` contains:
   - `inference_result`: Channel of type `standard_artifacts.InferenceResult`
                         to store the inference results.
   - `output_examples`: Channel of type `standard_artifacts.Examples`
                        to store the output examples. This is optional
                        controlled by `output_example_spec`.

  See [the BulkInferrer
  guide](https://www.tensorflow.org/tfx/guide/bulkinferrer) for more details.
  """

    SPEC_CLASS = BulkInferrerSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(self,
                 examples: Optional[types.Channel] = None,
                 model: Optional[types.Channel] = None,
                 model_blessing: Optional[types.Channel] = None,
                 data_spec: Optional[Union[bulk_inferrer_pb2.DataSpec,
                                           Dict[Text, Any]]] = None,
                 model_spec: Optional[Union[bulk_inferrer_pb2.ModelSpec,
                                            Dict[Text, Any]]] = None,
                 output_example_spec: Optional[Union[
                     bulk_inferrer_pb2.OutputExampleSpec, Dict[Text,
                                                               Any]]] = None):
        """Construct an BulkInferrer component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced by
        a Trainer component.
      model_blessing: A Channel of type `standard_artifacts.ModelBlessing`,
        usually produced by a ModelValidator component.
      data_spec: bulk_inferrer_pb2.DataSpec instance that describes data
        selection. If any field is provided as a RuntimeParameter, data_spec
        should be constructed as a dict with the same field names as DataSpec
        proto message.
      model_spec: bulk_inferrer_pb2.ModelSpec instance that describes model
        specification. If any field is provided as a RuntimeParameter,
        model_spec should be constructed as a dict with the same field names as
        ModelSpec proto message.
      output_example_spec: bulk_inferrer_pb2.OutputExampleSpec instance, specify
        if you want BulkInferrer to output examples instead of inference result.
        If any field is provided as a RuntimeParameter, output_example_spec
        should be constructed as a dict with the same field names as
        OutputExampleSpec proto message.
    """
        # Exactly one output channel is produced: examples when
        # output_example_spec is given, otherwise an inference result.
        if output_example_spec:
            output_examples = types.Channel(type=standard_artifacts.Examples)
            inference_result = None
        else:
            inference_result = types.Channel(
                type=standard_artifacts.InferenceResult)
            output_examples = None

        spec = BulkInferrerSpec(
            examples=examples,
            model=model,
            model_blessing=model_blessing,
            # Empty protos stand in for unset specs so the executor always
            # receives a valid message.
            data_spec=data_spec or bulk_inferrer_pb2.DataSpec(),
            model_spec=model_spec or bulk_inferrer_pb2.ModelSpec(),
            output_example_spec=output_example_spec,
            inference_result=inference_result,
            output_examples=output_examples)
        super(BulkInferrer, self).__init__(spec=spec)
Ejemplo n.º 24
0
class Transform(base_beam_component.BaseBeamComponent):
    """A TFX component to transform the input examples.

  The Transform component wraps TensorFlow Transform (tf.Transform) to
  preprocess data in a TFX pipeline. This component will load the
  preprocessing_fn from input module file, preprocess both 'train' and 'eval'
  splits of input examples, generate the `tf.Transform` output, and save both
  transform function and transformed examples to orchestrator desired locations.

  ## Providing a preprocessing function
  The TFX executor will use the estimator provided in the `module_file` file
  to train the model.  The Transform executor will look specifically for the
  `preprocessing_fn()` function within that file.

  An example of `preprocessing_fn()` can be found in the [user-supplied
  code](https://github.com/tensorflow/tfx/blob/master/tfx/examples/chicago_taxi_pipeline/taxi_utils.py)
  of the TFX Chicago Taxi pipeline example.

  ## Example
  ```
  # Performs transformations and feature engineering in training and serving.
  transform = Transform(
      examples=example_gen.outputs['examples'],
      schema=infer_schema.outputs['schema'],
      module_file=module_file)
  ```

  Please see https://www.tensorflow.org/tfx/transform for more details.
  """

    SPEC_CLASS = TransformSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(
            self,
            examples: Optional[types.Channel] = None,
            schema: Optional[types.Channel] = None,
            module_file: Optional[Union[Text,
                                        data_types.RuntimeParameter]] = None,
            preprocessing_fn: Optional[Union[
                Text, data_types.RuntimeParameter]] = None,
            splits_config: Optional[transform_pb2.SplitsConfig] = None,
            transform_graph: Optional[types.Channel] = None,
            transformed_examples: Optional[types.Channel] = None,
            analyzer_cache: Optional[types.Channel] = None,
            materialize: bool = True,
            disable_analyzer_cache: bool = False,
            force_tf_compat_v1: bool = True,
            custom_config: Optional[Dict[Text, Any]] = None):
        """Construct a Transform component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples` (required).
        This should contain custom splits specified in splits_config. If
        custom split is not provided, this should contain two splits 'train'
        and 'eval'.
      schema: A Channel of type `standard_artifacts.Schema`. This should
        contain a single schema artifact.
      module_file: The file path to a python module file, from which the
        'preprocessing_fn' function will be loaded.
        Exactly one of 'module_file' or 'preprocessing_fn' must be supplied.

        The function needs to have the following signature:
        ```
        def preprocessing_fn(inputs: Dict[Text, Any]) -> Dict[Text, Any]:
          ...
        ```
        where the values of input and returned Dict are either tf.Tensor or
        tf.SparseTensor.

        If additional inputs are needed for preprocessing_fn, they can be passed
        in custom_config:

        ```
        def preprocessing_fn(inputs: Dict[Text, Any], custom_config:
                             Dict[Text, Any]) -> Dict[Text, Any]:
          ...
        ```
      preprocessing_fn: The path to python function that implements a
        'preprocessing_fn'. See 'module_file' for expected signature of the
        function. Exactly one of 'module_file' or 'preprocessing_fn' must be
        supplied.
      splits_config: A transform_pb2.SplitsConfig instance, providing splits
        that should be analyzed and splits that should be transformed. Note
        analyze and transform splits can have overlap. Default behavior (when
        splits_config is not set) is analyze the 'train' split and transform
        all splits. If splits_config is set, analyze cannot be empty.
      transform_graph: Optional output 'TransformPath' channel for output of
        'tf.Transform', which includes an exported Tensorflow graph suitable for
        both training and serving;
      transformed_examples: Optional output 'ExamplesPath' channel for
        materialized transformed examples, which includes transform splits as
        specified in splits_config. If custom split is not provided, this should
        include both 'train' and 'eval' splits.
      analyzer_cache: Optional input 'TransformCache' channel containing
        cached information from previous Transform runs. When provided,
        Transform will try use the cached calculation if possible.
      materialize: If True, write transformed examples as an output. If False,
        `transformed_examples` must not be provided.
      disable_analyzer_cache: If False, Transform will use input cache if
        provided and write cache output. If True, `analyzer_cache` must not be
        provided.
      force_tf_compat_v1: (Optional) If True, Transform will use Tensorflow in
        compat.v1 mode irrespective of installed version of Tensorflow. Defaults
        to `True`. Note: The default value will be switched to `False` in a
        future release.
      custom_config: A dict which contains additional parameters that will be
        passed to preprocessing_fn.

    Raises:
      ValueError: When both or neither of 'module_file' and 'preprocessing_fn'
        is supplied.
    """
        # bool equality acts as XOR: exactly one of the two user-code entry
        # points must be given (both set, or both unset, is an error).
        if bool(module_file) == bool(preprocessing_fn):
            raise ValueError(
                "Exactly one of 'module_file' or 'preprocessing_fn' must be supplied."
            )

        transform_graph = transform_graph or types.Channel(
            type=standard_artifacts.TransformGraph)

        # Materialized examples are only produced on request, and supplying a
        # channel while materialization is off is contradictory.
        if materialize and transformed_examples is None:
            transformed_examples = types.Channel(
                type=standard_artifacts.Examples)
            transformed_examples.matching_channel_name = 'examples'
        elif not materialize and transformed_examples is not None:
            raise ValueError(
                'Must not specify transformed_examples when materialize is False.'
            )

        # The cache output channel exists only when caching is enabled, and an
        # input cache is rejected when caching is disabled.
        if disable_analyzer_cache:
            updated_analyzer_cache = None
            if analyzer_cache:
                raise ValueError(
                    '`analyzer_cache` is set when disable_analyzer_cache is True.'
                )
        else:
            updated_analyzer_cache = types.Channel(
                type=standard_artifacts.TransformCache)

        spec = TransformSpec(examples=examples,
                             schema=schema,
                             module_file=module_file,
                             preprocessing_fn=preprocessing_fn,
                             force_tf_compat_v1=int(force_tf_compat_v1),
                             splits_config=splits_config,
                             transform_graph=transform_graph,
                             transformed_examples=transformed_examples,
                             analyzer_cache=analyzer_cache,
                             updated_analyzer_cache=updated_analyzer_cache,
                             custom_config=json_utils.dumps(custom_config))
        super(Transform, self).__init__(spec=spec)
Ejemplo n.º 25
0
class CsvExampleGen(component.FileBasedExampleGen):  # pylint: disable=protected-access
    """Official TFX CsvExampleGen component.

  Ingests CSV data and emits train and eval examples for downstream
  components.

  Column values are encoded as tf.Example int/float/byte features, with the
  type inferred per input split. If the input is not a single split, users
  must ensure the column types align in each pre-split. A missing cell is
  encoded as:
  -- tf.train.Feature(`type`_list=tf.train.`type`List(value=[])), when the
     `type` can be inferred.
  -- tf.train.Feature(), when the `type` cannot be inferred from the column.

  For example, given the following csv rows of a split:

    header:A,B,C,D
    row1:  1,,x,0.1
    row2:  2,,y,0.2
    row3:  3,,,0.3
    row4:

  the generated examples are:
    example1: 1(int), empty feature(no type), x(string), 0.1(float)
    example2: 2(int), empty feature(no type), x(string), 0.2(float)
    example3: 3(int), empty feature(no type), empty list(string), 0.3(float)

    Note that the empty feature is `tf.train.Feature()` while empty list string
    feature is `tf.train.Feature(bytes_list=tf.train.BytesList(value=[]))`.

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output train
                 and eval examples.
  """

    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(
        self,
        input_base: Optional[str] = None,
        input_config: Optional[Union[example_gen_pb2.Input,
                                     data_types.RuntimeParameter]] = None,
        output_config: Optional[Union[example_gen_pb2.Output,
                                      data_types.RuntimeParameter]] = None,
        range_config: Optional[Union[range_config_pb2.RangeConfig,
                                     data_types.RuntimeParameter]] = None):
        """Construct a CsvExampleGen component.

    Args:
      input_base: an external directory containing the CSV files.
      input_config: An example_gen_pb2.Input instance providing the input
        configuration. When unset, the files under input_base are treated as
        a single split.
      output_config: An example_gen_pb2.Output instance providing the output
        configuration. When unset, default splits are 'train' and 'eval' with
        size 2:1.
      range_config: An optional range_config_pb2.RangeConfig instance
        restricting the span values to consider. When unset, the driver
        defaults to searching for the latest span with no restrictions.
    """
        # All configuration handling lives in the file-based base class; this
        # subclass only swaps in the CSV executor.
        super().__init__(input_base=input_base,
                         input_config=input_config,
                         output_config=output_config,
                         range_config=range_config)
Ejemplo n.º 26
0
class QueryBasedExampleGen(base_beam_component.BaseBeamComponent):
  """A TFX component to ingest examples from query based systems.

  The QueryBasedExampleGen component can be extended to ingest examples from
  query based systems such as Presto or Bigquery. The component will also
  convert the input data into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records)
  and generate train and eval example splits for downstream components.

  ## Example
  ```
  _query = "SELECT * FROM `bigquery-public-data.chicago_taxi_trips.taxi_trips`"
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = BigQueryExampleGen(query=_query)
  ```

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output train
                 and eval examples.
  """

  SPEC_CLASS = QueryBasedExampleGenSpec
  # EXECUTOR_SPEC should be overridden by subclasses.
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_beam_executor.BaseBeamExecutor)
  DRIVER_CLASS = driver.QueryBasedDriver

  def __init__(
      self,
      input_config: Union[example_gen_pb2.Input, Dict[Text, Any]],
      output_config: Optional[Union[example_gen_pb2.Output, Dict[Text,
                                                                 Any]]] = None,
      custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                    Dict[Text, Any]]] = None,
      output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE):
    """Construct a QueryBasedExampleGen component.

    Args:
      input_config: An
        [example_gen_pb2.Input](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
          instance, providing input configuration. If any field is provided as a
          RuntimeParameter, input_config should be constructed as a dict with
          the same field names as Input proto message. _required_
      output_config: An
        [example_gen_pb2.Output](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
          instance, providing output configuration. If unset, the default splits
        will be labeled as 'train' and 'eval' with a distribution ratio of 2:1.
          If any field is provided as a RuntimeParameter, output_config should
          be constructed as a dict with the same field names as Output proto
          message.
      custom_config: An
        [example_gen_pb2.CustomConfig](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
          instance, providing custom configuration for ExampleGen. If any field
          is provided as a RuntimeParameter, output_config should be constructed
          as a dict.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.

    Raises:
      ValueError: The output_data_format value must be defined in the
        example_gen_pb2.PayloadFormat proto.
    """
    # Validate first so an invalid payload format fails fast, before any
    # output channels are constructed.
    # Bug fix: the original implicit string concatenation was missing a space
    # and rendered as "...must be defined inthe example_gen_pb2...".
    if output_data_format not in example_gen_pb2.PayloadFormat.values():
      raise ValueError('The value of output_data_format must be defined in '
                       'the example_gen_pb2.PayloadFormat proto.')

    # Configure outputs.
    output_config = output_config or utils.make_default_output_config(
        input_config)
    example_artifacts = types.Channel(type=standard_artifacts.Examples)

    spec = QueryBasedExampleGenSpec(
        input_config=input_config,
        output_config=output_config,
        output_data_format=output_data_format,
        custom_config=custom_config,
        examples=example_artifacts)
    super().__init__(spec=spec)
Ejemplo n.º 27
0
class FileBasedExampleGen(base_beam_component.BaseBeamComponent):
  """A TFX component to ingest examples from a file system.

  FileBasedExampleGen is the API for bringing file-based records into TFX
  pipelines. It consumes external files and converts them into
  [tf.record](https://www.tensorflow.org/tutorials/load_data/tf_records),
  producing the train and eval example splits consumed by other internal
  components such as StatisticsGen or Trainers.

  ## Example
  ```
  _taxi_root = os.path.join(os.environ['HOME'], 'taxi')
  _data_root = os.path.join(_taxi_root, 'data', 'simple')
  # Brings data into the pipeline or otherwise joins/converts training data.
  example_gen = FileBasedExampleGen(input_base=_data_root)
  ```

  Component `outputs` contains:
   - `examples`: Channel of type `standard_artifacts.Examples` for output train
                 and eval examples.
  """

  SPEC_CLASS = FileBasedExampleGenSpec
  # EXECUTOR_SPEC should be overridden by subclasses.
  EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
      base_beam_executor.BaseBeamExecutor)
  DRIVER_CLASS = driver.FileBasedDriver

  def __init__(
      self,
      input_base: Optional[Text] = None,
      input_config: Optional[Union[example_gen_pb2.Input, Dict[Text,
                                                               Any]]] = None,
      output_config: Optional[Union[example_gen_pb2.Output, Dict[Text,
                                                                 Any]]] = None,
      custom_config: Optional[Union[example_gen_pb2.CustomConfig,
                                    Dict[Text, Any]]] = None,
      range_config: Optional[Union[range_config_pb2.RangeConfig,
                                   Dict[Text, Any]]] = None,
      output_data_format: Optional[int] = example_gen_pb2.FORMAT_TF_EXAMPLE,
      custom_executor_spec: Optional[executor_spec.ExecutorSpec] = None):
    """Construct a FileBasedExampleGen component.

    Args:
      input_base: an external directory containing the data files.
      input_config: An
        [`example_gen_pb2.Input`](https://github.com/tensorflow/tfx/blob/master/tfx/proto/example_gen.proto)
          instance providing the input configuration. When unset, input files
          are treated as a single split.
      output_config: An example_gen_pb2.Output instance providing the output
        configuration. When unset, default splits are 'train' and 'eval' with
        size 2:1.
      custom_config: An optional example_gen_pb2.CustomConfig instance
        providing custom configuration for the executor.
      range_config: An optional range_config_pb2.RangeConfig instance
        restricting the span values to consider. When unset, the driver
        defaults to searching for the latest span with no restrictions.
      output_data_format: Payload format of generated data in output artifact,
        one of example_gen_pb2.PayloadFormat enum.
      custom_executor_spec: Optional custom executor spec overriding the
        default executor spec specified in the component attribute.
    """
    # Fall back to the default configs when none are supplied (`or`, not an
    # `is None` check, to match the established defaulting semantics).
    resolved_input_config = input_config or utils.make_default_input_config()
    resolved_output_config = (
        output_config or
        utils.make_default_output_config(resolved_input_config))
    super().__init__(
        spec=FileBasedExampleGenSpec(
            input_base=input_base,
            input_config=resolved_input_config,
            output_config=resolved_output_config,
            custom_config=custom_config,
            range_config=range_config,
            output_data_format=output_data_format,
            examples=types.Channel(type=standard_artifacts.Examples)),
        custom_executor_spec=custom_executor_spec)
Ejemplo n.º 28
0
 class BeamComponent(base_beam_component.BaseBeamComponent):
     """A minimal concrete Beam-based component.

     Pairs the generic BaseBeamExecutor with an empty component spec, so it
     declares no inputs, outputs, or execution parameters. NOTE(review):
     presumably a test/placeholder component - confirm against callers.
     """

     # Runs the generic Beam executor; no custom execution logic.
     EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(
         base_beam_executor.BaseBeamExecutor)
     # Empty spec: no declared inputs/outputs/exec properties.
     SPEC_CLASS = _EmptyComponentSpec
Ejemplo n.º 29
0
def replace_executor_with_stub(pipeline: pipeline_pb2.Pipeline,
                               test_data_dir: str,
                               test_component_ids: List[str]):
  """Replace executors in pipeline IR with the stub executor.

  This function will replace the IR in place.
  For example,

  pipeline_mock.replace_executor_with_stub(
      pipeline_ir,
      test_data_dir,
      test_component_ids = ['Trainer', 'Transform'])

  Then you can pass the modified `pipeline_ir` into a dag runner to execute
  the stubbed pipeline.

  Args:
    pipeline: The pipeline to alter.
    test_data_dir: The directory where pipeline outputs are recorded
      (pipeline_recorder.py).
    test_component_ids: List of ids of components that are to be tested. In
      other words, executors of components other than those specified by this
      list will be replaced with a BaseStubExecutor.

  Returns:
    None

  Raises:
    NotImplementedError: if the deployment config or an executor spec has an
      unsupported type.
  """

  def _encoded_stub_spec(spec_class, component_id):
    """Build and encode a stub spec of `spec_class` carrying the test flags."""
    stub_spec = spec_class(base_stub_executor.BaseStubExecutor)
    stub_spec.add_extra_flags(
        (base_stub_executor.TEST_DATA_DIR_FLAG + '=' + test_data_dir,
         base_stub_executor.COMPONENT_ID_FLAG + '=' + component_id))
    return stub_spec.encode()

  deployment_config = pipeline_pb2.IntermediateDeploymentConfig()
  if not pipeline.deployment_config.Unpack(deployment_config):
    raise NotImplementedError(
        'Unexpected pipeline.deployment_config type "{}". Currently only '
        'IntermediateDeploymentConfig is supported.'.format(
            pipeline.deployment_config.type_url))

  for component_id in deployment_config.executor_specs:
    if component_id in test_component_ids:
      # Components under test keep their real executor.
      continue
    executable_spec = deployment_config.executor_specs[component_id]
    # Both supported spec types get the same stub treatment; only the spec
    # class wrapping the stub executor differs.
    if executable_spec.Is(
        executable_spec_pb2.PythonClassExecutableSpec.DESCRIPTOR):
      executable_spec.Pack(
          _encoded_stub_spec(executor_spec.ExecutorClassSpec, component_id))
    elif executable_spec.Is(executable_spec_pb2.BeamExecutableSpec.DESCRIPTOR):
      executable_spec.Pack(
          _encoded_stub_spec(executor_spec.BeamExecutorSpec, component_id))
    else:
      raise NotImplementedError(
          'Unexpected executable_spec type "{}". Currently only '
          'PythonClassExecutableSpec and BeamExecutorSpec is supported.'
          .format(executable_spec.type_url))
  # Write the modified deployment config back into the pipeline IR.
  pipeline.deployment_config.Pack(deployment_config)
Ejemplo n.º 30
0
class Evaluator(base_beam_component.BaseBeamComponent):
    """A TFX component to evaluate models trained by a TFX Trainer component.

  Component `outputs` contains:
   - `evaluation`: Channel of type `standard_artifacts.ModelEvaluation` to store
                   the evaluation results.
   - `blessing`: Channel of type `standard_artifacts.ModelBlessing' that
                 contains the blessing result.

  See [the Evaluator guide](https://www.tensorflow.org/tfx/guide/evaluator) for
  more details.
  """

    SPEC_CLASS = standard_component_specs.EvaluatorSpec
    EXECUTOR_SPEC = executor_spec.BeamExecutorSpec(executor.Executor)

    def __init__(
            self,
            examples: Optional[types.Channel] = None,
            model: Optional[types.Channel] = None,
            baseline_model: Optional[types.Channel] = None,
            # TODO(b/148618405): deprecate feature_slicing_spec.
            feature_slicing_spec: Optional[Union[
                evaluator_pb2.FeatureSlicingSpec, Dict[Text, Any]]] = None,
            fairness_indicator_thresholds: Optional[List[Union[
                float, data_types.RuntimeParameter]]] = None,
            example_splits: Optional[List[Text]] = None,
            eval_config: Optional[tfma.EvalConfig] = None,
            schema: Optional[types.Channel] = None,
            module_file: Optional[Text] = None,
            module_path: Optional[Text] = None):
        """Construct an Evaluator component.

    Args:
      examples: A Channel of type `standard_artifacts.Examples`, usually
        produced by an ExampleGen component. _required_
      model: A Channel of type `standard_artifacts.Model`, usually produced by
        a Trainer component.
      baseline_model: An optional channel of type 'standard_artifacts.Model' as
        the baseline model for model diff and model validation purpose.
      feature_slicing_spec:
        Deprecated, please use eval_config instead. Only support estimator.
        [evaluator_pb2.FeatureSlicingSpec](https://github.com/tensorflow/tfx/blob/master/tfx/proto/evaluator.proto)
          instance that describes how Evaluator should slice the data. If any
          field is provided as a RuntimeParameter, feature_slicing_spec should
          be constructed as a dict with the same field names as
          FeatureSlicingSpec proto message.
      fairness_indicator_thresholds: Optional list of float (or
        RuntimeParameter) threshold values for use with TFMA fairness
          indicators. Experimental functionality: this interface and
          functionality may change at any time. TODO(b/142653905): add a link
          to additional documentation for TFMA fairness indicators here.
      example_splits: Names of splits on which the metrics are computed.
        Default behavior (when example_splits is set to None or Empty) is using
        the 'eval' split.
      eval_config: Instance of tfma.EvalConfig containg configuration settings
        for running the evaluation. This config has options for both estimator
        and Keras.
      schema: A `Schema` channel to use for TFXIO.
      module_file: A path to python module file containing UDFs for Evaluator
        customization. This functionality is experimental and may change at any
        time. The module_file can implement following functions at its top
        level.
          def custom_eval_shared_model(
             eval_saved_model_path, model_name, eval_config, **kwargs,
          ) -> tfma.EvalSharedModel:
          def custom_extractors(
            eval_shared_model, eval_config, tensor_adapter_config,
          ) -> List[tfma.extractors.Extractor]:
      module_path: A python path to the custom module that contains the UDFs.
        See 'module_file' for the required signature of UDFs. This functionality
        is experimental and this API may change at any time. Note this can
        not be set together with module_file.

    Raises:
      ValueError: if both module_file and module_path are set, or both
        eval_config and feature_slicing_spec are set.
    """
        # module_file and module_path are alternative ways of supplying the
        # customization UDFs; at most one may be given.
        if bool(module_file) and bool(module_path):
            raise ValueError(
                'Python module path can not be set together with module file path.'
            )

        # eval_config and the deprecated feature_slicing_spec are mutually
        # exclusive. NOTE(review): this check fires only when BOTH are given;
        # supplying neither falls through to the estimator default below
        # despite the "exactly one" wording of the message.
        if eval_config is not None and feature_slicing_spec is not None:
            raise ValueError(
                "Exactly one of 'eval_config' or 'feature_slicing_spec' "
                "must be supplied.")
        if eval_config is None and feature_slicing_spec is None:
            # Default: empty slicing spec, and the model is treated as an
            # estimator model (see log message).
            feature_slicing_spec = evaluator_pb2.FeatureSlicingSpec()
            logging.info(
                'Neither eval_config nor feature_slicing_spec is passed, '
                'the model is treated as estimator.')

        # NOTE(review): whether a default-constructed FeatureSlicingSpec is
        # truthy here (i.e. whether the deprecation warning also fires in the
        # defaulted case) depends on proto message truthiness - confirm.
        if feature_slicing_spec:
            logging.warning('feature_slicing_spec is deprecated, please use '
                            'eval_config instead.')

        # Output channels for the blessing verdict and the evaluation results.
        blessing = types.Channel(type=standard_artifacts.ModelBlessing)
        evaluation = types.Channel(type=standard_artifacts.ModelEvaluation)
        spec = standard_component_specs.EvaluatorSpec(
            examples=examples,
            model=model,
            baseline_model=baseline_model,
            feature_slicing_spec=feature_slicing_spec,
            fairness_indicator_thresholds=fairness_indicator_thresholds,
            # example_splits is JSON-serialized into an execution property.
            example_splits=json_utils.dumps(example_splits),
            evaluation=evaluation,
            eval_config=eval_config,
            blessing=blessing,
            schema=schema,
            module_file=module_file,
            module_path=module_path)
        super(Evaluator, self).__init__(spec=spec)

        if udf_utils.should_package_user_modules():
            # In this case, the `MODULE_PATH_KEY` execution property will be injected
            # as a reference to the given user module file after packaging, at which
            # point the `MODULE_FILE_KEY` execution property will be removed.
            udf_utils.add_user_module_dependency(
                self, standard_component_specs.MODULE_FILE_KEY,
                standard_component_specs.MODULE_PATH_KEY)