Example #1
class PTransform(
        tfx_namedtuple.namedtuple(
            'PTransform', ['ptransform', 'output_tensor_info_list', 'label']),
        AnalyzerDef):
    """(Experimental) OperationDef for PTransform anaylzer.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._ptransform_impl`.

  Fields:
    ptransform: The `beam.PTransform` to be applied to the inputs.
    output_tensor_info_list: A list of `TensorInfo`s that defines the outputs of
        this `PTransform`.
    label: A unique label for this operation.
  """
    def __new__(cls, ptransform, output_tensor_info_list):
        return super(PTransform, cls).__new__(
            cls,
            ptransform=ptransform,
            output_tensor_info_list=output_tensor_info_list,
            label=_make_label(cls))

    @property
    def output_tensor_infos(self):
        return self.output_tensor_info_list
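Several of these examples call a module-level helper `_make_label` that is not shown. Examples #11 and #13 below inline what appears to be the same logic, so a minimal sketch of the helper, under that assumption:

import tensorflow as tf

def _make_label(cls, label=None):
    # Assumed behavior, inferred from the expanded variants in Examples #11
    # and #13: default the label to 'ClassName[current_tf_name_scope]'.
    if label is None:
        scope = tf.compat.v1.get_default_graph().get_name_scope()
        label = '{}[{}]'.format(cls.__name__, scope)
    return label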
Example #2
class ValueNode(
        tfx_namedtuple.namedtuple('ValueNode',
                                  ['parent_operation', 'value_index'])):
    """A placeholder that will ultimately be translated to a PCollection.

  Attributes:
    parent_operation: The `OperationNode` that produces this value.
    value_index: The index of this value in the outputs of `parent_operation`.
  """
    __slots__ = ()

    def __init__(self, parent_operation, value_index: int):
        if not isinstance(parent_operation, OperationNode):
            raise TypeError(
                'parent_operation must be an OperationNode, got {} of type {}'.
                format(parent_operation, type(parent_operation)))
        num_outputs = parent_operation.operation_def.num_outputs
        if not (0 <= value_index < num_outputs):
            raise ValueError(
                'value_index was {} but parent_operation had {} outputs'.
                format(value_index, num_outputs))
        super().__init__()

    def __iter__(self):
        raise ValueError('ValueNode is not iterable')
Example #3
class TensorInfo(
        tfx_namedtuple.namedtuple('TensorInfo',
                                  ['dtype', 'shape', 'temporary_asset_info'])):
    """A container for attributes of output tensors from analyzers.

  Fields:
    dtype: The TensorFlow dtype.
    shape: The shape of the tensor.
    temporary_asset_info: A named tuple containing information about the
      temporary asset file to write out while tracing the TF graph.
  """
    def __new__(
            cls: Type['TensorInfo'], dtype: tf.dtypes.DType,
            shape: Sequence[Optional[int]],
            temporary_asset_info: Optional[TemporaryAssetInfo]
    ) -> 'TensorInfo':
        if not isinstance(dtype, tf.DType):
            raise TypeError(
                'dtype must be a TensorFlow dtype, got {}'.format(dtype))
        if temporary_asset_info is not None and not isinstance(
                temporary_asset_info, TemporaryAssetInfo):
            raise TypeError(
                'temporary_asset_info should be an instance of TemporaryAssetInfo or '
                f'None, got {temporary_asset_info}')
        return super(TensorInfo,
                     cls).__new__(cls,
                                  dtype=dtype,
                                  shape=shape,
                                  temporary_asset_info=temporary_asset_info)
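A brief usage sketch (illustrative only; constructing a `TemporaryAssetInfo` is omitted since its fields are not shown here):

import tensorflow as tf

# A scalar float output and a rank-1 string output, neither backed by a
# temporary asset file.
scalar_info = TensorInfo(tf.float32, (), None)
vocab_info = TensorInfo(tf.string, (None,), None)

# Anything that is not a tf.DType is rejected:
# TensorInfo('float32', (), None)  -> TypeError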
Example #4
class DatasetKey(tfx_namedtuple.namedtuple('DatasetKey', ['key'])):
  """A key for a dataset used for analysis."""
  _FLATTENED_DATASET_KEY = object()

  def __new__(cls, dataset_key):
    if dataset_key is not DatasetKey._FLATTENED_DATASET_KEY:
      dataset_key = _make_valid_cache_component(dataset_key)
    return super(DatasetKey, cls).__new__(cls, key=dataset_key)

  def __str__(self):
    if self.is_flattened_dataset_key():
      return str(DatasetKey('FlattenedDataset'))
    else:
      return super(DatasetKey, self).__str__()

  def __hash__(self):
    return hash(self.key)

  def __eq__(self, other):
    if self.key == other:
      return True
    return isinstance(other, DatasetKey) and self.key == other.key

  def is_flattened_dataset_key(self):
    return self.key == self._FLATTENED_DATASET_KEY
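A sketch of the equality and hashing behavior, assuming `_make_valid_cache_component` (not shown here) leaves a plain alphanumeric key unchanged:

key = DatasetKey('train')
assert key == 'train'              # __eq__ first compares against the raw key.
assert key == DatasetKey('train')  # ...then falls back to comparing fields.
assert hash(key) == hash('train')  # __hash__ delegates to the wrapped key.
assert not key.is_flattened_dataset_key()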
Example #5
class CacheableCombinePerKeyFormatKeys(
        tfx_namedtuple.namedtuple('CacheableCombinePerKeyFormatKeys',
                                  ['combiner', 'label']), AnalyzerDef):
    """An analyzer that formats output for the non-stored per-key case.

  This analyzer converts the (key, output) pairs into a tuple of keys (of type
  string) and outputs.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._CombinePerKeyFormatKeysImpl`

  Fields:
    combiner: The Combiner to use for extracting outputs.
    label: A unique label for this operation.
  """
    __slots__ = ()

    def __new__(cls, combiner):
        return super(CacheableCombinePerKeyFormatKeys,
                     cls).__new__(cls,
                                  combiner=combiner,
                                  label=_make_label(cls))

    @property
    def output_tensor_infos(self):
        # Returns a key vocab and one output per combiner output.
        return [TensorInfo(tf.string, (None, ), None)] + [
            TensorInfo(info.dtype,
                       (None, ) + info.shape, info.temporary_asset_info)
            for info in self.combiner.output_tensor_infos()
        ]
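To see the shape handling concretely, here is a sketch with a hypothetical stand-in combiner that reports a single scalar float output; `output_tensor_infos` prepends a string key vocabulary and adds a leading key dimension to each combiner output:

import tensorflow as tf

class _FakeCombiner:
    # Hypothetical stand-in: a single scalar float output.
    def output_tensor_infos(self):
        return [TensorInfo(tf.float32, (), None)]

analyzer = CacheableCombinePerKeyFormatKeys(_FakeCombiner())
# analyzer.output_tensor_infos ->
#   [TensorInfo(tf.string, (None,), None),   # the key vocabulary
#    TensorInfo(tf.float32, (None,), None)]  # one value per key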
Example #6
class PackedCombineAccumulate(
        tfx_namedtuple.namedtuple('PackedCombineAccumulate',
                                  ['combiners', 'label']), nodes.OperationDef):
    """An analyzer that packs a list of combiners into a single beam CombineFn.

  Fields:
    combiners:  A list of `analysis_graph_builder._CombinerOpWrapper` objects.
    label: A unique label for this operation.
  """
    __slots__ = ()

    def __new__(cls, combiners, label):
        return super(PackedCombineAccumulate,
                     cls).__new__(cls,
                                  combiners=combiners,
                                  label=_make_label(cls, label))

    @property
    def num_outputs(self):
        return 1

    # Note that this will not have any effect as packing of combiners is done
    # after the caching optimization.
    @property
    def is_partitionable(self):
        return True
Example #7
class ExtractCombineMergeOutputs(
        tfx_namedtuple.namedtuple('ExtractOutputs',
                                  ['output_tensor_info_list', 'label']),
        AnalyzerDef):
    """An operation that represents extracting outputs of a combine merge.

  This operation represents a `beam.Map` that is applied to a PCollection.
  For each element of the PCollection, the corresponding element of the output
  PCollection is a tuple of outputs.

  Attributes:
    output_tensor_info_list: A list of `TensorInfo`s that defines the outputs of
      this operation.
    label: A unique label for this operation.
  """
    __slots__ = ()

    def __new__(cls, output_tensor_info_list):
        return super(ExtractCombineMergeOutputs, cls).__new__(
            cls,
            output_tensor_info_list=output_tensor_info_list,
            label=_make_label(cls))

    @property
    def output_tensor_infos(self):
        return self.output_tensor_info_list
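The `beam.Map` this operation stands for is roughly the following sketch; `combiner` and `_extract_merge_outputs` are hypothetical names, and the registered implementation lives in `tensorflow_transform.beam`:

import apache_beam as beam

def _extract_merge_outputs(pcoll, combiner):
    # One tuple of final outputs per merged accumulator.
    return pcoll | 'ExtractOutputs' >> beam.Map(
        lambda accumulator: tuple(combiner.extract_output(accumulator)))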
Example #8
class CacheableCombineAccumulate(
        tfx_namedtuple.namedtuple('CacheableCombineAccumulate',
                                  ['combiner', 'label']), nodes.OperationDef):
    """An analyzer that runs a beam CombineFn to accumulate without merging.

  This analyzer reduces the values that it accepts as inputs, using the
  provided `Combiner`.  The `Combiner` is applied to the data by wrapping it as
  a `beam.CombineFn` and applying `beam.Combine`.

  Fields:
    combiner: The Combiner to be applied to the inputs.
    label: A unique label for this operation.
  """
    def __new__(cls, combiner):
        return super(CacheableCombineAccumulate,
                     cls).__new__(cls,
                                  combiner=combiner,
                                  label=_make_label(cls))

    @property
    def num_outputs(self):
        return 1

    @property
    def is_partitionable(self):
        return True

    @property
    def cache_coder(self):
        return self.combiner.accumulator_coder
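The docstring's "wrapping it as a `beam.CombineFn`" amounts to straightforward delegation. A minimal sketch under that reading (the real wrapper in `analyzer_impls` handles more details); note the accumulate-only analyzer emits the accumulator itself, since merging happens in a later operation:

import apache_beam as beam

class _AccumulateOnlyCombineFn(beam.CombineFn):
    """Hypothetical wrapper delegating the CombineFn protocol to a Combiner."""

    def __init__(self, combiner):
        self._combiner = combiner

    def create_accumulator(self):
        return self._combiner.create_accumulator()

    def add_input(self, accumulator, batch_values):
        return self._combiner.add_input(accumulator, batch_values)

    def merge_accumulators(self, accumulators):
        return self._combiner.merge_accumulators(accumulators)

    def extract_output(self, accumulator):
        # Emit the accumulator unchanged; a downstream merge operation
        # produces the final outputs.
        return accumulator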
Example #9
class VocabularyAccumulate(
        tfx_namedtuple.namedtuple(
            'VocabularyAccumulate',
            ['vocab_ordering_type', 'input_dtype', 'label']),
        nodes.OperationDef):
    """An operation that accumulates unique words with their frequency or weight.

  This operation is implemented by
  `tensorflow_transform.beam.analyzer_impls._VocabularyAccumulateImpl`.
  """
    def __new__(cls, vocab_ordering_type, input_dtype=tf.string.name):
        return super(VocabularyAccumulate,
                     cls).__new__(cls,
                                  vocab_ordering_type=vocab_ordering_type,
                                  input_dtype=input_dtype,
                                  label=_make_label(cls))

    @property
    def num_outputs(self):
        return 1

    @property
    def is_partitionable(self):
        return True

    @property
    def cache_coder(self):
        return _VocabularyAccumulatorCoder(input_dtype=self.input_dtype)
Example #10
class TensorInfo(
        tfx_namedtuple.namedtuple('TensorInfo',
                                  ['dtype', 'shape', 'temporary_asset_value'])
):
    """A container for attributes of output tensors from analyzers.

  Fields:
    dtype: The TensorFlow dtype.
    shape: The shape of the tensor.
    temporary_asset_value: A temporary value to write to an asset file while
      tracing the TF graph.
  """
    def __new__(cls, dtype, shape, temporary_asset_value):
        if not isinstance(dtype, tf.DType):
            raise TypeError(
                'dtype must be a TensorFlow dtype, got {}'.format(dtype))
        if temporary_asset_value is not None and not isinstance(
                temporary_asset_value, bytes):
            raise TypeError(
                'temporary_asset_value should be bytes or None, got {}'.format(
                    temporary_asset_value))
        return super(TensorInfo,
                     cls).__new__(cls,
                                  dtype=dtype,
                                  shape=shape,
                                  temporary_asset_value=temporary_asset_value)
Example #11
class CacheableCombinePerKeyFormatKeys(
        tfx_namedtuple.namedtuple('CacheableCombinePerKeyFormatKeys',
                                  ['combiner', 'label']), AnalyzerDef):
    """An analyzer that formats output for the non-stored per-key case.

  This analyzer converts the (key, output) pairs into a tuple of keys (of type
  string) and outputs.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._CombinePerKeyFormatKeysImpl`

  Fields:
    combiner: The Combiner to use for extracting outputs.
    label: A unique label for this operation.
  """
    def __new__(cls, combiner, label=None):
        if label is None:
            scope = tf.compat.v1.get_default_graph().get_name_scope()
            label = '{}[{}]'.format(cls.__name__, scope)
        return super(CacheableCombinePerKeyFormatKeys,
                     cls).__new__(cls, combiner=combiner, label=label)

    @property
    def output_tensor_infos(self):
        # Returns a key vocab and one output per combiner output.
        return [TensorInfo(tf.string, (None, ), None)] + [
            TensorInfo(info.dtype,
                       (None, ) + info.shape, info.temporary_asset_value)
            for info in self.combiner.output_tensor_infos()
        ]
Example #12
class TensorSource(
        tfx_namedtuple.namedtuple('TensorSource', ['tensors', 'label']),
        nodes.OperationDef):
    """An `OperationDef` that defines extracting a tuple of tensor values.

  This `OperationDef` defines an operation that extracts the values of the given
  tensors into a PCollection of tuples of values.  It is used as a source for
  analyzers, which further transform this data.

  This OperationDef accepts zero inputs and returns a single output representing
  the PCollection of tuples of values.  It will be converted in
  tensorflow_transform.beam.analysis_graph_builder.build to an operation that
  extracts the tensors from a dictionary of tensors, after running a beam.ParDo
  to produce tensor values by running the graph on its inputs.

  Fields:
    tensors: The tensors whose values should be extracted.
    label: A unique label for this operation.
  """
    def __new__(cls, tensors):
        for tensor in tensors:
            if not isinstance(tensor, tf.Tensor):
                raise TypeError(
                    'tensor must be a Tensor, got {} of type {}'.format(
                        tensor, type(tensor)))
        return super(TensorSource, cls).__new__(cls,
                                                tensors=tensors,
                                                label=_make_label(cls))
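A quick usage sketch; in TF2, eager tensors are `tf.Tensor` instances, so constants pass the type check:

import tensorflow as tf

x = tf.constant([1.0, 2.0])
source = TensorSource([x])
# source.label is generated by _make_label.

# Non-Tensor inputs are rejected:
# TensorSource([[1.0, 2.0]])  -> TypeError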
Example #13
class PackedCombineAccumulate(
        tfx_namedtuple.namedtuple('PackedCombineAccumulate',
                                  ['combiners', 'label']), nodes.OperationDef):
    """An analyzer that packs a list of combiners into a single beam CombineFn.

  Fields:
    combiners:  A list of `analysis_graph_builder._CombinerOpWrapper` objects.
    label: A unique label for this operation.
  """
    def __new__(cls, combiners, label=None):
        if label is None:
            scope = tf.compat.v1.get_default_graph().get_name_scope()
            label = '{}[{}]'.format(cls.__name__, scope)
        return super(PackedCombineAccumulate, cls).__new__(cls,
                                                           combiners=combiners,
                                                           label=label)

    @property
    def num_outputs(self):
        return 1

    # Note that this will not have any effect as packing of combiners is done
    # after the caching optimization.
    @property
    def is_partitionable(self):
        return True
Example #14
class CacheableCombinePerKeyAccumulate(
        tfx_namedtuple.namedtuple('CacheableCombinePerKeyAccumulate',
                                  ['combiner', 'label']), AnalyzerDef):
    """An analyzer that runs `beam.CombinePerKey` to accumulate without merging.

  This analyzer reduces the values that it accepts as inputs, using the
  provided `Combiner`.  The `Combiner` is applied to the data by wrapping it as
  a `beam.CombineFn` and applying `beam.CombinePerKey`.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._IntermediateAccumulateCombineImpl`.

  Fields:
    combiner: The Combiner to be applied to the inputs.
    label: A unique label for this operation.
  """
    def __new__(cls, combiner, label=None):
        if label is None:
            scope = tf.compat.v1.get_default_graph().get_name_scope()
            label = '{}[{}]'.format(cls.__name__, scope)
        return super(CacheableCombinePerKeyAccumulate,
                     cls).__new__(cls, combiner=combiner, label=label)

    @property
    def num_outputs(self):
        return 1

    @property
    def is_partitionable(self):
        return True

    @property
    def cache_coder(self):
        return _CombinerPerKeyAccumulatorCoder(self.combiner.accumulator_coder)
Example #15
class VocabularyMerge(
        tfx_namedtuple.namedtuple('VocabularyMerge', [
            'vocab_ordering_type', 'use_adjusted_mutual_info',
            'min_diff_from_avg', 'label'
        ]), nodes.OperationDef):
    """An operation that merges the accumulators produced by VocabularyAccumulate.

  This operation operates on the output of VocabularyAccumulate and is
  implemented by `tensorflow_transform.beam.analyzer_impls._VocabularyMergeImpl`.

  See `tft.vocabulary` for a description of the parameters.
  """
    def __new__(cls, vocab_ordering_type, use_adjusted_mutual_info,
                min_diff_from_avg):
        return super(VocabularyMerge, cls).__new__(
            cls,
            vocab_ordering_type=vocab_ordering_type,
            use_adjusted_mutual_info=use_adjusted_mutual_info,
            min_diff_from_avg=min_diff_from_avg,
            label=_make_label(cls))

    @property
    def num_outputs(self):
        return 1
Example #16
class ExtractCombineMergeOutputs(
        tfx_namedtuple.namedtuple('ExtractOutputs',
                                  ['output_tensor_info_list', 'label']),
        AnalyzerDef):
    """An operation that represents extracting outputs of a combine merge.

  This operation represents a `beam.Map` that is applied to a PCollection.
  For each element of the PCollection, the corresponding element of the output
  PCollection is a tuple of outputs.

  Attributes:
    output_tensor_info_list: A list of `TensorInfo`s that defines the outputs of
      this operation.
    label: A unique label for this operation.
  """
    def __new__(cls, output_tensor_info_list, label=None):
        if label is None:
            scope = tf.compat.v1.get_default_graph().get_name_scope()
            label = '{}[{}]'.format(cls.__name__, scope)
        return super(ExtractCombineMergeOutputs, cls).__new__(
            cls, output_tensor_info_list=output_tensor_info_list, label=label)

    @property
    def output_tensor_infos(self):
        return self.output_tensor_info_list
Example #17
class ScaleAndFlattenPerKeyBucketBouandaries(
        tfx_namedtuple.namedtuple('PostProcessPerKeyBucketBoundaries',
                                  ['output_tensor_dtype', 'label']),
        AnalyzerDef):
    """An analyzer which takes quantile boundaries per key and combines them.

  It receives a 2-d array of boundaries, computes a scale and shift for each
  row separately, produces a new 1-d boundaries array that combines the
  boundaries for all keys, and reports the number of buckets defined for each
  key.

  This outputs boundaries, scale_factor_per_key, shift_per_key, num_buckets.

  For example, for an input boundaries matrix, [[0, 1, 2], [0, 1, 2]] it will
  return:
  boundaries: [0, 0.5, 1, 1.5, 2]
  scale_factor_per_key: [0.5, 0.5]
  shift_per_key: [0, 1]
  num_buckets: 4

  So the transformation of each input x before computing its bucket should be:
  F(x, key) = x * scale_factor_per_key[key] + shift_per_key[key]
  """
    def __new__(cls, output_tensor_dtype):
        return super(ScaleAndFlattenPerKeyBucketBouandaries,
                     cls).__new__(cls,
                                  output_tensor_dtype=output_tensor_dtype,
                                  label=_make_label(cls))

    @property
    def output_tensor_infos(self):
        # Boundaries, scale_factor_per_key, shift_per_key, num_buckets.
        return [TensorInfo(self.output_tensor_dtype, (None, ), None)] * 3 + [
            TensorInfo(tf.int64, (), None)
        ]
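Checking the docstring's example with plain Python arithmetic:

scale_factor_per_key = [0.5, 0.5]
shift_per_key = [0.0, 1.0]

def transform(x, key):
    # F(x, key) = x * scale_factor_per_key[key] + shift_per_key[key]
    return x * scale_factor_per_key[key] + shift_per_key[key]

# Key 0's boundaries [0, 1, 2] map onto [0, 0.5, 1]; key 1's map onto
# [1, 1.5, 2].  Their union is the flattened [0, 0.5, 1, 1.5, 2], which
# yields num_buckets = 4.
assert [transform(x, 0) for x in [0, 1, 2]] == [0.0, 0.5, 1.0]
assert [transform(x, 1) for x in [0, 1, 2]] == [1.0, 1.5, 2.0]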
Example #18
class _State(
        tfx_namedtuple.namedtuple('_State', [
            'temp_dir',
            'evaluated_replacements',
        ])):
    """A named tuple storing state passed to this context manager."""
    @classmethod
    def make_empty(cls):
        """Return `_State` object with all fields set to `None`."""
        return cls(*(None, ) * len(cls._fields))
Example #19
class BeamDatasetMetadata(
        tfx_namedtuple.namedtuple('BeamDatasetMetadata',
                                  ['dataset_metadata', 'deferred_metadata'])):
    """A class like DatasetMetadata that also holds a dict of `PCollection`s.

  `deferred_metadata` is a PCollection containing a single DatasetMetadata.
  """
    @property
    def schema(self):
        return self.dataset_metadata.schema
Example #20
class ConstructBeamPipelineVisitor(nodes.Visitor):
    """Visitor that constructs the beam pipeline from the node graph."""

    ExtraArgs = tfx_namedtuple.namedtuple(  # pylint: disable=invalid-name
        'ExtraArgs', [
            'base_temp_dir',
            'pipeline',
            'flat_pcollection',
            'pcollection_dict',
            'tf_config',
            'graph',
            'input_signature',
            'input_specs',
            'input_tensor_adapter_config',
            'use_tf_compat_v1',
            'cache_pcoll_dict',
            'preprocessing_fn',
        ])

    def __init__(self, extra_args):
        self._extra_args = extra_args

    def visit(self, operation, inputs):
        try:
            ptransform_wrapper = (
                _PTRANSFORM_BY_OPERATION_DEF_SUBCLASS[operation.__class__])
            environment_tag = (EnvironmentTags.TF_COMPAT_V1
                               if self._extra_args.use_tf_compat_v1 else
                               EnvironmentTags.TF_V2_ONLY)
            ptransform, tag = ptransform_wrapper.get_ptransform(
                environment_tag)
        except KeyError:
            raise ValueError(
                'No implementation for {} was registered'.format(operation))

        # TODO(zoyahav): Consider extracting a single PCollection before passing to
        # ptransform if len(inputs) == 1.
        if tag is None:
            tagged_label = operation.label
        else:
            tagged_label = '{label}[{tag}]'.format(label=operation.label,
                                                   tag=tag)
        outputs = ((inputs or beam.pvalue.PBegin(self._extra_args.pipeline))
                   | tagged_label >> ptransform(operation, self._extra_args))

        if isinstance(outputs, beam.pvalue.PCollection):
            return (outputs, )
        else:
            return outputs

    def validate_value(self, value):
        if not isinstance(value, beam.pvalue.PCollection):
            raise TypeError('Expected a PCollection, got {} of type {}'.format(
                value, type(value)))
Example #21
class EncodeCache(tfx_namedtuple.namedtuple('EncodeCache', ['coder', 'label']),
                  nodes.OperationDef):
    """OperationDef for encoding a cache instance.

  Fields:
    coder: An instance of CacheCoder used to encode cache.
    label: A unique label for this operation.
  """
    @property
    def is_partitionable(self):
        return True
Example #22
class BeamDatasetMetadata(
        tfx_namedtuple.namedtuple(
            'BeamDatasetMetadata',
            ['dataset_metadata', 'deferred_metadata', 'asset_map'])):
    """A class like DatasetMetadata also holding `PCollection`s and an asset_map.

  `deferred_metadata` is a PCollection containing a single DatasetMetadata.
  `asset_map` is a Dictionary mapping asset keys to filenames.
  """
    @property
    def schema(self):
        return self.dataset_metadata.schema
Example #23
class FlattenLists(tfx_namedtuple.namedtuple('FlattenLists', ['label']),
                   nodes.OperationDef):
    """An operation that represents flattening a PCollection of lists.

  Attributes:
    label: A unique label for this operation.
  """
    def __new__(cls):
        return super(FlattenLists, cls).__new__(cls, label=_make_label(cls))

    @property
    def is_partitionable(self):
        return True
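A sketch of the flattening this operation represents (the registered implementation lives in `tensorflow_transform.beam`; `_flatten_lists` is a hypothetical name):

import apache_beam as beam

def _flatten_lists(pcoll):
    # One output element per item of each input list.
    return pcoll | 'FlattenLists' >> beam.FlatMap(lambda elements: elements)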
Example #24
class _State(
        tfx_namedtuple.namedtuple('_State', [
            'temp_dir',
            'desired_batch_size',
            'passthrough_keys',
            'use_deep_copy_optimization',
            'force_tf_compat_v1',
        ])):
    """A named tuple to store attributes of `Context`."""
    @classmethod
    def make_empty(cls):
        """Return `_State` object with all fields set to `None`."""
        return cls(*(None, ) * len(cls._fields))
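A brief usage sketch: `make_empty` splats a `(None, ...)` tuple of the right length, so every field comes back `None`:

state = _State.make_empty()
assert state.temp_dir is None
assert state.force_tf_compat_v1 is None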
Example #25
class ExtractInputForSavedModel(
    tfx_namedtuple.namedtuple('ExtractInputForSavedModel',
                              ['dataset_key', 'label']), nodes.OperationDef):
  """An operation that forwards the requested dataset in PCollection form.

  The resulting PCollection is either the dataset corresponding to
  `dataset_key`, or a flattened PCollection if `dataset_key` is not specified.

  Attributes:
    dataset_key: (Optional) dataset key str.
    label: A unique label for this operation.
  """
  pass
Example #26
class CreateSavedModel(
        tfx_namedtuple.namedtuple(
            'CreateSavedModel',
            ['table_initializers', 'output_signature', 'label']),
        nodes.OperationDef):
    """An operation that represents creating a SavedModel with bound values.

  This operation represents creating a SavedModel.  Its output is a
  PCollection containing a single element which is the directory containing the
  `SavedModel`.  The inputs are a PCollection of tensor bindings.  A tensor
  binding is the specification of a tensor and a value that it should be
  replaced with in the graph.

  This allows us to create a `SavedModel` in a deferred manner, which depends on
  deferred values (the tensor bindings) which were not known when the Beam graph
  was constructed.


  Attributes:
    table_initializers: A list of table initializer ops that should be run as
        part of this SavedModel.
    output_signature: The output signature of this `SavedModel`, as a dictionary
        whose keys are feature names and values are `Tensor`s or
        `SparseTensor`s.
    label: A unique label for this operation.
  """
    __slots__ = ()

    def _get_tensor_type_name(self, tensor):
        if isinstance(tensor, tf.Tensor):
            return 'Tensor'
        elif isinstance(tensor, tf.SparseTensor):
            return 'SparseTensor'
        raise ValueError('Got a {}, expected a Tensor or SparseTensor'.format(
            type(tensor)))

    def get_field_str(self, field_name):
        # Overriding the str representation of table initializers since it may be
        # different for various versions of TF.
        if field_name == 'table_initializers':
            return '{}'.format(len(self.table_initializers))
        elif field_name == 'output_signature':
            copied = self.output_signature.copy()
            for key in copied:
                value = self.output_signature[key]
                copied[key] = '{}<shape: {}, {}>'.format(
                    self._get_tensor_type_name(value), value.shape.as_list(),
                    value.dtype)
            return str(copied)
        return super().get_field_str(field_name)
Example #27
class InstrumentDatasetCache(
        tfx_namedtuple.namedtuple('InstrumentDatasetCache',
                                  ['dataset_key', 'label']),
        nodes.OperationDef):
    """OperationDef instrumenting cached datasets.

  Fields:
    dataset_key: A dataset key.
    label: A unique label for this operation.
  """
    __slots__ = ()

    @property
    def is_partitionable(self):
        return True
Example #28
class VocabularyCount(tfx_namedtuple.namedtuple('VocabularyCount', ['label']),
                      nodes.OperationDef):
    """An operation counts the total number of tokens in a vocabulary.

  This operation takes in the output of VocabularyAccumulate and is implemented
  by `tensorflow_transform.beam.analyzer_impls._VocabularyCountImpl`.

  The output of this operation is a singleton Integer.
  """
    def __new__(cls):
        return super(VocabularyCount, cls).__new__(cls, label=_make_label(cls))

    @property
    def num_outputs(self):
        return 1
Example #29
class AddKey(tfx_namedtuple.namedtuple('AddKey', ['key', 'label']),
             nodes.OperationDef):
    """An operation that represents adding a key to a value.

  This operation represents a `beam.Map` that is applied to a PCollection.
  For each element of the PCollection, the corresponding element of the output
  PCollection is a tuple of (key, value).

  Attributes:
    key: The key which should be added to each element of the input PCollection.
    label: A unique label for this operation.
  """
    @property
    def is_partitionable(self):
        return True
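A sketch of the `beam.Map` this operation represents (`_add_key` is a hypothetical name; the registered implementation lives in `tensorflow_transform.beam`):

import apache_beam as beam

def _add_key(pcoll, key):
    # Pair each element with the operation's fixed key.
    return pcoll | 'AddKey' >> beam.Map(lambda value: (key, value))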
Example #30
class CacheableCombinePerKeyFormatLarge(
        tfx_namedtuple.namedtuple('CacheableCombinePerKeyFormatLarge',
                                  ['label']), nodes.OperationDef):
    """An analyzer that formats output prior to writing to file for per-key case.

  This operation operates on the output of CacheableCombinePerKeyAccumulate and
  is implemented by
  `tensorflow_transform.beam.analyzer_impls._CombinePerKeyFormatLargeImpl`.
  """
    def __new__(cls):
        return super(CacheableCombinePerKeyFormatLarge,
                     cls).__new__(cls, label=_make_label(cls))

    @property
    def num_outputs(self):
        return 1