class PTransform(
    tfx_namedtuple.namedtuple(
        'PTransform', ['ptransform', 'output_tensor_info_list', 'label']),
    AnalyzerDef):
  """(Experimental) OperationDef for PTransform analyzer.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._ptransform_impl`.

  Fields:
    ptransform: The `beam.PTransform` to be applied to the inputs.
    output_tensor_info_list: A list of `TensorInfo`s that defines the outputs
      of this `PTransform`.
    label: A unique label for this operation.
  """

  def __new__(cls, ptransform, output_tensor_info_list):
    return super(PTransform, cls).__new__(
        cls,
        ptransform=ptransform,
        output_tensor_info_list=output_tensor_info_list,
        label=_make_label(cls))

  @property
  def output_tensor_infos(self):
    return self.output_tensor_info_list

class ValueNode(
    tfx_namedtuple.namedtuple('ValueNode',
                              ['parent_operation', 'value_index'])):
  """A placeholder that will ultimately be translated to a PCollection.

  Attributes:
    parent_operation: The `OperationNode` that produces this value.
    value_index: The index of this value in the outputs of `parent_operation`.
  """
  __slots__ = ()

  def __init__(self, parent_operation, value_index: int):
    if not isinstance(parent_operation, OperationNode):
      raise TypeError(
          'parent_operation must be an OperationNode, got {} of type {}'.format(
              parent_operation, type(parent_operation)))
    num_outputs = parent_operation.operation_def.num_outputs
    if not 0 <= value_index < num_outputs:
      raise ValueError(
          'value_index was {} but parent_operation had {} outputs'.format(
              value_index, num_outputs))
    super().__init__()

  def __iter__(self):
    raise ValueError('ValueNode is not iterable')

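# Illustrative note (not part of the original module): an operation's outputs
# are addressed positionally, so for an OperationNode `op` whose operation_def
# reports num_outputs == 2, the only valid value nodes are ValueNode(op, 0)
# and ValueNode(op, 1); ValueNode(op, 2) raises ValueError, and passing a
# non-OperationNode parent raises TypeError.
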
class TensorInfo(
    tfx_namedtuple.namedtuple('TensorInfo',
                              ['dtype', 'shape', 'temporary_asset_info'])):
  """A container for attributes of output tensors from analyzers.

  Fields:
    dtype: The TensorFlow dtype.
    shape: The shape of the tensor.
    temporary_asset_info: A named tuple containing information about the
      temporary asset file to write out while tracing the TF graph.
  """

  def __new__(
      cls: Type['TensorInfo'], dtype: tf.dtypes.DType,
      shape: Sequence[Optional[int]],
      temporary_asset_info: Optional[TemporaryAssetInfo]) -> 'TensorInfo':
    if not isinstance(dtype, tf.DType):
      raise TypeError('dtype must be a TensorFlow dtype, got {}'.format(dtype))
    if temporary_asset_info is not None and not isinstance(
        temporary_asset_info, TemporaryAssetInfo):
      raise TypeError(
          'temporary_asset_info should be an instance of TemporaryAssetInfo '
          f'or None, got {temporary_asset_info}')
    return super(TensorInfo, cls).__new__(
        cls,
        dtype=dtype,
        shape=shape,
        temporary_asset_info=temporary_asset_info)

class DatasetKey(tfx_namedtuple.namedtuple('DatasetKey', ['key'])):
  """A key for a dataset used for analysis."""
  _FLATTENED_DATASET_KEY = object()

  def __new__(cls, dataset_key):
    if dataset_key is not DatasetKey._FLATTENED_DATASET_KEY:
      dataset_key = _make_valid_cache_component(dataset_key)
    return super(DatasetKey, cls).__new__(cls, key=dataset_key)

  def __str__(self):
    if self.is_flattened_dataset_key():
      return str(DatasetKey('FlattenedDataset'))
    else:
      return super(DatasetKey, self).__str__()

  def __hash__(self):
    return hash(self.key)

  def __eq__(self, other):
    if self.key == other:
      return True
    return isinstance(other, DatasetKey) and self.key == other.key

  def is_flattened_dataset_key(self):
    return self.key == self._FLATTENED_DATASET_KEY

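# Illustrative sketch (hypothetical helper, not part of the original module):
# demonstrates the equality, hashing, and sentinel behavior of DatasetKey
# described above.
def _dataset_key_usage_example():
  train_key = DatasetKey('train')
  # Keys built from the same raw string compare (and hash) equal, since both
  # pass through the same _make_valid_cache_component normalization.
  assert train_key == DatasetKey('train')
  assert hash(train_key) == hash(DatasetKey('train'))
  # Ordinary keys are distinct from the flattened-dataset sentinel.
  assert not train_key.is_flattened_dataset_key()
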
class CacheableCombinePerKeyFormatKeys(
    tfx_namedtuple.namedtuple('CacheableCombinePerKeyFormatKeys',
                              ['combiner', 'label']), AnalyzerDef):
  """An analyzer that formats output for the non-stored per-key case.

  This analyzer converts the (key, output) pairs into a tuple of keys (of type
  string) and outputs.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._CombinePerKeyFormatKeysImpl`.

  Fields:
    combiner: The Combiner to use for extracting outputs.
    label: A unique label for this operation.
  """
  __slots__ = ()

  def __new__(cls, combiner):
    return super(CacheableCombinePerKeyFormatKeys, cls).__new__(
        cls, combiner=combiner, label=_make_label(cls))

  @property
  def output_tensor_infos(self):
    # Returns a key vocab and one output per combiner output.
    return [TensorInfo(tf.string, (None,), None)] + [
        TensorInfo(info.dtype, (None,) + info.shape,
                   info.temporary_asset_info)
        for info in self.combiner.output_tensor_infos()
    ]

class PackedCombineAccumulate(
    tfx_namedtuple.namedtuple('PackedCombineAccumulate',
                              ['combiners', 'label']), nodes.OperationDef):
  """An analyzer that packs a list of combiners into a single beam CombineFn.

  Fields:
    combiners: A list of `analysis_graph_builder._CombinerOpWrapper` objects.
    label: A unique label for this operation.
  """
  __slots__ = ()

  def __new__(cls, combiners, label):
    return super(PackedCombineAccumulate, cls).__new__(
        cls, combiners=combiners, label=_make_label(cls, label))

  @property
  def num_outputs(self):
    return 1

  # Note that this will not have any effect as packing of combiners is done
  # after the caching optimization.
  @property
  def is_partitionable(self):
    return True

class ExtractCombineMergeOutputs(
    tfx_namedtuple.namedtuple('ExtractOutputs',
                              ['output_tensor_info_list', 'label']),
    AnalyzerDef):
  """An operation that represents extracting outputs of a combine merge.

  This operation represents a `beam.Map` that is applied to a PCollection.
  For each element of the PCollection, the corresponding element of the output
  PCollection is a tuple of outputs.

  Attributes:
    output_tensor_info_list: A list of `TensorInfo`s that defines the outputs
      of this operation.
    label: A unique label for this operation.
  """
  __slots__ = ()

  def __new__(cls, output_tensor_info_list):
    return super(ExtractCombineMergeOutputs, cls).__new__(
        cls,
        output_tensor_info_list=output_tensor_info_list,
        label=_make_label(cls))

  @property
  def output_tensor_infos(self):
    return self.output_tensor_info_list

class CacheableCombineAccumulate(
    tfx_namedtuple.namedtuple('CacheableCombineAccumulate',
                              ['combiner', 'label']), nodes.OperationDef):
  """An analyzer that runs a beam CombineFn to accumulate without merging.

  This analyzer reduces the values that it accepts as inputs, using the
  provided `Combiner`.  The `Combiner` is applied to the data by wrapping it as
  a `beam.CombineFn` and applying `beam.Combine`.

  Fields:
    combiner: The Combiner to be applied to the inputs.
    label: A unique label for this operation.
  """

  def __new__(cls, combiner):
    return super(CacheableCombineAccumulate, cls).__new__(
        cls, combiner=combiner, label=_make_label(cls))

  @property
  def num_outputs(self):
    return 1

  @property
  def is_partitionable(self):
    return True

  @property
  def cache_coder(self):
    return self.combiner.accumulator_coder

class VocabularyAccumulate(
    tfx_namedtuple.namedtuple(
        'VocabularyAccumulate',
        ['vocab_ordering_type', 'input_dtype', 'label']),
    nodes.OperationDef):
  """An operation that accumulates unique words with their frequency or weight.

  This operation is implemented by
  `tensorflow_transform.beam.analyzer_impls._VocabularyAccumulateImpl`.
  """

  def __new__(cls, vocab_ordering_type, input_dtype=tf.string.name):
    return super(VocabularyAccumulate, cls).__new__(
        cls,
        vocab_ordering_type=vocab_ordering_type,
        input_dtype=input_dtype,
        label=_make_label(cls))

  @property
  def num_outputs(self):
    return 1

  @property
  def is_partitionable(self):
    return True

  @property
  def cache_coder(self):
    return _VocabularyAccumulatorCoder(input_dtype=self.input_dtype)

class TensorInfo(
    tfx_namedtuple.namedtuple('TensorInfo',
                              ['dtype', 'shape', 'temporary_asset_value'])):
  """A container for attributes of output tensors from analyzers.

  Fields:
    dtype: The TensorFlow dtype.
    shape: The shape of the tensor.
    temporary_asset_value: A temporary value to write to an asset file while
      tracing the TF graph.
  """

  def __new__(cls, dtype, shape, temporary_asset_value):
    if not isinstance(dtype, tf.DType):
      raise TypeError('dtype must be a TensorFlow dtype, got {}'.format(dtype))
    if temporary_asset_value is not None and not isinstance(
        temporary_asset_value, bytes):
      raise TypeError(
          'temporary_asset_value should be bytes or None, got {}'.format(
              temporary_asset_value))
    return super(TensorInfo, cls).__new__(
        cls,
        dtype=dtype,
        shape=shape,
        temporary_asset_value=temporary_asset_value)

class CacheableCombinePerKeyFormatKeys(
    tfx_namedtuple.namedtuple('CacheableCombinePerKeyFormatKeys',
                              ['combiner', 'label']), AnalyzerDef):
  """An analyzer that formats output for the non-stored per-key case.

  This analyzer converts the (key, output) pairs into a tuple of keys (of type
  string) and outputs.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._CombinePerKeyFormatKeysImpl`.

  Fields:
    combiner: The Combiner to use for extracting outputs.
    label: A unique label for this operation.
  """

  def __new__(cls, combiner, label=None):
    if label is None:
      scope = tf.compat.v1.get_default_graph().get_name_scope()
      label = '{}[{}]'.format(cls.__name__, scope)
    return super(CacheableCombinePerKeyFormatKeys, cls).__new__(
        cls, combiner=combiner, label=label)

  @property
  def output_tensor_infos(self):
    # Returns a key vocab and one output per combiner output.
    return [TensorInfo(tf.string, (None,), None)] + [
        TensorInfo(info.dtype, (None,) + info.shape,
                   info.temporary_asset_value)
        for info in self.combiner.output_tensor_infos()
    ]

class TensorSource(
    tfx_namedtuple.namedtuple('TensorSource', ['tensors', 'label']),
    nodes.OperationDef):
  """An `OperationDef` that defines extracting a tuple of tensor values.

  This `OperationDef` defines an operation that extracts the values of the
  given tensors into a PCollection of tuples of values.  It is used as a source
  for analyzers, which further transform these values.

  This OperationDef accepts zero inputs and returns a single output
  representing the PCollection of tuples of values.  It will be converted in
  tensorflow_transform.beam.analysis_graph_builder.build to an operation that
  extracts the tensors from a dictionary of tensors, after running a beam.ParDo
  to produce tensor values by running the graph on its inputs.

  Fields:
    tensors: The tensors whose values should be extracted.
    label: A unique label for this operation.
  """

  def __new__(cls, tensors):
    for tensor in tensors:
      if not isinstance(tensor, tf.Tensor):
        raise TypeError('tensor must be a Tensor, got {} of type {}'.format(
            tensor, type(tensor)))
    return super(TensorSource, cls).__new__(
        cls, tensors=tensors, label=_make_label(cls))

class PackedCombineAccumulate(
    tfx_namedtuple.namedtuple('PackedCombineAccumulate',
                              ['combiners', 'label']), nodes.OperationDef):
  """An analyzer that packs a list of combiners into a single beam CombineFn.

  Fields:
    combiners: A list of `analysis_graph_builder._CombinerOpWrapper` objects.
    label: A unique label for this operation.
  """

  def __new__(cls, combiners, label=None):
    if label is None:
      scope = tf.compat.v1.get_default_graph().get_name_scope()
      label = '{}[{}]'.format(cls.__name__, scope)
    return super(PackedCombineAccumulate, cls).__new__(
        cls, combiners=combiners, label=label)

  @property
  def num_outputs(self):
    return 1

  # Note that this will not have any effect as packing of combiners is done
  # after the caching optimization.
  @property
  def is_partitionable(self):
    return True

class CacheableCombinePerKeyAccumulate(
    tfx_namedtuple.namedtuple('CacheableCombinePerKeyAccumulate',
                              ['combiner', 'label']), AnalyzerDef):
  """An analyzer that runs `beam.CombinePerKey` to accumulate without merging.

  This analyzer reduces the values that it accepts as inputs, using the
  provided `Combiner`.  The `Combiner` is applied to the data by wrapping it as
  a `beam.CombineFn` and applying `beam.CombinePerKey`.

  This analyzer is implemented by
  `tensorflow_transform.beam.analyzer_impls._IntermediateAccumulateCombineImpl`.

  Fields:
    combiner: The Combiner to be applied to the inputs.
    label: A unique label for this operation.
  """

  def __new__(cls, combiner, label=None):
    if label is None:
      scope = tf.compat.v1.get_default_graph().get_name_scope()
      label = '{}[{}]'.format(cls.__name__, scope)
    return super(CacheableCombinePerKeyAccumulate, cls).__new__(
        cls, combiner=combiner, label=label)

  @property
  def num_outputs(self):
    return 1

  @property
  def is_partitionable(self):
    return True

  @property
  def cache_coder(self):
    return _CombinerPerKeyAccumulatorCoder(self.combiner.accumulator_coder)

class VocabularyMerge(
    tfx_namedtuple.namedtuple('VocabularyMerge', [
        'vocab_ordering_type', 'use_adjusted_mutual_info', 'min_diff_from_avg',
        'label'
    ]), nodes.OperationDef):
  """An operation that merges the accumulators produced by VocabularyAccumulate.

  This operation operates on the output of VocabularyAccumulate and is
  implemented by
  `tensorflow_transform.beam.analyzer_impls._VocabularyMergeImpl`.

  See `tft.vocabulary` for a description of the parameters.
  """

  def __new__(cls, vocab_ordering_type, use_adjusted_mutual_info,
              min_diff_from_avg):
    return super(VocabularyMerge, cls).__new__(
        cls,
        vocab_ordering_type=vocab_ordering_type,
        use_adjusted_mutual_info=use_adjusted_mutual_info,
        min_diff_from_avg=min_diff_from_avg,
        label=_make_label(cls))

  @property
  def num_outputs(self):
    return 1

class ExtractCombineMergeOutputs(
    tfx_namedtuple.namedtuple('ExtractOutputs',
                              ['output_tensor_info_list', 'label']),
    AnalyzerDef):
  """An operation that represents extracting outputs of a combine merge.

  This operation represents a `beam.Map` that is applied to a PCollection.
  For each element of the PCollection, the corresponding element of the output
  PCollection is a tuple of outputs.

  Attributes:
    output_tensor_info_list: A list of `TensorInfo`s that defines the outputs
      of this operation.
    label: A unique label for this operation.
  """

  def __new__(cls, output_tensor_info_list, label=None):
    if label is None:
      scope = tf.compat.v1.get_default_graph().get_name_scope()
      label = '{}[{}]'.format(cls.__name__, scope)
    return super(ExtractCombineMergeOutputs, cls).__new__(
        cls, output_tensor_info_list=output_tensor_info_list, label=label)

  @property
  def output_tensor_infos(self):
    return self.output_tensor_info_list

class ScaleAndFlattenPerKeyBucketBouandaries(
    tfx_namedtuple.namedtuple('PostProcessPerKeyBucketBoundaries',
                              ['output_tensor_dtype', 'label']), AnalyzerDef):
  """An analyzer which takes quantile boundaries per key and combines them.

  It receives a 2-d array of boundaries, computes a scale and shift for each
  row separately, and produces a new 1-d boundaries array which combines the
  boundaries for all the keys, along with the number of buckets defined for
  each key.

  This outputs boundaries, scale_factor_per_key, shift_per_key, num_buckets.

  For example, for an input boundaries matrix [[0, 1, 2], [0, 1, 2]] it will
  return:
    boundaries: [0, 0.5, 1, 1.5, 2]
    scale_factor_per_key: [0.5, 0.5]
    shift_per_key: [0, 1]
    num_buckets: 4

  So the transformation of each input x before computing its bucket should be:
  F(x, key) = x * scale_factor_per_key[key] + shift_per_key[key]
  """

  def __new__(cls, output_tensor_dtype):
    return super(ScaleAndFlattenPerKeyBucketBouandaries, cls).__new__(
        cls, output_tensor_dtype=output_tensor_dtype, label=_make_label(cls))

  @property
  def output_tensor_infos(self):
    # Boundaries, scale_factor_per_key, shift_per_key, num_buckets.
    return [TensorInfo(self.output_tensor_dtype, (None,), None)] * 3 + [
        TensorInfo(tf.int64, (), None)
    ]

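# Illustrative sketch (hypothetical helper, not part of the original module):
# verifies the docstring example above by applying
# F(x, key) = x * scale_factor_per_key[key] + shift_per_key[key] and checking
# that each key's raw boundaries land in that key's segment of the combined
# boundaries array.
def _scale_and_flatten_example():
  boundaries = [0, 0.5, 1, 1.5, 2]
  scale_factor_per_key = [0.5, 0.5]
  shift_per_key = [0, 1]

  def transform(x, key):
    return x * scale_factor_per_key[key] + shift_per_key[key]

  # Key 0's raw boundaries [0, 1, 2] map onto [0, 0.5, 1].
  assert [transform(x, 0) for x in [0, 1, 2]] == boundaries[:3]
  # Key 1's raw boundaries [0, 1, 2] map onto [1, 1.5, 2].
  assert [transform(x, 1) for x in [0, 1, 2]] == boundaries[2:]
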
class _State(
    tfx_namedtuple.namedtuple('_State', [
        'temp_dir',
        'evaluated_replacements',
    ])):
  """A named tuple storing state passed to this context manager."""

  @classmethod
  def make_empty(cls):
    """Return `_State` object with all fields set to `None`."""
    return cls(*(None,) * len(cls._fields))

class BeamDatasetMetadata(
    tfx_namedtuple.namedtuple('BeamDatasetMetadata',
                              ['dataset_metadata', 'deferred_metadata'])):
  """A class like DatasetMetadata that also holds a deferred `PCollection`.

  `deferred_metadata` is a PCollection containing a single DatasetMetadata.
  """

  @property
  def schema(self):
    return self.dataset_metadata.schema

class ConstructBeamPipelineVisitor(nodes.Visitor):
  """Visitor that constructs the beam pipeline from the node graph."""

  ExtraArgs = tfx_namedtuple.namedtuple(  # pylint: disable=invalid-name
      'ExtraArgs', [
          'base_temp_dir',
          'pipeline',
          'flat_pcollection',
          'pcollection_dict',
          'tf_config',
          'graph',
          'input_signature',
          'input_specs',
          'input_tensor_adapter_config',
          'use_tf_compat_v1',
          'cache_pcoll_dict',
          'preprocessing_fn',
      ])

  def __init__(self, extra_args):
    self._extra_args = extra_args

  def visit(self, operation, inputs):
    try:
      ptransform_wrapper = (
          _PTRANSFORM_BY_OPERATION_DEF_SUBCLASS[operation.__class__])
      environment_tag = (
          EnvironmentTags.TF_COMPAT_V1
          if self._extra_args.use_tf_compat_v1 else EnvironmentTags.TF_V2_ONLY)
      ptransform, tag = ptransform_wrapper.get_ptransform(environment_tag)
    except KeyError:
      raise ValueError(
          'No implementation for {} was registered'.format(operation))

    # TODO(zoyahav): Consider extracting a single PCollection before passing to
    # ptransform if len(inputs) == 1.
    if tag is None:
      tagged_label = operation.label
    else:
      tagged_label = '{label}[{tag}]'.format(label=operation.label, tag=tag)
    outputs = ((inputs or beam.pvalue.PBegin(self._extra_args.pipeline))
               | tagged_label >> ptransform(operation, self._extra_args))

    if isinstance(outputs, beam.pvalue.PCollection):
      return (outputs,)
    else:
      return outputs

  def validate_value(self, value):
    if not isinstance(value, beam.pvalue.PCollection):
      raise TypeError('Expected a PCollection, got {} of type {}'.format(
          value, type(value)))

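# Illustrative sketch (hypothetical helper, not part of the original module):
# shows the label-tagging scheme used by visit() above, where a non-None
# environment tag is appended to the operation label in square brackets so the
# same operation gets distinct Beam labels per environment.
def _tagged_label_example(label, tag):
  if tag is None:
    return label
  return '{label}[{tag}]'.format(label=label, tag=tag)

# _tagged_label_example('VocabularyAccumulate[scope]', None)
#     -> 'VocabularyAccumulate[scope]'
# _tagged_label_example('VocabularyAccumulate[scope]', 'some_tag')
#     -> 'VocabularyAccumulate[scope][some_tag]'
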
class EncodeCache(
    tfx_namedtuple.namedtuple('EncodeCache', ['coder', 'label']),
    nodes.OperationDef):
  """OperationDef for encoding a cache instance.

  Fields:
    coder: An instance of CacheCoder used to encode cache.
    label: A unique label for this operation.
  """

  @property
  def is_partitionable(self):
    return True

class BeamDatasetMetadata(
    tfx_namedtuple.namedtuple(
        'BeamDatasetMetadata',
        ['dataset_metadata', 'deferred_metadata', 'asset_map'])):
  """A class like DatasetMetadata also holding `PCollection`s and an asset_map.

  `deferred_metadata` is a PCollection containing a single DatasetMetadata.
  `asset_map` is a dictionary mapping asset keys to filenames.
  """

  @property
  def schema(self):
    return self.dataset_metadata.schema

class FlattenLists(
    tfx_namedtuple.namedtuple('FlattenLists', ['label']),
    nodes.OperationDef):
  """An operation that represents flattening a PCollection of lists.

  Attributes:
    label: A unique label for this operation.
  """

  def __new__(cls):
    return super(FlattenLists, cls).__new__(cls, label=_make_label(cls))

  @property
  def is_partitionable(self):
    return True

class _State(
    tfx_namedtuple.namedtuple('_State', [
        'temp_dir',
        'desired_batch_size',
        'passthrough_keys',
        'use_deep_copy_optimization',
        'force_tf_compat_v1',
    ])):
  """A named tuple to store attributes of `Context`."""

  @classmethod
  def make_empty(cls):
    """Return `_State` object with all fields set to `None`."""
    return cls(*(None,) * len(cls._fields))

class ExtractInputForSavedModel(
    tfx_namedtuple.namedtuple('ExtractInputForSavedModel',
                              ['dataset_key', 'label']), nodes.OperationDef):
  """An operation that forwards the requested dataset in PCollection form.

  The resulting PCollection is either the dataset corresponding to
  `dataset_key`, or a flattened PCollection if `dataset_key` is not specified.

  Attributes:
    dataset_key: (Optional) dataset key str.
    label: A unique label for this operation.
  """

class CreateSavedModel(
    tfx_namedtuple.namedtuple(
        'CreateSavedModel',
        ['table_initializers', 'output_signature', 'label']),
    nodes.OperationDef):
  """An operation that represents creating a SavedModel with bound values.

  This operation represents creating a SavedModel.  Its output is a PCollection
  containing a single element which is the directory containing the
  `SavedModel`.  The inputs are a PCollection of tensor bindings.  A tensor
  binding is the specification of a tensor and a value that it should be
  replaced with in the graph.

  This allows us to create a `SavedModel` in a deferred manner, which depends
  on deferred values (the tensor bindings) which were not known when the Beam
  graph was constructed.

  Attributes:
    table_initializers: A list of table initializer ops that should be run as
      part of this SavedModel.
    output_signature: The output signature of this `SavedModel`, as a
      dictionary whose keys are feature names and values are `Tensor`s or
      `SparseTensor`s.
    label: A unique label for this operation.
  """
  __slots__ = ()

  def _get_tensor_type_name(self, tensor):
    if isinstance(tensor, tf.Tensor):
      return 'Tensor'
    elif isinstance(tensor, tf.SparseTensor):
      return 'SparseTensor'
    raise ValueError('Got a {}, expected a Tensor or SparseTensor'.format(
        type(tensor)))

  def get_field_str(self, field_name):
    # Overriding the str representation of table initializers since it may be
    # different for various versions of TF.
    if field_name == 'table_initializers':
      return '{}'.format(len(self.table_initializers))
    elif field_name == 'output_signature':
      copied = self.output_signature.copy()
      for key in copied:
        value = self.output_signature[key]
        copied[key] = '{}<shape: {}, {}>'.format(
            self._get_tensor_type_name(value), value.shape.as_list(),
            value.dtype)
      return str(copied)
    return super().get_field_str(field_name)

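# Illustrative note (hypothetical values, not from the original module): for
# 'table_initializers', get_field_str returns only the count (e.g. '2'), and
# for an output_signature entry {'x': <a float32 Tensor of shape (None,)>} it
# renders the value roughly as "Tensor<shape: [None], <dtype: 'float32'>>".
# This keeps node string representations stable across TF versions instead of
# relying on Tensor.__repr__.
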
class InstrumentDatasetCache(
    tfx_namedtuple.namedtuple('InstrumentDatasetCache',
                              ['dataset_key', 'label']), nodes.OperationDef):
  """OperationDef instrumenting cached datasets.

  Fields:
    dataset_key: A dataset key.
    label: A unique label for this operation.
  """
  __slots__ = ()

  @property
  def is_partitionable(self):
    return True

class VocabularyCount(
    tfx_namedtuple.namedtuple('VocabularyCount', ['label']),
    nodes.OperationDef):
  """An operation that counts the total number of tokens in a vocabulary.

  This operation takes in the output of VocabularyAccumulate and is
  implemented by
  `tensorflow_transform.beam.analyzer_impls._VocabularyCountImpl`.

  The output of this operation is a singleton Integer.
  """

  def __new__(cls):
    return super(VocabularyCount, cls).__new__(cls, label=_make_label(cls))

  @property
  def num_outputs(self):
    return 1

class AddKey(
    tfx_namedtuple.namedtuple('AddKey', ['key', 'label']),
    nodes.OperationDef):
  """An operation that represents adding a key to a value.

  This operation represents a `beam.Map` that is applied to a PCollection.
  For each element of the PCollection, the corresponding element of the output
  PCollection is a tuple of (key, value).

  Attributes:
    key: The key which should be added to each element of the input
      PCollection.
    label: A unique label for this operation.
  """

  @property
  def is_partitionable(self):
    return True

class CacheableCombinePerKeyFormatLarge(
    tfx_namedtuple.namedtuple('CacheableCombinePerKeyFormatLarge', ['label']),
    nodes.OperationDef):
  """An analyzer that formats output prior to writing to file for per-key case.

  This operation operates on the output of CacheableCombinePerKeyAccumulate
  and is implemented by
  `tensorflow_transform.beam.analyzer_impls._CombinePerKeyFormatLargeImpl`.
  """

  def __new__(cls):
    return super(CacheableCombinePerKeyFormatLarge, cls).__new__(
        cls, label=_make_label(cls))

  @property
  def num_outputs(self):
    return 1