def _preprocessing_fn_with_chained_ptransforms(inputs):
  """Builds a preprocessing graph with two chained fake operations.

  A `TensorSource` node is created from `inputs['x']`, then two
  `FakeChainable` operations are applied one after the other under the
  `ptransform1` and `ptransform2` name scopes. The final node is bound to a
  `tf.float32` tensor of shape `(17, 27)`.

  Args:
    inputs: A dict-like object; only the `'x'` entry is read.

  Returns:
    A dict with the single key `'x_chained'` holding the bound tensor.
  """

  class FakeChainable(
      tfx_namedtuple.namedtuple('FakeChainable', ['label']),
      nodes.OperationDef):
    """Fake operation labelled with its class name and creation name scope."""

    def __new__(cls):
      # The label captures the graph name scope active at construction time.
      active_scope = tf.compat.v1.get_default_graph().get_name_scope()
      return super().__new__(
          cls, label='{}[{}]'.format(cls.__name__, active_scope))

  with tf.compat.v1.name_scope('x'):
    source_node = nodes.apply_operation(
        analyzer_nodes.TensorSource, tensors=[inputs['x']])

    with tf.compat.v1.name_scope('ptransform1'):
      first_link = nodes.apply_operation(FakeChainable, source_node)

    with tf.compat.v1.name_scope('ptransform2'):
      second_link = nodes.apply_operation(FakeChainable, first_link)

    chained_info = analyzer_nodes.TensorInfo(tf.float32, (17, 27), None)
    x_chained = analyzer_nodes.bind_future_as_tensor(second_link, chained_info)
    return {'x_chained': x_chained}
def _preprocessing_fn_for_generalized_chained_ptransforms(inputs):
  """Builds a graph mixing partitionable, cacheable and plain fake operations.

  Starting from a `TensorSource` over `inputs['x']`, the following chain is
  applied, each step under its own name scope:

    partitionable1 -> cacheable1 -> partitionable2 -> cacheable2
      -> partitionable3 -> merge

  A second branch (`not-cacheable`) applies a plain `FakeChainable` directly
  to the source node.

  Args:
    inputs: A dict-like object; only the `'x'` entry is read.

  Returns:
    A dict with `'x_chained'` (the chain's result bound as a `tf.float32`
    tensor of shape `(17, 27)`) and `'x_plain'` (the `not-cacheable` branch
    bound as a `tf.int64` tensor of shape `(7, 13)`).
  """

  def _scoped_label(op_cls):
    # Default label: '<ClassName>[<graph name scope at construction time>]'.
    return '{}[{}]'.format(
        op_cls.__name__, tf.compat.v1.get_default_graph().get_name_scope())

  class FakeChainablePartitionable(
      collections.namedtuple('FakeChainablePartitionable', ['label']),
      nodes.OperationDef):
    """Fake single-output operation that reports itself as partitionable."""

    def __new__(cls, label=None):
      chosen = _scoped_label(cls) if label is None else label
      return super().__new__(cls, label=chosen)

    @property
    def num_outputs(self):
      return 1

    @property
    def is_partitionable(self):
      return True

  class FakeChainableCacheable(
      collections.namedtuple('FakeChainableCacheable', ['label']),
      nodes.OperationDef):
    """Fake partitionable operation that additionally exposes a cache coder."""

    def __new__(cls, label=None):
      chosen = _scoped_label(cls) if label is None else label
      return super().__new__(cls, label=chosen)

    @property
    def num_outputs(self):
      return 1

    @property
    def is_partitionable(self):
      return True

    @property
    def cache_coder(self):
      # A placeholder string rather than a real coder object.
      return 'Not-a-coder-but-thats-ok!'

  class FakeChainable(
      collections.namedtuple('FakeChainable', ['label']),
      nodes.OperationDef):
    """Fake single-output operation that is not partitionable."""

    def __new__(cls, label=None):
      chosen = _scoped_label(cls) if label is None else label
      return super().__new__(cls, label=chosen)

    @property
    def num_outputs(self):
      return 1

    @property
    def is_partitionable(self):
      return False

  with tf.compat.v1.name_scope('x'):
    source_node = nodes.apply_operation(
        analyzer_nodes.TensorSource, tensors=[inputs['x']])

    with tf.compat.v1.name_scope('partitionable1'):
      partitioned_1 = nodes.apply_multi_output_operation(
          FakeChainablePartitionable, source_node)

    with tf.compat.v1.name_scope('cacheable1'):
      cached_1 = nodes.apply_multi_output_operation(
          FakeChainableCacheable, *partitioned_1)

    with tf.compat.v1.name_scope('partitionable2'):
      partitioned_2 = nodes.apply_multi_output_operation(
          FakeChainablePartitionable, *cached_1)

    with tf.compat.v1.name_scope('cacheable2'):
      cached_2 = nodes.apply_multi_output_operation(
          FakeChainableCacheable, *partitioned_2)

    with tf.compat.v1.name_scope('partitionable3'):
      partitioned_3 = nodes.apply_multi_output_operation(
          FakeChainablePartitionable, *cached_2)

    with tf.compat.v1.name_scope('merge'):
      merged_node = nodes.apply_operation(FakeChainable, *partitioned_3)

    with tf.compat.v1.name_scope('not-cacheable'):
      uncached_node = nodes.apply_operation(FakeChainable, source_node)

    # NOTE(review): the third TensorInfo argument is `False` here, whereas
    # other callers in this file pass `None` -- confirm which one TensorInfo
    # expects.
    chained_info = analyzer_nodes.TensorInfo(tf.float32, (17, 27), False)
    x_chained = analyzer_nodes.bind_future_as_tensor(merged_node, chained_info)

    plain_info = analyzer_nodes.TensorInfo(tf.int64, (7, 13), False)
    x_plain = analyzer_nodes.bind_future_as_tensor(uncached_node, plain_info)

    return {'x_chained': x_chained, 'x_plain': x_plain}
def output_tensor_infos(self) -> List[analyzer_nodes.TensorInfo]:
  """Returns the output specs: one `tf.string` tensor of shape `[None, 2]`.

  The `TensorInfo` is built with `None` as its third argument.
  """
  info = analyzer_nodes.TensorInfo(tf.string, [None, 2], None)
  return [info]
def ptransform_analyzer(
    inputs: Collection[tf.Tensor],
    ptransform: Union[_BeamPTransform, CacheablePTransformAnalyzer],
    output_dtypes: Collection[tf.dtypes.DType],
    output_shapes: Collection[List[int]],
    output_asset_default_values: Optional[Collection[Optional[bytes]]] = None,
    name: Optional[str] = None):
  # pylint: disable=line-too-long
  """Applies a user-provided PTransform over the whole dataset.

  WARNING: This is experimental.

  Note that in order to have asset files copied correctly, any outputs that
  represent asset filenames must be added to the `tf.GraphKeys.ASSET_FILEPATHS`
  collection by the caller if using Transform's APIs in compat v1 mode.

  Example:

  >>> class MeanPerKey(beam.PTransform):
  ...   def expand(self, pcoll: beam.PCollection[Tuple[np.ndarray, np.ndarray]]) -> Tuple[beam.PCollection[np.ndarray], beam.PCollection[np.ndarray]]:
  ...     def extract_output(key_value_pairs):
  ...       keys, values = zip(*key_value_pairs)
  ...       return [beam.TaggedOutput('keys', keys),
  ...               beam.TaggedOutput('values', values)]
  ...     return tuple(
  ...         pcoll
  ...         | 'ZipAndFlatten' >> beam.FlatMap(lambda batches: list(zip(*batches)))
  ...         | 'MeanPerKey' >> beam.CombinePerKey(beam.combiners.MeanCombineFn())
  ...         | 'ToList' >> beam.combiners.ToList()
  ...         | 'Extract' >> beam.FlatMap(extract_output).with_outputs(
  ...             'keys', 'values'))
  >>> def preprocessing_fn(inputs):
  ...   outputs = tft.experimental.ptransform_analyzer(
  ...       inputs=[inputs['s'], inputs['x']],
  ...       ptransform=MeanPerKey(),
  ...       output_dtypes=[tf.string, tf.float32],
  ...       output_shapes=[[2], [2]])
  ...   (keys, means) = outputs
  ...   mean_a = tf.reshape(tf.gather(means, tf.where(keys == 'a')), [])
  ...   return { 'x/mean_a': inputs['x'] / mean_a }
  >>> raw_data = [dict(x=1, s='a'), dict(x=8, s='b'), dict(x=3, s='a')]
  >>> feature_spec = dict(
  ...     x=tf.io.FixedLenFeature([], tf.float32),
  ...     s=tf.io.FixedLenFeature([], tf.string))
  >>> raw_data_metadata = tft.DatasetMetadata.from_feature_spec(feature_spec)
  >>> with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
  ...   transformed_dataset, transform_fn = (
  ...       (raw_data, raw_data_metadata)
  ...       | tft_beam.AnalyzeAndTransformDataset(preprocessing_fn))
  >>> transformed_data, transformed_metadata = transformed_dataset
  >>> transformed_data
  [{'x/mean_a': 0.5}, {'x/mean_a': 4.0}, {'x/mean_a': 1.5}]

  Args:
    inputs: An ordered collection of input `Tensor`s.
    ptransform: A Beam PTransform that accepts a Beam PCollection where each
      element is a tuple of `ndarray`s. Each element in the tuple contains a
      batch of values for the corresponding input tensor of the analyzer and
      maintains its shape and dtype.
      It returns a `PCollection`, or a tuple of `PCollections`, each containing
      a single element which is an `ndarray` or a list of primitive types. The
      contents of these output `PCollection`s must be consistent with the given
      values of `output_dtypes` and `output_shapes`.
      It may inherit from `tft_beam.experimental.PTransformAnalyzer` if access
      to a temp base directory is needed.
      Alternatively, it could be an instance of
      `tft.experimental.CacheablePTransformAnalyzer` in order to enable cache
      for this analyzer, when analyzer cache is enabled for this pipeline.
    output_dtypes: An ordered collection of TensorFlow dtypes of the output of
      the analyzer.
    output_shapes: An ordered collection of shapes of the output of the
      analyzer. Must have the same length as output_dtypes.
    output_asset_default_values: (Optional) An ordered collection of optional
      `bytes` aligned with output_dtypes/output_shapes. Every item in this
      collection which is not `None` indicates that the output is a TF asset
      path, and its value would be used as the default value of this asset file
      prior to analysis. A `None` item leaves the corresponding output as a
      regular (non-asset) tensor.
    name: (Optional) Similar to a TF op name. Used to define a unique scope for
      this analyzer, which can be used for debugging info.

  Returns:
    A list of output `Tensor`s. These will have `dtype` and `shape` as
      specified by `output_dtypes` and `output_shapes`.

  Raises:
    ValueError: If output_dtypes and output_shapes have different lengths, or
      if output_asset_default_values is given and its length differs from that
      of output_dtypes.
  """
  # pylint: enable=line-too-long
  if len(output_dtypes) != len(output_shapes):
    raise ValueError(
        'output_dtypes ({}) and output_shapes ({}) had different'
        ' lengths'.format(output_dtypes, output_shapes))

  if output_asset_default_values is None:
    # No asset information supplied: none of the outputs is an asset.
    temporary_asset_infos = [None] * len(output_dtypes)
  else:
    if len(output_asset_default_values) != len(output_dtypes):
      raise ValueError(
          'output_dtypes ({}) and output_asset_default_values ({}) had '
          'different lengths'.format(output_dtypes,
                                     output_asset_default_values))
    # Only non-None items describe assets. A None item must stay None so the
    # matching output is not given asset info; wrapping it would contradict
    # the documented per-item contract.
    temporary_asset_infos = [
        None if value is None
        else analyzer_nodes.TemporaryAssetInfo(value, 'text')
        for value in output_asset_default_values
    ]

  with tf.compat.v1.name_scope(name, 'ptransform'):
    output_tensor_infos = [
        analyzer_nodes.TensorInfo(dtype, shape, temporary_asset_info)
        for dtype, shape, temporary_asset_info in zip(
            output_dtypes, output_shapes, temporary_asset_infos)
    ]
    return _apply_analyzer(
        ptransform, *inputs, output_tensor_info_list=output_tensor_infos)