def testCreatePhasesWithUnwrappedLoop(self):
  # Test a preprocessing function with control flow.
  #
  # The loop represents
  #
  # i = 0
  # while i < 10:
  #   i += 1
  #   x -= 1
  #
  # We need to call an analyzer after the loop because only the transitive
  # parents of analyzers are inspected by create_phases.
  def preprocessing_fn(inputs):
    def _subtract_ten(x):
      i = tf.constant(0)
      c = lambda i, x: tf.less(i, 10)
      b = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
      return tf.while_loop(c, b, [i, x])[1]
    scaled_to_0_1 = mappers.scale_to_0_1(_subtract_ten(inputs['x']))
    return {'x_scaled': scaled_to_0_1}

  input_schema = sch.Schema({
      'x': sch.ColumnSchema(tf.int32, [], sch.FixedColumnRepresentation())
  })
  graph, _, _ = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  with self.assertRaisesRegexp(ValueError, 'Cycle detected'):
    _ = impl_helper.create_phases(graph)

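# A minimal standalone sketch (plain TensorFlow 1.x, no tf.Transform; names are
# illustrative only): the tf.while_loop used by `_subtract_ten` above decrements
# x ten times, i.e. it computes x - 10.
import tensorflow as tf

def _subtract_ten_demo(x):
  i = tf.constant(0)
  cond = lambda i, x: tf.less(i, 10)
  body = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
  return tf.while_loop(cond, body, [i, x])[1]

with tf.Session() as sess:
  print(sess.run(_subtract_ten_demo(tf.constant(17))))  # prints 7
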
def testRunPreprocessingFn(self):
  schema = self.toSchema({
      'dense_1': tf.FixedLenFeature((), tf.float32),
      'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
      'var_len': tf.VarLenFeature(tf.string),
      'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
  })

  def preprocessing_fn(inputs):
    return {
        'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
        'sparse_out': tf.sparse_reshape(inputs['sparse'], (1, 10)),
    }

  _, inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn, schema)

  # Verify that the input placeholders have the correct types.
  expected_dtype_and_shape = {
      'dense_1': (tf.float32, tf.TensorShape([None])),
      'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
      'var_len': (tf.string, tf.TensorShape([None, None])),
      'sparse': (tf.float32, tf.TensorShape([None, None])),
      'dense_out': (tf.float32, tf.TensorShape([None])),
      'sparse_out': (tf.float32, tf.TensorShape([None, None])),
  }

  for key, tensor in itertools.chain(six.iteritems(inputs),
                                     six.iteritems(outputs)):
    dtype, shape = expected_dtype_and_shape[key]
    self.assertEqual(tensor.dtype, dtype)
    tensor.get_shape().assert_is_compatible_with(shape)

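# Hedged sketch (plain TF 1.x): the batched placeholder shapes asserted above
# mirror what tf.parse_example produces for the same feature spec — a leading
# batch dimension of None in front of each FixedLenFeature shape.
import tensorflow as tf

serialized = tf.placeholder(tf.string, [None])
parsed = tf.parse_example(serialized, {
    'dense_1': tf.FixedLenFeature((), tf.float32),
    'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
})
print(parsed['dense_1'].get_shape())  # (?,)
print(parsed['dense_2'].get_shape())  # (?, 1, 2)
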
def testRunTransformFn(self):
  schema = self.toSchema({
      'dense_1': tf.FixedLenFeature((), tf.float32),
      'dense_2': tf.FixedLenFeature((1, 2), tf.int64),
      'var_len': tf.VarLenFeature(tf.string),
      'sparse': tf.SparseFeature('ix', 'val', tf.float32, 100)
  })

  def preprocessing_fn(inputs):
    return {
        'dense_out': mappers.scale_to_0_1(inputs['dense_1']),
        'sparse_out': api.map(lambda x: tf.sparse_reshape(x, (1, 10)),
                              inputs['sparse'])
    }

  inputs, outputs = impl_helper.run_preprocessing_fn(preprocessing_fn, schema)

  # Verify that the input placeholders have the correct types.
  expected_dtype_and_shape = {
      'dense_1': (tf.float32, tf.TensorShape([None])),
      'dense_2': (tf.int64, tf.TensorShape([None, 1, 2])),
      'var_len': (tf.string, tf.TensorShape(None)),
      'sparse': (tf.float32, tf.TensorShape(None)),
      'dense_out': (tf.float32, tf.TensorShape([None])),
      'sparse_out': (tf.float32, tf.TensorShape([None, None])),
  }

  for key, column in inputs.items() + outputs.items():
    dtype, shape = expected_dtype_and_shape[key]
    self.assertEqual(column.tensor.dtype, dtype)
    self.assertShapesEqual(column.tensor.get_shape(), shape)

def testCreatePhasesWithLoop(self):
  # Test a preprocessing function with control flow.
  #
  # The loop represents
  #
  # i = 0
  # while i < 10:
  #   i += 1
  #   x -= 1
  #
  # To get an error in the case where apply_function is not called, we have
  # to call an analyzer first (see testCreatePhasesWithUnwrappedLoop).  So
  # we also do so here.
  def preprocessing_fn(inputs):
    def _subtract_ten(x):
      i = tf.constant(0)
      c = lambda i, x: tf.less(i, 10)
      b = lambda i, x: (tf.add(i, 1), tf.add(x, -1))
      return tf.while_loop(c, b, [i, x])[1]
    scaled_to_0_1 = mappers.scale_to_0_1(
        api.apply_function(_subtract_ten, inputs['x']))
    return {'x_scaled': scaled_to_0_1}

  input_schema = sch.Schema({
      'x': sch.ColumnSchema(tf.int32, [], sch.FixedColumnRepresentation())
  })
  graph, _, _ = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)
  self.assertEqual(len(phases), 1)
  self.assertEqual(len(phases[0].analyzers), 2)

def testRunTransformFnBadTransform(self):
  schema = self.toSchema({
      'x': tf.FixedLenFeature((3,), tf.float32),
  })

  def preprocessing_fn(inputs):
    return {
        'x_sum': api.map(tf.reduce_sum, inputs['x']),
    }

  # Verify that we raise if preprocessing_fn outputs a tensor with rank 0.
  with self.assertRaises(ValueError):
    _ = impl_helper.run_preprocessing_fn(preprocessing_fn, schema)

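# Hedged sketch (plain TF 1.x): tf.reduce_sum with no axis collapses the batch
# dimension too, producing a rank-0 tensor — the condition the test above
# expects to be rejected.
import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 3])
print(tf.reduce_sum(x).get_shape())          # () — rank 0, batch dimension gone
print(tf.reduce_sum(x, axis=1).get_shape())  # (?,) — per-instance sums, still batched
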
def testCreatePhasesWithDegenerateFunctionApplication(self):
  # Tests the case of a function whose inputs and outputs overlap.
  def preprocessing_fn(inputs):
    return {'index': api.apply_function(lambda x: x, inputs['a'])}

  input_schema = sch.Schema({
      'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
  })
  graph, _, _ = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)
  self.assertEqual(len(phases), 0)

def testImportAndExportWithTensorValueMapping(self):
  # Export the function "z = x * min(y) + x + min(y)" with min(y) replaced by
  # 6.
  def preprocessing_fn(inputs):
    return {
        'z': api.map(lambda x, y: x * y + x + y,
                     inputs['x'], analyzers.min(inputs['y']))
    }

  input_schema = self.toSchema({
      'x': tf.FixedLenFeature((), tf.float32),
      'y': tf.FixedLenFeature((), tf.float32)
  })

  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  saved_model_dir = os.path.join(self.get_temp_dir(), 'replace_original')
  input_columns_to_statistics = impl_helper.make_transform_fn_def(
      input_schema, inputs, outputs, saved_model_dir)
  self.assertEqual(len(input_columns_to_statistics.keys()), 1)
  y_min_input_name = input_columns_to_statistics.keys()[0]

  g = tf.Graph()
  with g.as_default():
    x = tf.placeholder(tf.float32, ())
    y = tf.placeholder(tf.float32, ())
    z = x * y + x + y

  new_saved_model_dir = os.path.join(self.get_temp_dir(), 'replace_new')
  impl_helper.replace_tensors_with_constant_values(
      saved_model_dir, new_saved_model_dir, {
          y_min_input_name: impl_helper.ConstantTensorValue(6, tf.float32, ())
      })

  # Import the function, applying it to constants for x and y.
  g = tf.Graph()
  with g.as_default():
    x = tf.constant(5, tf.float32, (1,))
    y = tf.constant(1000, tf.float32, (1,))  # Value is never used.
    outputs = saved_transform_io.apply_saved_transform(
        new_saved_model_dir, {'x': x, 'y': y})
    z = outputs['z']

    sess = tf.Session()
    with sess.as_default():
      # Check result is 5 * 6 + 5 + 6 = 41.
      self.assertEqual(41, z.eval())

def testImportAndExportDense(self):
  # Export the function "z = x * y + x + y".
  def preprocessing_fn(inputs):
    return {
        'z': api.map(lambda x, y: x * y + x + y, inputs['x'], inputs['y'])
    }

  input_schema = self.toSchema({
      'x': tf.FixedLenFeature((), tf.float32),
      'y': tf.FixedLenFeature((), tf.float32)
  })

  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  saved_model_dir = os.path.join(self.get_temp_dir(), 'dense')
  _ = impl_helper.make_transform_fn_def(input_schema, inputs, outputs,
                                        saved_model_dir)

  # Import the function, applying it to constants for x and y.
  g = tf.Graph()
  with g.as_default():
    x = tf.constant(5, tf.float32, (1,))
    y = tf.constant(6, tf.float32, (1,))
    outputs = saved_transform_io.apply_saved_transform(
        saved_model_dir, {'x': x, 'y': y})
    z = outputs['z']

    sess = tf.Session()
    with sess.as_default():
      # Check result is 5 * 6 + 5 + 6 = 41.
      self.assertEqual(41, z.eval())

  # Import the graph, feeding it values for x and y.
  g = tf.Graph()
  with g.as_default():
    inputs, outputs = impl_helper.load_transform_fn_def(saved_model_dir)
    x = inputs['x']
    y = inputs['y']
    z = outputs['z']

    sess = tf.Session()
    with sess.as_default():
      # Check result is 5 * 6 + 5 + 6 = 41.
      self.assertEqual(41, sess.run(z, {x: [5], y: [6]}))

def testCreatePhasesWithUnwrappedTable(self):
  # Test a preprocessing function with a table that is not wrapped in
  # `apply_function`.
  def preprocessing_fn(inputs):
    table = lookup.index_table_from_tensor(['a', 'b'])
    integerized = table.lookup(inputs['x'])
    return {'integerized': integerized}

  input_schema = sch.Schema({
      'x': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
  })
  graph, _, _ = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  with self.assertRaisesRegexp(ValueError, 'Found table initializers'):
    _ = impl_helper.create_phases(graph)

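# Hedged sketch (TF 1.x, tf.contrib.lookup, no tf.Transform): what
# index_table_from_tensor does in the test above — map each string to its
# index in the vocabulary, with out-of-vocabulary strings mapped to -1 by
# default. The table must be initialized before use.
import tensorflow as tf
from tensorflow.contrib import lookup

table = lookup.index_table_from_tensor(['a', 'b'])
integerized = table.lookup(tf.constant(['b', 'a', 'c']))
with tf.Session() as sess:
  sess.run(tf.tables_initializer())
  print(sess.run(integerized))  # [ 1  0 -1]
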
def testCreatePhasesWithMultipleLevelsOfAnalyzers(self):
  # Test a preprocessing function similar to scale_to_0_1 except that it
  # involves multiple interleavings of analyzers and transforms.
  def preprocessing_fn(inputs):
    scaled_to_0 = inputs['x'] - analyzers.min(inputs['x'])
    scaled_to_0_1 = scaled_to_0 / analyzers.max(scaled_to_0)
    return {'x_scaled': scaled_to_0_1}

  input_schema = sch.Schema({
      'x': sch.ColumnSchema(tf.float32, [], sch.FixedColumnRepresentation())
  })
  graph, _, _ = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)
  self.assertEqual(len(phases), 2)
  self.assertEqual(len(phases[0].analyzers), 1)
  self.assertEqual(len(phases[1].analyzers), 1)

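# Hedged sketch (plain TF 1.x, not tf.Transform): the dependency the test above
# exercises. min(x) must be known before max(x - min(x)) can be computed, which
# is why create_phases reports two phases with one analyzer each.
import tensorflow as tf

x = tf.constant([1.0, 3.0, 5.0])
scaled_to_0 = x - tf.reduce_min(x)                   # needs statistic 1: min(x)
x_scaled = scaled_to_0 / tf.reduce_max(scaled_to_0)  # needs statistic 2: max(x - min(x))
with tf.Session() as sess:
  print(sess.run(x_scaled))  # [0.  0.5 1. ]
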
def testImportAndExportSparse(self):
  # Export the function "z = x + y".
  def preprocessing_fn(inputs):
    return {'z': api.map(tf.sparse_add, inputs['x'], inputs['y'])}

  input_schema = self.toSchema({
      'x': tf.VarLenFeature(tf.float32),
      'y': tf.VarLenFeature(tf.float32)
  })

  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  saved_model_dir = os.path.join(self.get_temp_dir(), 'sparse')
  _ = impl_helper.make_transform_fn_def(input_schema, inputs, outputs,
                                        saved_model_dir)

  # Import the function, applying it to constants for x and y.
  g = tf.Graph()
  with g.as_default():
    x = tf.SparseTensor(
        indices=[[0]],
        values=tf.constant(5, shape=(1,), dtype=tf.float32),
        dense_shape=[1])
    y = tf.SparseTensor(
        indices=[[0]],
        values=tf.constant(6, shape=(1,), dtype=tf.float32),
        dense_shape=[1])
    outputs = saved_transform_io.apply_saved_transform(
        saved_model_dir, {'x': x, 'y': y})
    z = outputs['z']

    sess = tf.Session()
    with sess.as_default():
      # Check result is 5 + 6 = 11.
      result = z.eval()
      self.assertEqual(result.indices, [[0]])
      self.assertEqual(result.values, [11])
      self.assertEqual(result.dense_shape, [1])

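# Hedged sketch (plain TF 1.x): tf.sparse_add on two single-entry SparseTensors,
# matching the values the test above asserts on the exported transform.
import tensorflow as tf

x = tf.SparseTensor(indices=[[0]], values=[5.0], dense_shape=[1])
y = tf.SparseTensor(indices=[[0]], values=[6.0], dense_shape=[1])
z = tf.sparse_add(x, y)
with tf.Session() as sess:
  print(sess.run(z))  # indices=[[0]], values=[11.], dense_shape=[1]
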
def testCreatePhasesWithTable(self):
  # Test a preprocessing function with a table that can only be run after the
  # first analyzer has run.  Note that converting an integerized string into a
  # float doesn't make much sense, but it is a legal TensorFlow computation.
  def preprocessing_fn(inputs):
    integerized = mappers.string_to_int(inputs['x'])
    integerized = tf.to_float(integerized)
    scaled_to_0_1 = integerized / analyzers.max(integerized)
    return {'x_scaled': scaled_to_0_1}

  input_schema = sch.Schema({
      'x': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
  })
  graph, _, _ = impl_helper.run_preprocessing_fn(
      preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)
  self.assertEqual(len(phases), 2)
  self.assertEqual(len(phases[0].analyzers), 1)
  self.assertEqual(len(phases[1].analyzers), 1)
  self.assertEqual(len(phases[0].table_initializers), 0)
  self.assertEqual(len(phases[1].table_initializers), 1)

def make_transform_graph(output_dir, schema, features):
  """Writes a tf.Transform transform function and its metadata files.

  Args:
    output_dir: Folder to write the transform function and metadata to.
    schema: Schema list describing the raw input data.
    features: Dict mapping feature names to their transform configuration.
  """
  tft_input_schema = make_tft_input_schema(
      schema, os.path.join(output_dir, STATS_FILE))
  tft_input_metadata = dataset_metadata.DatasetMetadata(
      schema=tft_input_schema)
  preprocessing_fn = make_preprocessing_fn(output_dir, features)

  # Copied from tft/beam/impl.
  inputs, outputs = impl_helper.run_preprocessing_fn(
      preprocessing_fn=preprocessing_fn, schema=tft_input_schema)
  output_metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(outputs))

  transform_fn_dir = os.path.join(output_dir, TRANSFORM_FN_DIR)

  # This writes the SavedModel.
  impl_helper.make_transform_fn_def(
      schema=tft_input_schema,
      inputs=inputs,
      outputs=outputs,
      saved_model_dir=transform_fn_dir)

  metadata_io.write_metadata(
      metadata=output_metadata,
      path=os.path.join(output_dir, TRANSFORMED_METADATA_DIR))
  metadata_io.write_metadata(
      metadata=tft_input_metadata,
      path=os.path.join(output_dir, RAW_METADATA_DIR))

def expand(self, dataset):
  """Analyze the dataset.

  Args:
    dataset: A dataset.

  Returns:
    A TransformFn containing the deferred transform function.
  """
  input_values, input_metadata = dataset
  input_schema = input_metadata.schema

  base_temp_dir = Context.create_base_temp_dir()

  # NOTE: it's important that create_phases is called directly after
  # run_preprocessing_fn, because we later mutate the graph's
  # TABLE_INITIALIZERS collection which would break the logic in
  # create_phases.
  graph, inputs, outputs = impl_helper.run_preprocessing_fn(
      self._preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)

  # Iterate through levels.  tensor_pcoll_mapping is a mapping from tensor
  # names to singleton PCollections containing a _TensorValue.  We compute
  # tensor_pcoll_mapping in phases, where at each phase we compute the
  # analyzers that are ready to run and update tensor_pcoll_mapping.
  tensor_pcoll_mapping = {}
  table_initializers = graph.get_collection_ref(
      tf.GraphKeys.TABLE_INITIALIZERS)
  original_table_initializers = list(table_initializers)
  del table_initializers[:]

  serialized_tf_config = (
      analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
          input_values.pipeline.runner))
  for level, phase in enumerate(phases):
    # Create a SavedModel that describes the mapping from the input data
    # to the inputs of the analyzers at this level.  The column names of the
    # outputs are the tensor names of the analyzer inputs in the graph.  This
    # graph has the analyzer outputs computed so far replaced with constants.
    analyzer_inputs = {}
    for analyzer in phase.analyzers:
      for input_tensor in analyzer.inputs:
        analyzer_inputs[input_tensor.name] = input_tensor
    table_initializers.extend(phase.table_initializers)
    unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(
        graph, inputs, analyzer_inputs, unbound_saved_model_dir)
    saved_model_dir = (
        tensor_pcoll_mapping
        | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
        _ReplaceTensorsWithConstants(
            unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

    # Run this saved model on the input dataset to obtain the inputs to the
    # analyzers.
    analyzer_input_values = (
        input_values
        | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared()),
            saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

    # Compute the analyzers from their inputs.  `analyzer_outputs_dict` is a
    # map from tensor names to singleton PCollections of `_TensorValue`s.
    analyzer_outputs_dict = (
        analyzer_input_values
        | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
            phase.analyzers, base_temp_dir))

    # Update the mapping for all analyzers.
    tensor_pcoll_mapping.update(analyzer_outputs_dict)

  del table_initializers[:]
  table_initializers.extend(original_table_initializers)
  saved_model_dir = _make_unique_temp_dir(base_temp_dir)
  _write_saved_transform(graph, inputs, outputs, saved_model_dir)
  transform_fn = (
      tensor_pcoll_mapping
      | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
          saved_model_dir, base_temp_dir, input_values.pipeline))

  # Infer metadata.  The metadata may contain Futures that refer to the values
  # of tensors in the graph.  In that case, the tensors must be "constant" in
  # that they don't depend on input data.  The tensors can depend on analyzer
  # outputs though.  This allows us to set metadata that depends on analyzer
  # outputs.
  #
  # We first extract the names of the tensors that are referenced by the
  # Futures, and then compute them by calling _ComputeTensorValues with the
  # tensor-PCollection mapping representing the analyzer outputs.
  metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(graph, outputs))

  deferred_metadata_tensor_names = [
      future.name
      for column_schema in tft_api.get_column_schemas(graph).values()
      for future in column_schema.substitute_futures({})]
  name_pcoll_dict = (
      tensor_pcoll_mapping
      | 'ComputeTensorValues' >> _ComputeTensorValues(
          deferred_metadata_tensor_names, saved_model_dir,
          input_values.pipeline))
  full_metadata = beam_metadata_io.BeamDatasetMetadata(
      metadata, name_pcoll_dict)

  _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

  return transform_fn, full_metadata

def expand(self, dataset):
  """Analyze the dataset.

  Args:
    dataset: A dataset.

  Returns:
    A TransformFn containing the deferred transform function.
  """
  input_values, input_metadata = dataset
  input_schema = input_metadata.schema
  input_batches = input_values | 'BatchInstances' >> beam.ParDo(_BatchDoFn())

  class _CreateTransformFn(beam.PTransform):
    """Create a TransformFnDef, binding statistics in a deferred manner.

    This function constructs a tensorflow graph eagerly and then (in a
    deferred manner) fills in analyzer outputs with their actual computed
    values.  We construct the tensorflow graph up front because that implies
    serializing MetaGraphDef protos rather than pickling the user-defined
    TITO functions.  The graph contains placeholders for `_AnalyzerOutput`s
    which are then replaced with their actual values (as constant tensors) in
    a deferred manner.

    Args:
      input_columns: A map from column names to `Column`s.
      output_columns: A map from column names to `Column`s.
      temp_dir: Temp dir to store `SavedModel`s.
    """

    def __init__(self, input_columns, output_columns, temp_dir):
      # Generally the pipeline is inferred from its inputs, however we need
      # to know the pipeline for beam.Create.
      self.pipeline = input_values.pipeline
      self._input_columns = input_columns
      self._output_columns = output_columns
      self._temp_dir = temp_dir

    def expand(self, analyzer_outputs_to_pcoll):
      """Converts a dict of statistics to a transform function.

      Args:
        analyzer_outputs_to_pcoll: A dictionary mapping `_AnalyzerOutput`s
          to the values of these statistics as a PCollection.

      Returns:
        A single-element PCollection containing the directory name with the
        SavedModel.
      """
      # Create a transform_fn with unbound values.
      unbound_transform_fn_dir = os.path.join(
          self._temp_dir, 'unbound_transform_fn')
      input_columns_to_statistics = impl_helper.make_transform_fn_def(
          input_schema, self._input_columns, self._output_columns,
          unbound_transform_fn_dir)

      transform_fn = (
          self.pipeline
          | 'CreateTransformFn' >> beam.Create([unbound_transform_fn_dir]))

      if not analyzer_outputs_to_pcoll:
        return transform_fn

      # Convert the statistics dict into a DictPCollectionView so it can be
      # passed as a side input to the beam Map below.
      tagged_statistics = []
      for tag, statistic in input_columns_to_statistics.items():
        pcoll = analyzer_outputs_to_pcoll[statistic]
        tagged_statistics.append(
            pcoll
            | 'AddTag[%s]' % tag >> beam.Map(lambda x, tag=tag: (tag, x)))

      statistics_side_input = beam.pvalue.AsDict(
          tagged_statistics | 'MergeStatistics' >> beam.Flatten())

      # Run a mapper that inserts statistic values into the graph.
      return (transform_fn
              | 'ReplaceTensorsWithConstantValues' >> beam.Map(
                  impl_helper.replace_tensors_with_constant_values,
                  bound_saved_model_dir=os.path.join(
                      self._temp_dir, 'transform_fn'),
                  input_value_mapping=statistics_side_input))

  inputs, outputs = impl_helper.run_preprocessing_fn(
      self._preprocessing_fn, input_schema)

  # Get a list of lists, containing analyzers (i.e. _AnalyzerOutput objects)
  # by level in the DAG of Columns/Statistics.  Analyzers at level n are
  # ready to run once all analyzers at level n - 1 are complete.
  analyzers_by_level = self._analyzers_by_level(outputs)

  # Iterate through levels, keeping track of analyzer outputs (i.e.
  # statistics) via a mapping of `_AnalyzerOutput` -> single element
  # PCollection.
  analyzer_outputs_to_pcoll = {}
  for level, analyzer_outputs in enumerate(analyzers_by_level):
    # Create a TransformFnDef representing the graph needed to generate
    # all the inputs required by the analyzer_outputs at this level.  We
    # assign arbitrary names to the outputs of this TransformFnDef.
    analyzer_input_columns = {}
    for idx, analyzer_output in enumerate(analyzer_outputs):
      if len(analyzer_output.inputs) != 1:
        raise NotImplementedError('Analyzers must have exactly one input')
      analyzer_input_key = 'analyzer_%d_input' % idx
      analyzer_input_columns[analyzer_input_key] = analyzer_output.inputs[0]

    transform_fn = (
        analyzer_outputs_to_pcoll
        | 'CreateTransformFn_%d' % level >> _CreateTransformFn(
            inputs, analyzer_input_columns,
            os.path.join(self._output_dir, 'tmp', 'level_%s' % level)))
    analyzer_input_schema = impl_helper.infer_feature_schema(
        analyzer_input_columns)

    # Run the TransformFnDef in a mapper.
    analysis_inputs = (
        input_batches
        | 'ComputeAnalyzerInputs_%d' % level >> beam.ParDo(
            _RunMetaGraphDoFn(input_schema, analyzer_input_schema),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))

    # For each analyzer output, look up its input values (by tensor name)
    # and run the analyzer on these values.
    for idx, analyzer_output in enumerate(analyzer_outputs):
      analyzer_input_key = 'analyzer_%d_input' % idx
      analyzer_outputs_to_pcoll[analyzer_output] = (
          analysis_inputs
          | 'Extract_%d_%d' % (level, idx) >> beam.Map(
              # pylint: disable=cell-var-from-loop
              # This lint warning is prone to false positives, and it's not
              # clear why the warning is required here.
              lambda x, key=analyzer_input_key: [inst[key] for inst in x])
          | 'Analyze_%d_%d' % (level, idx) >> self._Analyze(analyzer_output))

  output_metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(outputs))
  transform_fn = (
      analyzer_outputs_to_pcoll
      | 'CreateTransformFn' >> _CreateTransformFn(
          inputs, outputs, self._output_dir))

  return transform_fn, output_metadata

def expand(self, dataset):
  """Analyze the dataset.

  Args:
    dataset: A dataset.

  Returns:
    A TransformFn containing the deferred transform function.
  """
  input_values, input_metadata = dataset
  input_schema = input_metadata.schema

  base_temp_dir = Context.create_base_temp_dir()

  class _ReplaceTensorsWithConstants(beam.PTransform):
    """Bind statistics in a deferred manner.

    This transform fills in analyzer outputs with their actual computed
    values.

    Args:
      saved_model_dir: The directory containing the SavedModel.
    """

    def __init__(self, saved_model_dir):
      # Generally the pipeline is inferred from its inputs, however we need
      # to know the pipeline for beam.Create.
      self.pipeline = input_values.pipeline
      self._saved_model_dir = saved_model_dir

    def expand(self, tensor_pcoll_mapping):
      """Converts a dict of statistics to a transform function.

      Args:
        tensor_pcoll_mapping: A dictionary mapping `Tensor`s to singleton
          `PCollection`s.

      Returns:
        A single-element PCollection containing the directory name with the
        SavedModel.
      """
      transform_fn = (
          self.pipeline
          | 'CreateTransformFn' >> beam.Create([self._saved_model_dir]))

      if not tensor_pcoll_mapping:
        return transform_fn

      # Convert tensor_pcoll_mapping into a DictPCollectionView so it can be
      # passed as a side input to the beam Map below.
      tensor_value_pairs = []
      for name, pcoll in six.iteritems(tensor_pcoll_mapping):
        tensor_value_pairs.append(
            pcoll
            | 'AddName[%s]' % name >> beam.Map(
                lambda x, name=name: (name, x)))
      tensor_value_mapping = beam.pvalue.AsDict(
          tensor_value_pairs | 'MergeTensorValuePairs' >> beam.Flatten())

      # Run a mapper that inserts statistic values into the graph.  We wrap
      # replace_tensors_with_constant_values in a wrapper that also creates
      # a temp dir.  This makes the wrapper idempotent since any retry will
      # use a different temp dir.
      def replace_tensors_with_constant_values(
          saved_model_dir, tensor_value_mapping, serialized_tf_config):

        tf_config = _maybe_deserialize_tf_config(serialized_tf_config)
        with tf.Session(config=tf_config) as session:
          temp_dir = _make_unique_temp_dir(base_temp_dir)
          input_tensors, output_tensors = (
              saved_transform_io.partially_apply_saved_transform(
                  saved_model_dir, {}, tensor_value_mapping))
          saved_transform_io.write_saved_transform_from_session(
              session, input_tensors, output_tensors, temp_dir)
        return temp_dir

      serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
          self.pipeline.runner)
      return (transform_fn
              | 'ReplaceTensorsWithConstantValues' >> beam.Map(
                  replace_tensors_with_constant_values,
                  tensor_value_mapping=tensor_value_mapping,
                  serialized_tf_config=serialized_tf_config))

  class _ComputeTensorPcollMappingUpdate(beam.PTransform):
    """Create a mapping from `Tensor`s to PCollections.

    Creates a mapping from `Tensor`s to PCollections for the outputs of the
    new analyzers.  An existing mapping will be provided as the argument to
    the expand() method.

    Args:
      saved_model_dir: Directory of the SavedModel producing analyzer inputs.
      analyzer_inputs_schema: Schema of the analyzer inputs.
      analyzers: The analyzers of the phase to run.
    """

    def __init__(self, saved_model_dir, analyzer_inputs_schema, analyzers):
      self._saved_model_dir = saved_model_dir
      self._analyzer_inputs_schema = analyzer_inputs_schema
      self._analyzers = analyzers

    def expand(self, input_values_and_tensor_pcoll_mapping):
      input_values, tensor_pcoll_mapping = (
          input_values_and_tensor_pcoll_mapping)

      # Create a transform_fn to produce inputs to new analyzers.
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              self._saved_model_dir))

      # Run the transform_fn.
      serialized_tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(
          self.pipeline.runner)
      analyzer_input_values = (
          input_values
          | 'ComputeAnalyzerInputs' >> beam.ParDo(
              _RunMetaGraphDoFn(input_schema, self._analyzer_inputs_schema,
                                serialized_tf_config),
              saved_model_dir=beam.pvalue.AsSingleton(transform_fn)))

      # For each analyzer output, look up its input values (by tensor name)
      # and run the analyzer on these values.
      tensor_pcoll_mapping_update = {}
      for idx, analyzer in enumerate(self._analyzers):
        analyzer_impl = analyzer_impls._impl_for_analyzer(  # pylint: disable=protected-access
            analyzer.spec)

        assert len(analyzer.inputs) == 1
        output_pcolls = (
            analyzer_input_values
            | 'Extract_%d' % idx >> beam.Map(
                lambda batch, key: batch[key], key=analyzer.inputs[0].name)
            | 'Analyze_%d' % idx >> analyzer_impl)
        assert len(analyzer.outputs) == len(output_pcolls)
        for tensor, pcoll in zip(analyzer.outputs, output_pcolls):
          tensor_pcoll_mapping_update[tensor.name] = pcoll
      return tensor_pcoll_mapping_update

  # NOTE: it's important that create_phases is called directly after
  # run_preprocessing_fn, because we later mutate the graph's
  # TABLE_INITIALIZERS collection which would break the logic in
  # create_phases.
  graph, inputs, outputs = impl_helper.run_preprocessing_fn(
      self._preprocessing_fn, input_schema)
  phases = impl_helper.create_phases(graph)

  # Iterate through levels, generating PCollections for columns that are the
  # outputs of `Operations` that are not `MapOperation`s.
  tensor_pcoll_mapping = {}
  table_initializers = graph.get_collection_ref(
      tf.GraphKeys.TABLE_INITIALIZERS)
  original_table_initializers = list(table_initializers)
  del table_initializers[:]

  for level, phase in enumerate(phases):
    analyzer_inputs = {}
    for analyzer in phase.analyzers:
      for input_tensor in analyzer.inputs:
        analyzer_inputs[input_tensor.name] = input_tensor
    analyzer_inputs_schema = impl_helper.infer_feature_schema(
        analyzer_inputs)
    table_initializers.extend(phase.table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, analyzer_inputs, saved_model_dir)

    tensor_pcoll_mapping_update = (
        (input_values, tensor_pcoll_mapping)
        | 'ComputeTensorPcollMappingUpdate_%d' % level >>
        _ComputeTensorPcollMappingUpdate(
            saved_model_dir, analyzer_inputs_schema, phase.analyzers))
    tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

  output_metadata = dataset_metadata.DatasetMetadata(
      schema=impl_helper.infer_feature_schema(outputs))
  del table_initializers[:]
  table_initializers.extend(original_table_initializers)
  saved_model_dir = _make_unique_temp_dir(base_temp_dir)
  _write_saved_transform(graph, inputs, outputs, saved_model_dir)
  transform_fn = (
      tensor_pcoll_mapping
      | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
          saved_model_dir))

  return transform_fn, output_metadata

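# Hedged, self-contained sketch (apache_beam only; step names and values are
# illustrative, not part of tf.Transform): the "tag each singleton, Flatten,
# AsDict" pattern the expand() implementations above use to pass deferred
# statistics as a side input to a beam.Map.
import apache_beam as beam

with beam.Pipeline() as p:
  x_min = p | 'CreateMin' >> beam.Create([1.0])  # stands in for an analyzer output
  x_max = p | 'CreateMax' >> beam.Create([9.0])  # stands in for an analyzer output
  tagged = [
      x_min | 'TagMin' >> beam.Map(lambda v: ('x_min', v)),
      x_max | 'TagMax' >> beam.Map(lambda v: ('x_max', v)),
  ]
  stats = beam.pvalue.AsDict(tagged | 'MergeStats' >> beam.Flatten())
  _ = (p
       | 'CreateData' >> beam.Create([1.0, 5.0, 9.0])
       | 'Scale' >> beam.Map(
           lambda v, s: (v - s['x_min']) / (s['x_max'] - s['x_min']), s=stats)
       | 'Print' >> beam.Map(print))  # 0.0, 0.5, 1.0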