def testWriteTransformFn(self):
  """Writes a transform fn and checks the resulting directory layout."""
  output_path = os.path.join(self.get_temp_dir(), 'output')

  with beam.Pipeline() as pipeline:
    # The source saved model dir only has to exist; it may be empty.
    source_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(source_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([source_dir]))
    # Metadata with futures, plus the PCollections that resolve them.
    future_values = {'a': pipeline | 'CreateA' >> beam.Create([3])}
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES, future_values)

    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(output_path))

  # The transformed metadata on disk must have all futures resolved.
  transformed_metadata_dir = os.path.join(
      output_path, transform_fn_io.TRANSFORMED_METADATA_DIR)
  written_metadata = metadata_io.read_metadata(transformed_metadata_dir)
  self.assertEqual(written_metadata, _TEST_METADATA)

  # The transform fn itself is written to its own subdirectory.
  transform_fn_dir = os.path.join(output_path,
                                  transform_fn_io.TRANSFORM_FN_DIR)
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))
def testWriteTransformFn(self):
  """Writes a transform fn and reads it back through TFTransformOutput."""
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')
  with beam.Pipeline() as pipeline:
    # The source saved model dir just has to exist; its contents are not
    # inspected by this test.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Pair incomplete metadata with a deferred singleton PCollection that
    # carries the complete metadata it should resolve to.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Reading through TFTransformOutput must yield fully-resolved metadata.
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  self.assertEqual(tf_transform_output.transformed_metadata,
                   test_metadata.COMPLETE_METADATA)
  saved_model_location = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(saved_model_location))
  self.assertTrue(file_io.is_directory(saved_model_location))
def testWriteTransformFn(self):
  """Writes a transform fn (metadata with futures) and reads it back."""
  transform_output_dir = os.path.join(self.get_temp_dir(), 'output')
  with beam.Pipeline() as pipeline:
    # The source saved model dir just has to exist; it may stay empty.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Metadata with futures, plus the PCollections that resolve them.
    future_values = {'a': pipeline | 'CreateA' >> beam.Create([3])}
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES, future_values)
    _ = ((saved_model_dir_pcoll, metadata)
         | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Reading through TFTransformOutput must yield fully-resolved metadata.
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  self.assertEqual(tf_transform_output.transformed_metadata, _TEST_METADATA)
  saved_model_location = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(saved_model_location))
  self.assertTrue(file_io.is_directory(saved_model_location))
def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    # When TFXIO is enabled, the incoming metadata is passed through to the
    # DoFn as the tensor adapter config instead of a schema.
    if self._use_tfxio:
      input_schema = None
      input_tensor_adapter_config = input_metadata
    else:
      input_schema = input_metadata.schema
      input_tensor_adapter_config = None

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        # Remove the excluded columns from both the immediately-available
        # metadata and the deferred (PCollection-held) metadata, so the two
        # stay consistent.
        new_metadata = _remove_columns_from_metadata(
            output_metadata.dataset_metadata, self._exclude_outputs)
        new_deferred_metadata = (
            output_metadata.deferred_metadata
            | 'RemoveColumms' >> beam.Map(_remove_columns_from_metadata,
                                          self._exclude_outputs))
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            new_metadata, new_deferred_metadata)
      else:
        output_metadata = _remove_columns_from_metadata(
            output_metadata, self._exclude_outputs)

    # tf_config may be None if the runner type has no preconfigured settings.
    tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
        type(self.pipeline.runner))
    # TFXIO inputs arrive already batched; otherwise batch instance dicts here.
    if not self._use_tfxio:
      input_values |= 'Batch' >> _BatchElements()
    output_instances = (
        input_values
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                tf_config,
                input_schema=input_schema,
                input_tensor_adapter_config=input_tensor_adapter_config,
                use_tfxio=self._use_tfxio,
                shared_graph_state_handle=shared.Shared(),
                passthrough_keys=Context.get_passthrough_keys(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(
            _convert_and_unbatch_to_instance_dicts,
            schema=output_metadata.schema,
            passthrough_keys=Context.get_passthrough_keys()))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
def testWriteMetadataDeferred(self):
  """WriteMetadata resolves deferred metadata before writing to disk."""
  output_path = self.get_temp_dir()
  with beam.Pipeline() as pipeline:
    # Pair incomplete metadata with a deferred singleton PCollection that
    # carries the complete metadata it should resolve to.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    combined = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    _ = combined | beam_metadata_io.WriteMetadata(output_path, pipeline)

  # What comes back from disk must be the complete metadata.
  self.assertEqual(metadata_io.read_metadata(output_path),
                   test_metadata.COMPLETE_METADATA)
def testWriteTransformFnIsRetryable(self):
  """WriteTransformFn retries a failed copy and cleans up the failed attempt."""
  tft.test_case.skip_if_external_environment(
      'Retries are currently not available on this environment.')
  original_copy_tree_to_unique_temp_dir = (
      transform_fn_io._copy_tree_to_unique_temp_dir)

  def mock_copy_tree_to_unique_temp_dir(source, base_temp_dir_path):
    """Mocks transform_fn_io._copy_tree to fail the first time it is called by
    this test, thus forcing a retry which should succeed."""
    # NOTE(review): the flag is module-level (global), so this mock only
    # fails once per process — the test is effectively single-use.
    global _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED
    if not _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED:
      _COPY_TREE_TO_UNIQUE_TEMP_DIR_CALLED = True
      # Perform the real copy first, then raise: this leaves behind a temp
      # directory from the failed attempt, whose cleanup is asserted below.
      original_copy_tree_to_unique_temp_dir(source, base_temp_dir_path)
      raise ArithmeticError('Some error')
    return original_copy_tree_to_unique_temp_dir(source, base_temp_dir_path)

  with self._makeTestPipeline() as pipeline:
    transform_output_dir = os.path.join(self.get_temp_dir(), 'output')
    # Create an empty directory for the source saved model dir.
    saved_model_dir = os.path.join(self.get_temp_dir(), 'source')
    file_io.recursive_create_dir(saved_model_dir)
    # Give the saved model dir some content so the copy is observable.
    saved_model_path = os.path.join(saved_model_dir, 'saved_model')
    with file_io.FileIO(saved_model_path, mode='w') as f:
      f.write('some content')
    saved_model_dir_pcoll = (
        pipeline | 'CreateSavedModelDir' >> beam.Create([saved_model_dir]))
    # Combine test metadata with a dict of PCollections resolving futures.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata)
    # Patch the copy helper for the duration of the pipeline construction and
    # run, so the first write attempt fails and is retried.
    with mock.patch.object(transform_fn_io, '_copy_tree_to_unique_temp_dir',
                           mock_copy_tree_to_unique_temp_dir):
      _ = ((saved_model_dir_pcoll, metadata)
           | transform_fn_io.WriteTransformFn(transform_output_dir))

  # Test reading with TFTransformOutput
  tf_transform_output = tft.TFTransformOutput(transform_output_dir)
  metadata = tf_transform_output.transformed_metadata
  self.assertEqual(metadata, test_metadata.COMPLETE_METADATA)

  transform_fn_dir = tf_transform_output.transform_savedmodel_dir
  self.assertTrue(file_io.file_exists(transform_fn_dir))
  self.assertTrue(file_io.is_directory(transform_fn_dir))

  # Check temp directory created by failed run was cleaned up.
  self.assertEqual(2, len(file_io.list_directory(transform_output_dir)))
def testWriteMetadataDeferredProperties(self):
  """Deferred (complete) metadata is what ends up written to disk."""
  output_path = self.get_temp_dir()
  with beam.Pipeline() as pipeline:
    # _TEST_METADATA is partial; the deferred singleton carries the complete
    # version that WriteMetadata should persist.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [_TEST_METADATA_COMPLETE])
    combined = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA, deferred_metadata)
    _ = combined | beam_metadata_io.WriteMetadata(output_path, pipeline)

  # Round-trip from disk and compare against the complete metadata.
  self.assertMetadataEqual(
      metadata_io.read_metadata(output_path), _TEST_METADATA_COMPLETE)
def testWriteMetadataDeferredProperties(self):
  """Futures in the metadata are resolved from PCollections when writing."""
  output_path = self.get_temp_dir()
  with beam.Pipeline() as pipeline:
    # Each future name maps to a singleton PCollection holding its value.
    future_values = {
        'a': pipeline | 'CreateA' >> beam.Create([3]),
        'b': pipeline | 'CreateB' >> beam.Create([5]),
    }
    metadata = beam_metadata_io.BeamDatasetMetadata(
        _TEST_METADATA_WITH_FUTURES, future_values)
    _ = metadata | beam_metadata_io.WriteMetadata(output_path, pipeline)

  # Round-trip from disk and compare against the resolved metadata.
  self.assertMetadataEqual(
      metadata_io.read_metadata(output_path), _TEST_METADATA)
def testWriteMetadataDeferred(self):
  """WriteMetadata persists resolved metadata plus the asset map."""
  expected_asset_map = {'key': 'value'}
  output_path = self.get_temp_dir()
  with beam.Pipeline() as pipeline:
    # Pair incomplete metadata with the deferred complete metadata and an
    # asset map that should be written alongside it.
    deferred_metadata = pipeline | 'CreateDeferredMetadata' >> beam.Create(
        [test_metadata.COMPLETE_METADATA])
    metadata = beam_metadata_io.BeamDatasetMetadata(
        test_metadata.INCOMPLETE_METADATA, deferred_metadata,
        expected_asset_map)
    _ = metadata | beam_metadata_io.WriteMetadata(output_path, pipeline)

  # The metadata round-trips fully resolved.
  self.assertEqual(metadata_io.read_metadata(output_path),
                   test_metadata.COMPLETE_METADATA)
  # The asset map is serialized as JSON next to the metadata.
  asset_map_path = os.path.join(output_path,
                                output_wrapper.TFTransformOutput.ASSET_MAP)
  with tf.io.gfile.GFile(asset_map_path) as f:
    self.assertDictEqual(json.loads(f.read()), expected_asset_map)
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    flattened_pcoll, input_values_pcoll_dict, input_metadata = dataset
    input_schema = input_metadata.schema
    input_values_pcoll_dict = input_values_pcoll_dict or dict()
    analyzer_cache.validate_dataset_keys(input_values_pcoll_dict.keys())

    # Trace the preprocessing_fn in a fresh graph; the graph is later walked
    # by analysis_graph_builder to derive the beam analysis plan.
    with tf.Graph().as_default() as graph:

      with tf.name_scope('inputs'):
        feature_spec = input_schema.as_feature_spec()
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead.  A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

    pipeline = flattened_pcoll.pipeline
    # serialized_tf_config may be None if the runner has no default config.
    serialized_tf_config = common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
        pipeline.runner)
    extra_args = common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        serialized_tf_config=serialized_tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        cache_location=self._cache_location)

    # Build the deferred analysis plan, then materialize it into beam
    # PTransforms by traversing the plan's value nodes.
    transform_fn_future = analysis_graph_builder.build(
        graph, input_signature, output_signature,
        input_values_pcoll_dict.keys(), self._cache_location)
    transform_fn_pcoll = nodes.Traverser(
        common.ConstructBeamPipelineVisitor(extra_args)).visit_value_node(
            transform_fn_future)

    # Infer metadata. We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph. The override tensors must
    # be "constant" in that they don't depend on input data. The tensors can
    # depend on analyzer outputs though. This allows us to set metadata that
    # depends on analyzer outputs. _infer_metadata_from_saved_model will use
    # the analyzer outputs stored in `transform_fn` to compute the metadata in
    # a deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        | 'ComputeDeferredMetadata' >> beam.Map(
            _infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return transform_fn_pcoll, full_metadata
def expand(self, dataset_and_transform_fn):
    """Transforms the dataset using the transform_fn.

    Args:
      dataset_and_transform_fn: A tuple of dataset and preprocessing function.

    Returns:
      A dataset transformed according to the transform_fn.
    """
    (input_values, input_metadata), (transform_fn, output_metadata) = (
        dataset_and_transform_fn)

    # If exclude_outputs is set, update the output metadata.
    if self._exclude_outputs is not None:
      if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata):
        # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections
        # dict (BeamDatasetMetadata is iterable as a 2-tuple).
        output_metadata, pcollections = output_metadata
        schema = output_metadata.schema
        # Update DatasetMetadata to remove excluded outputs
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))
        # Update pcollections to keep only pcollections that resolve futures
        # in the updated metadata.
        unresolved_future_names = set(
            future.name for future in output_metadata.substitute_futures({}))
        pcollections = {
            name: pcollection
            for name, pcollection in six.iteritems(pcollections)
            if name in unresolved_future_names
        }
        # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata
        output_metadata = beam_metadata_io.BeamDatasetMetadata(
            output_metadata, pcollections)
      else:
        # Plain DatasetMetadata: only the schema needs filtering.
        schema = output_metadata.schema
        output_metadata = dataset_metadata.DatasetMetadata(
            schema=dataset_schema.Schema({
                key: column_schema
                for key, column_schema in six.iteritems(schema.column_schemas)
                if key not in self._exclude_outputs
            }))

    # serialized_tf_config may be None if the runner has no default config.
    serialized_tf_config = (
        common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            self.pipeline.runner))
    output_instances = (
        input_values
        | 'Batch' >> _BatchElements()
        | 'Transform' >> beam.ParDo(
            _RunMetaGraphDoFn(
                input_metadata.schema,
                serialized_tf_config,
                shared_graph_state_handle=shared.Shared(),
                passthrough_keys=Context.get_passthrough_keys(),
                exclude_outputs=self._exclude_outputs),
            saved_model_dir=beam.pvalue.AsSingleton(transform_fn))
        | 'ConvertAndUnbatch' >> beam.FlatMap(
            _convert_and_unbatch_to_instance_dicts,
            schema=output_metadata.schema,
            passthrough_keys=Context.get_passthrough_keys()))

    _clear_shared_state_after_barrier(self.pipeline, output_instances)

    return (output_instances, output_metadata)
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

      with tf.name_scope('inputs'):
        inputs = input_schema.as_batched_placeholders()
      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

      # At this point we check that the preprocessing_fn has at least one
      # output. This is because if we allowed the output of preprocessing_fn
      # to be empty, we wouldn't be able to determine how many instances to
      # "unbatch" the output into.
      if not outputs:
        raise ValueError('The preprocessing function returned an empty dict')

      if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(
                graph.get_collection_ref(tf.GraphKeys.TRAINABLE_VARIABLES)))

      # NOTE: it's important that create_phases is called directly after
      # preprocessing_fn, because we later mutate the graph's
      # TABLE_INITIALIZERS collection which would break the logic in
      # create_phases.
      phases = impl_helper.create_phases()

      # Iterate through levels. tensor_pcoll_mapping is a mapping from tensor
      # names to singleton PCollections containing a _TensorValue. We compute
      # tensor_pcoll_mapping in phases, where at each phase we compute the
      # analyzers that are ready to run and update tensor_pcoll_mapping.
      tensor_pcoll_mapping = {}
      # Temporarily empty the TABLE_INITIALIZERS collection so each phase's
      # SavedModel contains only its own table initializers; the originals are
      # restored after the loop.
      table_initializers = graph.get_collection_ref(
          tf.GraphKeys.TABLE_INITIALIZERS)
      original_table_initializers = list(table_initializers)
      del table_initializers[:]

      # serialized_tf_config may be None if the runner has no default config.
      serialized_tf_config = (
          common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
              input_values.pipeline.runner))
      for level, phase in enumerate(phases):
        # Create a SavedModel that describes the mapping from the input data
        # to the inputs of the analyzers at this level. The colum names of the
        # outputs are the tensor names of the analyzer inputs in the graph.
        # This graph has the anaylzer outputs computed so far replaced with
        # constants.
        analyzer_inputs = {}
        for analyzer in phase.analyzers:
          for input_tensor in analyzer.inputs:
            analyzer_inputs[input_tensor.name] = input_tensor
        table_initializers.extend(phase.table_initializers)
        unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, analyzer_inputs,
                               unbound_saved_model_dir)
        saved_model_dir = (
            tensor_pcoll_mapping
            | 'CreateSavedModelForAnalyzerInputs[%d]' % level >>
            _ReplaceTensorsWithConstants(unbound_saved_model_dir,
                                         base_temp_dir,
                                         input_values.pipeline))

        # Run this saved model on the input dataset to obtain the inputs to
        # the analyzers.
        analyzer_input_values = (
            input_values
            | 'BatchAnalyzerInputs[%d]' % level >> _BatchElements()
            | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
                _RunMetaGraphDoFn(
                    input_schema,
                    serialized_tf_config,
                    shared_graph_state_handle=shared.Shared(),
                    passthrough_keys=Context.get_passthrough_keys()),
                saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

        # Compute the analyzers from their inputs. `analyzer_outputs_dict` is
        # a map from tensor names to singleton PCollections of
        # `_TensorValue`s.
        analyzer_outputs_dict = (
            analyzer_input_values
            | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
                phase.analyzers, base_temp_dir))

        # Update the mapping for all analyzers.
        tensor_pcoll_mapping.update(analyzer_outputs_dict)

      # Restore the original table initializers before writing the final
      # transform SavedModel.
      del table_initializers[:]
      table_initializers.extend(original_table_initializers)
      saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, outputs, saved_model_dir)
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              saved_model_dir, base_temp_dir, input_values.pipeline))

      # Infer metadata. The metadata may contain Futures that refer to the
      # values of tensors in the graph. In that case, the tensors must be
      # "constant" in that they don't depend on input data. The tensors can
      # depend on analyzer outputs though. This allows us to set metadata that
      # depends on analyzer outputs.
      #
      # We first extract the names of the tensors that are referenced by the
      # Futures, and then compute them by calling _ComputeScalarConstants with
      # the tensor-PCollection mapping representing the analyzer outputs.
      metadata = dataset_metadata.DatasetMetadata(
          schema=impl_helper.infer_feature_schema(outputs))
      deferred_metadata_tensor_names = {
          future.name
          for column_schema in metadata.schema.column_schemas.values()
          for future in column_schema.substitute_futures({})
      }
      name_pcoll_dict = (
          tensor_pcoll_mapping
          | 'ComputeTensorValues' >>
          _ComputeTensorValues(deferred_metadata_tensor_names,
                               saved_model_dir, input_values.pipeline))
      full_metadata = beam_metadata_io.BeamDatasetMetadata(
          metadata, name_pcoll_dict)

      _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

      return transform_fn, full_metadata
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    # NOTE: it's important that create_phases is called directly after
    # run_preprocessing_fn, because we later mutate the graph's
    # TABLE_INITIALIZERS collection which would break the logic in
    # create_phases.
    graph, inputs, outputs = impl_helper.run_preprocessing_fn(
        self._preprocessing_fn, input_schema)
    phases = impl_helper.create_phases(graph)

    # Iterate through levels. tensor_pcoll_mapping is a mapping from tensor
    # names to singleton PCollections containing a _TensorValue. We compute
    # tensor_pcoll_mapping in phases, where at each phase we compute the
    # analyzers that are ready to run and update tensor_pcoll_mapping.
    tensor_pcoll_mapping = {}
    # Temporarily empty the TABLE_INITIALIZERS collection so each phase's
    # SavedModel contains only its own table initializers; the originals are
    # restored after the loop.
    table_initializers = graph.get_collection_ref(
        tf.GraphKeys.TABLE_INITIALIZERS)
    original_table_initializers = list(table_initializers)
    del table_initializers[:]

    # serialized_tf_config may be None if the runner has no default config.
    serialized_tf_config = (
        analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
            input_values.pipeline.runner))
    for level, phase in enumerate(phases):
      # Create a SavedModel that describes the mapping from the input data
      # to the inputs of the analyzers at this level. The colum names of the
      # outputs are the tensor names of the analyzer inputs in the graph. This
      # graph has the anaylzer outputs computed so far replaced with
      # constants.
      analyzer_inputs = {}
      for analyzer in phase.analyzers:
        for input_tensor in analyzer.inputs:
          analyzer_inputs[input_tensor.name] = input_tensor
      table_initializers.extend(phase.table_initializers)
      unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(
          graph, inputs, analyzer_inputs, unbound_saved_model_dir)
      saved_model_dir = (
          tensor_pcoll_mapping
          | 'CreateSavedModelForAnaylzerInputs[%d]' % level >>
          _ReplaceTensorsWithConstants(
              unbound_saved_model_dir, base_temp_dir, input_values.pipeline))

      # Run this saved model on the input dataset to obtain the inputs to the
      # analyzers.
      analyzer_input_values = (
          input_values
          | 'ComputeAnalyzerInputs[%d]' % level >> beam.ParDo(
              _RunMetaGraphDoFn(
                  input_schema,
                  serialized_tf_config,
                  shared_graph_state_handle=shared.Shared()),
              saved_model_dir=beam.pvalue.AsSingleton(saved_model_dir)))

      # Compute the analyzers from their inputs. `analyzer_outputs_dict` is a
      # map from tensor names to singleton PCollections of `_TensorValue`s.
      analyzer_outputs_dict = (
          analyzer_input_values
          | 'ComputeAnalyzerOutputs[%d]' % level >> _ComputeAnalyzerOutputs(
              phase.analyzers, base_temp_dir))

      # Update the mapping for all analyzers.
      tensor_pcoll_mapping.update(analyzer_outputs_dict)

    # Restore the original table initializers before writing the final
    # transform SavedModel.
    del table_initializers[:]
    table_initializers.extend(original_table_initializers)
    saved_model_dir = _make_unique_temp_dir(base_temp_dir)
    _write_saved_transform(graph, inputs, outputs, saved_model_dir)
    transform_fn = (
        tensor_pcoll_mapping
        | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
            saved_model_dir, base_temp_dir, input_values.pipeline))

    # Infer metadata. The metadata may contain Futures that refer to the
    # values of tensors in the graph. In that case, the tensors must be
    # "constant" in that they don't depend on input data. The tensors can
    # depend on analyzer outputs though. This allows us to set metadata that
    # depends on analyzer outputs.
    #
    # We first extract the names of the tensors that are referenced by the
    # Futures, and then compute them by calling _ComputeScalarConstants with
    # the tensor-PCollection mapping representing the analyzer outputs.
    metadata = dataset_metadata.DatasetMetadata(
        schema=impl_helper.infer_feature_schema(graph, outputs))
    deferred_metadata_tensor_names = [
        future.name
        for column_schema in tft_api.get_column_schemas(graph).values()
        for future in column_schema.substitute_futures({})
    ]
    name_pcoll_dict = (
        tensor_pcoll_mapping
        | 'ComputeTensorValues' >> _ComputeTensorValues(
            deferred_metadata_tensor_names, saved_model_dir,
            input_values.pipeline))
    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, name_pcoll_dict)

    _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

    return transform_fn, full_metadata
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    (flattened_pcoll, input_values_pcoll_dict, dataset_cache_dict,
     input_metadata) = dataset
    # When using TFXIO, input_metadata is passed through as the tensor
    # adapter config rather than a schema.
    if self._use_tfxio:
      input_schema = None
      input_tensor_adapter_config = input_metadata
    else:
      input_schema = input_metadata.schema
      input_tensor_adapter_config = None

    input_values_pcoll_dict = input_values_pcoll_dict or dict()

    # Trace the preprocessing_fn in a fresh graph; the graph is later walked
    # by analysis_graph_builder to derive the beam analysis plan.
    with tf.compat.v1.Graph().as_default() as graph:

      with tf.compat.v1.name_scope('inputs'):
        if self._use_tfxio:
          specs = TensorAdapter(
              input_tensor_adapter_config).OriginalTypeSpecs()
        else:
          specs = schema_utils.schema_as_feature_spec(
              input_schema).feature_spec
        input_signature = impl_helper.batched_placeholders_from_specs(specs)
      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      # TODO(b/34288791): Remove this workaround and use a shallow copy of
      # inputs instead.  A shallow copy is needed in case
      # self._preprocessing_fn mutates its input.
      copied_inputs = impl_helper.copy_tensors(input_signature)

      output_signature = self._preprocessing_fn(copied_inputs)

    # At this point we check that the preprocessing_fn has at least one
    # output. This is because if we allowed the output of preprocessing_fn to
    # be empty, we wouldn't be able to determine how many instances to
    # "unbatch" the output into.
    if not output_signature:
      raise ValueError('The preprocessing function returned an empty dict')

    if graph.get_collection(tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES):
      raise ValueError(
          'The preprocessing function contained trainable variables '
          '{}'.format(
              graph.get_collection_ref(
                  tf.compat.v1.GraphKeys.TRAINABLE_VARIABLES)))

    # Fall back through the available inputs to locate the beam pipeline:
    # an explicit pipeline, then the flattened PCollection, then any
    # non-None per-dataset PCollection.
    pipeline = self.pipeline or (flattened_pcoll or next(
        v for v in input_values_pcoll_dict.values() if v is not None)).pipeline

    # Add a stage that inspects graph collections for API use counts and logs
    # them as a beam metric.
    _ = (pipeline | 'InstrumentAPI' >> _InstrumentAPI(graph))

    # tf_config may be None if the runner type has no preconfigured settings.
    tf_config = _DEFAULT_TENSORFLOW_CONFIG_BY_BEAM_RUNNER_TYPE.get(
        type(pipeline.runner))
    extra_args = beam_common.ConstructBeamPipelineVisitor.ExtraArgs(
        base_temp_dir=Context.create_base_temp_dir(),
        tf_config=tf_config,
        pipeline=pipeline,
        flat_pcollection=flattened_pcoll,
        pcollection_dict=input_values_pcoll_dict,
        graph=graph,
        input_signature=input_signature,
        input_schema=input_schema,
        input_tensor_adapter_config=input_tensor_adapter_config,
        use_tfxio=self._use_tfxio,
        cache_pcoll_dict=dataset_cache_dict)

    # Build the deferred analysis plan (and any cache-output nodes), then
    # materialize it into beam PTransforms by traversing the value nodes.
    transform_fn_future, cache_value_nodes = analysis_graph_builder.build(
        graph,
        input_signature,
        output_signature,
        input_values_pcoll_dict.keys(),
        cache_dict=dataset_cache_dict)

    traverser = nodes.Traverser(
        beam_common.ConstructBeamPipelineVisitor(extra_args))
    transform_fn_pcoll = traverser.visit_value_node(transform_fn_future)

    if cache_value_nodes is not None:
      output_cache_pcoll_dict = {}
      for (dataset_key,
           cache_key), value_node in six.iteritems(cache_value_nodes):
        if dataset_key not in output_cache_pcoll_dict:
          output_cache_pcoll_dict[dataset_key] = {}
        output_cache_pcoll_dict[dataset_key][cache_key] = (
            traverser.visit_value_node(value_node))
    else:
      output_cache_pcoll_dict = None

    # Infer metadata. We take the inferred metadata and apply overrides that
    # refer to values of tensors in the graph. The override tensors must
    # be "constant" in that they don't depend on input data. The tensors can
    # depend on analyzer outputs though. This allows us to set metadata that
    # depends on analyzer outputs. _infer_metadata_from_saved_model will use
    # the analyzer outputs stored in `transform_fn` to compute the metadata in
    # a deferred manner, once the analyzer outputs are known.
    metadata = dataset_metadata.DatasetMetadata(
        schema=schema_inference.infer_feature_schema(output_signature, graph))

    deferred_metadata = (
        transform_fn_pcoll
        | 'ComputeDeferredMetadata' >> beam.Map(
            _infer_metadata_from_saved_model))

    full_metadata = beam_metadata_io.BeamDatasetMetadata(
        metadata, deferred_metadata)

    _clear_shared_state_after_barrier(pipeline, transform_fn_pcoll)

    return (transform_fn_pcoll, full_metadata), output_cache_pcoll_dict
def expand(self, dataset):
    """Analyze the dataset.

    Args:
      dataset: A dataset.

    Returns:
      A TransformFn containing the deferred transform function.

    Raises:
      ValueError: If preprocessing_fn has no outputs.
    """
    input_values, input_metadata = dataset
    input_schema = input_metadata.schema

    base_temp_dir = Context.create_base_temp_dir()

    graph = tf.Graph()
    with graph.as_default():

      with tf.name_scope('inputs'):
        inputs = input_schema.as_batched_placeholders()
      # In order to avoid a bug where import_graph_def fails when the
      # input_map and return_elements of an imported graph are the same
      # (b/34288791), we avoid using the placeholder of an input column as an
      # output of a graph. We do this by applying tf.identity to all inputs of
      # the preprocessing_fn. Note this applies at the level of raw tensors.
      outputs = self._preprocessing_fn(impl_helper.copy_tensors(inputs))

      # At this point we check that the preprocessing_fn has at least one
      # output. This is because if we allowed the output of preprocessing_fn
      # to be empty, we wouldn't be able to determine how many instances to
      # "unbatch" the output into.
      if not outputs:
        raise ValueError(
            'The preprocessing function returned an empty dict')

      if graph.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
        raise ValueError(
            'The preprocessing function contained trainable variables '
            '{}'.format(
                graph.get_collection_ref(
                    tf.GraphKeys.TRAINABLE_VARIABLES)))

      # NOTE: it's important that create_phases is called directly after
      # preprocessing_fn, because we later mutate the graph's
      # TABLE_INITIALIZERS collection which would break the logic in
      # create_phases.
      phases = impl_helper.create_phases()

      # Iterate through levels. tensor_pcoll_mapping is a mapping from tensor
      # names to singleton PCollections containing a _TensorValue. We compute
      # tensor_pcoll_mapping in phases, where at each phase we compute the
      # analyzers that are ready to run and update tensor_pcoll_mapping.
      tensor_pcoll_mapping = {}
      # Temporarily empty the TABLE_INITIALIZERS collection so each phase's
      # SavedModel contains only its own table initializers; the originals
      # are restored after the loop.
      table_initializers = graph.get_collection_ref(
          tf.GraphKeys.TABLE_INITIALIZERS)
      original_table_initializers = list(table_initializers)
      del table_initializers[:]

      # serialized_tf_config may be None if the runner has no default config.
      serialized_tf_config = (
          common._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get(  # pylint: disable=protected-access
              input_values.pipeline.runner))
      for level, phase in enumerate(phases):
        # Create a SavedModel that describes the mapping from the input data
        # to the inputs of the analyzers at this level. The colum names of the
        # outputs are the tensor names of the analyzer inputs in the graph.
        # This graph has the anaylzer outputs computed so far replaced with
        # constants.
        analyzer_inputs = {}
        for analyzer in phase.analyzer_infos:
          for input_tensor_name in analyzer.input_tensor_names:
            analyzer_inputs[input_tensor_name] = graph.get_tensor_by_name(
                input_tensor_name)
        table_initializers.extend(phase.table_initializers)
        unbound_saved_model_dir = _make_unique_temp_dir(base_temp_dir)
        _write_saved_transform(graph, inputs, analyzer_inputs,
                               unbound_saved_model_dir)

        tensor_pcoll_mapping_update = (
            (input_values, tensor_pcoll_mapping)
            | 'RunPhase[{}]'.format(level) >> _RunPhase(
                phase.analyzer_infos, unbound_saved_model_dir, base_temp_dir,
                input_schema, serialized_tf_config))

        # Update the mapping for all analyzers.
        tensor_pcoll_mapping.update(tensor_pcoll_mapping_update)

      # Restore the original table initializers before writing the final
      # transform SavedModel.
      del table_initializers[:]
      table_initializers.extend(original_table_initializers)
      saved_model_dir = _make_unique_temp_dir(base_temp_dir)
      _write_saved_transform(graph, inputs, outputs, saved_model_dir)
      transform_fn = (
          tensor_pcoll_mapping
          | 'ReplaceTensorsWithConstants' >> _ReplaceTensorsWithConstants(
              saved_model_dir, base_temp_dir, input_values.pipeline))

      # Infer metadata. We take the inferred metadata and apply overrides that
      # refer to values of tensors in the graph. The override tensors must
      # be "constant" in that they don't depend on input data. The tensors
      # can depend on analyzer outputs though. This allows us to set metadata
      # that depends on analyzer outputs. _augment_metadata will use the
      # analyzer outputs stored in `transform_fn` to compute the metadata in
      # a deferred manner, once the analyzer outputs are known.
      metadata = dataset_metadata.DatasetMetadata(
          schema=impl_helper.infer_feature_schema(outputs))

      deferred_metadata = (
          transform_fn
          | 'ComputeDeferredMetadata' >> beam.Map(
              _augment_metadata, metadata))

      full_metadata = beam_metadata_io.BeamDatasetMetadata(
          metadata, deferred_metadata)

      _clear_shared_state_after_barrier(input_values.pipeline, transform_fn)

      return transform_fn, full_metadata