def test_perform_combiner_packing_optimization(
        self, feature_spec, preprocessing_fn, num_phases,
        expected_dot_graph_str_before_packing,
        expected_dot_graph_str_after_packing):
    """Compares the analysis graph's DOT rendering before and after packing.

    The graph is built while `perform_combiner_packing_optimization` is
    replaced by a pass-through; the real optimization is then applied to that
    result. Both renderings are compared with the expected strings.

    Args:
      feature_spec: Feature spec used to trace `preprocessing_fn`.
      preprocessing_fn: The preprocessing function under test.
      num_phases: Forwarded to the packing optimization.
      expected_dot_graph_str_before_packing: Expected DOT text before packing.
      expected_dot_graph_str_after_packing: Expected DOT text after packing.
    """
    traced = impl_helper.trace_preprocessing_function(
        preprocessing_fn, feature_spec, use_tf_compat_v1=True)
    graph, structured_inputs, structured_outputs = traced

    def _return_inputs_unchanged(saved_model_future, cache_value_nodes,
                                 unused_num_phases):
        # Pass-through stand-in for the packing optimization.
        return (saved_model_future, cache_value_nodes)

    packing_patch = mock.patch.object(
        combiner_packing_util,
        'perform_combiner_packing_optimization',
        side_effect=_return_inputs_unchanged)
    with packing_patch:
        transform_fn_future_before, unused_cache = analysis_graph_builder.build(
            graph, structured_inputs, structured_outputs)

    # The patch is no longer active here, so this is the real optimization.
    packed = combiner_packing_util.perform_combiner_packing_optimization(
        transform_fn_future_before, unused_cache, num_phases)
    transform_fn_future_after, unused_cache = packed

    dot_string_before = nodes.get_dot_graph(
        [transform_fn_future_before]).to_string()
    self.assertMultiLineEqual(
        dot_string_before,
        expected_dot_graph_str_before_packing,
        msg='Result dot graph is:\n{}'.format(dot_string_before))

    dot_string_after = nodes.get_dot_graph(
        [transform_fn_future_after]).to_string()
    self.WriteRenderedDotFile(dot_string_after)
    self.assertMultiLineEqual(
        dot_string_after,
        expected_dot_graph_str_after_packing,
        msg='Result dot graph is:\n{}'.format(dot_string_after))
def test_perform_combiner_packing_optimization(
        self, feature_spec, preprocessing_fn, num_phases,
        expected_dot_graph_str_before_packing,
        expected_dot_graph_str_after_packing):
    """Compares the analysis graph's DOT rendering before and after packing.

    Placeholders are created from `feature_spec` inside a fresh graph, the
    analysis graph is built with the packing optimization patched to a
    pass-through, and the real optimization is then applied to that result.

    Args:
      feature_spec: Feature spec used to create the batched placeholders.
      preprocessing_fn: The preprocessing function under test.
      num_phases: Forwarded to the packing optimization.
      expected_dot_graph_str_before_packing: Expected DOT text before packing.
      expected_dot_graph_str_after_packing: Expected DOT text after packing.
    """

    def _return_inputs_unchanged(saved_model_future, cache_value_nodes,
                                 unused_num_phases):
        # Pass-through stand-in for the packing optimization.
        return (saved_model_future, cache_value_nodes)

    with tf.compat.v1.Graph().as_default() as graph:
        with tf.compat.v1.name_scope('inputs'):
            input_signature = impl_helper.feature_spec_as_batched_placeholders(
                feature_spec)
        output_signature = preprocessing_fn(input_signature)

        packing_patch = mock.patch.object(
            combiner_packing_util,
            'perform_combiner_packing_optimization',
            side_effect=_return_inputs_unchanged)
        with packing_patch:
            transform_fn_future_before, unused_cache = (
                analysis_graph_builder.build(
                    graph, input_signature, output_signature))

        # The patch is no longer active here, so this is the real optimization.
        packed = combiner_packing_util.perform_combiner_packing_optimization(
            transform_fn_future_before, unused_cache, num_phases)
        transform_fn_future_after, unused_cache = packed

    dot_string_before = nodes.get_dot_graph(
        [transform_fn_future_before]).to_string()
    self.assertMultiLineEqual(
        dot_string_before,
        expected_dot_graph_str_before_packing,
        msg='Result dot graph is:\n{}'.format(dot_string_before))

    dot_string_after = nodes.get_dot_graph(
        [transform_fn_future_after]).to_string()
    self.WriteRenderedDotFile(dot_string_after)
    self.assertMultiLineEqual(
        dot_string_after,
        expected_dot_graph_str_after_packing,
        msg='Result dot graph is:\n{}'.format(dot_string_after))
def test_get_analysis_cache_entry_keys(self, use_tf_compat_v1):
    """Checks that the cache entry keys returned are the mocked key.

    `make_cache_entry_key` is patched to always return one fixed key, so the
    result of `get_analysis_cache_entry_keys` must contain exactly that key.

    Args:
      use_tf_compat_v1: Whether to force the TF1-compatible code path; when
        False the test is skipped unless TF 2.x is available.
    """
    if not use_tf_compat_v1:
        test_case.skip_if_not_tf2('Tensorflow 2.x required')

    def preprocessing_fn(inputs):
        return {'x': tft.scale_to_0_1(inputs['x'])}

    full_dataset_keys = ['a', 'b']
    mocked_cache_entry_key = 'A'
    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}

    # The non-compat path takes type specs rather than a raw feature spec.
    if use_tf_compat_v1:
        specs = feature_spec
    else:
        specs = impl_helper.get_type_specs_from_feature_specs(feature_spec)

    key_patch = mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        side_effect=lambda _: mocked_cache_entry_key)
    with key_patch:
        cache_entry_keys = analysis_graph_builder.get_analysis_cache_entry_keys(
            preprocessing_fn,
            specs,
            full_dataset_keys,
            force_tf_compat_v1=use_tf_compat_v1)

    analysis_graph = analysis_graph_builder._ANALYSIS_GRAPH
    self.WriteRenderedDotFile(nodes.get_dot_graph([analysis_graph]).to_string())
    self.assertCountEqual(cache_entry_keys, [mocked_cache_entry_key])
# Builds two `_Constant` nodes ('a' and 'b'), feeds them through a `_Swap`
# operation labelled 'Swap[0]', feeds that operation's two outputs (in swapped
# order) through a second `_Swap` labelled 'Swap[1]', then renders the graph
# reachable from the first output of 'Swap[1]' as DOT text and compares it with
# the expected literal.
# NOTE(review): the expected DOT text lives in a triple-quoted string, so its
# exact whitespace is part of the assertion; restore the original line breaks
# before reformatting this method.
def testGetDotGraph(self): a = nodes.apply_operation(_Constant, value='a', label='Constant[a]') b = nodes.apply_operation(_Constant, value='b', label='Constant[b]') b_copy, a_copy = nodes.apply_multi_output_operation(_Swap, a, b, label='Swap[0]') b_copy2, unused_a_copy2 = nodes.apply_multi_output_operation( _Swap, a_copy, b_copy, label='Swap[1]') dot_string = nodes.get_dot_graph([b_copy2]).to_string() self.WriteRenderedDotFile(dot_string) self.assertMultiLineEqual( dot_string, """\ digraph G { directed=True; node [shape=Mrecord]; "Constant[a]" [label="{_Constant|value: a|label: Constant[a]}"]; "Constant[b]" [label="{_Constant|value: b|label: Constant[b]}"]; "Swap[0]" [label="{_Swap|label: Swap[0]|{<0>0|<1>1}}"]; "Constant[a]" -> "Swap[0]"; "Constant[b]" -> "Swap[0]"; "Swap[1]" [label="{_Swap|label: Swap[1]|{<0>0|<1>1}}"]; "Swap[0]":1 -> "Swap[1]"; "Swap[0]":0 -> "Swap[1]"; } """, msg='Result dot graph is:\n{}'.format(dot_string))
def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                            dataset_input_cache_dict, expected_dot_graph_str):
    """Checks the analysis graph built for two spans with optional input cache.

    When `dataset_input_cache_dict` is given it is registered as the cache of
    the first span only. The DOT lines of the resulting graph (the transform
    function plus all cache outputs) are compared, order-insensitively, with
    the expected lines.

    Args:
      feature_spec: Feature spec used to create the batched placeholders.
      preprocessing_fn: The preprocessing function under test.
      dataset_input_cache_dict: Cache entries for 'span-0', or None.
      expected_dot_graph_str: Expected DOT text.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    cache = {}
    if dataset_input_cache_dict is not None:
        cache[span_0_key] = dataset_input_cache_dict

    with tf.compat.v1.name_scope('inputs'):
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
    output_signature = preprocessing_fn(input_signature)

    build_result = analysis_graph_builder.build(
        tf.compat.v1.get_default_graph(),
        input_signature,
        output_signature,
        {span_0_key, span_1_key},
        cache)
    transform_fn_future, cache_output_dict = build_result

    # Cache outputs are ordered by their string form before rendering.
    leaf_nodes = [transform_fn_future]
    leaf_nodes.extend(sorted(cache_output_dict.values(), key=str))

    dot_string = nodes.get_dot_graph(leaf_nodes).to_string()
    self.WriteRenderedDotFile(dot_string)

    actual_lines = dot_string.split('\n')
    expected_lines = expected_dot_graph_str.split('\n')
    self.assertSameElements(
        actual_lines,
        expected_lines,
        msg='Result dot graph is:\n{}'.format(dot_string))
def test_get_analysis_dataset_keys(self, preprocessing_fn, full_dataset_keys,
                                   cached_dataset_keys, expected_dataset_keys,
                                   expected_flat_data_required):
    """Checks which dataset keys still need analysis given an input cache.

    Args:
      preprocessing_fn: The preprocessing function under test.
      full_dataset_keys: All dataset keys available.
      cached_dataset_keys: Dataset keys that get an entry in the input cache.
      expected_dataset_keys: Dataset keys expected to be returned.
      expected_flat_data_required: Expected value of the second return value.
    """
    mocked_cache_entry_key = b'M'

    # Every dataset key with an entry in the cache dict is given the mocked
    # key, which is also what `make_cache_entry_key` is patched to return.
    input_cache = {}
    for key in cached_dataset_keys:
        input_cache[key] = {mocked_cache_entry_key: 'C'}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}

    key_patch = mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        return_value=mocked_cache_entry_key)
    with key_patch:
        result = analysis_graph_builder.get_analysis_dataset_keys(
            preprocessing_fn, feature_spec, full_dataset_keys, input_cache)
        dataset_keys, flat_data_required = result

    analysis_graph = analysis_graph_builder._ANALYSIS_GRAPH
    self.WriteRenderedDotFile(nodes.get_dot_graph([analysis_graph]).to_string())

    self.assertCountEqual(expected_dataset_keys, dataset_keys)
    self.assertEqual(expected_flat_data_required, flat_data_required)
def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
    """Builds the analysis graph for a traced function and checks its DOT text.

    Args:
      feature_spec: Feature spec used to trace `preprocessing_fn`.
      preprocessing_fn: The preprocessing function under test.
      expected_dot_graph_str: Expected DOT text of the built graph.
    """
    traced = impl_helper.trace_preprocessing_function(
        preprocessing_fn, feature_spec, use_tf_compat_v1=True)
    graph, structured_inputs, structured_outputs = traced

    transform_fn_future, unused_cache = analysis_graph_builder.build(
        graph, structured_inputs, structured_outputs)

    rendered = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(rendered)
    self.assertMultiLineEqual(
        rendered,
        expected_dot_graph_str,
        msg='Result dot graph is:\n{}'.format(rendered))
def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str):
    """Builds the analysis graph from placeholders and checks its DOT text.

    Args:
      feature_spec: Feature spec used to create the batched placeholders.
      preprocessing_fn: The preprocessing function under test.
      expected_dot_graph_str: Expected DOT text of the built graph.
    """
    with tf.name_scope('inputs'):
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
    output_signature = preprocessing_fn(input_signature)

    default_graph = tf.get_default_graph()
    transform_fn_future = analysis_graph_builder.build(
        default_graph, input_signature, output_signature)

    rendered = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(rendered)
    self.assertMultiLineEqual(
        rendered,
        expected_dot_graph_str,
        msg='Result dot graph is:\n{}'.format(rendered))
def test_get_analysis_cache_entry_keys(self):
    """Checks that the cache entry keys returned are the mocked key.

    `make_cache_entry_key` is patched to always return one fixed key, so the
    result of `get_analysis_cache_entry_keys` must contain exactly that key.
    """

    def preprocessing_fn(inputs):
        return {'x': tft.scale_to_0_1(inputs['x'])}

    full_dataset_keys = ['a', 'b']
    mocked_cache_entry_key = 'A'
    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}

    key_patch = mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        side_effect=lambda _: mocked_cache_entry_key)
    with key_patch:
        cache_entry_keys = analysis_graph_builder.get_analysis_cache_entry_keys(
            preprocessing_fn, feature_spec, full_dataset_keys)

    analysis_graph = analysis_graph_builder._ANALYSIS_GRAPH
    self.WriteRenderedDotFile(nodes.get_dot_graph([analysis_graph]).to_string())
    self.assertCountEqual(cache_entry_keys, [mocked_cache_entry_key])
def test_optimize_traversal(self, feature_spec, preprocessing_fn,
                            write_cache_fn, expected_dot_graph_str):
    """Checks the analysis graph built for two spans with an optional cache.

    When `write_cache_fn` is given it is called with the input cache directory
    and both span keys before the graph is built. The DOT lines of the
    resulting graph are compared, order-insensitively, with the expected lines.

    Args:
      feature_spec: Feature spec used to create the batched placeholders.
      preprocessing_fn: The preprocessing function under test.
      write_cache_fn: Callable that populates the input cache, or None.
      expected_dot_graph_str: Expected DOT text.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'
    cache_location = self._make_cache_location()

    if write_cache_fn is not None:
        write_cache_fn(cache_location.input_cache_dir,
                       [span_0_key, span_1_key])

    with tf.name_scope('inputs'):
        input_signature = impl_helper.feature_spec_as_batched_placeholders(
            feature_spec)
    output_signature = preprocessing_fn(input_signature)

    transform_fn_future = analysis_graph_builder.build(
        tf.get_default_graph(),
        input_signature,
        output_signature,
        {span_0_key, span_1_key},
        cache_location)

    dot_string = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(dot_string)

    actual_lines = dot_string.split('\n')
    expected_lines = expected_dot_graph_str.split('\n')
    self.assertSameElements(
        actual_lines,
        expected_lines,
        msg='Result dot graph is:\n{}'.format(dot_string))
def test_get_analysis_dataset_keys(self, preprocessing_fn, full_dataset_keys,
                                   cached_dataset_keys, expected_dataset_keys,
                                   use_tf_compat_v1):
    """Checks which dataset keys still need analysis given an input cache.

    Args:
      preprocessing_fn: The preprocessing function under test.
      full_dataset_keys: All dataset keys; each is wrapped in a `DatasetKey`.
      cached_dataset_keys: Dataset keys that get an entry in the input cache.
      expected_dataset_keys: Dataset keys expected to be returned.
      use_tf_compat_v1: Whether to force the TF1-compatible code path; when
        False the test is skipped unless TF 2.x is available.
    """
    if not use_tf_compat_v1:
        test_case.skip_if_not_tf2('Tensorflow 2.x required')

    dataset_key_cls = analysis_graph_builder.analyzer_cache.DatasetKey
    full_dataset_keys = [dataset_key_cls(k) for k in full_dataset_keys]

    mocked_cache_entry_key = b'M'

    # Every dataset key with an entry in the cache dict is given the mocked
    # key, which is also what `make_cache_entry_key` is patched to return.
    input_cache = {}
    for key in cached_dataset_keys:
        input_cache[key] = {mocked_cache_entry_key: 'C'}

    feature_spec = {'x': tf.io.FixedLenFeature([], tf.float32)}

    # The non-compat path takes type specs rather than a raw feature spec.
    if use_tf_compat_v1:
        specs = feature_spec
    else:
        specs = impl_helper.get_type_specs_from_feature_specs(feature_spec)

    key_patch = mock.patch(
        'tensorflow_transform.beam.analysis_graph_builder.'
        'analyzer_cache.make_cache_entry_key',
        return_value=mocked_cache_entry_key)
    with key_patch:
        dataset_keys = analysis_graph_builder.get_analysis_dataset_keys(
            preprocessing_fn,
            specs,
            full_dataset_keys,
            input_cache,
            force_tf_compat_v1=use_tf_compat_v1)

    analysis_graph = analysis_graph_builder._ANALYSIS_GRAPH
    self.WriteRenderedDotFile(nodes.get_dot_graph([analysis_graph]).to_string())
    self.assertCountEqual(expected_dataset_keys, dataset_keys)
def test_build(self, feature_spec, preprocessing_fn, expected_dot_graph_str,
               expected_dot_graph_str_tf2, use_tf_compat_v1):
    """Builds the analysis graph for a traced function and checks its DOT text.

    Args:
      feature_spec: Feature spec describing the inputs.
      preprocessing_fn: The preprocessing function under test.
      expected_dot_graph_str: Expected DOT text on the TF1-compatible path.
      expected_dot_graph_str_tf2: Expected DOT text on the other path.
      use_tf_compat_v1: Whether to trace with TF1-compatible behaviour; when
        False the test is skipped unless TF 2.x is available.
    """
    if not use_tf_compat_v1:
        test_case.skip_if_not_tf2('Tensorflow 2.x required')

    # The non-compat path takes type specs rather than a raw feature spec.
    if use_tf_compat_v1:
        specs = feature_spec
    else:
        specs = impl_helper.get_type_specs_from_feature_specs(feature_spec)

    # A temp directory unique to this test method.
    base_temp_dir = os.path.join(self.get_temp_dir(), self._testMethodName)
    traced = impl_helper.trace_preprocessing_function(
        preprocessing_fn,
        specs,
        use_tf_compat_v1=use_tf_compat_v1,
        base_temp_dir=base_temp_dir)
    graph, structured_inputs, structured_outputs = traced

    transform_fn_future, unused_cache = analysis_graph_builder.build(
        graph, structured_inputs, structured_outputs)

    rendered = nodes.get_dot_graph([transform_fn_future]).to_string()
    self.WriteRenderedDotFile(rendered)

    if use_tf_compat_v1:
        expected = expected_dot_graph_str
    else:
        expected = expected_dot_graph_str_tf2
    self.assertMultiLineEqual(
        rendered,
        expected,
        msg='Result dot graph is:\n{}'.format(rendered))
def test_single_phase_run_twice(self):
    """Analyzes two spans twice; the second run must be served from cache.

    The first pipeline starts with an empty cache, writes the cache it
    produces and a transform function. The second pipeline reads that cache
    back, must produce the same transformed data and vocabulary, and must not
    emit any new cache entries.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'
    # Non-ASCII token that occurs in both spans and in the expected vocabulary.
    non_ascii_token = u'Θα₯πΗ©ΔΎαΈΏκΘ―π±πππ΄'.encode('utf-8')

    def preprocessing_fn(inputs):
        _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')
        _ = tft.bucketize(inputs['x'], 2, name='bucketize')
        return {
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            's_integerized':
                tft.compute_and_apply_vocabulary(
                    inputs['s'],
                    labels=inputs['label'],
                    use_adjusted_mutual_info=True),
        }

    feature_spec = {
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.float32),
        's': tf.io.FixedLenFeature([], tf.string),
        'label': tf.io.FixedLenFeature([], tf.int64),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))

    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'a', 'label': 0},
            {'x': 4, 'y': -4, 's': 'a', 'label': 1},
            {'x': 5, 'y': 11, 's': 'a', 'label': 1},
            {'x': 1, 'y': -4, 's': non_ascii_token, 'label': 1},
        ],
        span_1_key: [
            {'x': 12, 'y': 1, 's': non_ascii_token, 'label': 0},
            {'x': 10, 'y': 1, 's': 'c', 'label': 1},
        ],
    }
    expected_vocabulary_contents = np.array(
        [b'a', non_ascii_token, b'c'], dtype=object)

    # Both output rows share the analyzer results and differ only in the
    # integerized vocabulary index.
    shared_outputs = {
        'x_mean': 5.0,
        'x_min': -2.0,
        'y_mean': 1.0,
        'y_min': -4.0,
    }
    expected_transformed_data = [
        dict(shared_outputs, s_integerized=0),
        dict(shared_outputs, s_integerized=2),
    ]

    # First run: nothing is cached yet.
    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain.from_iterable(input_data_dict.values())))

        # Wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for span_key, rows in six.iteritems(input_data_dict):
            input_data_pcoll_dict[span_key] = p | span_key >> beam.Create(rows)

        transform_fn_1, cache_output = (
            (flat_data, input_data_pcoll_dict, {}, input_metadata)
            | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = (
            cache_output
            | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                self._cache_dir))

        transformed_dataset = (
            ((input_data_pcoll_dict[span_1_key], input_metadata),
             transform_fn_1)
            | 'Transform' >> beam_impl.TransformDataset())
        del input_data_pcoll_dict
        transformed_data, unused_transformed_metadata = transformed_dataset

        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='first')

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
        _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

        for key in input_data_dict:
            self.assertIn(key, cache_output)
            self.assertEqual(7, len(cache_output[key]))

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    # 8 instances in total: presumably the 6 rows analyzed across both spans
    # plus the 2 rows of span-1 that are transformed.
    first_run_counters = [
        ('num_instances', 8),
        ('cache_entries_decoded', 0),
        ('cache_entries_encoded', 14),
        ('saved_models_created', 2),
    ]
    for counter_name, expected_value in first_run_counters:
        self.assertEqual(
            _get_counter_value(p.metrics, counter_name), expected_value)

    # Second run: everything should come from the cache written above.
    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain.from_iterable(input_data_dict.values())))

        # Wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for span_key, rows in six.iteritems(input_data_dict):
            input_data_pcoll_dict[span_key] = p | span_key >> beam.Create(rows)

        input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
            self._cache_dir, list(input_data_dict.keys()))

        transform_fn_2, second_output_cache = (
            (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
            | 'AnalyzeAgain' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        transformed_dataset = (
            ((input_data_dict[span_1_key], input_metadata), transform_fn_2)
            | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset

        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
        _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

    tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
    vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
    self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

    self.assertFalse(second_output_cache)

    # Only 2 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
    self.assertEqual(
        _get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
    self.assertEqual(
        _get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

    # The root CreateSavedModel is optimized away because the data doesn't get
    # processed at all (only cache).
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)
def test_single_phase_run_twice(self):
    """Analyzes two spans twice; the second run must be served from cache.

    The first pipeline starts with an empty cache and writes the cache it
    produces. The second pipeline reads that cache back, must produce the same
    transformed data, and must not emit any new cache entries.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        _ = tft.vocabulary(inputs['s'])
        _ = tft.bucketize(inputs['x'], 2, name='bucketize')
        return {
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        }

    input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
    feature_spec = {
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.float32),
        's': tf.io.FixedLenFeature([], tf.string),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'a'},
            {'x': 4, 'y': -4, 's': 'a'},
        ],
        span_1_key: input_data,
    }

    # Both output rows carry the same analyzer results.
    expected_row = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
    }
    expected_transformed_data = [dict(expected_row), dict(expected_row)]

    # First run: nothing is cached yet.
    with beam_impl.Context(temp_dir=self.get_temp_dir()), beam.Pipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain.from_iterable(input_data_dict.values())))

        # Wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for span_key, rows in six.iteritems(input_data_dict):
            input_data_pcoll_dict[span_key] = p | span_key >> beam.Create(rows)

        transform_fn_1, cache_output = (
            (flat_data, input_data_pcoll_dict, {}, input_metadata)
            | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = (
            cache_output
            | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
                self._cache_dir))

        transformed_dataset = (
            ((input_data_pcoll_dict[span_1_key], input_metadata),
             transform_fn_1)
            | 'Transform' >> beam_impl.TransformDataset())
        del input_data_pcoll_dict
        transformed_data, unused_transformed_metadata = transformed_dataset

        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='first')

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
        _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

        for key in input_data_dict:
            self.assertIn(key, cache_output)
            self.assertEqual(6, len(cache_output[key]))

    # Second run: everything should come from the cache written above.
    with beam_impl.Context(temp_dir=self.get_temp_dir()), beam.Pipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain.from_iterable(input_data_dict.values())))

        # Wrap each value in input_data_dict as a pcoll.
        input_data_pcoll_dict = {}
        for span_key, rows in six.iteritems(input_data_dict):
            input_data_pcoll_dict[span_key] = p | span_key >> beam.Create(rows)

        input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
            self._cache_dir, list(input_data_dict.keys()))

        transform_fn_2, second_output_cache = (
            (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
            | 'AnalyzeAgain' >>
            (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        transformed_dataset = (
            ((input_data_dict[span_1_key], input_metadata), transform_fn_2)
            | 'TransformAgain' >> beam_impl.TransformDataset())
        transformed_data, unused_transformed_metadata = transformed_dataset

        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='second')

    self.assertFalse(second_output_cache)
def test_single_phase_mixed_analyzer_run_once(self):
    """Analyzes two spans where only some analyzers of one span are cached.

    'span-0' is given pre-computed accumulators for the min and mean analyzers
    of `x` and `y`; 'span-1' has no cache. The transformed 'span-1' data is
    compared with the expected output.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
        _ = tft.bucketize(inputs['x'], 2, name='bucketize')
        return {
            'integerized_s':
                integerized_s,
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
    feature_spec = {
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.float32),
        's': tf.io.FixedLenFeature([], tf.string),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'b'},
            {'x': 4, 'y': -4, 's': 'b'},
        ],
        span_1_key: input_data,
    }

    # Both output rows are identical.
    expected_row = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
        'integerized_s': 0,
    }
    expected_transformed = [dict(expected_row), dict(expected_row)]

    with beam_impl.Context(temp_dir=self.get_temp_dir()), beam.Pipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain.from_iterable(input_data_dict.values())))

        # TODO(b/37788560): Get these names programmatically.
        # (cache key, beam label, encoded accumulator) for each cached analyzer.
        span_0_cache_entries = [
            ('__v0__CacheableCombineAccumulate--x_1-mean_and_var--',
             'CreateA', b'[2.0, 1.0, 9.0, 0.0]'),
            ('__v0__CacheableCombineAccumulate--x-x--',
             'CreateB', b'[2.0, 4.0]'),
            ('__v0__CacheableCombineAccumulate--y_1-mean_and_var--',
             'CreateC', b'[2.0, -1.5, 6.25, 0.0]'),
            ('__v0__CacheableCombineAccumulate--y-y--',
             'CreateD', b'[4.0, 1.0]'),
        ]
        span_0_cache = {}
        for cache_key, label, payload in span_0_cache_entries:
            span_0_cache[cache_key] = p | label >> beam.Create([payload])
        cache_dict = {span_0_key: span_0_cache, span_1_key: {}}

        transform_fn, cache_output = (
            (flat_data, input_data_dict, cache_dict, input_metadata)
            | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        transformed_dataset = (
            ((input_data_dict[span_1_key], input_metadata), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data, beam_test_util.equal_to(expected_transformed))

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
def test_caching_vocab_for_integer_categorical(self):
    """Checks vocabulary caching for an int64 categorical feature.

    'span-0' is given a pre-computed vocabulary accumulator while 'span-1' has
    no cache. The analysis must not re-emit cache for 'span-0', and the
    transformed 'span-1' data must match the expected vocabulary indices
    (values below the frequency threshold map to -1).
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        return {
            'x_vocab':
                tft.compute_and_apply_vocabulary(
                    inputs['x'], frequency_threshold=2)
        }

    # `tf.io.FixedLenFeature` is used instead of the TF1-only top-level alias
    # `tf.FixedLenFeature`, which does not exist in TF 2.x.
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec({
            'x': tf.io.FixedLenFeature([], tf.int64),
        }))
    input_data_dict = {
        span_0_key: [{'x': -2}, {'x': -4}, {'x': -1}, {'x': 4}],
        span_1_key: [{'x': -2}, {'x': -1}, {'x': 6}, {'x': 7}],
    }
    expected_transformed_data = [
        {'x_vocab': 0},
        {'x_vocab': 1},
        {'x_vocab': -1},
        {'x_vocab': -1},
    ]

    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain(*input_data_dict.values())))

        # Pre-computed vocabulary accumulator for span-0 only.
        cache_dict = {
            span_0_key: {
                b'__v0__VocabularyAccumulate[compute_and_apply_vocabulary/vocabulary]-\x05e\xfe4\x03H.P\xb5\xcb\xd22\xe3\x16\x15\xf8\xf5\xe38\xd9':
                    p | 'CreateB' >> beam.Create(
                        [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
            },
            span_1_key: {},
        }

        transform_fn, cache_output = (
            (flat_data, input_data_dict, cache_dict, input_metadata)
            | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        # span-0 was fully served from the input cache, so no new cache is
        # produced for it.
        self.assertNotIn(span_0_key, cache_output)

        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        transformed_dataset = (
            ((input_data_dict[span_1_key], input_metadata), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())
        transformed_data, _ = transformed_dataset

        beam_test_util.assert_that(
            transformed_data,
            beam_test_util.equal_to(expected_transformed_data),
            label='first')

    # 4 from analysis since 1 span was completely cached, and 4 from transform.
    self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 1)
    self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 1)
    self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)
def test_single_phase_mixed_analyzer_run_once(self):
    """Analyzes two spans where only some analyzers of one span are cached.

    'span-0' is given pre-computed accumulators for the min and mean analyzers
    of `x` and `y`; 'span-1' has no cache. The output cache must omit the four
    entries already present for 'span-0', and the transformed 'span-1' data
    and pipeline counters must match the expected values.
    """
    span_0_key = 'span-0'
    span_1_key = 'span-1'

    def preprocessing_fn(inputs):
        integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
        _ = tft.bucketize(inputs['x'], 2, name='bucketize')
        return {
            'integerized_s':
                integerized_s,
            'x_min':
                tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'x_mean':
                tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
            'y_min':
                tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
            'y_mean':
                tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        }

    # Run AnalyzeAndTransform on some input data and compare with expected
    # output.
    input_data = [{'x': 12, 'y': 1, 's': 'd'}, {'x': 10, 'y': 1, 's': 'c'}]
    feature_spec = {
        'x': tf.io.FixedLenFeature([], tf.float32),
        'y': tf.io.FixedLenFeature([], tf.float32),
        's': tf.io.FixedLenFeature([], tf.string),
    }
    input_metadata = dataset_metadata.DatasetMetadata(
        dataset_schema.from_feature_spec(feature_spec))
    input_data_dict = {
        span_0_key: [
            {'x': -2, 'y': 1, 's': 'b'},
            {'x': 4, 'y': -4, 's': 'b'},
        ],
        span_1_key: input_data,
    }

    # Both output rows share the analyzer results and differ only in the
    # integerized vocabulary index.
    shared_outputs = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
    }
    expected_transformed = [
        dict(shared_outputs, integerized_s=1),
        dict(shared_outputs, integerized_s=2),
    ]

    with _TestPipeline() as p:
        flat_data = p | 'CreateInputData' >> beam.Create(
            list(itertools.chain.from_iterable(input_data_dict.values())))

        # (cache key, beam label, encoded accumulator) for each cached analyzer.
        span_0_cache_entries = [
            (b'__v0__CacheableCombineAccumulate[x_1/mean_and_var]-.\xc4t>ZBv\xea\xa5SU\xf4\x065\xc6\x1c\x81W\xf9\x1b',
             'CreateA', b'[2.0, 1.0, 9.0, 0.0]'),
            (b'__v0__CacheableCombineAccumulate[x/x]-\x95\xc5w\x88\x85\x8b5V\xc9\x00\xe0\x0f\x03\x1a\xdaL\x9d\xd5\xb3\xe3',
             'CreateB', b'[2.0, 4.0]'),
            (b'__v0__CacheableCombineAccumulate[y_1/mean_and_var]-E^\xb7VZ\xeew4rm\xab\xa3\xa4k|J\x80ck\x16',
             'CreateC', b'[2.0, -1.5, 6.25, 0.0]'),
            (b'__v0__CacheableCombineAccumulate[y/y]-\xdf\x1ey\x03\x1c\x96\xd5'
             b' e\x9bJ\xa1\xd2\xfc\x9c\x03\x0fM \xdb',
             'CreateD', b'[4.0, 1.0]'),
        ]
        span_0_cache = {}
        for cache_key, label, payload in span_0_cache_entries:
            span_0_cache[cache_key] = p | label >> beam.Create([payload])
        cache_dict = {span_0_key: span_0_cache, span_1_key: {}}

        transform_fn, cache_output = (
            (flat_data, input_data_dict, cache_dict, input_metadata)
            | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
        _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir)

        transformed_dataset = (
            ((input_data_dict[span_1_key], input_metadata), transform_fn)
            | 'Transform' >> beam_impl.TransformDataset())

        dot_string = nodes.get_dot_graph(
            [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
        self.WriteRenderedDotFile(dot_string)

        # The output cache should not have entries for the cache that is
        # present in the input cache.
        span_0_entry_count = len(cache_output[span_0_key])
        span_1_entry_count = len(cache_output[span_1_key])
        self.assertEqual(span_0_entry_count, span_1_entry_count - 4)

        transformed_data, unused_transformed_metadata = transformed_dataset
        beam_test_util.assert_that(
            transformed_data, beam_test_util.equal_to(expected_transformed))

        transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
        _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

    # 4 from analyzing 2 spans, and 2 from transform.
    expected_counters = [
        ('num_instances', 6),
        ('cache_entries_decoded', 4),
        ('cache_entries_encoded', 8),
        ('saved_models_created', 2),
    ]
    for counter_name, expected_value in expected_counters:
        self.assertEqual(
            _get_counter_value(p.metrics, counter_name), expected_value)