def test_non_frequency_vocabulary_merge(self):
  """Checks that cached and uncached analysis write identical vocabularies.

  Three vocabularies (mutual information, adjusted mutual information and
  weighted frequency) are computed over two identical spans with
  `AnalyzeDatasetWithCache`, and over the same rows repeated twice with
  `AnalyzeDataset`. The per-span accumulators are checked against expected
  values and the vocabulary files of both runs are compared line by line.
  """
  mi_vocab_name = 'mutual_information_vocab'
  adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
  weighted_frequency_vocab_name = 'weighted_frequency_vocab'

  def preprocessing_fn(inputs):
    tokens = inputs['s']
    # The three vocabularies are requested in a fixed order; the expected
    # accumulator keys below end in vocabulary, vocabulary_1, vocabulary_2.
    tft.vocabulary(
        tokens,
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=mi_vocab_name,
        min_diff_from_avg=0.1,
        use_adjusted_mutual_info=False)
    tft.vocabulary(
        tokens,
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=adjusted_mi_vocab_name,
        min_diff_from_avg=1.0,
        use_adjusted_mutual_info=True)
    tft.vocabulary(
        tokens,
        weights=inputs['weight'],
        store_frequency=True,
        vocab_filename=weighted_frequency_vocab_name,
        use_adjusted_mutual_info=False)
    return inputs

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  input_data = [
      {'s': 'a', 'weight': 1, 'label': 1},
      {'s': 'a', 'weight': 0.5, 'label': 1},
      {'s': 'b', 'weight': 0.75, 'label': 1},
      {'s': 'b', 'weight': 1, 'label': 0},
  ]
  feature_spec = {
      's': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64),
      'weight': tf.io.FixedLenFeature([], tf.float32),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  # Both spans hold the very same rows.
  input_data_dict = {span_0_key: input_data, span_1_key: input_data}

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

    transform_fn_with_cache, output_cache = (
        (flat_data, input_data_dict, {}, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))

    # The two label-based vocabularies expect the same accumulator entries.
    labelled_accumulators = [
        b'["a", [2, 1.0, 0.0, 1.0]]',
        b'["b", [2, 0.5, 0.0, 1.0]]',
    ]
    expected_accumulators = {
        '__v0__VocabularyAccumulate--vocabulary--': labelled_accumulators,
        '__v0__VocabularyAccumulate--vocabulary_1--': labelled_accumulators,
        '__v0__VocabularyAccumulate--vocabulary_2--':
            [b'["a", 1.5]', b'["b", 1.75]'],
    }

    spans = [span_0_key, span_1_key]
    self.assertCountEqual(output_cache.keys(), spans)
    for span in spans:
      span_cache = output_cache[span]
      self.assertCountEqual(span_cache.keys(), expected_accumulators.keys())
      for key, value in six.iteritems(expected_accumulators):
        self.assertCountEqual(span_cache[key], value)

    # Same rows as the two spans combined, analyzed without any cache.
    transform_fn_no_cache = (
        (input_data * 2, input_metadata)
        | beam_impl.AnalyzeDataset(preprocessing_fn))

    transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_with_cache')
    _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
        transform_fn_with_cache_dir)

    transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                             'transform_fn_no_cache')
    _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
        transform_fn_no_cache_dir)

  tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
  tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

  vocab_filenames = (mi_vocab_name, adjusted_mi_vocab_name,
                     weighted_frequency_vocab_name)
  for vocab_filename in vocab_filenames:
    cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
    no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
        vocab_filename)
    with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
        no_cache_path, 'rb') as f2:
      self.assertEqual(
          f1.readlines(), f2.readlines(),
          'vocab with cache != vocab without cache for: {}'.format(
              vocab_filename))
def test_single_phase_run_twice(self):
  """Analyzes the same spans twice, the second time from a written cache.

  The first pipeline starts with an empty cache, writes the produced cache
  to `self._cache_dir` and transforms span-1. The second pipeline reads that
  cache back, must yield the same transformed rows, and must not produce
  any new cache output.
  """
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    tft.vocabulary(inputs['s'])
    tft.bucketize(x, 2, name='bucketize')
    # Analyzers are created in the same order as the returned keys.
    x_min = tft.min(x, name='x') + tf.zeros_like(x)
    x_mean = tft.mean(x, name='x') + tf.zeros_like(x)
    y_min = tft.min(y, name='y') + tf.zeros_like(y)
    y_mean = tft.mean(y, name='y') + tf.zeros_like(y)
    return {
        'x_min': x_min,
        'x_mean': x_mean,
        'y_min': y_min,
        'y_mean': y_mean,
    }

  input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
  feature_spec = {
      'x': tf.io.FixedLenFeature([], tf.float32),
      'y': tf.io.FixedLenFeature([], tf.float32),
      's': tf.io.FixedLenFeature([], tf.string),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  input_data_dict = {
      span_0_key: [
          {'x': -2, 'y': 1, 's': 'a'},
          {'x': 4, 'y': -4, 's': 'a'},
      ],
      span_1_key: input_data,
  }

  # Both span-1 rows are expected to map to the same full-dataset statistics.
  expected_row = {
      'x_mean': 6.0,
      'x_min': -2.0,
      'y_mean': -0.25,
      'y_min': -4.0,
  }
  expected_transformed_data = [dict(expected_row), dict(expected_row)]

  with beam_impl.Context(temp_dir=self.get_temp_dir()), beam.Pipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each span's rows in a PCollection labelled with the span key.
    input_data_pcoll_dict = {
        span: p | span >> beam.Create(rows)
        for span, rows in six.iteritems(input_data_dict)
    }

    transform_fn_1, cache_output = (
        (flat_data, input_data_pcoll_dict, {}, input_metadata)
        | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
    _ = (
        cache_output
        | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir))

    transformed_dataset = (
        ((input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
        | 'Transform' >> beam_impl.TransformDataset())

    del input_data_pcoll_dict
    transformed_data, unused_transformed_metadata = transformed_dataset

    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='first')

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
    _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

    # Every span must have produced six cache entries.
    for key in input_data_dict:
      self.assertIn(key, cache_output)
      self.assertEqual(6, len(cache_output[key]))

  with beam_impl.Context(temp_dir=self.get_temp_dir()), beam.Pipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each span's rows in a PCollection labelled with the span key.
    input_data_pcoll_dict = {
        span: p | span >> beam.Create(rows)
        for span, rows in six.iteritems(input_data_dict)
    }

    input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        self._cache_dir, list(input_data_dict.keys()))

    transform_fn_2, second_output_cache = (
        (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
        | 'AnalyzeAgain' >> beam_impl.AnalyzeDatasetWithCache(
            preprocessing_fn))

    analysis_graph = [analysis_graph_builder._ANALYSIS_GRAPH]
    dot_string = nodes.get_dot_graph(analysis_graph).to_string()
    self.WriteRenderedDotFile(dot_string)

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn_2)
        | 'TransformAgain' >> beam_impl.TransformDataset())
    transformed_data, unused_transformed_metadata = transformed_dataset

    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='second')

    # The second analysis must not emit any new cache.
    self.assertFalse(second_output_cache)
def test_caching_vocab_for_integer_categorical(self):
  """Checks vocabulary caching for an integer (int64) categorical feature.

  Span-0 is supplied with a pre-computed vocabulary accumulator cache and
  span-1 with none. The analysis result is then used to transform span-1,
  and the resulting vocabulary indices are compared against expected values
  (`frequency_threshold=2`, so values seen fewer than twice map to -1).
  """
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    return {
        'x_vocab':
            tft.compute_and_apply_vocabulary(
                inputs['x'], frequency_threshold=2)
    }

  # `tf.io.FixedLenFeature` is used (instead of the deprecated top-level
  # `tf.FixedLenFeature` alias) to match the sibling tests.
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.int64),
      }))
  input_data_dict = {
      span_0_key: [{'x': -2}, {'x': -4}, {'x': -1}, {'x': 4}],
      span_1_key: [{'x': -2}, {'x': -1}, {'x': 6}, {'x': 7}],
  }
  expected_transformed_data = [
      {'x_vocab': 0},
      {'x_vocab': 1},
      {'x_vocab': -1},
      {'x_vocab': -1},
  ]

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # TODO(b/37788560): Get these names programmatically.
      cache_dict = {
          span_0_key: {
              '__v0__VocabularyAccumulate--compute_and_apply_vocabulary-vocabulary--':
                  p | 'CreateB' >> beam.Create(
                      [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
          },
          span_1_key: {},
      }

      transform_fn, cache_output = (
          (flat_data, input_data_dict, cache_dict, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
          self._cache_dir)

      transformed_dataset = (
          ((input_data_dict[span_1_key], input_metadata), transform_fn)
          | 'Transform' >> beam_impl.TransformDataset())

      transformed_data, _ = transformed_dataset

      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')
def test_non_frequency_vocabulary_merge(self):
  """Checks that cached and uncached analysis write identical vocabularies.

  One pipeline analyzes two identical spans with `AnalyzeDatasetWithCache`
  and verifies the emitted per-span accumulators; a second pipeline analyzes
  the same rows repeated twice with `AnalyzeDataset`. Pipeline counters are
  checked after each run and the vocabulary files of both runs are compared
  line by line.
  """
  mi_vocab_name = 'mutual_information_vocab'
  adjusted_mi_vocab_name = 'adjusted_mutual_information_vocab'
  weighted_frequency_vocab_name = 'weighted_frequency_vocab'

  def preprocessing_fn(inputs):
    tokens = inputs['s']
    # The three vocabularies are requested in a fixed order; the expected
    # accumulator keys below name vocabulary, vocabulary_1, vocabulary_2.
    tft.vocabulary(
        tokens,
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=mi_vocab_name,
        min_diff_from_avg=0.1,
        use_adjusted_mutual_info=False)
    tft.vocabulary(
        tokens,
        labels=inputs['label'],
        store_frequency=True,
        vocab_filename=adjusted_mi_vocab_name,
        min_diff_from_avg=1.0,
        use_adjusted_mutual_info=True)
    tft.vocabulary(
        tokens,
        weights=inputs['weight'],
        store_frequency=True,
        vocab_filename=weighted_frequency_vocab_name,
        use_adjusted_mutual_info=False)
    return inputs

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  input_data = [
      {'s': 'a', 'weight': 1, 'label': 1},
      {'s': 'a', 'weight': 0.5, 'label': 1},
      {'s': 'b', 'weight': 0.75, 'label': 1},
      {'s': 'b', 'weight': 1, 'label': 0},
  ]
  feature_spec = {
      's': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64),
      'weight': tf.io.FixedLenFeature([], tf.float32),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  # Both spans hold the very same rows.
  input_data_dict = {span_0_key: input_data, span_1_key: input_data}

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each span's rows in a PCollection labelled with the span key.
    input_data_pcoll_dict = {
        span: p | span >> beam.Create(rows)
        for span, rows in six.iteritems(input_data_dict)
    }

    transform_fn_with_cache, output_cache = (
        (flat_data, input_data_pcoll_dict, {}, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
    transform_fn_with_cache_dir = os.path.join(self.base_test_dir,
                                               'transform_fn_with_cache')
    _ = transform_fn_with_cache | tft_beam.WriteTransformFn(
        transform_fn_with_cache_dir)

    expected_accumulators = {
        b'__v0__VocabularyAccumulate[vocabulary]-\xd3\xe0p\x82\xb1\xa0z\xa3S\xd7N8@\x8f\xa2\xd7\xa1\x9e\xac;':
            [
                b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]',
            ],
        b'__v0__VocabularyAccumulate[vocabulary_1]-A\xc7_0\xee\xff\x88@E<\xde\xcb\x8d\xff5\xebyZZ\x8d':
            [
                b'["a", [2, [0.0, 1.0], [0.0, 0.0], 1.0]]',
                b'["b", [2, [0.5, 0.5], [0.0, 0.0], 1.0]]',
            ],
        b"__v0__VocabularyAccumulate[vocabulary_2]-\x97\x1c>\x851\x94'\xdc\xdf\xfd\xcc\x86\xb7\xb8\xe1\xe8*\x89B\t":
            [b'["a", 1.5]', b'["b", 1.75]'],
    }

    spans = [span_0_key, span_1_key]
    self.assertCountEqual(output_cache.keys(), spans)
    for span in spans:
      span_cache = output_cache[span]
      self.assertCountEqual(span_cache.keys(), expected_accumulators.keys())
      # The index keeps every assert_that label unique within the pipeline.
      for idx, (key, value) in enumerate(
          six.iteritems(expected_accumulators)):
        beam_test_util.assert_that(
            span_cache[key],
            beam_test_util.equal_to(value),
            label='AssertCache[{}][{}]'.format(span, idx))

  # 4 from analysis on each of the input spans.
  metrics = p.metrics
  self.assertEqual(_get_counter_value(metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_decoded'), 0)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_encoded'), 6)
  self.assertEqual(_get_counter_value(metrics, 'saved_models_created'), 2)

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(input_data * 2)

    transform_fn_no_cache = (
        (flat_data, input_metadata)
        | beam_impl.AnalyzeDataset(preprocessing_fn))

    transform_fn_no_cache_dir = os.path.join(self.base_test_dir,
                                             'transform_fn_no_cache')
    _ = transform_fn_no_cache | tft_beam.WriteTransformFn(
        transform_fn_no_cache_dir)

  # 4 from analysis on each of the input spans.
  metrics = p.metrics
  self.assertEqual(_get_counter_value(metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_decoded'), 0)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_encoded'), 0)
  self.assertEqual(_get_counter_value(metrics, 'saved_models_created'), 2)

  tft_output_cache = tft.TFTransformOutput(transform_fn_with_cache_dir)
  tft_output_no_cache = tft.TFTransformOutput(transform_fn_no_cache_dir)

  vocab_filenames = (mi_vocab_name, adjusted_mi_vocab_name,
                     weighted_frequency_vocab_name)
  for vocab_filename in vocab_filenames:
    cache_path = tft_output_cache.vocabulary_file_by_name(vocab_filename)
    no_cache_path = tft_output_no_cache.vocabulary_file_by_name(
        vocab_filename)
    with tf.io.gfile.GFile(cache_path, 'rb') as f1, tf.io.gfile.GFile(
        no_cache_path, 'rb') as f2:
      self.assertEqual(
          f1.readlines(), f2.readlines(),
          'vocab with cache != vocab without cache for: {}'.format(
              vocab_filename))
def test_single_phase_mixed_analyzer_run_once(self):
  """Runs one cached analysis in which only some analyzers have cache.

  Span-0 gets pre-computed accumulators for the min and mean analyzers of
  `x` and `y`; span-1 gets no cache. The resulting transform function is
  applied to span-1 and the output is compared with the expected rows.
  """
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
    tft.bucketize(x, 2, name='bucketize')
    # min is created before mean for each feature; the cache keys below
    # refer to `x` / `y` and to `x_1` / `y_1` (mean_and_var).
    return {
        'integerized_s': integerized_s,
        'x_min': tft.min(x, name='x') + tf.zeros_like(x),
        'x_mean': tft.mean(x, name='x') + tf.zeros_like(x),
        'y_min': tft.min(y, name='y') + tf.zeros_like(y),
        'y_mean': tft.mean(y, name='y') + tf.zeros_like(y),
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
  feature_spec = {
      'x': tf.io.FixedLenFeature([], tf.float32),
      'y': tf.io.FixedLenFeature([], tf.float32),
      's': tf.io.FixedLenFeature([], tf.string),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  input_data_dict = {
      span_0_key: [
          {'x': -2, 'y': 1, 's': 'b'},
          {'x': 4, 'y': -4, 's': 'b'},
      ],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()), beam.Pipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # TODO(b/37788560): Get these names programmatically.
    cache_dict = {
        span_0_key: {
            '__v0__CacheableCombineAccumulate--x_1-mean_and_var--':
                p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
            '__v0__CacheableCombineAccumulate--x-x--':
                p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
            '__v0__CacheableCombineAccumulate--y_1-mean_and_var--':
                p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
            '__v0__CacheableCombineAccumulate--y-y--':
                p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
        },
        span_1_key: {},
    }

    transform_fn, cache_output = (
        (flat_data, input_data_dict, cache_dict, input_metadata)
        | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
    _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
        self._cache_dir)

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())

    analysis_graph = [analysis_graph_builder._ANALYSIS_GRAPH]
    dot_string = nodes.get_dot_graph(analysis_graph).to_string()
    self.WriteRenderedDotFile(dot_string)

    transformed_data, unused_transformed_metadata = transformed_dataset

    # Both span-1 rows are expected to transform to the same values.
    expected_row = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
        'integerized_s': 0,
    }
    expected_transformed = [dict(expected_row), dict(expected_row)]
    beam_test_util.assert_that(
        transformed_data, beam_test_util.equal_to(expected_transformed))

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
    _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)
def test_caching_vocab_for_integer_categorical(self):
  """Checks vocabulary caching for an integer (int64) categorical feature.

  Span-0 is supplied with a pre-computed vocabulary accumulator cache and
  span-1 with none. The test asserts that no cache is emitted for span-0,
  that transforming span-1 yields the expected vocabulary indices
  (`frequency_threshold=2`, so values seen fewer than twice map to -1), and
  that the pipeline counters have the expected values.
  """
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    return {
        'x_vocab':
            tft.compute_and_apply_vocabulary(
                inputs['x'], frequency_threshold=2)
    }

  # `tf.io.FixedLenFeature` is used (instead of the deprecated top-level
  # `tf.FixedLenFeature` alias) to match the sibling tests.
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.int64),
      }))
  input_data_dict = {
      span_0_key: [{'x': -2}, {'x': -4}, {'x': -1}, {'x': 4}],
      span_1_key: [{'x': -2}, {'x': -1}, {'x': 6}, {'x': 7}],
  }
  expected_transformed_data = [
      {'x_vocab': 0},
      {'x_vocab': 1},
      {'x_vocab': -1},
      {'x_vocab': -1},
  ]

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    cache_dict = {
        span_0_key: {
            b'__v0__VocabularyAccumulate[compute_and_apply_vocabulary/vocabulary]-\x05e\xfe4\x03H.P\xb5\xcb\xd22\xe3\x16\x15\xf8\xf5\xe38\xd9':
                p | 'CreateB' >> beam.Create(
                    [b'[-2, 2]', b'[-4, 1]', b'[-1, 1]', b'[4, 1]']),
        },
        span_1_key: {},
    }

    transform_fn, cache_output = (
        (flat_data, input_data_dict, cache_dict, input_metadata)
        | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

    dot_string = nodes.get_dot_graph(
        [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
    self.WriteRenderedDotFile(dot_string)

    # Span-0 was supplied with input cache, so no output cache is expected
    # for it.
    self.assertNotIn(span_0_key, cache_output)

    _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
        self._cache_dir)

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())

    transformed_data, _ = transformed_dataset

    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='first')

  # 4 from analysis since 1 span was completely cached, and 4 from transform.
  self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 1)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 1)
  self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)
def test_single_phase_run_twice(self):
  """Analyzes the same spans twice, the second time from a written cache.

  The first pipeline starts with an empty cache, writes the produced cache
  to `self._cache_dir` and transforms span-1. The second pipeline reads that
  cache back and must yield the same transformed rows and the same `vocab1`
  vocabulary file, without producing new cache output. Pipeline counters are
  checked after each run.
  """
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    tft.vocabulary(inputs['s'], vocab_filename='vocab1')
    tft.bucketize(x, 2, name='bucketize')
    return {
        'x_min': tft.min(x, name='x') + tf.zeros_like(x),
        'x_mean': tft.mean(x, name='x') + tf.zeros_like(x),
        'y_min': tft.min(y, name='y') + tf.zeros_like(y),
        'y_mean': tft.mean(y, name='y') + tf.zeros_like(y),
        's_integerized':
            tft.compute_and_apply_vocabulary(
                inputs['s'],
                labels=inputs['label'],
                use_adjusted_mutual_info=True),
    }

  feature_spec = {
      'x': tf.io.FixedLenFeature([], tf.float32),
      'y': tf.io.FixedLenFeature([], tf.float32),
      's': tf.io.FixedLenFeature([], tf.string),
      'label': tf.io.FixedLenFeature([], tf.int64),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))

  # A non-ASCII token, UTF-8 encoded; it appears once in each span.
  # NOTE(review): the literal is reproduced verbatim from the original test;
  # confirm it is the intended character sequence.
  non_ascii_token = u'Θα₯πΗ©ΔΎαΈΏκΘ―π±πππ΄'.encode('utf-8')

  input_data_dict = {
      span_0_key: [
          {'x': -2, 'y': 1, 's': 'a', 'label': 0},
          {'x': 4, 'y': -4, 's': 'a', 'label': 1},
          {'x': 5, 'y': 11, 's': 'a', 'label': 1},
          {'x': 1, 'y': -4, 's': non_ascii_token, 'label': 1},
      ],
      span_1_key: [
          {'x': 12, 'y': 1, 's': non_ascii_token, 'label': 0},
          {'x': 10, 'y': 1, 's': 'c', 'label': 1},
      ],
  }
  expected_vocabulary_contents = np.array(
      [b'a', non_ascii_token, b'c'], dtype=object)

  # Expected transform of the two span-1 rows; only `s_integerized` differs.
  expected_stats = {
      'x_mean': 5.0,
      'x_min': -2.0,
      'y_mean': 1.0,
      'y_min': -4.0,
  }
  expected_transformed_data = [
      dict(expected_stats, s_integerized=0),
      dict(expected_stats, s_integerized=2),
  ]

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each span's rows in a PCollection labelled with the span key.
    input_data_pcoll_dict = {
        span: p | span >> beam.Create(rows)
        for span, rows in six.iteritems(input_data_dict)
    }

    transform_fn_1, cache_output = (
        (flat_data, input_data_pcoll_dict, {}, input_metadata)
        | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
    _ = (
        cache_output
        | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir))

    transformed_dataset = (
        ((input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
        | 'Transform' >> beam_impl.TransformDataset())

    del input_data_pcoll_dict
    transformed_data, unused_transformed_metadata = transformed_dataset

    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='first')

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
    _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

    # Every span must have produced seven cache entries.
    for key in input_data_dict:
      self.assertIn(key, cache_output)
      self.assertEqual(7, len(cache_output[key]))

  tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
  vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
  self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

  # 6 from analyzing 2 spans (4 + 2 rows), and 2 from transform.
  metrics = p.metrics
  self.assertEqual(_get_counter_value(metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_decoded'), 0)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_encoded'), 14)
  self.assertEqual(_get_counter_value(metrics, 'saved_models_created'), 2)

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each span's rows in a PCollection labelled with the span key.
    input_data_pcoll_dict = {
        span: p | span >> beam.Create(rows)
        for span, rows in six.iteritems(input_data_dict)
    }

    input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        self._cache_dir, list(input_data_dict.keys()))

    transform_fn_2, second_output_cache = (
        (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
        | 'AnalyzeAgain' >> beam_impl.AnalyzeDatasetWithCache(
            preprocessing_fn))

    analysis_graph = [analysis_graph_builder._ANALYSIS_GRAPH]
    dot_string = nodes.get_dot_graph(analysis_graph).to_string()
    self.WriteRenderedDotFile(dot_string)

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn_2)
        | 'TransformAgain' >> beam_impl.TransformDataset())
    transformed_data, unused_transformed_metadata = transformed_dataset

    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='second')

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
    _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

  tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
  vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
  self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

  # The second analysis must not emit any new cache.
  self.assertFalse(second_output_cache)

  # Only 2 from transform.
  metrics = p.metrics
  self.assertEqual(_get_counter_value(metrics, 'num_instances'), 2)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_decoded'), 14)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_encoded'), 0)

  # The root CreateSavedModel is optimized away because the data doesn't get
  # processed at all (only cache).
  self.assertEqual(_get_counter_value(metrics, 'saved_models_created'), 1)
def test_single_phase_mixed_analyzer_run_once(self):
  """Runs one cached analysis in which only some analyzers have cache.

  Span-0 gets pre-computed accumulators for the min and mean analyzers of
  `x` and `y`; span-1 gets no cache. The test checks that the output cache
  for span-0 has four fewer entries than for span-1, that transforming
  span-1 yields the expected rows, and that the pipeline counters match.
  """
  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
    tft.bucketize(x, 2, name='bucketize')
    # min is created before mean for each feature; the cache keys below
    # refer to `x/x`, `y/y` and to `x_1/mean_and_var`, `y_1/mean_and_var`.
    return {
        'integerized_s': integerized_s,
        'x_min': tft.min(x, name='x') + tf.zeros_like(x),
        'x_mean': tft.mean(x, name='x') + tf.zeros_like(x),
        'y_min': tft.min(y, name='y') + tf.zeros_like(y),
        'y_mean': tft.mean(y, name='y') + tf.zeros_like(y),
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 12, 'y': 1, 's': 'd'}, {'x': 10, 'y': 1, 's': 'c'}]
  feature_spec = {
      'x': tf.io.FixedLenFeature([], tf.float32),
      'y': tf.io.FixedLenFeature([], tf.float32),
      's': tf.io.FixedLenFeature([], tf.string),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  input_data_dict = {
      span_0_key: [
          {'x': -2, 'y': 1, 's': 'b'},
          {'x': 4, 'y': -4, 's': 'b'},
      ],
      span_1_key: input_data,
  }

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    cache_dict = {
        span_0_key: {
            b'__v0__CacheableCombineAccumulate[x_1/mean_and_var]-.\xc4t>ZBv\xea\xa5SU\xf4\x065\xc6\x1c\x81W\xf9\x1b':
                p | 'CreateA' >> beam.Create([b'[2.0, 1.0, 9.0, 0.0]']),
            b'__v0__CacheableCombineAccumulate[x/x]-\x95\xc5w\x88\x85\x8b5V\xc9\x00\xe0\x0f\x03\x1a\xdaL\x9d\xd5\xb3\xe3':
                p | 'CreateB' >> beam.Create([b'[2.0, 4.0]']),
            b'__v0__CacheableCombineAccumulate[y_1/mean_and_var]-E^\xb7VZ\xeew4rm\xab\xa3\xa4k|J\x80ck\x16':
                p | 'CreateC' >> beam.Create([b'[2.0, -1.5, 6.25, 0.0]']),
            b'__v0__CacheableCombineAccumulate[y/y]-\xdf\x1ey\x03\x1c\x96\xd5'
            b' e\x9bJ\xa1\xd2\xfc\x9c\x03\x0fM \xdb':
                p | 'CreateD' >> beam.Create([b'[4.0, 1.0]']),
        },
        span_1_key: {},
    }

    transform_fn, cache_output = (
        (flat_data, input_data_dict, cache_dict, input_metadata)
        | 'Analyze' >> beam_impl.AnalyzeDatasetWithCache(preprocessing_fn))
    _ = cache_output | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
        self._cache_dir)

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | 'Transform' >> beam_impl.TransformDataset())

    analysis_graph = [analysis_graph_builder._ANALYSIS_GRAPH]
    dot_string = nodes.get_dot_graph(analysis_graph).to_string()
    self.WriteRenderedDotFile(dot_string)

    # The output cache should not have entries for the cache that is present
    # in the input cache.
    span_0_entries = len(cache_output[span_0_key])
    span_1_entries = len(cache_output[span_1_key])
    self.assertEqual(span_0_entries, span_1_entries - 4)

    transformed_data, unused_transformed_metadata = transformed_dataset

    # Expected transform of the two span-1 rows; only `integerized_s` differs.
    expected_stats = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
    }
    expected_transformed = [
        dict(expected_stats, integerized_s=1),
        dict(expected_stats, integerized_s=2),
    ]
    beam_test_util.assert_that(
        transformed_data, beam_test_util.equal_to(expected_transformed))

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
    _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

  # 4 from analyzing 2 spans, and 2 from transform.
  metrics = p.metrics
  self.assertEqual(_get_counter_value(metrics, 'num_instances'), 6)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_decoded'), 4)
  self.assertEqual(_get_counter_value(metrics, 'cache_entries_encoded'), 8)
  self.assertEqual(_get_counter_value(metrics, 'saved_models_created'), 2)
def test_single_phase_run_twice(self):
  """Analyzes the same spans twice, chaining the cache between the runs.

  The first run reads from 'input_cache_1' and writes 'output_cache_1'; each
  span directory under the output cache must then contain six entries. The
  second run uses 'output_cache_1' as input, must yield the same transformed
  rows, and must not create its output cache directory.
  """
  first_run_location = self._make_cache_location('input_cache_1',
                                                 'output_cache_1')

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    tft.vocabulary(inputs['s'])
    tft.bucketize(x, 2, name='bucketize')
    # Analyzers are created in the same order as the returned keys.
    x_min = tft.min(x, name='x') + tf.zeros_like(x)
    x_mean = tft.mean(x, name='x') + tf.zeros_like(x)
    y_min = tft.min(y, name='y') + tf.zeros_like(y)
    y_mean = tft.mean(y, name='y') + tf.zeros_like(y)
    return {
        'x_min': x_min,
        'x_mean': x_mean,
        'y_min': y_min,
        'y_mean': y_mean,
    }

  input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
  feature_spec = {
      'x': tf.FixedLenFeature([], tf.float32),
      'y': tf.FixedLenFeature([], tf.float32),
      's': tf.FixedLenFeature([], tf.string),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  input_data_dict = {
      span_0_key: [
          {'x': -2, 'y': 1, 's': 'a'},
          {'x': 4, 'y': -4, 's': 'a'},
      ],
      span_1_key: input_data,
  }

  # Both span-1 rows are expected to map to the same full-dataset statistics.
  expected_row = {
      'x_mean': 6.0,
      'x_min': -2.0,
      'y_mean': -0.25,
      'y_min': -4.0,
  }
  expected_transformed_data = [dict(expected_row), dict(expected_row)]

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

    transform_fn = (
        (flat_data, input_data_dict, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn,
                                            first_run_location))

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | beam_impl.TransformDataset())
    transformed_data, unused_transformed_metadata = transformed_dataset

    self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
    _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)

  # Each span must have its own cache directory holding six entries.
  for key in input_data_dict:
    key_cache_dir = os.path.join(first_run_location.output_cache_dir, key)
    self.assertTrue(tf.gfile.IsDirectory(key_cache_dir))
    self.assertEqual(len(tf.gfile.ListDirectory(key_cache_dir)), 6)

  # The first run's output cache becomes the second run's input cache.
  second_run_location = self._make_cache_location('output_cache_1',
                                                  'output_cache_2')

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

    transform_fn = (
        (flat_data, input_data_dict, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn,
                                            second_run_location))

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | beam_impl.TransformDataset())
    transformed_data, unused_transformed_metadata = transformed_dataset

    self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

  # The second run must not have created an output cache directory.
  self.assertFalse(
      tf.gfile.IsDirectory(second_run_location.output_cache_dir))
def test_single_phase_mixed_analyzer_run_once(self):
  """Runs one cached analysis in which only some analyzers have cache.

  Accumulators for the min and mean analyzers of `x` and `y` are written to
  the input cache directory for span-0 before the analysis runs. The
  resulting transform function is applied to span-1 and the output is
  compared with the expected rows.
  """
  cache_location = self._make_cache_location()

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  # (cache entry name, accumulator) pairs pre-written for span-0, in order.
  precomputed_span_0_cache = (
      ('__v0__CacheableCombineAccumulate--x_1-mean_and_var--',
       [2.0, 1.0, 9.0]),
      ('__v0__CacheableCombineAccumulate--x-x--', [2.0, 4.0]),
      ('__v0__CacheableCombineAccumulate--y_1-mean_and_var--',
       [2.0, -1.5, 6.25]),
      ('__v0__CacheableCombineAccumulate--y-y--', [4.0, 1.0]),
  )
  for entry_name, accumulator in precomputed_span_0_cache:
    _write_cache(entry_name, span_0_key, accumulator,
                 cache_location.input_cache_dir)

  def preprocessing_fn(inputs):
    x = inputs['x']
    y = inputs['y']
    integerized_s = tft.compute_and_apply_vocabulary(inputs['s'])
    tft.bucketize(x, 2, name='bucketize')
    # min is created before mean for each feature; the cache entries above
    # refer to `x` / `y` and to `x_1` / `y_1` (mean_and_var).
    return {
        'integerized_s': integerized_s,
        'x_min': tft.min(x, name='x') + tf.zeros_like(x),
        'x_mean': tft.mean(x, name='x') + tf.zeros_like(x),
        'y_min': tft.min(y, name='y') + tf.zeros_like(y),
        'y_mean': tft.mean(y, name='y') + tf.zeros_like(y),
    }

  # Run AnalyzeAndTransform on some input data and compare with expected
  # output.
  input_data = [{'x': 12, 'y': 1, 's': 'c'}, {'x': 10, 'y': 1, 's': 'c'}]
  feature_spec = {
      'x': tf.FixedLenFeature([], tf.float32),
      'y': tf.FixedLenFeature([], tf.float32),
      's': tf.FixedLenFeature([], tf.string),
  }
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec(feature_spec))
  input_data_dict = {
      span_0_key: [
          {'x': -2, 'y': 1, 's': 'b'},
          {'x': 4, 'y': -4, 's': 'b'},
      ],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    flat_data = input_data_dict.values() | 'Flatten' >> beam.Flatten()

    transform_fn = (
        (flat_data, input_data_dict, input_metadata)
        | beam_impl.AnalyzeDatasetWithCache(preprocessing_fn, cache_location))

    transformed_dataset = (
        ((input_data_dict[span_1_key], input_metadata), transform_fn)
        | beam_impl.TransformDataset())
    transformed_data, unused_transformed_metadata = transformed_dataset

    # Both span-1 rows are expected to transform to the same values.
    expected_row = {
        'x_mean': 6.0,
        'x_min': -2.0,
        'y_mean': -0.25,
        'y_min': -4.0,
        'integerized_s': 0,
    }
    expected_transformed_data = [dict(expected_row), dict(expected_row)]
    self.assertDataCloseOrEqual(transformed_data, expected_transformed_data)

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn')
    _ = transform_fn | tft_beam.WriteTransformFn(transform_fn_dir)