def _OptimizeRun(
    pipeline: beam.Pipeline,
    input_cache_dir: Optional[Text],
    output_cache_dir: Optional[Text],
    analyze_data_list: List[_Dataset],
    feature_spec: Mapping[Text, Any],
    preprocessing_fn: Any,
    cache_source: beam.PTransform
) -> Tuple[Dict[Text, Optional[_Dataset]],
           Optional[Dict[Text, Dict[Text, beam.pvalue.PCollection]]],
           bool]:
  """Utilizes TFT cache if applicable and removes unused datasets."""

  analysis_key_to_dataset = {
      analyzer_cache.make_dataset_key(dataset.file_pattern_suffix): dataset
      for dataset in analyze_data_list
  }

  if input_cache_dir is not None:
    input_cache = pipeline | analyzer_cache.ReadAnalysisCacheFromFS(
        input_cache_dir,
        list(analysis_key_to_dataset.keys()),
        source=cache_source)
  elif output_cache_dir is not None:
    input_cache = {}
  else:
    # Using None here to indicate that this pipeline will not read or write
    # cache.
    input_cache = None

  if input_cache is None:
    # Cache is disabled so we won't be filtering out any datasets, and will
    # always perform a flatten over all of them.
    filtered_analysis_dataset_keys = list(analysis_key_to_dataset.keys())
    flat_data_required = True
  else:
    filtered_analysis_dataset_keys, flat_data_required = (
        tft_beam.analysis_graph_builder.get_analysis_dataset_keys(
            preprocessing_fn, feature_spec,
            list(analysis_key_to_dataset.keys()), input_cache))

  if len(filtered_analysis_dataset_keys) < len(analysis_key_to_dataset):
    tf.logging.info('Not reading the following datasets due to cache: %s', [
        v.file_pattern_suffix
        for k, v in analysis_key_to_dataset.items()
        if k not in filtered_analysis_dataset_keys
    ])

  new_analyze_data_dict = {}
  for key, dataset in six.iteritems(analysis_key_to_dataset):
    if key in filtered_analysis_dataset_keys:
      new_analyze_data_dict[key] = dataset
    else:
      new_analyze_data_dict[key] = None

  return (new_analyze_data_dict, input_cache, flat_data_required)
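
# A minimal usage sketch for _OptimizeRun, kept in comments because the real
# arguments (cache paths, the _Dataset list, preprocessing_fn, cache_source)
# come from the surrounding Transform executor and the values shown here are
# hypothetical:
#
#   new_analyze_data_dict, input_cache, flat_data_required = _OptimizeRun(
#       pipeline=pipeline,
#       input_cache_dir='/cache/in',    # or None to disable reading cache
#       output_cache_dir='/cache/out',  # or None to disable writing cache
#       analyze_data_list=analyze_data_list,
#       feature_spec=feature_spec,
#       preprocessing_fn=preprocessing_fn,
#       cache_source=cache_source)
#
# Datasets whose analyzer outputs are fully covered by cache map to None in
# new_analyze_data_dict; when both cache dirs are None, input_cache is None
# and flat_data_required is True, so every dataset is read and flattened.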


def test_cache_helpers_with_alternative_io(self):

  class LocalSink(beam.PTransform):

    def __init__(self, path):
      self._path = path

    def expand(self, pcoll):

      def write_to_file(value):
        tf.io.gfile.makedirs(self._path)
        with open(os.path.join(self._path, 'cache'), 'wb') as f:
          f.write(value)

      return pcoll | beam.Map(write_to_file)

  # Use a single DatasetKey instance for writing, reading and asserting, so
  # the test doesn't rely on DatasetKey comparing equal to a plain string.
  dataset_key = analyzer_cache.DatasetKey('a')
  test_cache_dict = {dataset_key: {'b': [bytes([17, 19, 27, 31])]}}

  class LocalSource(beam.PTransform):

    def __init__(self, path):
      del path

    def expand(self, pbegin):
      return pbegin | beam.Create([test_cache_dict[dataset_key]['b']])

  dataset_keys = list(test_cache_dict.keys())
  cache_dir = self.get_temp_dir()
  with beam.Pipeline() as p:
    _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
        p, cache_dir, dataset_keys, sink=LocalSink)

    read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        cache_dir, dataset_keys, source=LocalSource)

    self.assertItemsEqual(read_cache.keys(), [dataset_key])
    self.assertItemsEqual(read_cache[dataset_key].keys(), ['b'])
    beam_test_util.assert_that(
        read_cache[dataset_key]['b'],
        beam_test_util.equal_to([test_cache_dict[dataset_key]['b']]))


def test_cache_helpers_round_trip(self):
  base_test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  with beam.Pipeline() as p:
    cache_pcoll_dict = {
        'dataset_key_0': {
            'a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
            'b': p | 'CreateB' >> beam.Create([b'[5]']),
        },
        'dataset_key_1': {
            'c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
        },
    }
    _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
        base_test_dir)

  with beam.Pipeline() as p:
    read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        base_test_dir, list(cache_pcoll_dict.keys()))

    def assert_equal_matcher(expected_encoded):

      def _assert_equal(encoded_cache_list):
        (encode_cache,) = encoded_cache_list
        self.assertEqual(expected_encoded, encode_cache)

      return _assert_equal

    beam_test_util.assert_that(
        read_cache['dataset_key_0']['a'],
        beam_test_util.equal_to([b'[1, 2, 3]']),
        label='AssertA')
    beam_test_util.assert_that(
        read_cache['dataset_key_0']['b'],
        assert_equal_matcher(b'[5]'),
        label='AssertB')
    beam_test_util.assert_that(
        read_cache['dataset_key_1']['c'],
        assert_equal_matcher(b'[9, 5, 2, 1]'),
        label='AssertC')


def test_cache_helpers_round_trip(self):
  base_test_dir = os.path.join(
      os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
      self._testMethodName)

  dataset_key_0 = analyzer_cache.DatasetKey('dataset_key_0')
  dataset_key_1 = analyzer_cache.DatasetKey('dataset_key_1')
  dataset_keys = (dataset_key_0, dataset_key_1)

  with beam.Pipeline() as p:
    cache_pcoll_dict = {
        dataset_key_0: {
            b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
            b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
            b'\x8b1': p | 'CreateB1' >> beam.Create([b'[6]']),
        },
        dataset_key_1: {
            b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
        },
    }
    _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
        p, base_test_dir, dataset_keys)

  with beam.Pipeline() as p:
    # Only request the entries for b'\x8a', b'\x8b' and b'\x8c'; the b'\x8b1'
    # entry written above is deliberately left unread.
    read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        base_test_dir, list(cache_pcoll_dict.keys()),
        [b'\x8a', b'\x8b', b'\x8c'])

    beam_test_util.assert_that(
        read_cache[dataset_key_0][b'\x8a'],
        beam_test_util.equal_to([b'[1, 2, 3]']),
        label='AssertA')
    beam_test_util.assert_that(
        read_cache[dataset_key_0][b'\x8b'],
        beam_test_util.equal_to([b'[5]']),
        label='AssertB')
    beam_test_util.assert_that(
        read_cache[dataset_key_1][b'\x8c'],
        beam_test_util.equal_to([b'[9, 5, 2, 1]']),
        label='AssertC')


def test_single_phase_run_twice(self):

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):

    _ = tft.vocabulary(inputs['s'])

    _ = tft.bucketize(inputs['x'], 2, name='bucketize')

    return {
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
    }

  input_data = [{'x': 12, 'y': 1, 's': 'b'}, {'x': 10, 'y': 1, 's': 'c'}]
  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.float32),
          'y': tf.io.FixedLenFeature([], tf.float32),
          's': tf.io.FixedLenFeature([], tf.string),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
          'y': 1,
          's': 'a',
      }, {
          'x': 4,
          'y': -4,
          's': 'a',
      }],
      span_1_key: input_data,
  }

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # Wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      transform_fn_1, cache_output = (
          (flat_data, input_data_pcoll_dict, {}, input_metadata)
          | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
      _ = (
          cache_output
          | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
              self._cache_dir))

      transformed_dataset = (
          ((input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
          | 'Transform' >> beam_impl.TransformDataset())
      del input_data_pcoll_dict

      transformed_data, unused_transformed_metadata = transformed_dataset

      expected_transformed_data = [
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
          },
          {
              'x_mean': 6.0,
              'x_min': -2.0,
              'y_mean': -0.25,
              'y_min': -4.0,
          },
      ]
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='first')

      transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
      _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

      for key in input_data_dict:
        self.assertIn(key, cache_output)
        self.assertEqual(6, len(cache_output[key]))

  with beam_impl.Context(temp_dir=self.get_temp_dir()):
    with beam.Pipeline() as p:
      flat_data = p | 'CreateInputData' >> beam.Create(
          list(itertools.chain(*input_data_dict.values())))

      # Wrap each value in input_data_dict as a pcoll.
      input_data_pcoll_dict = {}
      for a, b in six.iteritems(input_data_dict):
        input_data_pcoll_dict[a] = p | a >> beam.Create(b)

      input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          self._cache_dir, list(input_data_dict.keys()))

      transform_fn_2, second_output_cache = (
          (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
          | 'AnalyzeAgain' >>
          (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

      dot_string = nodes.get_dot_graph(
          [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
      self.WriteRenderedDotFile(dot_string)

      transformed_dataset = (
          ((input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_2)
          | 'TransformAgain' >> beam_impl.TransformDataset())

      transformed_data, unused_transformed_metadata = transformed_dataset
      beam_test_util.assert_that(
          transformed_data,
          beam_test_util.equal_to(expected_transformed_data),
          label='second')

  self.assertFalse(second_output_cache)


def test_single_phase_run_twice(self):

  span_0_key = 'span-0'
  span_1_key = 'span-1'

  def preprocessing_fn(inputs):

    _ = tft.vocabulary(inputs['s'], vocab_filename='vocab1')

    _ = tft.bucketize(inputs['x'], 2, name='bucketize')

    return {
        'x_min': tft.min(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'x_mean': tft.mean(inputs['x'], name='x') + tf.zeros_like(inputs['x']),
        'y_min': tft.min(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        'y_mean': tft.mean(inputs['y'], name='y') + tf.zeros_like(inputs['y']),
        's_integerized': tft.compute_and_apply_vocabulary(
            inputs['s'],
            labels=inputs['label'],
            use_adjusted_mutual_info=True),
    }

  input_metadata = dataset_metadata.DatasetMetadata(
      dataset_schema.from_feature_spec({
          'x': tf.io.FixedLenFeature([], tf.float32),
          'y': tf.io.FixedLenFeature([], tf.float32),
          's': tf.io.FixedLenFeature([], tf.string),
          'label': tf.io.FixedLenFeature([], tf.int64),
      }))
  input_data_dict = {
      span_0_key: [{
          'x': -2,
          'y': 1,
          's': 'a',
          'label': 0,
      }, {
          'x': 4,
          'y': -4,
          's': 'a',
          'label': 1,
      }, {
          'x': 5,
          'y': 11,
          's': 'a',
          'label': 1,
      }, {
          'x': 1,
          'y': -4,
          's': u'Θα₯πΗ©ΔΎαΈΏκΘ―π±πππ΄'.encode('utf-8'),
          'label': 1,
      }],
      span_1_key: [{
          'x': 12,
          'y': 1,
          's': u'Θα₯πΗ©ΔΎαΈΏκΘ―π±πππ΄'.encode('utf-8'),
          'label': 0,
      }, {
          'x': 10,
          'y': 1,
          's': 'c',
          'label': 1,
      }],
  }

  expected_vocabulary_contents = np.array(
      [b'a', u'Θα₯πΗ©ΔΎαΈΏκΘ―π±πππ΄'.encode('utf-8'), b'c'], dtype=object)

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each value in input_data_dict as a pcoll.
    input_data_pcoll_dict = {}
    for a, b in six.iteritems(input_data_dict):
      input_data_pcoll_dict[a] = p | a >> beam.Create(b)

    transform_fn_1, cache_output = (
        (flat_data, input_data_pcoll_dict, {}, input_metadata)
        | 'Analyze' >> (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))
    _ = (
        cache_output
        | 'WriteCache' >> analyzer_cache.WriteAnalysisCacheToFS(
            self._cache_dir))

    transformed_dataset = (
        ((input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_1)
        | 'Transform' >> beam_impl.TransformDataset())
    del input_data_pcoll_dict

    transformed_data, unused_transformed_metadata = transformed_dataset

    expected_transformed_data = [
        {
            'x_mean': 5.0,
            'x_min': -2.0,
            'y_mean': 1.0,
            'y_min': -4.0,
            's_integerized': 0,
        },
        {
            'x_mean': 5.0,
            'x_min': -2.0,
            'y_mean': 1.0,
            'y_min': -4.0,
            's_integerized': 2,
        },
    ]
    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='first')

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_1')
    _ = transform_fn_1 | tft_beam.WriteTransformFn(transform_fn_dir)

    for key in input_data_dict:
      self.assertIn(key, cache_output)
      self.assertEqual(7, len(cache_output[key]))

  tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
  vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
  self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

  # 6 from analyzing 2 spans, and 2 from transform.
  self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 8)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 0)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 14)
  self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 2)

  with _TestPipeline() as p:
    flat_data = p | 'CreateInputData' >> beam.Create(
        list(itertools.chain(*input_data_dict.values())))

    # Wrap each value in input_data_dict as a pcoll.
    input_data_pcoll_dict = {}
    for a, b in six.iteritems(input_data_dict):
      input_data_pcoll_dict[a] = p | a >> beam.Create(b)

    input_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
        self._cache_dir, list(input_data_dict.keys()))

    transform_fn_2, second_output_cache = (
        (flat_data, input_data_pcoll_dict, input_cache, input_metadata)
        | 'AnalyzeAgain' >>
        (beam_impl.AnalyzeDatasetWithCache(preprocessing_fn)))

    dot_string = nodes.get_dot_graph(
        [analysis_graph_builder._ANALYSIS_GRAPH]).to_string()
    self.WriteRenderedDotFile(dot_string)

    transformed_dataset = (
        ((input_data_pcoll_dict[span_1_key], input_metadata), transform_fn_2)
        | 'TransformAgain' >> beam_impl.TransformDataset())

    transformed_data, unused_transformed_metadata = transformed_dataset
    beam_test_util.assert_that(
        transformed_data,
        beam_test_util.equal_to(expected_transformed_data),
        label='second')

    transform_fn_dir = os.path.join(self.base_test_dir, 'transform_fn_2')
    _ = transform_fn_2 | tft_beam.WriteTransformFn(transform_fn_dir)

  tf_transform_output = tft.TFTransformOutput(transform_fn_dir)
  vocab1_path = tf_transform_output.vocabulary_file_by_name('vocab1')
  self.AssertVocabularyContents(vocab1_path, expected_vocabulary_contents)

  self.assertFalse(second_output_cache)

  # Only 2 from transform.
  self.assertEqual(_get_counter_value(p.metrics, 'num_instances'), 2)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_decoded'), 14)
  self.assertEqual(_get_counter_value(p.metrics, 'cache_entries_encoded'), 0)

  # The root CreateSavedModel is optimized away because the data doesn't get
  # processed at all (only cache).
  self.assertEqual(_get_counter_value(p.metrics, 'saved_models_created'), 1)