# Imports reconstructed from the names used below so the excerpt is
# self-contained; they are assumed to match the surrounding
# tensorflow_transform test module.
import os

import apache_beam as beam
from apache_beam.testing import util as beam_test_util
import numpy as np
import tensorflow as tf

from tensorflow_transform import analyzer_nodes
from tensorflow_transform import analyzers
from tensorflow_transform import test_case
from tensorflow_transform.beam import analyzer_cache


class AnalyzerCacheTest(test_case.TransformTestCase):

  def test_validate_dataset_keys(self):
    analyzer_cache.validate_dataset_keys(
        {'foo', 'Foo', 'A1', 'A_1', 'A.1', 'A-1'})

    for key in {'foo 1', 'foo@1', 'foo*', 'foo[]', 'foo/goo'}:
      with self.assertRaisesRegex(
          ValueError, 'Dataset key .* does not match allowed pattern:'):
        analyzer_cache.validate_dataset_keys({key})

  @test_case.named_parameters(
      dict(testcase_name='JsonNumpyCacheCoder',
           coder=analyzer_nodes.JsonNumpyCacheCoder(),
           value=[1, 2.5, 3, '4']),
      dict(testcase_name='JsonNumpyCacheCoderNpArray',
           coder=analyzer_nodes.JsonNumpyCacheCoder(),
           value=np.array([1, 2.5, 3, '4'])),
      dict(testcase_name='JsonNumpyCacheCoderNestedNpTypes',
           coder=analyzer_nodes.JsonNumpyCacheCoder(),
           value=[np.int64(1), np.float32(2.5), 3, '4']),
      dict(testcase_name='_VocabularyAccumulatorCoderIntAccumulator',
           coder=analyzer_nodes._VocabularyAccumulatorCoder(),
           value=[b'A', 17]),
      dict(testcase_name='_VocabularyAccumulatorCoderIntAccumulatorNonUtf8',
           coder=analyzer_nodes._VocabularyAccumulatorCoder(),
           value=[b'\x8a', 29]),
      dict(testcase_name='_VocabularyAccumulatorCoderClassAccumulator',
           coder=analyzer_nodes._VocabularyAccumulatorCoder(),
           value=[
               b'A',
               analyzers._WeightedMeanAndVarAccumulator(
                   count=np.array(5),
                   mean=np.array([.4, .9, 1.5]),
                   variance=np.array([.1, .4, .5]),
                   weight=np.array(0.),
               )
           ]),
      # _get_quantiles_summary is defined elsewhere in the original module.
      dict(
          testcase_name='_QuantilesAccumulatorCoderClassAccumulator',
          coder=analyzers._QuantilesAccumulatorCacheCoder(),
          value=[
              '\n\x0f\r\x00\x00 A\x15\x00\x00\x80?%\x00\x00\x80?\n\x14\r\x00\x00@A\x15\x00\x00\x80?\x1d\x00\x00\x80?%\x00\x00\x00@',
              '',
              _get_quantiles_summary()
          ]),
      dict(testcase_name='_CombinerPerKeyAccumulatorCoder',
           coder=analyzer_nodes._CombinerPerKeyAccumulatorCoder(
               analyzer_nodes.JsonNumpyCacheCoder()),
           value=[b'\x8a', [np.int64(1), np.float32(2.5), 3, '4']]),
  )
  def test_coders_round_trip(self, coder, value):
    encoded = coder.encode_cache(value)
    np.testing.assert_equal(coder.decode_cache(encoded), value)

  def test_cache_helpers_round_trip(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    dataset_key_0 = 'dataset_key_0'
    dataset_key_1 = 'dataset_key_1'
    dataset_keys = (dataset_key_0, dataset_key_1)

    # Write a small cache dict to disk, then read it back in a fresh pipeline
    # and check that the contents survive the round trip.
    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          dataset_key_0: {
              b'\x8a': p | 'CreateA' >> beam.Create([b'[1, 2, 3]']),
              b'\x8b': p | 'CreateB' >> beam.Create([b'[5]']),
          },
          dataset_key_1: {
              b'\x8c': p | 'CreateC' >> beam.Create([b'[9, 5, 2, 1]']),
          },
      }
      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, dataset_keys)

    with beam.Pipeline() as p:
      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          base_test_dir, list(cache_pcoll_dict.keys()))

      beam_test_util.assert_that(read_cache[dataset_key_0][b'\x8a'],
                                 beam_test_util.equal_to([b'[1, 2, 3]']),
                                 label='AssertA')
      beam_test_util.assert_that(read_cache[dataset_key_0][b'\x8b'],
                                 beam_test_util.equal_to([b'[5]']),
                                 label='AssertB')
      beam_test_util.assert_that(read_cache[dataset_key_1][b'\x8c'],
                                 beam_test_util.equal_to([b'[9, 5, 2, 1]']),
                                 label='AssertC')

  def test_cache_merge(self):
    base_test_dir = os.path.join(
        os.environ.get('TEST_UNDECLARED_OUTPUTS_DIR', self.get_temp_dir()),
        self._testMethodName)
    dataset_key_0 = 'dataset_key_0'
    dataset_key_1 = 'dataset_key_1'
    dataset_keys = (dataset_key_0, dataset_key_1)
    cache_keys = list('abcd')

    def read_manifests():
      return [
          analyzer_cache._ManifestFile(
              analyzer_cache._get_dataset_cache_path(base_test_dir,
                                                     key)).read()
          for key in dataset_keys
      ]

    # The first write populates each dataset's manifest with two entries.
    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          dataset_key_0: {
              'a': p | 'CreateA' >> beam.Create([b'a']),
              'b': p | 'CreateB' >> beam.Create([b'b']),
          },
          dataset_key_1: {
              'c': p | 'CreateC' >> beam.Create([b'c']),
              'd': p | 'CreateD' >> beam.Create([b'd']),
          },
      }
      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, dataset_keys)

    first_manifests = read_manifests()

    # The second write adds the remaining keys; the manifests should merge
    # with the first write's entries rather than overwrite them.
    with beam.Pipeline() as p:
      cache_pcoll_dict = {
          dataset_key_0: {
              'c': p | 'CreateC' >> beam.Create([b'c']),
              'd': p | 'CreateD' >> beam.Create([b'd']),
          },
          dataset_key_1: {
              'a': p | 'CreateA' >> beam.Create([b'a']),
              'b': p | 'CreateB' >> beam.Create([b'b']),
          },
      }
      _ = cache_pcoll_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, base_test_dir, dataset_keys)

    second_manifests = read_manifests()

    self.assertEqual(len(first_manifests), len(second_manifests))
    for manifest_a, manifest_b in zip(first_manifests, second_manifests):
      for key_value_pair in manifest_a.items():
        self.assertIn(key_value_pair, manifest_b.items())
      self.assertEqual(2, len(manifest_a))
      self.assertCountEqual(range(len(manifest_a)), manifest_a.values())
      self.assertEqual(4, len(manifest_b))
      self.assertCountEqual(range(len(manifest_b)), manifest_b.values())
      self.assertCountEqual(cache_keys, manifest_b.keys())

  def test_cache_helpers_with_alternative_io(self):

    class LocalSink(beam.PTransform):
      """Writes each cache entry to a fixed file under the given path."""

      def __init__(self, path):
        self._path = path

      def expand(self, pcoll):

        def write_to_file(value):
          tf.io.gfile.makedirs(self._path)
          with open(os.path.join(self._path, 'cache'), 'wb') as f:
            f.write(value)

        return pcoll | beam.Map(write_to_file)

    test_cache_dict = {'a': {'b': [bytes([17, 19, 27, 31])]}}

    class LocalSource(beam.PTransform):
      """Ignores the given path and replays the in-memory test cache."""

      def __init__(self, path):
        del path

      def expand(self, pbegin):
        return pbegin | beam.Create([test_cache_dict['a']['b']])

    dataset_keys = list(test_cache_dict.keys())
    cache_dir = self.get_temp_dir()
    with beam.Pipeline() as p:
      _ = test_cache_dict | analyzer_cache.WriteAnalysisCacheToFS(
          p, cache_dir, dataset_keys, sink=LocalSink)

      read_cache = p | analyzer_cache.ReadAnalysisCacheFromFS(
          cache_dir, dataset_keys, source=LocalSource)

      self.assertCountEqual(read_cache.keys(), ['a'])
      self.assertCountEqual(read_cache['a'].keys(), ['b'])
      beam_test_util.assert_that(
          read_cache['a']['b'],
          beam_test_util.equal_to([test_cache_dict['a']['b']]))
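
# A standalone sketch (not part of the original test) of the round trip that
# test_coders_round_trip exercises above: encode_cache serializes a value to
# bytes and decode_cache inverts it. The value mirrors the
# 'JsonNumpyCacheCoderNestedNpTypes' case; the helper name is hypothetical.
def _demo_coder_round_trip():
  coder = analyzer_nodes.JsonNumpyCacheCoder()
  value = [np.int64(1), np.float32(2.5), 3, '4']
  encoded = coder.encode_cache(value)  # Bytes suitable for a cache file.
  np.testing.assert_equal(coder.decode_cache(encoded), value)
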
def _make_mean_and_var_accumulator_from_instance(instance, axis=None):
  # Builds an accumulator whose count and weight are the number of elements
  # along `axis`, with mean and variance computed over `instance`.
  return analyzers._WeightedMeanAndVarAccumulator(
      count=np.sum(np.ones_like(instance), axis=axis),
      mean=np.mean(instance, axis=axis),
      weight=np.sum(np.ones_like(instance), axis=axis),
      variance=np.var(instance, axis=axis))


_MEAN_AND_VAR_SIMPLE_TEST = dict(
    testcase_name='WeightedMeanAndVarSimple',
    combiner=analyzers.WeightedMeanAndVarCombiner(
        np.float32,
        output_shape=(),
        compute_variance=False,
        compute_weighted=False),
    batches=[
        _make_mean_and_var_accumulator_from_instance([[1, 2, 3, 4, 5, 6, 7]]),
        # Count is 5*0xFFFF=327675 for this accumulator.
        _make_mean_and_var_accumulator_from_instance([[8, 9, 10, 11, 12]] *
                                                     0xFFFF),
        _make_mean_and_var_accumulator_from_instance([[100, 200, 3000]]),
    ],
    expected_outputs=analyzers._WeightedMeanAndVarAccumulator(
        count=np.array(327685),
        mean=np.float32(10.00985092390558),
        weight=np.float32(1.0),
        variance=np.float32(0.0)))

_MEAN_AND_VAR_BIG_TEST = dict(
    testcase_name='WeightedMeanAndVarBig',
    combiner=analyzers.WeightedMeanAndVarCombiner(np.float32,
                                                  output_shape=()),
    batches=[
        _make_mean_and_var_accumulator_from_instance([[1, 2, 3, 4, 5, 6, 7]]),
        _make_mean_and_var_accumulator_from_instance([[1e15, 2e15, 3000]]),
        _make_mean_and_var_accumulator_from_instance([[100, 200]]),
    ],
    expected_outputs=[
        np.float32(2.50e+14),
        np.float32(3.541666666665e+29),
    ])
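
# A minimal sketch (not part of the original module) of how parameterized
# test-case dicts like the two above are typically consumed. It assumes only
# the combiner interface (merge_accumulators / extract_output) that
# analyzers.WeightedMeanAndVarCombiner exposes; the class and method names
# here are hypothetical, and the real harness may compare outputs differently.
class _WeightedMeanAndVarCombinerSketchTest(test_case.TransformTestCase):

  @test_case.named_parameters(_MEAN_AND_VAR_BIG_TEST)
  def test_merge_and_extract(self, combiner, batches, expected_outputs):
    # Merge the pre-built per-batch accumulators into one, then extract the
    # combiner's final [mean, variance] output and compare it to the
    # expected values with a relative tolerance.
    merged = combiner.merge_accumulators(batches)
    np.testing.assert_allclose(
        combiner.extract_output(merged), expected_outputs, rtol=1e-4)
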