def testSingleBucket(self):

  def _map_fn(v):
    return (v, array_ops.fill([v], v),
            array_ops.fill([3], string_ops.as_string(v)))

  input_dataset = (
      dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))

  bucketed_dataset = input_dataset.apply(
      grouping.group_by_window(
          lambda x, y, z: 0,
          lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

  iterator = bucketed_dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)

    which_bucket, bucketed_values = sess.run(get_next)

    self.assertEqual(0, which_bucket)

    expected_scalar_int = np.arange(32, dtype=np.int64)
    expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
    for i in range(32):
      expected_unk_int64[i, :i] = i
    expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T

    self.assertAllEqual(expected_scalar_int, bucketed_values[0])
    self.assertAllEqual(expected_unk_int64, bucketed_values[1])
    self.assertAllEqual(expected_vec3_str, bucketed_values[2])
def testDynamicWindowSize(self):
  components = np.arange(100).astype(np.int64)

  # Key fn: even/odd
  # Reduce fn: batches of 5
  # Window size fn: even=5, odd=10

  def window_size_func(key):
    window_sizes = constant_op.constant([5, 10], dtype=dtypes.int64)
    return window_sizes[key]

  dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
      grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
                               None, window_size_func))

  get_next = self.getNext(dataset)
  with self.assertRaises(errors.OutOfRangeError):
    batches = 0
    while True:
      result = self.evaluate(get_next())
      is_even = all(x % 2 == 0 for x in result)
      is_odd = all(x % 2 == 1 for x in result)
      self.assertTrue(is_even or is_odd)
      expected_batch_size = 5 if is_even else 10
      self.assertEqual(expected_batch_size, result.shape[0])
      batches += 1

  self.assertEqual(batches, 15)
def testGroupByWindowDynamicBatchWithPartialBatchWithDropRemainder(self):
  # This test exercises nested batch functionality, dynamic batch size
  # and drop_remainder=True together.
  dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)

  def reduce_fn(key, ds):
    # key == 0 -> .batch(5)
    # key == 1 -> .batch(10)
    return ds.batch(batch_size=(key + 1) * 5, drop_remainder=True)

  dataset = dataset.apply(
      grouping.group_by_window(
          key_func=lambda x: x, reduce_func=reduce_fn, window_size=11))
  dataset = distribute._RebatchDataset(dataset, num_workers=2)

  self.assertEqual([[None]], [ts.as_list() for ts in _flat_shapes(dataset)])

  # The batches of 5 (value == 0) will be split into minibatches of (3, 2)
  # and the batches of 10 (value == 1) into minibatches of (5, 5).
  # [(batch_size, value), ...]
  pairs = [(3, 0), (2, 0), (3, 0), (2, 0), (5, 1), (5, 1), (3, 0), (2, 0)]
  expected_output = [[value] * batch_size for batch_size, value in pairs]
  self.assertDatasetProduces(dataset, expected_output)
def testGroupByWindowDynamicBatch(self):
  # {0, 1, 0, 1, ...}
  dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)

  def reduce_fn(key, ds):
    # key == 0 -> .batch(5)
    # key == 1 -> .batch(10)
    return ds.batch(batch_size=(key + 1) * 5)

  dataset = dataset.apply(
      grouping.group_by_window(
          key_func=lambda x: x, reduce_func=reduce_fn, window_size=10))
  dataset = distribute._RebatchDataset(dataset, num_workers=2)

  self.assertEqual([[None]], [ts.as_list() for ts in _flat_shapes(dataset)])

  # The batches of 5 (value == 0) will be split into minibatches of (3, 2)
  # and the batches of 10 (value == 1) into minibatches of (5, 5).
  # [(batch_size, value), ...]
  pairs = [(3, 0), (2, 0), (3, 0), (2, 0), (5, 1), (5, 1)]
  pairs = pairs * 2
  expected_output = [[value] * batch_size for batch_size, value in pairs]
  self.assertDatasetProduces(dataset, expected_output)
def testSingleBucket(self):

  def _map_fn(v):
    return (v, array_ops.fill([v], v),
            array_ops.fill([3], string_ops.as_string(v)))

  input_dataset = dataset_ops.Dataset.from_tensor_slices(
      math_ops.range(32)).map(_map_fn)

  bucketed_dataset = input_dataset.apply(
      grouping.group_by_window(
          lambda x, y, z: 0,
          lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

  get_next = self.getNext(bucketed_dataset)

  which_bucket, bucketed_values = self.evaluate(get_next())

  self.assertEqual(0, which_bucket)

  expected_scalar_int = np.arange(32, dtype=np.int64)
  expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
  for i in range(32):
    expected_unk_int64[i, :i] = i
  expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T

  self.assertAllEqual(expected_scalar_int, bucketed_values[0])
  self.assertAllEqual(expected_unk_int64, bucketed_values[1])
  self.assertAllEqual(expected_vec3_str, bucketed_values[2])
def testSimple(self):
  components = np.random.randint(100, size=(200,)).astype(np.int64)
  iterator = (
      dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
      .apply(
          grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
                                   4)).make_initializable_iterator())
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    counts = []
    with self.assertRaises(errors.OutOfRangeError):
      while True:
        result = sess.run(get_next)
        # Each batch should contain only even values or only odd values.
        self.assertTrue(
            all(x % 2 == 0 for x in result) or
            all(x % 2 == 1 for x in result))
        counts.append(result.shape[0])

    self.assertEqual(len(components), sum(counts))
    num_full_batches = len([c for c in counts if c == 4])
    self.assertGreaterEqual(num_full_batches, 24)
    self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
def testTwoLevelDistribute(self):
  cluster_1_size = 3
  # Keep references to the workers to avoid garbage-collecting them.
  dispatcher_1, workers_1 = self.start_cluster(  # pylint: disable=unused-variable
      cluster_1_size, name="cluster_1")
  dispatcher_2, workers_2 = self.start_cluster(1, name="cluster_2")  # pylint: disable=unused-variable
  num_sizes = 10
  size_repeats = 5
  strings = ["a" * i for i in range(num_sizes)] * size_repeats
  ds = dataset_ops.Dataset.from_tensor_slices(strings)
  ds = ds.shuffle(len(strings))
  ds = _make_distributed_dataset(ds, dispatcher_1)
  # Large enough so that all strings of the same size are windowed together.
  window_size = cluster_1_size * size_repeats
  batch_size = size_repeats

  def key_func(x):
    return math_ops.cast(string_ops.string_length_v2(x), dtypes.int64)

  ds = ds.apply(
      grouping.group_by_window(
          key_func=key_func,
          reduce_func=lambda _, x: x.batch(batch_size),
          window_size=window_size))
  ds = _make_distributed_dataset(ds, dispatcher_2)

  it = iter(ds)
  for _ in range(num_sizes):
    element = next(it).numpy()
    for _ in range(1, cluster_1_size):
      self.assertAllEqual(next(it).numpy(), element)
  self.assertEmpty(list(it))
def testGroupByWindowCardinality(self):
  dataset = dataset_ops.Dataset.range(1).repeat().apply(
      grouping.group_by_window(
          lambda x: x % 2,
          lambda key, window: dataset_ops.Dataset.from_tensors(key), 4))
  self.assertEqual(
      self.evaluate(dataset.cardinality()), dataset_ops.INFINITE)
def testConsumeWindowDatasetMoreThanOnce(self):
  components = np.random.randint(50, size=(200,)).astype(np.int64)

  def reduce_func(key, window):
    # Apply two different kinds of padding to the input: tight
    # padding, and quantized (to a multiple of 10) padding.
    return dataset_ops.Dataset.zip((
        window.padded_batch(
            4, padded_shapes=tensor_shape.TensorShape([None])),
        window.padded_batch(
            4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
    ))

  dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
      lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)).apply(
          grouping.group_by_window(
              lambda x: math_ops.cast(array_ops.shape(x)[0] // 10,
                                      dtypes.int64), reduce_func, 4))

  get_next = self.getNext(dataset)
  counts = []
  with self.assertRaises(errors.OutOfRangeError):
    while True:
      tight_result, multiple_of_10_result = self.evaluate(get_next())
      self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
      self.assertAllEqual(tight_result,
                          multiple_of_10_result[:, :tight_result.shape[1]])
      counts.append(tight_result.shape[0])
  self.assertEqual(len(components), sum(counts))
def testTwoLevelDistribute(self):
  cluster_1_size = 3
  cluster_1 = data_service_test_base.TestCluster(num_workers=cluster_1_size)
  cluster_2 = data_service_test_base.TestCluster(num_workers=1)
  num_sizes = 10
  size_repeats = 5
  strings = ["a" * i for i in range(num_sizes)] * size_repeats
  ds = dataset_ops.Dataset.from_tensor_slices(strings)
  ds = ds.shuffle(len(strings))
  ds = self.make_distributed_dataset(ds, cluster_1)
  # Large enough so that all strings of the same size are windowed together.
  window_size = cluster_1_size * size_repeats
  batch_size = size_repeats

  def key_func(x):
    return math_ops.cast(string_ops.string_length_v2(x), dtypes.int64)

  ds = ds.apply(
      grouping.group_by_window(
          key_func=key_func,
          reduce_func=lambda _, x: x.batch(batch_size),
          window_size=window_size))
  ds = self.make_distributed_dataset(ds, cluster_2)

  get_next = self.getNext(ds)
  for _ in range(num_sizes):
    element = self.evaluate(get_next())
    for _ in range(1, cluster_1_size):
      self.assertAllEqual(self.evaluate(get_next()), element)
  self.assertEmpty(self.getIteratorOutput(get_next))
def make_group_by_window_dataset(var):

  def reduce_fn(key, bucket):
    del key, bucket
    return dataset_ops.Dataset.from_tensors(var)

  return dataset_ops.Dataset.from_tensors(0).repeat(10).apply(
      grouping.group_by_window(lambda _: 0, reduce_fn, 10))
def testRoundRobinBucketizing(self):
  # Tests a common use case for round robin reads. At each step, all
  # consumers should get batches with the same bucket size.
  cluster = self.create_cluster(num_workers=4)
  num_elements = 100
  ds = dataset_ops.Dataset.range(num_elements, output_type=dtypes.int32)
  ds = ds.shuffle(num_elements)
  low_bucket_max = 30
  mid_bucket_max = 60
  bucket_boundaries = [low_bucket_max, mid_bucket_max]
  batch_size = 10
  num_consumers = 3
  bucket_batch_sizes = [batch_size] * (len(bucket_boundaries) + 1)
  ds = ds.apply(
      grouping.bucket_by_sequence_length(
          lambda x: x,
          bucket_boundaries,
          bucket_batch_sizes,
          drop_remainder=True))
  ds = ds.apply(
      grouping.group_by_window(
          lambda x: math_ops.cast(x[1], dtypes.int64),
          lambda _, x: dataset_ops.Dataset.from_tensors(x),
          window_size=num_consumers))
  ds = ds.flat_map(lambda x: x)
  ds = ds.repeat()

  consumers = []
  for consumer_index in range(num_consumers):
    consumers.append(
        self.make_distributed_dataset(
            ds,
            cluster,
            job_name="test",
            consumer_index=consumer_index,
            num_consumers=num_consumers))
  # Use parallel interleave to read from consumers in parallel.
  ds = dataset_ops.Dataset.from_tensor_slices(consumers)
  ds = ds.interleave(
      lambda x: x.prefetch(num_elements),
      cycle_length=num_consumers,
      num_parallel_calls=num_consumers)

  num_rounds = 10
  get_next = self.getNext(ds, requires_initialization=True)
  results = []
  for _ in range(num_rounds):
    results.append(self.evaluate(get_next()))

  def get_bucket(elem):
    bucket_ind = 0
    while (bucket_ind < len(bucket_boundaries) and
           elem >= bucket_boundaries[bucket_ind]):
      bucket_ind += 1
    return bucket_ind

  # Each consecutive group of `num_consumers` results corresponds to one
  # round-robin round, so all batches in the group should fall into the same
  # bucket. Note that `range` already steps by `num_consumers`, so the slice
  # uses `i` directly.
  for i in range(0, len(results), num_consumers):
    batches = results[i:i + num_consumers]
    bucket_inds = [get_bucket(batch[0]) for batch in batches]
    for bucket_ind in bucket_inds[1:]:
      self.assertEqual(bucket_inds[0], bucket_ind)
def testGroupByWindowWithAutotune(self):
  dataset = dataset_ops.Dataset.range(1000).apply(
      grouping.group_by_window(
          lambda x: x // 10,
          lambda key, window: dataset_ops.Dataset.from_tensors(key), 4))
  dataset = dataset.map(lambda x: x + 1, num_parallel_calls=-1)
  get_next = self.getNext(dataset)
  self.evaluate(get_next())
def testShortCircuit(self):
  dataset = dataset_ops.Dataset.range(10)
  dataset = dataset.apply(
      grouping.group_by_window(lambda x: x, lambda _, window: window.batch(1),
                               1))
  self.assertDatasetProduces(
      dataset, expected_output=[[i] for i in range(10)])
def testEmpty(self):
  dataset = dataset_ops.Dataset.range(4).apply(
      grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
  get_next = self.getNext(dataset)
  with self.assertRaisesRegexp(
      errors.InvalidArgumentError,
      "Window size must be greater than zero, but got 0."):
    print(self.evaluate(get_next()))
def testEvenOddBuckets(self):

  def _map_fn(v):
    return (v, array_ops.fill([v], v),
            array_ops.fill([3], string_ops.as_string(v)))

  input_dataset = (
      dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))

  bucketed_dataset = input_dataset.apply(
      grouping.group_by_window(
          lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
          lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

  iterator = bucketed_dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)

    # Get two minibatches (one containing even values, one containing odds).
    which_bucket_even, bucketed_values_even = sess.run(get_next)
    which_bucket_odd, bucketed_values_odd = sess.run(get_next)

    # Count the number of bucket tensors.
    self.assertEqual(3, len(bucketed_values_even))
    self.assertEqual(3, len(bucketed_values_odd))

    # Ensure the even and odd minibatches came from buckets 0 and 1,
    # respectively.
    self.assertAllEqual(0, which_bucket_even)
    self.assertAllEqual(1, which_bucket_odd)

    # Test the first bucket output, the evens starting at 0.
    expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
    expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
    for i in range(0, 32):
      expected_unk_int64[i, :2 * i] = 2 * i
    expected_vec3_str = np.vstack(
        3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T

    self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
    self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
    self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])

    # Test the second bucket output, the odds starting at 1.
    expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
    expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
    for i in range(0, 32):
      expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
    expected_vec3_str = np.vstack(
        3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T

    self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
    self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
    self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
def testSmallGroups(self):
  components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
  dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
      grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
  get_next = self.getNext(dataset)
  self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
  self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
  # The small outputs at the end are deterministically produced in key
  # order.
  self.assertAllEqual([0, 0, 0], self.evaluate(get_next()))
  self.assertAllEqual([1], self.evaluate(get_next()))
def testEmpty(self):
  iterator = dataset_ops.make_initializable_iterator(
      dataset_ops.Dataset.range(4).apply(
          grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)))
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    self.evaluate(init_op)
    with self.assertRaisesRegexp(
        errors.InvalidArgumentError,
        "Window size must be greater than zero, but got 0."):
      print(self.evaluate(get_next))
def testEmpty(self):
  iterator = (
      dataset_ops.Dataset.range(4).apply(
          grouping.group_by_window(lambda _: 0, lambda _, xs: xs,
                                   0)).make_initializable_iterator())
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    with self.assertRaisesRegexp(
        errors.InvalidArgumentError,
        "Window size must be greater than zero, but got 0."):
      print(sess.run(get_next))
def testSmallGroups(self):
  components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
  iterator = (
      dataset_ops.Dataset.from_tensor_slices(components).apply(
          grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
                                   4)).make_initializable_iterator())
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
    self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
    # The small outputs at the end are deterministically produced in key
    # order.
    self.assertAllEqual([0, 0, 0], sess.run(get_next))
    self.assertAllEqual([1], sess.run(get_next))
def testEvenOddBucketsFilterOutAllOdd(self):

  def _map_fn(v):
    return {
        "x": v,
        "y": array_ops.fill([v], v),
        "z": array_ops.fill([3], string_ops.as_string(v))
    }

  def _dynamic_pad_fn(bucket, window, _):
    return dataset_ops.Dataset.zip(
        (dataset_ops.Dataset.from_tensors(bucket),
         window.padded_batch(
             32, {
                 "x": tensor_shape.TensorShape([]),
                 "y": tensor_shape.TensorShape([None]),
                 "z": tensor_shape.TensorShape([3])
             })))

  input_dataset = (
      dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
      .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))

  bucketed_dataset = input_dataset.apply(
      grouping.group_by_window(
          lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
          lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))

  iterator = bucketed_dataset.make_initializable_iterator()
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)

    # Get two minibatches ([0, 2, ...] and [64, 66, ...]).
    which_bucket0, bucketed_values_even0 = sess.run(get_next)
    which_bucket1, bucketed_values_even1 = sess.run(get_next)

    # Ensure that bucket 1 was completely filtered out.
    self.assertAllEqual(0, which_bucket0)
    self.assertAllEqual(0, which_bucket1)
    self.assertAllEqual(
        np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
    self.assertAllEqual(
        np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
def testImmediateOutput(self):
  components = np.array(
      [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
  dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
      -1).apply(
          grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
                                   4))
  get_next = self.getNext(dataset)
  # The input is infinite, so this test demonstrates that:
  # 1. We produce output without having to consume the entire input,
  # 2. Different buckets can produce output at different rates, and
  # 3. For deterministic input, the output is deterministic.
  for _ in range(3):
    self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
    self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
    self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next()))
    self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
def testReduceFuncError(self):
  components = np.random.randint(100, size=(200,)).astype(np.int64)

  def reduce_func(_, xs):
    # Introduce an incorrect padded shape that cannot (currently) be
    # detected at graph construction time.
    return xs.padded_batch(
        4,
        padded_shapes=(tensor_shape.TensorShape([]),
                       constant_op.constant([5], dtype=dtypes.int64) * -1))

  dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
      lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
          grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
  get_next = self.getNext(dataset)
  with self.assertRaises(errors.InvalidArgumentError):
    self.evaluate(get_next())
def testGroupByWindowBatching(self, drop_remainder):
  dataset = dataset_ops.Dataset.from_tensor_slices(
      [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)])
  reduce_fn = lambda bucket_id, ds: ds.batch(  # pylint: disable=g-long-lambda
      batch_size=10, drop_remainder=drop_remainder)
  dataset = dataset.apply(
      grouping.group_by_window(
          key_func=lambda x: x[0] % 4, reduce_func=reduce_fn, window_size=10))
  rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=2)

  self.assertEqual([[5, 3] if drop_remainder else [None, 3]],
                   [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])

  # pylint: disable=g-complex-comprehension
  expected_output = [[[j + i * 4 + k * 20] * 3
                      for i in range(5)]
                     for j in range(4)
                     for k in range(2)]
  self.assertDatasetProduces(rebatched_dataset, expected_output)
def testGroupByWindowDynamicBatch(self, drop_remainder):
  dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)
  reduce_fn = lambda bucket_id, ds: ds.batch(  # pylint: disable=g-long-lambda
      batch_size=(bucket_id + 1) * 5, drop_remainder=drop_remainder)
  dataset = dataset.apply(
      grouping.group_by_window(
          key_func=lambda x: x, reduce_func=reduce_fn, window_size=10))
  dataset = distribute._RebatchDataset(dataset, num_workers=2)

  self.assertEqual([[None]], [ts.as_list() for ts in _flat_shapes(dataset)])

  # [(batch_size, value), ...]
  # pylint: disable=g-complex-comprehension
  x = [(2, 0), (2, 0), (2, 0), (2, 0), (2, 0), (5, 1), (5, 1), (2, 0), (2, 0),
       (2, 0), (2, 0), (2, 0), (5, 1), (5, 1)]
  expected_output = [[value] * batch_size for batch_size, value in x]
  self.assertDatasetProduces(dataset, expected_output)
def testGroupByWindowStaticBatch(self):
  dataset = dataset_ops.Dataset.from_tensor_slices(
      [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)])
  reduce_fn = lambda bucket_id, ds: ds.batch(  # pylint: disable=g-long-lambda
      batch_size=10)
  dataset = dataset.apply(
      grouping.group_by_window(
          key_func=lambda x: x[0] % 4, reduce_func=reduce_fn, window_size=10))
  rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=2)

  self.assertEqual([[None, 3]],
                   [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
  # pylint: disable=g-complex-comprehension
  expected_output = [[[j + i * 4 + k * 20] * 3
                      for i in range(5)]
                     for j in range(4)
                     for k in range(2)]
  self.assertDatasetProduces(rebatched_dataset, expected_output)
def testStatefulGroupByWindowNotCheckpointable(self):
  stateful_key_func = self._statefulInt64Func
  key_func = lambda _: math_ops.cast(0, dtypes.int64)
  stateful_reduce_func = lambda _, x: self._statefulDatasetFunc(x)
  reduce_func = lambda _, x: x
  stateful_window_func = self._statefulInt64Func
  window_func = lambda x: math_ops.cast(0, dtypes.int64)

  test_cases = [
      (stateful_key_func, reduce_func, window_func),
      (key_func, stateful_reduce_func, window_func),
      (key_func, reduce_func, stateful_window_func),
  ]
  for key_func_fn, reduce_func_fn, window_func in test_cases:
    dataset = dataset_ops.Dataset.range(10)
    dataset = dataset.apply(
        grouping.group_by_window(
            key_func_fn, reduce_func_fn, window_size_func=window_func))
    self._assertNotCheckpointable(dataset)
def testSimple(self):
  components = np.random.randint(100, size=(200,)).astype(np.int64)
  dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
      lambda x: x * x).apply(
          grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
                                   4))
  get_next = self.getNext(dataset)
  counts = []
  with self.assertRaises(errors.OutOfRangeError):
    while True:
      result = self.evaluate(get_next())
      # Each batch should contain only even values or only odd values.
      self.assertTrue(
          all(x % 2 == 0 for x in result) or
          all(x % 2 == 1 for x in result))
      counts.append(result.shape[0])

  self.assertEqual(len(components), sum(counts))
  num_full_batches = len([c for c in counts if c == 4])
  self.assertGreaterEqual(num_full_batches, 24)
  self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
def testGroupByWindowDynamicBatchWithPartialBatch(self):
  # {0, 1, 0, 1, ...}
  dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)

  def reduce_fn(key, ds):
    # key == 0 -> .batch(5)
    # key == 1 -> .batch(10)
    return ds.batch(batch_size=(key + 1) * 5)

  dataset = dataset.apply(
      grouping.group_by_window(
          key_func=lambda x: x, reduce_func=reduce_fn, window_size=11))
  dataset = distribute._RebatchDataset(dataset, num_replicas=2)

  self.assertEqual([[None]], [ts.as_list() for ts in _flat_shapes(dataset)])

  # [(batch_size, value), ...]
  pairs = [(3, 0), (2, 0), (3, 0), (2, 0), (1, 0), (0, 0), (5, 1), (5, 1),
           (1, 1), (0, 1), (3, 0), (2, 0), (2, 0), (2, 0), (5, 1), (4, 1)]
  expected_output = [[value] * batch_size for batch_size, value in pairs]
  self.assertDatasetProduces(dataset, expected_output)
def testImmediateOutput(self):
  components = np.array(
      [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
  iterator = (
      dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
          grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
                                   4)).make_initializable_iterator())
  init_op = iterator.initializer
  get_next = iterator.get_next()

  with self.cached_session() as sess:
    sess.run(init_op)
    # The input is infinite, so this test demonstrates that:
    # 1. We produce output without having to consume the entire input,
    # 2. Different buckets can produce output at different rates, and
    # 3. For deterministic input, the output is deterministic.
    for _ in range(3):
      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
      self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
def group_by_window(key_func,
                    reduce_func,
                    window_size=None,
                    window_size_func=None):
  """A transformation that groups windows of elements by key and reduces them.

  This transformation maps each consecutive element in a dataset to a key
  using `key_func` and groups the elements by key. It then applies
  `reduce_func` to at most `window_size_func(key)` elements matching the same
  key. All except the final window for each key will contain
  `window_size_func(key)` elements; the final window may be smaller.

  You may provide either a constant `window_size` or a window size determined
  by the key through `window_size_func`.

  Args:
    key_func: A function mapping a nested structure of tensors (having shapes
      and types defined by `self.output_shapes` and `self.output_types`) to a
      scalar `tf.int64` tensor.
    reduce_func: A function mapping a key and a dataset of up to `window_size`
      consecutive elements matching that key to another dataset.
    window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
      consecutive elements matching the same key to combine in a single batch,
      which will be passed to `reduce_func`. Mutually exclusive with
      `window_size_func`.
    window_size_func: A function mapping a key to a `tf.int64` scalar
      `tf.Tensor`, representing the number of consecutive elements matching
      the same key to combine in a single batch, which will be passed to
      `reduce_func`. Mutually exclusive with `window_size`.

  Returns:
    A `Dataset` transformation function, which can be passed to
    `tf.data.Dataset.apply`.

  Raises:
    ValueError: if neither or both of {`window_size`, `window_size_func`} are
      passed.
  """
  return grouping.group_by_window(key_func, reduce_func, window_size,
                                  window_size_func)
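To make the signature above concrete, here is a minimal usage sketch. It assumes the public `tf.data.experimental.group_by_window` symbol, which takes the same `(key_func, reduce_func, window_size, window_size_func)` arguments as the wrapper above; the dataset contents, keys, and batch size are illustrative only.

```python
import tensorflow as tf

# Group a stream of integers by parity and emit per-key batches of up to 4
# elements. Uses the public tf.data.experimental.group_by_window symbol,
# which forwards to the same underlying transformation as the wrapper above.
dataset = tf.data.Dataset.range(10)
dataset = dataset.apply(
    tf.data.experimental.group_by_window(
        key_func=lambda x: x % 2,  # even -> key 0, odd -> key 1
        reduce_func=lambda key, window: window.batch(4),
        window_size=4))

for batch in dataset:
  print(batch.numpy())
# Expected per-key windows of at most 4 elements, e.g.:
# [0 2 4 6]
# [1 3 5 7]
# [8]
# [9]
```

As with the tests in this section, the leftover partial windows at the end of the input are flushed per key, which is why the final batches can be smaller than `window_size`.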
def testShard(self):
  filename = self._createFile()
  dataset = readers.TFRecordDataset([filename])

  def reduce_func(key, dataset):
    shard_filename = string_ops.string_join(
        [filename, string_ops.as_string(key)])
    writer = writers.TFRecordWriter(shard_filename)
    writer.write(dataset.map(lambda _, x: x))
    return dataset_ops.Dataset.from_tensors(shard_filename)

  dataset = dataset.enumerate()
  dataset = dataset.apply(
      grouping.group_by_window(lambda i, _: i % 2, reduce_func,
                               dtypes.int64.max))
  get_next = self.getNext(dataset)

  for i in range(2):
    shard_filename = (filename + str(i)).encode()
    self.assertEqual(self.evaluate(get_next()), shard_filename)
    for j, r in enumerate(tf_record.tf_record_iterator(shard_filename)):
      self.assertAllEqual(self._record(i + 2 * j), r)
def _build_dataset(self, components):
  return dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
      grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), 4))