Example #1
    def testSingleBucket(self):
        def _map_fn(v):
            return (v, array_ops.fill([v], v),
                    array_ops.fill([3], string_ops.as_string(v)))

        input_dataset = (dataset_ops.Dataset.from_tensor_slices(
            math_ops.range(32)).map(_map_fn))

        bucketed_dataset = input_dataset.apply(
            grouping.group_by_window(
                lambda x, y, z: 0,
                lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

        iterator = bucketed_dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)

            which_bucket, bucketed_values = sess.run(get_next)

            self.assertEqual(0, which_bucket)

            expected_scalar_int = np.arange(32, dtype=np.int64)
            expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
            for i in range(32):
                expected_unk_int64[i, :i] = i
            expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T

            self.assertAllEqual(expected_scalar_int, bucketed_values[0])
            self.assertAllEqual(expected_unk_int64, bucketed_values[1])
            self.assertAllEqual(expected_vec3_str, bucketed_values[2])
Example #2
  def testDynamicWindowSize(self):
    components = np.arange(100).astype(np.int64)

    # Key fn: even/odd
    # Reduce fn: batch the whole window (batch(20) covers either window size)
    # Window size fn: even=5, odd=10

    def window_size_func(key):
      window_sizes = constant_op.constant([5, 10], dtype=dtypes.int64)
      return window_sizes[key]

    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
        grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
                                 None, window_size_func))

    get_next = self.getNext(dataset)
    with self.assertRaises(errors.OutOfRangeError):
      batches = 0
      while True:
        result = self.evaluate(get_next())
        is_even = all(x % 2 == 0 for x in result)
        is_odd = all(x % 2 == 1 for x in result)
        self.assertTrue(is_even or is_odd)
        expected_batch_size = 5 if is_even else 10
        self.assertEqual(expected_batch_size, result.shape[0])
        batches += 1

    self.assertEqual(batches, 15)
Example #3
    def testGroupByWindowDynamicBatchWithPartialBatchWithDropRemainder(self):
        # This test exercises nested batch functionality, dynamic batch size
        # and drop_remainder=True together.
        dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)

        def reduce_fn(key, ds):
            # key == 0 -> .batch(5)
            # key == 1 -> .batch(10)
            return ds.batch(batch_size=(key + 1) * 5, drop_remainder=True)

        dataset = dataset.apply(
            grouping.group_by_window(key_func=lambda x: x,
                                     reduce_func=reduce_fn,
                                     window_size=11))
        dataset = distribute._RebatchDataset(dataset, num_workers=2)

        self.assertEqual([[None]],
                         [ts.as_list() for ts in _flat_shapes(dataset)])

        # The batches of 5 (value == 0) will be split into minibatches of (3, 2) and
        # the batches of 10 (value == 1) split into minibatches of (5, 5)
        # [(batch_size, value), ...]
        pairs = [(3, 0), (2, 0), (3, 0), (2, 0), (5, 1), (5, 1), (3, 0),
                 (2, 0)]
        expected_output = [[value] * batch_size for batch_size, value in pairs]
        self.assertDatasetProduces(dataset, expected_output)
Example #4
    def testGroupByWindowDynamicBatch(self):
        # {0, 1, 0, 1, ...}
        dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)

        def reduce_fn(key, ds):
            # key == 0 -> .batch(5)
            # key == 1 -> .batch(10)
            return ds.batch(batch_size=(key + 1) * 5)

        dataset = dataset.apply(
            grouping.group_by_window(key_func=lambda x: x,
                                     reduce_func=reduce_fn,
                                     window_size=10))
        dataset = distribute._RebatchDataset(dataset, num_workers=2)

        self.assertEqual([[None]],
                         [ts.as_list() for ts in _flat_shapes(dataset)])

        # The batches of 5 (value == 0) will be split into minibatches of (3, 2) and
        # the batches of 10 (value == 1) split into minibatches of (5, 5)
        # [(batch_size, value), ...]
        pairs = [(3, 0), (2, 0), (3, 0), (2, 0), (5, 1), (5, 1)]
        pairs = pairs * 2
        expected_output = [[value] * batch_size for batch_size, value in pairs]
        self.assertDatasetProduces(dataset, expected_output)
Example #5
  def testSingleBucket(self):

    def _map_fn(v):
      return (v, array_ops.fill([v], v),
              array_ops.fill([3], string_ops.as_string(v)))

    input_dataset = dataset_ops.Dataset.from_tensor_slices(
        math_ops.range(32)).map(_map_fn)

    bucketed_dataset = input_dataset.apply(
        grouping.group_by_window(
            lambda x, y, z: 0,
            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
    get_next = self.getNext(bucketed_dataset)

    which_bucket, bucketed_values = self.evaluate(get_next())

    self.assertEqual(0, which_bucket)

    expected_scalar_int = np.arange(32, dtype=np.int64)
    expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
    for i in range(32):
      expected_unk_int64[i, :i] = i
    expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T

    self.assertAllEqual(expected_scalar_int, bucketed_values[0])
    self.assertAllEqual(expected_unk_int64, bucketed_values[1])
    self.assertAllEqual(expected_vec3_str, bucketed_values[2])
Example #6
  def testSingleBucket(self):

    def _map_fn(v):
      return (v, array_ops.fill([v], v),
              array_ops.fill([3], string_ops.as_string(v)))

    input_dataset = (
        dataset_ops.Dataset.from_tensor_slices(math_ops.range(32)).map(_map_fn))

    bucketed_dataset = input_dataset.apply(
        grouping.group_by_window(
            lambda x, y, z: 0,
            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

    iterator = bucketed_dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)

      which_bucket, bucketed_values = sess.run(get_next)

      self.assertEqual(0, which_bucket)

      expected_scalar_int = np.arange(32, dtype=np.int64)
      expected_unk_int64 = np.zeros((32, 31)).astype(np.int64)
      for i in range(32):
        expected_unk_int64[i, :i] = i
      expected_vec3_str = np.vstack(3 * [np.arange(32).astype(bytes)]).T

      self.assertAllEqual(expected_scalar_int, bucketed_values[0])
      self.assertAllEqual(expected_unk_int64, bucketed_values[1])
      self.assertAllEqual(expected_vec3_str, bucketed_values[2])
Example #7
  def testSimple(self):
    components = np.random.randint(100, size=(200,)).astype(np.int64)
    iterator = (
        dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
        .apply(
            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
                                     4)).make_initializable_iterator())
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      counts = []
      with self.assertRaises(errors.OutOfRangeError):
        while True:
          result = sess.run(get_next)
          self.assertTrue(
              all(x % 2 == 0 for x in result) or
              all(x % 2 == 1 for x in result))
          counts.append(result.shape[0])

      self.assertEqual(len(components), sum(counts))
      num_full_batches = len([c for c in counts if c == 4])
      self.assertGreaterEqual(num_full_batches, 24)
      self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
Example #8
    def testTwoLevelDistribute(self):
        cluster_1_size = 3
        dispatcher_1, workers_1 = self.start_cluster(  # to avoid gcing workers, pylint: disable=unused-variable
            cluster_1_size,
            name="cluster_1")
        dispatcher_2, workers_2 = self.start_cluster(1, name="cluster_2")  # to avoid gcing workers, pylint: disable=unused-variable
        num_sizes = 10
        size_repeats = 5
        strings = ["a" * i for i in range(num_sizes)] * size_repeats
        ds = dataset_ops.Dataset.from_tensor_slices(strings)
        ds = ds.shuffle(len(strings))
        ds = _make_distributed_dataset(ds, dispatcher_1)
        # Large enough so that all strings of the same size are windowed together.
        window_size = cluster_1_size * size_repeats
        batch_size = size_repeats

        def key_func(x):
            return math_ops.cast(string_ops.string_length_v2(x), dtypes.int64)

        ds = ds.apply(
            grouping.group_by_window(
                key_func=key_func,
                reduce_func=lambda _, x: x.batch(batch_size),
                window_size=window_size))
        ds = _make_distributed_dataset(ds, dispatcher_2)

        it = iter(ds)
        for _ in range(num_sizes):
            element = next(it).numpy()
            for _ in range(1, cluster_1_size):
                self.assertAllEqual(next(it).numpy(), element)
        self.assertEmpty(list(it))
Example #9
  def testGroupByWindowCardinality(self):
    dataset = dataset_ops.Dataset.range(1).repeat().apply(
        grouping.group_by_window(
            lambda x: x % 2,
            lambda key, window: dataset_ops.Dataset.from_tensors(key), 4))
    self.assertEqual(self.evaluate(dataset.cardinality()),
                     dataset_ops.INFINITE)
Example #10
  def testConsumeWindowDatasetMoreThanOnce(self):
    components = np.random.randint(50, size=(200,)).astype(np.int64)

    def reduce_func(key, window):
      # Apply two different kinds of padding to the input: tight
      # padding, and quantized (to a multiple of 10) padding.
      return dataset_ops.Dataset.zip((
          window.padded_batch(
              4, padded_shapes=tensor_shape.TensorShape([None])),
          window.padded_batch(
              4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
      ))

    dataset = dataset_ops.Dataset.from_tensor_slices(
        components
    ).map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x)).apply(
        grouping.group_by_window(
            lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
            reduce_func, 4))

    get_next = self.getNext(dataset)
    counts = []
    with self.assertRaises(errors.OutOfRangeError):
      while True:
        tight_result, multiple_of_10_result = self.evaluate(get_next())
        self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
        self.assertAllEqual(tight_result,
                            multiple_of_10_result[:, :tight_result.shape[1]])
        counts.append(tight_result.shape[0])
    self.assertEqual(len(components), sum(counts))
Example #11
    def testTwoLevelDistribute(self):
        cluster_1_size = 3
        cluster_1 = data_service_test_base.TestCluster(
            num_workers=cluster_1_size)
        cluster_2 = data_service_test_base.TestCluster(num_workers=1)
        num_sizes = 10
        size_repeats = 5
        strings = ["a" * i for i in range(num_sizes)] * size_repeats
        ds = dataset_ops.Dataset.from_tensor_slices(strings)
        ds = ds.shuffle(len(strings))
        ds = self.make_distributed_dataset(ds, cluster_1)
        # Large enough so that all strings of the same size are windowed together.
        window_size = cluster_1_size * size_repeats
        batch_size = size_repeats

        def key_func(x):
            return math_ops.cast(string_ops.string_length_v2(x), dtypes.int64)

        ds = ds.apply(
            grouping.group_by_window(
                key_func=key_func,
                reduce_func=lambda _, x: x.batch(batch_size),
                window_size=window_size))
        ds = self.make_distributed_dataset(ds, cluster_2)

        get_next = self.getNext(ds)
        for _ in range(num_sizes):
            element = self.evaluate(get_next())
            for _ in range(1, cluster_1_size):
                self.assertAllEqual(self.evaluate(get_next()), element)
        self.assertEmpty(self.getIteratorOutput(get_next))
Example #12
    def testSimple(self):
        components = np.random.randint(100, size=(200, )).astype(np.int64)
        iterator = (dataset_ops.Dataset.from_tensor_slices(components).map(
            lambda x: x * x).apply(
                grouping.group_by_window(lambda x: x % 2,
                                         lambda _, xs: xs.batch(4),
                                         4)).make_initializable_iterator())
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            counts = []
            with self.assertRaises(errors.OutOfRangeError):
                while True:
                    result = sess.run(get_next)
                    self.assertTrue(
                        all(x % 2 == 0 for x in result)
                        or all(x % 2 == 1 for x in result))
                    counts.append(result.shape[0])

            self.assertEqual(len(components), sum(counts))
            num_full_batches = len([c for c in counts if c == 4])
            self.assertGreaterEqual(num_full_batches, 24)
            self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
Example #13
    def make_group_by_window_dataset(var):
        def reduce_fn(key, bucket):
            del key, bucket
            return dataset_ops.Dataset.from_tensors(var)

        return dataset_ops.Dataset.from_tensors(0).repeat(10).apply(
            grouping.group_by_window(lambda _: 0, reduce_fn, 10))
Example #14
    def testRoundRobinBucketizing(self):
        # Tests a common use case for round robin reads. At each step, all
        # consumers should get batches with the same bucket size.
        cluster = self.create_cluster(num_workers=4)
        num_elements = 100
        ds = dataset_ops.Dataset.range(num_elements, output_type=dtypes.int32)
        ds = ds.shuffle(num_elements)
        low_bucket_max = 30
        mid_bucket_max = 60
        bucket_boundaries = [low_bucket_max, mid_bucket_max]
        batch_size = 10
        num_consumers = 3
        bucket_batch_sizes = [batch_size] * (len(bucket_boundaries) + 1)
        ds = ds.apply(
            grouping.bucket_by_sequence_length(lambda x: x,
                                               bucket_boundaries,
                                               bucket_batch_sizes,
                                               drop_remainder=True))
        ds = ds.apply(
            grouping.group_by_window(
                lambda x: math_ops.cast(x[1], dtypes.int64),
                lambda _, x: dataset_ops.Dataset.from_tensors(x),
                window_size=num_consumers))
        ds = ds.flat_map(lambda x: x)
        ds = ds.repeat()

        consumers = []
        for consumer_index in range(num_consumers):
            consumers.append(
                self.make_distributed_dataset(ds,
                                              cluster,
                                              job_name="test",
                                              consumer_index=consumer_index,
                                              num_consumers=num_consumers))
        # Use parallel interleave to read from consumers in parallel.
        ds = dataset_ops.Dataset.from_tensor_slices(consumers)
        ds = ds.interleave(lambda x: x.prefetch(num_elements),
                           cycle_length=num_consumers,
                           num_parallel_calls=num_consumers)

        num_rounds = 10
        get_next = self.getNext(ds, requires_initialization=True)
        results = []
        for _ in range(num_rounds):
            results.append(self.evaluate(get_next()))

        def get_bucket(elem):
            bucket_ind = 0
            while bucket_ind < len(bucket_boundaries
                                   ) and elem >= bucket_boundaries[bucket_ind]:
                bucket_ind += 1
            return bucket_ind

        for i in range(0, len(results), num_consumers):
            batches = results[i:i + num_consumers]
            bucket_inds = [get_bucket(batch[0]) for batch in batches]
            for bucket_ind in bucket_inds[1:]:
                self.assertEqual(bucket_inds[0], bucket_ind)
Example #15
  def testGroupByWindowWithAutotune(self):
    dataset = dataset_ops.Dataset.range(1000).apply(
        grouping.group_by_window(
            lambda x: x // 10,
            lambda key, window: dataset_ops.Dataset.from_tensors(key), 4))
    dataset = dataset.map(lambda x: x + 1, num_parallel_calls=-1)
    get_next = self.getNext(dataset)
    self.evaluate(get_next())
Example #16
  def testShortCircuit(self):

    dataset = dataset_ops.Dataset.range(10)
    dataset = dataset.apply(
        grouping.group_by_window(lambda x: x, lambda _, window: window.batch(1),
                                 1))
    self.assertDatasetProduces(
        dataset, expected_output=[[i] for i in range(10)])
Example #17
  def make_group_by_window_dataset(var):

    def reduce_fn(key, bucket):
      del key, bucket
      return dataset_ops.Dataset.from_tensors(var)

    return dataset_ops.Dataset.from_tensors(0).repeat(10).apply(
        grouping.group_by_window(lambda _: 0, reduce_fn, 10))
Example #18
  def testEmpty(self):
    dataset = dataset_ops.Dataset.range(4).apply(
        grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))

    get_next = self.getNext(dataset)
    with self.assertRaisesRegexp(
        errors.InvalidArgumentError,
        "Window size must be greater than zero, but got 0."):
      print(self.evaluate(get_next()))
Example #19
  def testEvenOddBuckets(self):

    def _map_fn(v):
      return (v, array_ops.fill([v], v),
              array_ops.fill([3], string_ops.as_string(v)))

    input_dataset = (
        dataset_ops.Dataset.from_tensor_slices(math_ops.range(64)).map(_map_fn))

    bucketed_dataset = input_dataset.apply(
        grouping.group_by_window(
            lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
            lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

    iterator = bucketed_dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)

      # Get two minibatches (one containing even values, one containing odds)
      which_bucket_even, bucketed_values_even = sess.run(get_next)
      which_bucket_odd, bucketed_values_odd = sess.run(get_next)

      # Count number of bucket_tensors.
      self.assertEqual(3, len(bucketed_values_even))
      self.assertEqual(3, len(bucketed_values_odd))

      # Ensure the even minibatch came from bucket 0 and the odd minibatch
      # from bucket 1.
      self.assertAllEqual(0, which_bucket_even)
      self.assertAllEqual(1, which_bucket_odd)

      # Test the first bucket outputted, the evens starting at 0
      expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
      expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
      for i in range(0, 32):
        expected_unk_int64[i, :2 * i] = 2 * i
      expected_vec3_str = np.vstack(
          3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T

      self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
      self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
      self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])

      # Test the second bucket outputted, the odds starting at 1
      expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
      expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
      for i in range(0, 32):
        expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
      expected_vec3_str = np.vstack(
          3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T

      self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
      self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
      self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
Example #20
    def testEvenOddBuckets(self):
        def _map_fn(v):
            return (v, array_ops.fill([v], v),
                    array_ops.fill([3], string_ops.as_string(v)))

        input_dataset = (dataset_ops.Dataset.from_tensor_slices(
            math_ops.range(64)).map(_map_fn))

        bucketed_dataset = input_dataset.apply(
            grouping.group_by_window(
                lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
                lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))

        iterator = bucketed_dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)

            # Get two minibatches (one containing even values, one containing odds)
            which_bucket_even, bucketed_values_even = sess.run(get_next)
            which_bucket_odd, bucketed_values_odd = sess.run(get_next)

            # Count number of bucket_tensors.
            self.assertEqual(3, len(bucketed_values_even))
            self.assertEqual(3, len(bucketed_values_odd))

            # Ensure the even minibatch came from bucket 0 and the odd
            # minibatch from bucket 1.
            self.assertAllEqual(0, which_bucket_even)
            self.assertAllEqual(1, which_bucket_odd)

            # Test the first bucket outputted, the evens starting at 0
            expected_scalar_int = np.arange(0, 32 * 2, 2, dtype=np.int64)
            expected_unk_int64 = np.zeros((32, 31 * 2)).astype(np.int64)
            for i in range(0, 32):
                expected_unk_int64[i, :2 * i] = 2 * i
            expected_vec3_str = np.vstack(
                3 * [np.arange(0, 32 * 2, 2).astype(bytes)]).T

            self.assertAllEqual(expected_scalar_int, bucketed_values_even[0])
            self.assertAllEqual(expected_unk_int64, bucketed_values_even[1])
            self.assertAllEqual(expected_vec3_str, bucketed_values_even[2])

            # Test the second bucket outputted, the odds starting at 1
            expected_scalar_int = np.arange(1, 32 * 2 + 1, 2, dtype=np.int64)
            expected_unk_int64 = np.zeros((32, 31 * 2 + 1)).astype(np.int64)
            for i in range(0, 32):
                expected_unk_int64[i, :2 * i + 1] = 2 * i + 1
            expected_vec3_str = np.vstack(
                3 * [np.arange(1, 32 * 2 + 1, 2).astype(bytes)]).T

            self.assertAllEqual(expected_scalar_int, bucketed_values_odd[0])
            self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
            self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
Example #21
  def testSmallGroups(self):
    components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
    dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
        grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4), 4))
    get_next = self.getNext(dataset)
    self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
    self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
    # The small outputs at the end are deterministically produced in key
    # order.
    self.assertAllEqual([0, 0, 0], self.evaluate(get_next()))
    self.assertAllEqual([1], self.evaluate(get_next()))
Example #22
  def testEmpty(self):
    iterator = dataset_ops.make_initializable_iterator(
        dataset_ops.Dataset.range(4).apply(
            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)))
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      self.evaluate(init_op)
      with self.assertRaisesRegexp(
          errors.InvalidArgumentError,
          "Window size must be greater than zero, but got 0."):
        print(self.evaluate(get_next))
Example #23
    def testEmpty(self):
        iterator = (dataset_ops.Dataset.range(4).apply(
            grouping.group_by_window(lambda _: 0, lambda _, xs: xs,
                                     0)).make_initializable_iterator())
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            with self.assertRaisesRegexp(
                    errors.InvalidArgumentError,
                    "Window size must be greater than zero, but got 0."):
                print(sess.run(get_next))
Example #24
  def testSmallGroups(self):
    components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
    iterator = (
        dataset_ops.Dataset.from_tensor_slices(components).apply(
            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
                                     4)).make_initializable_iterator())
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
      # The small outputs at the end are deterministically produced in key
      # order.
      self.assertAllEqual([0, 0, 0], sess.run(get_next))
      self.assertAllEqual([1], sess.run(get_next))
Example #25
  def testEvenOddBucketsFilterOutAllOdd(self):

    def _map_fn(v):
      return {
          "x": v,
          "y": array_ops.fill([v], v),
          "z": array_ops.fill([3], string_ops.as_string(v))
      }

    def _dynamic_pad_fn(bucket, window, _):
      return dataset_ops.Dataset.zip(
          (dataset_ops.Dataset.from_tensors(bucket),
           window.padded_batch(
               32, {
                   "x": tensor_shape.TensorShape([]),
                   "y": tensor_shape.TensorShape([None]),
                   "z": tensor_shape.TensorShape([3])
               })))

    input_dataset = (
        dataset_ops.Dataset.from_tensor_slices(math_ops.range(128)).map(_map_fn)
        .filter(lambda d: math_ops.equal(d["x"] % 2, 0)))

    bucketed_dataset = input_dataset.apply(
        grouping.group_by_window(
            lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
            lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))

    iterator = bucketed_dataset.make_initializable_iterator()
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)

      # Get two minibatches ([0, 2, ...] and [64, 66, ...])
      which_bucket0, bucketed_values_even0 = sess.run(get_next)
      which_bucket1, bucketed_values_even1 = sess.run(get_next)

      # Ensure that bucket 1 was completely filtered out
      self.assertAllEqual(0, which_bucket0)
      self.assertAllEqual(0, which_bucket1)
      self.assertAllEqual(
          np.arange(0, 64, 2, dtype=np.int64), bucketed_values_even0["x"])
      self.assertAllEqual(
          np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
Example #26
  def testImmediateOutput(self):
    components = np.array(
        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
    dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
        -1).apply(
            grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
                                     4))
    get_next = self.getNext(dataset)
    # The input is infinite, so this test demonstrates that:
    # 1. We produce output without having to consume the entire input,
    # 2. Different buckets can produce output at different rates, and
    # 3. For deterministic input, the output is deterministic.
    for _ in range(3):
      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next()))
      self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next()))
      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next()))
Example #27
  def testReduceFuncError(self):
    components = np.random.randint(100, size=(200,)).astype(np.int64)

    def reduce_func(_, xs):
      # Introduce an incorrect padded shape that cannot (currently) be
      # detected at graph construction time.
      return xs.padded_batch(
          4,
          padded_shapes=(tensor_shape.TensorShape([]),
                         constant_op.constant([5], dtype=dtypes.int64) * -1))

    dataset = dataset_ops.Dataset.from_tensor_slices(
        components).map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32))
    get_next = self.getNext(dataset)
    with self.assertRaises(errors.InvalidArgumentError):
      self.evaluate(get_next())
Example #28
    def testEvenOddBucketsFilterOutAllOdd(self):
        def _map_fn(v):
            return {
                "x": v,
                "y": array_ops.fill([v], v),
                "z": array_ops.fill([3], string_ops.as_string(v))
            }

        def _dynamic_pad_fn(bucket, window, _):
            return dataset_ops.Dataset.zip(
                (dataset_ops.Dataset.from_tensors(bucket),
                 window.padded_batch(
                     32, {
                         "x": tensor_shape.TensorShape([]),
                         "y": tensor_shape.TensorShape([None]),
                         "z": tensor_shape.TensorShape([3])
                     })))

        input_dataset = (dataset_ops.Dataset.from_tensor_slices(
            math_ops.range(128)).map(_map_fn).filter(
                lambda d: math_ops.equal(d["x"] % 2, 0)))

        bucketed_dataset = input_dataset.apply(
            grouping.group_by_window(
                lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
                lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))

        iterator = bucketed_dataset.make_initializable_iterator()
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)

            # Get two minibatches ([0, 2, ...] and [64, 66, ...])
            which_bucket0, bucketed_values_even0 = sess.run(get_next)
            which_bucket1, bucketed_values_even1 = sess.run(get_next)

            # Ensure that bucket 1 was completely filtered out
            self.assertAllEqual(0, which_bucket0)
            self.assertAllEqual(0, which_bucket1)
            self.assertAllEqual(np.arange(0, 64, 2, dtype=np.int64),
                                bucketed_values_even0["x"])
            self.assertAllEqual(np.arange(64, 128, 2, dtype=np.int64),
                                bucketed_values_even1["x"])
Example #29
  def testGroupByWindowBatching(self, drop_remainder):
    dataset = dataset_ops.Dataset.from_tensor_slices(
        [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)])
    reduce_fn = lambda bucket_id, ds: ds.batch(
        batch_size=10, drop_remainder=drop_remainder)
    dataset = dataset.apply(
        grouping.group_by_window(
            key_func=lambda x: x[0] % 4, reduce_func=reduce_fn, window_size=10))
    rebatched_dataset = distribute._RebatchDataset(dataset, num_workers=2)

    self.assertEqual([[5, 3] if drop_remainder else [None, 3]],
                     [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
    # pylint: disable=g-complex-comprehension
    expected_output = [[[j + i * 4 + k * 20] * 3
                        for i in range(5)]
                       for j in range(4)
                       for k in range(2)]
    self.assertDatasetProduces(rebatched_dataset, expected_output)
Example #30
    def testGroupByWindowDynamicBatch(self, drop_remainder):
        dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)
        reduce_fn = lambda bucket_id, ds: ds.batch(  # pylint: disable=g-long-lambda
            batch_size=(bucket_id + 1) * 5,
            drop_remainder=drop_remainder)
        dataset = dataset.apply(
            grouping.group_by_window(key_func=lambda x: x,
                                     reduce_func=reduce_fn,
                                     window_size=10))
        dataset = distribute._RebatchDataset(dataset, num_workers=2)

        self.assertEqual([[None]],
                         [ts.as_list() for ts in _flat_shapes(dataset)])
        # pylint: disable=g-complex-comprehension
        x = [(2, 0), (2, 0), (2, 0), (2, 0), (2, 0), (5, 1), (5, 1), (2, 0),
             (2, 0), (2, 0), (2, 0), (2, 0), (5, 1), (5, 1)]
        expected_output = [[value] * batch_size for batch_size, value in x]
        self.assertDatasetProduces(dataset, expected_output)
Example #31
    def testSmallGroups(self):
        components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0],
                              dtype=np.int64)
        iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
            grouping.group_by_window(lambda x: x % 2,
                                     lambda _, xs: xs.batch(4),
                                     4)).make_initializable_iterator())
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
            self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
            # The small outputs at the end are deterministically produced in key
            # order.
            self.assertAllEqual([0, 0, 0], sess.run(get_next))
            self.assertAllEqual([1], sess.run(get_next))
Example #32
  def testGroupByWindowStaticBatch(self):
    dataset = dataset_ops.Dataset.from_tensor_slices(
        [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)])
    reduce_fn = lambda bucket_id, ds: ds.batch(  # pylint: disable=g-long-lambda
        batch_size=10)
    dataset = dataset.apply(
        grouping.group_by_window(
            key_func=lambda x: x[0] % 4, reduce_func=reduce_fn, window_size=10))
    rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=2)

    self.assertEqual([[None, 3]],
                     [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])
    # pylint: disable=g-complex-comprehension
    expected_output = [[[j + i * 4 + k * 20] * 3
                        for i in range(5)]
                       for j in range(4)
                       for k in range(2)]
    self.assertDatasetProduces(rebatched_dataset, expected_output)
Example #33
  def testStatefulGroupByWindowNotCheckpointable(self):
    stateful_key_func = self._statefulInt64Func
    key_func = lambda _: math_ops.cast(0, dtypes.int64)
    stateful_reduce_func = lambda _, x: self._statefulDatasetFunc(x)
    reduce_func = lambda _, x: x
    stateful_window_func = self._statefulInt64Func
    window_func = lambda x: math_ops.cast(0, dtypes.int64)

    test_cases = [
        (stateful_key_func, reduce_func, window_func),
        (key_func, stateful_reduce_func, window_func),
        (key_func, reduce_func, stateful_window_func),
    ]
    for key_func_fn, reduce_func_fn, window_func in test_cases:
      dataset = dataset_ops.Dataset.range(10)
      dataset = dataset.apply(
          grouping.group_by_window(
              key_func_fn, reduce_func_fn, window_size_func=window_func))
      self._assertNotCheckpointable(dataset)
Example #34
  def testSimple(self):
    components = np.random.randint(100, size=(200,)).astype(np.int64)
    dataset = dataset_ops.Dataset.from_tensor_slices(
        components).map(lambda x: x * x).apply(
            grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
                                     4))
    get_next = self.getNext(dataset)
    counts = []
    with self.assertRaises(errors.OutOfRangeError):
      while True:
        result = self.evaluate(get_next())
        self.assertTrue(
            all(x % 2 == 0 for x in result) or all(x % 2 == 1 for x in result))
        counts.append(result.shape[0])

    self.assertEqual(len(components), sum(counts))
    num_full_batches = len([c for c in counts if c == 4])
    self.assertGreaterEqual(num_full_batches, 24)
    self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
Example #35
  def testGroupByWindowDynamicBatchWithPartialBatch(self):
    # {0, 1, 0, 1, ...}
    dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2)

    def reduce_fn(key, ds):
      # key == 0 -> .batch(5)
      # key == 1 -> .batch(10)
      return ds.batch(batch_size=(key + 1) * 5)

    dataset = dataset.apply(
        grouping.group_by_window(
            key_func=lambda x: x, reduce_func=reduce_fn, window_size=11))
    dataset = distribute._RebatchDataset(dataset, num_replicas=2)

    self.assertEqual([[None]], [ts.as_list() for ts in _flat_shapes(dataset)])

    pairs = [(3, 0), (2, 0), (3, 0), (2, 0), (1, 0), (0, 0), (5, 1), (5, 1),
             (1, 1), (0, 1), (3, 0), (2, 0), (2, 0), (2, 0), (5, 1), (4, 1)]
    expected_output = [[value] * batch_size for batch_size, value in pairs]
    self.assertDatasetProduces(dataset, expected_output)
Example #36
  def testImmediateOutput(self):
    components = np.array(
        [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
    iterator = (
        dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
            grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
                                     4)).make_initializable_iterator())
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      # The input is infinite, so this test demonstrates that:
      # 1. We produce output without having to consume the entire input,
      # 2. Different buckets can produce output at different rates, and
      # 3. For deterministic input, the output is deterministic.
      for _ in range(3):
        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
Example #37
def group_by_window(key_func,
                    reduce_func,
                    window_size=None,
                    window_size_func=None):
    """A transformation that groups windows of elements by key and reduces them.

    This transformation maps each consecutive element in a dataset to a key
    using `key_func` and groups the elements by key. It then applies
    `reduce_func` to at most `window_size_func(key)` elements matching the same
    key. All except the final window for each key will contain
    `window_size_func(key)` elements; the final window may be smaller.

    You may provide either a constant `window_size` or a window size determined
    by the key through `window_size_func`.

    Args:
      key_func: A function mapping a nested structure of tensors
        (having shapes and types defined by `self.output_shapes` and
        `self.output_types`) to a scalar `tf.int64` tensor.
      reduce_func: A function mapping a key and a dataset of up to `window_size`
        consecutive elements matching that key to another dataset.
      window_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
        consecutive elements matching the same key to combine in a single
        batch, which will be passed to `reduce_func`. Mutually exclusive with
        `window_size_func`.
      window_size_func: A function mapping a key to a `tf.int64` scalar
        `tf.Tensor`, representing the number of consecutive elements matching
        the same key to combine in a single batch, which will be passed to
        `reduce_func`. Mutually exclusive with `window_size`.

    Returns:
      A `Dataset` transformation function, which can be passed to
      `tf.data.Dataset.apply`.

    Raises:
      ValueError: if neither or both of {`window_size`, `window_size_func`} are
        passed.
    """
    return grouping.group_by_window(key_func, reduce_func, window_size,
                                    window_size_func)
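The snippet below is a minimal, self-contained usage sketch of the transformation documented above. It is written against the public `tf.data.experimental.group_by_window` API (the wrapper above forwards to the same `grouping.group_by_window` call) and assumes TensorFlow 2.x with eager execution:

import tensorflow as tf

ds = tf.data.Dataset.range(10)
ds = ds.apply(
    tf.data.experimental.group_by_window(
        key_func=lambda x: x % 2,                         # group elements by parity
        reduce_func=lambda key, window: window.batch(5),  # batch each window
        window_size=5))                                   # at most 5 elements per window

for batch in ds:
  print(batch.numpy())  # expected: [0 2 4 6 8] followed by [1 3 5 7 9]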
Example #38
    def testImmediateOutput(self):
        components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0],
                              dtype=np.int64)
        iterator = (dataset_ops.Dataset.from_tensor_slices(components).repeat(
            -1).apply(
                grouping.group_by_window(lambda x: x % 3,
                                         lambda _, xs: xs.batch(4),
                                         4)).make_initializable_iterator())
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            # The input is infinite, so this test demonstrates that:
            # 1. We produce output without having to consume the entire input,
            # 2. Different buckets can produce output at different rates, and
            # 3. For deterministic input, the output is deterministic.
            for _ in range(3):
                self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
                self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
                self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
                self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
Example #39
    def testShard(self):
        filename = self._createFile()
        dataset = readers.TFRecordDataset([filename])

        def reduce_func(key, dataset):
            shard_filename = string_ops.string_join(
                [filename, string_ops.as_string(key)])
            writer = writers.TFRecordWriter(shard_filename)
            writer.write(dataset.map(lambda _, x: x))
            return dataset_ops.Dataset.from_tensors(shard_filename)

        dataset = dataset.enumerate()
        dataset = dataset.apply(
            grouping.group_by_window(lambda i, _: i % 2, reduce_func,
                                     dtypes.int64.max))

        get_next = self.getNext(dataset)
        for i in range(2):
            shard_filename = (filename + str(i)).encode()
            self.assertEqual(self.evaluate(get_next()), shard_filename)
            for j, r in enumerate(
                    tf_record.tf_record_iterator(shard_filename)):
                self.assertAllEqual(self._record(i + 2 * j), r)
Example #40
  def _build_dataset(self, components):
    return dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
        grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4), 4))