Example #1
  def testAdditionalOperationsAfterReadBack(self):
    self.setUpTFRecord()
    filenames = self.test_filenames

    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in range(0, 10)
    ]

    tmpdir = self.makeSnapshotDirectory()
    dataset = core_readers._TFRecordDataset(filenames)
    dataset = dataset.apply(snapshot.snapshot(tmpdir))
    self.assertDatasetProduces(dataset, expected)

    # Remove the original files and read the data back only from the snapshot.
    self.removeTFRecords()

    dataset2 = core_readers._TFRecordDataset(filenames)
    dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
    self.assertDatasetProduces(dataset2, expected)

    # substr_v2(x, 2, 1000) strips the leading "Re", so records read back as
    # b"cord %d of file %d".
    expected_after = [
        b"cord %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in range(0, 10)
    ]

    dataset3 = core_readers._TFRecordDataset(filenames)
    dataset3 = dataset3.apply(snapshot.snapshot(tmpdir))
    dataset3 = dataset3.map(lambda x: string_ops.substr_v2(x, 2, 1000))
    self.assertDatasetProduces(dataset3, expected_after)
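
For comparison, a minimal sketch of the same write-then-read-back flow using the public tf.data API instead of the internal test helpers (assumes TF 2.3+, where `tf.data.experimental.snapshot` is available; the snapshot path is hypothetical):

import tensorflow as tf

snapshot_dir = "/tmp/my_snapshot"  # hypothetical path

# First pass: consuming the dataset writes the snapshot to disk.
ds = tf.data.Dataset.from_tensor_slices([b"Record %d" % i for i in range(10)])
ds = ds.apply(tf.data.experimental.snapshot(snapshot_dir))
for _ in ds:
  pass

# A second, identically built pipeline is served from the snapshot, and
# downstream transformations such as map() compose as usual.
ds2 = tf.data.Dataset.from_tensor_slices([b"Record %d" % i for i in range(10)])
ds2 = ds2.apply(tf.data.experimental.snapshot(snapshot_dir))
ds2 = ds2.map(lambda x: tf.strings.substr(x, 2, 1000))  # b"Record 0" -> b"cord 0"
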
Example #2
  def testReadShuffledSnapshotWithSeedAfterWrite(self):
    self.setUpTFRecord(num_files=10, num_records=50)
    filenames = self._filenames

    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in range(0, 50)
    ]

    tmpdir = self.snapshot_dir
    dataset = core_readers._TFRecordDataset(filenames)
    dataset = dataset.apply(
        snapshot.legacy_snapshot(tmpdir, shard_size_bytes=10))
    self.assertDatasetProduces(dataset, expected)

    # Remove the original files and read the data back only from the snapshot.
    self.removeTFRecords()

    dataset2 = core_readers._TFRecordDataset(filenames)
    dataset2 = dataset2.apply(
        snapshot.legacy_snapshot(
            tmpdir,
            shard_size_bytes=10,
            shuffle_on_read=True,
            shuffle_seed=123456))
    next2 = self.getNext(dataset2)

    dataset3 = core_readers._TFRecordDataset(filenames)
    dataset3 = dataset3.apply(
        snapshot.legacy_snapshot(
            tmpdir,
            shard_size_bytes=10,
            shuffle_on_read=True,
            shuffle_seed=123456))
    next3 = self.getNext(dataset3)

    # Make sure the items are read back in the same order for both datasets.
    for _ in range(500):
      res2 = self.evaluate(next2())
      res3 = self.evaluate(next3())
      self.assertEqual(res2, res3)
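
The determinism this test relies on mirrors ordinary seeded shuffling in tf.data: with a fixed op-level seed and `reshuffle_each_iteration=False`, two independently built pipelines produce the same order. A minimal sketch (plain tf.data, not the snapshot reader):

import tensorflow as tf

ds_a = tf.data.Dataset.range(10).shuffle(
    10, seed=123456, reshuffle_each_iteration=False)
ds_b = tf.data.Dataset.range(10).shuffle(
    10, seed=123456, reshuffle_each_iteration=False)
assert list(ds_a.as_numpy_iterator()) == list(ds_b.as_numpy_iterator())
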
Example #3
    def testTFRecordReaderWithDirectFileNames(self):
        # Using `_TFRecordDataset` creates a raw op rather than automatically
        # wrapping it in a flat_map.
        dataset = core_readers._TFRecordDataset(self._filenames)
        dataset = distribute._AutoShardDataset(dataset, 5, 0)

        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in (0, 5)
        ]
        self.assertDatasetProduces(dataset, expected)
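
The expected list above is what element-level sharding yields in general: worker i of n keeps every n-th element. A public-API sketch of the same arithmetic with `Dataset.shard`:

import tensorflow as tf

ds = tf.data.Dataset.range(100)            # stand-in for 10 files x 10 records
worker0 = ds.shard(num_shards=5, index=0)  # keeps elements 0, 5, 10, ...
assert list(worker0.as_numpy_iterator()) == list(range(0, 100, 5))
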
Example #4
    def testReadSnapshotBackAfterWrite(self):
        self.setUpTFRecord()
        filenames = self.test_filenames

        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in range(0, 10)
        ]

        tmpdir = self.makeSnapshotDirectory()
        dataset = core_readers._TFRecordDataset(filenames)
        dataset = dataset.apply(snapshot.snapshot(tmpdir))
        self.assertDatasetProduces(dataset, expected)

        # Remove the original files and read the data back only from the snapshot.
        self.removeTFRecords()

        dataset2 = core_readers._TFRecordDataset(filenames)
        dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
        self.assertDatasetProduces(dataset2, expected)
Example #5
    def testReadShuffledSnapshotAfterWrite(self):
        self.setUpTFRecord(num_files=10, num_records=50)
        filenames = self.test_filenames

        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in range(0, 50)
        ]

        tmpdir = self.snapshot_dir
        dataset = core_readers._TFRecordDataset(filenames)
        dataset = dataset.apply(
            snapshot.legacy_snapshot(tmpdir, shard_size_bytes=100))
        self.assertDatasetProduces(dataset, expected)

        # Remove the original files and read the data back only from the snapshot.
        self.removeTFRecords()

        dataset2 = core_readers._TFRecordDataset(filenames)
        dataset2 = dataset2.apply(
            snapshot.legacy_snapshot(tmpdir,
                                     shard_size_bytes=100,
                                     shuffle_on_read=True))
        next2 = self.getNext(dataset2)

        res1 = self.evaluate(next2())
        res2 = self.evaluate(next2())
        res3 = self.evaluate(next2())
        res4 = self.evaluate(next2())
        res5 = self.evaluate(next2())

        # Make sure we don't read the elements back in the same order.
        self.assertNotEqual([res1, res2, res3, res4, res5], expected[0:5])

        # Make sure all the elements are still there.
        dataset3 = core_readers._TFRecordDataset(filenames)
        dataset3 = dataset3.apply(
            snapshot.legacy_snapshot(tmpdir,
                                     shard_size_bytes=100,
                                     shuffle_on_read=True))
        self.assertDatasetProduces(dataset3, expected, assert_items_equal=True)
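
The two assertions separate order from content: a shuffled read should (with overwhelming probability) differ in order while matching as a multiset, which is what `assert_items_equal=True` checks. A plain-Python sketch of that distinction:

import collections

expected = [b"a", b"b", b"c", b"d", b"e"]
shuffled = [b"c", b"a", b"e", b"b", b"d"]
assert shuffled != expected  # order differs
assert collections.Counter(shuffled) == collections.Counter(expected)  # same elements
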
Example #6
  def testTFRecordReaderWithDirectFileNames(self):
    # Using `_TFRecordDataset` creates a raw op rather than automatically
    # wrapping it in a flat_map.
    dataset = core_readers._TFRecordDataset(self.test_filenames)
    dataset = distribute._AutoShardDataset(dataset, 5, 0)

    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in (0, 5)
    ]
    self.assertDatasetProduces(dataset, expected)
Example #7
    def testReadSnapshotDatasetDefault(self):
        self.createTFRecords()
        filenames = self._test_filenames
        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in range(0, 100)
        ]

        dataset = core_readers._TFRecordDataset(filenames)
        dataset = dataset.apply(snapshot.snapshot(self._snapshot_dir))
        self.assertDatasetProduces(dataset, expected)
        self.assertSnapshotDirectoryContains(
            self._snapshot_dir,
            num_fingerprints=1,
            num_runs_per_fingerprint=1,
            num_snapshot_shards_per_run=multiprocessing.cpu_count())

        self.removeTFRecords()
        dataset2 = core_readers._TFRecordDataset(filenames)
        dataset2 = dataset2.apply(snapshot.snapshot(self._snapshot_dir))
        self.assertDatasetProduces(dataset2, expected)
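
The directory assertion encodes the snapshot layout: one sub-directory per pipeline fingerprint, one per writing run, and (with the default shard_func) one shard file per CPU core. An illustrative sketch of that hierarchy; the exact file names are an assumption, not taken from the source:

# <snapshot_dir>/
#   <fingerprint>/     # hash of the input pipeline up to the snapshot op
#     <run_id>/        # one sub-directory per writing run
#       0.snapshot     # one shard per writer, cpu_count() shards by default
#       1.snapshot
#       ...
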
Example #8
  def testReadSnapshotBackAfterWrite(self):
    self.setUpTFRecord()
    filenames = self.test_filenames

    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in range(0, 10)
    ]

    tmpdir = self.makeSnapshotDirectory()
    dataset = core_readers._TFRecordDataset(filenames)
    dataset = dataset.apply(snapshot.snapshot(tmpdir))
    self.assertDatasetProduces(dataset, expected)

    # Remove the original files and read the data back only from the snapshot.
    self.removeTFRecords()

    dataset2 = core_readers._TFRecordDataset(filenames)
    dataset2 = dataset2.apply(snapshot.snapshot(tmpdir))
    self.assertDatasetProduces(dataset2, expected)
Example #9
    def testTFRecordReaderWithDirectFileNamesAndShapes(self):
        # Using `_TFRecordDataset` creates a raw op rather than automatically
        # wrapping it in a flat_map.
        dataset = core_readers._TFRecordDataset(self._filenames)

        # BatchDataset contains `output_types` and `output_shapes`
        dataset = dataset.batch(5)
        dataset = distribute._AutoShardDataset(dataset, 2, 0)

        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in range(0, 5)
        ]
        self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
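
Why the expected output is `list(chunk(expected, 5))`: sharding after `batch(5)` treats each batch as a single element, so worker 0 of 2 keeps every other batch, i.e. records 0-4 of each file. A plain-Python check of that arithmetic:

records = [(f, r) for f in range(10) for r in range(10)]
batches = [records[i:i + 5] for i in range(0, len(records), 5)]  # 20 batches
worker0 = batches[0::2]  # worker 0 of 2 keeps every other batch
assert worker0 == [[(f, r) for r in range(5)] for f in range(10)]
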
Example #10
    def testReadSnapshotDatasetCustomShardFn(self):
        self.createTFRecords()
        filenames = self._test_filenames
        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in range(0, 100)
        ]

        dataset = core_readers._TFRecordDataset(filenames)
        dataset = dataset.apply(
            snapshot.snapshot(self._snapshot_dir,
                              shard_func=lambda _: np.int64(0)))
        self.assertDatasetProduces(dataset, expected)
        self.assertSnapshotDirectoryContains(self._snapshot_dir,
                                             num_fingerprints=1,
                                             num_runs_per_fingerprint=1,
                                             num_snapshot_shards_per_run=1)

        self.removeTFRecords()
        dataset2 = core_readers._TFRecordDataset(filenames)
        dataset2 = dataset2.apply(
            snapshot.snapshot(self._snapshot_dir, shard_func=lambda _: 0))
        self.assertDatasetProduces(dataset2, expected)
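
A constant shard_func funnels every element into a single shard file, which is why the assertion expects num_snapshot_shards_per_run=1. A minimal sketch with the public `Dataset.snapshot` method (available in TF 2.6+; the path is hypothetical):

import tensorflow as tf

ds = tf.data.Dataset.range(100)
ds = ds.snapshot("/tmp/one_shard_snapshot",  # hypothetical path
                 shard_func=lambda _: tf.constant(0, dtype=tf.int64))
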
Example #11
    def testAutoshardPolicyOff(self):
        options = options_lib.Options()
        options.experimental_distribute.auto_shard_policy = (
            options_lib.AutoShardPolicy.OFF)

        dataset = core_readers._TFRecordDataset(self._filenames)
        dataset = dataset.with_options(options)
        dataset = distribute._AutoShardDataset(dataset, 5, 0)

        # Should return every record in every file since autosharding is turned off.
        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in range(0, 10)
        ]
        self.assertDatasetProduces(dataset, expected)
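
The same policy can be set through the public API; with OFF, tf.distribute leaves the dataset unsharded and every worker sees every element. A minimal sketch:

import tensorflow as tf

options = tf.data.Options()
options.experimental_distribute.auto_shard_policy = (
    tf.data.experimental.AutoShardPolicy.OFF)
ds = tf.data.Dataset.range(100).with_options(options)
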
Example #12
  def testTFRecordReaderWithDirectFileNamesAndShapes(self):
    # Using `_TFRecordDataset` creates a raw op rather than automatically
    # wrapping it in a flat_map.
    dataset = core_readers._TFRecordDataset(self.test_filenames)

    # BatchDataset contains `output_types` and `output_shapes`
    dataset = dataset.batch(5)
    dataset = distribute._AutoShardDataset(dataset, 2, 0)

    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in range(0, 5)
    ]
    self.assertDatasetProduces(dataset, list(chunk(expected, 5)))
Example #13
  def testReadSnapshotDatasetCustomReaderFn(self):
    self.createTFRecords()
    filenames = self._test_filenames
    expected = [
        b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
        for f in range(0, 10)
        for r in range(0, 100)
    ]

    dataset = core_readers._TFRecordDataset(filenames)
    dataset = dataset.apply(
        snapshot.snapshot(
            self._snapshot_dir,
            reader_func=(
                lambda ds: ds.interleave(  # pylint:disable=g-long-lambda
                    lambda x: x,
                    cycle_length=4,
                    num_parallel_calls=4))))
    self.assertDatasetProduces(dataset, expected)
    self.assertSnapshotDirectoryContains(
        self._snapshot_dir,
        num_fingerprints=1,
        num_runs_per_fingerprint=1,
        num_snapshot_shards_per_run=multiprocessing.cpu_count())

    self.removeTFRecords()
    dataset2 = core_readers._TFRecordDataset(filenames)
    dataset2 = dataset2.apply(
        snapshot.snapshot(
            self._snapshot_dir,
            reader_func=(
                lambda ds: ds.interleave(  # pylint:disable=g-long-lambda
                    lambda x: x,
                    cycle_length=4,
                    num_parallel_calls=4))))
    self.assertDatasetProducesSet(dataset2, expected)
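
reader_func receives a dataset of per-shard datasets and returns how they are combined; interleaving several shards in parallel speeds up reading but, as the final set-based assertion reflects, no longer preserves element order. A sketch with the public `Dataset.snapshot` method (TF 2.6+; the path is hypothetical):

import tensorflow as tf

ds = tf.data.Dataset.range(100)
ds = ds.snapshot(
    "/tmp/interleaved_snapshot",  # hypothetical path
    reader_func=lambda shards: shards.interleave(
        lambda shard: shard, cycle_length=4, num_parallel_calls=4))
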
Example #14
    def testWorkersGreaterThanNumFilesWithDataSharding(self):
        options = options_lib.Options()
        options.experimental_distribute.auto_shard_policy = (
            options_lib.AutoShardPolicy.DATA)

        dataset = core_readers._TFRecordDataset(self._filenames)
        dataset = dataset.with_options(options)
        dataset = distribute._AutoShardDataset(dataset, 5, 0)

        # Should return "Record (0,5) of file (0 --> 9)" since we are sharding by
        # individual elements, we should be able to get some data from all files.
        expected = [
            b"Record %d of file %d" % (r, f)  # pylint:disable=g-complex-comprehension
            for f in range(0, 10) for r in (0, 5)
        ]
        self.assertDatasetProduces(dataset, expected)
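
A plain-Python check of that expectation: flattening 10 files x 10 records in file order and keeping every 5th element (worker 0 of 5) yields exactly records 0 and 5 of each file:

elements = [(f, r) for f in range(10) for r in range(10)]
worker0 = elements[0::5]  # DATA sharding: worker 0 of 5
assert worker0 == [(f, r) for f in range(10) for r in (0, 5)]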