Example #1
    def test_avro_dataset(self):
        """Test case for AvroDataset."""
        # The test.bin was created from avro/lang/c++/examples/datafile.cc.
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_avro", "test.bin")
        filename = "file://" + filename

        schema_filename = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "test_avro",
            "cpx.json")
        with open(schema_filename, 'r') as f:
            schema = f.read()

        columns = ['im', 're']
        output_types = (dtypes.float64, dtypes.float64)
        num_repeats = 2

        dataset = avro_io.AvroDataset([filename], columns, schema,
                                      output_types).repeat(num_repeats)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(num_repeats):
                for i in range(100):
                    (im, re) = (i + 100, i * 100)
                    vv = sess.run(get_next)
                    self.assertAllClose((im, re), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

        dataset = avro_io.AvroDataset([filename, filename],
                                      columns,
                                      schema,
                                      output_types,
                                      batch=3)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for ii in range(0, 198, 3):
                i = ii % 100
                (im, re) = ([
                    i + 100, ((i + 1) % 100) + 100, ((i + 2) % 100) + 100
                ], [i * 100, ((i + 1) % 100) * 100, ((i + 2) % 100) * 100])
                vv = sess.run(get_next)
                self.assertAllClose((im, re), vv)
            (im, re) = ([198, 199], [9800, 9900])
            vv = sess.run(get_next)
            self.assertAllClose((im, re), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
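These snippets are excerpted from test classes and omit their imports. A minimal preamble that would make the Avro example above self-contained might look like the following; the exact module path for avro_io is an assumption, and the sibling aliases used by the later examples (cifar_io, hdf5_io, hadoop_io, json_io, parquet_io) would be imported the same way:

    import os

    import numpy as np
    import tensorflow as tf
    from tensorflow import dtypes, errors
    from tensorflow.compat.v1 import data  # provides make_initializable_iterator

    # Assumed module path; adjust to the tensorflow-io version in use.
    import tensorflow_io.avro as avro_io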
Example #2
    def test_cifar_10_dataset(self):
        """Test case for CIFARDataset.
    """
        url = 'https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz'
        filedata = urlopen(url)
        f, filename = tempfile.mkstemp()
        os.write(f, filedata.read())
        os.close(f)
        (x_train, y_train), (x_test,
                             y_test) = tf.keras.datasets.cifar10.load_data()

        num_repeats = 2

        dataset = cifar_io.CIFAR10Dataset(filename,
                                          batch=3).repeat(num_repeats)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            for _ in range(num_repeats):  # Dataset is repeated.
                for i in range(16666):
                    image, label = sess.run(get_next)
                    self.assertAllEqual(image[0], x_train[i * 3 + 0])
                    self.assertEqual(label[0], y_train[i * 3 + 0])
                    self.assertAllEqual(image[1], x_train[i * 3 + 1])
                    self.assertEqual(label[1], y_train[i * 3 + 1])
                    self.assertAllEqual(image[2], x_train[i * 3 + 2])
                    self.assertEqual(label[2], y_train[i * 3 + 2])
                image, label = sess.run(get_next)
                self.assertAllEqual(image[0], x_train[49998])
                self.assertEqual(label[0], y_train[49998])
                self.assertAllEqual(image[1], x_train[49999])
                self.assertEqual(label[1], y_train[49999])
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

        dataset = cifar_io.CIFAR10Dataset(filename,
                                          test=True).repeat(num_repeats)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.cached_session() as sess:
            sess.run(init_op)
            for _ in range(num_repeats):  # Dataset is repeated.
                for i in range(10000):
                    image, label = sess.run(get_next)
                    self.assertAllEqual(image, x_test[i])
                    self.assertEqual(label, y_test[i])
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
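The loop bounds follow from the CIFAR-10 split sizes: 50,000 training and 10,000 test records. With batch=3 the training split yields 16,666 full batches plus a final partial batch of two, which the test asserts separately:

    # 50,000 training records with batch size 3:
    full_batches, remainder = divmod(50000, 3)
    print(full_batches, remainder)  # 16666 2 -> range(16666), then one batch of 2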
Example #3
    def test_hdf5_dataset_int32_zlib(self):
        """Test case for HDF5Dataset with zlib."""
        # Note the file is generated with tdset.h5:
        # with h5py.File('compressed_h5.h5', 'w') as output_f:
        #   output_f.create_dataset(
        #       '/dset1', data=h5f['/dset1'][()], compression='gzip')
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_hdf5", "compressed_h5.h5")
        filename = "file://" + filename
        column = '/dset1'
        dtype = dtypes.int32
        shape = tf.TensorShape([None, 20])

        dataset = hdf5_io.HDF5Dataset(filename,
                                      column,
                                      start=0,
                                      stop=10,
                                      dtype=dtype,
                                      shape=shape).apply(
                                          tf.data.experimental.unbatch())
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()
        with self.test_session() as sess:
            sess.run(init_op)
            for i in range(10):
                v0 = [np.asarray([v for v in range(i, i + 20)])]
                vv = sess.run(get_next)
                self.assertAllEqual(v0, [vv])
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
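The comment above sketches how the compressed fixture was produced. A runnable version, assuming h5py is installed and tdset.h5 is in the working directory:

    import h5py

    # Copy /dset1 from the uncompressed fixture into a gzip-compressed dataset.
    with h5py.File('tdset.h5', 'r') as h5f:
        with h5py.File('compressed_h5.h5', 'w') as output_f:
            output_f.create_dataset('/dset1', data=h5f['/dset1'][()],
                                    compression='gzip')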
Example #4
    def test_hdf5_dataset_int32(self):
        """Test case for HDF5Dataset."""
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_hdf5", "tdset.h5")
        filename = "file://" + filename
        column = '/dset1'
        dtype = dtypes.int32
        shape = tf.TensorShape([None, 20])

        dataset = hdf5_io.HDF5Dataset(filename,
                                      column,
                                      start=0,
                                      stop=10,
                                      dtype=dtype,
                                      shape=shape).apply(
                                          tf.data.experimental.unbatch())
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()
        with self.test_session() as sess:
            sess.run(init_op)
            for i in range(10):
                v0 = [np.asarray([v for v in range(i, i + 20)])]
                vv = sess.run(get_next)
                self.assertAllEqual(v0, [vv])
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
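The asserted values imply that rows 0-9 of /dset1 in tdset.h5 satisfy dset1[i][j] == i + j. A hypothetical h5py script that generates an equivalent fixture:

    import h5py
    import numpy as np

    # dset1[i][j] = i + j, matching the values the test asserts above.
    values = np.array([[i + j for j in range(20)] for i in range(10)],
                      dtype=np.int32)
    with h5py.File('tdset.h5', 'w') as f:
        f.create_dataset('/dset1', data=values)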
Example #5
  def test_sequence_file_dataset(self):
    """Test case for SequenceFileDataset.

    The file is generated with `org.apache.hadoop.io.Text` for key/value.
    There are 25 records in the file with the format of:
    key = XXX
    value = VALUEXXX
    where XXX is replaced as the line number (starts with 001).
    """
    filename = os.path.join(os.path.dirname(os.path.abspath(__file__)), "test_hadoop", "string.seq")

    num_repeats = 2

    dataset = hadoop_io.SequenceFileDataset([filename]).repeat(
        num_repeats)
    iterator = data.make_initializable_iterator(dataset)
    init_op = iterator.initializer
    get_next = iterator.get_next()

    with self.cached_session() as sess:
      sess.run(init_op)
      for _ in range(num_repeats):  # Dataset is repeated.
        for i in range(25):  # 25 records.
          v0 = ("%03d" % (i + 1)).encode()
          v1 = ("VALUE%03d" % (i + 1)).encode()
          self.assertEqual((v0, v1), sess.run(get_next))
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
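Per the docstring, the fixture holds 25 Text key/value pairs; the expected byte strings the loop compares against can be enumerated directly:

    # Keys "001".."025" and values "VALUE001".."VALUE025", as bytes.
    records = [(("%03d" % (i + 1)).encode(), ("VALUE%03d" % (i + 1)).encode())
               for i in range(25)]
    assert records[0] == (b"001", b"VALUE001")
    assert records[-1] == (b"025", b"VALUE025")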
Example #6
  def test_hdf5_dataset(self):
    """Test case for HDF5Dataset."""
    filename = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "test_hdf5", "tdset.h5")
    filename = "file://" + filename
    columns = ['/dset2']
    output_types = [dtypes.float64]
    output_shapes = [(1, 20)]

    dataset = hdf5_io.HDF5Dataset(
        [filename], columns, output_types, output_shapes, batch=1)
    iterator = data.make_initializable_iterator(dataset)
    init_op = iterator.initializer
    get_next = iterator.get_next()
    with self.test_session() as sess:
      sess.run(init_op)
      for i in range(30):
        v0 = [np.asarray([[i + 1e-04 * v for v in range(20)]],
                         dtype=np.float64)]
        vv = sess.run(get_next)
        self.assertAllEqual(v0, vv)
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
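The asserted values imply that /dset2 is a 30x20 float64 matrix with dset2[i][j] == i + 1e-04 * j, and with batch=1 each element keeps a leading batch dimension of size 1 (hence the nested [[...]] in v0). A hypothetical fixture generator consistent with those assertions:

    import h5py
    import numpy as np

    values = np.array([[i + 1e-04 * j for j in range(20)] for i in range(30)],
                      dtype=np.float64)
    with h5py.File('tdset.h5', 'a') as f:  # append alongside /dset1
        f.create_dataset('/dset2', data=values)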
Example #7
    def test_hdf5_dataset_int32_zlib(self):
        """Test case for HDF5Dataset with zlib."""
        # Note the file is generated with tdset.h5:
        # with h5py.File('compressed_h5.h5', 'w') as output_f:
        #   output_f.create_dataset(
        #       '/dset1', data=h5f['/dset1'][()], compression='gzip')
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_hdf5", "compressed_h5.h5")
        filename = "file://" + filename
        columns = ['/dset1']
        output_types = [dtypes.int32]
        output_shapes = [(1, 20)]

        dataset = hdf5_io.HDF5Dataset([filename], columns, output_types,
                                      output_shapes)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()
        with self.test_session() as sess:
            sess.run(init_op)
            for i in range(10):
                v0 = [np.asarray([v for v in range(i, i + 20)])]
                vv = sess.run(get_next)
                self.assertAllEqual(v0, vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
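Note that this reads the same compressed fixture as Example #3, but through the list-based HDF5Dataset signature (columns, output_types, output_shapes) rather than the per-column signature with start/stop/dtype/shape; both iterate over the same ten rows before raising OutOfRangeError.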
Example #8
  def test_json_dataset(self):
    """Test case for JSONDataset."""
    filename = os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "test_json",
        "feature.json")

    columns = ['floatfeature', 'integerfeature']
    output_types = (dtypes.float64, dtypes.int64)
    num_repeats = 2

    dataset = json_io.JSONDataset(
        filename, columns=columns, dtypes=output_types).repeat(num_repeats)
    iterator = data.make_initializable_iterator(dataset)
    init_op = iterator.initializer
    get_next = iterator.get_next()

    test_json = [(1.1, 2), (2.1, 3)]
    with self.test_session() as sess:
      sess.run(init_op)
      for _ in range(num_repeats):
        for i in range(2):
          (floatf, intf) = test_json[i]
          vv = sess.run(get_next)
          self.assertAllClose((floatf, intf), vv)
      with self.assertRaises(errors.OutOfRangeError):
        sess.run(get_next)
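The expected tuples suggest feature.json holds two records. A hypothetical file consistent with the test (the actual on-disk layout, a JSON array versus newline-delimited records, is an assumption):

    import json

    records = [{"floatfeature": 1.1, "integerfeature": 2},
               {"floatfeature": 2.1, "integerfeature": 3}]
    with open("feature.json", "w") as f:
        json.dump(records, f)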
Example #9
    def test_hdf5_invalid_dataset(self):
        """test_hdf5_invalid_dataset"""
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_hdf5", "tdset.h5")
        filename = "file://" + filename
        dataset = hdf5_io.HDF5Dataset([filename], ['/invalid', '/invalid2'],
                                      [dtypes.int32, dtypes.int32], [(1, 20),
                                                                     (1, 30)])
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                         "unable to open dataset /invalid"):
                sess.run(get_next)
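Because tf.data pipelines are evaluated lazily, constructing the dataset and running the initializer both succeed here; the InvalidArgumentError for the missing /invalid column only surfaces on the first get_next call.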
Example #10
    def test_parquet_dataset(self):
        """Test case for ParquetDataset.

    Note: The sample file is generated from:
    `parquet-cpp/examples/low-level-api/reader_writer`
    This test extracts columns of [0, 1, 2, 4, 5]
    with column data types of [bool, int32, int64, float, double].
    Please check `parquet-cpp/examples/low-level-api/reader-writer.cc`
    to find details of how records are generated:
    Column 0 (bool): True for even rows and False otherwise.
    Column 1 (int32): Equal to row_index.
    Column 2 (int64): Equal to row_index * 1000 * 1000 * 1000 * 1000.
    Column 4 (float): Equal to row_index * 1.1.
    Column 5 (double): Equal to row_index * 1.1111111.
    """
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_parquet", "parquet_cpp_example.parquet")

        filenames = tf.constant([filename], dtypes.string)
        columns = [0, 1, 2, 4, 5]
        output_types = (dtypes.bool, dtypes.int32, dtypes.int64,
                        dtypes.float32, dtypes.float64)
        num_repeats = 2

        dataset = parquet_io.ParquetDataset(filenames, columns,
                                            output_types).repeat(num_repeats)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(num_repeats):  # Dataset is repeated.
                for i in range(500):  # 500 rows.
                    v0 = ((i % 2) == 0)
                    v1 = i
                    v2 = i * 1000 * 1000 * 1000 * 1000
                    v4 = 1.1 * i
                    v5 = 1.1111111 * i
                    self.assertAllClose((v0, v1, v2, v4, v5),
                                        sess.run(get_next))
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
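Here the columns are selected by zero-based index, skipping index 3, which matches the [bool, int32, int64, float, double] types listed in the docstring; Example #13 below reads the same file by field name instead.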
Example #11
    def test_hdf5_invalid_dataset(self):
        """test_hdf5_invalid_dataset"""
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_hdf5", "tdset.h5")
        filename = "file://" + filename
        dataset = hdf5_io.HDF5Dataset(filename,
                                      '/invalid',
                                      dtype=dtypes.int32,
                                      shape=tf.TensorShape([1, 20]),
                                      start=0,
                                      stop=10)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                         "unable to open dataset"):
                sess.run(get_next)
Example #12
    def test_hdf5_dataset_binary(self):
        """Test case for HDF5Dataset."""
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_hdf5", "tbinary.h5")
        filename = "file://" + filename
        columns = ['integer', 'float', 'double']
        output_types = [dtypes.int32, dtypes.float32, dtypes.float64]
        output_shapes = [(1), (1), (1)]
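        # Note: in Python, (1) is just the integer 1; a one-element tuple is (1,).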

        dataset = hdf5_io.HDF5Dataset([filename], columns, output_types,
                                      output_shapes)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()
        with self.test_session() as sess:
            sess.run(init_op)
            for i in range(1, 7):
                vv = sess.run(get_next)
                self.assertAllEqual((i, np.float32(i), np.float64(i)), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
Example #13
    def test_parquet_dataset(self):
        """Test case for ParquetDataset.

    Note: The sample file is generated from:
    `parquet-cpp/examples/low-level-api/reader_writer`
    This test extracts columns of [0, 1, 2, 4, 5]
    with column data types of [bool, int32, int64, float, double].
    Please check `parquet-cpp/examples/low-level-api/reader-writer.cc`
    to find details of how records are generated:
    Column 0 (bool): True for even rows and False otherwise.
    Column 1 (int32): Equal to row_index.
    Column 2 (int64): Equal to row_index * 1000 * 1000 * 1000 * 1000.
    Column 4 (float): Equal to row_index * 1.1.
    Column 5 (double): Equal to row_index * 1.1111111.
    """
        filename = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                                "test_parquet", "parquet_cpp_example.parquet")
        filename = "file://" + filename
        columns = [
            'boolean_field', 'int32_field', 'int64_field', 'float_field',
            'double_field'
        ]
        output_types = (dtypes.bool, dtypes.int32, dtypes.int64,
                        dtypes.float32, dtypes.float64)
        num_repeats = 2

        dataset = parquet_io.ParquetDataset([filename], columns,
                                            output_types).repeat(num_repeats)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(num_repeats):  # Dataset is repeated.
                for i in range(500):  # 500 rows.
                    v0 = ((i % 2) == 0)
                    v1 = i
                    v2 = i * 1000 * 1000 * 1000 * 1000
                    v4 = 1.1 * i
                    v5 = 1.1111111 * i
                    vv = sess.run(get_next)
                    self.assertAllClose((v0, v1, v2, v4, v5), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

        dataset = parquet_io.ParquetDataset([filename],
                                            columns,
                                            output_types,
                                            batch=1)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for i in range(500):
                v0 = ((i % 2) == 0)
                v1 = i
                v2 = i * 1000 * 1000 * 1000 * 1000
                v4 = 1.1 * i
                v5 = 1.1111111 * i
                vv = sess.run(get_next)
                self.assertAllClose(([v0], [v1], [v2], [v4], [v5]), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

        dataset = parquet_io.ParquetDataset([filename, filename],
                                            columns,
                                            output_types,
                                            batch=3)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for ii in range(0, 999, 3):
                v0, v1, v2, v4, v5 = [], [], [], [], []
                for i in [ii % 500, (ii + 1) % 500, (ii + 2) % 500]:
                    v0.append((i % 2) == 0)
                    v1.append(i)
                    v2.append(i * 1000 * 1000 * 1000 * 1000)
                    v4.append(1.1 * i)
                    v5.append(1.1111111 * i)
                vv = sess.run(get_next)
                self.assertAllClose((v0, v1, v2, v4, v5), vv)
            i = 999 % 500
            v0 = ((i % 2) == 0)
            v1 = i
            v2 = i * 1000 * 1000 * 1000 * 1000
            v4 = 1.1 * i
            v5 = 1.1111111 * i
            vv = sess.run(get_next)
            self.assertAllClose(([v0], [v1], [v2], [v4], [v5]), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)

        # With compression
        filename = filename + '.gz'
        dataset = parquet_io.ParquetDataset([filename], columns,
                                            output_types).repeat(num_repeats)
        iterator = data.make_initializable_iterator(dataset)
        init_op = iterator.initializer
        get_next = iterator.get_next()

        with self.test_session() as sess:
            sess.run(init_op)
            for _ in range(num_repeats):  # Dataset is repeated.
                for i in range(500):  # 500 rows.
                    v0 = ((i % 2) == 0)
                    v1 = i
                    v2 = i * 1000 * 1000 * 1000 * 1000
                    v4 = 1.1 * i
                    v5 = 1.1111111 * i
                    vv = sess.run(get_next)
                    self.assertAllClose((v0, v1, v2, v4, v5), vv)
            with self.assertRaises(errors.OutOfRangeError):
                sess.run(get_next)
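The loop bounds in the batch=3 pass above follow from the row counts: two copies of the 500-row file give 1,000 rows, i.e. 333 full batches (999 rows, hence range(0, 999, 3)) plus a final batch of one row, which the test asserts separately:

    # Two files of 500 rows each, batch size 3:
    full_batches, remainder = divmod(2 * 500, 3)
    print(full_batches, remainder)  # 333 1 -> range(0, 999, 3), then one batch of 1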