Example #1
0
    def testCompressionOptions(self):
        """Build records mixing random and repetitive data, then check that
        tweaking each TFRecordOptions knob moves the compressed size in the
        expected direction."""
        rng = random.Random(123)
        noise = compat.as_bytes(
            "".join(rng.choice(string.digits) for _ in range(10000)))
        repetitive = compat.as_bytes(_TEXT)
        for _ in range(10000):
            begin = rng.randint(0, len(_TEXT))
            span = rng.randint(10, 200)
            repetitive += _TEXT[begin:begin + span]
        records = [noise, repetitive, noise]

        # (option name, option value, expected sign of the size delta)
        cases = [
            ("compression_level", 2, -1),  # Lower compression is worse.
            ("compression_level", 6, 0),  # Default compression_level is equal.
            ("flush_mode", zlib.Z_FULL_FLUSH, 1),  # A few less bytes.
            ("flush_mode", zlib.Z_NO_FLUSH, 0),  # NO_FLUSH is the default.
            ("input_buffer_size", 4096, 0),  # Increases time not size.
            ("output_buffer_size", 4096, 0),  # Increases time not size.
            ("window_bits", 8, -1),  # Smaller window increases size.
            ("compression_strategy", zlib.Z_HUFFMAN_ONLY, -1),  # Worse.
            ("compression_strategy", zlib.Z_FILTERED, -1),  # Worse.
        ]

        compression_type = tf_record.TFRecordCompressionType.ZLIB
        baseline = tf_record.TFRecordOptions(compression_type)
        for prop, value, delta_sign in cases:
            tweaked = tf_record.TFRecordOptions(
                compression_type=compression_type, **{prop: value})
            delta = self._CompressionSizeDelta(records, baseline, tweaked)
            self.assertTrue(
                delta == 0 if delta_sign == 0 else delta // delta_sign > 0,
                "Setting {} = {}, file was {} smaller didn't match sign of {}".
                format(prop, value, delta, delta_sign))
Example #2
0
    def testReadGzipFiles(self):
        """Gzip-compress written record files by hand, then read them back
        through a GZIP-configured TFRecordReader."""
        gzip_files = []
        for idx, path in enumerate(self._CreateFiles()):
            with open(path, "rb") as src:
                raw = src.read()
            gz_path = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % idx)
            with gzip.GzipFile(gz_path, "wb") as dst:
                dst.write(raw)
            gzip_files.append(gz_path)

        with self.test_session() as sess:
            options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
            reader = io_ops.TFRecordReader(name="test_reader", options=options)
            queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
            key, value = reader.read(queue)

            queue.enqueue_many([gzip_files]).run()
            queue.close().run()
            for i in range(self._num_files):
                for j in range(self._num_records):
                    k, v = sess.run([key, value])
                    self.assertTrue(
                        compat.as_text(k).startswith("%s:" % gzip_files[i]))
                    self.assertAllEqual(self._Record(i, j), v)
Example #3
0
 def setUp(self, compression_type=TFRecordCompressionType.NONE):
     """Prepare a TFRecordWriter on a temp file with the given compression."""
     super(TFRecordWriterCloseAndFlushTests, self).setUp()
     self._num_records = 20
     self._fn = os.path.join(self.get_temp_dir(), "tf_record_writer_test.txt")
     self._options = tf_record.TFRecordOptions(compression_type)
     self._writer = tf_record.TFRecordWriter(self._fn, self._options)
Example #4
0
    def testZLibFlushRecord(self):
        """A zlib stream with a flush point after every byte and extra
        trailing blocks must still be readable by TFRecordReader."""
        fn = self._WriteRecordsToFile([b"small record"], "small_record")
        with open(fn, "rb") as h:
            raw = h.read()

        # Re-compress one byte at a time with a full flush after each byte,
        # plus redundant trailing flushes, to create many tiny zlib blocks.
        compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS)
        pieces = []
        for byte in raw:
            if isinstance(byte, int):
                byte = six.int2byte(byte)
            pieces.append(compressor.compress(byte))
            pieces.append(compressor.flush(zlib.Z_FULL_FLUSH))
        pieces.append(compressor.flush(zlib.Z_FULL_FLUSH))
        pieces.append(compressor.flush(zlib.Z_FULL_FLUSH))
        pieces.append(compressor.flush(zlib.Z_FINISH))
        output = b"".join(pieces)

        # Replace the original file contents with the re-compressed stream.
        with open(fn, "wb") as h:
            h.write(output)

        with self.test_session() as sess:
            options = tf_record.TFRecordOptions(
                compression_type=TFRecordCompressionType.ZLIB)
            reader = io_ops.TFRecordReader(name="test_reader", options=options)
            queue = data_flow_ops.FIFOQueue(1, [dtypes.string], shapes=())
            key, value = reader.read(queue)
            queue.enqueue(fn).run()
            queue.close().run()
            k, v = sess.run([key, value])
            self.assertTrue(compat.as_text(k).startswith("%s:" % fn))
            self.assertAllEqual(b"small record", v)
Example #5
0
def save_rows_to_tf_record_file(rows,
                                make_sequence_example_fn,
                                sessions_df_length,
                                export_filename,
                                content_article_embeddings=None,
                                num_of_articles_in_sub_group=None):
    """Serialize each row as a SequenceExample into a GZIP TFRecord file.

    Prints a "<row>/<total>" progress line and the per-row build time as it
    goes; the writer is always closed, even if serialization fails.
    """
    options = tf_record.TFRecordOptions(
        tf_record.TFRecordCompressionType.GZIP)
    writer = tf_record.TFRecordWriter(export_filename, options=options)
    try:
        for idx, row in enumerate(rows, start=1):
            start = time.time()
            print(f"{idx}/{sessions_df_length}")

            seq_example = make_sequence_example_fn(
                row, num_of_articles_in_sub_group, content_article_embeddings)

            end = time.time()
            print(end - start)
            writer.write(seq_example.SerializeToString())
    finally:
        writer.close()
        sys.stdout.flush()
Example #6
0
  def testNoCompressionType(self):
    """Default and empty-string options both map to no compression;
    unrecognized values are rejected with ValueError."""
    for opts in (tf_record.TFRecordOptions(), tf_record.TFRecordOptions("")):
      self.assertEqual(
          "",
          tf_record.TFRecordOptions.get_compression_type_string(opts))

    for bad_value in (5, "BZ2"):
      with self.assertRaises(ValueError):
        tf_record.TFRecordOptions(bad_value)
Example #7
0
    def testZLibFlushRecord(self):
        """Extra Z_FULL_FLUSH points and trailing empty blocks must not
        break tf_record_iterator."""
        original = [b"small record"]
        fn = self._WriteRecordsToFile(original, "small_record")
        with open(fn, "rb") as h:
            buff = h.read()

        # Recompress byte-by-byte, forcing a full flush after every byte and
        # appending redundant trailing flushes: many tiny zlib blocks.
        compressor = zlib.compressobj(9, zlib.DEFLATED, zlib.MAX_WBITS)
        chunks = []
        for ch in buff:
            if isinstance(ch, int):
                ch = six.int2byte(ch)
            chunks.append(compressor.compress(ch))
            chunks.append(compressor.flush(zlib.Z_FULL_FLUSH))
        chunks.append(compressor.flush(zlib.Z_FULL_FLUSH))
        chunks.append(compressor.flush(zlib.Z_FULL_FLUSH))
        chunks.append(compressor.flush(zlib.Z_FINISH))

        # Overwrite the original file with the recompressed stream.
        with open(fn, "wb") as h:
            h.write(b"".join(chunks))

        options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
        actual = list(tf_record.tf_record_iterator(fn, options=options))
        self.assertEqual(actual, original)
Example #8
0
  def testZlibCompressionType(self):
    """String, enum, and nested options-object inputs all normalize to the
    compression string "ZLIB"."""
    zlib_t = tf_record.TFRecordCompressionType.ZLIB
    as_string = tf_record.TFRecordOptions.get_compression_type_string

    self.assertEqual("ZLIB", as_string(tf_record.TFRecordOptions("ZLIB")))
    self.assertEqual("ZLIB", as_string(tf_record.TFRecordOptions(zlib_t)))
    self.assertEqual(
        "ZLIB",
        as_string(
            tf_record.TFRecordOptions(tf_record.TFRecordOptions(zlib_t))))
Example #9
0
    def testGzipReadWrite(self):
        """Verify that files produced are gzip compatible."""
        expected = [b"foo", b"bar"]
        plain_fn = self._WriteRecordsToFile(expected,
                                            "gzip_read_write.tfrecord")
        gz_fn = self._GzipCompressFile(plain_fn, "tfrecord.gz")

        options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
        actual = list(tf_record.tf_record_iterator(gz_fn, options=options))
        self.assertEqual(actual, expected)
Example #10
0
 def testWriteGZIP(self):
     """Records written with compression_type="GZIP" round-trip through a
     GZIP-configured record iterator."""
     options = tf_record.TFRecordOptions(
         tf_record.TFRecordCompressionType.GZIP)
     self.evaluate(
         self.writer_fn(self._createFile(options), compression_type="GZIP"))
     read_back = tf_record.tf_record_iterator(self._outputFilename(),
                                              options=options)
     for i, record in enumerate(read_back):
         self.assertAllEqual(self._record(i), record)
Example #11
0
    def testWriteGzipRead(self):
        """GZIP-written records remain readable after manual gunzip."""
        expected = [b"foo", b"bar"]
        gzip_options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
        compressed_fn = self._WriteRecordsToFile(
            expected, "write_gzip_read.tfrecord.gz", gzip_options)

        plain_fn = self._GzipDecompressFile(compressed_fn,
                                            "write_gzip_read.tfrecord")
        actual = list(tf_record.tf_record_iterator(plain_fn))
        self.assertEqual(actual, expected)
Example #12
0
    def testZlibReadWrite(self):
        """Verify that files produced are zlib compatible."""
        expected = [b"foo", b"bar"]
        plain_fn = self._WriteRecordsToFile(expected,
                                            "zlib_read_write.tfrecord")
        zlib_fn = self._ZlibCompressFile(plain_fn,
                                         "zlib_read_write.tfrecord.z")

        # Read the compressed contents back and verify.
        options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
        actual = list(tf_record.tf_record_iterator(zlib_fn, options=options))
        self.assertEqual(actual, expected)
Example #13
0
    def testWriteZlibRead(self):
        """Verify compression with TFRecordWriter is zlib library compatible."""
        expected = [b"foo", b"bar"]
        zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
        compressed_fn = self._WriteRecordsToFile(
            expected, "write_zlib_read.tfrecord.z", zlib_options)

        plain_fn = self._ZlibDecompressFile(compressed_fn,
                                            "write_zlib_read.tfrecord")
        actual = list(tf_record.tf_record_iterator(plain_fn))
        self.assertEqual(actual, expected)
 def testWriteZlibReadLarge(self):
     """Verify compression for large records is zlib library compatible."""
     # About 5MB of payload in a single record.
     expected = [_TEXT * 10240]
     options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
     compressed_fn = self._WriteRecordsToFile(
         expected, "write_zlib_read_large.tfrecord.z", options)
     plain_fn = self._ZlibDecompressFile(compressed_fn,
                                         "write_zlib_read_large.tfrecord")
     self.assertEqual(list(tf_record.tf_record_iterator(plain_fn)), expected)
  def testZlibReadWriteLarge(self):
    """Verify that writing large contents also works."""
    # About 5MB of payload in a single record.
    expected = [_TEXT * 10240]
    plain_fn = self._WriteRecordsToFile(expected,
                                        "zlib_read_write_large.tfrecord")
    zlib_fn = self._ZlibCompressFile(plain_fn,
                                     "zlib_read_write_large.tfrecord.z")

    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
    self.assertEqual(
        list(tf_record.tf_record_iterator(zlib_fn, options=options)), expected)
Example #16
0
  def generateTestData(self, prefix, n, m,
      compression_type=tf_record.TFRecordCompressionType.NONE):
    """Write n record files of m fixed-width numbered records each.

    Files are named "<prefix>.<i>" in the temp dir; record j of file i holds
    the 10-digit zero-padded decimal value i * m + j, UTF-8 encoded.

    Bug fix: the writer is now closed inside the per-file loop. Previously
    `w.close()` sat outside the loop, so only the last file's writer was
    closed and earlier files could be left unflushed.
    """
    options = tf_record.TFRecordOptions(compression_type)
    for i in range(n):
      f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
      w = tf_record.TFRecordWriter(f, options=options)
      try:
        for j in range(m):
          w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))
      finally:
        w.close()
    def testIterator(self):
        """tf_record_iterator yields every record, then raises StopIteration."""
        expected = [self._Record(0, i) for i in range(self._num_records)]
        options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
        fn = self._WriteRecordsToFile(expected, "compressed_records", options)

        it = tf_record.tf_record_iterator(fn, options)
        for want in expected:
            self.assertAllEqual(want, next(it))
        with self.assertRaises(StopIteration):
            next(it)
Example #18
0
 def testIterator(self):
     """The ZLIB iterator yields each record in order, then is exhausted."""
     fn = self._WriteCompressedRecordsToFile(
         [self._Record(i) for i in range(self._num_records)],
         "compressed_records")
     options = tf_record.TFRecordOptions(
         compression_type=TFRecordCompressionType.ZLIB)
     it = tf_record.tf_record_iterator(fn, options)
     for i in range(self._num_records):
         self.assertAllEqual(self._Record(i), next(it))
     with self.assertRaises(StopIteration):
         next(it)
    def testWriteReadZLibFiles(self):
        """Files zlib-compressed by hand and by TFRecordWriter must agree."""
        # Write uncompressed, then zlib-compress manually.
        plain_options = tf_record.TFRecordOptions(TFRecordCompressionType.NONE)
        plain_files = self._CreateFiles(plain_options, prefix="uncompressed")
        zlib_files = [
            self._ZlibCompressFile(fn, "tfrecord_%s.z" % i)
            for i, fn in enumerate(plain_files)
        ]
        self._AssertFilesEqual(plain_files, zlib_files, False)

        # Now write compressed directly and verify it matches.
        zlib_options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
        compressed_files = self._CreateFiles(zlib_options, prefix="compressed")
        self._AssertFilesEqual(compressed_files, zlib_files, True)

        # Decompress the writer's output and verify the round trip.
        round_tripped = [
            self._ZlibDecompressFile(fn, "tfrecord_%s.z" % i)
            for i, fn in enumerate(compressed_files)
        ]
        self._AssertFilesEqual(round_tripped, plain_files, True)
 def testWriteGZIP(self):
   """Feeding compression_type "GZIP" produces gzip-readable output."""
   options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
   with self.cached_session() as sess:
     sess.run(
         self.writer,
         feed_dict={
             self.filename: self._createFile(options),
             self.compression_type: "GZIP",
         })
   read_back = tf_record.tf_record_iterator(self._outputFilename(),
                                            options=options)
   for i, record in enumerate(read_back):
     self.assertAllEqual(self._record(i), record)
Example #21
0
 def _WriteCompressedRecordsToFile(
         self,
         records,
         name="tfrecord.z",
         compression_type=tf_record.TFRecordCompressionType.ZLIB):
     """Write `records` to a compressed TFRecord file in the temp dir.

     Returns the path of the file written.
     """
     path = os.path.join(self.get_temp_dir(), name)
     options = tf_record.TFRecordOptions(compression_type=compression_type)
     writer = tf_record.TFRecordWriter(path, options=options)
     for record in records:
         writer.write(record)
     writer.close()
     del writer
     return path
def save_rows_to_tf_record_file(df_rows, make_sequence_example_fn,
                                export_filename):
    """Serialize each DataFrame row as a SequenceExample into a GZIP
    TFRecord file; the writer is closed even if serialization fails."""
    options = tf_record.TFRecordOptions(
        tf_record.TFRecordCompressionType.GZIP)
    writer = tf_record.TFRecordWriter(export_filename, options=options)
    try:
        for _, row in df_rows.iterrows():
            seq_example = make_sequence_example_fn(row)
            writer.write(seq_example.SerializeToString())
    finally:
        writer.close()
        sys.stdout.flush()
    def testWriteReadGzipFiles(self):
        """Files gzipped by hand and files written as GZIP must round-trip."""
        # Write uncompressed, then gzip-compress manually.
        plain_options = tf_record.TFRecordOptions(TFRecordCompressionType.NONE)
        plain_files = self._CreateFiles(plain_options, prefix="uncompressed")
        gzip_files = [
            self._GzipCompressFile(fn, "tfrecord_%s.gz" % i)
            for i, fn in enumerate(plain_files)
        ]
        self._AssertFilesEqual(plain_files, gzip_files, False)

        # Now write compressed directly.
        gzip_options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
        compressed_files = self._CreateFiles(gzip_options, prefix="compressed")

        # Note: Gzips written by TFRecordWriter add 'tfrecord_0' so
        # compressed_files can't be compared with gzip_files

        # Decompress the writer's output and verify the round trip.
        round_tripped = [
            self._GzipDecompressFile(fn, "tfrecord_%s.gz" % i)
            for i, fn in enumerate(compressed_files)
        ]
        self._AssertFilesEqual(round_tripped, plain_files, True)
Example #24
0
    def _CreateFiles(self):
        """Write self._num_files ZLIB-compressed record files; return paths."""
        # Options are identical for every file, so build them once.
        options = tf_record.TFRecordOptions(
            compression_type=TFRecordCompressionType.ZLIB)
        paths = []
        for i in range(self._num_files):
            path = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
            paths.append(path)
            writer = tf_record.TFRecordWriter(path, options=options)
            for j in range(self._num_records):
                writer.write(self._Record(i, j))
            writer.close()
            del writer

        return paths
Example #25
0
    def testReadGzipFiles(self):
        """A GZIP TFRecordReader yields every record of every GZIP file."""
        options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
        files = self._CreateFiles(options)

        reader = io_ops.TFRecordReader(name="test_reader", options=options)
        queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
        key, value = reader.read(queue)

        self.evaluate(queue.enqueue_many([files]))
        self.evaluate(queue.close())
        for i in range(self._num_files):
            for j in range(self._num_records):
                key_val, value_val = self.evaluate([key, value])
                self.assertTrue(
                    compat.as_text(key_val).startswith("%s:" % files[i]))
                self.assertAllEqual(self._Record(i, j), value_val)
Example #26
0
  def testGzipReadWrite(self):
    """Verify that files produced are gzip compatible."""
    expected = [b"foo", b"bar"]
    plain_fn = self._WriteRecordsToFile(expected, "gzip_read_write.tfrecord")

    # Gzip-compress the file's bytes into a sibling .gz file.
    with open(plain_fn, "rb") as src:
      raw = src.read()
    gz_fn = os.path.join(self.get_temp_dir(), "tf_record.gz")
    with gzip.GzipFile(gz_fn, "wb") as dst:
      dst.write(raw)

    options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
    actual = list(tf_record.tf_record_iterator(gz_fn, options=options))
    self.assertEqual(actual, expected)
Example #27
0
  def testOneEpoch(self):
    """All records are read exactly once; a further read fails because the
    queue is closed and empty."""
    files = self._CreateFiles()
    with self.test_session() as sess:
      options = tf_record.TFRecordOptions(
          compression_type=TFRecordCompressionType.ZLIB)
      reader = io_ops.TFRecordReader(name="test_reader", options=options)
      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
      key, value = reader.read(queue)

      queue.enqueue_many([files]).run()
      queue.close().run()
      for i in range(self._num_files):
        for j in range(self._num_records):
          k, v = sess.run([key, value])
          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
          self.assertAllEqual(self._Record(i, j), v)

      # One epoch consumed everything: the next read must fail.
      with self.assertRaisesOpError("is closed and has insufficient elements "
                                    "\\(requested 1, current size 0\\)"):
        sess.run([key, value])
Example #28
0
def main(unused_argv):
    """Write three ZLIB-compressed Examples to /tmp/test1.tfrecord.

    Each Example carries an int64 "feature_0" and a bytes "feature_1".

    Improvements over the original: the three nearly-identical Example
    constructions are folded into a loop; the BytesList values are bytes
    literals (Python 3's BytesList rejects str values with a TypeError);
    the writer is closed even if a write fails.
    """
    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
    writer = python_io.TFRecordWriter("/tmp/test1.tfrecord", options)
    try:
        for int_val, bytes_val in ((111, b"1111111111"),
                                   (222, b"2222222222"),
                                   (333, b"3333333333")):
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    "feature_0":
                    tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[int_val])),
                    "feature_1":
                    tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[bytes_val])),
                }))
            writer.write(example.SerializeToString())
    finally:
        writer.close()

    tf.compat.v1.logging.info('File /tmp/test1.tfrecord generated!')
Example #29
0
def input_pipeline(mode, batch_size=BATCH_SIZE, num_epochs=NUM_EPOCHS):
    """Build a shuffled (images, labels) batch pipeline from GZIP TFRecords.

    Args:
      mode: 'train' reads TRAIN_FILENAME with 'train/*' features; anything
        else reads VAL_FILENAME with 'val/*' features.
      batch_size: number of examples per output batch.
      num_epochs: epochs for the filename queue (one extra epoch is queued,
        preserving the original behavior).

    Returns:
      A (images, labels) pair of batched tensors; images are random
      224x224 crops with random horizontal flips, labels are one-hot.

    Bug fix: `batch_size` and `num_epochs` were accepted but silently
    ignored — the globals BATCH_SIZE / NUM_EPOCHS were used instead. The
    parameters are now honored; default behavior is unchanged.
    """
    with tf.name_scope('img_pipeline'):
        if mode == 'train':
            filenames = [TRAIN_FILENAME]
            image_feature = 'train/image'
            label_feature = 'train/label'
        else:
            filenames = [VAL_FILENAME]
            image_feature = 'val/image'
            label_feature = 'val/label'

        feature = {
            image_feature: tf.FixedLenFeature([], tf.string),
            label_feature: tf.FixedLenFeature([], tf.int64)
        }

        # Queue of input filenames; the +1 epoch is kept from the original.
        filename_queue = tf.train.string_input_producer(
            filenames, num_epochs=num_epochs + 1)
        # Reader for GZIP-compressed TFRecords.
        options = tf_record.TFRecordOptions(
            compression_type=tf_record.TFRecordCompressionType.GZIP)
        reader = tf.TFRecordReader(options=options)
        _, serialized_example = reader.read(filename_queue)
        # Parse the serialized Example.
        features = tf.parse_single_example(serialized_example,
                                           features=feature)
        # Raw image bytes back to uint8 pixel values.
        image = tf.decode_raw(features[image_feature], tf.uint8)

        # One-hot encode the label.
        label = tf.cast(features[label_feature], tf.int32)
        label = tf.one_hot(label, NUM_CLASSES)
        # Restore the stored image shape.
        image = tf.reshape(image, [256, 256, 3])

        # Preprocessing: random 224x224 crop and random horizontal flip.
        image = tf.random_crop(image, [224, 224, 3])
        image = tf.image.random_flip_left_right(image)

        # Shuffle-batch. capacity must exceed min_after_dequeue by enough
        # headroom for the prefetching threads:
        #   min_after_dequeue + (num_threads + safety margin) * batch_size
        min_after_dequeue = 100
        num_threads = 6
        capacity = min_after_dequeue + (num_threads + 2) * batch_size
        images, labels = tf.train.shuffle_batch(
            [image, label],
            batch_size=batch_size,
            capacity=capacity,
            num_threads=num_threads,
            min_after_dequeue=min_after_dequeue)

        return images, labels