Example #1
    def testReadTruncatedFile_preservesReadOffset(self):
        """Verify that tf_record_iterator throws an exception on bad TFRecords.

    When a truncated record is completed, the iterator should return that new
    record on the next attempt at iteration, preserving the read offset. This
    behavior is required by TensorBoard.
    """
        # Write out a record and read it back it to get the raw bytes.
        fn = os.path.join(self.get_temp_dir(), "temp_file")
        with tf_record.TFRecordWriter(fn) as writer:
            writer.write(b"truncated")
        with open(fn, "rb") as f:
            record_bytes = f.read()
        # Start the file with a good record.
        fn_truncated = os.path.join(self.get_temp_dir(), "truncated_file")
        with tf_record.TFRecordWriter(fn_truncated) as writer:
            writer.write(b"good")
        with open(fn_truncated, "ab", buffering=0) as f:
            # Cause truncation by omitting the last byte from the record.
            f.write(record_bytes[:-1])
            iterator = tf_record.tf_record_iterator(fn_truncated)
            # Good record appears first.
            self.assertEqual(b"good", next(iterator))
            # Truncated record repeatedly causes DataLossError upon iteration.
            with self.assertRaises(errors_impl.DataLossError):
                next(iterator)
            with self.assertRaises(errors_impl.DataLossError):
                next(iterator)
            # Retrying after completing the record successfully returns the rest of
            # the file contents, preserving the prior read offset.
            f.write(record_bytes[-1:])
            self.assertEqual(b"truncated", next(iterator))
            with self.assertRaises(StopIteration):
                next(iterator)
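As a reference for the behavior exercised above, a minimal write-then-read round trip with the same internal module looks roughly like this (a sketch, assuming the tensorflow.python.lib.io.tf_record import used throughout these examples):

import os
import tempfile

from tensorflow.python.lib.io import tf_record

# Write two records, then read them back in write order.
path = os.path.join(tempfile.mkdtemp(), "roundtrip.tfrecord")
with tf_record.TFRecordWriter(path) as writer:
    writer.write(b"first")
    writer.write(b"second")

# tf_record_iterator yields the raw record payloads as bytes.
assert list(tf_record.tf_record_iterator(path)) == [b"first", b"second"]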
Example #2
    def testInputFn(self):
        with tempfile.NamedTemporaryFile() as records_file:
            with tf_record.TFRecordWriter(records_file.name) as records_writer:
                example = tf.train.Example()
                height = 5
                width = 3
                example.features.feature['height'].int64_list.value.append(
                    height)
                example.features.feature['width'].int64_list.value.append(
                    width)
                example.features.feature['patch'].float_list.value.extend(
                    range(height * width))
                label = 1
                example.features.feature['label'].int64_list.value.append(
                    label)
                for _ in range(3):
                    records_writer.write(example.SerializeToString())

            flags.FLAGS.input_patches = records_file.name
            batch_tensors = glyph_patches.input_fn()

            with self.test_session() as sess:
                batch = sess.run(batch_tensors)

                self.assertAllEqual(
                    batch[0]['patch'],
                    np.arange(height * width).reshape(
                        (1, height, width)).repeat(3, axis=0))
                self.assertAllEqual(batch[1], [label, label, label])
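For comparison, the same Example proto can be assembled in a single expression with the tf.train.Features/Feature constructors rather than by mutating example.features.feature in place; a rough, equivalent sketch (not taken from the original test):

import tensorflow as tf

height, width, label = 5, 3, 1
example = tf.train.Example(features=tf.train.Features(feature={
    'height': tf.train.Feature(int64_list=tf.train.Int64List(value=[height])),
    'width': tf.train.Feature(int64_list=tf.train.Int64List(value=[width])),
    'patch': tf.train.Feature(float_list=tf.train.FloatList(
        value=[float(v) for v in range(height * width)])),
    'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
}))
serialized = example.SerializeToString()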
Example #3
    def testReadGrowingFile_preservesReadOffset(self):
        """Verify that tf_record_iterator preserves read offset even after EOF.

    When a file is iterated to EOF, the iterator should raise StopIteration but
    not actually close the reader. Then if later new data is appended, the
    iterator should start returning that new data on the next call to next(),
    preserving the read offset. This behavior is required by TensorBoard.
    """
        # Start the file with a good record.
        fn = os.path.join(self.get_temp_dir(), "file.tfrecord")
        with tf_record.TFRecordWriter(fn) as writer:
            writer.write(b"one")
            writer.write(b"two")
            writer.flush()
            iterator = tf_record.tf_record_iterator(fn)
            self.assertEqual(b"one", next(iterator))
            self.assertEqual(b"two", next(iterator))
            # Iterating at EOF results in StopIteration repeatedly.
            with self.assertRaises(StopIteration):
                next(iterator)
            with self.assertRaises(StopIteration):
                next(iterator)
            # Retrying after adding a new record successfully returns the new record,
            # preserving the prior read offset.
            writer.write(b"three")
            writer.flush()
            self.assertEqual(b"three", next(iterator))
            with self.assertRaises(StopIteration):
                next(iterator)
Example #4
 def setUp(self, compression_type=TFRecordCompressionType.NONE):
     super(TFRecordWriterCloseAndFlushTests, self).setUp()
     self._fn = os.path.join(self.get_temp_dir(),
                             "tf_record_writer_test.txt")
     self._options = tf_record.TFRecordOptions(compression_type)
     self._writer = tf_record.TFRecordWriter(self._fn, self._options)
     self._num_records = 20
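The compression_type argument above is passed straight to TFRecordOptions; a small sketch (again assuming the internal tf_record module) of constructing writers for the three standard compression settings:

import os
import tempfile

from tensorflow.python.lib.io import tf_record

tmp_dir = tempfile.mkdtemp()
compression_types = {
    "none": tf_record.TFRecordCompressionType.NONE,
    "zlib": tf_record.TFRecordCompressionType.ZLIB,
    "gzip": tf_record.TFRecordCompressionType.GZIP,
}
for name, compression_type in compression_types.items():
    options = tf_record.TFRecordOptions(compression_type)
    path = os.path.join(tmp_dir, "records_%s.tfrecord" % name)
    # The writer applies the compression configured in options.
    with tf_record.TFRecordWriter(path, options) as writer:
        writer.write(b"payload")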
Example #5
def create_tfrecord_files(output_dir, num_files=3, num_records_per_file=10):
    """Creates TFRecords files.

  The method must be called within an active session.

  Args:
    output_dir: The directory where the files are stored.
    num_files: The number of files to create.
    num_records_per_file: The number of records per file.

  Returns:
    A list of the paths to the TFRecord files.
  """
    tfrecord_paths = []
    for i in range(num_files):
        path = os.path.join(output_dir,
                            'flowers.tfrecord-%d-of-%s' % (i, num_files))
        tfrecord_paths.append(path)

        writer = tf_record.TFRecordWriter(path)
        for _ in range(num_records_per_file):
            _, example = generate_image(image_shape=(10, 10, 3))
            writer.write(example)
        writer.close()

    return tfrecord_paths
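A quick sanity check on the generated shards is to count the records back with tf_record_iterator; a minimal sketch, assuming an active session (as the docstring requires) and a writable output_dir:

from tensorflow.python.lib.io import tf_record

tfrecord_paths = create_tfrecord_files(output_dir, num_files=3,
                                       num_records_per_file=10)
for path in tfrecord_paths:
    # Each shard should contain exactly num_records_per_file records.
    assert sum(1 for _ in tf_record.tf_record_iterator(path)) == 10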
Example #6
def save_rows_to_tf_record_file(rows,
                                make_sequence_example_fn,
                                sessions_df_length,
                                export_filename,
                                content_article_embeddings=None,
                                num_of_articles_in_sub_group=None):
    tf_record_options = tf_record.TFRecordOptions(
        tf_record.TFRecordCompressionType.GZIP)

    tf_writer = tf_record.TFRecordWriter(export_filename,
                                         options=tf_record_options)
    try:
        counter = 1
        for row in rows:
            start = time.time()
            print(f"{counter}/{sessions_df_length}")

            seq_example = make_sequence_example_fn(
                row, num_of_articles_in_sub_group, content_article_embeddings)

            end = time.time()
            print(end - start)
            counter += 1
            tf_writer.write(seq_example.SerializeToString())
    finally:
        tf_writer.close()
        sys.stdout.flush()
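Because the file above is written with GZIP compression, it must be read back with matching options; a minimal sketch (assuming the same tf_record module, an export_filename produced by the function above, and TensorFlow's tf.train.SequenceExample proto):

import tensorflow as tf
from tensorflow.python.lib.io import tf_record

read_options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP)
for serialized in tf_record.tf_record_iterator(export_filename,
                                               options=read_options):
    # Each record was written as a serialized SequenceExample.
    seq_example = tf.train.SequenceExample.FromString(serialized)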
Example #7
 def _WriteRecordsToFile(self, records, name="tf_record"):
     fn = os.path.join(self.get_temp_dir(), name)
     writer = tf_record.TFRecordWriter(fn, options=None)
     for r in records:
         writer.write(r)
     writer.close()
     del writer
     return fn
Example #8
  def generateTestData(self, prefix, n, m):
    for i in range(n):
      f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
      w = tf_record.TFRecordWriter(f)

      for j in range(m):
        w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))

      w.close()
Example #9
 def _CreateFiles(self):
     filenames = []
     for i in range(self._num_files):
         fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
         filenames.append(fn)
         writer = tf_record.TFRecordWriter(fn)
         for j in range(self._num_records):
             writer.write(self._Record(i, j))
     return filenames
Example #10
 def _make_test_tfrecord(self):
   f = tempfile.NamedTemporaryFile(dir=self.get_temp_dir(), delete=False)
   w = tf_record.TFRecordWriter(f.name)
   for i in range(100):
     ex = example_pb2.Example()
     ex.features.feature["var_len_int"].int64_list.value.extend(range((i % 3)))
     ex.features.feature["fixed_len_float"].float_list.value.extend(
         [float(i), 2 * float(i)])
     w.write(ex.SerializeToString())
   return f.name
Example #11
  def generateTestData(self, prefix, n, m,
      compression_type=tf_record.TFRecordCompressionType.NONE):
    options = tf_record.TFRecordOptions(compression_type)
    for i in range(n):
      f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
      w = tf_record.TFRecordWriter(f, options=options)

      for j in range(m):
        w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))

      w.close()
Example #12
def _make_tfexample_series(num_features, num_samples, test_tmpdir):
  _, data_file = tempfile.mkstemp(dir=test_tmpdir)
  with tf_record.TFRecordWriter(data_file) as writer:
    for i in range(num_samples):
      example = example_pb2.Example()
      times = example.features.feature[TrainEvalFeatures.TIMES]
      times.int64_list.value.append(i)
      values = example.features.feature[TrainEvalFeatures.VALUES]
      values.float_list.value.extend(
          [float(i) * 2. + feature_number
           for feature_number in range(num_features)])
      writer.write(example.SerializeToString())
  return data_file
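To inspect the file contents, each serialized record can be parsed back into an Example proto in plain Python; a small sketch assuming the example_pb2 and TrainEvalFeatures imports used in the snippet above and a data_file returned by _make_tfexample_series:

from tensorflow.python.lib.io import tf_record

for i, serialized in enumerate(tf_record.tf_record_iterator(data_file)):
    example = example_pb2.Example.FromString(serialized)
    # The TIMES feature holds the sample index written above.
    assert example.features.feature[TrainEvalFeatures.TIMES].int64_list.value[0] == i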
Example #13
 def _WriteCompressedRecordsToFile(
         self,
         records,
         name="tfrecord.z",
         compression_type=tf_record.TFRecordCompressionType.ZLIB):
     fn = os.path.join(self.get_temp_dir(), name)
     options = tf_record.TFRecordOptions(compression_type=compression_type)
     writer = tf_record.TFRecordWriter(fn, options=options)
     for r in records:
         writer.write(r)
     writer.close()
     del writer
     return fn
Example #14
 def testKmeans(self):
     num_features = FLAGS.patch_height * FLAGS.patch_width
     dummy_data = np.random.random((500, num_features))
     with tempfile.NamedTemporaryFile(mode='r') as patches_file:
         with tf_record.TFRecordWriter(patches_file.name) as patches_writer:
             for patch in dummy_data:
                 example = example_pb2.Example()
                 example.features.feature[
                     'features'].float_list.value.extend(patch)
                 patches_writer.write(example.SerializeToString())
         clusters = staffline_patches_kmeans_pipeline.train_kmeans(
             patches_file.name, NUM_CLUSTERS, BATCH_SIZE, TRAIN_STEPS)
         self.assertEqual(clusters.shape, (NUM_CLUSTERS, num_features))
Example #15
 def testBadFile(self):
     """Verify that tf_record_iterator throws an exception on bad TFRecords."""
     fn = os.path.join(self.get_temp_dir(), "bad_file")
     with tf_record.TFRecordWriter(fn) as writer:
         writer.write(b"123")
     fn_truncated = os.path.join(self.get_temp_dir(), "bad_file_truncated")
     with open(fn, "rb") as f:
         with open(fn_truncated, "wb") as f2:
             # DataLossError requires that we've written the header, so this must
             # be at least 12 bytes.
             f2.write(f.read(14))
     with self.assertRaises(errors_impl.DataLossError):
         for _ in tf_record.tf_record_iterator(fn_truncated):
             pass
Example #16
    def test_read_batched_sequence_example_dataset(self, sloppy_ordering):
        # Save protos in a TFRecord file in a temp folder.
        serialized_sequence_examples = [
            SEQ_EXAMPLE_PROTO_1.SerializeToString(),
            SEQ_EXAMPLE_PROTO_2.SerializeToString()
        ] * 100
        data_dir = test.get_temp_dir()
        data_file = os.path.join(data_dir, "test_sequence_example.tfrecord")
        if file_io.file_exists(data_file):
            file_io.delete_file(data_file)

        with tf_record.TFRecordWriter(data_file) as writer:
            for s in serialized_sequence_examples:
                writer.write(s)

        batched_dataset = data_lib.read_batched_sequence_example_dataset(
            file_pattern=data_file,
            batch_size=2,
            list_size=2,
            context_feature_spec=CONTEXT_FEATURE_SPEC,
            example_feature_spec=EXAMPLE_FEATURE_SPEC,
            reader=readers.TFRecordDataset,
            shuffle=False,
            sloppy_ordering=sloppy_ordering)

        features = batched_dataset.make_one_shot_iterator().get_next()
        self.assertAllEqual(sorted(features),
                            ["query_length", "unigrams", "utility"])
        # Check static shapes for dense tensors.
        self.assertAllEqual([2, 1],
                            features["query_length"].get_shape().as_list())
        self.assertAllEqual([2, 2, 1],
                            features["utility"].get_shape().as_list())

        with session.Session() as sess:
            sess.run(variables.local_variables_initializer())
            queue_runner.start_queue_runners()
            feature_map = sess.run(features)
            # Test dense_shape, indices and values for a SparseTensor.
            self.assertAllEqual(feature_map["unigrams"].dense_shape, [2, 2, 3])
            self.assertAllEqual(
                feature_map["unigrams"].indices,
                [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 1, 2], [1, 0, 0]])
            self.assertAllEqual(
                feature_map["unigrams"].values,
                [b"tensorflow", b"learning", b"to", b"rank", b"gbdt"])
            # Check values directly for dense tensors.
            self.assertAllEqual(feature_map["query_length"], [[3], [2]])
            self.assertAllEqual(feature_map["utility"],
                                [[[0.], [1.0]], [[0.], [0.]]])
Example #17
def save_rows_to_tf_record_file(df_rows, make_sequence_example_fn,
                                export_filename):
    tf_record_options = tf_record.TFRecordOptions(
        tf_record.TFRecordCompressionType.GZIP)

    tf_writer = tf_record.TFRecordWriter(export_filename,
                                         options=tf_record_options)
    try:
        for index, row in df_rows.iterrows():
            seq_example = make_sequence_example_fn(row)
            tf_writer.write(seq_example.SerializeToString())
    finally:
        tf_writer.close()
        sys.stdout.flush()
Example #18
    def _CreateFiles(self):
        filenames = []
        for i in range(self._num_files):
            fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
            filenames.append(fn)
            options = tf_record.TFRecordOptions(
                compression_type=TFRecordCompressionType.ZLIB)
            writer = tf_record.TFRecordWriter(fn, options=options)
            for j in range(self._num_records):
                writer.write(self._Record(i, j))
            writer.close()
            del writer

        return filenames
Example #19
    def do_POST(self):
        post_vars = cgi.parse_qs(
            self.rfile.read(int(self.headers.getheader('content-length'))))
        labels = [
            post_vars['cluster%d' % i][0]
            for i in moves.xrange(self.clusters.shape[0])
        ]
        examples = create_examples(self.clusters, labels)

        with tf_record.TFRecordWriter(self.output_path) as writer:
            for example in examples:
                writer.write(example.SerializeToString())
        self.send_response(http_client.OK)
        self.end_headers()
        self.wfile.write('Success')  # printed in the labeler alert
Example #20
    def testAsFunctionFromReader(self):
        with ops.device("CPU"):
            file_path = os.path.join(
                self.get_temp_dir(),
                "{}.tfrecord.gz".format("tf_record_asset"))
            with tf_record.TFRecordWriter(file_path, "GZIP") as f:
                for v in ["a", "aa", "aaa"]:
                    f.write(str(v))
            original_dataset = readers.TFRecordDataset([file_path],
                                                       compression_type="GZIP")
            fn = original_dataset._trace_variant_creation()
            variant = fn()

            revived_dataset = dataset_ops._VariantDataset(
                variant, original_dataset.element_spec)
            self.assertDatasetProduces(revived_dataset, ["a", "aa", "aaa"])
Example #21
def main(_):
    tf.logging.info('Building the pipeline...')
    records_dir = tempfile.mkdtemp(prefix='staffline_kmeans')
    try:
        patch_file_prefix = os.path.join(records_dir, 'patches')
        with pipeline_flags.create_pipeline() as pipeline:
            filenames = file_io.get_matching_files(FLAGS.music_pattern)
            assert filenames, 'Must have matched some filenames'
            if 0 < FLAGS.num_pages < len(filenames):
                filenames = random.sample(filenames, FLAGS.num_pages)
            filenames = pipeline | beam.transforms.Create(filenames)
            patches = filenames | beam.ParDo(
                staffline_patches_dofn.StafflinePatchesDoFn(
                    patch_height=FLAGS.patch_height,
                    patch_width=FLAGS.patch_width,
                    num_stafflines=FLAGS.num_stafflines,
                    timeout_ms=FLAGS.timeout_ms,
                    max_patches_per_page=FLAGS.max_patches_per_page))
            if FLAGS.num_outputs:
                patches |= combiners.Sample.FixedSizeGlobally(
                    FLAGS.num_outputs)
            patches |= beam.io.WriteToTFRecord(
                patch_file_prefix, beam.coders.ProtoCoder(tf.train.Example))
            tf.logging.info('Running the pipeline...')
        tf.logging.info('Running k-means...')
        patch_files = file_io.get_matching_files(patch_file_prefix + '*')
        clusters = train_kmeans(patch_files, FLAGS.kmeans_num_clusters,
                                FLAGS.kmeans_batch_size,
                                FLAGS.kmeans_num_steps)
        tf.logging.info('Writing the centroids...')
        with tf_record.TFRecordWriter(FLAGS.output_path) as writer:
            for cluster in clusters:
                example = tf.train.Example()
                example.features.feature['features'].float_list.value.extend(
                    cluster)
                example.features.feature['height'].int64_list.value.append(
                    FLAGS.patch_height)
                example.features.feature['width'].int64_list.value.append(
                    FLAGS.patch_width)
                writer.write(example.SerializeToString())
        tf.logging.info('Done!')
    finally:
        shutil.rmtree(records_dir)
Example #22
 def _WriteRecordsToFile(self, records, name="tfrecord", options=None):
     fn = os.path.join(self.get_temp_dir(), name)
     with tf_record.TFRecordWriter(fn, options=options) as writer:
         for r in records:
             writer.write(r)
     return fn
Example #23
 def write_records_to_file(filename, records):
     writer = tf_record.TFRecordWriter(filename)
     for record in records:
         writer.write(record)
     writer.close()
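As a closing usage note, files produced by any of the writers above can also be consumed through tf.data; a minimal sketch assuming TF 2.x eager execution and that write_records_to_file is available as a module-level helper:

import tensorflow as tf

write_records_to_file("/tmp/demo.tfrecord", [b"a", b"b", b"c"])

# TFRecordDataset yields one scalar string tensor per record.
dataset = tf.data.TFRecordDataset("/tmp/demo.tfrecord")
assert [t.numpy() for t in dataset] == [b"a", b"b", b"c"]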