def testReadTruncatedFile_preservesReadOffset(self):
    """Verify that tf_record_iterator preserves the read offset after DataLossError.

    When a truncated record is completed, the iterator should return that new
    record on the next attempt at iteration, preserving the read offset. This
    behavior is required by TensorBoard.
    """
    # Write out a record and read it back to get the raw bytes.
    fn = os.path.join(self.get_temp_dir(), "temp_file")
    with tf_record.TFRecordWriter(fn) as writer:
        writer.write(b"truncated")
    with open(fn, "rb") as f:
        record_bytes = f.read()

    # Start the file with a good record.
    fn_truncated = os.path.join(self.get_temp_dir(), "truncated_file")
    with tf_record.TFRecordWriter(fn_truncated) as writer:
        writer.write(b"good")
    with open(fn_truncated, "ab", buffering=0) as f:
        # Cause truncation by omitting the last byte from the record.
        f.write(record_bytes[:-1])
        iterator = tf_record.tf_record_iterator(fn_truncated)
        # Good record appears first.
        self.assertEqual(b"good", next(iterator))
        # Truncated record repeatedly causes DataLossError upon iteration.
        with self.assertRaises(errors_impl.DataLossError):
            next(iterator)
        with self.assertRaises(errors_impl.DataLossError):
            next(iterator)
        # Retrying after completing the record successfully returns the rest of
        # the file contents, preserving the prior read offset.
        f.write(record_bytes[-1:])
        self.assertEqual(b"truncated", next(iterator))
        with self.assertRaises(StopIteration):
            next(iterator)
def testInputFn(self):
    with tempfile.NamedTemporaryFile() as records_file:
        with tf_record.TFRecordWriter(records_file.name) as records_writer:
            example = tf.train.Example()
            height = 5
            width = 3
            example.features.feature['height'].int64_list.value.append(height)
            example.features.feature['width'].int64_list.value.append(width)
            example.features.feature['patch'].float_list.value.extend(
                range(height * width))
            label = 1
            example.features.feature['label'].int64_list.value.append(label)
            for _ in range(3):
                records_writer.write(example.SerializeToString())

        flags.FLAGS.input_patches = records_file.name
        batch_tensors = glyph_patches.input_fn()

        with self.test_session() as sess:
            batch = sess.run(batch_tensors)

        self.assertAllEqual(
            batch[0]['patch'],
            np.arange(height * width).reshape(
                (1, height, width)).repeat(3, axis=0))
        self.assertAllEqual(batch[1], [label, label, label])
def testReadGrowingFile_preservesReadOffset(self):
    """Verify that tf_record_iterator preserves read offset even after EOF.

    When a file is iterated to EOF, the iterator should raise StopIteration but
    not actually close the reader. Then if later new data is appended, the
    iterator should start returning that new data on the next call to next(),
    preserving the read offset. This behavior is required by TensorBoard.
    """
    # Start the file with a good record.
    fn = os.path.join(self.get_temp_dir(), "file.tfrecord")
    with tf_record.TFRecordWriter(fn) as writer:
        writer.write(b"one")
        writer.write(b"two")
        writer.flush()
        iterator = tf_record.tf_record_iterator(fn)
        self.assertEqual(b"one", next(iterator))
        self.assertEqual(b"two", next(iterator))

        # Iterating at EOF results in StopIteration repeatedly.
        with self.assertRaises(StopIteration):
            next(iterator)
        with self.assertRaises(StopIteration):
            next(iterator)

        # Retrying after adding a new record successfully returns the new
        # record, preserving the prior read offset.
        writer.write(b"three")
        writer.flush()
        self.assertEqual(b"three", next(iterator))
        with self.assertRaises(StopIteration):
            next(iterator)
def setUp(self, compression_type=TFRecordCompressionType.NONE):
    super(TFRecordWriterCloseAndFlushTests, self).setUp()
    self._fn = os.path.join(self.get_temp_dir(), "tf_record_writer_test.txt")
    self._options = tf_record.TFRecordOptions(compression_type)
    self._writer = tf_record.TFRecordWriter(self._fn, self._options)
    self._num_records = 20
def create_tfrecord_files(output_dir, num_files=3, num_records_per_file=10):
    """Creates TFRecord files.

    The method must be called within an active session.

    Args:
      output_dir: The directory where the files are stored.
      num_files: The number of files to create.
      num_records_per_file: The number of records per file.

    Returns:
      A list of the paths to the TFRecord files.
    """
    tfrecord_paths = []
    for i in range(num_files):
        path = os.path.join(output_dir,
                            'flowers.tfrecord-%d-of-%d' % (i, num_files))
        tfrecord_paths.append(path)

        writer = tf_record.TFRecordWriter(path)
        for _ in range(num_records_per_file):
            _, example = generate_image(image_shape=(10, 10, 3))
            writer.write(example)
        writer.close()

    return tfrecord_paths
def save_rows_to_tf_record_file(rows, make_sequence_example_fn,
                                sessions_df_length, export_filename,
                                content_article_embeddings=None,
                                num_of_articles_in_sub_group=None):
    tf_record_options = tf_record.TFRecordOptions(
        tf_record.TFRecordCompressionType.GZIP)
    tf_writer = tf_record.TFRecordWriter(export_filename,
                                         options=tf_record_options)
    try:
        counter = 1
        for row in rows:
            start = time.time()
            print(f"{counter}/{sessions_df_length}")
            seq_example = make_sequence_example_fn(
                row, num_of_articles_in_sub_group, content_article_embeddings)
            end = time.time()
            print(end - start)
            counter += 1
            tf_writer.write(seq_example.SerializeToString())
    finally:
        tf_writer.close()
        sys.stdout.flush()
def _WriteRecordsToFile(self, records, name="tf_record"): fn = os.path.join(self.get_temp_dir(), name) writer = tf_record.TFRecordWriter(fn, options=None) for r in records: writer.write(r) writer.close() del writer return fn
def generateTestData(self, prefix, n, m):
    for i in range(n):
        f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
        w = tf_record.TFRecordWriter(f)
        for j in range(m):
            w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))
        w.close()
def _CreateFiles(self):
    filenames = []
    for i in range(self._num_files):
        fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
        filenames.append(fn)
        writer = tf_record.TFRecordWriter(fn)
        for j in range(self._num_records):
            writer.write(self._Record(i, j))
    return filenames
def _make_test_tfrecord(self):
    f = tempfile.NamedTemporaryFile(dir=self.get_temp_dir(), delete=False)
    w = tf_record.TFRecordWriter(f.name)
    for i in range(100):
        ex = example_pb2.Example()
        ex.features.feature["var_len_int"].int64_list.value.extend(range(i % 3))
        ex.features.feature["fixed_len_float"].float_list.value.extend(
            [float(i), 2 * float(i)])
        w.write(ex.SerializeToString())
    return f.name
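# A hedged sketch, not part of the original test: a hypothetical parsing spec
# (TEST_FEATURE_SPEC) that would read back the records written by
# _make_test_tfrecord above. "var_len_int" carries 0-2 int64 values per
# example, while "fixed_len_float" always carries exactly two floats, so
# VarLenFeature and FixedLenFeature fit those columns.
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import parsing_ops

TEST_FEATURE_SPEC = {
    "var_len_int": parsing_ops.VarLenFeature(dtypes.int64),
    "fixed_len_float": parsing_ops.FixedLenFeature([2], dtypes.float32),
}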
def generateTestData(self, prefix, n, m,
                     compression_type=tf_record.TFRecordCompressionType.NONE):
    options = tf_record.TFRecordOptions(compression_type)
    for i in range(n):
        f = os.path.join(self.get_temp_dir(), prefix + "." + str(i))
        w = tf_record.TFRecordWriter(f, options=options)
        for j in range(m):
            w.write("{0:0{width}}".format(i * m + j, width=10).encode("utf-8"))
        w.close()
def _make_tfexample_series(num_features, num_samples, test_tmpdir):
    _, data_file = tempfile.mkstemp(dir=test_tmpdir)
    with tf_record.TFRecordWriter(data_file) as writer:
        for i in range(num_samples):
            example = example_pb2.Example()
            times = example.features.feature[TrainEvalFeatures.TIMES]
            times.int64_list.value.append(i)
            values = example.features.feature[TrainEvalFeatures.VALUES]
            values.float_list.value.extend(
                [float(i) * 2. + feature_number
                 for feature_number in range(num_features)])
            writer.write(example.SerializeToString())
    return data_file
def _WriteCompressedRecordsToFile(
        self,
        records,
        name="tfrecord.z",
        compression_type=tf_record.TFRecordCompressionType.ZLIB):
    fn = os.path.join(self.get_temp_dir(), name)
    options = tf_record.TFRecordOptions(compression_type=compression_type)
    writer = tf_record.TFRecordWriter(fn, options=options)
    for r in records:
        writer.write(r)
    writer.close()
    del writer
    return fn
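# A hedged read-back sketch (read_compressed_records is hypothetical, not one
# of the original helpers): the TFRecordOptions passed to tf_record_iterator
# must match the compression used by _WriteCompressedRecordsToFile above,
# otherwise the ZLIB-compressed records cannot be decoded.
from tensorflow.python.lib.io import tf_record


def read_compressed_records(
        fn, compression_type=tf_record.TFRecordCompressionType.ZLIB):
    options = tf_record.TFRecordOptions(compression_type=compression_type)
    return list(tf_record.tf_record_iterator(fn, options=options))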
def testKmeans(self):
    num_features = FLAGS.patch_height * FLAGS.patch_width
    dummy_data = np.random.random((500, num_features))
    with tempfile.NamedTemporaryFile(mode='r') as patches_file:
        with tf_record.TFRecordWriter(patches_file.name) as patches_writer:
            for patch in dummy_data:
                example = example_pb2.Example()
                example.features.feature['features'].float_list.value.extend(
                    patch)
                patches_writer.write(example.SerializeToString())
        clusters = staffline_patches_kmeans_pipeline.train_kmeans(
            patches_file.name, NUM_CLUSTERS, BATCH_SIZE, TRAIN_STEPS)
        self.assertEqual(clusters.shape, (NUM_CLUSTERS, num_features))
def testBadFile(self):
    """Verify that tf_record_iterator throws an exception on bad TFRecords."""
    fn = os.path.join(self.get_temp_dir(), "bad_file")
    with tf_record.TFRecordWriter(fn) as writer:
        writer.write(b"123")
    fn_truncated = os.path.join(self.get_temp_dir(), "bad_file_truncated")
    with open(fn, "rb") as f:
        with open(fn_truncated, "wb") as f2:
            # DataLossError requires that we've written the header, so this
            # must be at least 12 bytes.
            f2.write(f.read(14))
    with self.assertRaises(errors_impl.DataLossError):
        for _ in tf_record.tf_record_iterator(fn_truncated):
            pass
def test_read_batched_sequence_example_dataset(self, sloppy_ordering):
    # Save protos in a TFRecord file in a temp folder.
    serialized_sequence_examples = [
        SEQ_EXAMPLE_PROTO_1.SerializeToString(),
        SEQ_EXAMPLE_PROTO_2.SerializeToString()
    ] * 100
    data_dir = test.get_temp_dir()
    data_file = os.path.join(data_dir, "test_sequence_example.tfrecord")
    if file_io.file_exists(data_file):
        file_io.delete_file(data_file)

    with tf_record.TFRecordWriter(data_file) as writer:
        for s in serialized_sequence_examples:
            writer.write(s)

    batched_dataset = data_lib.read_batched_sequence_example_dataset(
        file_pattern=data_file,
        batch_size=2,
        list_size=2,
        context_feature_spec=CONTEXT_FEATURE_SPEC,
        example_feature_spec=EXAMPLE_FEATURE_SPEC,
        reader=readers.TFRecordDataset,
        shuffle=False,
        sloppy_ordering=sloppy_ordering)

    features = batched_dataset.make_one_shot_iterator().get_next()
    self.assertAllEqual(sorted(features),
                        ["query_length", "unigrams", "utility"])
    # Check static shapes for dense tensors.
    self.assertAllEqual([2, 1], features["query_length"].get_shape().as_list())
    self.assertAllEqual([2, 2, 1], features["utility"].get_shape().as_list())

    with session.Session() as sess:
        sess.run(variables.local_variables_initializer())
        queue_runner.start_queue_runners()
        feature_map = sess.run(features)
        # Test dense_shape, indices and values for a SparseTensor.
        self.assertAllEqual(feature_map["unigrams"].dense_shape, [2, 2, 3])
        self.assertAllEqual(
            feature_map["unigrams"].indices,
            [[0, 0, 0], [0, 1, 0], [0, 1, 1], [0, 1, 2], [1, 0, 0]])
        self.assertAllEqual(
            feature_map["unigrams"].values,
            [b"tensorflow", b"learning", b"to", b"rank", b"gbdt"])
        # Check values directly for dense tensors.
        self.assertAllEqual(feature_map["query_length"], [[3], [2]])
        self.assertAllEqual(feature_map["utility"],
                            [[[0.], [1.0]], [[0.], [0.]]])
def save_rows_to_tf_record_file(df_rows, make_sequence_example_fn,
                                export_filename):
    tf_record_options = tf_record.TFRecordOptions(
        tf_record.TFRecordCompressionType.GZIP)
    tf_writer = tf_record.TFRecordWriter(export_filename,
                                         options=tf_record_options)
    try:
        for index, row in df_rows.iterrows():
            seq_example = make_sequence_example_fn(row)
            tf_writer.write(seq_example.SerializeToString())
    finally:
        tf_writer.close()
        sys.stdout.flush()
def _CreateFiles(self):
    filenames = []
    for i in range(self._num_files):
        fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
        filenames.append(fn)
        options = tf_record.TFRecordOptions(
            compression_type=TFRecordCompressionType.ZLIB)
        writer = tf_record.TFRecordWriter(fn, options=options)
        for j in range(self._num_records):
            writer.write(self._Record(i, j))
        writer.close()
        del writer
    return filenames
def do_POST(self):
    post_vars = cgi.parse_qs(
        self.rfile.read(int(self.headers.getheader('content-length'))))
    labels = [
        post_vars['cluster%d' % i][0]
        for i in moves.xrange(self.clusters.shape[0])
    ]
    examples = create_examples(self.clusters, labels)
    with tf_record.TFRecordWriter(self.output_path) as writer:
        for example in examples:
            writer.write(example.SerializeToString())
    self.send_response(http_client.OK)
    self.end_headers()
    self.wfile.write('Success')  # printed in the labeler alert
def testAsFunctionFromReader(self):
    with ops.device("CPU"):
        file_path = os.path.join(self.get_temp_dir(),
                                 "{}.tfrecord.gz".format("tf_record_asset"))
        with tf_record.TFRecordWriter(file_path, "GZIP") as f:
            for v in ["a", "aa", "aaa"]:
                f.write(str(v))
        original_dataset = readers.TFRecordDataset([file_path],
                                                   compression_type="GZIP")
        fn = original_dataset._trace_variant_creation()
        variant = fn()

        revived_dataset = dataset_ops._VariantDataset(
            variant, original_dataset.element_spec)
        self.assertDatasetProduces(revived_dataset, ["a", "aa", "aaa"])
def main(_):
    tf.logging.info('Building the pipeline...')
    records_dir = tempfile.mkdtemp(prefix='staffline_kmeans')
    try:
        patch_file_prefix = os.path.join(records_dir, 'patches')
        with pipeline_flags.create_pipeline() as pipeline:
            filenames = file_io.get_matching_files(FLAGS.music_pattern)
            assert filenames, 'Must have matched some filenames'
            if 0 < FLAGS.num_pages < len(filenames):
                filenames = random.sample(filenames, FLAGS.num_pages)
            filenames = pipeline | beam.transforms.Create(filenames)
            patches = filenames | beam.ParDo(
                staffline_patches_dofn.StafflinePatchesDoFn(
                    patch_height=FLAGS.patch_height,
                    patch_width=FLAGS.patch_width,
                    num_stafflines=FLAGS.num_stafflines,
                    timeout_ms=FLAGS.timeout_ms,
                    max_patches_per_page=FLAGS.max_patches_per_page))
            if FLAGS.num_outputs:
                patches |= combiners.Sample.FixedSizeGlobally(FLAGS.num_outputs)
            patches |= beam.io.WriteToTFRecord(
                patch_file_prefix, beam.coders.ProtoCoder(tf.train.Example))
            tf.logging.info('Running the pipeline...')

        tf.logging.info('Running k-means...')
        patch_files = file_io.get_matching_files(patch_file_prefix + '*')
        clusters = train_kmeans(patch_files, FLAGS.kmeans_num_clusters,
                                FLAGS.kmeans_batch_size, FLAGS.kmeans_num_steps)
        tf.logging.info('Writing the centroids...')
        with tf_record.TFRecordWriter(FLAGS.output_path) as writer:
            for cluster in clusters:
                example = tf.train.Example()
                example.features.feature['features'].float_list.value.extend(
                    cluster)
                example.features.feature['height'].int64_list.value.append(
                    FLAGS.patch_height)
                example.features.feature['width'].int64_list.value.append(
                    FLAGS.patch_width)
                writer.write(example.SerializeToString())
        tf.logging.info('Done!')
    finally:
        shutil.rmtree(records_dir)
def _WriteRecordsToFile(self, records, name="tfrecord", options=None): fn = os.path.join(self.get_temp_dir(), name) with tf_record.TFRecordWriter(fn, options=options) as writer: for r in records: writer.write(r) return fn
def write_records_to_file(filename, records):
    writer = tf_record.TFRecordWriter(filename)
    for record in records:
        writer.write(record)
    writer.close()
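# A hedged counterpart sketch (read_records_from_file is hypothetical, not in
# the original module): reads every raw record back in write order using
# tf_record_iterator, the reader used by the tests above.
from tensorflow.python.lib.io import tf_record


def read_records_from_file(filename):
    return list(tf_record.tf_record_iterator(filename))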