def testTFRecordDatasetFromDataset(self):
  filenames = []
  all_contents = []
  for i in range(_NUM_FILES):
    filename = os.path.join(self.get_temp_dir(), 'tf_record.%d' % i)
    filenames.append(filename)
    writer = python_io.TFRecordWriter(filename)
    for j in range(_NUM_ENTRIES):
      record = compat.as_bytes('Record %d of file %d' % (j, i))
      writer.write(record)
      all_contents.append(record)
    writer.close()

  filenames = dataset_ops.Dataset.from_tensor_slices(filenames)

  dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')

  iterator = dataset.make_initializable_iterator()
  self._sess.run(iterator.initializer)
  get_next = iterator.get_next()

  retrieved_values = []
  for _ in range(4 * len(all_contents)):
    retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))

  self.assertEqual(set(all_contents), set(retrieved_values))
def shuffle_records(fname):
  """Shuffle records in a single file."""
  print("Shuffling records in file %s" % fname)

  # Rename file prior to shuffling.
  tmp_fname = fname + ".unshuffled"
  gfile.Rename(fname, tmp_fname)

  reader = python_io.tf_record_iterator(tmp_fname)
  records = []
  for record in reader:
    records.append(record)
    if len(records) % 100000 == 0:
      print("\tRead: %d" % len(records))

  random.shuffle(records)

  # Write shuffled records to the original file name.
  with python_io.TFRecordWriter(fname) as w:
    for count, record in enumerate(records):
      w.write(record)
      if count > 0 and count % 100000 == 0:
        print("\tWriting record: %d" % count)

  gfile.Remove(tmp_fname)
def testMakeBatchedFeaturesDataset(self):
  # Set up
  fn = os.path.join(self.get_temp_dir(), "tf_record.txt")
  writer = python_io.TFRecordWriter(fn)
  for i in range(1024):
    writer.write(
        example_pb2.Example(
            features=feature_pb2.Features(
                feature={
                    "value":
                        feature_pb2.Feature(
                            int64_list=feature_pb2.Int64List(value=[i]))
                })).SerializeToString())
  writer.close()

  dataset = readers.make_batched_features_dataset(
      file_pattern=fn,
      batch_size=32,
      features={"value": parsing_ops.FixedLenFeature([], dtypes.int64)},
      shuffle=False,
      num_epochs=1,
      drop_final_batch=False)

  rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4)

  self.assertEqual([[None]],
                   [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])

  expected_output = [{
      "value": [k for k in range(i, i + 8)]
  } for i in range(0, 1024, 8)]  # pylint: disable=g-complex-comprehension
  self.assertDatasetProduces(rebatched_dataset, expected_output)
def _createFile(self, options=None):
  filename = self._inputFilename()
  writer = python_io.TFRecordWriter(filename, options)
  for i in range(self._num_records):
    writer.write(self._record(i))
  writer.close()
  return filename
def _writeFile(self, name, data):
  filename = os.path.join(self.get_temp_dir(), name)
  writer = python_io.TFRecordWriter(filename)
  for d in data:
    writer.write(compat.as_bytes(str(d)))
  writer.close()
  return filename
def __init__(self, output_path, header=None):
  super(TFRecordWriter, self).__init__()
  compressed = output_path.endswith('.gz')
  options = python_io.TFRecordOptions(
      python_io.TFRecordCompressionType.GZIP
      if compressed else python_io.TFRecordCompressionType.NONE)
  self._writer = python_io.TFRecordWriter(output_path, options=options)
  self.header = header
def testParseExampleInputFn(self):
  """Tests complete flow with input_fn constructed from parse_example."""
  n_classes = 3
  batch_size = 10
  words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

  _, examples_file = tempfile.mkstemp()
  writer = python_io.TFRecordWriter(examples_file)
  for _ in range(batch_size):
    sequence_length = random.randint(1, len(words))
    sentence = random.sample(words, sequence_length)
    label = random.randint(0, n_classes - 1)
    example = example_pb2.Example(features=feature_pb2.Features(
        feature={
            'tokens':
                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                    value=sentence)),
            'label':
                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                    value=[label])),
        }))
    writer.write(example.SerializeToString())
  writer.close()

  col = seq_fc.sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=10)
  embed = fc.embedding_column(col, dimension=2)
  feature_columns = [embed]
  feature_spec = parsing_utils.classifier_parse_example_spec(
      feature_columns, label_key='label', label_dtype=dtypes.int64)

  def _train_input_fn():
    dataset = readers.make_batched_features_dataset(
        examples_file, batch_size, feature_spec)
    return dataset.map(lambda features: (features, features.pop('label')))

  def _eval_input_fn():
    dataset = readers.make_batched_features_dataset(
        examples_file, batch_size, feature_spec, num_epochs=1)
    return dataset.map(lambda features: (features, features.pop('label')))

  def _predict_input_fn():
    dataset = readers.make_batched_features_dataset(
        examples_file, batch_size, feature_spec, num_epochs=1)

    def features_fn(features):
      features.pop('label')
      return features

    return dataset.map(features_fn)

  self._test_complete_flow(
      feature_columns=feature_columns,
      train_input_fn=_train_input_fn,
      eval_input_fn=_eval_input_fn,
      predict_input_fn=_predict_input_fn,
      n_classes=n_classes,
      batch_size=batch_size)
def _createFiles(self):
  filenames = []
  for i in range(self._num_files):
    fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
    filenames.append(fn)
    writer = python_io.TFRecordWriter(fn)
    for j in range(self._num_records):
      writer.write(self._record(i, j))
    writer.close()
  return filenames
def encode_and_save_files(subtokenizer, data_dir, raw_files, tag, total_shards):
  """Save data from files as encoded Examples in TFRecord format.

  Args:
    subtokenizer: Subtokenizer object that will be used to encode the strings.
    data_dir: The directory in which to write the examples.
    raw_files: A tuple of (input, target) data files. Each line in the input
      and the corresponding line in the target file will be saved in a
      tf.Example.
    tag: String that will be added onto the file names.
    total_shards: Number of files to divide the data into.

  Returns:
    List of all files produced.
  """
  # Create a file for each shard.
  filepaths = [
      shard_filename(data_dir, tag, n + 1, total_shards)
      for n in range(total_shards)
  ]

  if all_exist(filepaths):
    print("Files with tag %s already exist." % tag)
    return filepaths

  print("Saving files with tag %s." % tag)
  input_file = raw_files[0]
  target_file = raw_files[1]

  # Write examples to each shard in round robin order.
  tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
  writers = [python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
  counter, shard = 0, 0
  for counter, (input_line, target_line) in enumerate(
      zip(txt_line_iterator(input_file), txt_line_iterator(target_file))):
    if counter > 0 and counter % 100000 == 0:
      print("\tSaving case %d." % counter)
    example = dict_to_example({
        "inputs": subtokenizer.encode(input_line, add_eos=True),
        "targets": subtokenizer.encode(target_line, add_eos=True)
    })
    writers[shard].write(example.SerializeToString())
    shard = (shard + 1) % total_shards
  for writer in writers:
    writer.close()

  for tmp_name, final_name in zip(tmp_filepaths, filepaths):
    gfile.Rename(tmp_name, final_name)

  print("Saved %d Examples" % counter)
  return filepaths
def make_tfrecord_writer(outfile, options=None):
  """Creates a python_io.TFRecordWriter for the specified outfile.

  Args:
    outfile: str. A path where we'll write our TFRecords.
    options: python_io.TFRecordOptions or None. If None, one will be inferred
      from the filename.

  Returns:
    A python_io.TFRecordWriter object.
  """
  if not options:
    options = make_tfrecord_options(outfile)
  return python_io.TFRecordWriter(outfile, options)
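# A minimal usage sketch for make_tfrecord_writer, not part of the original
# module: it assumes the same module-level imports as the snippets above and
# that make_tfrecord_options infers compression from a '.gz' suffix. The helper
# name and the already-serialized `records` payload are hypothetical.
def write_serialized_records(records, outfile):
  """Writes an iterable of serialized record bytes to `outfile`."""
  writer = make_tfrecord_writer(outfile)  # options inferred from the filename
  for record in records:
    writer.write(record)
  writer.close()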
def __init__(self, output_path, header=None):
  """Initializer.

  Args:
    output_path: str. The output path to which the records are written.
    header: An optional header for the particular data type. This can be
      useful for file types that have logical headers where some operations
      depend on that header information (e.g. VCF using its headers to
      determine type information of annotation fields).
  """
  super(TFRecordWriter, self).__init__()
  compressed = output_path.endswith('.gz')
  options = python_io.TFRecordOptions(
      python_io.TFRecordCompressionType.GZIP
      if compressed else python_io.TFRecordCompressionType.NONE)
  self._writer = python_io.TFRecordWriter(output_path, options=options)
  self.header = header
def main(unused_argv):
  example = tf.train.Example(features=tf.train.Features(
      feature={
          "feature_0":
              tf.train.Feature(int64_list=tf.train.Int64List(value=[111])),
          "feature_1":
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[b"1111111111"])),
      }))
  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
  writer = python_io.TFRecordWriter("/tmp/test1.tfrecord", options)
  writer.write(example.SerializeToString())

  example = tf.train.Example(features=tf.train.Features(
      feature={
          "feature_0":
              tf.train.Feature(int64_list=tf.train.Int64List(value=[222])),
          "feature_1":
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[b"2222222222"])),
      }))
  writer.write(example.SerializeToString())

  example = tf.train.Example(features=tf.train.Features(
      feature={
          "feature_0":
              tf.train.Feature(int64_list=tf.train.Int64List(value=[333])),
          "feature_1":
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[b"3333333333"])),
      }))
  writer.write(example.SerializeToString())
  writer.close()

  tf.compat.v1.logging.info('File /tmp/test1.tfrecord generated!')
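# A hedged sketch, not from the original script, showing how the ZLIB-compressed
# file written in main() could be read back. It reuses the same tf_record /
# python_io imports and assumes the two-feature schema above; the function name
# is hypothetical.
def read_test1(path="/tmp/test1.tfrecord"):
  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
  for serialized in python_io.tf_record_iterator(path, options):
    example = tf.train.Example()
    example.ParseFromString(serialized)
    # Prints the int64 payload of each record, e.g. [111], [222], [333].
    print(example.features.feature["feature_0"].int64_list.value[:])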
def testMakeBatchedFeaturesDataset(self):
  files = 2
  records_per_file = 5

  def make_record(file_index):
    example = example_pb2.Example(
        features=feature_pb2.Features(
            feature={
                "file":
                    feature_pb2.Feature(
                        int64_list=feature_pb2.Int64List(value=[file_index])),
            }))
    return example.SerializeToString()

  filenames = []
  for file_index in range(files):
    filename = os.path.join(self.get_temp_dir(),
                            "tf_record.%d.txt" % file_index)
    filenames.append(filename)
    writer = python_io.TFRecordWriter(filename)
    for _ in range(records_per_file):
      writer.write(make_record(file_index))
    writer.close()

  dataset = readers.make_batched_features_dataset(
      file_pattern=filenames,
      batch_size=records_per_file,
      features={
          "file": parsing_ops.FixedLenFeature([], dtypes.int64),
      },
      reader=core_readers.TFRecordDataset,
      num_epochs=1)

  # We should shard at the file level, so that all records come from file 0.
  dataset = distribute._AutoShardDataset(dataset, 2, 0)
  dataset = dataset.unbatch()
  output = self.getDatasetOutput(dataset)
  files = [elem["file"] for elem in output]
  self.assertEqual(files, [0] * records_per_file)
def testTFRecordDatasetIgnoreError(self):
  filenames = []
  for i in range(5):
    fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
    filenames.append(fn)
    writer = python_io.TFRecordWriter(fn)
    for _ in range(10):
      writer.write(b"record")
    writer.close()
    # Append corrupted data.
    with open(fn, "a") as f:
      f.write("corrupted data")

  dataset = readers.TFRecordDataset(filenames).apply(
      error_ops.ignore_errors())
  get_next = self.getNext(dataset)

  # All of the files are present.
  for _ in filenames:
    for _ in range(10):
      self.assertEqual(b"record", self.evaluate(get_next()))
  with self.assertRaises(errors.OutOfRangeError):
    self.evaluate(get_next())