def testTFRecordDatasetFromDataset(self):
  filenames = []
  all_contents = []
  for i in range(_NUM_FILES):
    filename = os.path.join(self.get_temp_dir(), 'tf_record.%d' % i)
    filenames.append(filename)
    writer = python_io.TFRecordWriter(filename)
    for j in range(_NUM_ENTRIES):
      record = compat.as_bytes('Record %d of file %d' % (j, i))
      writer.write(record)
      all_contents.append(record)
    writer.close()

  filenames = dataset_ops.Dataset.from_tensor_slices(filenames)

  dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')

  iterator = dataset.make_initializable_iterator()
  self._sess.run(iterator.initializer)
  get_next = iterator.get_next()

  retrieved_values = []
  for _ in range(4 * len(all_contents)):
    retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))

  self.assertEqual(set(all_contents), set(retrieved_values))
def shuffle_records(fname):
  """Shuffle records in a single file."""
  print("Shuffling records in file %s" % fname)

  # Rename file prior to shuffling.
  tmp_fname = fname + ".unshuffled"
  gfile.Rename(fname, tmp_fname)

  reader = python_io.tf_record_iterator(tmp_fname)
  records = []
  for record in reader:
    records.append(record)
    if len(records) % 100000 == 0:
      print("\tRead: %d" % len(records))

  random.shuffle(records)

  # Write shuffled records to the original file name.
  with python_io.TFRecordWriter(fname) as w:
    for count, record in enumerate(records):
      w.write(record)
      if count > 0 and count % 100000 == 0:
        print("\tWriting record: %d" % count)

  gfile.Remove(tmp_fname)
def testMakeBatchedFeaturesDataset(self):
  # Set up
  fn = os.path.join(self.get_temp_dir(), "tf_record.txt")
  writer = python_io.TFRecordWriter(fn)
  for i in range(1024):
    writer.write(
        example_pb2.Example(
            features=feature_pb2.Features(
                feature={
                    "value":
                        feature_pb2.Feature(
                            int64_list=feature_pb2.Int64List(value=[i]))
                })).SerializeToString())
  writer.close()

  dataset = readers.make_batched_features_dataset(
      file_pattern=fn,
      batch_size=32,
      features={"value": parsing_ops.FixedLenFeature([], dtypes.int64)},
      shuffle=False,
      num_epochs=1,
      drop_final_batch=False)

  rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4)

  self.assertEqual([[None]],
                   [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])

  expected_output = [{
      "value": [k for k in range(i, i + 8)]
  } for i in range(0, 1024, 8)]  # pylint: disable=g-complex-comprehension
  self.assertDatasetProduces(rebatched_dataset, expected_output)
def _createFile(self, options=None):
  filename = self._inputFilename()
  writer = python_io.TFRecordWriter(filename, options)
  for i in range(self._num_records):
    writer.write(self._record(i))
  writer.close()
  return filename
def _writeFile(self, name, data):
  filename = os.path.join(self.get_temp_dir(), name)
  writer = python_io.TFRecordWriter(filename)
  for d in data:
    writer.write(compat.as_bytes(str(d)))
  writer.close()
  return filename
def __init__(self, output_path, header=None):
  super(TFRecordWriter, self).__init__()
  compressed = output_path.endswith('.gz')
  options = python_io.TFRecordOptions(
      python_io.TFRecordCompressionType.GZIP
      if compressed else python_io.TFRecordCompressionType.NONE)
  self._writer = python_io.TFRecordWriter(output_path, options=options)
  self.header = header
def testParseExampleInputFn(self):
  """Tests complete flow with input_fn constructed from parse_example."""
  n_classes = 3
  batch_size = 10
  words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

  _, examples_file = tempfile.mkstemp()
  writer = python_io.TFRecordWriter(examples_file)
  for _ in range(batch_size):
    sequence_length = random.randint(1, len(words))
    sentence = random.sample(words, sequence_length)
    label = random.randint(0, n_classes - 1)
    example = example_pb2.Example(features=feature_pb2.Features(
        feature={
            'tokens':
                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                    value=sentence)),
            'label':
                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                    value=[label])),
        }))
    writer.write(example.SerializeToString())
  writer.close()

  col = seq_fc.sequence_categorical_column_with_hash_bucket(
      'tokens', hash_bucket_size=10)
  embed = fc.embedding_column(col, dimension=2)
  feature_columns = [embed]
  feature_spec = parsing_utils.classifier_parse_example_spec(
      feature_columns, label_key='label', label_dtype=dtypes.int64)

  def _train_input_fn():
    dataset = readers.make_batched_features_dataset(
        examples_file, batch_size, feature_spec)
    return dataset.map(lambda features: (features, features.pop('label')))

  def _eval_input_fn():
    dataset = readers.make_batched_features_dataset(
        examples_file, batch_size, feature_spec, num_epochs=1)
    return dataset.map(lambda features: (features, features.pop('label')))

  def _predict_input_fn():
    dataset = readers.make_batched_features_dataset(
        examples_file, batch_size, feature_spec, num_epochs=1)

    def features_fn(features):
      features.pop('label')
      return features

    return dataset.map(features_fn)

  self._test_complete_flow(
      feature_columns=feature_columns,
      train_input_fn=_train_input_fn,
      eval_input_fn=_eval_input_fn,
      predict_input_fn=_predict_input_fn,
      n_classes=n_classes,
      batch_size=batch_size)
def _createFiles(self):
  filenames = []
  for i in range(self._num_files):
    fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
    filenames.append(fn)
    writer = python_io.TFRecordWriter(fn)
    for j in range(self._num_records):
      writer.write(self._record(i, j))
    writer.close()
  return filenames
def encode_and_save_files(subtokenizer, data_dir, raw_files, tag, total_shards):
  """Save data from files as encoded Examples in TFRecord format.

  Args:
    subtokenizer: Subtokenizer object that will be used to encode the strings.
    data_dir: The directory in which to write the examples.
    raw_files: A tuple of (input, target) data files. Each line in the input
      and the corresponding line in the target file will be saved in a
      tf.Example.
    tag: String that will be added onto the file names.
    total_shards: Number of files to divide the data into.

  Returns:
    List of all files produced.
  """
  # Create a file for each shard.
  filepaths = [
      shard_filename(data_dir, tag, n + 1, total_shards)
      for n in range(total_shards)
  ]

  if all_exist(filepaths):
    print("Files with tag %s already exist." % tag)
    return filepaths

  print("Saving files with tag %s." % tag)
  input_file = raw_files[0]
  target_file = raw_files[1]

  # Write examples to each shard in round robin order.
  tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
  writers = [python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
  counter, shard = 0, 0
  for counter, (input_line, target_line) in enumerate(
      zip(txt_line_iterator(input_file), txt_line_iterator(target_file))):
    if counter > 0 and counter % 100000 == 0:
      print("\tSaving case %d." % counter)
    example = dict_to_example({
        "inputs": subtokenizer.encode(input_line, add_eos=True),
        "targets": subtokenizer.encode(target_line, add_eos=True)
    })
    writers[shard].write(example.SerializeToString())
    shard = (shard + 1) % total_shards
  for writer in writers:
    writer.close()

  for tmp_name, final_name in zip(tmp_filepaths, filepaths):
    gfile.Rename(tmp_name, final_name)

  print("Saved %d Examples" % counter)
  return filepaths
def make_tfrecord_writer(outfile, options=None):
  """Creates a python_io.TFRecordWriter for the specified outfile.

  Args:
    outfile: str. A path where we'll write our TFRecords.
    options: python_io.TFRecordOptions or None. If None, one will be inferred
      from the filename.

  Returns:
    A python_io.TFRecordWriter object.
  """
  if not options:
    options = make_tfrecord_options(outfile)
  return python_io.TFRecordWriter(outfile, options)
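# A minimal usage sketch for make_tfrecord_writer, not part of the original
# module: it assumes the same module-level imports as the snippets above and
# that make_tfrecord_options infers compression from a '.gz' suffix. The helper
# name and the already-serialized `records` payload are hypothetical.
def write_serialized_records(records, outfile):
  """Writes an iterable of serialized record bytes to `outfile`."""
  writer = make_tfrecord_writer(outfile)  # options inferred from the filename
  for record in records:
    writer.write(record)
  writer.close()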
def __init__(self, output_path, header=None):
  """Initializer.

  Args:
    output_path: str. The output path to which the records are written.
    header: An optional header for the particular data type. This can be
      useful for file types that have logical headers where some operations
      depend on that header information (e.g. VCF using its headers to
      determine type information of annotation fields).
  """
  super(TFRecordWriter, self).__init__()
  compressed = output_path.endswith('.gz')
  options = python_io.TFRecordOptions(
      python_io.TFRecordCompressionType.GZIP
      if compressed else python_io.TFRecordCompressionType.NONE)
  self._writer = python_io.TFRecordWriter(output_path, options=options)
  self.header = header
def main(unused_argv):
  example = tf.train.Example(features=tf.train.Features(
      feature={
          "feature_0":
              tf.train.Feature(int64_list=tf.train.Int64List(value=[111])),
          "feature_1":
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[b"1111111111"])),
      }))
  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
  writer = python_io.TFRecordWriter("/tmp/test1.tfrecord", options)
  writer.write(example.SerializeToString())

  example = tf.train.Example(features=tf.train.Features(
      feature={
          "feature_0":
              tf.train.Feature(int64_list=tf.train.Int64List(value=[222])),
          "feature_1":
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[b"2222222222"])),
      }))
  writer.write(example.SerializeToString())

  example = tf.train.Example(features=tf.train.Features(
      feature={
          "feature_0":
              tf.train.Feature(int64_list=tf.train.Int64List(value=[333])),
          "feature_1":
              tf.train.Feature(bytes_list=tf.train.BytesList(
                  value=[b"3333333333"])),
      }))
  writer.write(example.SerializeToString())
  writer.close()

  tf.compat.v1.logging.info('File /tmp/test1.tfrecord generated!')
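# A hedged sketch, not from the original script, showing how the ZLIB-compressed
# file written in main() could be read back. It reuses the same tf_record /
# python_io imports and assumes the two-feature schema above; the function name
# is hypothetical.
def read_test1(path="/tmp/test1.tfrecord"):
  options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
  for serialized in python_io.tf_record_iterator(path, options):
    example = tf.train.Example()
    example.ParseFromString(serialized)
    # Prints the int64 payload of each record, e.g. [111], [222], [333].
    print(example.features.feature["feature_0"].int64_list.value[:])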
def testMakeBatchedFeaturesDataset(self):
  files = 2
  records_per_file = 5

  def make_record(file_index):
    example = example_pb2.Example(
        features=feature_pb2.Features(
            feature={
                "file":
                    feature_pb2.Feature(
                        int64_list=feature_pb2.Int64List(value=[file_index])),
            }))
    return example.SerializeToString()

  filenames = []
  for file_index in range(files):
    filename = os.path.join(self.get_temp_dir(),
                            "tf_record.%d.txt" % file_index)
    filenames.append(filename)
    writer = python_io.TFRecordWriter(filename)
    for _ in range(records_per_file):
      writer.write(make_record(file_index))
    writer.close()

  dataset = readers.make_batched_features_dataset(
      file_pattern=filenames,
      batch_size=records_per_file,
      features={
          "file": parsing_ops.FixedLenFeature([], dtypes.int64),
      },
      reader=core_readers.TFRecordDataset,
      num_epochs=1)

  # We should shard at the file level, so that all records come from file 0.
  dataset = distribute._AutoShardDataset(dataset, 2, 0)
  dataset = dataset.unbatch()
  output = self.getDatasetOutput(dataset)
  files = [elem["file"] for elem in output]
  self.assertEqual(files, [0] * records_per_file)
def testTFRecordDatasetIgnoreError(self):
  filenames = []
  for i in range(5):
    fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
    filenames.append(fn)
    writer = python_io.TFRecordWriter(fn)
    for _ in range(10):
      writer.write(b"record")
    writer.close()
    # Append corrupted data.
    with open(fn, "a") as f:
      f.write("corrupted data")

  dataset = readers.TFRecordDataset(filenames).apply(
      error_ops.ignore_errors())
  get_next = self.getNext(dataset)

  # All of the files are present.
  for _ in filenames:
    for _ in range(10):
      self.assertEqual(b"record", self.evaluate(get_next()))
  with self.assertRaises(errors.OutOfRangeError):
    self.evaluate(get_next())