Example #1
    def testTFRecordDatasetFromDataset(self):
        filenames = []
        all_contents = []
        for i in range(_NUM_FILES):
            filename = os.path.join(self.get_temp_dir(), 'tf_record.%d' % i)
            filenames.append(filename)
            writer = python_io.TFRecordWriter(filename)
            for j in range(_NUM_ENTRIES):
                record = compat.as_bytes('Record %d of file %d' % (j, i))
                writer.write(record)
                all_contents.append(record)
            writer.close()

        filenames = dataset_ops.Dataset.from_tensor_slices(filenames)

        dataset = datasets.StreamingFilesDataset(filenames,
                                                 filetype='tfrecord')

        iterator = dataset.make_initializable_iterator()
        self._sess.run(iterator.initializer)
        get_next = iterator.get_next()

        retrieved_values = []
        for _ in range(4 * len(all_contents)):
            retrieved_values.append(compat.as_bytes(self._sess.run(get_next)))

        self.assertEqual(set(all_contents), set(retrieved_values))
Example #2
def shuffle_records(fname):
    """Shuffle records in a single file."""
    print("Shuffling records in file %s" % fname)

    # Rename file prior to shuffling
    tmp_fname = fname + ".unshuffled"
    gfile.Rename(fname, tmp_fname)

    reader = python_io.tf_record_iterator(tmp_fname)
    records = []
    for record in reader:
        records.append(record)
        if len(records) % 100000 == 0:
            print("\tRead: %d", len(records))

    random.shuffle(records)

    # Write shuffled records to original file name
    with python_io.TFRecordWriter(fname) as w:
        for count, record in enumerate(records):
            w.write(record)
            if count > 0 and count % 100000 == 0:
                print("\tWriting record: %d" % count)

    gfile.Remove(tmp_fname)
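For context, a tiny driver sketch (not part of the example above) showing one way shuffle_records could be applied to a set of shard files; data_dir and the "*train*" glob pattern are hypothetical:

# Hypothetical driver: shuffle every shard matching a glob pattern.
# Assumes the same imports as above (os, gfile); data_dir and the
# "*train*" pattern are illustrative only.
data_dir = "/tmp/translate_ende"  # hypothetical location of the shards
for shard_fname in gfile.Glob(os.path.join(data_dir, "*train*")):
    shuffle_records(shard_fname)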
Example #3
  def testMakeBatchedFeaturesDataset(self):
    # Set up
    fn = os.path.join(self.get_temp_dir(), "tf_record.txt")
    writer = python_io.TFRecordWriter(fn)
    for i in range(1024):
      writer.write(
          example_pb2.Example(
              features=feature_pb2.Features(
                  feature={
                      "value":
                          feature_pb2.Feature(
                              int64_list=feature_pb2.Int64List(value=[i]))
                  })).SerializeToString())
    writer.close()

    dataset = readers.make_batched_features_dataset(
        file_pattern=fn,
        batch_size=32,
        features={"value": parsing_ops.FixedLenFeature([], dtypes.int64)},
        shuffle=False,
        num_epochs=1,
        drop_final_batch=False)

    rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4)

    self.assertEqual([[None]],
                     [ts.as_list() for ts in _flat_shapes(rebatched_dataset)])

    expected_output = [{
        "value": [k for k in range(i, i + 8)]
    } for i in range(0, 1024, 8)]  # pylint: disable=g-complex-comprehension
    self.assertDatasetProduces(rebatched_dataset, expected_output)
Example #4
 def _createFile(self, options=None):
   filename = self._inputFilename()
   writer = python_io.TFRecordWriter(filename, options)
   for i in range(self._num_records):
     writer.write(self._record(i))
   writer.close()
   return filename
Example #5
 def _writeFile(self, name, data):
     filename = os.path.join(self.get_temp_dir(), name)
     writer = python_io.TFRecordWriter(filename)
     for d in data:
         writer.write(compat.as_bytes(str(d)))
     writer.close()
     return filename
Example #6
    def __init__(self, output_path, header=None):
        super(TFRecordWriter, self).__init__()

        compressed = output_path.endswith('.gz')
        options = python_io.TFRecordOptions(
            python_io.TFRecordCompressionType.GZIP if compressed
            else python_io.TFRecordCompressionType.NONE)
        self._writer = python_io.TFRecordWriter(output_path, options=options)
        self.header = header
Example #7
  def testParseExampleInputFn(self):
    """Tests complete flow with input_fn constructed from parse_example."""
    n_classes = 3
    batch_size = 10
    words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept']

    _, examples_file = tempfile.mkstemp()
    writer = python_io.TFRecordWriter(examples_file)
    for _ in range(batch_size):
      sequence_length = random.randint(1, len(words))
      sentence = random.sample(words, sequence_length)
      label = random.randint(0, n_classes - 1)
      example = example_pb2.Example(features=feature_pb2.Features(
          feature={
              'tokens':
                  feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
                      value=sentence)),
              'label':
                  feature_pb2.Feature(int64_list=feature_pb2.Int64List(
                      value=[label])),
          }))
      writer.write(example.SerializeToString())
    writer.close()

    col = seq_fc.sequence_categorical_column_with_hash_bucket(
        'tokens', hash_bucket_size=10)
    embed = fc.embedding_column(col, dimension=2)
    feature_columns = [embed]
    feature_spec = parsing_utils.classifier_parse_example_spec(
        feature_columns,
        label_key='label',
        label_dtype=dtypes.int64)

    def _train_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _eval_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      return dataset.map(lambda features: (features, features.pop('label')))
    def _predict_input_fn():
      dataset = readers.make_batched_features_dataset(
          examples_file, batch_size, feature_spec, num_epochs=1)
      def features_fn(features):
        features.pop('label')
        return features
      return dataset.map(features_fn)

    self._test_complete_flow(
        feature_columns=feature_columns,
        train_input_fn=_train_input_fn,
        eval_input_fn=_eval_input_fn,
        predict_input_fn=_predict_input_fn,
        n_classes=n_classes,
        batch_size=batch_size)
Example #8
 def _createFiles(self):
     filenames = []
     for i in range(self._num_files):
         fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
         filenames.append(fn)
         writer = python_io.TFRecordWriter(fn)
         for j in range(self._num_records):
             writer.write(self._record(i, j))
         writer.close()
     return filenames
Example #9
def encode_and_save_files(subtokenizer, data_dir, raw_files, tag,
                          total_shards):
    """Save data from files as encoded Examples in TFrecord format.
  Args:
    subtokenizer: Subtokenizer object that will be used to encode the strings.
    data_dir: The directory in which to write the examples
    raw_files: A tuple of (input, target) data files. Each line in the input and
      the corresponding line in target file will be saved in a tf.Example.
    tag: String that will be added onto the file names.
    total_shards: Number of files to divide the data into.
  Returns:
    List of all files produced.
  """
    # Create a file for each shard.
    filepaths = [
        shard_filename(data_dir, tag, n + 1, total_shards)
        for n in range(total_shards)
    ]

    if all_exist(filepaths):
        print("Files with tag %s already exist." % tag)
        return filepaths

    print("Saving files with tag %s." % tag)
    input_file = raw_files[0]
    target_file = raw_files[1]

    # Write examples to each shard in round robin order.
    tmp_filepaths = [fname + ".incomplete" for fname in filepaths]
    writers = [python_io.TFRecordWriter(fname) for fname in tmp_filepaths]
    counter, shard = 0, 0
    for counter, (input_line, target_line) in enumerate(
            zip(txt_line_iterator(input_file),
                txt_line_iterator(target_file))):
        if counter > 0 and counter % 100000 == 0:
            print("\tSaving case %d." % counter)
        example = dict_to_example({
            "inputs":
            subtokenizer.encode(input_line, add_eos=True),
            "targets":
            subtokenizer.encode(target_line, add_eos=True)
        })
        writers[shard].write(example.SerializeToString())
        shard = (shard + 1) % total_shards
    for writer in writers:
        writer.close()

    for tmp_name, final_name in zip(tmp_filepaths, filepaths):
        gfile.Rename(tmp_name, final_name)

    print("Saved %d Examples", counter)
    return filepaths
Example #10
def make_tfrecord_writer(outfile, options=None):
    """Creates a python_io.TFRecordWriter for the specified outfile.

  Args:
    outfile: str. A path where we'll write our TFRecords.
    options: python_io.TFRecordOptions or None. If None, one
      will be inferred from the filename.
  Returns:
    A python_io.TFRecordWriter object.
  """
    if not options:
        options = make_tfrecord_options(outfile)
    return python_io.TFRecordWriter(outfile, options)
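A minimal usage sketch for the helper above; the output path and record payload are hypothetical, and make_tfrecord_options is assumed to infer GZIP compression from the '.gz' suffix:

# Hypothetical usage: compression options are inferred from the ".gz"
# suffix by make_tfrecord_options (assumption), so records are written
# GZIP-compressed.
writer = make_tfrecord_writer("/tmp/output.tfrecord.gz")
writer.write(b"serialized example bytes")  # any serialized proto / raw bytes
writer.close()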
Example #11
    def __init__(self, output_path, header=None):
        """Initializer.

    Args:
      output_path: str. The output path to which the records are written.
      header: An optional header for the particular data type. This can be
        useful for file types that have logical headers where some operations
        depend on that header information (e.g. VCF using its headers to
        determine type information of annotation fields).
    """
        super(TFRecordWriter, self).__init__()

        compressed = output_path.endswith('.gz')
        options = python_io.TFRecordOptions(
            python_io.TFRecordCompressionType.GZIP if compressed
            else python_io.TFRecordCompressionType.NONE)
        self._writer = python_io.TFRecordWriter(output_path, options=options)
        self.header = header
Example #12
def main(unused_argv):
    example = tf.train.Example(features=tf.train.Features(
        feature={
            "feature_0":
            tf.train.Feature(int64_list=tf.train.Int64List(value=[111])),
            'feature_1':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=["1111111111"])),
        }))

    options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
    writer = python_io.TFRecordWriter("/tmp/test1.tfrecord", options)

    writer.write(example.SerializeToString())

    example = tf.train.Example(features=tf.train.Features(
        feature={
            "feature_0":
            tf.train.Feature(int64_list=tf.train.Int64List(value=[222])),
            'feature_1':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=["2222222222"])),
        }))
    writer.write(example.SerializeToString())

    example = tf.train.Example(features=tf.train.Features(
        feature={
            "feature_0":
            tf.train.Feature(int64_list=tf.train.Int64List(value=[333])),
            'feature_1':
            tf.train.Feature(bytes_list=tf.train.BytesList(
                value=["3333333333"])),
        }))
    writer.write(example.SerializeToString())

    writer.close()

    tf.compat.v1.logging.info('File /tmp/test1.tfrecord generated!')
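A short read-back sketch (not part of the example) that could verify the ZLIB-compressed records written above; it assumes the same tf, python_io, and tf_record imports as the example:

# Read the ZLIB-compressed records back and print the int64 feature values.
read_options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB)
for serialized in python_io.tf_record_iterator("/tmp/test1.tfrecord",
                                               options=read_options):
    parsed = tf.train.Example.FromString(serialized)
    print(parsed.features.feature["feature_0"].int64_list.value)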
Example #13
  def testMakeBatchedFeaturesDataset(self):
    files = 2
    records_per_file = 5

    def make_record(file_index):
      example = example_pb2.Example(
          features=feature_pb2.Features(
              feature={
                  "file":
                      feature_pb2.Feature(
                          int64_list=feature_pb2.Int64List(value=[file_index])),
              }))
      return example.SerializeToString()

    filenames = []
    for file_index in range(files):
      filename = os.path.join(self.get_temp_dir(),
                              "tf_record.%d.txt" % file_index)
      filenames.append(filename)
      writer = python_io.TFRecordWriter(filename)
      for _ in range(records_per_file):
        writer.write(make_record(file_index))
      writer.close()

    dataset = readers.make_batched_features_dataset(
        file_pattern=filenames,
        batch_size=records_per_file,
        features={
            "file": parsing_ops.FixedLenFeature([], dtypes.int64),
        },
        reader=core_readers.TFRecordDataset,
        num_epochs=1)
    # We should shard at the file level, so that all records come from file 0.
    dataset = distribute._AutoShardDataset(dataset, 2, 0)
    dataset = dataset.unbatch()
    output = self.getDatasetOutput(dataset)
    files = [elem["file"] for elem in output]
    self.assertEqual(files, [0] * records_per_file)
Example #14
    def testTFRecordDatasetIgnoreError(self):
        filenames = []
        for i in range(5):
            fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
            filenames.append(fn)
            writer = python_io.TFRecordWriter(fn)
            for _ in range(10):
                writer.write(b"record")
            writer.close()
            # Append corrupted data
            with open(fn, "a") as f:
                f.write("corrupted data")

        dataset = readers.TFRecordDataset(filenames).apply(
            error_ops.ignore_errors())
        get_next = self.getNext(dataset)

        # All of the files are present.
        for _ in filenames:
            for _ in range(10):
                self.assertEqual(b"record", self.evaluate(get_next()))
        with self.assertRaises(errors.OutOfRangeError):
            self.evaluate(get_next())