def testYesShuffle(self):
    """A shuffling ReaderSource must still yield each record at most once.

    Runs 500 dequeues against a 10-thread shuffled IdentityReader source and
    checks that (a) index and value always agree and (b) no value repeats.
    """
    source = rs.ReaderSource(reader_cls=tf.IdentityReader,
                             work_units=self.work_units,
                             batch_size=1,
                             shuffle=True,
                             num_threads=10,
                             seed=1234)
    index_column, value_column = source()

    # Build both columns through one shared cache so they come from the
    # same underlying reader subgraph.
    cache = {}
    index_tensor = index_column.build(cache)
    value_tensor = value_column.build(cache)
    self.assertEqual([1], index_tensor.get_shape().as_list())
    self.assertEqual([1], value_tensor.get_shape().as_list())

    seen = set()
    with self.test_session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        runner_threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        for _ in range(500):
            index, value = sess.run([index_tensor, value_tensor])
            # IdentityReader echoes its key as the value, so the pair
            # must match even after shuffling.
            self.assertEqual(index, value)
            self.assertNotIn(int(value[0]), seen)
            seen.add(int(value[0]))
        coord.request_stop()
        coord.join(runner_threads)
def from_examples(cls, filepatterns, batch_size, features, file_format=FileFormat.TFRECORD, shuffle=True, num_threads=1, queue_capacity=None, min_after_dequeue=None, seed=None):
    """Build a `DataFrame` whose columns are parsed `tensorflow.Example`s.

    Args:
      filepatterns: a list of file patterns containing `tensorflow.Example`s.
      batch_size: desired batch size.
      features: a dict mapping feature names to `VarLenFeature` or
        `FixedLenFeature`.
      file_format: a `FileFormat` describing the files in `filepatterns`.
      shuffle: whether records should be shuffled. Defaults to true.
      num_threads: the number of readers working in parallel.
      queue_capacity: capacity of the queue holding parsed `Example`s.
      min_after_dequeue: minimum number of elements a dequeue may leave
        behind. Only used if `shuffle` is true.
      seed: passed to random shuffle operations. Only used if `shuffle`
        is true.

    Returns:
      A `DataFrame` with one column per entry in `features`, plus a
      reserved "index" column, filled from `filepatterns`.

    Raises:
      ValueError: no files match `filepatterns`.
      ValueError: `features` contains the reserved name 'index'.
    """
    # Guard clauses: fail fast before constructing any graph nodes.
    file_names = _expand_file_names(filepatterns)
    if not file_names:
        raise ValueError("No matching file names.")
    if "index" in features:
        raise ValueError(
            "'index' is reserved and can not be used for a feature name.")

    source = reader_source.ReaderSource(
        FILE_FORMAT_TO_READER_CLS[file_format],
        file_names,
        batch_size=batch_size,
        queue_capacity=queue_capacity,
        shuffle=shuffle,
        min_after_dequeue=min_after_dequeue,
        num_threads=num_threads,
        seed=seed)
    index, record = source()

    # Parse the raw records and attach the reserved index column.
    parsed_columns = example_parser.ExampleParser(features)(record)._asdict()
    parsed_columns["index"] = index

    dataframe = cls()
    dataframe.assign(**parsed_columns)
    return dataframe
def testNoShuffle(self):
    """An unshuffled single-threaded ReaderSource yields records in order.

    Dequeues 50 batches and checks that both the index and value tensors
    enumerate 0..49 in sequence (IdentityReader echoes its key as value).
    """
    source = rs.ReaderSource(reader_cls=tf.IdentityReader,
                             work_units=self.work_units,
                             batch_size=1,
                             shuffle=False,
                             num_threads=1)
    index_column, value_column = source()

    # Fix: build both columns through one shared cache, consistent with
    # testYesShuffle. Building each column without a cache can create
    # two independent reader subgraphs (and queue runners), in which case
    # index and value would not be dequeued in lockstep.
    cache = {}
    index_tensor = index_column.build(cache)
    value_tensor = value_column.build(cache)
    self.assertEqual([1], index_tensor.get_shape().as_list())
    self.assertEqual([1], value_tensor.get_shape().as_list())

    with self.test_session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        for i in range(50):
            index, value = sess.run([index_tensor, value_tensor])
            self.assertEqual(i, int(index[0]))
            self.assertEqual(i, int(value[0]))
        coord.request_stop()
        coord.join(threads)
def from_examples(cls, filepatterns, features, reader_cls=io_ops.TFRecordReader, num_epochs=None, num_threads=1, enqueue_size=None, batch_size=32, queue_capacity=None, min_after_dequeue=None, shuffle=True, seed=None):
    """Build a `DataFrame` whose columns are parsed `tensorflow.Example`s.

    Args:
      filepatterns: a list of file patterns containing `tensorflow.Example`s.
      features: a dict mapping feature names to `VarLenFeature` or
        `FixedLenFeature`.
      reader_cls: a subclass of `tensorflow.ReaderBase` used to read the
        `Example`s.
      num_epochs: how many times the reader loops over all file names;
        `None` means loop indefinitely.
      num_threads: the number of readers working in parallel.
      enqueue_size: block size for each read operation.
      batch_size: desired batch size.
      queue_capacity: capacity of the queue holding parsed `Example`s.
      min_after_dequeue: minimum number of elements a dequeue may leave
        behind. Only used if `shuffle` is true.
      shuffle: whether records should be shuffled. Defaults to true.
      seed: passed to random shuffle operations. Only used if `shuffle`
        is true.

    Returns:
      A `DataFrame` with one column per entry in `features`, plus a
      reserved "index" column, filled from `filepatterns`.

    Raises:
      ValueError: no files match `filepatterns`.
      ValueError: `features` contains the reserved name 'index'.
    """
    # Guard clauses: fail fast before constructing any graph nodes.
    file_names = _expand_file_names(filepatterns)
    if not file_names:
        raise ValueError("No matching file names.")
    if "index" in features:
        raise ValueError(
            "'index' is reserved and can not be used for a feature name.")

    source = reader_source.ReaderSource(
        reader_cls,
        file_names,
        enqueue_size=enqueue_size,
        batch_size=batch_size,
        num_epochs=num_epochs,
        queue_capacity=queue_capacity,
        shuffle=shuffle,
        min_after_dequeue=min_after_dequeue,
        num_threads=num_threads,
        seed=seed)
    index, record = source()

    # Parse the raw records and attach the reserved index column.
    parsed_columns = example_parser.ExampleParser(features)(record)._asdict()
    parsed_columns["index"] = index

    dataframe = cls()
    dataframe.assign(**parsed_columns)
    return dataframe