def from_examples(cls, filepatterns, batch_size, features, file_format=FileFormat.TFRECORD, shuffle=True, num_threads=1, queue_capacity=None, min_after_dequeue=None, seed=None):
  """Build a `DataFrame` whose columns are parsed `tensorflow.Example`s.

  Args:
    filepatterns: a list of file patterns containing `tensorflow.Example`s.
    batch_size: desired batch size.
    features: a dict mapping feature names to `VarLenFeature` or
      `FixedLenFeature`.
    file_format: a `FileFormat` indicating the format of the files in
      `filepatterns`.
    shuffle: whether records should be shuffled. Defaults to true.
    num_threads: the number of readers that will work in parallel.
    queue_capacity: capacity of the queue that will store parsed `Example`s
    min_after_dequeue: minimum number of elements that can be left by a
      dequeue operation. Only used if `shuffle` is true.
    seed: passed to random shuffle operations. Only used if `shuffle` is
      true.

  Returns:
    A `DataFrame` that has columns corresponding to `features` and is filled
    with `Example`s from `filepatterns`.

  Raises:
    ValueError: no files match `filepatterns`.
    ValueError: `features` contains the reserved name 'index'.
  """
  matched_files = _expand_file_names(filepatterns)
  if not matched_files:
    raise ValueError("No matching file names.")
  # 'index' is injected below as the record-identity column, so a user
  # feature of the same name would be silently clobbered — reject it early.
  if "index" in features:
    raise ValueError(
        "'index' is reserved and can not be used for a feature name.")

  # The reader class is looked up from the declared file format; calling the
  # constructed source yields (index, serialized-record) column pair.
  source = reader_source.ReaderSource(
      FILE_FORMAT_TO_READER_CLS[file_format],
      matched_files,
      batch_size=batch_size,
      queue_capacity=queue_capacity,
      shuffle=shuffle,
      min_after_dequeue=min_after_dequeue,
      num_threads=num_threads,
      seed=seed)
  index_column, record_column = source()

  # Parse the serialized records into one column per requested feature.
  parsed_columns = example_parser.ExampleParser(features)(record_column)
  columns = parsed_columns._asdict()
  columns["index"] = index_column

  result = cls()
  result.assign(**columns)
  return result
def testParseWithTupleDefinition(self):
  """Parsing with a tuple feature definition yields the expected columns."""
  # Parse the fixture example column; two features are configured, so two
  # output columns are expected.
  columns = example_parser.ExampleParser(self.features)(self.example_column)
  self.assertEqual(2, len(columns))

  # Build the tensors backing each column, sharing one build cache.
  build_cache = {}
  tensors = []
  for column in columns:
    tensors.append(column.build(build_cache))
  self.assertEqual(2, len(tensors))

  with self.test_session() as sess:
    sparse_strings, dense_ints = sess.run(tensors)
    # Shapes: the first output carries .values/.indices (sparse), the
    # second is a dense [2, 3] array.
    np.testing.assert_array_equal(sparse_strings.shape, np.array([2, 2]))
    np.testing.assert_array_equal(dense_ints.shape, np.array([2, 3]))
    np.testing.assert_array_equal(self.expected_string_values,
                                  sparse_strings.values)
    np.testing.assert_array_equal(self.expected_string_indices,
                                  sparse_strings.indices)
    np.testing.assert_array_equal(self.expected_int_feature, dense_ints)
def from_examples(cls, filepatterns, features, reader_cls=io_ops.TFRecordReader, num_epochs=None, num_threads=1, enqueue_size=None, batch_size=32, queue_capacity=None, min_after_dequeue=None, shuffle=True, seed=None):
  """Build a `DataFrame` whose columns are parsed `tensorflow.Example`s.

  Args:
    filepatterns: a list of file patterns containing `tensorflow.Example`s.
    features: a dict mapping feature names to `VarLenFeature` or
      `FixedLenFeature`.
    reader_cls: a subclass of `tensorflow.ReaderBase` that will be used to
      read the `Example`s.
    num_epochs: the number of times that the reader should loop through all
      the file names. If set to `None`, then the reader will continue
      indefinitely.
    num_threads: the number of readers that will work in parallel.
    enqueue_size: block size for each read operation.
    batch_size: desired batch size.
    queue_capacity: capacity of the queue that will store parsed `Example`s
    min_after_dequeue: minimum number of elements that can be left by a
      dequeue operation. Only used if `shuffle` is true.
    shuffle: whether records should be shuffled. Defaults to true.
    seed: passed to random shuffle operations. Only used if `shuffle` is
      true.

  Returns:
    A `DataFrame` that has columns corresponding to `features` and is filled
    with `Example`s from `filepatterns`.

  Raises:
    ValueError: no files match `filepatterns`.
    ValueError: `features` contains the reserved name 'index'.
  """
  matched_files = _expand_file_names(filepatterns)
  if not matched_files:
    raise ValueError("No matching file names.")
  # 'index' is injected below as the record-identity column, so a user
  # feature of the same name would be silently clobbered — reject it early.
  if "index" in features:
    raise ValueError(
        "'index' is reserved and can not be used for a feature name.")

  # Calling the constructed source yields the (index, serialized-record)
  # column pair.
  source = reader_source.ReaderSource(
      reader_cls,
      matched_files,
      enqueue_size=enqueue_size,
      batch_size=batch_size,
      num_epochs=num_epochs,
      queue_capacity=queue_capacity,
      shuffle=shuffle,
      min_after_dequeue=min_after_dequeue,
      num_threads=num_threads,
      seed=seed)
  index_column, record_column = source()

  # Parse the serialized records into one column per requested feature.
  parsed_columns = example_parser.ExampleParser(features)(record_column)
  columns = parsed_columns._asdict()
  columns["index"] = index_column

  result = cls()
  result.assign(**columns)
  return result