def from_csv_with_feature_spec(cls,
                               filepatterns,
                               feature_spec,
                               has_header=True,
                               column_names=None,
                               num_threads=1,
                               enqueue_size=None,
                               batch_size=32,
                               queue_capacity=None,
                               min_after_dequeue=None,
                               shuffle=True,
                               seed=None):
    """Build a `DataFrame` from CSV files described by a feature spec.

    When `has_header` is false, `column_names` must be supplied. When
    `has_header` is true and `column_names` is also supplied, the given
    `column_names` take precedence over the names found in the header.

    Every column other than "index" is looked up in `feature_spec`, so each
    such column is expected to have an entry there. Columns whose spec is a
    `VarLenFeature` are converted from dense to sparse after reading.

    Args:
      filepatterns: a list of file patterns that resolve to CSV files.
      feature_spec: a dict mapping column names to `FixedLenFeature` or
        `VarLenFeature`.
      has_header: whether or not the CSV files have headers.
      column_names: a list of names for the columns in the CSV files.
      num_threads: the number of readers that will work in parallel.
      enqueue_size: block size for each read operation.
      batch_size: desired batch size.
      queue_capacity: capacity of the queue that will store parsed lines.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Only used if `shuffle` is true.
      shuffle: whether records should be shuffled. Defaults to true.
      seed: passed to random shuffle operations. Only used if `shuffle` is
        true.

    Returns:
      A `DataFrame` that has columns corresponding to `feature_spec` and is
      filled with examples from `filepatterns`.

    Raises:
      ValueError: no files match `filepatterns`.
      ValueError: `feature_spec` contains the reserved name 'index'.
    """

    def _defaults_for(column_names):
        # One default per CSV column, in the order the reader asks for them.
        defaults = []
        for col in column_names:
            defaults.append(_get_default_value(feature_spec[col]))
        return defaults

    frame = cls._from_csv_base(filepatterns, _defaults_for, has_header,
                               column_names, num_threads, enqueue_size,
                               batch_size, queue_capacity, min_after_dequeue,
                               shuffle, seed)

    # Swap dense columns for sparse ones, in place, wherever the spec asks
    # for a variable-length feature.
    for col in frame.columns():
        # "index" is skipped before touching feature_spec: it is not looked
        # up there at all.
        if col == "index":
            continue
        spec = feature_spec[col]
        if not isinstance(spec, parsing_ops.VarLenFeature):
            continue
        # The column's default value is the one stripped out when sparsifying.
        fill_value = _get_default_value(spec)
        (sparse_column,) = sparsify.Sparsify(fill_value)(frame[col])
        frame[col] = sparse_column

    return frame
def _test_sparsify_densify(self, x, default_value):
    """Check that Sparsify followed by Densify reproduces the input.

    Args:
      x: numpy array fed through `NumpySource`; it is indexed and compared
        element-wise below, so it is expected to be one-dimensional.
      default_value: the value that Sparsify strips out and Densify fills
        back in. May be NaN for non-string arrays.
    """
    source = in_memory_source.NumpySource(x, batch_size=len(x))()

    # Element 1 of the source output is the series that gets sparsified.
    (sparse_series,) = sparsify.Sparsify(default_value)(source[1])
    (dense_series,) = densify.Densify(default_value)(sparse_series)

    # Both series are built against the same cache dict.
    build_cache = {}
    sparse_tensor = sparse_series.build(build_cache)
    dense_tensor = dense_series.build(build_cache)

    with self.test_session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        fetched = sess.run([sparse_tensor, dense_tensor])
        sparse_val, dense_val = fetched
        coord.request_stop()
        coord.join(threads)

    is_text = x.dtype.kind in ("S", "U")

    # Mask of the entries that should survive sparsification. NaN never
    # compares equal to itself, so a NaN default needs an explicit isnan
    # check (only attempted for non-string arrays).
    if not is_text and np.isnan(default_value):
        keep_mask = ~np.isnan(x)
    else:
        keep_mask = x != default_value

    x_values = x[keep_mask]
    # Sparse indices are compared as a single column of row positions.
    x_indexes = np.arange(len(x))[keep_mask].reshape(-1, 1)

    if is_text:
        # Python 2/3 compatibility: TensorFlow always returns bytes, so the
        # unicode expectations are converted to bytes before comparing.
        expected_x = [item.encode("utf-8") for item in x]
        expected_x_values = [item.encode("utf-8") for item in x_values]
    else:
        expected_x = x
        expected_x_values = x_values

    np.testing.assert_array_equal(len(x), sparse_val.shape[0])
    np.testing.assert_array_equal(expected_x_values, sparse_val.values)
    np.testing.assert_array_equal(x_indexes, sparse_val.indices)
    np.testing.assert_array_equal(expected_x, dense_val)