Example #1
  def from_csv_with_feature_spec(cls,
                                 filepatterns,
                                 feature_spec,
                                 has_header=True,
                                 column_names=None,
                                 num_threads=1,
                                 enqueue_size=None,
                                 batch_size=32,
                                 queue_capacity=None,
                                 min_after_dequeue=None,
                                 shuffle=True,
                                 seed=None):
    """Create a `DataFrame` from CSV files, given a feature_spec.

    If `has_header` is false, then `column_names` must be specified. If
    `has_header` is true and `column_names` are specified, then `column_names`
    overrides the names in the header.

    Args:
      filepatterns: a list of file patterns that resolve to CSV files.
      feature_spec: a dict mapping column names to `FixedLenFeature` or
          `VarLenFeature`.
      has_header: whether or not the CSV files have headers.
      column_names: a list of names for the columns in the CSV files.
      num_threads: the number of readers that will work in parallel.
      enqueue_size: block size for each read operation.
      batch_size: desired batch size.
      queue_capacity: capacity of the queue that will store parsed lines.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Only used if `shuffle` is true.
      shuffle: whether records should be shuffled. Defaults to true.
      seed: passed to random shuffle operations. Only used if `shuffle` is true.

    Returns:
      A `DataFrame` that has columns corresponding to `feature_spec` and is
      filled with examples from `filepatterns`.

    Raises:
      ValueError: no files match `filepatterns`.
      ValueError: `feature_spec` contains the reserved name 'index'.
    """

    def get_default_values(column_names):
      return [_get_default_value(feature_spec[name]) for name in column_names]

    dataframe = cls._from_csv_base(filepatterns, get_default_values, has_header,
                                   column_names, num_threads,
                                   enqueue_size, batch_size, queue_capacity,
                                   min_after_dequeue, shuffle, seed)

    # Replace dense columns in place with sparse ones for any column whose
    # feature_spec entry is a `VarLenFeature`.
    for name in dataframe.columns():
      if name != "index" and isinstance(feature_spec[name],
                                        parsing_ops.VarLenFeature):
        strip_value = _get_default_value(feature_spec[name])
        (dataframe[name],) = sparsify.Sparsify(strip_value)(dataframe[name])

    return dataframe
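
A minimal usage sketch for the method above (hypothetical file pattern and column names; it assumes the classmethod is exposed on the contrib-era `TensorFlowDataFrame` class and that `tf.FixedLenFeature`/`tf.VarLenFeature` are the feature-spec types, as the docstring suggests):

import tensorflow as tf
from tensorflow.contrib.learn.python.learn.dataframe import tensorflow_dataframe

# Dense columns are declared with FixedLenFeature (whose default_value also
# serves as the CSV default); VarLenFeature columns are sparsified in place.
feature_spec = {
    "age": tf.FixedLenFeature(shape=[], dtype=tf.int64, default_value=0),
    "income": tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=0.0),
    "tags": tf.VarLenFeature(dtype=tf.string),
}

df = tensorflow_dataframe.TensorFlowDataFrame.from_csv_with_feature_spec(
    ["data/people-*.csv"],  # hypothetical file pattern
    feature_spec=feature_spec,
    has_header=True,
    batch_size=32,
    shuffle=True,
    seed=42)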
Example #2
def _test_sparsify_densify(self, x, default_value):
    """Test roundtrip via Sparsify and Densify."""

    numpy_source = in_memory_source.NumpySource(x, batch_size=len(x))()

    (sparse_series,) = sparsify.Sparsify(default_value)(numpy_source[1])
    (dense_series,) = densify.Densify(default_value)(sparse_series)

    # Build both series into tensors, sharing a cache so the underlying ops
    # are constructed only once.
    cache = {}
    sparse_tensor = sparse_series.build(cache)
    dense_tensor = dense_series.build(cache)

    with self.test_session() as sess:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        sparse_val, dense_val = sess.run([sparse_tensor, dense_tensor])

        coord.request_stop()
        coord.join(threads)

    # Compute the expected sparse values and indices by stripping out the
    # default value; NaN requires np.isnan rather than an equality check.
    if x.dtype.kind not in ["S", "U"] and np.isnan(default_value):
        x_values = x[~np.isnan(x)]
        x_indexes = np.arange(len(x))[~np.isnan(x)].T.reshape(-1, 1)
    else:
        x_values = x[x != default_value]
        x_indexes = np.arange(len(x))[x != default_value].T.reshape(-1, 1)

    if x.dtype.kind in ["S", "U"]:
        # Python 2/3 compatibility
        # TensorFlow always returns bytes, so we just convert the unicode
        # expectations to bytes also before comparing.
        expected_x = [item.encode("utf-8") for item in x]
        expected_x_values = [item.encode("utf-8") for item in x_values]
    else:
        expected_x = x
        expected_x_values = x_values

    np.testing.assert_array_equal(len(x), sparse_val.shape[0])
    np.testing.assert_array_equal(expected_x_values, sparse_val.values)
    np.testing.assert_array_equal(x_indexes, sparse_val.indices)
    np.testing.assert_array_equal(expected_x, dense_val)
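
For context, this helper is presumably invoked from concrete test methods on the same `tf.test.TestCase` subclass; a hedged sketch of such callers (class and method names are assumptions, not taken from the source):

import numpy as np
import tensorflow as tf


class SparsifyDensifyTest(tf.test.TestCase):

    def testSparsifyDensifyWithNaNDefault(self):
        # NaN as the default value exercises the np.isnan branch of the helper.
        self._test_sparsify_densify(np.array([1.0, np.nan, 2.0, np.nan, 3.0]), np.nan)

    def testSparsifyDensifyWithZeroDefault(self):
        # A concrete numeric default exercises the equality-comparison branch.
        self._test_sparsify_densify(np.array([1.0, 0.0, 2.0, 0.0]), 0.0)

    def testSparsifyDensifyWithStrings(self):
        # String input exercises the bytes-vs-unicode comparison path.
        self._test_sparsify_densify(np.array(["a", "", "b"]), "")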