Ejemplo n.º 1
0
 def testPandasFeeding(self):
     """Verify that `PandasSource` yields the DataFrame's rows batch by batch.

     A 32-row `pandas.DataFrame` with columns "a" and "b" and an index of
     100..131 is wrapped in a `PandasSource` with a batch size of 3. The
     test then fetches 1000 batches and expects each one to hold the next
     three rows in order, wrapping around to the first row once the end of
     the frame is reached.

     Does nothing when pandas is unavailable.
     """
     if not HAS_PANDAS:
         # pandas is optional; without it there is nothing to exercise.
         return
     batch_size = 3
     iterations = 1000
     index = np.arange(100, 132)
     a = np.arange(32)
     b = np.arange(32, 64)
     dataframe = pd.DataFrame({"a": a, "b": b}, index=index)
     # Loop-invariant: number of rows used for the wrap-around below.
     num_rows = dataframe.shape[0]
     pandas_source = in_memory_source.PandasSource(dataframe, batch_size)
     pandas_columns = pandas_source()
     cache = {}
     with tf.Graph().as_default():
         pandas_tensors = [col.build(cache) for col in pandas_columns]
         with tf.Session() as sess:
             coord = tf.train.Coordinator()
             threads = tf.train.start_queue_runners(sess=sess, coord=coord)
             try:
                 for i in range(iterations):
                     # Row positions of the i-th batch, wrapping around at the
                     # end of the frame.
                     start = batch_size * i
                     indices = np.arange(start, start + batch_size) % num_rows
                     expected_df_indices = dataframe.index[indices]
                     expected_rows = dataframe.iloc[indices]
                     actual_value = sess.run(pandas_tensors)
                     # The first fetched value is compared with the index; the
                     # rest with the columns, in `dataframe.columns` order.
                     np.testing.assert_array_equal(expected_df_indices,
                                                   actual_value[0])
                     for col_num, col in enumerate(dataframe.columns):
                         np.testing.assert_array_equal(
                             expected_rows[col].values,
                             actual_value[col_num + 1])
             finally:
                 # Stop and join the queue-runner threads even when an
                 # assertion above fails, so they do not outlive the test.
                 coord.request_stop()
                 coord.join(threads)
Ejemplo n.º 2
0
  def from_pandas(cls,
                  pandas_dataframe,
                  batch_size=None,
                  shuffle=True,
                  queue_capacity=None,
                  min_after_dequeue=None,
                  seed=None):
    """Build a `tf.learn.DataFrame` backed by a `pandas.DataFrame`.

    Args:
      pandas_dataframe: the `pandas.DataFrame` to use as the data source.
      batch_size: the desired batch size.
      shuffle: if true (the default), records are shuffled.
      queue_capacity: capacity of the queue that will store parsed `Example`s.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Ignored unless `shuffle` is true.
      seed: forwarded to the random shuffle operations. Ignored unless
        `shuffle` is true.

    Returns:
      A `tf.learn.DataFrame` holding batches drawn from `pandas_dataframe`.
    """
    # Arguments are handed over positionally, with `queue_capacity` ahead of
    # `shuffle` (unlike this method's own parameter order).
    # NOTE(review): confirm this matches the `PandasSource` signature.
    source = in_memory_source.PandasSource(
        pandas_dataframe, batch_size, queue_capacity, shuffle,
        min_after_dequeue, seed)
    frame = cls()
    # Calling the source returns an object exposing `_asdict()`; its entries
    # are passed to `assign` as keyword arguments.
    named_columns = source()._asdict()
    frame.assign(**named_columns)
    return frame
Ejemplo n.º 3
0
  def from_pandas(cls,
                  pandas_dataframe,
                  num_threads=None,
                  enqueue_size=None,
                  batch_size=None,
                  queue_capacity=None,
                  min_after_dequeue=None,
                  shuffle=True,
                  seed=None,
                  data_name="pandas_data"):
    """Build a `tf.learn.DataFrame` backed by a `pandas.DataFrame`.

    Args:
      pandas_dataframe: the `pandas.DataFrame` to use as the data source.
      num_threads: number of threads used for enqueueing.
      enqueue_size: number of rows enqueued per step.
      batch_size: the desired batch size.
      queue_capacity: capacity of the queue that will store parsed `Example`s.
      min_after_dequeue: minimum number of elements that can be left by a
        dequeue operation. Ignored unless `shuffle` is true.
      shuffle: if true (the default), records are shuffled.
      seed: forwarded to the random shuffle operations. Ignored unless
        `shuffle` is true.
      data_name: a scope name identifying the data.

    Returns:
      A `tf.learn.DataFrame` holding batches drawn from `pandas_dataframe`.
    """
    # Every option is forwarded to `PandasSource` by keyword, unchanged.
    source_options = {
        "num_threads": num_threads,
        "enqueue_size": enqueue_size,
        "batch_size": batch_size,
        "queue_capacity": queue_capacity,
        "shuffle": shuffle,
        "min_after_dequeue": min_after_dequeue,
        "seed": seed,
        "data_name": data_name,
    }
    source = in_memory_source.PandasSource(pandas_dataframe, **source_options)
    frame = cls()
    # Calling the source returns an object exposing `_asdict()`; its entries
    # are passed to `assign` as keyword arguments.
    named_columns = source()._asdict()
    frame.assign(**named_columns)
    return frame