Beispiel #1
0
  def setUp(self):
    """Builds a fresh graph with feed placeholders and a padding input queue.

    Creates three placeholders (scalar int32, unknown-length int64 vector,
    length-3 string vector), a single-element `SparseTensor`, and a
    `PaddingFIFOQueue` that the placeholders are enqueued into.  The
    dequeued tensors, the enqueue op and the close op are stored on the test
    instance; the session and thread handles start out as `None`.
    """
    ops.reset_default_graph()

    # Bookkeeping that does not touch the graph.
    self._sess = None
    self._threads = None
    self._coord = coordinator.Coordinator()

    # Feed placeholders, one per queue component.
    self.scalar_int_feed = array_ops.placeholder(dtypes_lib.int32, ())
    self.unk_int64_feed = array_ops.placeholder(dtypes_lib.int64, (None,))
    self.vec3_str_feed = array_ops.placeholder(dtypes_lib.string, (3,))
    self.sparse_c = sparse_tensor.SparseTensor(
        indices=[[0]], values=[1.0], dense_shape=[1])

    # The capacity is deliberately huge so that the main thread can feed
    # every input without ever blocking on a full queue.
    component_dtypes = [dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.string]
    component_shapes = [(), (None,), (3,)]
    feed_queue = data_flow_ops.PaddingFIFOQueue(
        5000, dtypes=component_dtypes, shapes=component_shapes)

    feeds = (self.scalar_int_feed, self.unk_int64_feed, self.vec3_str_feed)
    self._input_enqueue_op = feed_queue.enqueue(feeds)

    dequeued = feed_queue.dequeue()
    (self.scalar_int, self.unk_int64, self.vec3_str) = dequeued

    self._close_op = feed_queue.close()
Beispiel #2
0
    def testDynamicPad(self):
        """Checks that `prefetch_queue` with `dynamic_pad=True` pads batches.

        Three int32 matrices with 2, 3 and 1 rows are pushed through a
        `PaddingFIFOQueue`, batched with `input_lib.batch`, and then passed
        through `prefetch_queue`.  The dequeued batch must equal the inputs
        zero-padded to the longest example, and a second dequeue must raise
        `OutOfRangeError`.
        """
        with self.cached_session() as sess:
            # Build the three examples; `set_shape(var_shape)` is applied to
            # each so they share a compatible [None, 2] shape spec.
            var_shape = [None, 2]
            rows_per_example = (
                [[1, 2], [3, 4]],
                [[5, 6], [7, 8], [9, 10]],
                [[11, 12]],
            )
            padded_inputs = []
            for rows in rows_per_example:
                example = constant_op.constant(rows)
                example.set_shape(var_shape)
                padded_inputs.append(example)
            batch_size = len(padded_inputs)

            # Counter that stops after `batch_size` increments; it is batched
            # together with the dequeued examples.
            start = constant_op.constant(0, dtype=dtypes.int64)
            example_count = variables.Variable(start)
            counter = example_count.count_up_to(batch_size)

            # Source queue holding the variable-length examples.
            source_queue = data_flow_ops.PaddingFIFOQueue(
                capacity=10, dtypes=[dtypes.int32], shapes=[var_shape])
            for example in padded_inputs:
                source_queue.enqueue([example]).run()

            # Batch the dequeued examples with batch().
            expected_static_shape = [batch_size, None, 2]
            batches = input_lib.batch(
                [source_queue.dequeue(), counter],
                batch_size=batch_size,
                num_threads=1,
                dynamic_pad=True)
            self.assertEqual(expected_static_shape,
                             batches[0].shape.as_list())

            # Route the batches through prefetch_queue, again with padding.
            prefetcher = prefetch_queue.prefetch_queue(batches,
                                                       dynamic_pad=True)
            batches = prefetcher.dequeue()
            self.assertEqual(expected_static_shape,
                             batches[0].shape.as_list())

            variables.global_variables_initializer().run()
            runner_threads = queue_runner_impl.start_queue_runners()

            values, _ = sess.run(batches)
            # The three [None, 2] examples are padded to a [3, 3, 2] batch,
            # where the middle 3 is the longest example in the batch.
            expected = np.array([[[1, 2], [3, 4], [0, 0]],
                                 [[5, 6], [7, 8], [9, 10]],
                                 [[11, 12], [0, 0], [0, 0]]])
            self.assertTrue(np.array_equal(expected, values))

            # The counter is exhausted, so no further batch can be produced.
            with self.assertRaises(errors_impl.OutOfRangeError):
                sess.run(batches)
            for runner_thread in runner_threads:
                runner_thread.join()
def bucket(tensors,
           which_bucket,
           batch_size,
           num_buckets,
           num_threads=1,
           capacity=32,
           shapes=None,
           dynamic_pad=False,
           allow_smaller_final_batch=False,
           keep_input=None,
           shared_name=None,
           name=None):
    """Lazy bucketing of input tensors according to `which_bucket`.

    The argument `tensors` can be a list or a dictionary of tensors.
    The value returned by the function will be of the same type
    as `tensors`.

    The tensors entering this function are put into the bucket given by
    `which_bucket`.  Each bucket has its own queue.  When a bucket contains
    `batch_size` elements, this minibatch is pushed onto a top queue.  The
    tensors returned from this function are the result of dequeueing the
    next minibatch from this top queue.

    This function is implemented using several queues. A `QueueRunner` for
    the queues is added to the current `Graph`'s `QUEUE_RUNNER` collection.

    As the returned tensors are the result of a dequeue operation, evaluating
    them will throw a `tf.errors.OutOfRangeError` when the input queue is
    exhausted.  If these tensors are feeding another input queue, its queue
    runner will catch this exception, however, if they are used in your main
    thread you are responsible for catching this yourself.

    *N.B.:* If `dynamic_pad` is `False`, you must ensure that either
    (i) the `shapes` argument is passed, or (ii) all of the tensors in
    `tensors` must have fully-defined shapes. `ValueError` will be
    raised if neither of these conditions holds.

    If `dynamic_pad` is `True`, it is sufficient that the *rank* of the
    tensors is known, but individual dimensions may have shape `None`.
    In this case, for each enqueue the dimensions with value `None`
    may have a variable length; upon dequeue, the output tensors will be
    padded on the right to the maximum shape of the tensors in the current
    minibatch.  For numbers, this padding takes value 0.  For strings, this
    padding is the empty string.  See `PaddingFIFOQueue` for more info.

    If `allow_smaller_final_batch` is `True`, a smaller batch value than
    `batch_size` is returned when the queues are closed and there are not
    enough elements to fill the batch, otherwise the pending elements are
    discarded.  In addition, all output tensors' static shapes, as accessed
    via the `get_shape()` method will have a 0th `Dimension` value of `None`,
    and operations that depend on fixed batch_size would fail.

    Args:
      tensors: The list or dictionary of tensors, representing a single
        element, to bucket.  Nested lists are not supported.
      which_bucket: An `int32` scalar Tensor taking a value in
        `[0, num_buckets)`.
      batch_size: The new batch size pulled from the queue
        (python int or int32 scalar).
      num_buckets: A python integer, the number of buckets.
      num_threads: An integer.  The number of threads enqueuing `tensors`.
      capacity: An integer. The maximum number of minibatches in the top
        queue, and also the maximum number of elements within each bucket.
      shapes: (Optional) The shapes for each example.  Defaults to the
        inferred shapes for `tensors`.
      dynamic_pad: Boolean.  Allow variable dimensions in input shapes.
        The given dimensions are padded upon dequeue so that tensors within a
        batch have the same shapes.
      allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the
        final batches to be smaller if there are insufficient items left in
        the queues.
      keep_input: (Optional).  A `bool` scalar Tensor.  If provided, this
        tensor controls whether the input is added to the queue or not.  If
        it evaluates `True`, then `tensors` are added to the bucket;
        otherwise they are dropped.  This tensor essentially acts as a
        filtering mechanism.  The default behavior is to assume
        `keep_input=True`.
      shared_name: (Optional). If set, the queues will be shared under the
        given name across multiple sessions.
      name: (Optional) A name for the operations.

    Returns:
      A tuple `(bucket, outputs)` where `bucket` is
      a `int32` scalar tensor and `outputs` is a list or
      dictionary of batched outputs corresponding to elements of `tensors`.
      Every step will receive a new bucket of outputs.

    Raises:
      ValueError: If the `shapes` are not specified, and cannot be
        inferred from the elements of `tensors`.
    """
    tensor_list = _as_tensor_list(tensors)
    with ops.name_scope(name, "bucket", tensor_list) as name:
        tensor_list = _validate_bucket(tensor_list)
        (tensor_list, sparse_info) = _store_sparse_tensors(tensor_list,
                                                           enqueue_many=False)

        # Round-trip batch_size to a tensor, and possibly back
        batch_size = ops.convert_to_tensor(batch_size,
                                           dtype=dtypes.int32,
                                           name="batch_size")
        static_batch_size = tensor_util.constant_value(batch_size)
        batch_size = (static_batch_size
                      if static_batch_size is not None else batch_size)

        types = _dtypes([tensor_list])
        shapes = _shapes([tensor_list], shapes, enqueue_many=False)

        which_bucket = ops.convert_to_tensor(which_bucket,
                                             dtype=dtypes.int32,
                                             name="which_bucket")

        # One queue per bucket; each gets its own shared name when sharing is
        # requested so that the buckets do not collide with each other.
        queue_creator = _which_queue(dynamic_pad)
        bucket_queues = []
        for i in range(num_buckets):
            shared_name_i = ("%s_%d" % (shared_name, i)
                             if shared_name is not None else None)
            bucket_queues.append(
                queue_creator(capacity=capacity,
                              dtypes=types,
                              shapes=shapes,
                              shared_name=shared_name_i,
                              name="bucket_queue_%d" % i))

        # With allow_smaller_final_batch the leading (batch) dimension of the
        # top-queue shapes is left unknown.
        maybe_static_batch_size = (None if allow_smaller_final_batch else
                                   static_batch_size)

        bucket_shapes = [
            tensor_shape.vector(maybe_static_batch_size).concatenate(s)
            for s in bucket_queues[0].shapes
        ]
        # top_queue is a PaddingFIFOQueue even if the bucket queues are regular FIFO
        # queues because if we use allow_smaller_final_batch, shapes will
        # contain Nones in their first entry; as a result, a regular
        # FIFOQueue would die when being passed shapes that are not fully defined.
        top_queue = data_flow_ops.PaddingFIFOQueue(
            capacity=capacity,
            dtypes=[dtypes.int32] + types,
            shapes=[tensor_shape.scalar()] + bucket_shapes,
            shared_name=shared_name,
            name="top_queue")

        def enqueue_which():
            """Enqueues `tensor_list` into the bucket chosen by `which_bucket`."""
            def enqueue_single(i):
                return bucket_queues[i].enqueue(tensor_list)

            # One conditional enqueue per bucket; functools.partial binds the
            # bucket index at construction time.
            enqueues = [
                control_flow_ops.cond(math_ops.equal(which_bucket, i),
                                      functools.partial(enqueue_single, i),
                                      control_flow_ops.no_op)
                for i in range(num_buckets)
            ]
            return control_flow_ops.group(*enqueues, name="group_enqueues")

        if keep_input is not None:
            # TODO(ebrevdo): Expand keep_input param to core training
            # methods, and pipe through to _store_sparse_tensors; so
            # that expensive serialization is guarded by keep_input.
            maybe_enqueue = control_flow_ops.cond(keep_input, enqueue_which,
                                                  control_flow_ops.no_op)
        else:
            maybe_enqueue = enqueue_which()

        # The same enqueue op is repeated once per requested thread.
        bucket_enqueue_ops = [maybe_enqueue] * num_threads

        if allow_smaller_final_batch:
            which_dequeue = lambda q: q.dequeue_up_to
        else:
            which_dequeue = lambda q: q.dequeue_many

        # For every bucket: read a batch and push it, tagged with the bucket
        # index, onto the top queue.
        enqueues_to_top = [
            top_queue.enqueue(
                [constant_op.constant(i)] +
                which_dequeue(q)(batch_size, name="read_bucket_%d" % i),
                name="enqueue_from_bucket_%d" % i)
            for i, q in enumerate(bucket_queues)
        ]

        for i, q in enumerate(bucket_queues):
            queue_runner.add_queue_runner(
                queue_runner.QueueRunner(
                    q, [enqueues_to_top[i]],
                    queue_closed_exception_types=(errors.OutOfRangeError,
                                                  errors.CancelledError)))
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(
                top_queue,
                bucket_enqueue_ops,
                queue_closed_exception_types=(errors.OutOfRangeError,
                                              errors.CancelledError)))

        for q in bucket_queues:
            # Report the size of this bucket's own queue.  Using the top
            # queue's size here would make every per-bucket summary show the
            # same value.
            summary.scalar("bucket/%s/size" % q.name,
                           math_ops.cast(q.size(), dtypes.float32))
        summary.scalar(
            "bucket/%s/fraction_of_%d_full" % (top_queue.name, capacity),
            math_ops.cast(top_queue.size(), dtypes.float32) * (1. / capacity))

        # The first dequeued component is the bucket index; the rest are the
        # batched tensors.
        dequeued = top_queue.dequeue(name="dequeue_top")
        which_bucket_dequeued = dequeued[0]
        dequeued = dequeued[1:]
        dequeued = _restore_sparse_tensors(dequeued, sparse_info)
        return (which_bucket_dequeued, _as_original_type(tensors, dequeued))
def _enqueue_data(data,
                  capacity,
                  shuffle=False,
                  min_after_dequeue=None,
                  num_threads=1,
                  seed=None,
                  name="enqueue_input",
                  enqueue_size=1,
                  num_epochs=None,
                  pad_value=None):
    """Builds a queue whose elements are fed from in-memory data.

    The queue element layout depends on the kind of `data`:

    * numpy `ndarray`: an int64 scalar followed by one row of the array.
    * `OrderedDict` of numpy arrays: an int64 scalar followed by one row of
      each column, in dictionary order.
    * generator function: one component per key of the first yielded `dict`,
      ordered by sorted key.
    * pandas `DataFrame` (only when pandas is available): the index value
      followed by one scalar per column.

    A feeding queue runner with `num_threads` enqueue ops is registered, and
    a scalar summary describing how full the queue is gets added.

    Args:
      data: a numpy `ndarray`, an `OrderedDict` of numpy arrays, a generator
        function yielding `dict`s of numpy arrays, or a pandas `DataFrame`.
      capacity: the capacity of the queue.
      shuffle: whether to use a `RandomShuffleQueue` and random start points.
      min_after_dequeue: minimum number of elements that remain in the queue
        after a dequeue. Only used when `shuffle` is true; defaults to
        `capacity` / 4 when not given.
      num_threads: number of enqueue ops / feed functions to create.
      seed: seed for the shuffle queue; thread `i` is fed `(i + 1) * seed`.
      name: name scope wrapping all created ops.
      enqueue_size: number of rows passed to each feed function per step.
      num_epochs: epoch limit forwarded to each feed function, if provided.
      pad_value: padding value; when given, a `PaddingFIFOQueue` is used and
        the value is forwarded to the feed function.

    Returns:
      The created queue.

    Raises:
      TypeError: `data` is none of the supported kinds.
      NotImplementedError: `pad_value` is given for non-generator data.
      NotImplementedError: `pad_value` is given together with `shuffle`.
    """
    with ops.name_scope(name):
        # Derive component dtypes/shapes and pick the matching feed function.
        if isinstance(data, np.ndarray):
            types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
            queue_shapes = [(), data.shape[1:]]
            get_feed_fn = _ArrayFeedFn
        elif isinstance(data, collections.OrderedDict):
            columns = list(data.values())
            types = [dtypes.int64]
            types.extend(dtypes.as_dtype(column.dtype) for column in columns)
            queue_shapes = [()]
            queue_shapes.extend(column.shape[1:] for column in columns)
            get_feed_fn = _OrderedDictNumpyFeedFn
        elif isinstance(data, tp.FunctionType):
            # Peek at the first yielded element to learn the layout.
            first_element = six.next(data())
            ordered_keys = sorted(first_element.keys())
            ordered_values = [first_element[key] for key in ordered_keys]
            types = [dtypes.as_dtype(value.dtype) for value in ordered_values]
            queue_shapes = [value.shape for value in ordered_values]
            get_feed_fn = _GeneratorFeedFn
        elif HAS_PANDAS and isinstance(data, pd.DataFrame):
            frame_dtypes = [data.index.dtype] + list(data.dtypes)
            types = [dtypes.as_dtype(dt) for dt in frame_dtypes]
            queue_shapes = [()] * len(types)
            get_feed_fn = _PandasFeedFn
        else:
            raise TypeError(
                "data must be either a numpy array or pandas DataFrame if pandas is "
                "installed; got {}".format(type(data).__name__))

        # Padding is only supported for unshuffled generator input.
        pad_data = pad_value is not None
        if pad_data:
            if get_feed_fn is not _GeneratorFeedFn:
                raise NotImplementedError(
                    "padding is only available with generator usage")
            if shuffle:
                raise NotImplementedError(
                    "padding and shuffling data at the same time is not implemented"
                )

        # TODO(jamieas): TensorBoard warnings for all warnings below once available.

        if num_threads > 1:
            if num_epochs is not None:
                logging.warning(
                    "enqueue_data was called with num_epochs and num_threads > 1. "
                    "num_epochs is applied per thread, so this will produce more "
                    "epochs than you probably intend. "
                    "If you want to limit epochs, use one thread.")
                if shuffle:
                    logging.warning(
                        "enqueue_data was called with shuffle=True, num_threads > 1, and "
                        "num_epochs. This will create multiple threads, all reading the "
                        "array/dataframe in order adding to the same shuffling queue; the "
                        "results will likely not be sufficiently shuffled.")
            if not shuffle:
                logging.warning(
                    "enqueue_data was called with shuffle=False and num_threads > 1. "
                    "This will create multiple threads, all reading the "
                    "array/dataframe in order. If you want examples read in order, use"
                    " one thread; if you want multiple threads, enable shuffling.")

        # Choose the queue flavour.
        if shuffle:
            if min_after_dequeue is None:
                min_after_dequeue = capacity / 4
            min_after_dequeue = int(min_after_dequeue)
            queue = data_flow_ops.RandomShuffleQueue(capacity,
                                                     min_after_dequeue,
                                                     dtypes=types,
                                                     shapes=queue_shapes,
                                                     seed=seed)
        else:
            min_after_dequeue = 0  # just for the summary text
            if pad_data:
                # Leave the last dimension of every non-scalar component
                # unspecified.
                queue_shapes = [
                    tuple(list(shape[:-1]) + [None]) if len(shape) > 0 else shape
                    for shape in queue_shapes
                ]
                queue = data_flow_ops.PaddingFIFOQueue(capacity,
                                                       dtypes=types,
                                                       shapes=queue_shapes)
            else:
                queue = data_flow_ops.FIFOQueue(capacity,
                                                dtypes=types,
                                                shapes=queue_shapes)

        enqueue_ops = []
        feed_fns = []

        for thread_index in range(num_threads):
            # Note the placeholders have no shapes, so they will accept any
            # enqueue_size.  enqueue_many below will break them up.
            placeholders = [array_ops.placeholder(dtype) for dtype in types]

            enqueue_ops.append(queue.enqueue_many(placeholders))

            if seed is None:
                thread_seed = None
            else:
                thread_seed = (thread_index + 1) * seed

            # pad_value is only passed on when padding was requested.
            feed_kwargs = {
                "random_start": shuffle,
                "seed": thread_seed,
                "num_epochs": num_epochs,
            }
            if pad_data:
                feed_kwargs["pad_value"] = pad_value
            feed_fns.append(
                get_feed_fn(placeholders, data, enqueue_size, **feed_kwargs))

        runner = fqr._FeedingQueueRunner(  # pylint: disable=protected-access
            queue=queue,
            enqueue_ops=enqueue_ops,
            feed_fns=feed_fns)
        queue_runner.add_queue_runner(runner)

        # Fraction of the usable capacity (above min_after_dequeue) in use.
        surplus = math_ops.maximum(0, queue.size() - min_after_dequeue)
        surplus_float = math_ops.cast(surplus, dtypes.float32)
        usable_capacity = capacity - min_after_dequeue
        full = surplus_float * (1. / usable_capacity)
        # Note that name contains a '/' at the end so we intentionally do not place
        # a '/' after %s below.
        summary_name = (
            "queue/%sfraction_over_%d_of_%d_full" %
            (queue.name, min_after_dequeue, capacity - min_after_dequeue))
        summary.scalar(summary_name, full)
        return queue