Example 1
def prefetch_queue(tensors,
                   capacity=8,
                   num_threads=1,
                   dynamic_pad=False,
                   shared_name=None,
                   name=None):
  """Creates a queue to prefetch tensors from `tensors`.

  A queue runner for enqueuing tensors into the prefetch_queue is automatically
  added to the TF QueueRunners collection.

  Example:
  This is for example useful to pre-assemble input batches read with
  `tf.train.batch()` and enqueue the pre-assembled batches.  Ops that dequeue
  from the pre-assembled queue will not pay the cost of assembling the batch.

  images, labels = tf.train.batch([image, label], batch_size=32, num_threads=4)
  batch_queue = prefetch_queue([images, labels])
  images, labels = batch_queue.dequeue()
  logits = Net(images)
  loss = Loss(logits, labels)

  Args:
    tensors: A list or dictionary of `Tensors` to enqueue in the buffer.
    capacity: An integer. The maximum number of elements in the queue.
    num_threads: An integer.  Number of threads running the enqueue op.
    dynamic_pad: Boolean.  Whether to allow variable dimensions in input shapes.
    shared_name: (optional). If set, this queue will be shared under the given
      name across multiple sessions.
    name: (Optional) A name for the operations.

  Returns:
    A queue from which you can dequeue tensors with the same type and shape
    as `tensors`.
  """
  if isinstance(tensors, dict):
    # Need to wrap the keys and values in list() since Python3 returns views.
    # We sort the keys so the order is consistent across runs.
    names = list(sorted(tensors.keys()))
    tensor_list = list([tensors[n] for n in names])
  else:
    names = None
    tensor_list = tensors

  with ops.name_scope(name, "prefetch_queue", tensor_list) as name:
    dtypes = [t.dtype for t in tensor_list]
    shapes = [t.get_shape() for t in tensor_list]
    queue = _which_queue(dynamic_pad)(
        capacity=capacity,
        dtypes=dtypes,
        shapes=shapes,
        names=names,
        shared_name=shared_name)
    enqueue_op = queue.enqueue(tensors)
    queue_runner.add_queue_runner(
        queue_runner.QueueRunner(queue, [enqueue_op] * num_threads))
    summary.scalar(
        "fraction_of_%d_full" % capacity,
        math_ops.cast(queue.size(), _dtypes.float32) * (1. / capacity))
    return queue
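For orientation, a minimal sketch of how a prefetch queue like the one above is typically driven, assuming TF 1.x graph mode and that the `prefetch_queue` defined above is importable; the per-example tensors are illustrative placeholders rather than a real input pipeline:

import tensorflow as tf  # TF 1.x graph-mode APIs assumed

# Illustrative per-example tensors; a real pipeline would produce these from
# a reader/decoder.
image = tf.random_uniform([28, 28, 3])
label = tf.constant(1, dtype=tf.int32)
images, labels = tf.train.batch([image, label], batch_size=32, num_threads=4)

# prefetch_queue (as defined above) registers its QueueRunner automatically.
batch_queue = prefetch_queue([images, labels])
images, labels = batch_queue.dequeue()

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    # Start every QueueRunner in the collection, including the prefetch one.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    image_batch, label_batch = sess.run([images, labels])
    coord.request_stop()
    coord.join(threads)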
Example 2
    def read(self, queue, name=None):
        """Returns the next record (key, value pair) produced by the reader.

    The multiple reader instances are all configured to `read()` from the
    filenames listed in `queue` and enqueue their output into the `common_queue`
    passed to the constructor, and this method returns the next record dequeued
    from that `common_queue`.


    Readers dequeue a work unit from `queue` if necessary (e.g. when a
    reader needs to start reading from a new file since it has finished with
    the previous file).

    A queue runner for enqueuing in the `common_queue` is automatically added to
    the TF QueueRunners collection.

    Args:
      queue: A Queue or a mutable string Tensor representing a handle
        to a Queue, with string work items.
      name: A name for the operation (optional).

    Returns:
      The next record (i.e. (key, value pair)) from the common_queue.
    """

        enqueue_ops = []
        for reader in self._readers:
            enqueue_ops.append(self._common_queue.enqueue(reader.read(queue)))

        queue_runner.add_queue_runner(queue_runner.QueueRunner(self._common_queue, enqueue_ops))

        return self._common_queue.dequeue(name=name)
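The same common-queue pattern can be sketched with plain TF 1.x ops: several readers share one filename queue and feed a single shared queue, with one QueueRunner owning all the enqueue ops. The file names and the choice of `TFRecordReader` are illustrative assumptions:

import tensorflow as tf  # TF 1.x

filenames = ["train-00000.tfrecord", "train-00001.tfrecord"]  # illustrative
filename_queue = tf.train.string_input_producer(filenames)

# Shared "common queue" of (key, value) string pairs.
common_queue = tf.RandomShuffleQueue(
    capacity=256, min_after_dequeue=128, dtypes=[tf.string, tf.string])

enqueue_ops = []
for _ in range(4):  # four parallel readers
    reader = tf.TFRecordReader()
    # reader.read() returns a (key, value) pair of scalar string tensors.
    enqueue_ops.append(common_queue.enqueue(reader.read(filename_queue)))

tf.train.add_queue_runner(tf.train.QueueRunner(common_queue, enqueue_ops))
key, value = common_queue.dequeue()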
Example 3
  def _apply_transform(self, transform_input):
    filename_queue = input_ops.string_input_producer(self._work_units,
                                                     shuffle=self.shuffle,
                                                     seed=self._seed)

    if self.shuffle:
      queue = data_flow_ops.RandomShuffleQueue(
          capacity=self.queue_capacity,
          min_after_dequeue=self.min_after_dequeue,
          dtypes=[dtypes.string, dtypes.string],
          shapes=[[], []],
          seed=self.seed)
    else:
      queue = data_flow_ops.FIFOQueue(capacity=self.queue_capacity,
                                      dtypes=[dtypes.string, dtypes.string],
                                      shapes=[[], []])

    enqueue_ops = []
    for _ in range(self.num_threads):
      reader = self._reader_cls(**self._reader_kwargs)
      enqueue_ops.append(queue.enqueue(reader.read(filename_queue)))

    runner = queue_runner.QueueRunner(queue, enqueue_ops)
    queue_runner.add_queue_runner(runner)
    dequeued = queue.dequeue_many(self.batch_size)

    # pylint: disable=not-callable
    return self.return_type(*dequeued)
Example 4
def _get_stratified_batch_from_tensors(val_list, label, accept_probs,
                                       batch_size, queue_threads=3):
  """Accepts examples one-at-a-time based on class."""
  # Make queue that will have proper class proportions. Contains exactly one
  # batch at a time.
  vals_shapes = [val.get_shape() for val in val_list]
  vals_dtypes = [val.dtype for val in val_list]
  label_shape = label.get_shape()
  final_q = data_flow_ops.FIFOQueue(capacity=batch_size,
                                    shapes=vals_shapes + [label_shape],
                                    dtypes=vals_dtypes + [label.dtype],
                                    name='batched_queue')

  # Conditionally enqueue.
  tensors_to_enqueue = val_list + [label]
  eq_tf = array_ops.reshape(math_ops.less(
      random_ops.random_uniform([1]),
      array_ops.slice(accept_probs, [label], [1])),
                            [])
  conditional_enqueue = control_flow_ops.cond(
      eq_tf,
      lambda: final_q.enqueue(tensors_to_enqueue),
      control_flow_ops.no_op)
  queue_runner.add_queue_runner(queue_runner.QueueRunner(
      final_q, [conditional_enqueue] * queue_threads))

  return final_q.dequeue_many(batch_size)
Example 5
  def _configure_readers_by(self, queue):
    enqueue_ops = []
    for reader in self._readers:
      enqueue_ops.append(self._common_queue.enqueue(reader.read(queue)))

    queue_runner.add_queue_runner(
        queue_runner.QueueRunner(self._common_queue, enqueue_ops))
Example 6
def _make_per_class_queues(data, labels, num_classes, queue_capacity,
                           threads_per_queue):
  """Creates per-class-queues based on data and labels."""
  # Create one queue per class.
  queues = []
  per_data_shape = data.get_shape().with_rank_at_least(1)[1:]
  per_data_shape.assert_is_fully_defined()

  for i in range(num_classes):
    q = data_flow_ops.FIFOQueue(capacity=queue_capacity,
                                shapes=per_data_shape, dtypes=[data.dtype],
                                name='stratified_sample_class%d_queue' % i)
    logging_ops.scalar_summary('queue/stratified_sample_class%d' % i, q.size())
    queues.append(q)

  # Partition tensors according to labels.
  partitions = data_flow_ops.dynamic_partition(data, labels, num_classes)

  # Enqueue each tensor on the per-class-queue.
  for i in range(num_classes):
    enqueue_op = queues[i].enqueue_many(partitions[i])
    queue_runner.add_queue_runner(queue_runner.QueueRunner(
        queues[i], [enqueue_op] * threads_per_queue))

  return queues
Example 7
def create_input_queues(image, label, capacity=100):
    """Creates FIFO queues out of input tensor objects.

     This function is no longer used in the input pipeline.
     However, it took me a while to understand queuing and it might be useful
     for someone at some point.

    Args:
       image: an image tensor object, generated by queues.
       label: a label tensor object, generated by queues.

    Returns: Two FIFO queues.
    """
    
    #create input queues

    im_queue = tf.FIFOQueue(capacity, dtypes.uint8)
    enqueue_op = im_queue.enqueue(image)
    
    queue_runner.add_queue_runner(queue_runner.QueueRunner(im_queue,
                                                           [enqueue_op]))

    label_queue = tf.FIFOQueue(capacity, dtypes.uint8)
    enqueue_op = label_queue.enqueue(label)
    
    queue_runner.add_queue_runner(queue_runner.QueueRunner(label_queue,
                                                           [enqueue_op]))
                                                           
    return im_queue, label_queue
Example 8
 def _fn():
   queue = data_flow_ops.FIFOQueue(
       capacity=10, dtypes=dtypes.float32, shapes=[10, 3])
   enqueue_op = queue.enqueue(array_ops.zeros([10, 3], dtype=dtypes.float32))
   queue_runner.add_queue_runner(
       queue_runner.QueueRunner(queue, [enqueue_op]))
   return queue.dequeue(), None
Example 9
def _make_per_class_queues(tensor_list, labels, num_classes, queue_capacity, threads_per_queue):
    """Creates per-class-queues based on data and labels."""
    # Create one queue per class.
    queues = []
    data_shapes = []
    data_dtypes = []
    for data_tensor in tensor_list:
        per_data_shape = data_tensor.get_shape().with_rank_at_least(1)[1:]
        per_data_shape.assert_is_fully_defined()
        data_shapes.append(per_data_shape)
        data_dtypes.append(data_tensor.dtype)

    for i in range(num_classes):
        q = data_flow_ops.FIFOQueue(
            capacity=queue_capacity, shapes=data_shapes, dtypes=data_dtypes, name="stratified_sample_class%d_queue" % i
        )
        logging_ops.scalar_summary("queue/%s/stratified_sample_class%d" % (q.name, i), q.size())
        queues.append(q)

    # Partition tensors according to labels. `partitions` is a list of lists, of
    # size num_classes X len(tensor_list). The number of tensors in partition `i`
    # should be the same for all tensors.
    all_partitions = [data_flow_ops.dynamic_partition(data, labels, num_classes) for data in tensor_list]
    partitions = [[cur_partition[i] for cur_partition in all_partitions] for i in range(num_classes)]

    # Enqueue each tensor on the per-class-queue.
    for i in range(num_classes):
        enqueue_op = queues[i].enqueue_many(partitions[i])
        queue_runner.add_queue_runner(queue_runner.QueueRunner(queues[i], [enqueue_op] * threads_per_queue))

    return queues
Example 10
def _get_stratified_batch_from_tensors(val, label, reject_probs, batch_size,
                                       queue_threads=3):
  """Reject examples one-at-a-time based on class."""
  # Make rejection probabilities into a tensor so they can be dynamically
  # accessed by tensors.
  reject_probs = constant_op.constant(
      reject_probs, dtype=dtypes.float32, name='rejection_probabilities')

  # Make queue that will have proper class proportions. Contains exactly one
  # batch at a time.
  val_shape = val.get_shape()
  label_shape = label.get_shape()
  final_q = data_flow_ops.FIFOQueue(capacity=batch_size,
                                    shapes=[val_shape, label_shape],
                                    dtypes=[val.dtype, label.dtype],
                                    name='batched_queue')

  # Conditionally enqueue.
  eq_tf = array_ops.reshape(math_ops.greater(
      random_ops.random_uniform([1]),
      array_ops.slice(reject_probs, [label], [1])),
                            [])
  conditional_enqueue = control_flow_ops.cond(
      eq_tf,
      lambda: final_q.enqueue([val, label]),
      control_flow_ops.no_op)
  queue_runner.add_queue_runner(queue_runner.QueueRunner(
      final_q, [conditional_enqueue] * queue_threads))

  return final_q.dequeue_many(batch_size)
Example 11
def input_producer(input_tensor, element_shape=None, num_epochs=None,
                   shuffle=True, seed=None, capacity=32, shared_name=None,
                   summary_name=None, name=None):
  """Output the rows of `input_tensor` to a queue for an input pipeline.

  Args:
    input_tensor: A tensor with the rows to produce. Must be at least
      one-dimensional. Must either have a fully-defined shape, or
      `element_shape` must be defined.
    element_shape: (Optional.) A `TensorShape` representing the shape of a
      row of `input_tensor`, if it cannot be inferred.
    num_epochs: (Optional.) An integer. If specified `input_producer` produces
      each row of `input_tensor` `num_epochs` times before generating an
      `OutOfRange` error. If not specified, `input_producer` can cycle through
      the rows of `input_tensor` an unlimited number of times.
    shuffle: (Optional.) A boolean. If true, the rows are randomly shuffled
      within each epoch.
    seed: (Optional.) An integer. The seed to use if `shuffle` is true.
    capacity: (Optional.) The capacity of the queue to be used for buffering
      the input.
    shared_name: (Optional.) If set, this queue will be shared under the given
      name across multiple sessions.
    summary_name: (Optional.) If set, a scalar summary for the current queue
      size will be generated, using this name as part of the tag.
    name: (Optional.) A name for queue.

  Returns:
    A queue with the output rows.  A `QueueRunner` for the queue is
    added to the current `QUEUE_RUNNER` collection of the current
    graph.

  Raises:
    ValueError: If the shape of the input cannot be inferred from the arguments.
  """
  with ops.name_scope(name, "input_producer", [input_tensor]):
    input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
    element_shape = input_tensor.get_shape()[1:].merge_with(element_shape)
    if not element_shape.is_fully_defined():
      raise ValueError("Either `input_tensor` must have a fully defined shape "
                       "or `element_shape` must be specified")

    if shuffle:
      input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)

    input_tensor = limit_epochs(input_tensor, num_epochs)

    q = data_flow_ops.FIFOQueue(capacity=capacity,
                                dtypes=[input_tensor.dtype.base_dtype],
                                shapes=[element_shape],
                                shared_name=shared_name, name=name)
    enq = q.enqueue_many([input_tensor])
    queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
    if summary_name is not None:
      logging_ops.scalar_summary("queue/%s/%s" % (q.name, summary_name),
                                 math_ops.cast(q.size(), dtypes.float32) *
                                 (1. / capacity))
    return q
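A minimal usage sketch for `input_producer`, assuming TF 1.x graph mode and that the function above is importable; the `rows` constant is illustrative. Because `num_epochs` creates a local counter variable, local variables must be initialized before starting the queue runners:

import tensorflow as tf  # TF 1.x

# Each row of this (illustrative) tensor becomes one queue element.
rows = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
queue = input_producer(rows, num_epochs=2, shuffle=True, seed=42)
row = queue.dequeue()

with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())  # for the num_epochs counter
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    try:
        while True:
            print(sess.run(row))  # six rows total: 3 rows x 2 epochs
    except tf.errors.OutOfRangeError:
        pass
    finally:
        coord.request_stop()
        coord.join(threads)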
Example 12
def prefetch_queue(tensors,
                   capacity=8,
                   shared_name=None,
                   name=None):
  """Creates a queue to prefetch tensors from `tensors`.

  A queue runner for enqueuing tensors into the prefetch_queue is automatically
  added to the TF QueueRunners collection.

  Example:
  This is for example useful to pre-assemble input batches read with
  `tf.train.batch()` and enqueue the pre-assembled batches.  Ops that dequeue
  from the pre-assembled queue will not pay the cost of assembling the batch.

  images, labels = tf.train.batch([image, label], batch_size=32, num_threads=4)
  batch_queue = prefetch_queue([images, labels])
  images, labels = batch_queue.dequeue()
  logits = Net(images)
  loss = Loss(logits, labels)

  Args:
    tensors: A list or dictionary of `Tensors` to enqueue in the buffer.
    capacity: An integer. The maximum number of elements in the queue.
    shared_name: (optional). If set, this queue will be shared under the given
      name across multiple sessions.
    name: (Optional) A name for the operations.

  Returns:
    A queue from which you can dequeue tensors with the same type and shape
    as `tensors`.
  """
  if isinstance(tensors, dict):
    # Need to wrap the keys and values in list() since Python3 returns views.
    names = list(tensors.keys())
    tensor_list = list(tensors.values())
  else:
    names = None
    tensor_list = tensors

  with ops.name_scope(name, "prefetch_queue", tensor_list) as name:
    dtypes = [t.dtype for t in tensor_list]
    shapes = [t.get_shape() for t in tensor_list]
    queue = data_flow_ops.FIFOQueue(capacity=capacity,
                                    dtypes=dtypes,
                                    shapes=shapes,
                                    names=names,
                                    shared_name=shared_name)
    enqueue_op = queue.enqueue(tensors, name=name)
    queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, [enqueue_op]))
    logging_ops.scalar_summary(
        "queue/%s/fraction_of_%d_full" % (queue.name, capacity),
        math_ops.to_float(queue.size()) * (1. / capacity))
    return queue
Example 13
def _input_producer(input_tensor, dtype, num_epochs, shuffle, seed, capacity, name, summary_name):
    if shuffle:
        input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)
    input_tensor = limit_epochs(input_tensor, num_epochs)

    q = data_flow_ops.FIFOQueue(capacity=capacity, dtypes=[dtype], shapes=[[]], name=name)
    enq = q.enqueue_many([input_tensor])
    queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
    summary_ops.scalar_summary(
        "queue/%s/%s" % (q.name, summary_name), math_ops.cast(q.size(), dtypes.float32) * (1.0 / capacity)
    )
    return q
Example 14
def _queue_parsed_features(feature_map):
  tensors_to_enqueue = []
  keys = []
  for key, tensor in six.iteritems(feature_map):
    keys.append(key)
    tensors_to_enqueue.append(tensor)
  queue_dtypes = [x.dtype for x in tensors_to_enqueue]
  input_queue = data_flow_ops.FIFOQueue(capacity=100, dtypes=queue_dtypes)
  queue_runner.add_queue_runner(
      queue_runner.QueueRunner(input_queue,
                               [input_queue.enqueue(tensors_to_enqueue)]))
  dequeued_tensors = input_queue.dequeue()
  return {keys[i]: dequeued_tensors[i] for i in range(len(dequeued_tensors))}
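A hedged illustration of the `_queue_parsed_features` helper above: the feature map goes in as graph tensors and comes back with the same keys, one dequeue behind. The constants are illustrative and TF 1.x graph mode is assumed:

import tensorflow as tf  # TF 1.x

# Illustrative parsed feature map of dense tensors.
feature_map = {
    "age": tf.constant([25, 30, 41], dtype=tf.int64),
    "income": tf.constant([50.0, 62.5, 48.0]),
}
features = _queue_parsed_features(feature_map)  # same keys, queued tensors

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run(features))
    coord.request_stop()
    coord.join(threads)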
Example 15
def _conditional_batch(tensors, accept_prob, batch_size, queue_threads=10):
  """Conditionally enqueue tensors based on accept_prob.

  Specifically, enqueue the element if accept_prob > rand_unif([0, 1]).

  Args:
      tensors: List of tensors to enqueue.
      accept_prob: Acceptance probability per example.
      batch_size: Size of batch.
      queue_threads: Number of threads enqueuing in the final queue.

  Returns:
      List of batched tensors.

  Raises:
      ValueError: `accept_prob` isn't 0D.
  """
  accept_prob.get_shape().assert_has_rank(0)
  # Determine shapes and types of to-be-enqueued-tensors.
  shapes_list = []
  dtypes_list = []
  for tensor in tensors:
    cur_shape = tensor.get_shape()
    cur_shape.assert_is_fully_defined()
    shapes_list.append(cur_shape)
    dtypes_list.append(tensor.dtype)

  final_q = data_flow_ops.FIFOQueue(capacity=batch_size,
                                    shapes=shapes_list,
                                    dtypes=dtypes_list,
                                    name='batched_queue')
  logging_ops.scalar_summary('queue/%s/size' % final_q.name, final_q.size())

  # Conditionally enqueue.
  # Reshape enqueue op to match no_op's shape.
  eq_tf = math_ops.less(random_ops.random_uniform([]), accept_prob)
  conditional_enqueue = control_flow_ops.cond(
      eq_tf,
      lambda: final_q.enqueue(tensors),
      control_flow_ops.no_op)
  queue_runner.add_queue_runner(queue_runner.QueueRunner(
      final_q, [conditional_enqueue] * queue_threads))

  out_tensor = final_q.dequeue_many(batch_size)
  # Queues return a single tensor if the list of enqueued tensors is one. Since we
  # want the type to be the same in all cases, always return a list.
  if isinstance(out_tensor, ops.Tensor):
    out_tensor = [out_tensor]

  return out_tensor
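As a rough usage sketch (TF 1.x assumed, tensors illustrative), a helper like `_conditional_batch` can downsample one class by computing a per-example acceptance probability:

import tensorflow as tf  # TF 1.x

# One example at a time from some upstream pipeline (illustrative tensors).
image = tf.random_uniform([28, 28])
label = tf.random_uniform([], maxval=10, dtype=tf.int32)

# Keep class 0 with probability 0.1; keep every other class unconditionally.
accept_prob = tf.cond(tf.equal(label, 0),
                      lambda: tf.constant(0.1),
                      lambda: tf.constant(1.0))
image_batch, label_batch = _conditional_batch([image, label], accept_prob,
                                              batch_size=32)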
Example 16
  def _add_remote_queue_runner(self, queue, enq_ops):
    """Adds a remote queue runner to the graph.

    These queue runners differ from the standard in two ways: First,
    they never close their queue. Second, they are added to the
    `Feeder.REMOTE_QUEUE_RUNNERS` collection, rather than
    `ops.GraphKeys.QUEUE_RUNNERS`, so they can be started/stopped
    separately.

    Args:
      queue: The queue.
      enq_ops: A list of ops which perform enqueues (each on its own thread).
    """

    runner = queue_runner.QueueRunner(
        queue,
        enq_ops,
        cancel_op=self._fake_op,
        close_op=self._fake_op)
    queue_runner.add_queue_runner(
        runner, collection=Feeder.REMOTE_QUEUE_RUNNERS)
Example 17
def queue_parsed_features(parsed_features, keys=None, feature_queue_capacity=100, num_queue_runners=2, name=None):
    """Speeds up parsing by using queues to do it asynchronously.

  This function adds the tensors in `parsed_features` to a queue, which allows
  the parsing (or any other expensive op before this) to be asynchronous wrt the
  rest of the training graph. This greatly improves read latency and speeds up
  training since the data will already be parsed and ready when each step of
  training needs it.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    parsed_features: A dict of string key to `Tensor` or `SparseTensor` objects.
    keys: `Tensor` of string keys.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_queue_runners: Number of queue runners to start for the feature queue.
      Adding multiple queue runners for the parsed example queue helps maintain
      a full queue when the subsequent computations overall are cheaper than
      parsing.
    name: Name of resulting op.

  Returns:
    Returns tuple of:
    - `Tensor` corresponding to `keys` if provided, otherwise `None`.
    -  A dict of string key to `Tensor` or `SparseTensor` objects corresponding
       to `parsed_features`.
  """
    args = list(parsed_features.values())
    if keys is not None:
        args += [keys]

    with ops.name_scope(name, "queue_parsed_features", args):
        # Lets also add preprocessed tensors into the queue types for each item of
        # the queue.
        tensors_to_enqueue = []
        # Each entry contains the key, and a boolean which indicates whether the
        # tensor was a sparse tensor.
        tensors_mapping = []
        # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse
        # tensors into a queue. This could be taken care in somewhere else so others
        # can reuse it. Also, QueueBase maybe extended to handle sparse tensors
        # directly.
        for key in sorted(parsed_features.keys()):
            tensor = parsed_features[key]
            if isinstance(tensor, ops.SparseTensor):
                tensors_mapping.append((key, True))
                tensors_to_enqueue.extend([tensor.indices, tensor.values, tensor.shape])
            else:
                tensors_mapping.append((key, False))
                tensors_to_enqueue.append(tensor)

        if keys is not None:
            tensors_to_enqueue.append(keys)

        queue_dtypes = [x.dtype for x in tensors_to_enqueue]
        input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity, queue_dtypes)

        # Add a summary op to debug if our feature queue is full or not.
        logging_ops.scalar_summary(
            "queue/parsed_features/%s/fraction_of_%d_full" % (input_queue.name, feature_queue_capacity),
            math_ops.cast(input_queue.size(), dtypes.float32) * (1.0 / feature_queue_capacity),
        )

        # Add multiple queue runners so that the queue is always full. Adding more
        # than two queue-runners may hog the cpu on the worker to fill up the queue.
        for _ in range(num_queue_runners):
            queue_runner.add_queue_runner(
                queue_runner.QueueRunner(input_queue, [input_queue.enqueue(tensors_to_enqueue)])
            )

        dequeued_tensors = input_queue.dequeue()

        # Reset shapes on dequeued tensors.
        for i in range(len(tensors_to_enqueue)):
            dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

        # Recreate feature mapping according to the original dictionary.
        dequeued_parsed_features = {}
        index = 0
        for key, is_sparse_tensor in tensors_mapping:
            if is_sparse_tensor:
                # Three tensors are (indices, values, shape).
                dequeued_parsed_features[key] = ops.SparseTensor(
                    dequeued_tensors[index], dequeued_tensors[index + 1], dequeued_tensors[index + 2]
                )
                index += 3
            else:
                dequeued_parsed_features[key] = dequeued_tensors[index]
                index += 1

        dequeued_keys = None
        if keys is not None:
            dequeued_keys = dequeued_tensors[-1]

        return dequeued_keys, dequeued_parsed_features
Example 18
def _enqueue_join(queue, tensor_list_list):
    enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list]
    queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
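A small hedged sketch of how a join-style enqueue like `_enqueue_join` is used: several independent readers each produce a (key, value) pair, and one QueueRunner drives all of the enqueue ops into a single queue. The file names and `TextLineReader` are illustrative:

import tensorflow as tf  # TF 1.x

filename_queue = tf.train.string_input_producer(["a.csv", "b.csv"])  # illustrative
tensor_list_list = []
for _ in range(2):
    reader = tf.TextLineReader()
    key, value = reader.read(filename_queue)
    tensor_list_list.append([key, value])

joined_queue = tf.FIFOQueue(capacity=64, dtypes=[tf.string, tf.string])
_enqueue_join(joined_queue, tensor_list_list)  # registers a single QueueRunner
key, line = joined_queue.dequeue()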
Example 19
 def set_many_fed_tensors(self, tensors):
   """Sets batches fed tensors."""
   enq_op = self._local_q.enqueue_many(tensors)
   queue_runner.add_queue_runner(queue_runner.QueueRunner(
       self._local_q, [enq_op]))
Example 20
def bucket(tensors,
           which_bucket,
           batch_size,
           num_buckets,
           num_threads=1,
           capacity=32,
           bucket_capacities=None,
           shapes=None,
           dynamic_pad=False,
           allow_smaller_final_batch=False,
           keep_input=True,
           shared_name=None,
           name=None):
    """Lazy bucketing of input tensors according to `which_bucket`.

  The argument `tensors` can be a list or a dictionary of tensors.
  The value returned by the function will be of the same type
  as `tensors`.

  The tensors entering this function are put into the bucket given by
  `which_bucket`.  Each bucket has its own queue.  When a bucket contains
  `batch_size` elements, this minibatch is pushed onto a top queue.  The
  tensors returned from this function are the result of dequeueing the
  next minibatch from this top queue.

  This function is implemented using several queues. A `QueueRunner` for the
  queues is added to the current `Graph`'s `QUEUE_RUNNER` collection.

  As the returned tensors are the result of a dequeue operation, evaluating
  them will throw a `tf.errors.OutOfRangeError` when the input queue is
  exhausted.  If these tensors are feeding another input queue, its queue runner
  will catch this exception; however, if they are used in your main thread
  you are responsible for catching this yourself.

  *N.B.:* If `dynamic_pad` is `False`, you must ensure that either
  (i) the `shapes` argument is passed, or (ii) all of the tensors in
  `tensors` must have fully-defined shapes. `ValueError` will be
  raised if neither of these conditions holds.

  If `dynamic_pad` is `True`, it is sufficient that the *rank* of the
  tensors is known, but individual dimensions may have shape `None`.
  In this case, for each enqueue the dimensions with value `None`
  may have a variable length; upon dequeue, the output tensors will be padded
  on the right to the maximum shape of the tensors in the current minibatch.
  For numbers, this padding takes value 0.  For strings, this padding is
  the empty string.  See `PaddingFIFOQueue` for more info.

  If `allow_smaller_final_batch` is `True`, a smaller batch value than
  `batch_size` is returned when the queues are closed and there are not enough
  elements to fill the batch, otherwise the pending elements are discarded.
  In addition, all output tensors' static shapes, as accessed via the
  `get_shape()` method will have a 0th `Dimension` value of `None`, and
  operations that depend on fixed batch_size would fail.

  Args:
    tensors: The list or dictionary of tensors, representing a single element,
      to bucket.  Nested lists are not supported.
    which_bucket: An `int32` scalar Tensor taking a value in `[0, num_buckets)`.
    batch_size: The new batch size pulled from the queue (all queues will have
      the same size).  If a list is passed in then each bucket will have a
      different batch_size.
      (python int, int32 scalar or iterable of integers of length num_buckets).
    num_buckets: A python integer, the number of buckets.
    num_threads: An integer.  The number of threads enqueuing `tensors`.
    capacity: An integer. The maximum number of minibatches in the top queue,
      and also (by default) the maximum number of elements within each bucket.
    bucket_capacities: (Optional) None or a list of integers, the capacities of
      each bucket. If None, capacity is used (default). If specified, it must
      be a list of integers of length num_buckets: the i-th element is used
      as capacity for the i-th bucket queue.
    shapes: (Optional) The shapes for each example.  Defaults to the
      inferred shapes for `tensors`.
    dynamic_pad: Boolean.  Allow variable dimensions in input shapes.
      The given dimensions are padded upon dequeue so that tensors within a
      batch have the same shapes.
    allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final
      batches to be smaller if there are insufficient items left in the queues.
    keep_input: A `bool` scalar Tensor.  If provided, this tensor controls
      whether the input is added to the queue or not.  If it evaluates `True`,
      then `tensors` are added to the bucket; otherwise they are dropped.  This
      tensor essentially acts as a filtering mechanism.
    shared_name: (Optional). If set, the queues will be shared under the given
      name across multiple sessions.
    name: (Optional) A name for the operations.

  Returns:
    A tuple `(bucket, outputs)` where `bucket` is
    an `int32` scalar tensor and `outputs` is a list or
    dictionary of batched outputs corresponding to elements of `tensors`.
    Every step will receive a new bucket of outputs.

  Raises:
    ValueError: If the `shapes` are not specified, and cannot be
      inferred from the elements of `tensors` or if batch_size is a sequence
      but its length != num_buckets. Also if bucket_capacities is not None but
      its length != num_buckets.
  """
    batch_size_per_bucket = False
    if isinstance(batch_size, (list, tuple)):
        batch_size_per_bucket = True
        if len(batch_size) != num_buckets:
            raise ValueError(
                "If batch_size is a list it must have num_buckets elements")
    else:
        batch_size = [batch_size] * num_buckets

    if bucket_capacities is None:
        bucket_capacities = [capacity] * num_buckets
    if len(bucket_capacities) != num_buckets:
        raise ValueError(
            "The list bucket_capacities (%s) must have exactly num_buckets (%d) "
            "elements." % (str(bucket_capacities), num_buckets))

    tensor_list = _as_tensor_list(tensors)
    with ops.name_scope(name, "bucket", tensor_list) as name:
        tensor_list = _validate_bucket(tensor_list)
        keep_input = _validate_keep_input(keep_input, enqueue_many=False)
        (tensor_list,
         sparse_info) = _store_sparse_tensors(tensor_list,
                                              enqueue_many=False,
                                              keep_input=keep_input)

        # Round-trip batch_size to a tensor, and possibly back
        for i, bucket_batch_size in enumerate(batch_size):
            bucket_batch_size = ops.convert_to_tensor(bucket_batch_size,
                                                      dtype=dtypes.int32,
                                                      name="batch_size")
            static_batch_size = tensor_util.constant_value(bucket_batch_size)
            batch_size[i] = (static_batch_size if static_batch_size is not None
                             else bucket_batch_size)

        types = _dtypes([tensor_list])
        shapes = _shapes([tensor_list], shapes, enqueue_many=False)

        which_bucket = ops.convert_to_tensor(which_bucket,
                                             dtype=dtypes.int32,
                                             name="which_bucket")

        queue_creator = _which_queue(dynamic_pad)
        bucket_queues = []
        for i in range(num_buckets):
            shared_name_i = ("%s_%d" % (shared_name, i)
                             if shared_name is not None else None)
            bucket_queues.append(
                queue_creator(capacity=bucket_capacities[i],
                              dtypes=types,
                              shapes=shapes,
                              shared_name=shared_name_i,
                              name="bucket_queue_%d" % i))

        maybe_static_batch_size = (None if (allow_smaller_final_batch
                                            or batch_size_per_bucket) else
                                   static_batch_size)

        bucket_shapes = [
            tensor_shape.vector(maybe_static_batch_size).concatenate(s)
            for s in bucket_queues[0].shapes
        ]
        # top_queue is a PaddingFIFOQueue even if the bucket queues are regular FIFO
        # queues because if we use allow_smaller_final_batch, shapes will
        # contain Nones in their first entry; as a result, a regular
        # FIFOQueue would die when being passed shapes that are not fully defined.
        top_queue = data_flow_ops.PaddingFIFOQueue(
            capacity=capacity,
            dtypes=[dtypes.int32] + types,
            shapes=[tensor_shape.scalar()] + bucket_shapes,
            shared_name=shared_name,
            name="top_queue")

        def enqueue_which():
            """Return an op that enqueues conditionally in one of the queues."""
            def enqueue_single(i):
                return bucket_queues[i].enqueue(tensor_list)

            enqueues = [
                control_flow_ops.cond(math_ops.equal(which_bucket, i),
                                      functools.partial(enqueue_single, i),
                                      control_flow_ops.no_op)
                for i in range(num_buckets)
            ]
            return control_flow_ops.group(*enqueues, name="group_enqueues")

        maybe_enqueue = utils.smart_cond(keep_input, enqueue_which,
                                         control_flow_ops.no_op)

        bucket_enqueue_ops = [maybe_enqueue] * num_threads

        if allow_smaller_final_batch:
            which_dequeue = lambda q: q.dequeue_up_to
        else:
            which_dequeue = lambda q: q.dequeue_many

        def make_list(t):
            if isinstance(t, (list, tuple)):
                return t
            else:
                return [t]

        enqueues_to_top = [
            top_queue.enqueue(
                [constant_op.constant(i)] +
                make_list(which_dequeue(q)(bs, name="read_bucket_%d" % i)),
                name="enqueue_from_bucket_%d" % i)
            for i, (q, bs) in enumerate(zip(bucket_queues, batch_size))
        ]

        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(
                bucket_queues[0],
                enqueues_to_top,
                close_op=top_queue.close(),
                cancel_op=top_queue.close(cancel_pending_enqueues=True),
                queue_closed_exception_types=(errors.OutOfRangeError,
                                              errors.CancelledError)))
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(
                top_queue,
                bucket_enqueue_ops,
                close_op=control_flow_ops.group(
                    *[q.close() for q in bucket_queues]),
                cancel_op=control_flow_ops.group(*[
                    q.close(cancel_pending_enqueues=True)
                    for q in bucket_queues
                ]),
                queue_closed_exception_types=(errors.OutOfRangeError,
                                              errors.CancelledError)))

        for q in bucket_queues:
            summary.scalar("bucket/%s/size" % q.name,
                           math_ops.cast(top_queue.size(), dtypes.float32))
        summary.scalar(
            "bucket/%s/fraction_of_%d_full" % (top_queue.name, capacity),
            math_ops.cast(top_queue.size(), dtypes.float32) * (1. / capacity))

        dequeued = top_queue.dequeue(name="dequeue_top")
        which_bucket_dequeued = dequeued[0]
        dequeued = dequeued[1:]
        if len(dequeued) == 1:
            dequeued = dequeued[0]
        dequeued = _restore_sparse_tensors(dequeued, sparse_info)
        return (which_bucket_dequeued, _as_original_type(tensors, dequeued))
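As a usage illustration only (assuming TF 1.x and that the `bucket` function above is importable, e.g. via `tf.contrib.training`): variable-length sequences are routed to a bucket by their length so that, with `dynamic_pad=True`, each minibatch is padded only to its own longest sequence. The sequence construction below is illustrative:

import tensorflow as tf  # TF 1.x

# Illustrative variable-length example: an int32 sequence and its length.
length = tf.random_uniform([], minval=1, maxval=50, dtype=tf.int32)
sequence = tf.range(length)

which_bucket = tf.minimum(length // 10, 4)  # five buckets of width 10
bucket_id, outputs = bucket(
    tensors=[sequence, length],
    which_bucket=which_bucket,
    batch_size=32,
    num_buckets=5,
    num_threads=4,
    dynamic_pad=True)  # pad each minibatch to its longest sequence
seq_batch, len_batch = outputs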
Example 21
def queue_parsed_features(parsed_features,
                          keys=None,
                          feature_queue_capacity=100,
                          num_queue_runners=None,
                          num_enqueue_threads=None,
                          name=None):
  """Speeds up parsing by using queues to do it asynchronously.

  This function adds the tensors in `parsed_features` to a queue, which allows
  the parsing (or any other expensive op before this) to be asynchronous wrt the
  rest of the training graph. This greatly improves read latency and speeds up
  training since the data will already be parsed and ready when each step of
  training needs it.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    parsed_features: A dict of string key to `Tensor` or `SparseTensor` objects.
    keys: `Tensor` of string keys.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_queue_runners: Deprecated. Defaults to 2 if this and
      `num_enqueue_threads` are both `None`. This is the number of queue
      runners to start for the feature queue. Adding multiple queue runners for
      the parsed example queue helps maintain a full queue when the subsequent
      computations overall are cheaper than parsing. This argument will be
      deprecated and replaced with `num_enqueue_threads`.
    num_enqueue_threads: Number of threads to enqueue the parsed example queue.
      Using multiple threads to enqueue the parsed example queue helps maintain
      a full queue when the subsequent computations overall are cheaper than
      parsing. This argument will replace `num_queue_runners`. This and
      `num_queue_runners` can not both be set.
    name: Name of resulting op.

  Returns:
    Returns tuple of:
    - `Tensor` corresponding to `keys` if provided, otherwise `None`.
    -  A dict of string key to `Tensor` or `SparseTensor` objects corresponding
       to `parsed_features`.
  Raises:
    ValueError: for invalid inputs.
  """
  num_queue_runners, num_enqueue_threads = _check_enqueue_params(
      num_queue_runners, num_enqueue_threads)

  args = list(parsed_features.values())
  if keys is not None:
    args += [keys]

  with ops.name_scope(name, 'queue_parsed_features', args):
    # Lets also add preprocessed tensors into the queue types for each item of
    # the queue.
    tensors_to_enqueue = []
    # Each entry contains the key, and a boolean which indicates whether the
    # tensor was a sparse tensor.
    tensors_mapping = []
    # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse
    # tensors into a queue. This could be taken care in somewhere else so others
    # can reuse it. Also, QueueBase maybe extended to handle sparse tensors
    # directly.
    for key in sorted(parsed_features.keys()):
      tensor = parsed_features[key]
      if isinstance(tensor, sparse_tensor.SparseTensor):
        tensors_mapping.append((key, True))
        tensors_to_enqueue.extend([tensor.indices, tensor.values, tensor.shape])
      else:
        tensors_mapping.append((key, False))
        tensors_to_enqueue.append(tensor)

    if keys is not None:
      tensors_to_enqueue.append(keys)

    queue_dtypes = [x.dtype for x in tensors_to_enqueue]
    input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity, queue_dtypes)

    # Add a summary op to debug if our feature queue is full or not.
    summary.scalar('queue/parsed_features/%s/fraction_of_%d_full' %
                   (input_queue.name, feature_queue_capacity),
                   math_ops.cast(input_queue.size(), dtypes.float32) *
                   (1. / feature_queue_capacity))

    # Add multiple queue runners so that the queue is always full. Adding more
    # than two queue-runners may hog the cpu on the worker to fill up the queue.
    #
    # Note: this can result in large last batch being lost as the multiple queue
    # runner threads do not coordinate with each other. Please use
    # `num_enqueue_threads` instead.
    if num_queue_runners is not None:
      for _ in range(num_queue_runners):
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(
                input_queue, [input_queue.enqueue(tensors_to_enqueue)],
                queue_closed_exception_types=(errors.OutOfRangeError,
                                              errors.CancelledError)))
    # Use a single QueueRunner with multiple threads to enqueue so the queue is
    # always full. The threads are coordinated so the last batch will not be
    # lost.
    elif num_enqueue_threads is not None:
      enqueue_ops = [input_queue.enqueue(tensors_to_enqueue)
                     for _ in range(num_enqueue_threads)]
      queue_runner.add_queue_runner(queue_runner.QueueRunner(
          input_queue, enqueue_ops,
          queue_closed_exception_types=(errors.OutOfRangeError,
                                        errors.CancelledError)))
    else:
      raise AssertionError(
          'Either `num_queue_runners` or `num_enqueue_threads` should have '
          'been set.')

    dequeued_tensors = input_queue.dequeue()

    # Reset shapes on dequeued tensors.
    for i in range(len(tensors_to_enqueue)):
      dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

    # Recreate feature mapping according to the original dictionary.
    dequeued_parsed_features = {}
    index = 0
    for key, is_sparse_tensor in tensors_mapping:
      if is_sparse_tensor:
        # Three tensors are (indices, values, shape).
        dequeued_parsed_features[key] = sparse_tensor.SparseTensor(
            dequeued_tensors[index], dequeued_tensors[index + 1],
            dequeued_tensors[index + 2])
        index += 3
      else:
        dequeued_parsed_features[key] = dequeued_tensors[index]
        index += 1

    dequeued_keys = None
    if keys is not None:
      dequeued_keys = dequeued_tensors[-1]

    return dequeued_keys, dequeued_parsed_features
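A hedged sketch of how `queue_parsed_features` is typically wired up: serialized `Example` protos come out of a reader, are parsed into a feature dict, and the dict is queued so downstream ops do not wait on parsing. The file name and feature spec are illustrative assumptions:

import tensorflow as tf  # TF 1.x

filename_queue = tf.train.string_input_producer(["data.tfrecord"])  # illustrative
reader = tf.TFRecordReader()
keys, serialized = reader.read_up_to(filename_queue, num_records=32)

# Feature spec is illustrative; a VarLenFeature parses to a SparseTensor, which
# queue_parsed_features splits into (indices, values, shape) before queuing.
features = tf.parse_example(serialized, {
    "label": tf.FixedLenFeature([], tf.int64),
    "terms": tf.VarLenFeature(tf.string),
})
dequeued_keys, dequeued_features = queue_parsed_features(
    features, keys=keys, num_enqueue_threads=2)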
Example 22
def enqueue_data(data,
                 capacity,
                 shuffle=False,
                 min_after_dequeue=None,
                 seed=None,
                 name="enqueue_input"):
    """Creates a queue filled from a numpy array or pandas `DataFrame`.

    Returns a queue filled with the rows of the given array or `DataFrame`. In
    the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to
    the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor`
    contains the row number.

  Args:
    data: a numpy `ndarray` or pandas `DataFrame` that will be read into the
      queue.
    capacity: the capacity of the queue.
    shuffle: whether or not to shuffle the rows of the array.
    min_after_dequeue: minimum number of elements that can remain in the queue
      after a dequeue operation. Only used when `shuffle` is true. If not set,
      defaults to `capacity` / 4.
    seed: used to seed RandomShuffleQueue. Only used when `shuffle` is True.
    name: a scope name identifying the data.

  Returns:
    A queue filled with the rows of the given array or `DataFrame`.

  Raises:
    TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`.
  """
    with ops.op_scope([], name, None) as name:
        # TODO(jamieas): create multithreaded version of enqueue_data.
        if isinstance(data, np.ndarray):
            types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
            shapes = [(), data.shape[1:]]
            get_feed_fn = _ArrayFeedFn
        elif HAS_PANDAS and isinstance(data, pd.DataFrame):
            types = [
                dtypes.as_dtype(dt)
                for dt in [data.index.dtype] + list(data.dtypes)
            ]
            shapes = [() for _ in types]
            get_feed_fn = _PandasFeedFn
        else:
            raise TypeError(
                "data must be either a numpy array or pandas DataFrame if pandas is "
                "installed; got {}".format(type(data).__name__))

        placeholders = [
            array_ops.placeholder(*type_and_shape)
            for type_and_shape in zip(types, shapes)
        ]

        if shuffle:
            min_after_dequeue = int(
                capacity /
                4 if min_after_dequeue is None else min_after_dequeue)
            queue = data_flow_ops.RandomShuffleQueue(capacity,
                                                     min_after_dequeue,
                                                     dtypes=types,
                                                     shapes=shapes,
                                                     seed=seed)
        else:
            min_after_dequeue = 0  # just for the summary text
            queue = data_flow_ops.FIFOQueue(capacity,
                                            dtypes=types,
                                            shapes=shapes)
        enqueue_op = queue.enqueue(placeholders)
        feed_fn = get_feed_fn(placeholders, data)
        runner = fqr.FeedingQueueRunner(queue=queue,
                                        enqueue_ops=[enqueue_op],
                                        feed_fn=feed_fn)
        queue_runner.add_queue_runner(runner)

        full = (math_ops.cast(
            math_ops.maximum(0,
                             queue.size() - min_after_dequeue), dtypes.float32)
                * (1. / (capacity - min_after_dequeue)))
        # Note that name contains a '/' at the end so we intentionally do not place
        # a '/' after %s below.
        summary_name = (
            "queue/%sfraction_over_%d_of_%d_full" %
            (queue.name, min_after_dequeue, capacity - min_after_dequeue))
        logging_ops.scalar_summary(summary_name, full)
        return queue
Example 23
def enqueue_data(data, capacity, shuffle=False, min_after_dequeue=None, seed=None, name="enqueue_input"):
    """Creates a queue filled from a numpy array or pandas `DataFrame`.

    Returns a queue filled with the rows of the given array or `DataFrame`. In
    the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to
    the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor`
    contains the row number.

  Args:
    data: a numpy `ndarray` or pandas `DataFrame` that will be read into the
      queue.
    capacity: the capacity of the queue.
    shuffle: whether or not to shuffle the rows of the array.
    min_after_dequeue: minimum number of elements that can remain in the queue
      after a dequeue operation. Only used when `shuffle` is true. If not set,
      defaults to `capacity` / 4.
    seed: used to seed RandomShuffleQueue. Only used when `shuffle` is True.
    name: a scope name identifying the data.

  Returns:
    A queue filled with the rows of the given array or `DataFrame`.

  Raises:
    TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`.
  """
    with ops.op_scope([], name, None) as name:
        # TODO(jamieas): create multithreaded version of enqueue_data.
        if isinstance(data, np.ndarray):
            types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
            shapes = [(), data.shape[1:]]
            get_feed_fn = _ArrayFeedFn
        elif HAS_PANDAS and isinstance(data, pd.DataFrame):
            types = [dtypes.as_dtype(dt) for dt in [data.index.dtype] + list(data.dtypes)]
            shapes = [() for _ in types]
            get_feed_fn = _PandasFeedFn
        else:
            raise TypeError(
                "data must be either a numpy array or pandas DataFrame if pandas is "
                "installed; got {}".format(type(data).__name__)
            )

        placeholders = [array_ops.placeholder(*type_and_shape) for type_and_shape in zip(types, shapes)]

        if shuffle:
            min_after_dequeue = int(capacity / 4 if min_after_dequeue is None else min_after_dequeue)
            queue = data_flow_ops.RandomShuffleQueue(
                capacity, min_after_dequeue, dtypes=types, shapes=shapes, seed=seed
            )
        else:
            min_after_dequeue = 0  # just for the summary text
            queue = data_flow_ops.FIFOQueue(capacity, dtypes=types, shapes=shapes)
        enqueue_op = queue.enqueue(placeholders)
        feed_fn = get_feed_fn(placeholders, data)
        runner = fqr.FeedingQueueRunner(queue=queue, enqueue_ops=[enqueue_op], feed_fn=feed_fn)
        queue_runner.add_queue_runner(runner)

        full = math_ops.cast(math_ops.maximum(0, queue.size() - min_after_dequeue), dtypes.float32) * (
            1.0 / (capacity - min_after_dequeue)
        )
        # Note that name contains a '/' at the end so we intentionally do not place
        # a '/' after %s below.
        summary_name = "queue/%sfraction_over_%d_of_%d_full" % (
            queue.name,
            min_after_dequeue,
            capacity - min_after_dequeue,
        )
        logging_ops.scalar_summary(summary_name, full)
        return queue
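A minimal usage sketch for `enqueue_data`, assuming TF 1.x graph mode, pandas installed, and that the function above is importable; the first dequeued component is the `DataFrame` index, followed by one tensor per column:

import numpy as np
import pandas as pd
import tensorflow as tf  # TF 1.x

df = pd.DataFrame({"x": np.arange(5, dtype=np.int64),
                   "y": np.linspace(0.0, 1.0, 5)})
queue = enqueue_data(df, capacity=32, shuffle=True, min_after_dequeue=8, seed=0)
index, x, y = queue.dequeue()  # index column first, then "x" and "y"

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    print(sess.run([index, x, y]))
    coord.request_stop()
    coord.join(threads)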
Example 24
def _read_keyed_batch_examples_helper(file_pattern,
                                      batch_size,
                                      reader,
                                      randomize_input=True,
                                      num_epochs=None,
                                      queue_capacity=10000,
                                      num_threads=1,
                                      read_batch_size=1,
                                      filter_fn=None,
                                      parse_fn=None,
                                      setup_shared_queue=False,
                                      name=None,
                                      seed=None):
    """Adds operations to read, queue, batch `Example` protos.

  Args:
    file_pattern: List of files or patterns of file paths containing
        `Example` records. See `tf.io.gfile.glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    reader: A function or class that returns an object with
      `read` method, (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If `None`, cycles through the dataset forever.
      NOTE - If specified, creates a variable that must be initialized, so call
      `tf.compat.v1.local_variables_initializer()` and run the op in a session.
    queue_capacity: Capacity for input queue.
    num_threads: The number of threads enqueuing examples.
    read_batch_size: An int or scalar `Tensor` specifying the number of
      records to read at once.
    filter_fn: Filtering function, takes both keys as well `Example` Tensors
      and returns a boolean mask of the same shape as the input Tensors to
      be applied for filtering. If `None`, no filtering is done.
    parse_fn: Parsing function, takes `Example` Tensor returns parsed
      representation. If `None`, no parsing is done.
    setup_shared_queue: Whether to set up a shared queue for file names.
    name: Name of resulting op.
    seed: An integer (optional). Seed used if randomize_input == True.

  Returns:
    Returns tuple of:
    - `Tensor` of string keys.
    - String `Tensor` of batched `Example` proto.

  Raises:
    ValueError: for invalid inputs.
  """
    # Retrieve files to read.
    file_names = _get_file_names(file_pattern, randomize_input)

    # Check input parameters are given and reasonable.
    if (not queue_capacity) or (queue_capacity <= 0):
        raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
    if (batch_size is None) or (
        (not isinstance(batch_size, ops.Tensor)) and
        (batch_size <= 0 or batch_size >= queue_capacity)):
        raise ValueError('Invalid batch_size %s, with queue_capacity %s.' %
                         (batch_size, queue_capacity))
    if (read_batch_size is None) or (
        (not isinstance(read_batch_size, ops.Tensor)) and
        (read_batch_size <= 0)):
        raise ValueError('Invalid read_batch_size %s.' % read_batch_size)
    if (not num_threads) or (num_threads <= 0):
        raise ValueError('Invalid num_threads %s.' % num_threads)
    if (num_epochs is not None) and (num_epochs <= 0):
        raise ValueError('Invalid num_epochs %s.' % num_epochs)

    with ops.name_scope(name, 'read_batch_examples', [file_pattern]) as scope:
        with ops.name_scope('file_name_queue') as file_name_queue_scope:
            if setup_shared_queue:
                file_name_queue = data_flow_ops.FIFOQueue(
                    capacity=1, dtypes=[dtypes.string], shapes=[[]])
                enqueue_op = file_name_queue.enqueue(
                    input_pipeline_ops.seek_next(file_names,
                                                 shuffle=randomize_input,
                                                 num_epochs=num_epochs,
                                                 seed=seed))
                queue_runner.add_queue_runner(
                    queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
            else:
                file_name_queue = input_ops.string_input_producer(
                    constant_op.constant(file_names, name='input'),
                    shuffle=randomize_input,
                    num_epochs=num_epochs,
                    name=file_name_queue_scope,
                    seed=seed)

        example_list = _get_examples(file_name_queue, reader, num_threads,
                                     read_batch_size, filter_fn, parse_fn)

        enqueue_many = read_batch_size > 1

        if num_epochs is None:
            allow_smaller_final_batch = False
        else:
            allow_smaller_final_batch = True

        # Setup batching queue given list of read example tensors.
        if randomize_input:
            if isinstance(batch_size, ops.Tensor):
                min_after_dequeue = int(queue_capacity * 0.4)
            else:
                min_after_dequeue = max(queue_capacity - (3 * batch_size),
                                        batch_size)
            queued_examples_with_keys = input_ops.shuffle_batch_join(
                example_list,
                batch_size,
                capacity=queue_capacity,
                min_after_dequeue=min_after_dequeue,
                enqueue_many=enqueue_many,
                name=scope,
                allow_smaller_final_batch=allow_smaller_final_batch,
                seed=seed)
        else:
            queued_examples_with_keys = input_ops.batch_join(
                example_list,
                batch_size,
                capacity=queue_capacity,
                enqueue_many=enqueue_many,
                name=scope,
                allow_smaller_final_batch=allow_smaller_final_batch)
        if parse_fn and isinstance(queued_examples_with_keys, dict):
            queued_keys = queued_examples_with_keys.pop(KEY_FEATURE_NAME)
            return queued_keys, queued_examples_with_keys
        return queued_examples_with_keys
Example 25
def _read_keyed_batch_examples_helper(file_pattern,
                                      batch_size,
                                      reader,
                                      randomize_input=True,
                                      num_epochs=None,
                                      queue_capacity=10000,
                                      num_threads=1,
                                      read_batch_size=1,
                                      parse_fn=None,
                                      setup_shared_queue=False,
                                      name=None):
  # Retrieve files to read.
  file_names = _get_file_names(file_pattern, randomize_input)

  # Check input parameters are given and reasonable.
  if (not queue_capacity) or (queue_capacity <= 0):
    raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
  if (batch_size is None) or (
      (not isinstance(batch_size, ops.Tensor)) and
      (batch_size <= 0 or batch_size > queue_capacity)):
    raise ValueError(
        'Invalid batch_size %s, with queue_capacity %s.' %
        (batch_size, queue_capacity))
  if (read_batch_size is None) or (
      (not isinstance(read_batch_size, ops.Tensor)) and
      (read_batch_size <= 0)):
    raise ValueError('Invalid read_batch_size %s.' % read_batch_size)
  if (not num_threads) or (num_threads <= 0):
    raise ValueError('Invalid num_threads %s.' % num_threads)
  if (num_epochs is not None) and (num_epochs <= 0):
    raise ValueError('Invalid num_epochs %s.' % num_epochs)

  with ops.name_scope(name, 'read_batch_examples', [file_pattern]) as scope:
    with ops.name_scope('file_name_queue') as file_name_queue_scope:
      if setup_shared_queue:
        shared_file_name_queue = _get_shared_file_name_queue(
            file_names, randomize_input, num_epochs, file_name_queue_scope)
        file_name_queue = data_flow_ops.FIFOQueue(
            capacity=1, dtypes=[dtypes.string], shapes=[[]])
        enqueue_op = file_name_queue.enqueue(shared_file_name_queue.dequeue())
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
      else:
        file_name_queue = input_ops.string_input_producer(
            constant_op.constant(
                file_names, name='input'),
            shuffle=randomize_input,
            num_epochs=num_epochs,
            name=file_name_queue_scope)

    example_list = _get_examples(file_name_queue, reader, num_threads,
                                 read_batch_size, parse_fn)

    enqueue_many = read_batch_size > 1

    if num_epochs is not None:
      allow_smaller_final_batch = True
    else:
      allow_smaller_final_batch = False

    # Setup batching queue given list of read example tensors.
    if randomize_input:
      if isinstance(batch_size, ops.Tensor):
        min_after_dequeue = int(queue_capacity * 0.4)
      else:
        min_after_dequeue = max(queue_capacity - (3 * batch_size), batch_size)
      queued_examples_with_keys = input_ops.shuffle_batch_join(
          example_list, batch_size, capacity=queue_capacity,
          min_after_dequeue=min_after_dequeue,
          enqueue_many=enqueue_many, name=scope,
          allow_smaller_final_batch=allow_smaller_final_batch)
    else:
      queued_examples_with_keys = input_ops.batch_join(
          example_list, batch_size, capacity=queue_capacity,
          enqueue_many=enqueue_many, name=scope,
          allow_smaller_final_batch=allow_smaller_final_batch)
    if parse_fn and isinstance(queued_examples_with_keys, dict):
      queued_keys = queued_examples_with_keys.pop(KEY_FEATURE_NAME)
      return queued_keys, queued_examples_with_keys
    return queued_examples_with_keys
Example 26
def queue_parsed_features(parsed_features,
                          keys=None,
                          feature_queue_capacity=100,
                          num_enqueue_threads=2,
                          name=None):
    """Speeds up parsing by using queues to do it asynchronously.

  This function adds the tensors in `parsed_features` to a queue, which allows
  the parsing (or any other expensive op before this) to be asynchronous wrt the
  rest of the training graph. This greatly improves read latency and speeds up
  training since the data will already be parsed and ready when each step of
  training needs it.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    parsed_features: A dict of string key to `Tensor` or `SparseTensor` objects.
    keys: `Tensor` of string keys.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_enqueue_threads: Number of threads to enqueue the parsed example queue.
      Using multiple threads to enqueue the parsed example queue helps maintain
      a full queue when the subsequent computations overall are cheaper than
      parsing. In order to have predictable and repeatable order of reading and
      enqueueing, such as in prediction and evaluation mode,
      `num_enqueue_threads` should be 1.
    name: Name of resulting op.

  Returns:
    Returns tuple of:
    - `Tensor` corresponding to `keys` if provided, otherwise `None`.
    -  A dict of string key to `Tensor` or `SparseTensor` objects corresponding
       to `parsed_features`.
  Raises:
    ValueError: for invalid inputs.
  """

    args = list(parsed_features.values())
    if keys is not None:
        args += [keys]

    with ops.name_scope(name, 'queue_parsed_features', args):
        # Let's also add preprocessed tensors into the queue types for each item of
        # the queue.
        tensors_to_enqueue = []
        # Each entry contains the key, and a boolean which indicates whether the
        # tensor was a sparse tensor.
        tensors_mapping = []
        # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse
        # tensors into a queue. This could be taken care of somewhere else so others
        # can reuse it. Also, QueueBase may be extended to handle sparse tensors
        # directly.
        for key in sorted(parsed_features.keys()):
            tensor = parsed_features[key]
            if isinstance(tensor, sparse_tensor.SparseTensor):
                tensors_mapping.append((key, True))
                tensors_to_enqueue.extend(
                    [tensor.indices, tensor.values, tensor.dense_shape])
            else:
                tensors_mapping.append((key, False))
                tensors_to_enqueue.append(tensor)

        if keys is not None:
            tensors_to_enqueue.append(keys)

        queue_dtypes = [x.dtype for x in tensors_to_enqueue]
        input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity,
                                              queue_dtypes)

        # Add a summary op to debug if our feature queue is full or not.
        summary.scalar(
            'queue/parsed_features/%s/fraction_of_%d_full' %
            (input_queue.name, feature_queue_capacity),
            math_ops.cast(input_queue.size(), dtypes.float32) *
            (1. / feature_queue_capacity))

        # Use a single QueueRunner with multiple threads to enqueue so the queue is
        # always full. The threads are coordinated so the last batch will not be
        # lost.
        enqueue_ops = [
            input_queue.enqueue(tensors_to_enqueue)
            for _ in range(num_enqueue_threads)
        ]
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(
                input_queue,
                enqueue_ops,
                queue_closed_exception_types=(errors.OutOfRangeError,
                                              errors.CancelledError)))

        dequeued_tensors = input_queue.dequeue()
        if not isinstance(dequeued_tensors, list):
            # input_queue.dequeue() returns a single tensor instead of a list of
            # tensors if there is only one tensor to dequeue, which breaks the
            # assumption of a list below.
            dequeued_tensors = [dequeued_tensors]

        # Reset shapes on dequeued tensors.
        for i in range(len(tensors_to_enqueue)):
            dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

        # Recreate feature mapping according to the original dictionary.
        dequeued_parsed_features = {}
        index = 0
        for key, is_sparse_tensor in tensors_mapping:
            if is_sparse_tensor:
                # Three tensors are (indices, values, shape).
                dequeued_parsed_features[key] = sparse_tensor.SparseTensor(
                    dequeued_tensors[index], dequeued_tensors[index + 1],
                    dequeued_tensors[index + 2])
                index += 3
            else:
                dequeued_parsed_features[key] = dequeued_tensors[index]
                index += 1

        dequeued_keys = None
        if keys is not None:
            dequeued_keys = dequeued_tensors[-1]

        return dequeued_keys, dequeued_parsed_features
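A hedged sketch of how `queue_parsed_features` can sit behind `tf.parse_example`; the file name, feature spec, and read size below are illustrative placeholders.

import tensorflow as tf

# Illustrative feature spec; adjust to the actual Example schema.
feature_spec = {
    'label': tf.FixedLenFeature([], tf.int64),
    'tokens': tf.VarLenFeature(tf.string),
}

# Read a small batch of serialized Example protos from a (placeholder) file.
filename_queue = tf.train.string_input_producer(['examples.tfrecord'])
reader = tf.TFRecordReader()
keys, serialized = reader.read_up_to(filename_queue, num_records=32)

# Parse, then push the parsed tensors through the asynchronous feature queue.
parsed = tf.parse_example(serialized, feature_spec)
queued_keys, queued_features = queue_parsed_features(
    parsed, keys=keys, feature_queue_capacity=100, num_enqueue_threads=2)
# queued_features['tokens'] is a SparseTensor; queued_features['label'] is an int64 Tensor.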
Example 27
def load_dataset(model,
                 num_gpus,
                 batch_size,
                 output_dims,
                 input_dims,
                 seq_length,
                 size,
                 base_data_path,
                 dataset,
                 istraining,
                 clip_length,
                 video_offset,
                 clip_offset,
                 num_clips,
                 clip_stride,
                 video_step,
                 preproc_debugging=0,
                 shuffle_seed=0,
                 verbose=True):
    """
    Function load dataset, setup queue and read data into queue
    Args:
        :model:              tf-activity-recognition framework model object
        :num_gpus:           Number of gpus to use when training
        :batch_size:         Number of clips to load into the model each step.
        :input_dims:         Number of frames used in input
        :output_dims:        Integer number of classes in current dataset
        :seq_length:         Length of output sequence expected from LSTM
        :size:               List detailing height and width of frame
        :dataset:            Name of dataset being processed
        :base_data_path:     Full path to root directory containing datasets
        :istraining:         Boolean variable indicating training/testing phase
        :clip_length:        Length of clips to cut video into, -1 indicates using the entire video as one clip
        :clip_offset:        "none" or "random" indicating where to begin selecting video clips
        :num_clips:          Number of clips to break video into
        :clip_stride:        Number of frames that overlap between clips, 0 indicates no overlap and negative values indicate a gap of frames between clips

    Return:
        Input data tensor, label tensor and name of loaded data (video/image)
    """
    # Get a list of tfrecords file names from which to pull videos
    filenames = []
    number_of_tfrecords = 0

    for f in os.listdir(base_data_path):
        filenames.append(os.path.join(base_data_path, f))
        number_of_tfrecords += 1

    # END FOR

    if verbose:
        print "Number of records available: ", number_of_tfrecords

    # END IF

    # Create Queue which will read in videos num_gpus at a time (Queue seeded for repeatability of experiments)
    tfrecord_file_queue = tf.train.string_input_producer(filenames,
                                                         shuffle=istraining,
                                                         name='file_q',
                                                         seed=shuffle_seed)

    # Errors occurring in a model's preprocessing function are not properly traced back when using 'clip_q'.
    # If an error occurs stating that "fifo_queue has insufficient elements", then set '--preprocDebugging 1'
    # For debugging, a batch_size other than 1 will cause instability
    if preproc_debugging:
        input_data_tensor, labels_tensor, names_tensor, video_step_tensor, alpha_tensor = _load_video(
            model, output_dims, input_dims, seq_length, size, base_data_path,
            dataset, istraining, clip_length, video_offset, clip_offset,
            num_clips, clip_stride, tfrecord_file_queue, video_step)

    else:
        tf.set_random_seed(
            0
        )  # To ensure the numbers are generated for temporal offset consistently

        # Number of threads to be used
        thread_count = 1

        # Initialize queue that will contain multiple clips of the format [[clip_frame_count, height, width, channels], [labels_copied_seqLength], [name_of_video]]
        clip_q = tf.FIFOQueue(
            num_gpus * batch_size * thread_count,
            dtypes=[tf.float32, tf.int32, tf.string, tf.float32, tf.float32],
            shapes=[[input_dims, size[0], size[1], 3], [seq_length], [], [],
                    []])

        # Attempts to load num_gpus*batch_size number of clips into queue, if there exist too many clips in a video then this function blocks until the clips are dequeued
        enqueue_op = clip_q.enqueue_many(
            _load_video(model, output_dims, input_dims, seq_length, size,
                        base_data_path, dataset, istraining, clip_length,
                        video_offset, clip_offset, num_clips, clip_stride,
                        tfrecord_file_queue, video_step))

        # Initialize the queuerunner and add it to the collection, this becomes initialized in train_test_TFRecords_multigpu_model.py after the Session is begun
        qr = tf.train.QueueRunner(clip_q,
                                  [enqueue_op] * num_gpus * thread_count)
        queue_runner.add_queue_runner(qr)

        # Dequeue the required number of clips so that each gpu contains batch_size clips
        input_data_tensor, labels_tensor, names_tensor, video_step_tensor, alpha_tensor = clip_q.dequeue_many(
            num_gpus * batch_size)

    # END IF

    # Track the scalar value defined in a model's preprocessing function in a class variable called 'store_alpha'
    if hasattr(model, 'store_alpha'):
        model.store_alpha = alpha_tensor
        model.add_track_variables('Parameterization_Variables',
                                  model.store_alpha)

    # END IF

    return input_data_tensor, labels_tensor, names_tensor
Example 28
def _enqueue_join(queue, tensor_list_list, enqueue_many):
  if enqueue_many:
    enqueue_ops = [queue.enqueue_many(tl) for tl in tensor_list_list]
  else:
    enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list]
  queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
Example 29
def _enqueue_join(queue, tensor_list_list, enqueue_many):
    if enqueue_many:
        enqueue_ops = [queue.enqueue_many(tl) for tl in tensor_list_list]
    else:
        enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list]
    queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
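A small sketch of the join pattern this helper encapsulates: several reader pipelines feed one output queue, with one enqueue op per reader handled by a single `QueueRunner`. The file names are placeholders and `_enqueue_join` is assumed to be the helper above.

import tensorflow as tf

# Placeholder input files; each reader pulls work items from the same queue.
filename_queue = tf.train.string_input_producer(['part-0.tfrecord',
                                                 'part-1.tfrecord'])
readers = [tf.TFRecordReader() for _ in range(4)]
tensor_list_list = [list(r.read(filename_queue)) for r in readers]

# One shared output queue of (key, value) string pairs.
joined_queue = tf.FIFOQueue(capacity=256,
                            dtypes=[tf.string, tf.string],
                            shapes=[[], []])
_enqueue_join(joined_queue, tensor_list_list, enqueue_many=False)

key, value = joined_queue.dequeue()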
Example 30
def read_keyed_batch_features(file_pattern,
                              batch_size,
                              features,
                              reader,
                              randomize_input=True,
                              num_epochs=None,
                              queue_capacity=10000,
                              reader_num_threads=1,
                              feature_queue_capacity=100,
                              num_queue_runners=2,
                              parser_num_threads=None,
                              name=None):
  """Adds operations to read, queue, batch and parse `Example` protos.

  Given file pattern (or list of files), will setup a queue for file names,
  read `Example` proto using provided `reader`, use batch queue to create
  batches of examples of size `batch_size` and parse example given `features`
  specification.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    file_pattern: List of files or pattern of file paths containing
        `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    reader: A function or class that returns an object with
      `read` method, (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. NOTE - If specified,
      creates a variable that must be initialized, so call
      tf.initialize_local_variables() as shown in the tests.
    queue_capacity: Capacity for input queue.
    reader_num_threads: The number of threads to read examples.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_queue_runners: Number of queue runners to start for the feature queue.
      Adding multiple queue runners for the parsed example queue helps maintain
      a full queue when the subsequent computations overall are cheaper than
      parsing.
    parser_num_threads: (Deprecated) The number of threads to parse examples.
    name: Name of resulting op.

  Returns:
    A tuple of:
    - String `Tensor` of keys.
    - A dict of `Tensor` or `SparseTensor` objects for each in `features`.

  Raises:
    ValueError: for invalid inputs.
  """

  if parser_num_threads:
    # TODO(sibyl-Aix6ihai): Remove on Sept 3 2016.
    logging.warning('parser_num_threads is deprecated, it will be removed on '
                    'Sept 3 2016')
  with ops.name_scope(name, 'read_batch_features', [file_pattern]) as scope:
    keys, examples = read_keyed_batch_examples(
        file_pattern, batch_size, reader, randomize_input=randomize_input,
        num_epochs=num_epochs, queue_capacity=queue_capacity,
        num_threads=reader_num_threads, read_batch_size=batch_size,
        name=scope)

    # Parse the example.
    feature_map = parsing_ops.parse_example(examples, features)

    # Let's also add preprocessed tensors into the queue types for each item of
    # the queue.
    tensors_to_enqueue = []
    # Each entry contains the key, and a boolean which indicates whether the
    # tensor was a sparse tensor.
    tensors_mapping = []
    # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse
    # tensors into a queue. This could be taken care of somewhere else so others
    # can reuse it. Also, QueueBase may be extended to handle sparse tensors
    # directly.
    for key in sorted(feature_map.keys()):
      tensor = feature_map[key]
      if isinstance(tensor, ops.SparseTensor):
        tensors_mapping.append((key, True))
        tensors_to_enqueue.extend([tensor.indices, tensor.values, tensor.shape])
      else:
        tensors_mapping.append((key, False))
        tensors_to_enqueue.append(tensor)
    tensors_to_enqueue.append(keys)

    queue_dtypes = [x.dtype for x in tensors_to_enqueue]
    input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity, queue_dtypes)

    # Add a summary op to debug if our feature queue is full or not.
    logging_ops.scalar_summary('queue/parsed_features/%s/fraction_of_%d_full' %
                               (input_queue.name, feature_queue_capacity),
                               math_ops.cast(input_queue.size(), dtypes.float32)
                               * (1. / feature_queue_capacity))

    # Add multiple queue runners so that the queue is always full. Adding more
    # than two queue-runners may hog the cpu on the worker to fill up the queue.
    for _ in range(num_queue_runners):
      queue_runner.add_queue_runner(
          queue_runner.QueueRunner(input_queue, [input_queue.enqueue(
              tensors_to_enqueue)]))

    dequeued_tensors = input_queue.dequeue()

    # Reset shapes on dequeued tensors.
    for i in range(len(tensors_to_enqueue)):
      dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

    # Recreate feature mapping according to the original dictionary.
    dequeued_feature_map = {}
    index = 0
    for key, is_sparse_tensor in tensors_mapping:
      if is_sparse_tensor:
        # Three tensors are (indices, values, shape).
        dequeued_feature_map[key] = ops.SparseTensor(
            dequeued_tensors[index], dequeued_tensors[index + 1],
            dequeued_tensors[index + 2])
        index += 3
      else:
        dequeued_feature_map[key] = dequeued_tensors[index]
        index += 1
    dequeued_keys = dequeued_tensors[-1]

    return dequeued_keys, dequeued_feature_map
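A hedged call sketch for the function above; the file pattern and feature spec are invented for illustration.

import tensorflow as tf

feature_spec = {
    'age': tf.FixedLenFeature([], tf.int64),
    'query': tf.VarLenFeature(tf.string),
}

keys, features = read_keyed_batch_features(
    file_pattern='examples-*.tfrecord',   # placeholder pattern
    batch_size=64,
    features=feature_spec,
    reader=tf.TFRecordReader,
    randomize_input=True,
    num_epochs=1,
    queue_capacity=10000,
    reader_num_threads=2)

# features['age'] is an int64 Tensor of shape [64];
# features['query'] is a SparseTensor batched the same way.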
Example 31
def read_keyed_batch_features(file_pattern,
                              batch_size,
                              features,
                              reader,
                              randomize_input=True,
                              num_epochs=None,
                              queue_capacity=10000,
                              reader_num_threads=1,
                              feature_queue_capacity=100,
                              num_queue_runners=2,
                              parser_num_threads=None,
                              name=None):
    """Adds operations to read, queue, batch and parse `Example` protos.

  Given file pattern (or list of files), will setup a queue for file names,
  read `Example` proto using provided `reader`, use batch queue to create
  batches of examples of size `batch_size` and parse example given `features`
  specification.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    file_pattern: List of files or pattern of file paths containing
        `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    features: A `dict` mapping feature keys to `FixedLenFeature` or
      `VarLenFeature` values.
    reader: A function or class that returns an object with
      `read` method, (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If None, cycles through the dataset forever. NOTE - If specified,
      creates a variable that must be initialized, so call
      tf.initialize_local_variables() as shown in the tests.
    queue_capacity: Capacity for input queue.
    reader_num_threads: The number of threads to read examples.
    feature_queue_capacity: Capacity of the parsed features queue.
    num_queue_runners: Number of queue runners to start for the feature queue.
      Adding multiple queue runners for the parsed example queue helps maintain
      a full queue when the subsequent computations overall are cheaper than
      parsing.
    parser_num_threads: (Deprecated) The number of threads to parse examples.
    name: Name of resulting op.

  Returns:
    A tuple of:
    - String `Tensor` of keys.
    - A dict of `Tensor` or `SparseTensor` objects for each in `features`.

  Raises:
    ValueError: for invalid inputs.
  """

    if parser_num_threads:
        # TODO(sibyl-Aix6ihai): Remove on Sept 3 2016.
        logging.warning(
            'parser_num_threads is deprecated, it will be removed on '
            'Sept 3 2016')
    with ops.op_scope([file_pattern], name, 'read_batch_features') as scope:
        keys, examples = read_keyed_batch_examples(
            file_pattern,
            batch_size,
            reader,
            randomize_input=randomize_input,
            num_epochs=num_epochs,
            queue_capacity=queue_capacity,
            num_threads=reader_num_threads,
            read_batch_size=batch_size,
            name=scope)

        # Parse the example.
        feature_map = parsing_ops.parse_example(examples, features)

        # Let's also add preprocessed tensors into the queue types for each item of
        # the queue.
        tensors_to_enqueue = []
        # Each entry contains the key, and a boolean which indicates whether the
        # tensor was a sparse tensor.
        tensors_mapping = []
        # TODO(sibyl-Aix6ihai): Most of the functionality here is about pushing sparse
        # tensors into a queue. This could be taken care of somewhere else so others
        # can reuse it. Also, QueueBase may be extended to handle sparse tensors
        # directly.
        for key, tensor in feature_map.iteritems():
            if isinstance(tensor, ops.SparseTensor):
                tensors_mapping.append((key, True))
                tensors_to_enqueue.extend(
                    [tensor.indices, tensor.values, tensor.shape])
            else:
                tensors_mapping.append((key, False))
                tensors_to_enqueue.append(tensor)
        tensors_to_enqueue.append(keys)

        queue_dtypes = [x.dtype for x in tensors_to_enqueue]
        input_queue = data_flow_ops.FIFOQueue(feature_queue_capacity,
                                              queue_dtypes)

        # Add a summary op to debug if our feature queue is full or not.
        logging_ops.scalar_summary(
            'queue/parsed_features/%s/fraction_of_%d_full' %
            (input_queue.name, feature_queue_capacity),
            math_ops.cast(input_queue.size(), dtypes.float32) *
            (1. / feature_queue_capacity))

        # Add multiple queue runners so that the queue is always full. Adding more
        # than two queue-runners may hog the cpu on the worker to fill up the queue.
        for _ in range(num_queue_runners):
            queue_runner.add_queue_runner(
                queue_runner.QueueRunner(
                    input_queue, [input_queue.enqueue(tensors_to_enqueue)]))

        dequeued_tensors = input_queue.dequeue()

        # Reset shapes on dequeued tensors.
        for i in range(len(tensors_to_enqueue)):
            dequeued_tensors[i].set_shape(tensors_to_enqueue[i].get_shape())

        # Recreate feature mapping according to the original dictionary.
        dequeued_feature_map = {}
        index = 0
        for key, is_sparse_tensor in tensors_mapping:
            if is_sparse_tensor:
                # Three tensors are (indices, values, shape).
                dequeued_feature_map[key] = ops.SparseTensor(
                    dequeued_tensors[index], dequeued_tensors[index + 1],
                    dequeued_tensors[index + 2])
                index += 3
            else:
                dequeued_feature_map[key] = dequeued_tensors[index]
                index += 1
        dequeued_keys = dequeued_tensors[-1]

        return dequeued_keys, dequeued_feature_map
Example 32
def bucket(tensors,
           which_bucket,
           batch_size,
           num_buckets,
           num_threads=1,
           capacity=32,
           shapes=None,
           dynamic_pad=False,
           allow_smaller_final_batch=False,
           keep_input=None,
           shared_name=None,
           name=None):
  """Lazy bucketing of input tensors according to `which_bucket`.

  The argument `tensors` can be a list or a dictionary of tensors.
  The value returned by the function will be of the same type
  as `tensors`.

  The tensors entering this function are put into the bucket given by
  `which_bucket`.  Each bucket has its own queue.  When a bucket contains
  `batch_size` elements, this minibatch is pushed onto a top queue.  The
  tensors returned from this function are the result of dequeueing the
  next minibatch from this top queue.

  This function is implemented using several queues. A `QueueRunner` for the
  queues is added to the current `Graph`'s `QUEUE_RUNNER` collection.

  As the returned tensors are the result of a dequeue operation, evaluating
  them will throw a `tf.errors.OutOfRangeError` when the input queue is
  exhausted.  If these tensors are feeding another input queue, its queue runner
  will catch this exception; however, if they are used in your main thread
  you are responsible for catching this yourself.

  *N.B.:* If `dynamic_pad` is `False`, you must ensure that either
  (i) the `shapes` argument is passed, or (ii) all of the tensors in
  `tensors` must have fully-defined shapes. `ValueError` will be
  raised if neither of these conditions holds.

  If `dynamic_pad` is `True`, it is sufficient that the *rank* of the
  tensors is known, but individual dimensions may have shape `None`.
  In this case, for each enqueue the dimensions with value `None`
  may have a variable length; upon dequeue, the output tensors will be padded
  on the right to the maximum shape of the tensors in the current minibatch.
  For numbers, this padding takes value 0.  For strings, this padding is
  the empty string.  See `PaddingFIFOQueue` for more info.

  If `allow_smaller_final_batch` is `True`, a smaller batch value than
  `batch_size` is returned when the queues are closed and there are not enough
  elements to fill the batch, otherwise the pending elements are discarded.
  In addition, all output tensors' static shapes, as accessed via the
  `get_shape()` method will have a 0th `Dimension` value of `None`, and
  operations that depend on fixed batch_size would fail.

  Args:
    tensors: The list or dictionary of tensors, representing a single element,
      to bucket.  Nested lists are not supported.
    which_bucket: An `int32` scalar Tensor taking a value in `[0, num_buckets)`.
    batch_size: The new batch size pulled from the queue
      (python int or int32 scalar).
    num_buckets: A python integer, the number of buckets.
    num_threads: An integer.  The number of threads enqueuing `tensors`.
    capacity: An integer. The maximum number of minibatches in the top queue,
      and also the maximum number of elements within each bucket.
    shapes: (Optional) The shapes for each example.  Defaults to the
      inferred shapes for `tensors`.
    dynamic_pad: Boolean.  Allow variable dimensions in input shapes.
      The given dimensions are padded upon dequeue so that tensors within a
      batch have the same shapes.
    allow_smaller_final_batch: (Optional) Boolean. If `True`, allow the final
      batches to be smaller if there are insufficient items left in the queues.
    keep_input: (Optional).  A `bool` scalar Tensor.  If provided, this tensor
      controls whether the input is added to the queue or not.  If it evaluates
      `True`, then `tensors` are added to the bucket; otherwise they are
      dropped.  This tensor essentially acts as a filtering mechanism.
      The default behavior is to assume `keep_input=True`.
    shared_name: (Optional). If set, the queues will be shared under the given
      name across multiple sessions.
    name: (Optional) A name for the operations.

  Returns:
    A tuple `(bucket, outputs)` where `bucket` is
    a `int32` scalar tensor and `outputs` is a list or
    dictionary of batched outputs corresponding to elements of `tensors`.
    Every step will receive a new bucket of outputs.

  Raises:
    ValueError: If the `shapes` are not specified, and cannot be
      inferred from the elements of `tensors`.
  """
  tensor_list = _as_tensor_list(tensors)
  with ops.name_scope(name, "bucket", tensor_list) as name:
    tensor_list = _validate_bucket(tensor_list)
    (tensor_list, sparse_info) = _store_sparse_tensors(
        tensor_list, enqueue_many=False)

    # Round-trip batch_size to a tensor, and possibly back
    batch_size = ops.convert_to_tensor(
        batch_size, dtype=dtypes.int32, name="batch_size")
    static_batch_size = tensor_util.constant_value(batch_size)
    batch_size = (
        static_batch_size if static_batch_size is not None else batch_size)

    types = _dtypes([tensor_list])
    shapes = _shapes([tensor_list], shapes, enqueue_many=False)

    which_bucket = ops.convert_to_tensor(
        which_bucket, dtype=dtypes.int32, name="which_bucket")

    queue_creator = _which_queue(dynamic_pad)
    bucket_queues = []
    for i in range(num_buckets):
      shared_name_i = (
          "%s_%d" % (shared_name, i) if shared_name is not None else None)
      bucket_queues.append(
          queue_creator(capacity=capacity,
                        dtypes=types,
                        shapes=shapes,
                        shared_name=shared_name_i, name="bucket_queue_%d" % i))

    maybe_static_batch_size = (
        None if allow_smaller_final_batch else static_batch_size)

    bucket_shapes = [tensor_shape.vector(maybe_static_batch_size).concatenate(s)
                     for s in bucket_queues[0].shapes]
    # top_queue is a PaddingFIFOQueue even if the bucket queues are regular FIFO
    # queues because if we use allow_smaller_final_batch, shapes will
    # contain Nones in their first entry; as a result, a regular
    # FIFOQueue would die when being passed shapes that are not fully defined.
    top_queue = data_flow_ops.PaddingFIFOQueue(
        capacity=capacity,
        dtypes=[dtypes.int32] + types,
        shapes=[tensor_shape.scalar()] + bucket_shapes,
        shared_name=shared_name, name="top_queue")

    def enqueue_which():
      def enqueue_single(i):
        return bucket_queues[i].enqueue(tensor_list)
      enqueues = [
          control_flow_ops.cond(
              math_ops.equal(which_bucket, i),
              functools.partial(enqueue_single, i),
              control_flow_ops.no_op)
          for i in range(num_buckets)]
      return control_flow_ops.group(*enqueues, name="group_enqueues")

    if keep_input is not None:
      # TODO(ebrevdo): Expand keep_input param to core training
      # methods, and pipe through to _store_sparse_tensors; so
      # that expensive serialization is guarded by keep_input.
      maybe_enqueue = control_flow_ops.cond(
          keep_input,
          enqueue_which,
          control_flow_ops.no_op)
    else:
      maybe_enqueue = enqueue_which()

    bucket_enqueue_ops = [maybe_enqueue] * num_threads

    if allow_smaller_final_batch:
      which_dequeue = lambda q: q.dequeue_up_to
    else:
      which_dequeue = lambda q: q.dequeue_many

    enqueues_to_top = [
        top_queue.enqueue(
            [constant_op.constant(i)] +
            which_dequeue(q)(batch_size, name="read_bucket_%d" % i),
            name="enqueue_from_bucket_%d" % i)
        for i, q in enumerate(bucket_queues)]

    for i, q in enumerate(bucket_queues):
      queue_runner.add_queue_runner(queue_runner.QueueRunner(
          q, [enqueues_to_top[i]],
          queue_closed_exception_types=(
              errors.OutOfRangeError, errors.CancelledError)))
    queue_runner.add_queue_runner(queue_runner.QueueRunner(
        top_queue, bucket_enqueue_ops,
        queue_closed_exception_types=(
            errors.OutOfRangeError, errors.CancelledError)))

    for q in bucket_queues:
      summary.scalar("bucket/%s/size" % q.name,
                     math_ops.cast(top_queue.size(), dtypes.float32))
    summary.scalar("bucket/%s/fraction_of_%d_full" % (top_queue.name, capacity),
                   math_ops.cast(top_queue.size(), dtypes.float32) *
                   (1. / capacity))

    dequeued = top_queue.dequeue(name="dequeue_top")
    which_bucket_dequeued = dequeued[0]
    dequeued = dequeued[1:]
    dequeued = _restore_sparse_tensors(dequeued, sparse_info)
    return (which_bucket_dequeued, _as_original_type(tensors, dequeued))
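A hedged sketch of using `bucket` to batch variable-length sequences by length; the record file, the 'ids' feature name, and the bucket boundaries are illustrative assumptions.

import tensorflow as tf

# Upstream pipeline producing one variable-length id sequence per read.
filename_queue = tf.train.string_input_producer(['sequences.tfrecord'])
reader = tf.TFRecordReader()
_, serialized = reader.read(filename_queue)
parsed = tf.parse_single_example(serialized,
                                 {'ids': tf.VarLenFeature(tf.int64)})
sequence = tf.sparse_tensor_to_dense(parsed['ids'])
length = tf.size(sequence)

# Assign each example to one of three buckets: [0, 10), [10, 20), 20+.
boundaries = tf.constant([10, 20], dtype=tf.int32)
which = tf.reduce_sum(tf.cast(tf.greater_equal(length, boundaries), tf.int32))

# dynamic_pad=True pads sequences within a bucket to a common length.
bucket_index, (padded_sequences, lengths) = bucket(
    tensors=[sequence, length],
    which_bucket=which,
    batch_size=32,
    num_buckets=3,
    dynamic_pad=True)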
Example 33
def enqueue_data(data,
                 capacity,
                 shuffle=False,
                 min_after_dequeue=None,
                 num_threads=1,
                 seed=None,
                 name="enqueue_input",
                 enqueue_size=1,
                 num_epochs=None):
    """Creates a queue filled from a numpy array or pandas `DataFrame`.

    Returns a queue filled with the rows of the given array or `DataFrame`. In
    the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to
    the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor`
    contains the row number.

  Args:
    data: a numpy `ndarray` or pandas `DataFrame` that will be read into the
      queue.
    capacity: the capacity of the queue.
    shuffle: whether or not to shuffle the rows of the array.
    min_after_dequeue: minimum number of elements that can remain in the queue
    after a dequeue operation. Only used when `shuffle` is true. If not set,
    defaults to `capacity` / 4.
    num_threads: number of threads used for reading and enqueueing.
    seed: used to seed shuffling and reader starting points.
    name: a scope name identifying the data.
    enqueue_size: the number of rows to enqueue per step.
    num_epochs: limit enqueuing to a specified number of epochs, if provided.

  Returns:
    A queue filled with the rows of the given array or `DataFrame`.

  Raises:
    TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`.
  """
    with ops.name_scope(name):
        if isinstance(data, np.ndarray):
            types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
            queue_shapes = [(), data.shape[1:]]
            get_feed_fn = _ArrayFeedFn
        elif isinstance(data, collections.OrderedDict):
            types = [dtypes.int64
                     ] + [dtypes.as_dtype(col.dtype) for col in data.values()]
            queue_shapes = [()] + [col.shape[1:] for col in data.values()]
            get_feed_fn = _OrderedDictNumpyFeedFn
        elif HAS_PANDAS and isinstance(data, pd.DataFrame):
            types = [
                dtypes.as_dtype(dt)
                for dt in [data.index.dtype] + list(data.dtypes)
            ]
            queue_shapes = [() for _ in types]
            get_feed_fn = _PandasFeedFn
        else:
            raise TypeError(
                "data must be either a numpy array or pandas DataFrame if pandas is "
                "installed; got {}".format(type(data).__name__))

        # TODO(jamieas): TensorBoard warnings for all warnings below once available.

        if num_threads > 1 and num_epochs is not None:
            logging.warning(
                "enqueue_data was called with num_epochs and num_threads > 1. "
                "num_epochs is applied per thread, so this will produce more "
                "epochs than you probably intend. "
                "If you want to limit epochs, use one thread.")

        if shuffle and num_threads > 1 and num_epochs is not None:
            logging.warning(
                "enqueue_data was called with shuffle=True, num_threads > 1, and "
                "num_epochs. This will create multiple threads, all reading the "
                "array/dataframe in order adding to the same shuffling queue; the "
                "results will likely not be sufficiently shuffled.")

        if not shuffle and num_threads > 1:
            logging.warning(
                "enqueue_data was called with shuffle=False and num_threads > 1. "
                "This will create multiple threads, all reading the "
                "array/dataframe in order. If you want examples read in order, use"
                " one thread; if you want multiple threads, enable shuffling.")

        if shuffle:
            min_after_dequeue = int(
                capacity /
                4 if min_after_dequeue is None else min_after_dequeue)
            queue = data_flow_ops.RandomShuffleQueue(capacity,
                                                     min_after_dequeue,
                                                     dtypes=types,
                                                     shapes=queue_shapes,
                                                     seed=seed)
        else:
            min_after_dequeue = 0  # just for the summary text
            queue = data_flow_ops.FIFOQueue(capacity,
                                            dtypes=types,
                                            shapes=queue_shapes)

        enqueue_ops = []
        feed_fns = []

        for i in range(num_threads):
            # Note the placeholders have no shapes, so they will accept any
            # enqueue_size.  enqueue_many below will break them up.
            placeholders = [array_ops.placeholder(t) for t in types]

            enqueue_ops.append(queue.enqueue_many(placeholders))
            seed_i = None if seed is None else (i + 1) * seed
            feed_fns.append(
                get_feed_fn(placeholders,
                            data,
                            enqueue_size,
                            random_start=shuffle,
                            seed=seed_i,
                            num_epochs=num_epochs))

        runner = fqr.FeedingQueueRunner(queue=queue,
                                        enqueue_ops=enqueue_ops,
                                        feed_fns=feed_fns)
        queue_runner.add_queue_runner(runner)

        full = (math_ops.cast(
            math_ops.maximum(0,
                             queue.size() - min_after_dequeue), dtypes.float32)
                * (1. / (capacity - min_after_dequeue)))
        # Note that name contains a '/' at the end so we intentionally do not place
        # a '/' after %s below.
        summary_name = (
            "queue/%sfraction_over_%d_of_%d_full" %
            (queue.name, min_after_dequeue, capacity - min_after_dequeue))
        summary.scalar(summary_name, full)
        return queue
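A short sketch of feeding a numpy array through `enqueue_data` and dequeuing batches; the array contents and sizes are arbitrary.

import numpy as np
import tensorflow as tf

data = np.random.rand(1000, 8).astype(np.float32)

# Each queue element is a (row_index, row) pair, as described above.
queue = enqueue_data(data, capacity=128, shuffle=True,
                     min_after_dequeue=32, num_threads=2, enqueue_size=16)
index_batch, row_batch = queue.dequeue_many(64)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    # Starts the FeedingQueueRunner registered by enqueue_data.
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    indices, rows = sess.run([index_batch, row_batch])  # rows.shape == (64, 8)
    coord.request_stop()
    coord.join(threads)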
Example 34
def enqueue_data(data,
                 capacity,
                 shuffle=False,
                 min_after_dequeue=None,
                 num_threads=1,
                 seed=None,
                 name="enqueue_input",
                 enqueue_size=1,
                 num_epochs=None):
  """Creates a queue filled from a numpy array or pandas `DataFrame`.

    Returns a queue filled with the rows of the given array or `DataFrame`. In
    the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to
    the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor`
    contains the row number.

  Args:
    data: a numpy `ndarray` or pandas `DataFrame` that will be read into the
      queue.
    capacity: the capacity of the queue.
    shuffle: whether or not to shuffle the rows of the array.
    min_after_dequeue: minimum number of elements that can remain in the queue
    after a dequeue operation. Only used when `shuffle` is true. If not set,
    defaults to `capacity` / 4.
    num_threads: number of threads used for reading and enqueueing.
    seed: used to seed shuffling and reader starting points.
    name: a scope name identifying the data.
    enqueue_size: the number of rows to enqueue per step.
    num_epochs: limit enqueuing to a specified number of epochs, if provided.

  Returns:
    A queue filled with the rows of the given array or `DataFrame`.

  Raises:
    TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`.
  """
  with ops.name_scope(name):
    if isinstance(data, np.ndarray):
      types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
      queue_shapes = [(), data.shape[1:]]
      get_feed_fn = _ArrayFeedFn
    elif isinstance(data, collections.OrderedDict):
      types = [dtypes.int64] + [dtypes.as_dtype(col.dtype)
                                for col in data.values()]
      queue_shapes = [()] + [col.shape[1:] for col in data.values()]
      get_feed_fn = _OrderedDictNumpyFeedFn
    elif HAS_PANDAS and isinstance(data, pd.DataFrame):
      types = [dtypes.as_dtype(dt)
               for dt in [data.index.dtype] + list(data.dtypes)]
      queue_shapes = [() for _ in types]
      get_feed_fn = _PandasFeedFn
    else:
      raise TypeError(
          "data must be either a numpy array or pandas DataFrame if pandas is "
          "installed; got {}".format(type(data).__name__))

    # TODO(jamieas): TensorBoard warnings for all warnings below once available.

    if num_threads > 1 and num_epochs is not None:
      logging.warning(
          "enqueue_data was called with num_epochs and num_threads > 1. "
          "num_epochs is applied per thread, so this will produce more "
          "epochs than you probably intend. "
          "If you want to limit epochs, use one thread.")

    if shuffle and num_threads > 1 and num_epochs is not None:
      logging.warning(
          "enqueue_data was called with shuffle=True, num_threads > 1, and "
          "num_epochs. This will create multiple threads, all reading the "
          "array/dataframe in order adding to the same shuffling queue; the "
          "results will likely not be sufficiently shuffled.")

    if not shuffle and num_threads > 1:
      logging.warning(
          "enqueue_data was called with shuffle=False and num_threads > 1. "
          "This will create multiple threads, all reading the "
          "array/dataframe in order. If you want examples read in order, use"
          " one thread; if you want multiple threads, enable shuffling.")

    if shuffle:
      min_after_dequeue = int(capacity / 4 if min_after_dequeue is None else
                              min_after_dequeue)
      queue = data_flow_ops.RandomShuffleQueue(capacity,
                                               min_after_dequeue,
                                               dtypes=types,
                                               shapes=queue_shapes,
                                               seed=seed)
    else:
      min_after_dequeue = 0  # just for the summary text
      queue = data_flow_ops.FIFOQueue(capacity,
                                      dtypes=types,
                                      shapes=queue_shapes)

    enqueue_ops = []
    feed_fns = []

    for i in range(num_threads):
      # Note the placeholders have no shapes, so they will accept any
      # enqueue_size.  enqueue_many below will break them up.
      placeholders = [array_ops.placeholder(t) for t in types]

      enqueue_ops.append(queue.enqueue_many(placeholders))
      seed_i = None if seed is None else (i + 1) * seed
      feed_fns.append(get_feed_fn(placeholders,
                                  data,
                                  enqueue_size,
                                  random_start=shuffle,
                                  seed=seed_i,
                                  num_epochs=num_epochs))

    runner = fqr.FeedingQueueRunner(queue=queue,
                                    enqueue_ops=enqueue_ops,
                                    feed_fns=feed_fns)
    queue_runner.add_queue_runner(runner)

    full = (math_ops.cast(
        math_ops.maximum(0, queue.size() - min_after_dequeue),
        dtypes.float32) * (1. / (capacity - min_after_dequeue)))
    # Note that name contains a '/' at the end so we intentionally do not place
    # a '/' after %s below.
    summary_name = ("queue/%sfraction_over_%d_of_%d_full" %
                    (queue.name, min_after_dequeue,
                     capacity - min_after_dequeue))
    summary.scalar(summary_name, full)
    return queue
Example 35
def _enqueue_join(queue, tensor_list_list):
    enqueue_ops = [queue.enqueue(tl) for tl in tensor_list_list]
    queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
Example 36
def _read_keyed_batch_examples_helper(file_pattern,
                                      batch_size,
                                      reader,
                                      randomize_input=True,
                                      num_epochs=None,
                                      queue_capacity=10000,
                                      num_threads=1,
                                      read_batch_size=1,
                                      parse_fn=None,
                                      setup_shared_queue=False,
                                      name=None):
  """Adds operations to read, queue, batch `Example` protos.

  Args:
    file_pattern: List of files or pattern of file paths containing
        `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    reader: A function or class that returns an object with
      `read` method, (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    num_epochs: Integer specifying the number of times to read through the
      dataset. If `None`, cycles through the dataset forever.
      NOTE - If specified, creates a variable that must be initialized, so call
      `tf.initialize_all_variables()` as shown in the tests.
    queue_capacity: Capacity for input queue.
    num_threads: The number of threads enqueuing examples.
    read_batch_size: An int or scalar `Tensor` specifying the number of
      records to read at once
    parse_fn: Parsing function, takes `Example` Tensor returns parsed
      representation. If `None`, no parsing is done.
    setup_shared_queue: Whether to set up a shared queue for file names.
    name: Name of resulting op.

  Returns:
    Returns tuple of:
    - `Tensor` of string keys.
    - String `Tensor` of batched `Example` proto.

  Raises:
    ValueError: for invalid inputs.
  """
  # Retrieve files to read.
  file_names = _get_file_names(file_pattern, randomize_input)

  # Check input parameters are given and reasonable.
  if (not queue_capacity) or (queue_capacity <= 0):
    raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
  if (batch_size is None) or (
      (not isinstance(batch_size, ops.Tensor)) and
      (batch_size <= 0 or batch_size > queue_capacity)):
    raise ValueError(
        'Invalid batch_size %s, with queue_capacity %s.' %
        (batch_size, queue_capacity))
  if (read_batch_size is None) or (
      (not isinstance(read_batch_size, ops.Tensor)) and
      (read_batch_size <= 0)):
    raise ValueError('Invalid read_batch_size %s.' % read_batch_size)
  if (not num_threads) or (num_threads <= 0):
    raise ValueError('Invalid num_threads %s.' % num_threads)
  if (num_epochs is not None) and (num_epochs <= 0):
    raise ValueError('Invalid num_epochs %s.' % num_epochs)

  with ops.name_scope(name, 'read_batch_examples', [file_pattern]) as scope:
    with ops.name_scope('file_name_queue') as file_name_queue_scope:
      if setup_shared_queue:
        shared_file_name_queue = _get_shared_file_name_queue(
            file_names, randomize_input, num_epochs, file_name_queue_scope)
        file_name_queue = data_flow_ops.FIFOQueue(
            capacity=1, dtypes=[dtypes.string], shapes=[[]])
        enqueue_op = file_name_queue.enqueue(shared_file_name_queue.dequeue())
        queue_runner.add_queue_runner(
            queue_runner.QueueRunner(file_name_queue, [enqueue_op]))
      else:
        file_name_queue = input_ops.string_input_producer(
            constant_op.constant(
                file_names, name='input'),
            shuffle=randomize_input,
            num_epochs=num_epochs,
            name=file_name_queue_scope)

    example_list = _get_examples(file_name_queue, reader, num_threads,
                                 read_batch_size, parse_fn)

    enqueue_many = read_batch_size > 1

    if num_epochs is None:
      allow_smaller_final_batch = False
    else:
      allow_smaller_final_batch = True

    # Setup batching queue given list of read example tensors.
    if randomize_input:
      if isinstance(batch_size, ops.Tensor):
        min_after_dequeue = int(queue_capacity * 0.4)
      else:
        min_after_dequeue = max(queue_capacity - (3 * batch_size), batch_size)
      queued_examples_with_keys = input_ops.shuffle_batch_join(
          example_list, batch_size, capacity=queue_capacity,
          min_after_dequeue=min_after_dequeue,
          enqueue_many=enqueue_many, name=scope,
          allow_smaller_final_batch=allow_smaller_final_batch)
    else:
      queued_examples_with_keys = input_ops.batch_join(
          example_list, batch_size, capacity=queue_capacity,
          enqueue_many=enqueue_many, name=scope,
          allow_smaller_final_batch=allow_smaller_final_batch)
    if parse_fn and isinstance(queued_examples_with_keys, dict):
      queued_keys = queued_examples_with_keys.pop(KEY_FEATURE_NAME)
      return queued_keys, queued_examples_with_keys
    return queued_examples_with_keys
Example 37
def enqueue_data(data,
                 capacity,
                 shuffle=False,
                 min_after_dequeue=None,
                 seed=None):
  """Creates a queue filled from a numpy array or pandas `DataFrame`.

    Returns a queue filled with the rows of the given array or `DataFrame`. In
    the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to
    the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor`
    contains the row number.

  Args:
    data: a numpy `ndarray` or pandas `DataFrame` that will be read into the
      queue.
    capacity: the capacity of the queue.
    shuffle: whether or not to shuffle the rows of the array.
    min_after_dequeue: minimum number of elements that can remain in the queue
    after a dequeue operation. Only used when `shuffle` is true. If not set,
    defaults to `capacity` / 4.
    seed: used to seed RandomShuffleQueue. Only used when `shuffle` is True.

  Returns:
    A queue filled with the rows of the given array or `DataFrame`.

  Raises:
    TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`.
  """
  # TODO(jamieas): create multithreaded version of enqueue_data.
  if isinstance(data, np.ndarray):
    types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
    shapes = [(), data.shape[1:]]
    get_feed_fn = _ArrayFeedFn
  elif HAS_PANDAS and isinstance(data, pd.DataFrame):
    types = [dtypes.as_dtype(dt)
             for dt in [data.index.dtype] + list(data.dtypes)]
    shapes = [() for _ in types]
    get_feed_fn = _PandasFeedFn
  else:
    raise TypeError(
        "data must be either a numpy array or pandas DataFrame if pandas is "
        "installed; got {}".format(
            type(data).__name__))

  placeholders = [array_ops.placeholder(*type_and_shape)
                  for type_and_shape in zip(types, shapes)]
  if shuffle:
    min_after_dequeue = int(capacity / 4 if min_after_dequeue is None else
                            min_after_dequeue)
    queue = data_flow_ops.RandomShuffleQueue(capacity,
                                             min_after_dequeue,
                                             dtypes=types,
                                             shapes=shapes,
                                             seed=seed)
  else:
    queue = data_flow_ops.FIFOQueue(capacity, dtypes=types, shapes=shapes)
  enqueue_op = queue.enqueue(placeholders)
  feed_fn = get_feed_fn(placeholders, data)
  runner = fqr.FeedingQueueRunner(queue=queue,
                                  enqueue_ops=[enqueue_op],
                                  feed_fn=feed_fn)
  queue_runner.add_queue_runner(runner)
  return queue
Example 38
def input_producer(input_tensor,
                   element_shape=None,
                   num_epochs=None,
                   shuffle=True,
                   seed=None,
                   capacity=32,
                   shared_name=None,
                   summary_name=None,
                   name=None):
    """Output the rows of `input_tensor` to a queue for an input pipeline.

  Args:
    input_tensor: A tensor with the rows to produce. Must be at least
      one-dimensional. Must either have a fully-defined shape, or
      `element_shape` must be defined.
    element_shape: (Optional.) A `TensorShape` representing the shape of a
      row of `input_tensor`, if it cannot be inferred.
    num_epochs: (Optional.) An integer. If specified `input_producer` produces
      each row of `input_tensor` `num_epochs` times before generating an
      `OutOfRange` error. If not specified, `input_producer` can cycle through
      the rows of `input_tensor` an unlimited number of times.
    shuffle: (Optional.) A boolean. If true, the rows are randomly shuffled
      within each epoch.
    seed: (Optional.) An integer. The seed to use if `shuffle` is true.
    capacity: (Optional.) The capacity of the queue to be used for buffering
      the input.
    shared_name: (Optional.) If set, this queue will be shared under the given
      name across multiple sessions.
    summary_name: (Optional.) If set, a scalar summary for the current queue
      size will be generated, using this name as part of the tag.
    name: (Optional.) A name for queue.

  Returns:
    A queue with the output rows.  A `QueueRunner` for the queue is
    added to the current `QUEUE_RUNNER` collection of the current
    graph.

  Raises:
    ValueError: If the shape of the input cannot be inferred from the arguments.
  """
  with ops.name_scope(name, "input_producer", [input_tensor]):
    input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
    element_shape = input_tensor.get_shape()[1:].merge_with(element_shape)
    if not element_shape.is_fully_defined():
      raise ValueError("Either `input_tensor` must have a fully defined shape "
                       "or `element_shape` must be specified")

    if shuffle:
      input_tensor = random_ops.random_shuffle(input_tensor, seed=seed)

    input_tensor = limit_epochs(input_tensor, num_epochs)

    q = data_flow_ops.FIFOQueue(capacity=capacity,
                                dtypes=[input_tensor.dtype.base_dtype],
                                shapes=[element_shape],
                                shared_name=shared_name,
                                name=name)
    enq = q.enqueue_many([input_tensor])
    queue_runner.add_queue_runner(queue_runner.QueueRunner(q, [enq]))
    if summary_name is not None:
      summary.scalar("queue/%s/%s" % (q.name, summary_name),
                     math_ops.cast(q.size(), dtypes.float32) * (1. / capacity))
    return q
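A minimal usage sketch, assuming the TF 1.x graph-mode API where this function is exposed as `tf.train.input_producer`: rows are dequeued one at a time once the queue runners have been started.

import tensorflow as tf  # TF 1.x graph-mode API assumed

rows = tf.constant([[1., 2.], [3., 4.], [5., 6.]])
queue = tf.train.input_producer(rows, shuffle=True, seed=42)
row = queue.dequeue()  # one (possibly shuffled) row per call

with tf.Session() as sess:
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  for _ in range(5):
    print(sess.run(row))
  coord.request_stop()
  coord.join(threads)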
Example no. 39
def _enqueue(queue, tensor_list, threads, enqueue_many):
    if enqueue_many:
        enqueue_ops = [queue.enqueue_many(tensor_list)] * threads
    else:
        enqueue_ops = [queue.enqueue(tensor_list)] * threads
    queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
Example no. 40
def _enqueue(queue, tensor_list, threads, enqueue_many):
  if enqueue_many:
    enqueue_ops = [queue.enqueue_many(tensor_list)] * threads
  else:
    enqueue_ops = [queue.enqueue(tensor_list)] * threads
  queue_runner.add_queue_runner(queue_runner.QueueRunner(queue, enqueue_ops))
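A rough usage sketch for the `_enqueue` helper above. The queue construction here is an assumption for illustration; the surrounding batching code that normally builds it is not part of this example.

import tensorflow as tf  # TF 1.x graph-mode API assumed
from tensorflow.python.ops import data_flow_ops

image = tf.random_uniform([28, 28])           # a single example
label = tf.constant(1, dtype=tf.int64)
tensor_list = [image, label]

# Queue dtypes/shapes must match the tensors being enqueued.
queue = data_flow_ops.FIFOQueue(
    capacity=32,
    dtypes=[t.dtype for t in tensor_list],
    shapes=[t.get_shape() for t in tensor_list])

# Register four threads all running the same enqueue op; enqueue_many=False
# because each tensor holds a single example rather than a batch.
_enqueue(queue, tensor_list, threads=4, enqueue_many=False)

images, labels = queue.dequeue_many(8)        # assemble a batch of 8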
Example no. 41
def enqueue_data(data,
                 capacity,
                 shuffle=False,
                 min_after_dequeue=None,
                 seed=None):
  """Creates a queue filled from a numpy array or pandas `DataFrame`.

    Returns a queue filled with the rows of the given array or `DataFrame`. In
    the case of a pandas `DataFrame`, the first enqueued `Tensor` corresponds to
    the index of the `DataFrame`. For numpy arrays, the first enqueued `Tensor`
    contains the row number.

  Args:
    data: a numpy `ndarray or` pandas `DataFrame` that will be read into the
      queue.
    capacity: the capacity of the queue.
    shuffle: whether or not to shuffle the rows of the array.
    min_after_dequeue: minimum number of elements that can remain in the queue
    after a dequeue operation. Only used when `shuffle` is true. If not set,
    defaults to `capacity` / 4.
    seed: used to seed RandomShuffleQueue. Only used when `shuffle` is True.

  Returns:
    A queue filled with the rows of the given array or `DataFrame`.

  Raises:
    TypeError: `data` is not a Pandas `DataFrame` or a numpy `ndarray`.
  """
  # TODO(jamieas): create multithreaded version of enqueue_data.
  if isinstance(data, np.ndarray):
    types = [dtypes.int64, dtypes.as_dtype(data.dtype)]
    shapes = [(), data.shape[1:]]
    get_feed_fn = _ArrayFeedFn
  elif HAS_PANDAS and isinstance(data, pd.DataFrame):
    types = [dtypes.as_dtype(dt)
             for dt in [data.index.dtype] + list(data.dtypes)]
    shapes = [() for _ in types]
    get_feed_fn = _PandasFeedFn
  else:
    raise TypeError(
        "data must be either a numpy array or pandas DataFrame if pandas is "
        "installed; got {}".format(
            type(data).__name__))

  placeholders = [array_ops.placeholder(*type_and_shape)
                  for type_and_shape in zip(types, shapes)]
  if shuffle:
    min_after_dequeue = int(capacity / 4 if min_after_dequeue is None else
                            min_after_dequeue)
    queue = data_flow_ops.RandomShuffleQueue(capacity,
                                             min_after_dequeue,
                                             dtypes=types,
                                             shapes=shapes,
                                             seed=seed)
  else:
    queue = data_flow_ops.FIFOQueue(capacity, dtypes=types, shapes=shapes)
  enqueue_op = queue.enqueue(placeholders)
  feed_fn = get_feed_fn(placeholders, data)
  runner = fqr.FeedingQueueRunner(queue=queue,
                                  enqueue_ops=[enqueue_op],
                                  feed_fn=feed_fn)
  queue_runner.add_queue_runner(runner)
  return queue
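And a usage sketch for `enqueue_data` as defined above, assuming a TF 1.x graph-mode session; once the queue runners are started, the registered `FeedingQueueRunner` feeds the placeholders from the array on a background thread.

import numpy as np
import tensorflow as tf  # TF 1.x graph-mode API assumed

data = np.arange(12, dtype=np.float32).reshape(6, 2)   # six rows of two values
queue = enqueue_data(data, capacity=16, shuffle=True, seed=0)
index, row = queue.dequeue()   # first component is the row number

with tf.Session() as sess:
  coord = tf.train.Coordinator()
  threads = tf.train.start_queue_runners(sess=sess, coord=coord)
  for _ in range(3):
    print(sess.run([index, row]))
  coord.request_stop()
  coord.join(threads)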