Example #1
    def _fn():
      num_rows = np.shape(np_matrix)[0]
      num_cols = np.shape(np_matrix)[1]
      row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
      col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
      sp_mat = self.np_array_to_sparse(np_matrix)
      sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
      row_batch = input_lib.batch(
          [row_ids, sp_mat],
          batch_size=min(batch_size, num_rows),
          capacity=10,
          enqueue_many=True)
      col_batch = input_lib.batch(
          [col_ids, sp_mat_t],
          batch_size=min(batch_size, num_cols),
          capacity=10,
          enqueue_many=True)

      features = extract_features(row_batch, col_batch, sp_mat.dense_shape)
      if projection_weights is not None:
        weights_batch = input_lib.batch(
            projection_weights,
            batch_size=batch_size,
            capacity=10,
            enqueue_many=True)
        features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
            weights_batch)
      if project_row is not None:
        features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
            constant_op.constant(project_row))

      labels = None
      return features, labels
Example #2
        def _fn():
            num_rows = np.shape(np_matrix)[0]
            num_cols = np.shape(np_matrix)[1]
            row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
            col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
            sp_mat = self.np_array_to_sparse(np_matrix)
            sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
            row_batch = input_lib.batch([row_ids, sp_mat],
                                        batch_size=min(batch_size, num_rows),
                                        capacity=10,
                                        enqueue_many=True)
            col_batch = input_lib.batch([col_ids, sp_mat_t],
                                        batch_size=min(batch_size, num_cols),
                                        capacity=10,
                                        enqueue_many=True)

            features = extract_features(row_batch, col_batch,
                                        sp_mat.dense_shape)
            if projection_weights is not None:
                weights_batch = input_lib.batch(projection_weights,
                                                batch_size=batch_size,
                                                capacity=10,
                                                enqueue_many=True)
                features[wals_lib.WALSMatrixFactorization.
                         PROJECTION_WEIGHTS] = (weights_batch)
            if project_row is not None:
                features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
                    constant_op.constant(project_row))

            labels = None
            return features, labels
Example #3
    def testGeneratorWorksWithBatching(self):
        def simple_generator():
            for i in range(5):
                yield {"value": i, "ignored": 3}

        simple_features = {
            "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
        }
        tensors = python_input.python_input(simple_generator, simple_features)

        # Request batches of size 4 at a time, the final batch may be smaller.
        batched_tensors = core_input.batch(tensors,
                                           batch_size=4,
                                           allow_smaller_final_batch=True)

        self.assertEqual(["value"], batched_tensors.keys())
        self.assertEqual(dtypes.int32, batched_tensors["value"].dtype)
        self.assertEqual([None], batched_tensors["value"].shape.as_list())

        with self.test_session() as sess:
            # The generator emits 5 items total.  The first 4 are returned in
            # the first session run; the final one is returned in the
            # second.  This works because allow_smaller_final_batch=True.
            coord = coordinator.Coordinator()
            threads = queue_runner_impl.start_queue_runners(sess=sess,
                                                            coord=coord)
            r1 = sess.run(batched_tensors)
            r2 = sess.run(batched_tensors)
            self.assertAllEqual([0, 1, 2, 3], r1["value"])
            self.assertEqual([4], r2["value"])
            with self.assertRaisesOpError("Iteration finished"):
                sess.run(tensors)
            coord.request_stop()
            for thread in threads:
                thread.join()
Example #4
  def testGeneratorWorksWithBatching(self):
    def simple_generator():
      for i in range(5):
        yield {"value": i, "ignored": 3}

    simple_features = {
        "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
    }
    tensors = python_input.python_input(simple_generator, simple_features)

    # Request batches of size 4 at a time, the final batch may be smaller.
    batched_tensors = core_input.batch(tensors, batch_size=4,
                                       allow_smaller_final_batch=True)

    self.assertEqual(["value"], batched_tensors.keys())
    self.assertEqual(dtypes.int32, batched_tensors["value"].dtype)
    self.assertEqual([None], batched_tensors["value"].shape.as_list())

    with self.test_session() as sess:
      # The generator emits 5 items total.  The first 4 are returned in
      # the first session run; the final one is returned in the
      # second.  This works because allow_smaller_final_batch=True.
      coord = coordinator.Coordinator()
      threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
      r1 = sess.run(batched_tensors)
      r2 = sess.run(batched_tensors)
      self.assertAllEqual([0, 1, 2, 3], r1["value"])
      self.assertEqual([4], r2["value"])
      with self.assertRaisesOpError("Iteration finished"):
        sess.run(tensors)
      coord.request_stop()
      for thread in threads:
        thread.join()
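The two variants above both depend on allow_smaller_final_batch=True: the five generated items come back as one full batch of four followed by a final batch of one. A toy, pure-Python sketch of that grouping (illustrative only, not the queue-based implementation used by core_input.batch):

def batch_values(values, batch_size, allow_smaller_final_batch=True):
    """Group a list of values into consecutive batches."""
    batches = [values[i:i + batch_size]
               for i in range(0, len(values), batch_size)]
    if not allow_smaller_final_batch and batches and len(batches[-1]) < batch_size:
        batches.pop()  # Drop the incomplete final batch.
    return batches

# Five items with batch_size=4 -> [[0, 1, 2, 3], [4]], matching r1 and r2 above.
print(batch_values(list(range(5)), 4))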
Example #5
 def fn(tensors, scope):
     return input.batch(tensors,
                        batch_size=batch_size,
                        num_threads=num_threads,
                        capacity=capacity,
                        enqueue_many=enqueue_many,
                        allow_smaller_final_batch=allow_smaller_final_batch,
                        name=scope)
Example #6
 def fn(tensors, scope):
   return input.batch(tensors,
                      batch_size=batch_size,
                      num_threads=num_threads,
                      capacity=capacity,
                      enqueue_many=enqueue_many,
                      allow_smaller_final_batch=allow_smaller_final_batch,
                      name=scope)
Example #7
 def _apply_transform(self, transform_input):
   batched = input_ops.batch(transform_input,
                             batch_size=self.batch_size,
                             num_threads=self.num_threads,
                             capacity=self.queue_capacity,
                             enqueue_many=True)
   # TODO(jamieas): batch will soon return a list regardless of the number of
   # enqueued tensors. Remove the following once that change is in place.
   if not isinstance(batched, (tuple, list)):
     batched = (batched,)
   # pylint: disable=not-callable
   return self.return_type(*batched)
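The final return assumes self.return_type is a namedtuple-like class over the transform's output names, so the batched tensors can be unpacked by position. A minimal sketch of that pattern (the names here are hypothetical, not the actual dataframe transform API):

import collections

# Hypothetical output schema; in the library the class is built by the transform itself.
BatchOutput = collections.namedtuple("BatchOutput", ["features", "labels"])

batched = (["f0", "f1"], [0, 1])  # Stand-in for the tensors returned by batch().
result = BatchOutput(*batched)    # Unpack the batched outputs by position.
print(result.features, result.labels)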
Example #8
    def testDynamicPad(self):
        with self.cached_session() as sess:
            # Create 3 tensors of variable but compatible shapes.
            var_shape = [None, 2]
            p1 = constant_op.constant([[1, 2], [3, 4]])
            p1.set_shape(var_shape)
            p2 = constant_op.constant([[5, 6], [7, 8], [9, 10]])
            p2.set_shape(var_shape)
            p3 = constant_op.constant([[11, 12]])
            p3.set_shape(var_shape)
            batch = [p1, p2, p3]
            batch_size = len(batch)

            zero64 = constant_op.constant(0, dtype=dtypes.int64)
            examples = variables.Variable(zero64)
            counter = examples.count_up_to(batch_size)

            # Create a PaddingFIFOQueue to enqueue these tensors.
            q = data_flow_ops.PaddingFIFOQueue(capacity=10,
                                               dtypes=[dtypes.int32],
                                               shapes=[var_shape])
            for tensor in [p1, p2, p3]:
                q.enqueue([tensor]).run()

            # Dequeue from the queue and batch them using batch().
            batches = input_lib.batch([q.dequeue(), counter],
                                      batch_size=batch_size,
                                      num_threads=1,
                                      dynamic_pad=True)
            self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

            # Finally, assemble them into prefetch_queue with dynamic_pad.
            batcher = prefetch_queue.prefetch_queue(batches, dynamic_pad=True)
            batches = batcher.dequeue()
            self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

            variables.global_variables_initializer().run()
            threads = queue_runner_impl.start_queue_runners()

            values, _ = sess.run(batches)
            # We enqueued 3 tensors of [None, 2] shapes, so using dynamic_pad
            # they should be padded to the fixed size [3, 3, 2], where 3
            # is the maximum length of the batch.
            self.assertTrue(
                np.array_equal(
                    np.array([[[1, 2], [3, 4], [0, 0]],
                              [[5, 6], [7, 8], [9, 10]],
                              [[11, 12], [0, 0], [0, 0]]]), values))

            with self.assertRaises(errors_impl.OutOfRangeError):
                sess.run(batches)
            for thread in threads:
                thread.join()
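With dynamic_pad=True, each variable-length tensor in the batch is zero-padded up to the longest element before stacking, which is what produces the [3, 3, 2] result checked above. A numpy sketch of that padding (illustrative only, not the PaddingFIFOQueue implementation):

import numpy as np

def dynamic_pad(arrays):
    """Zero-pad a list of [n_i, 2] arrays to the longest n_i in the batch."""
    max_len = max(a.shape[0] for a in arrays)
    padded = np.zeros((len(arrays), max_len, arrays[0].shape[1]),
                      dtype=arrays[0].dtype)
    for i, a in enumerate(arrays):
        padded[i, :a.shape[0]] = a
    return padded

batch = [np.array([[1, 2], [3, 4]]),
         np.array([[5, 6], [7, 8], [9, 10]]),
         np.array([[11, 12]])]
print(dynamic_pad(batch))  # Shape (3, 3, 2), matching the expected values above.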
Example #9
 def _apply_transform(self, transform_input, **kwargs):
   batched = input_ops.batch(transform_input,
                             batch_size=self.batch_size,
                             num_threads=self.num_threads,
                             capacity=self.queue_capacity,
                             enqueue_many=True)
   # TODO(jamieas): batch will soon return a list regardless of the number of
   # enqueued tensors. Remove the following once that change is in place.
   if not isinstance(batched, (tuple, list)):
     batched = (batched,)
   # pylint: disable=not-callable
   return self.return_type(*batched)
Example #10
    def _fn():
      num_rows = np.shape(np_matrix)[0]
      num_cols = np.shape(np_matrix)[1]
      row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
      col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
      sp_mat = self.np_array_to_sparse(np_matrix)
      sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
      row_batch = input_lib.batch(
          [row_ids, sp_mat],
          batch_size=min(batch_size, num_rows),
          capacity=10,
          enqueue_many=True)
      col_batch = input_lib.batch(
          [col_ids, sp_mat_t],
          batch_size=min(batch_size, num_cols),
          capacity=10,
          enqueue_many=True)

      features = extract_features(row_batch, col_batch, num_rows, num_cols)

      if mode == model_fn.ModeKeys.INFER or mode == model_fn.ModeKeys.EVAL:
        self.assertTrue(
            project_row is not None,
            msg='project_row must be specified in INFER or EVAL mode.')
        features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
            constant_op.constant(project_row))

      if mode == model_fn.ModeKeys.INFER and projection_weights is not None:
        weights_batch = input_lib.batch(
            projection_weights,
            batch_size=batch_size,
            capacity=10,
            enqueue_many=True)
        features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
            weights_batch)

      labels = None
      return features, labels
Example #11
    def _fn():
      num_rows = np.shape(np_matrix)[0]
      num_cols = np.shape(np_matrix)[1]
      row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
      col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
      sp_mat = self.np_array_to_sparse(np_matrix)
      sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
      row_batch = input_lib.batch(
          [row_ids, sp_mat],
          batch_size=min(batch_size, num_rows),
          capacity=10,
          enqueue_many=True)
      col_batch = input_lib.batch(
          [col_ids, sp_mat_t],
          batch_size=min(batch_size, num_cols),
          capacity=10,
          enqueue_many=True)

      features = extract_features(row_batch, col_batch, num_rows, num_cols)

      if mode == model_fn.ModeKeys.INFER or mode == model_fn.ModeKeys.EVAL:
        self.assertTrue(
            project_row is not None,
            msg='project_row must be specified in INFER or EVAL mode.')
        features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
            constant_op.constant(project_row))

      if mode == model_fn.ModeKeys.INFER and projection_weights is not None:
        weights_batch = input_lib.batch(
            projection_weights,
            batch_size=batch_size,
            capacity=10,
            enqueue_many=True)
        features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
            weights_batch)

      labels = None
      return features, labels
Example #12
  def testDynamicPad(self):
    with self.test_session() as sess:
      # Create 3 tensors of variable but compatible shapes.
      var_shape = [None, 2]
      p1 = constant_op.constant([[1, 2], [3, 4]])
      p1.set_shape(var_shape)
      p2 = constant_op.constant([[5, 6], [7, 8], [9, 10]])
      p2.set_shape(var_shape)
      p3 = constant_op.constant([[11, 12]])
      p3.set_shape(var_shape)
      batch = [p1, p2, p3]
      batch_size = len(batch)

      zero64 = constant_op.constant(0, dtype=dtypes.int64)
      examples = variables.Variable(zero64)
      counter = examples.count_up_to(batch_size)

      # Create a PaddingFIFOQueue to enqueue these tensors.
      q = data_flow_ops.PaddingFIFOQueue(
          capacity=10, dtypes=[dtypes.int32], shapes=[var_shape])
      for tensor in [p1, p2, p3]:
        q.enqueue([tensor]).run()

      # Dequeue from the queue and batch them using batch().
      batches = input_lib.batch([q.dequeue(), counter], batch_size=batch_size,
                                num_threads=1, dynamic_pad=True)
      self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

      # Finally, assemble them into prefetch_queue with dynamic_pad.
      batcher = prefetch_queue.prefetch_queue(batches, dynamic_pad=True)
      batches = batcher.dequeue()
      self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

      variables.global_variables_initializer().run()
      threads = queue_runner_impl.start_queue_runners()

      values, _ = sess.run(batches)
      # We enqueued 3 tensors of [None, 2] shapes, so using dynamic_pad
      # they should be padded to the fixed size [3, 3, 2], where 3
      # is the maximum length of the batch.
      self.assertTrue(np.array_equal(
          np.array([[[1, 2], [3, 4], [0, 0]],
                    [[5, 6], [7, 8], [9, 10]],
                    [[11, 12], [0, 0], [0, 0]]]),
          values))

      with self.assertRaises(errors_impl.OutOfRangeError):
        sess.run(batches)
      for thread in threads:
        thread.join()
Example #13
    def testMultipleDequeue(self):
        with self.cached_session() as sess:
            batch_size = 10
            image_size = 32
            num_batches = 4

            zero64 = constant_op.constant(0, dtype=dtypes.int64)

            examples = variables.Variable(zero64)
            counter = examples.count_up_to(num_batches * batch_size)
            image = random_ops.random_normal([image_size, image_size, 3],
                                             dtype=dtypes.float32,
                                             name='images')
            label = random_ops.random_uniform([1],
                                              0,
                                              10,
                                              dtype=dtypes.int32,
                                              name='labels')

            batches = input_lib.batch([counter, image, label],
                                      batch_size=batch_size,
                                      num_threads=4)

            batcher = prefetch_queue.prefetch_queue(batches)
            batches_list = [batcher.dequeue() for _ in range(2)]

            variables.global_variables_initializer().run()
            threads = queue_runner_impl.start_queue_runners()

            value_counter = []
            for _ in range(int(num_batches / 2)):
                for batches in batches_list:
                    results = sess.run(batches)
                    value_counter.append(results[0])
                    self.assertEquals(results[1].shape,
                                      (batch_size, image_size, image_size, 3))
                    self.assertEquals(results[2].shape, (batch_size, 1))

            self.assertAllEqual(np.sort(np.concatenate(value_counter)),
                                np.arange(0, num_batches * batch_size))
            # Reached the limit.
            with self.assertRaises(errors_impl.OutOfRangeError):
                sess.run(batches)
            for thread in threads:
                thread.join()
Example #14
    def testOneThread(self):
        with self.cached_session() as sess:
            batch_size = 10
            image_size = 32
            num_batches = 5

            zero64 = constant_op.constant(0, dtype=dtypes.int64)

            examples = variables.Variable(zero64)
            counter = examples.count_up_to(num_batches * batch_size)
            image = random_ops.random_normal([image_size, image_size, 3],
                                             dtype=dtypes.float32,
                                             name='images')
            label = random_ops.random_uniform([1],
                                              0,
                                              10,
                                              dtype=dtypes.int32,
                                              name='labels')

            batches = input_lib.batch([counter, image, label],
                                      batch_size=batch_size,
                                      num_threads=1)

            batches = prefetch_queue.prefetch_queue(batches).dequeue()

            variables.global_variables_initializer().run()
            threads = queue_runner_impl.start_queue_runners()

            for i in range(num_batches):
                results = sess.run(batches)
                self.assertAllEqual(
                    results[0], np.arange(i * batch_size,
                                          (i + 1) * batch_size))
                self.assertEquals(results[1].shape,
                                  (batch_size, image_size, image_size, 3))
                self.assertEquals(results[2].shape, (batch_size, 1))

            # Reached the limit.
            with self.assertRaises(errors_impl.OutOfRangeError):
                sess.run(batches)
            for thread in threads:
                thread.join()
Example #15
  def testMultipleDequeue(self):
    with self.test_session() as sess:
      batch_size = 10
      image_size = 32
      num_batches = 4

      zero64 = constant_op.constant(0, dtype=dtypes.int64)

      examples = variables.Variable(zero64)
      counter = examples.count_up_to(num_batches * batch_size)
      image = random_ops.random_normal(
          [image_size, image_size, 3], dtype=dtypes.float32, name='images')
      label = random_ops.random_uniform(
          [1], 0, 10, dtype=dtypes.int32, name='labels')

      batches = input_lib.batch(
          [counter, image, label], batch_size=batch_size, num_threads=4)

      batcher = prefetch_queue.prefetch_queue(batches)
      batches_list = [batcher.dequeue() for _ in range(2)]

      variables.global_variables_initializer().run()
      threads = queue_runner_impl.start_queue_runners()

      value_counter = []
      for _ in range(int(num_batches / 2)):
        for batches in batches_list:
          results = sess.run(batches)
          value_counter.append(results[0])
          self.assertEquals(results[1].shape,
                            (batch_size, image_size, image_size, 3))
          self.assertEquals(results[2].shape, (batch_size, 1))

      self.assertAllEqual(
          np.sort(np.concatenate(value_counter)),
          np.arange(0, num_batches * batch_size))
      # Reached the limit.
      with self.assertRaises(errors_impl.OutOfRangeError):
        sess.run(batches)
      for thread in threads:
        thread.join()
Example #16
  def testOneThread(self):
    with self.test_session() as sess:
      batch_size = 10
      image_size = 32
      num_batches = 5

      zero64 = constant_op.constant(0, dtype=dtypes.int64)

      examples = variables.Variable(zero64)
      counter = examples.count_up_to(num_batches * batch_size)
      image = random_ops.random_normal(
          [image_size, image_size, 3], dtype=dtypes.float32, name='images')
      label = random_ops.random_uniform(
          [1], 0, 10, dtype=dtypes.int32, name='labels')

      batches = input_lib.batch(
          [counter, image, label], batch_size=batch_size, num_threads=1)

      batches = prefetch_queue.prefetch_queue(batches).dequeue()

      variables.global_variables_initializer().run()
      threads = queue_runner_impl.start_queue_runners()

      for i in range(num_batches):
        results = sess.run(batches)
        self.assertAllEqual(results[0],
                            np.arange(i * batch_size, (i + 1) * batch_size))
        self.assertEquals(results[1].shape,
                          (batch_size, image_size, image_size, 3))
        self.assertEquals(results[2].shape, (batch_size, 1))

      # Reached the limit.
      with self.assertRaises(errors_impl.OutOfRangeError):
        sess.run(batches)
      for thread in threads:
        thread.join()
Example #17
    def create_batch(self):
        """Create queues to window and batch time series data.

    Returns:
      A dictionary of Tensors corresponding to the output of `self._reader`
      (from the `time_series_reader` constructor argument), each with shapes
      prefixed by [`batch_size`, `window_size`].
    """
        features = self._reader.read()
        if self._jitter:
            # TODO(agarwal, allenl): Figure out if more jitter is needed here.
            jitter = random_ops.random_uniform(shape=[],
                                               maxval=2,
                                               dtype=dtypes.int32)
        else:
            jitter = 0
        # To keep things efficient, we pass from the windowing batcher to the
        # batch-of-windows batcher in batches. This avoids the need for huge numbers
        # of threads, but does mean that jitter is only applied occasionally.
        # TODO(allenl): Experiment with different internal passing sizes.
        internal_passing_size = self._batch_size
        features_windowed = input_lib.batch(
            features,
            batch_size=self._window_size * internal_passing_size + jitter,
            enqueue_many=True,
            capacity=(self._queue_capacity_multiplier * internal_passing_size *
                      self._window_size),
            num_threads=self._num_threads)
        raw_features_windowed = features_windowed
        if self._jitter:
            features_windowed = {
                key: value[jitter:]
                for key, value in features_windowed.items()
            }
        features_windowed = {
            key: array_ops.reshape(
                value,
                array_ops.concat([[internal_passing_size, self._window_size],
                                  array_ops.shape(value)[1:]],
                                 axis=0))
            for key, value in features_windowed.items()
        }
        batch_and_window_shape = tensor_shape.TensorShape(
            [internal_passing_size, self._window_size])
        for key in features_windowed.keys():
            features_windowed[key].set_shape(
                batch_and_window_shape.concatenate(
                    raw_features_windowed[key].get_shape()[1:]))
        # When switching files, we may end up with windows where the time is not
        # decreasing, even if times within each file are sorted (and even if those
        # files are visited in order, when looping back around to the beginning of
        # the first file). This is hard for models to deal with, so we either
        # discard such examples, creating a bias where the beginning and end of the
        # series is under-sampled, or we sort the window, creating large gaps.
        times = features_windowed[feature_keys.TrainEvalFeatures.TIMES]
        if self._discard_out_of_order:
            non_decreasing = math_ops.reduce_all(times[:, 1:] >= times[:, :-1],
                                                 axis=1)
            # Ensure that no more than self._discard_limit complete batches are
            # discarded contiguously (resetting the count when we find a single clean
            # window). This prevents infinite looping when the dataset is smaller than
            # the window size.
            # TODO(allenl): Figure out a way to return informative errors from
            # count_up_to.
            discarded_windows_limiter = variable_scope.variable(
                initial_value=constant_op.constant(0, dtype=dtypes.int64),
                name="discarded_windows_limiter",
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES])

            def _initialized_limit_check():
                return control_flow_ops.cond(
                    math_ops.reduce_any(non_decreasing),
                    lambda: state_ops.assign(discarded_windows_limiter, 0),
                    lambda: discarded_windows_limiter.count_up_to(
                        self._discard_limit))

            discard_limit_op = control_flow_ops.cond(
                state_ops.is_variable_initialized(discarded_windows_limiter),
                _initialized_limit_check,
                lambda: constant_op.constant(0, dtype=dtypes.int64))
            with ops.control_dependencies([discard_limit_op]):
                non_decreasing = array_ops.identity(non_decreasing)
        else:
            _, indices_descending = nn.top_k(times,
                                             k=array_ops.shape(times)[-1],
                                             sorted=True)
            indices = array_ops.reverse(indices_descending, axis=[0])
            features_windowed = {
                key: array_ops.gather(params=value, indices=indices)
                for key, value in features_windowed.items()
            }
            non_decreasing = True
        features_batched = input_lib.maybe_shuffle_batch(
            features_windowed,
            num_threads=self._num_threads,
            seed=self._shuffle_seed,
            batch_size=self._batch_size,
            capacity=self._queue_capacity_multiplier * self._batch_size,
            min_after_dequeue=(self._shuffle_min_after_dequeue_multiplier *
                               self._batch_size),
            keep_input=non_decreasing,
            enqueue_many=True)
        return (features_batched, None)
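The windowing above reads window_size * internal_passing_size (+ jitter) consecutive records in one batch() call, drops the jittered prefix, and reshapes the rest into [internal_passing_size, window_size, ...] windows. A numpy sketch of that reshape with hypothetical sizes:

import numpy as np

window_size = 4            # stand-in for self._window_size
internal_passing_size = 3  # stand-in for self._batch_size
jitter = 1                 # a value drawn from [0, 2)

# Stand-in for one feature column read from the time series reader.
raw = np.arange(window_size * internal_passing_size + jitter)
windowed = raw[jitter:].reshape(internal_passing_size, window_size)
print(windowed)  # Each row is one contiguous window of the series.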
Example #18
def skip_gram_sample(input_tensor,
                     min_skips=1,
                     max_skips=5,
                     start=0,
                     limit=-1,
                     emit_self_as_target=False,
                     vocab_freq_table=None,
                     vocab_min_count=None,
                     vocab_subsampling=None,
                     corpus_size=None,
                     batch_size=None,
                     batch_capacity=None,
                     seed=None,
                     name=None):
  """Generates skip-gram token and label paired Tensors from the input tensor.

  Generates skip-gram `("token", "label")` pairs using each element in the
  rank-1 `input_tensor` as a token. The window size used for each token will be
  randomly selected from the range specified by `[min_skips, max_skips]`,
  inclusive. See https://arxiv.org/abs/1301.3781 for more details about
  skip-gram.

  For example, given `input_tensor = ["the", "quick", "brown", "fox", "jumps"]`,
  `min_skips = 1`, `max_skips = 2`, `emit_self_as_target = False`, the output
  `(tokens, labels)` pairs for the token "quick" will be randomly selected from
  either `(tokens=["quick", "quick"], labels=["the", "brown"])` for 1 skip, or
  `(tokens=["quick", "quick", "quick"], labels=["the", "brown", "fox"])` for 2
  skips.

  If `emit_self_as_target = True`, each token will also be emitted as a label
  for itself. From the previous example, the output will be either
  `(tokens=["quick", "quick", "quick"], labels=["the", "quick", "brown"])` for 1
  skip, or `(tokens=["quick", "quick", "quick", "quick"], labels=["the",
  "quick", "brown", "fox"])` for 2 skips.

  The same process is repeated for each element of `input_tensor` and
  concatenated together into the two output rank-1 `Tensors` (one for all the
  tokens, another for all the labels).

  If `vocab_freq_table` is specified, tokens in `input_tensor` that are not
  present in the vocabulary are discarded. Tokens whose frequency counts are
  below `vocab_min_count` are also discarded. Tokens whose frequency proportions
  in the corpus exceed `vocab_subsampling` may be randomly down-sampled. See
  Eq. 5 in http://arxiv.org/abs/1310.4546 for more details about subsampling.

  Due to the random window sizes used for each token, the lengths of the outputs
  are non-deterministic, unless `batch_size` is specified to batch the outputs
  to always return `Tensors` of length `batch_size`.

  Args:
    input_tensor: A rank-1 `Tensor` from which to generate skip-gram candidates.
    min_skips: `int` or scalar `Tensor` specifying the minimum window size to
      randomly use for each token. Must be >= 0 and <= `max_skips`. If
      `min_skips` and `max_skips` are both 0, the only label outputted will be
      the token itself when `emit_self_as_target = True` - or no output
      otherwise.
    max_skips: `int` or scalar `Tensor` specifying the maximum window size to
      randomly use for each token. Must be >= 0.
    start: `int` or scalar `Tensor` specifying the position in
      `input_tensor` from which to start generating skip-gram candidates.
    limit: `int` or scalar `Tensor` specifying the maximum number of
      elements in `input_tensor` to use in generating skip-gram candidates. -1
      means to use the rest of the `Tensor` after `start`.
    emit_self_as_target: `bool` or scalar `Tensor` specifying whether to emit
      each token as a label for itself.
    vocab_freq_table: (Optional) A lookup table (subclass of
      `lookup.InitializableLookupTableBase`) that maps tokens to their raw
      frequency counts. If specified, any token in `input_tensor` that is not
      found in `vocab_freq_table` will be filtered out before generating
      skip-gram candidates. While this will typically map to integer raw
      frequency counts, it could also map to float frequency proportions.
      `vocab_min_count` and `corpus_size` should be in the same units as this.
    vocab_min_count: (Optional) `int`, `float`, or scalar `Tensor` specifying
      minimum frequency threshold (from `vocab_freq_table`) for a token to be
      kept in `input_tensor`. If this is specified, `vocab_freq_table` must also
      be specified - and they should both be in the same units.
    vocab_subsampling: (Optional) `float` specifying frequency proportion
      threshold for tokens from `input_tensor`. Tokens that occur more
      frequently (based on the ratio of the token's `vocab_freq_table` value to
      the `corpus_size`) will be randomly down-sampled. Reasonable starting
      values may be around 1e-3 or 1e-5. If this is specified, both
      `vocab_freq_table` and `corpus_size` must also be specified. See Eq. 5
      in http://arxiv.org/abs/1310.4546 for more details.
    corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying the
      total number of tokens in the corpus (e.g., sum of all the frequency
      counts of `vocab_freq_table`). Used with `vocab_subsampling` for
      down-sampling frequently occurring tokens. If this is specified,
      `vocab_freq_table` and `vocab_subsampling` must also be specified.
    batch_size: (Optional) `int` specifying batch size of returned `Tensors`.
    batch_capacity: (Optional) `int` specifying batch capacity for the queue
      used for batching returned `Tensors`. Only has an effect if
      `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
    seed: (Optional) `int` used to create a random seed for window size and
      subsampling. See `set_random_seed` docs for behavior.
    name: (Optional) A `string` name or a name scope for the operations.

  Returns:
    A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
    rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
    length `batch_size`; if `batch_size` is not specified, they will be of
    random length, though they will be in sync with each other as long as they
    are evaluated together.

  Raises:
    ValueError: If `vocab_freq_table` is not provided, but `vocab_min_count`,
      `vocab_subsampling`, or `corpus_size` is specified. If `vocab_subsampling`
      and `corpus_size` are not both present or both absent.
  """

  if vocab_freq_table is None and (vocab_min_count is not None or
                                   vocab_subsampling is not None or
                                   corpus_size is not None):
    raise ValueError(
        "vocab_freq_table is not provided, but vocab_min_count={}, "
        "vocab_subsampling={}, or corpus_size={} is not None. These settings "
        "are useless without a vocab_freq_table.".format(
            vocab_min_count, vocab_subsampling, corpus_size))

  if (vocab_subsampling is None) != (corpus_size is None):
    raise ValueError(
        "vocab_subsampling is {} while corpus_size is {} - both must be "
        "provided in order for subsampling to work.".format(
            vocab_subsampling, corpus_size))

  with ops.name_scope(
      name,
      "skip_gram_sample",
      values=[input_tensor, min_skips, max_skips, start, limit]):

    input_tensor = _filter_input(
        input_tensor=input_tensor,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=vocab_min_count,
        vocab_subsampling=vocab_subsampling,
        corpus_size=corpus_size,
        seed=seed)

    seed1, seed2 = random_seed.get_seed(seed)
    tokens, labels = gen_skip_gram_ops.skip_gram_generate_candidates(
        input_tensor=input_tensor,
        min_skips=min_skips,
        max_skips=max_skips,
        start=start,
        limit=limit,
        emit_self_as_target=emit_self_as_target,
        # Note that seed here should be seed1! This is due to
        # GuardedPhiloxRandom's hard-coded attributes of "seed" and "seed2".
        seed=seed1,
        seed2=seed2)

    # TODO(weiho): If the need arises, add support for sparse input_tensor that
    # figures out sentence boundaries, then calls
    # skip_gram_generate_candidates() on each sentence.

    # Batches the (tokens, labels) outputs so that they will be of deterministic
    # batch_size, to facilitate feeding them into the rest of the network.
    if batch_size is not None and batch_size > 0:
      batch_capacity = (batch_capacity
                        if (batch_capacity is not None and batch_capacity > 0)
                        else 100 * batch_size)
      return input_ops.batch(
          [tokens, labels],
          batch_size,
          capacity=batch_capacity,
          enqueue_many=True)

    return tokens, labels
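A pure-Python sketch of the (token, label) pair generation described in the docstring, assuming a fixed window per token (the actual op draws each token's window randomly from [min_skips, max_skips]):

def skip_gram_pairs(words, num_skips, emit_self_as_target=False):
    """Emit (token, label) pairs using a fixed window of num_skips."""
    tokens, labels = [], []
    for i, word in enumerate(words):
        lo = max(0, i - num_skips)
        hi = min(len(words), i + num_skips + 1)
        for j in range(lo, hi):
            if j == i and not emit_self_as_target:
                continue
            tokens.append(word)
            labels.append(words[j])
    return tokens, labels

# For "quick" with a window of 1 this yields ("quick", "the") and ("quick", "brown"),
# matching the docstring example.
print(skip_gram_pairs(["the", "quick", "brown", "fox", "jumps"], 1))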
Example #19
def stratified_sample(data, labels, init_probs, target_probs, batch_size,
                      enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    data: Tensor for data. Either one item or a batch, according to
        enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    init_probs: 1D numpy or python array of class proportions in the data.
    target_probs: 1D numpy or python array of target class proportions in batch.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch)

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    data_batch, labels = tf.contrib.framework.sampling_ops.stratified_sample(
        data, label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
  with ops.op_scope([data, labels], name, 'stratified_sample'):
    data = ops.convert_to_tensor(data)
    labels = ops.convert_to_tensor(labels)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      data = array_ops.expand_dims(data, 0)
      labels = array_ops.expand_dims(labels, 0)

    # Validate that input is consistent.
    data, labels, [init_probs, target_probs] = _verify_input(
        data, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    if np.any(np.logical_and(np.array(init_probs) == 0,
                             np.array(target_probs) != 0)):
      raise ValueError('Some initial probability class has nonzero target '
                       'probability.')

    # Calculate rejection sampling probabilities.
    reject_probs = _calculate_rejection_probabilities(init_probs, target_probs)
    proportion_rejected = np.sum(np.array(reject_probs) * np.array(init_probs))
    if proportion_rejected > .5:
      logging.warning('Proportion of examples rejected by sampler is high: %s',
                      proportion_rejected)

    # Make a single queue to hold input examples.
    val, label = input_ops.batch([data, labels],
                                 batch_size=1,
                                 num_threads=threads_per_queue,
                                 capacity=queue_capacity,
                                 enqueue_many=True)
    val = array_ops.reshape(val, data.get_shape().with_rank_at_least(1)[1:])
    label = array_ops.reshape(
        label, labels.get_shape().with_rank_at_least(1)[1:])

    # Set up second queue containing batches that have the desired class
    # proportions.
    return _get_stratified_batch_from_tensors(
        val, label, reject_probs, batch_size, threads_per_queue)
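The rejection probabilities come from the standard rejection-sampling construction: accept class i with probability proportional to target_i / init_i, rescaled so the most over-represented target class is always accepted. A numpy sketch under that assumption (the internal _calculate_rejection_probabilities helper may differ in details):

import numpy as np

def acceptance_probabilities(init_probs, target_probs):
    """Per-class acceptance probabilities that reshape init_probs into target_probs."""
    init_probs = np.asarray(init_probs, dtype=np.float64)
    target_probs = np.asarray(target_probs, dtype=np.float64)
    ratios = np.where(init_probs > 0,
                      target_probs / np.maximum(init_probs, 1e-12), 0.0)
    return ratios / ratios.max()

accept = acceptance_probabilities([0.9, 0.1], [0.5, 0.5])
print(accept)        # [0.111..., 1.0]
print(1.0 - accept)  # Per-class rejection probabilities.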
Example #20
def stratified_sample(data,
                      labels,
                      init_probs,
                      target_probs,
                      batch_size,
                      enqueue_many=False,
                      queue_capacity=16,
                      threads_per_queue=1,
                      name=None):
    """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    data: Tensor for data. Either one item or a batch, according to
        enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    init_probs: 1D numpy or python array of class proportions in the data.
    target_probs: 1D numpy or python array of target class proportions in batch.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch)

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    data_batch, labels = tf.contrib.framework.sampling_ops.stratified_sample(
        data, label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
    with ops.op_scope([data, labels], name, 'stratified_sample'):
        data = ops.convert_to_tensor(data)
        labels = ops.convert_to_tensor(labels)
        # Reduce the case of a single example to that of a batch of size 1.
        if not enqueue_many:
            data = array_ops.expand_dims(data, 0)
            labels = array_ops.expand_dims(labels, 0)

        # Validate that input is consistent.
        data, labels, [init_probs, target_probs
                       ] = _verify_input(data, labels,
                                         [init_probs, target_probs])

        # Check that all zero initial probabilities also have zero target
        # probabilities.
        if np.any(
                np.logical_and(
                    np.array(init_probs) == 0,
                    np.array(target_probs) != 0)):
            raise ValueError(
                'Some initial probability class has nonzero target '
                'probability.')

        # Calculate rejection sampling probabilities.
        reject_probs = _calculate_rejection_probabilities(
            init_probs, target_probs)
        proportion_rejected = np.sum(
            np.array(reject_probs) * np.array(init_probs))
        if proportion_rejected > .5:
            logging.warning(
                'Proportion of examples rejected by sampler is high: %s',
                proportion_rejected)

        # Make a single queue to hold input examples.
        val, label = input_ops.batch([data, labels],
                                     batch_size=1,
                                     num_threads=threads_per_queue,
                                     capacity=queue_capacity,
                                     enqueue_many=True)
        val = array_ops.reshape(val,
                                data.get_shape().with_rank_at_least(1)[1:])
        label = array_ops.reshape(label,
                                  labels.get_shape().with_rank_at_least(1)[1:])

        # Set up second queue containing batches that have the desired class
        # proportions.
        return _get_stratified_batch_from_tensors(val, label, reject_probs,
                                                  batch_size,
                                                  threads_per_queue)
Example #21
def rejection_sample(tensors, accept_prob_fn, batch_size, queue_threads=1,
                     enqueue_many=False, prebatch_capacity=16,
                     prebatch_threads=1, runtime_checks=False, name=None):
  """Stochastically creates batches by rejection sampling.

  Each list of non-batched tensors is evaluated by `accept_prob_fn`, to produce
  a scalar tensor between 0 and 1. This tensor corresponds to the probability of
  being accepted. When `batch_size` tensor groups have been accepted, the batch
  queue will return a mini-batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    accept_prob_fn: A python lambda that takes a non-batch tensor from each
        item in `tensors`, and produces a scalar tensor.
    batch_size: Size of batch to be returned.
    queue_threads: The number of threads for the queue that will hold the final
      batch.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    prebatch_capacity: Capacity for the large queue that is used to convert
      batched tensors to single examples.
    prebatch_threads: Number of threads for the large queue that is used to
      convert batched tensors to single examples.
    runtime_checks: Bool. If true, insert runtime checks on the output of
        `accept_prob_fn`. Using `True` might have a performance impact.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
  Returns:
    A list of tensors of the same length as `tensors`, with batch dimension
    `batch_size`.

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to data tensor.
    accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2
    data_batch = tf.contrib.training.rejection_sample(
        [data, label], accept_prob_fn, 16)

    # Run batch through network.
    ...
  """
  with variable_scope.variable_scope(name, 'rejection_sample', tensors):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    # Reduce the case of a batched example to that of a batch of a single
    # example by taking a batch of size one.
    if enqueue_many:
      # Validate that batch dimension of the input is consistent.
      tensor_list = _verify_data_inputs(tensor_list)

      # Make a single queue to hold input examples. Reshape output so examples
      # don't have singleton batch dimension.
      batched = input_ops.batch(tensor_list,
                                batch_size=1,
                                num_threads=prebatch_threads,
                                capacity=prebatch_capacity,
                                enqueue_many=True)
      tensor_list = [array_ops.squeeze(x, [0]) for x in batched]

    # Set up a queue containing batches that have the distribution.
    cur_prob = accept_prob_fn(tensor_list)
    if runtime_checks:
      cur_prob = array_ops.identity(control_flow_ops.with_dependencies(
          [check_ops.assert_less_equal(0.0, cur_prob),
           check_ops.assert_less_equal(cur_prob, 1.0)],
          cur_prob), name='prob_with_checks')
    keep_input = random_ops.random_uniform([]) < cur_prob
    return _conditional_batch(
        tensor_list, keep_input, batch_size, num_threads=queue_threads)
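The keep_input comparison above accepts each example with probability accept_prob_fn(example). A pure-Python sketch of the same accept/reject loop (illustrative only, not the queue-based implementation):

import random

def rejection_sample_py(examples, accept_prob_fn, batch_size, seed=None):
    """Keep drawing examples until batch_size of them pass the acceptance test."""
    rng = random.Random(seed)
    batch = []
    while len(batch) < batch_size:
        example = rng.choice(examples)
        if rng.random() < accept_prob_fn(example):  # keep_input analogue.
            batch.append(example)
    return batch

print(rejection_sample_py(list(range(10)), lambda x: x / 10.0, 4, seed=0))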
Example #22
def stratified_sample(tensors, labels, target_probs, batch_size,
                      init_probs=None, enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    target_probs: Target class proportions in batch. An object whose type has a
        registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function, or `None` for estimating the
        initial distribution.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the same
        length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.training.stratified_sample(
        [data], label, target_probs)

    # Run batch through network.
    ...
  """
  with ops.name_scope(name, 'stratified_sample', tensors + [labels]):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    labels = ops.convert_to_tensor(labels)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list]
      labels = array_ops.expand_dims(labels, 0)

    # If `init_probs` is `None`, set up online estimation of data distribution.
    if init_probs is None:
      # We use `target_probs` to get the number of classes, so its shape must be
      # fully defined at graph construction time.
      target_probs.get_shape().assert_is_fully_defined()
      init_probs = _estimate_data_distribution(
          labels, target_probs.get_shape().num_elements())
    else:
      init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)

    # Validate that input is consistent.
    tensor_list, labels, [init_probs, target_probs] = _verify_input(
        tensor_list, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = control_flow_ops.Assert(
        math_ops.reduce_all(math_ops.logical_or(
            math_ops.not_equal(init_probs, 0),
            math_ops.equal(target_probs, 0))),
        ['All classes with zero initial probability must also have zero target '
         'probability: ', init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs, target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples. Reshape output so examples
    # don't have singleton batch dimension.
    batched = input_ops.batch(tensor_list + [labels],
                              batch_size=1,
                              num_threads=threads_per_queue,
                              capacity=queue_capacity,
                              enqueue_many=True)
    val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
    label = array_ops.squeeze(batched[-1], [0])

    # Set up second queue containing batches that have the desired class
    # proportions.
    cur_prob = array_ops.gather(accept_probs, label)
    keep_input = random_ops.random_uniform([]) < cur_prob
    batched = _conditional_batch(
        val_list + [label],
        keep_input,
        batch_size,
        num_threads=threads_per_queue)
    return batched[:-1], batched[-1]
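When init_probs is None, the class distribution is estimated online from the observed labels. A numpy sketch of that kind of running estimate (an assumption about _estimate_data_distribution, which may instead use smoothing or a decaying average):

import numpy as np

class RunningClassDistribution(object):
    """Estimate per-class proportions from a stream of integer labels."""

    def __init__(self, num_classes):
        # One pseudo-count per class so no estimate is exactly zero.
        self.counts = np.ones(num_classes, dtype=np.float64)

    def update(self, labels):
        self.counts += np.bincount(labels, minlength=len(self.counts))
        return self.counts / self.counts.sum()

est = RunningClassDistribution(num_classes=2)
print(est.update([0, 0, 0, 1]))  # -> [0.666..., 0.333...]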
Example #23
def read_batch_examples(file_pattern, batch_size, reader,
                        randomize_input=True, queue_capacity=10000,
                        num_threads=1, name='dequeue_examples'):
  """Adds operations to read, queue, batch `Example` protos.

  Given file pattern (or list of files), will setup a queue for file names,
  read `Example` proto using provided `reader`, use batch queue to create
  batches of examples of size `batch_size`.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    file_pattern: List of files or pattern of file paths containing
        `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    reader: A function or class that returns an object with
      `read` method, (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    queue_capacity: Capacity for input queue.
    num_threads: The number of threads enqueuing examples.
    name: Name of resulting op.

  Returns:
    String `Tensor` of batched `Example` proto.

  Raises:
    ValueError: for invalid inputs.
  """
  # Retrieve files to read.
  if isinstance(file_pattern, list):
    file_names = file_pattern
    if not file_names:
      raise ValueError('No files given to dequeue_examples.')
  else:
    file_names = list(gfile.Glob(file_pattern))
    if not file_names:
      raise ValueError('No files match %s.' % file_pattern)

  # Sort files so it will be deterministic for unit tests. They'll be shuffled
  # in `string_input_producer` if `randomize_input` is enabled.
  if not randomize_input:
    file_names = sorted(file_names)

  # Check input parameters are given and reasonable.
  if (not queue_capacity) or (queue_capacity <= 0):
    raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
  if (batch_size is None) or (
      (not isinstance(batch_size, ops.Tensor)) and
      (batch_size <= 0 or batch_size > queue_capacity)):
    raise ValueError(
        'Invalid batch_size %s, with queue_capacity %s.' %
        (batch_size, queue_capacity))
  if (not num_threads) or (num_threads <= 0):
    raise ValueError('Invalid num_threads %s.' % num_threads)

  with ops.name_scope(name) as scope:
    # Setup filename queue with shuffling.
    with ops.name_scope('file_name_queue') as file_name_queue_scope:
      file_name_queue = input_ops.string_input_producer(
          constant_op.constant(file_names, name='input'),
          shuffle=randomize_input, name=file_name_queue_scope)

    # Create reader and set it to read from filename queue.
    with ops.name_scope('read'):
      _, example_proto = reader().read(file_name_queue)

    # Setup batching queue.
    if randomize_input:
      if isinstance(batch_size, ops.Tensor):
        min_after_dequeue = int(queue_capacity * 0.4)
      else:
        min_after_dequeue = max(queue_capacity - (3 * batch_size), batch_size)
      examples = input_ops.shuffle_batch(
          [example_proto], batch_size, capacity=queue_capacity,
          num_threads=num_threads, min_after_dequeue=min_after_dequeue,
          name=scope)
    else:
      examples = input_ops.batch(
          [example_proto], batch_size, capacity=queue_capacity,
          num_threads=num_threads, name=scope)

    return examples
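A hedged usage sketch for the function above, assuming TF 1.x graph mode and TFRecord files containing serialized `Example` protos (the file path is hypothetical):

import tensorflow as tf

examples = read_batch_examples(
    file_pattern='/tmp/data/*.tfrecord',  # hypothetical path
    batch_size=32,
    reader=tf.TFRecordReader,             # reader() exposes .read(filename_queue)
    randomize_input=True)

with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    serialized_batch = sess.run(examples)  # string Tensor of shape [32]
    coord.request_stop()
    coord.join(threads)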
Exemple #24
0
  def create_batch(self):
    """Create queues to window and batch time series data.

    Returns:
      A dictionary of Tensors corresponding to the output of `self._reader`
      (from the `time_series_reader` constructor argument), each with shapes
      prefixed by [`batch_size`, `window_size`].
    """
    features = self._reader.read()
    if self._jitter:
      # TODO(agarwal, allenl): Figure out if more jitter is needed here.
      jitter = random_ops.random_uniform(shape=[], maxval=2, dtype=dtypes.int32)
    else:
      jitter = 0
    # To keep things efficient, we pass from the windowing batcher to the
    # batch-of-windows batcher in batches. This avoids the need for huge numbers
    # of threads, but does mean that jitter is only applied occasionally.
    # TODO(allenl): Experiment with different internal passing sizes.
    internal_passing_size = self._batch_size
    features_windowed = input_lib.batch(
        features,
        batch_size=self._window_size * internal_passing_size + jitter,
        enqueue_many=True,
        capacity=(self._queue_capacity_multiplier
                  * internal_passing_size * self._window_size),
        num_threads=self._num_threads)
    raw_features_windowed = features_windowed
    if self._jitter:
      features_windowed = {
          key: value[jitter:]
          for key, value in features_windowed.items()}
    features_windowed = {
        key: array_ops.reshape(
            value,
            array_ops.concat(
                [[internal_passing_size, self._window_size],
                 array_ops.shape(value)[1:]],
                axis=0))
        for key, value in features_windowed.items()}
    batch_and_window_shape = tensor_shape.TensorShape(
        [internal_passing_size, self._window_size])
    for key in features_windowed.keys():
      features_windowed[key].set_shape(
          batch_and_window_shape.concatenate(
              raw_features_windowed[key].get_shape()[1:]))
    # When switching files, we may end up with windows where the time is not
    # decreasing, even if times within each file are sorted (and even if those
    # files are visited in order, when looping back around to the beginning of
    # the first file). This is hard for models to deal with, so we either
    # discard such examples, creating a bias where the beginning and end of the
    # series is under-sampled, or we sort the window, creating large gaps.
    times = features_windowed[feature_keys.TrainEvalFeatures.TIMES]
    if self._discard_out_of_order:
      non_decreasing = math_ops.reduce_all(
          times[:, 1:] >= times[:, :-1], axis=1)
      # Ensure that no more than self._discard_limit complete batches are
      # discarded contiguously (resetting the count when we find a single clean
      # window). This prevents infinite looping when the dataset is smaller than
      # the window size.
      # TODO(allenl): Figure out a way to return informative errors from
      # count_up_to.
      discarded_windows_limiter = variable_scope.variable(
          initial_value=constant_op.constant(0, dtype=dtypes.int64),
          name="discarded_windows_limiter",
          trainable=False,
          collections=[ops.GraphKeys.LOCAL_VARIABLES])
      def _initialized_limit_check():
        return control_flow_ops.cond(
            math_ops.reduce_any(non_decreasing),
            lambda: state_ops.assign(discarded_windows_limiter, 0),
            lambda: discarded_windows_limiter.count_up_to(self._discard_limit))
      discard_limit_op = control_flow_ops.cond(
          state_ops.is_variable_initialized(discarded_windows_limiter),
          _initialized_limit_check,
          lambda: constant_op.constant(0, dtype=dtypes.int64))
      with ops.control_dependencies([discard_limit_op]):
        non_decreasing = array_ops.identity(non_decreasing)
    else:
      _, indices_descending = nn.top_k(
          times, k=array_ops.shape(times)[-1], sorted=True)
      indices = array_ops.reverse(indices_descending, axis=[0])
      features_windowed = {
          key: array_ops.gather(params=value, indices=indices)
          for key, value in features_windowed.items()
      }
      non_decreasing = True
    features_batched = input_lib.maybe_shuffle_batch(
        features_windowed,
        num_threads=self._num_threads,
        seed=self._shuffle_seed,
        batch_size=self._batch_size,
        capacity=self._queue_capacity_multiplier * self._batch_size,
        min_after_dequeue=(self._shuffle_min_after_dequeue_multiplier *
                           self._batch_size),
        keep_input=non_decreasing,
        enqueue_many=True)
    return (features_batched, None)
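The windowing above packs `internal_passing_size * window_size + jitter` consecutive rows into `[internal_passing_size, window_size, ...]` windows, dropping the first `jitter` rows when jitter is enabled. A small NumPy sketch of that slicing and reshape, with made-up sizes:

import numpy as np

window_size = 4
internal_passing_size = 3
jitter = 1  # sampled from [0, 2) in the snippet above

# Pretend this came out of the windowing batcher: one long run of rows.
values = np.arange(window_size * internal_passing_size + jitter)

windows = values[jitter:].reshape(internal_passing_size, window_size)
print(windows)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]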
Exemple #25
0
def skip_gram_sample(input_tensor,
                     min_skips=1,
                     max_skips=5,
                     start=0,
                     limit=-1,
                     emit_self_as_target=False,
                     vocab_freq_table=None,
                     vocab_min_count=None,
                     vocab_subsampling=None,
                     corpus_size=None,
                     batch_size=None,
                     batch_capacity=None,
                     seed=None,
                     name=None):
    """Generates skip-gram token and label paired Tensors from the input tensor.

    Generates skip-gram `("token", "label")` pairs using each element in the
    rank-1 `input_tensor` as a token. The window size used for each token will be
    randomly selected from the range specified by `[min_skips, max_skips]`,
    inclusive. See https://arxiv.org/abs/1301.3781 for more details about
    skip-gram.

    For example, given `input_tensor = ["the", "quick", "brown", "fox", "jumps"]`,
    `min_skips = 1`, `max_skips = 2`, `emit_self_as_target = False`, the output
    `(tokens, labels)` pairs for the token "quick" will be randomly selected from
    either `(tokens=["quick", "quick"], labels=["the", "brown"])` for 1 skip, or
    `(tokens=["quick", "quick", "quick"], labels=["the", "brown", "fox"])` for 2
    skips.

    If `emit_self_as_target = True`, each token will also be emitted as a label
    for itself. From the previous example, the output will be either
    `(tokens=["quick", "quick", "quick"], labels=["the", "quick", "brown"])` for 1
    skip, or `(tokens=["quick", "quick", "quick", "quick"], labels=["the",
    "quick", "brown", "fox"])` for 2 skips.

    The same process is repeated for each element of `input_tensor` and
    concatenated together into the two output rank-1 `Tensors` (one for all the
    tokens, another for all the labels).

    If `vocab_freq_table` is specified, tokens in `input_tensor` that are not
    present in the vocabulary are discarded. Tokens whose frequency counts are
    below `vocab_min_count` are also discarded. Tokens whose frequency proportions
    in the corpus exceed `vocab_subsampling` may be randomly down-sampled. See
    Eq. 5 in http://arxiv.org/abs/1310.4546 for more details about subsampling.

    Due to the random window sizes used for each token, the lengths of the outputs
    are non-deterministic, unless `batch_size` is specified to batch the outputs
    to always return `Tensors` of length `batch_size`.

    Args:
      input_tensor: A rank-1 `Tensor` from which to generate skip-gram candidates.
      min_skips: `int` or scalar `Tensor` specifying the minimum window size to
        randomly use for each token. Must be >= 0 and <= `max_skips`. If
        `min_skips` and `max_skips` are both 0, the only label outputted will be
        the token itself when `emit_self_as_target = True` - or no output
        otherwise.
      max_skips: `int` or scalar `Tensor` specifying the maximum window size to
        randomly use for each token. Must be >= 0.
      start: `int` or scalar `Tensor` specifying the position in
        `input_tensor` from which to start generating skip-gram candidates.
      limit: `int` or scalar `Tensor` specifying the maximum number of
        elements in `input_tensor` to use in generating skip-gram candidates. -1
        means to use the rest of the `Tensor` after `start`.
      emit_self_as_target: `bool` or scalar `Tensor` specifying whether to emit
        each token as a label for itself.
      vocab_freq_table: (Optional) A lookup table (subclass of
        `lookup.InitializableLookupTableBase`) that maps tokens to their raw
        frequency counts. If specified, any token in `input_tensor` that is not
        found in `vocab_freq_table` will be filtered out before generating
        skip-gram candidates. While this will typically map to integer raw
        frequency counts, it could also map to float frequency proportions.
        `vocab_min_count` and `corpus_size` should be in the same units as this.
      vocab_min_count: (Optional) `int`, `float`, or scalar `Tensor` specifying
        minimum frequency threshold (from `vocab_freq_table`) for a token to be
        kept in `input_tensor`. If this is specified, `vocab_freq_table` must also
        be specified - and they should both be in the same units.
      vocab_subsampling: (Optional) `float` specifying frequency proportion
        threshold for tokens from `input_tensor`. Tokens that occur more
        frequently (based on the ratio of the token's `vocab_freq_table` value to
        the `corpus_size`) will be randomly down-sampled. Reasonable starting
        values may be around 1e-3 or 1e-5. If this is specified, both
        `vocab_freq_table` and `corpus_size` must also be specified. See Eq. 5
        in http://arxiv.org/abs/1310.4546 for more details.
      corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying the
        total number of tokens in the corpus (e.g., sum of all the frequency
        counts of `vocab_freq_table`). Used with `vocab_subsampling` for
        down-sampling frequently occurring tokens. If this is specified,
        `vocab_freq_table` and `vocab_subsampling` must also be specified.
      batch_size: (Optional) `int` specifying batch size of returned `Tensors`.
      batch_capacity: (Optional) `int` specifying batch capacity for the queue
        used for batching returned `Tensors`. Only has an effect if
        `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
      seed: (Optional) `int` used to create a random seed for window size and
        subsampling. See `set_random_seed` docs for behavior.
      name: (Optional) A `string` name or a name scope for the operations.

    Returns:
      A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
      rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
      length `batch_size`; if `batch_size` is not specified, they will be of
      random length, though they will be in sync with each other as long as they
      are evaluated together.

    Raises:
      ValueError: If `vocab_freq_table` is not provided, but `vocab_min_count`,
        `vocab_subsampling`, or `corpus_size` is specified. If only one of
        `vocab_subsampling` and `corpus_size` is specified.
    """

    if vocab_freq_table is None and (vocab_min_count is not None
                                     or vocab_subsampling is not None
                                     or corpus_size is not None):
        raise ValueError(
            "vocab_freq_table is not provided, but vocab_min_count={}, "
            "vocab_subsampling={}, or corpus_size={} is not None. These settings "
            "are useless without a vocab_freq_table.".format(
                vocab_min_count, vocab_subsampling, corpus_size))

    if (vocab_subsampling is None) != (corpus_size is None):
        raise ValueError(
            "vocab_subsampling is {} while corpus_size is {} - both must be "
            "provided in order for subsampling to work.".format(
                vocab_subsampling, corpus_size))

    with ops.name_scope(
            name,
            "skip_gram_sample",
            values=[input_tensor, min_skips, max_skips, start, limit]):

        input_tensor = _filter_input(input_tensor=input_tensor,
                                     vocab_freq_table=vocab_freq_table,
                                     vocab_min_count=vocab_min_count,
                                     vocab_subsampling=vocab_subsampling,
                                     corpus_size=corpus_size,
                                     seed=seed)

        seed1, seed2 = random_seed.get_seed(seed)
        tokens, labels = skip_gram_ops.skip_gram_generate_candidates(
            input_tensor=input_tensor,
            min_skips=min_skips,
            max_skips=max_skips,
            start=start,
            limit=limit,
            emit_self_as_target=emit_self_as_target,
            # Note that seed here should be seed1! This is due to
            # GuardedPhiloxRandom's hard-coded attributes of "seed" and "seed2".
            seed=seed1,
            seed2=seed2)

        # TODO(weiho): If the need arises, add support for sparse input_tensor that
        # figures out sentence boundaries, then calls
        # skip_gram_generate_candidates() on each sentence.

        # Batches the (tokens, labels) outputs so that they will be of deterministic
        # batch_size, to facilitate feeding them into the rest of the network.
        if batch_size is not None and batch_size > 0:
            batch_capacity = (batch_capacity if
                              (batch_capacity is not None
                               and batch_capacity > 0) else 100 * batch_size)
            return input_ops.batch([tokens, labels],
                                   batch_size,
                                   capacity=batch_capacity,
                                   enqueue_many=True)

        return tokens, labels
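A short usage sketch, assuming TF 1.x graph mode; with `batch_size` unset, the output length varies from one evaluation to the next:

import tensorflow as tf

sentence = tf.constant(["the", "quick", "brown", "fox", "jumps"])
tokens, labels = skip_gram_sample(sentence, min_skips=1, max_skips=2, seed=42)

with tf.Session() as sess:
    toks, labs = sess.run([tokens, labels])
    print(list(zip(toks, labs)))  # e.g. [(b'quick', b'the'), (b'quick', b'brown'), ...]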
Exemple #26
0
def read_batch_examples(file_pattern,
                        batch_size,
                        reader,
                        randomize_input=True,
                        queue_capacity=10000,
                        num_threads=1,
                        name='dequeue_examples'):
    """Adds operations to read, queue, batch `Example` protos.

  Given file pattern (or list of files), will setup a queue for file names,
  read `Example` proto using provided `reader`, use batch queue to create
  batches of examples of size `batch_size`.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    file_pattern: List of files or pattern of file paths containing
        `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    reader: A function or class that returns an object with
      `read` method, (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    queue_capacity: Capacity for input queue.
    num_threads: The number of threads enqueuing examples.
    name: Name of resulting op.

  Returns:
    String `Tensor` of batched `Example` proto.

  Raises:
    ValueError: for invalid inputs.
  """
    # Retrieve files to read.
    if isinstance(file_pattern, list):
        file_names = file_pattern
        if not file_names:
            raise ValueError('No files given to dequeue_examples.')
    else:
        file_names = list(gfile.Glob(file_pattern))
        if not file_names:
            raise ValueError('No files match %s.' % file_pattern)

    # Sort files so the order is deterministic for unit tests. They'll be
    # shuffled in `string_input_producer` if `randomize_input` is enabled.
    if not randomize_input:
        file_names = sorted(file_names)

    # Check input parameters are given and reasonable.
    if (not queue_capacity) or (queue_capacity <= 0):
        raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
    if (batch_size is None) or (
        (not isinstance(batch_size, ops.Tensor)) and
        (batch_size <= 0 or batch_size > queue_capacity)):
        raise ValueError('Invalid batch_size %s, with queue_capacity %s.' %
                         (batch_size, queue_capacity))
    if (not num_threads) or (num_threads <= 0):
        raise ValueError('Invalid num_threads %s.' % num_threads)

    with ops.name_scope(name) as scope:
        # Setup filename queue with shuffling.
        with ops.name_scope('file_name_queue') as file_name_queue_scope:
            file_name_queue = input_ops.string_input_producer(
                constant_op.constant(file_names, name='input'),
                shuffle=randomize_input,
                name=file_name_queue_scope)

        # Create reader and set it to read from filename queue.
        with ops.name_scope('read'):
            _, example_proto = reader().read(file_name_queue)

        # Setup batching queue.
        if randomize_input:
            if isinstance(batch_size, ops.Tensor):
                min_after_dequeue = int(queue_capacity * 0.4)
            else:
                min_after_dequeue = max(queue_capacity - (3 * batch_size),
                                        batch_size)
            examples = input_ops.shuffle_batch(
                [example_proto],
                batch_size,
                capacity=queue_capacity,
                num_threads=num_threads,
                min_after_dequeue=min_after_dequeue,
                name=scope)
        else:
            examples = input_ops.batch([example_proto],
                                       batch_size,
                                       capacity=queue_capacity,
                                       num_threads=num_threads,
                                       name=scope)

        return examples
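When `randomize_input` is set and `batch_size` is a plain int, the snippet sizes the shuffle buffer as `max(queue_capacity - 3 * batch_size, batch_size)`, leaving roughly three batches of headroom while never dropping below one batch. A quick worked check of that arithmetic:

def min_after_dequeue_for(queue_capacity, batch_size):
    # Mirrors the integer-batch-size heuristic in the snippet above.
    return max(queue_capacity - (3 * batch_size), batch_size)

print(min_after_dequeue_for(10000, 32))  # 9904: large shuffle buffer
print(min_after_dequeue_for(100, 64))    # 64: floor of one batch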
Exemple #27
0
def stratified_sample(data, labels, init_probs, target_probs, batch_size,
                      enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    data: Tensor for data. Either one item or a batch, according to
        enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function.
    target_probs: Target class proportions in batch. An object whose type has a
        registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch)

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    data_batch, labels = tf.contrib.framework.sampling_ops.stratified_sample(
        data, label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
  with ops.op_scope([data, labels], name, 'stratified_sample'):
    data = ops.convert_to_tensor(data)
    labels = ops.convert_to_tensor(labels)
    init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      data = array_ops.expand_dims(data, 0)
      labels = array_ops.expand_dims(labels, 0)

    # Validate that input is consistent.
    data, labels, [init_probs, target_probs] = _verify_input(
        data, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = logging_ops.Assert(math_ops.reduce_all(math_ops.logical_or(
        math_ops.not_equal(init_probs, 0),
        math_ops.equal(target_probs, 0))), [init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs, target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples.
    val, label = input_ops.batch([data, labels],
                                 batch_size=1,
                                 num_threads=threads_per_queue,
                                 capacity=queue_capacity,
                                 enqueue_many=True)
    val = array_ops.reshape(val, data.get_shape().with_rank_at_least(1)[1:])
    label = array_ops.reshape(
        label, labels.get_shape().with_rank_at_least(1)[1:])

    # Set up second queue containing batches that have the desired class
    # proportions.
    return _get_stratified_batch_from_tensors(
        val, label, accept_probs, batch_size, threads_per_queue)
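A slightly more concrete version of the docstring's example; `data`, `label`, and the probability vectors are stand-ins, and the call assumes TF 1.x graph mode:

import tensorflow as tf

NUM_CLASSES = 3
data = tf.random_uniform([28, 28])        # one example's features
label = tf.constant(1, dtype=tf.int32)    # its integer class label

init_probs = [1.0 / NUM_CLASSES] * NUM_CLASSES  # class mix in the data
target_probs = [0.6, 0.3, 0.1]                  # desired mix per batch

data_batch, label_batch = stratified_sample(
    data, label, init_probs, target_probs, batch_size=16)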
Exemple #28
0
def stratified_sample(tensors,
                      labels,
                      target_probs,
                      batch_size,
                      init_probs=None,
                      enqueue_many=False,
                      queue_capacity=16,
                      threads_per_queue=1,
                      name=None):
    """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on `enqueue_many`. It is not a one-hot vector.
    target_probs: Target class proportions in batch. An object whose type has a
        registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function, or `None` for estimating the
        initial distribution.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: If `tensors` isn't iterable.
    ValueError: `enqueue_many` is True and labels doesn't have a batch
        dimension, or if `enqueue_many` is False and labels isn't a scalar.
    ValueError: `enqueue_many` is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the same
        length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.training.stratified_sample(
        [data], label, target_probs)

    # Run batch through network.
    ...
  """
    with ops.name_scope(name, 'stratified_sample', list(tensors) + [labels]):
        tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
        labels = ops.convert_to_tensor(labels)
        target_probs = ops.convert_to_tensor(target_probs,
                                             dtype=dtypes.float32)
        # Reduce the case of a single example to that of a batch of size 1.
        if not enqueue_many:
            tensor_list = [
                array_ops.expand_dims(tensor, 0) for tensor in tensor_list
            ]
            labels = array_ops.expand_dims(labels, 0)

        # If `init_probs` is `None`, set up online estimation of data distribution.
        if init_probs is None:
            # We use `target_probs` to get the number of classes, so its shape must be
            # fully defined at graph construction time.
            target_probs.get_shape().assert_is_fully_defined()
            init_probs = _estimate_data_distribution(
                labels,
                target_probs.get_shape().num_elements())
        else:
            init_probs = ops.convert_to_tensor(init_probs,
                                               dtype=dtypes.float32)

        # Validate that input is consistent.
        tensor_list, labels, [init_probs, target_probs
                              ] = _verify_input(tensor_list, labels,
                                                [init_probs, target_probs])

        # Check that all zero initial probabilities also have zero target
        # probabilities.
        assert_op = control_flow_ops.Assert(
            math_ops.reduce_all(
                math_ops.logical_or(math_ops.not_equal(init_probs, 0),
                                    math_ops.equal(target_probs, 0))),
            [
                'All classes with zero initial probability must also have zero target '
                'probability: ', init_probs, target_probs
            ])
        init_probs = control_flow_ops.with_dependencies([assert_op],
                                                        init_probs)

        # Calculate acceptance sampling probabilities.
        accept_probs = _calculate_acceptance_probabilities(
            init_probs, target_probs)
        proportion_rejected = math_ops.reduce_sum(
            (1 - accept_probs) * init_probs)
        accept_probs = control_flow_ops.cond(
            math_ops.less(proportion_rejected, .5),
            lambda: accept_probs,
            lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
                accept_probs, [accept_probs],
                message='Proportion of examples rejected by sampler is high.',
                first_n=10))

        # Make a single queue to hold input examples. Reshape output so examples
        # don't have singleton batch dimension.
        batched = input_ops.batch(tensor_list + [labels],
                                  batch_size=1,
                                  num_threads=threads_per_queue,
                                  capacity=queue_capacity,
                                  enqueue_many=True)
        val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
        label = array_ops.squeeze(batched[-1], [0])

        # Set up second queue containing batches that have the desired class
        # proportions.
        cur_prob = array_ops.gather(accept_probs, label)
        batched = input_ops.maybe_batch(
            val_list + [label],
            keep_input=random_ops.random_uniform([]) < cur_prob,
            batch_size=batch_size,
            num_threads=threads_per_queue)
        return batched[:-1], batched[-1]
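When `init_probs` is `None`, this variant estimates the data distribution online from the labels it has seen via `_estimate_data_distribution` (not shown here). One way such an estimate can work is a running count per class, sketched below in NumPy as an assumption rather than the helper's actual implementation:

import numpy as np

def running_class_proportions(label_stream, num_classes):
    # Assumed sketch of an online estimate: accumulate counts, normalize.
    counts = np.ones(num_classes)  # start at 1 to avoid zero probabilities
    for label in label_stream:
        counts[label] += 1
        yield counts / counts.sum()

labels_seen = [0, 0, 1, 0, 2, 0, 0, 1]
for estimate in running_class_proportions(labels_seen, num_classes=3):
    pass
print(estimate)  # rough estimate of the data's class mix after 8 labels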
Exemple #29
0
def rejection_sample(tensors,
                     accept_prob_fn,
                     batch_size,
                     queue_threads=1,
                     enqueue_many=False,
                     prebatch_capacity=16,
                     prebatch_threads=1,
                     runtime_checks=False,
                     name=None):
    """Stochastically creates batches by rejection sampling.

  Each list of non-batched tensors is evaluated by `accept_prob_fn`, to produce
  a scalar tensor between 0 and 1. This tensor corresponds to the probability of
  being accepted. When `batch_size` tensor groups have been accepted, the batch
  queue will return a mini-batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    accept_prob_fn: A python lambda that takes a non-batch tensor from each
        item in `tensors`, and produces a scalar tensor.
    batch_size: Size of batch to be returned.
    queue_threads: The number of threads for the queue that will hold the final
      batch.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    prebatch_capacity: Capacity for the large queue that is used to convert
      batched tensors to single examples.
    prebatch_threads: Number of threads for the large queue that is used to
      convert batched tensors to single examples.
    runtime_checks: Bool. If true, insert runtime checks on the output of
        `accept_prob_fn`. Using `True` might have a performance impact.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
  Returns:
    A list of tensors of the same length as `tensors`, with batch dimension
    `batch_size`.

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to data tensor.
    accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2
    data_batch = tf.contrib.training.rejection_sample(
        [data, label], accept_prob_fn, 16)

    # Run batch through network.
    ...
  """
    with variable_scope.variable_scope(name, 'rejection_sample', tensors):
        tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
        # Reduce the case of a batched example to that of a batch of a single
        # example by taking a batch of size one.
        if enqueue_many:
            # Validate that batch dimension of the input is consistent.
            tensor_list = _verify_data_inputs(tensor_list)

            # Make a single queue to hold input examples. Reshape output so examples
            # don't have singleton batch dimension.
            batched = input_ops.batch(tensor_list,
                                      batch_size=1,
                                      num_threads=prebatch_threads,
                                      capacity=prebatch_capacity,
                                      enqueue_many=True)
            tensor_list = [array_ops.squeeze(x, [0]) for x in batched]

        # Set up a queue containing batches that have the distribution.
        cur_prob = accept_prob_fn(tensor_list)
        if runtime_checks:
            cur_prob = array_ops.identity(control_flow_ops.with_dependencies([
                check_ops.assert_less_equal(0.0, cur_prob),
                check_ops.assert_less_equal(cur_prob, 1.0)
            ], cur_prob),
                                          name='prob_with_checks')
        minibatch = input_ops.maybe_batch(
            tensor_list,
            keep_input=random_ops.random_uniform([]) < cur_prob,
            batch_size=batch_size,
            num_threads=queue_threads)

        # Queues return a single tensor when only one tensor was enqueued. Since
        # we want the return type to always be the same, always return a list.
        if isinstance(minibatch, ops.Tensor):
            minibatch = [minibatch]

        return minibatch
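A standalone plain-Python analogue of the behaviour described above: each example is scored by `accept_prob_fn`, kept with that probability, and a batch is emitted once `batch_size` examples have been accepted. This is only a sketch of the sampling idea, not the queue-based implementation:

import random

def rejection_sample_py(examples, accept_prob_fn, batch_size, seed=0):
    # Keep each example with probability accept_prob_fn(example); emit full batches.
    rng = random.Random(seed)
    batch = []
    for example in examples:
        if rng.random() < accept_prob_fn(example):
            batch.append(example)
            if len(batch) == batch_size:
                yield batch
                batch = []

# Keep larger values more often; emit batches of 4.
for batch in rejection_sample_py(range(100), lambda x: x / 100.0, batch_size=4):
    print(batch)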
Exemple #30
0
def stratified_sample(tensors, labels, init_probs, target_probs, batch_size,
                      enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function.
    target_probs: Target class proportions in batch. An object whose type has a
        registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.
  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).
  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the same
        length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.framework.sampling_ops.stratified_sample(
        [data], label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
  with ops.op_scope(tensors + [labels], name, 'stratified_sample'):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    labels = ops.convert_to_tensor(labels)
    init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list]
      labels = array_ops.expand_dims(labels, 0)

    # Validate that input is consistent.
    tensor_list, labels, [init_probs, target_probs] = _verify_input(
        tensor_list, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = logging_ops.Assert(math_ops.reduce_all(math_ops.logical_or(
        math_ops.not_equal(init_probs, 0),
        math_ops.equal(target_probs, 0))), [init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs, target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples. Reshape output so examples
    # don't have singleton batch dimension.
    batched = input_ops.batch(tensor_list + [labels],
                              batch_size=1,
                              num_threads=threads_per_queue,
                              capacity=queue_capacity,
                              enqueue_many=True)
    val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
    label = array_ops.squeeze(batched[-1], [0])

    # Set up second queue containing batches that have the desired class
    # proportions.
    batched = _get_stratified_batch_from_tensors(
        val_list, label, accept_probs, batch_size, threads_per_queue)
    return batched[:-1], batched[-1]