def _fn():
  num_rows = np.shape(np_matrix)[0]
  num_cols = np.shape(np_matrix)[1]
  row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
  col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
  sp_mat = self.np_array_to_sparse(np_matrix)
  sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
  row_batch = input_lib.batch(
      [row_ids, sp_mat],
      batch_size=min(batch_size, num_rows),
      capacity=10,
      enqueue_many=True)
  col_batch = input_lib.batch(
      [col_ids, sp_mat_t],
      batch_size=min(batch_size, num_cols),
      capacity=10,
      enqueue_many=True)

  features = extract_features(row_batch, col_batch, sp_mat.dense_shape)

  if projection_weights is not None:
    weights_batch = input_lib.batch(
        projection_weights,
        batch_size=batch_size,
        capacity=10,
        enqueue_many=True)
    features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
        weights_batch)
  if project_row is not None:
    features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
        constant_op.constant(project_row))

  labels = None
  return features, labels

def testGeneratorWorksWithBatching(self):

  def simple_generator():
    for i in range(5):
      yield {"value": i, "ignored": 3}

  simple_features = {
      "value": parsing_ops.FixedLenFeature(shape=[], dtype=dtypes.int32)
  }
  tensors = python_input.python_input(simple_generator, simple_features)

  # Request batches of size 4 at a time; the final batch may be smaller.
  batched_tensors = core_input.batch(tensors, batch_size=4,
                                     allow_smaller_final_batch=True)

  self.assertEqual(["value"], list(batched_tensors.keys()))
  self.assertEqual(dtypes.int32, batched_tensors["value"].dtype)
  self.assertEqual([None], batched_tensors["value"].shape.as_list())

  with self.test_session() as sess:
    # The generator emits 5 items total.  The first 4 are returned in
    # the first session run; the final one is returned in the
    # second.  This works because allow_smaller_final_batch=True.
    coord = coordinator.Coordinator()
    threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
    r1 = sess.run(batched_tensors)
    r2 = sess.run(batched_tensors)
    self.assertAllEqual([0, 1, 2, 3], r1["value"])
    self.assertAllEqual([4], r2["value"])
    with self.assertRaisesOpError("Iteration finished"):
      sess.run(tensors)
    coord.request_stop()
    for thread in threads:
      thread.join()

def fn(tensors, scope):
  return input.batch(tensors,
                     batch_size=batch_size,
                     num_threads=num_threads,
                     capacity=capacity,
                     enqueue_many=enqueue_many,
                     allow_smaller_final_batch=allow_smaller_final_batch,
                     name=scope)

def _apply_transform(self, transform_input):
  batched = input_ops.batch(transform_input,
                            batch_size=self.batch_size,
                            num_threads=self.num_threads,
                            capacity=self.queue_capacity,
                            enqueue_many=True)
  # TODO(jamieas): batch will soon return a list regardless of the number of
  # enqueued tensors. Remove the following once that change is in place.
  if not isinstance(batched, (tuple, list)):
    batched = (batched,)
  # pylint: disable=not-callable
  return self.return_type(*batched)

def testDynamicPad(self):
  with self.cached_session() as sess:
    # Create 3 tensors of variable but compatible shapes.
    var_shape = [None, 2]
    p1 = constant_op.constant([[1, 2], [3, 4]])
    p1.set_shape(var_shape)
    p2 = constant_op.constant([[5, 6], [7, 8], [9, 10]])
    p2.set_shape(var_shape)
    p3 = constant_op.constant([[11, 12]])
    p3.set_shape(var_shape)
    batch = [p1, p2, p3]
    batch_size = len(batch)

    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    examples = variables.Variable(zero64)
    counter = examples.count_up_to(batch_size)

    # Create a PaddingFIFOQueue to enqueue these tensors.
    q = data_flow_ops.PaddingFIFOQueue(
        capacity=10, dtypes=[dtypes.int32], shapes=[var_shape])
    for tensor in [p1, p2, p3]:
      q.enqueue([tensor]).run()

    # Dequeue from the queue and batch them using batch().
    batches = input_lib.batch([q.dequeue(), counter],
                              batch_size=batch_size,
                              num_threads=1,
                              dynamic_pad=True)
    self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

    # Finally, assemble them into prefetch_queue with dynamic_pad.
    batcher = prefetch_queue.prefetch_queue(batches, dynamic_pad=True)
    batches = batcher.dequeue()
    self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

    variables.global_variables_initializer().run()
    threads = queue_runner_impl.start_queue_runners()

    values, _ = sess.run(batches)

    # We enqueued 3 tensors of [None, 2] shapes, so with dynamic_pad they
    # should be padded to the fixed shape [3, 3, 2]: batch size 3, with each
    # element padded to the maximum length (3) in the batch.
    self.assertTrue(
        np.array_equal(
            np.array([[[1, 2], [3, 4], [0, 0]],
                      [[5, 6], [7, 8], [9, 10]],
                      [[11, 12], [0, 0], [0, 0]]]), values))

    with self.assertRaises(errors_impl.OutOfRangeError):
      sess.run(batches)
    for thread in threads:
      thread.join()

def _apply_transform(self, transform_input, **kwargs):
  batched = input_ops.batch(transform_input,
                            batch_size=self.batch_size,
                            num_threads=self.num_threads,
                            capacity=self.queue_capacity,
                            enqueue_many=True)
  # TODO(jamieas): batch will soon return a list regardless of the number of
  # enqueued tensors. Remove the following once that change is in place.
  if not isinstance(batched, (tuple, list)):
    batched = (batched,)
  # pylint: disable=not-callable
  return self.return_type(*batched)

def _fn():
  num_rows = np.shape(np_matrix)[0]
  num_cols = np.shape(np_matrix)[1]
  row_ids = math_ops.range(num_rows, dtype=dtypes.int64)
  col_ids = math_ops.range(num_cols, dtype=dtypes.int64)
  sp_mat = self.np_array_to_sparse(np_matrix)
  sp_mat_t = sparse_ops.sparse_transpose(sp_mat)
  row_batch = input_lib.batch(
      [row_ids, sp_mat],
      batch_size=min(batch_size, num_rows),
      capacity=10,
      enqueue_many=True)
  col_batch = input_lib.batch(
      [col_ids, sp_mat_t],
      batch_size=min(batch_size, num_cols),
      capacity=10,
      enqueue_many=True)

  features = extract_features(row_batch, col_batch, num_rows, num_cols)

  if mode == model_fn.ModeKeys.INFER or mode == model_fn.ModeKeys.EVAL:
    self.assertTrue(
        project_row is not None,
        msg='project_row must be specified in INFER or EVAL mode.')
    features[wals_lib.WALSMatrixFactorization.PROJECT_ROW] = (
        constant_op.constant(project_row))

  if mode == model_fn.ModeKeys.INFER and projection_weights is not None:
    weights_batch = input_lib.batch(
        projection_weights,
        batch_size=batch_size,
        capacity=10,
        enqueue_many=True)
    features[wals_lib.WALSMatrixFactorization.PROJECTION_WEIGHTS] = (
        weights_batch)

  labels = None
  return features, labels

def testDynamicPad(self):
  with self.test_session() as sess:
    # Create 3 tensors of variable but compatible shapes.
    var_shape = [None, 2]
    p1 = constant_op.constant([[1, 2], [3, 4]])
    p1.set_shape(var_shape)
    p2 = constant_op.constant([[5, 6], [7, 8], [9, 10]])
    p2.set_shape(var_shape)
    p3 = constant_op.constant([[11, 12]])
    p3.set_shape(var_shape)
    batch = [p1, p2, p3]
    batch_size = len(batch)

    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    examples = variables.Variable(zero64)
    counter = examples.count_up_to(batch_size)

    # Create a PaddingFIFOQueue to enqueue these tensors.
    q = data_flow_ops.PaddingFIFOQueue(
        capacity=10, dtypes=[dtypes.int32], shapes=[var_shape])
    for tensor in [p1, p2, p3]:
      q.enqueue([tensor]).run()

    # Dequeue from the queue and batch them using batch().
    batches = input_lib.batch([q.dequeue(), counter],
                              batch_size=batch_size,
                              num_threads=1,
                              dynamic_pad=True)
    self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

    # Finally, assemble them into prefetch_queue with dynamic_pad.
    batcher = prefetch_queue.prefetch_queue(batches, dynamic_pad=True)
    batches = batcher.dequeue()
    self.assertEqual([batch_size, None, 2], batches[0].shape.as_list())

    variables.global_variables_initializer().run()
    threads = queue_runner_impl.start_queue_runners()

    values, _ = sess.run(batches)

    # We enqueued 3 tensors of [None, 2] shapes, so using dynamic_pad
    # they should be padded to the fixed size [3, 3, 2], where 3
    # is the maximum length of the batch.
    self.assertTrue(np.array_equal(
        np.array([[[1, 2], [3, 4], [0, 0]],
                  [[5, 6], [7, 8], [9, 10]],
                  [[11, 12], [0, 0], [0, 0]]]), values))

    with self.assertRaises(errors_impl.OutOfRangeError):
      sess.run(batches)
    for thread in threads:
      thread.join()

def testMultipleDequeue(self):
  with self.cached_session() as sess:
    batch_size = 10
    image_size = 32
    num_batches = 4

    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    examples = variables.Variable(zero64)
    counter = examples.count_up_to(num_batches * batch_size)
    image = random_ops.random_normal(
        [image_size, image_size, 3], dtype=dtypes.float32, name='images')
    label = random_ops.random_uniform(
        [1], 0, 10, dtype=dtypes.int32, name='labels')

    batches = input_lib.batch(
        [counter, image, label], batch_size=batch_size, num_threads=4)

    batcher = prefetch_queue.prefetch_queue(batches)
    batches_list = [batcher.dequeue() for _ in range(2)]

    variables.global_variables_initializer().run()
    threads = queue_runner_impl.start_queue_runners()

    value_counter = []
    for _ in range(int(num_batches / 2)):
      for batches in batches_list:
        results = sess.run(batches)
        value_counter.append(results[0])
        self.assertEquals(results[1].shape,
                          (batch_size, image_size, image_size, 3))
        self.assertEquals(results[2].shape, (batch_size, 1))

    self.assertAllEqual(
        np.sort(np.concatenate(value_counter)),
        np.arange(0, num_batches * batch_size))

    # Reached the limit.
    with self.assertRaises(errors_impl.OutOfRangeError):
      sess.run(batches)
    for thread in threads:
      thread.join()

def testOneThread(self):
  with self.cached_session() as sess:
    batch_size = 10
    image_size = 32
    num_batches = 5

    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    examples = variables.Variable(zero64)
    counter = examples.count_up_to(num_batches * batch_size)
    image = random_ops.random_normal(
        [image_size, image_size, 3], dtype=dtypes.float32, name='images')
    label = random_ops.random_uniform(
        [1], 0, 10, dtype=dtypes.int32, name='labels')

    batches = input_lib.batch(
        [counter, image, label], batch_size=batch_size, num_threads=1)

    batches = prefetch_queue.prefetch_queue(batches).dequeue()

    variables.global_variables_initializer().run()
    threads = queue_runner_impl.start_queue_runners()

    for i in range(num_batches):
      results = sess.run(batches)
      self.assertAllEqual(results[0],
                          np.arange(i * batch_size, (i + 1) * batch_size))
      self.assertEquals(results[1].shape,
                        (batch_size, image_size, image_size, 3))
      self.assertEquals(results[2].shape, (batch_size, 1))

    # Reached the limit.
    with self.assertRaises(errors_impl.OutOfRangeError):
      sess.run(batches)
    for thread in threads:
      thread.join()

def testMultipleDequeue(self):
  with self.test_session() as sess:
    batch_size = 10
    image_size = 32
    num_batches = 4

    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    examples = variables.Variable(zero64)
    counter = examples.count_up_to(num_batches * batch_size)
    image = random_ops.random_normal(
        [image_size, image_size, 3], dtype=dtypes.float32, name='images')
    label = random_ops.random_uniform(
        [1], 0, 10, dtype=dtypes.int32, name='labels')

    batches = input_lib.batch(
        [counter, image, label], batch_size=batch_size, num_threads=4)

    batcher = prefetch_queue.prefetch_queue(batches)
    batches_list = [batcher.dequeue() for _ in range(2)]

    variables.global_variables_initializer().run()
    threads = queue_runner_impl.start_queue_runners()

    value_counter = []
    for _ in range(int(num_batches / 2)):
      for batches in batches_list:
        results = sess.run(batches)
        value_counter.append(results[0])
        self.assertEquals(results[1].shape,
                          (batch_size, image_size, image_size, 3))
        self.assertEquals(results[2].shape, (batch_size, 1))

    self.assertAllEqual(
        np.sort(np.concatenate(value_counter)),
        np.arange(0, num_batches * batch_size))

    # Reached the limit.
    with self.assertRaises(errors_impl.OutOfRangeError):
      sess.run(batches)
    for thread in threads:
      thread.join()

def testOneThread(self):
  with self.test_session() as sess:
    batch_size = 10
    image_size = 32
    num_batches = 5

    zero64 = constant_op.constant(0, dtype=dtypes.int64)
    examples = variables.Variable(zero64)
    counter = examples.count_up_to(num_batches * batch_size)
    image = random_ops.random_normal(
        [image_size, image_size, 3], dtype=dtypes.float32, name='images')
    label = random_ops.random_uniform(
        [1], 0, 10, dtype=dtypes.int32, name='labels')

    batches = input_lib.batch(
        [counter, image, label], batch_size=batch_size, num_threads=1)

    batches = prefetch_queue.prefetch_queue(batches).dequeue()

    variables.global_variables_initializer().run()
    threads = queue_runner_impl.start_queue_runners()

    for i in range(num_batches):
      results = sess.run(batches)
      self.assertAllEqual(results[0],
                          np.arange(i * batch_size, (i + 1) * batch_size))
      self.assertEquals(results[1].shape,
                        (batch_size, image_size, image_size, 3))
      self.assertEquals(results[2].shape, (batch_size, 1))

    # Reached the limit.
    with self.assertRaises(errors_impl.OutOfRangeError):
      sess.run(batches)
    for thread in threads:
      thread.join()

def create_batch(self):
  """Create queues to window and batch time series data.

  Returns:
    A dictionary of Tensors corresponding to the output of `self._reader`
    (from the `time_series_reader` constructor argument), each with shapes
    prefixed by [`batch_size`, `window_size`].
  """
  features = self._reader.read()
  if self._jitter:
    # TODO(agarwal, allenl): Figure out if more jitter is needed here.
    jitter = random_ops.random_uniform(shape=[], maxval=2, dtype=dtypes.int32)
  else:
    jitter = 0
  # To keep things efficient, we pass from the windowing batcher to the
  # batch-of-windows batcher in batches. This avoids the need for huge
  # numbers of threads, but does mean that jitter is only applied
  # occasionally.
  # TODO(allenl): Experiment with different internal passing sizes.
  internal_passing_size = self._batch_size
  features_windowed = input_lib.batch(
      features,
      batch_size=self._window_size * internal_passing_size + jitter,
      enqueue_many=True,
      capacity=(self._queue_capacity_multiplier
                * internal_passing_size * self._window_size),
      num_threads=self._num_threads)
  raw_features_windowed = features_windowed
  if self._jitter:
    features_windowed = {
        key: value[jitter:]
        for key, value in features_windowed.items()}
  features_windowed = {
      key: array_ops.reshape(
          value,
          array_ops.concat(
              [[internal_passing_size, self._window_size],
               array_ops.shape(value)[1:]],
              axis=0))
      for key, value in features_windowed.items()}
  batch_and_window_shape = tensor_shape.TensorShape(
      [internal_passing_size, self._window_size])
  for key in features_windowed.keys():
    features_windowed[key].set_shape(
        batch_and_window_shape.concatenate(
            raw_features_windowed[key].get_shape()[1:]))
  # When switching files, we may end up with windows where the time is not
  # decreasing, even if times within each file are sorted (and even if those
  # files are visited in order, when looping back around to the beginning of
  # the first file). This is hard for models to deal with, so we either
  # discard such examples, creating a bias where the beginning and end of the
  # series is under-sampled, or we sort the window, creating large gaps.
  times = features_windowed[feature_keys.TrainEvalFeatures.TIMES]
  if self._discard_out_of_order:
    non_decreasing = math_ops.reduce_all(
        times[:, 1:] >= times[:, :-1], axis=1)
    # Ensure that no more than self._discard_limit complete batches are
    # discarded contiguously (resetting the count when we find a single clean
    # window). This prevents infinite looping when the dataset is smaller
    # than the window size.
    # TODO(allenl): Figure out a way to return informative errors from
    # count_up_to.
    discarded_windows_limiter = variable_scope.variable(
        initial_value=constant_op.constant(0, dtype=dtypes.int64),
        name="discarded_windows_limiter",
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES])

    def _initialized_limit_check():
      return control_flow_ops.cond(
          math_ops.reduce_any(non_decreasing),
          lambda: state_ops.assign(discarded_windows_limiter, 0),
          lambda: discarded_windows_limiter.count_up_to(self._discard_limit))

    discard_limit_op = control_flow_ops.cond(
        state_ops.is_variable_initialized(discarded_windows_limiter),
        _initialized_limit_check,
        lambda: constant_op.constant(0, dtype=dtypes.int64))
    with ops.control_dependencies([discard_limit_op]):
      non_decreasing = array_ops.identity(non_decreasing)
  else:
    _, indices_descending = nn.top_k(
        times, k=array_ops.shape(times)[-1], sorted=True)
    indices = array_ops.reverse(indices_descending, axis=[0])
    features_windowed = {
        key: array_ops.gather(params=value, indices=indices)
        for key, value in features_windowed.items()
    }
    non_decreasing = True
  features_batched = input_lib.maybe_shuffle_batch(
      features_windowed,
      num_threads=self._num_threads,
      seed=self._shuffle_seed,
      batch_size=self._batch_size,
      capacity=self._queue_capacity_multiplier * self._batch_size,
      min_after_dequeue=(self._shuffle_min_after_dequeue_multiplier *
                         self._batch_size),
      keep_input=non_decreasing,
      enqueue_many=True)
  return (features_batched, None)

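# A minimal numpy sketch (not part of the library) of the window reshape that
# create_batch() performs above: a flat run of internal_passing_size *
# window_size rows from the reader is regrouped into
# [internal_passing_size, window_size, ...] windows. All names and sizes here
# are illustrative.
import numpy as np

internal_passing_size = 2
window_size = 3
# Six consecutive "rows" from the reader, each with one feature value.
flat_values = np.arange(internal_passing_size * window_size).reshape(6, 1)
# Mirrors array_ops.reshape(value, concat([[internal_passing_size,
# window_size], shape(value)[1:]], axis=0)) in create_batch().
windows = flat_values.reshape(
    (internal_passing_size, window_size) + flat_values.shape[1:])
assert windows.shape == (2, 3, 1)
assert (windows[0, :, 0] == [0, 1, 2]).all()  # first window, in time order
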
def skip_gram_sample(input_tensor,
                     min_skips=1,
                     max_skips=5,
                     start=0,
                     limit=-1,
                     emit_self_as_target=False,
                     vocab_freq_table=None,
                     vocab_min_count=None,
                     vocab_subsampling=None,
                     corpus_size=None,
                     batch_size=None,
                     batch_capacity=None,
                     seed=None,
                     name=None):
  """Generates skip-gram token and label paired Tensors from the input tensor.

  Generates skip-gram `("token", "label")` pairs using each element in the
  rank-1 `input_tensor` as a token. The window size used for each token will
  be randomly selected from the range specified by `[min_skips, max_skips]`,
  inclusive. See https://arxiv.org/abs/1301.3781 for more details about
  skip-gram.

  For example, given `input_tensor = ["the", "quick", "brown", "fox",
  "jumps"]`, `min_skips = 1`, `max_skips = 2`, `emit_self_as_target = False`,
  the output `(tokens, labels)` pairs for the token "quick" will be randomly
  selected from either `(tokens=["quick", "quick"], labels=["the", "brown"])`
  for 1 skip, or `(tokens=["quick", "quick", "quick"],
  labels=["the", "brown", "fox"])` for 2 skips.

  If `emit_self_as_target = True`, each token will also be emitted as a label
  for itself. From the previous example, the output will be either
  `(tokens=["quick", "quick", "quick"], labels=["the", "quick", "brown"])`
  for 1 skip, or `(tokens=["quick", "quick", "quick", "quick"],
  labels=["the", "quick", "brown", "fox"])` for 2 skips.

  The same process is repeated for each element of `input_tensor` and
  concatenated together into the two output rank-1 `Tensors` (one for all the
  tokens, another for all the labels).

  If `vocab_freq_table` is specified, tokens in `input_tensor` that are not
  present in the vocabulary are discarded. Tokens whose frequency counts are
  below `vocab_min_count` are also discarded. Tokens whose frequency
  proportions in the corpus exceed `vocab_subsampling` may be randomly
  down-sampled. See Eq. 5 in http://arxiv.org/abs/1310.4546 for more details
  about subsampling.

  Due to the random window sizes used for each token, the lengths of the
  outputs are non-deterministic, unless `batch_size` is specified to batch
  the outputs to always return `Tensors` of length `batch_size`.

  Args:
    input_tensor: A rank-1 `Tensor` from which to generate skip-gram
      candidates.
    min_skips: `int` or scalar `Tensor` specifying the minimum window size to
      randomly use for each token. Must be >= 0 and <= `max_skips`. If
      `min_skips` and `max_skips` are both 0, the only label outputted will
      be the token itself when `emit_self_as_target = True` - or no output
      otherwise.
    max_skips: `int` or scalar `Tensor` specifying the maximum window size to
      randomly use for each token. Must be >= 0.
    start: `int` or scalar `Tensor` specifying the position in `input_tensor`
      from which to start generating skip-gram candidates.
    limit: `int` or scalar `Tensor` specifying the maximum number of elements
      in `input_tensor` to use in generating skip-gram candidates. -1 means
      to use the rest of the `Tensor` after `start`.
    emit_self_as_target: `bool` or scalar `Tensor` specifying whether to emit
      each token as a label for itself.
    vocab_freq_table: (Optional) A lookup table (subclass of
      `lookup.InitializableLookupTableBase`) that maps tokens to their raw
      frequency counts. If specified, any token in `input_tensor` that is not
      found in `vocab_freq_table` will be filtered out before generating
      skip-gram candidates. While this will typically map to integer raw
      frequency counts, it could also map to float frequency proportions.
      `vocab_min_count` and `corpus_size` should be in the same units as
      this.
    vocab_min_count: (Optional) `int`, `float`, or scalar `Tensor` specifying
      minimum frequency threshold (from `vocab_freq_table`) for a token to be
      kept in `input_tensor`. If this is specified, `vocab_freq_table` must
      also be specified - and they should both be in the same units.
    vocab_subsampling: (Optional) `float` specifying frequency proportion
      threshold for tokens from `input_tensor`. Tokens that occur more
      frequently (based on the ratio of the token's `vocab_freq_table` value
      to the `corpus_size`) will be randomly down-sampled. Reasonable
      starting values may be around 1e-3 or 1e-5. If this is specified, both
      `vocab_freq_table` and `corpus_size` must also be specified. See Eq. 5
      in http://arxiv.org/abs/1310.4546 for more details.
    corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying the
      total number of tokens in the corpus (e.g., sum of all the frequency
      counts of `vocab_freq_table`). Used with `vocab_subsampling` for
      down-sampling frequently occurring tokens. If this is specified,
      `vocab_freq_table` and `vocab_subsampling` must also be specified.
    batch_size: (Optional) `int` specifying batch size of returned `Tensors`.
    batch_capacity: (Optional) `int` specifying batch capacity for the queue
      used for batching returned `Tensors`. Only has an effect if
      `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
    seed: (Optional) `int` used to create a random seed for window size and
      subsampling. See `set_random_seed` docs for behavior.
    name: (Optional) A `string` name or a name scope for the operations.

  Returns:
    A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
    rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
    length `batch_size`; if `batch_size` is not specified, they will be of
    random length, though they will be in sync with each other as long as
    they are evaluated together.

  Raises:
    ValueError: If `vocab_freq_table` is not provided, but `vocab_min_count`,
      `vocab_subsampling`, or `corpus_size` is specified.
      If `vocab_subsampling` and `corpus_size` are not both present or
      both absent.
  """
  if vocab_freq_table is None and (vocab_min_count is not None or
                                   vocab_subsampling is not None or
                                   corpus_size is not None):
    raise ValueError(
        "vocab_freq_table is not provided, but vocab_min_count={}, "
        "vocab_subsampling={}, or corpus_size={} is not None. These settings "
        "are useless without a vocab_freq_table.".format(
            vocab_min_count, vocab_subsampling, corpus_size))

  if (vocab_subsampling is None) != (corpus_size is None):
    raise ValueError(
        "vocab_subsampling is {} while corpus_size is {} - both must be "
        "provided in order for subsampling to work.".format(
            vocab_subsampling, corpus_size))

  with ops.name_scope(
      name,
      "skip_gram_sample",
      values=[input_tensor, min_skips, max_skips, start, limit]):

    input_tensor = _filter_input(
        input_tensor=input_tensor,
        vocab_freq_table=vocab_freq_table,
        vocab_min_count=vocab_min_count,
        vocab_subsampling=vocab_subsampling,
        corpus_size=corpus_size,
        seed=seed)

    seed1, seed2 = random_seed.get_seed(seed)
    tokens, labels = gen_skip_gram_ops.skip_gram_generate_candidates(
        input_tensor=input_tensor,
        min_skips=min_skips,
        max_skips=max_skips,
        start=start,
        limit=limit,
        emit_self_as_target=emit_self_as_target,
        # Note that seed here should be seed1! This is due to
        # GuardedPhiloxRandom's hard-coded attributes of "seed" and "seed2".
        seed=seed1,
        seed2=seed2)

    # TODO(weiho): If the need arises, add support for sparse input_tensor
    # that figures out sentence boundaries, then calls
    # skip_gram_generate_candidates() on each sentence.

    # Batches the (tokens, labels) outputs so that they will be of
    # deterministic batch_size, to facilitate feeding them into the rest of
    # the network.
    if batch_size is not None and batch_size > 0:
      batch_capacity = (batch_capacity
                        if (batch_capacity is not None and batch_capacity > 0)
                        else 100 * batch_size)
      return input_ops.batch(
          [tokens, labels],
          batch_size,
          capacity=batch_capacity,
          enqueue_many=True)

    return tokens, labels

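# A pure-Python sketch of the skip-gram pairing semantics described in the
# docstring above; the real work is done by the skip_gram_generate_candidates
# kernel, so this is an illustration only, and skip_gram_pairs is a
# hypothetical helper, not a library function.
import random

def skip_gram_pairs(tokens, min_skips, max_skips,
                    emit_self_as_target=False, rng=random):
  pairs = []
  for i, token in enumerate(tokens):
    skips = rng.randint(min_skips, max_skips)  # inclusive, as in the op
    for j in range(max(0, i - skips), min(len(tokens), i + skips + 1)):
      if j == i and not emit_self_as_target:
        continue
      pairs.append((token, tokens[j]))
  return pairs

# With min_skips = max_skips = 1 the window is deterministic: the token
# "quick" pairs with "the" and "brown", matching the docstring's example.
print(skip_gram_pairs(["the", "quick", "brown", "fox", "jumps"], 1, 1))
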
def stratified_sample(data, labels, init_probs, target_probs, batch_size,
                      enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing
  the class data-distribution ahead of time.

  Args:
    data: Tensor for data. Either one item or a batch, according to
      enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
      depending on enqueue_many. It is not a one-hot vector.
    init_probs: 1D numpy or python array of class proportions in the data.
    target_probs: 1D numpy or python array of target class proportions in
      batch.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
      dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
      examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
      dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
      don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
      probability.
    TFAssertion: if labels aren't integers in [0, num classes).

  Returns:
    (data_batch, label_batch)

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    data_batch, labels = tf.contrib.framework.sampling_ops.stratified_sample(
        data, label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
  with ops.op_scope([data, labels], name, 'stratified_sample'):
    data = ops.convert_to_tensor(data)
    labels = ops.convert_to_tensor(labels)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      data = array_ops.expand_dims(data, 0)
      labels = array_ops.expand_dims(labels, 0)

    # Validate that input is consistent.
    data, labels, [init_probs, target_probs] = _verify_input(
        data, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    if np.any(np.logical_and(np.array(init_probs) == 0,
                             np.array(target_probs) != 0)):
      raise ValueError('Some initial probability class has nonzero target '
                       'probability.')

    # Calculate rejection sampling probabilities.
    reject_probs = _calculate_rejection_probabilities(init_probs,
                                                      target_probs)
    proportion_rejected = np.sum(np.array(reject_probs) *
                                 np.array(init_probs))
    if proportion_rejected > .5:
      logging.warning('Proportion of examples rejected by sampler is '
                      'high: %s', proportion_rejected)

    # Make a single queue to hold input examples.
    val, label = input_ops.batch([data, labels],
                                 batch_size=1,
                                 num_threads=threads_per_queue,
                                 capacity=queue_capacity,
                                 enqueue_many=True)
    val = array_ops.reshape(val, data.get_shape().with_rank_at_least(1)[1:])
    label = array_ops.reshape(
        label, labels.get_shape().with_rank_at_least(1)[1:])

    # Set up second queue containing batches that have the desired class
    # proportions.
    return _get_stratified_batch_from_tensors(
        val, label, reject_probs, batch_size, threads_per_queue)

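# _calculate_rejection_probabilities is not shown in this snippet; a plausible
# numpy sketch of the underlying math: if class i is accepted with probability
# a_i, the output class mass is init_i * a_i (renormalized), so choosing
# a_i = (target_i / init_i) / max_j(target_j / init_j) hits target_probs while
# keeping every a_i <= 1, with reject_i = 1 - a_i. Values are illustrative.
import numpy as np

init_probs = np.array([0.7, 0.2, 0.1])
target_probs = np.array([1.0 / 3, 1.0 / 3, 1.0 / 3])
ratios = target_probs / init_probs
accept_probs = ratios / ratios.max()   # [~0.143, 0.5, 1.0]
reject_probs = 1.0 - accept_probs
# Sanity check: the accepted mass, renormalized, matches the target.
accepted = init_probs * accept_probs
assert np.allclose(accepted / accepted.sum(), target_probs)
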
def rejection_sample(tensors, accept_prob_fn, batch_size, queue_threads=1,
                     enqueue_many=False, prebatch_capacity=16,
                     prebatch_threads=1, runtime_checks=False, name=None):
  """Stochastically creates batches by rejection sampling.

  Each list of non-batched tensors is evaluated by `accept_prob_fn`, to
  produce a scalar tensor between 0 and 1. This tensor corresponds to the
  probability of being accepted. When `batch_size` tensor groups have been
  accepted, the batch queue will return a mini-batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
      batch, according to enqueue_many.
    accept_prob_fn: A python lambda that takes a non-batch tensor from each
      item in `tensors`, and produces a scalar tensor.
    batch_size: Size of batch to be returned.
    queue_threads: The number of threads for the queue that will hold the
      final batch.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
      dimension.
    prebatch_capacity: Capacity for the large queue that is used to convert
      batched tensors to single examples.
    prebatch_threads: Number of threads for the large queue that is used to
      convert batched tensors to single examples.
    runtime_checks: Bool. If true, insert runtime checks on the output of
      `accept_prob_fn`. Using `True` might have a performance impact.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
      dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
      don't match.
    ValueError: if a zero initial probability class has a nonzero target
      probability.

  Returns:
    A list of tensors of the same length as `tensors`, with batch dimension
    `batch_size`.

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to data tensor.
    accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2
    data_batch = tf.contrib.training.rejection_sample(
        [data, label], accept_prob_fn, 16)

    # Run batch through network.
    ...
  """
  with variable_scope.variable_scope(name, 'rejection_sample', tensors):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    # Reduce the case of a batched example to that of a batch of a single
    # example by taking a batch of size one.
    if enqueue_many:
      # Validate that batch dimension of the input is consistent.
      tensor_list = _verify_data_inputs(tensor_list)

      # Make a single queue to hold input examples. Reshape output so
      # examples don't have singleton batch dimension.
      batched = input_ops.batch(tensor_list,
                                batch_size=1,
                                num_threads=prebatch_threads,
                                capacity=prebatch_capacity,
                                enqueue_many=True)
      tensor_list = [array_ops.squeeze(x, [0]) for x in batched]

    # Set up a queue containing batches that have the distribution.
    cur_prob = accept_prob_fn(tensor_list)
    if runtime_checks:
      cur_prob = array_ops.identity(
          control_flow_ops.with_dependencies(
              [check_ops.assert_less_equal(0.0, cur_prob),
               check_ops.assert_less_equal(cur_prob, 1.0)], cur_prob),
          name='prob_with_checks')
    keep_input = random_ops.random_uniform([]) < cur_prob
    return _conditional_batch(
        tensor_list, keep_input, batch_size, num_threads=queue_threads)

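# A small numpy Monte Carlo sketch of the keep_input coin flip at the end of
# rejection_sample(): an example is kept when a fresh uniform draw falls below
# its acceptance probability, so the kept stream follows the reweighted
# distribution. Uses the docstring's example accept_prob_fn; names and sizes
# are illustrative.
import numpy as np

rng = np.random.default_rng(0)
values = rng.normal(size=100000)
accept_prob = (np.tanh(values) + 1) / 2
kept = values[rng.uniform(size=values.shape) < accept_prob]
# tanh gives positive values better acceptance odds, so the kept sample
# skews positive even though the input is centered at zero.
assert kept.mean() > values.mean()
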
def stratified_sample(tensors, labels, target_probs, batch_size,
                      init_probs=None, enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing
  the class data-distribution ahead of time.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
      batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
      depending on enqueue_many. It is not a one-hot vector.
    target_probs: Target class proportions in batch. An object whose type has
      a registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    init_probs: Class proportions in the data. An object whose type has a
      registered Tensor conversion function, or `None` for estimating the
      initial distribution.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
      dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
      examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
      dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
      don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
      probability.
    TFAssertion: if labels aren't integers in [0, num classes).

  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the
    same length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.training.stratified_sample(
        [data], label, target_probs)

    # Run batch through network.
    ...
  """
  with ops.name_scope(name, 'stratified_sample', tensors + [labels]):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    labels = ops.convert_to_tensor(labels)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      tensor_list = [array_ops.expand_dims(tensor, 0)
                     for tensor in tensor_list]
      labels = array_ops.expand_dims(labels, 0)

    # If `init_probs` is `None`, set up online estimation of the data
    # distribution.
    if init_probs is None:
      # We use `target_probs` to get the number of classes, so its shape must
      # be fully defined at graph construction time.
      target_probs.get_shape().assert_is_fully_defined()
      init_probs = _estimate_data_distribution(
          labels, target_probs.get_shape().num_elements())
    else:
      init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)

    # Validate that input is consistent.
    tensor_list, labels, [init_probs, target_probs] = _verify_input(
        tensor_list, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = control_flow_ops.Assert(
        math_ops.reduce_all(
            math_ops.logical_or(
                math_ops.not_equal(init_probs, 0),
                math_ops.equal(target_probs, 0))),
        ['All classes with zero initial probability must also have zero '
         'target probability: ', init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs,
                                                       target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) *
                                              init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples. Reshape output so examples
    # don't have singleton batch dimension.
    batched = input_ops.batch(tensor_list + [labels],
                              batch_size=1,
                              num_threads=threads_per_queue,
                              capacity=queue_capacity,
                              enqueue_many=True)
    val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
    label = array_ops.squeeze(batched[-1], [0])

    # Set up second queue containing batches that have the desired class
    # proportions.
    cur_prob = array_ops.gather(accept_probs, label)
    keep_input = random_ops.random_uniform([]) < cur_prob
    batched = _conditional_batch(
        val_list + [label], keep_input, batch_size,
        num_threads=threads_per_queue)
    return batched[:-1], batched[-1]

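# _estimate_data_distribution is not shown here; a plausible pure-numpy sketch
# of online class-distribution estimation is a running histogram of observed
# labels (the library version may instead use smoothed or moving-average
# counts). RunningClassDistribution is a hypothetical helper, not part of the
# library.
import numpy as np

class RunningClassDistribution(object):
  """Estimates class proportions from a stream of integer labels."""

  def __init__(self, num_classes):
    # One pseudo-count per class keeps early estimates away from zero.
    self.counts = np.ones(num_classes, dtype=np.int64)

  def update(self, labels):
    self.counts += np.bincount(labels, minlength=len(self.counts))
    return self.counts / self.counts.sum()

est = RunningClassDistribution(num_classes=3)
probs = est.update(np.array([0, 0, 1, 2, 0]))  # -> [0.5, 0.25, 0.25]
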
def read_batch_examples(file_pattern, batch_size, reader,
                        randomize_input=True, queue_capacity=10000,
                        num_threads=1, name='dequeue_examples'):
  """Adds operations to read, queue, batch `Example` protos.

  Given file pattern (or list of files), will set up a queue for file names,
  read `Example` protos using the provided `reader`, and use a batch queue to
  create batches of examples of size `batch_size`.

  All queue runners are added to the queue runners collection, and may be
  started via `start_queue_runners`.

  All ops are added to the default graph.

  Args:
    file_pattern: List of files or pattern of file paths containing
      `Example` records. See `tf.gfile.Glob` for pattern rules.
    batch_size: An int or scalar `Tensor` specifying the batch size to use.
    reader: A function or class that returns an object with `read` method,
      (filename tensor) -> (example tensor).
    randomize_input: Whether the input should be randomized.
    queue_capacity: Capacity for input queue.
    num_threads: The number of threads enqueuing examples.
    name: Name of resulting op.

  Returns:
    String `Tensor` of batched `Example` proto.

  Raises:
    ValueError: for invalid inputs.
  """
  # Retrieve files to read.
  if isinstance(file_pattern, list):
    file_names = file_pattern
    if not file_names:
      raise ValueError('No files given to dequeue_examples.')
  else:
    file_names = list(gfile.Glob(file_pattern))
    if not file_names:
      raise ValueError('No files match %s.' % file_pattern)

  # Sort files so it will be deterministic for unit tests. They'll be
  # shuffled in `string_input_producer` if `randomize_input` is enabled.
  if not randomize_input:
    file_names = sorted(file_names)

  # Check input parameters are given and reasonable.
  if (not queue_capacity) or (queue_capacity <= 0):
    raise ValueError('Invalid queue_capacity %s.' % queue_capacity)
  if (batch_size is None) or (
      (not isinstance(batch_size, ops.Tensor)) and
      (batch_size <= 0 or batch_size > queue_capacity)):
    raise ValueError('Invalid batch_size %s, with queue_capacity %s.' %
                     (batch_size, queue_capacity))
  if (not num_threads) or (num_threads <= 0):
    raise ValueError('Invalid num_threads %s.' % num_threads)

  with ops.name_scope(name) as scope:
    # Setup filename queue with shuffling.
    with ops.name_scope('file_name_queue') as file_name_queue_scope:
      file_name_queue = input_ops.string_input_producer(
          constant_op.constant(file_names, name='input'),
          shuffle=randomize_input, name=file_name_queue_scope)

    # Create reader and set it to read from filename queue.
    with ops.name_scope('read'):
      _, example_proto = reader().read(file_name_queue)

    # Setup batching queue.
    if randomize_input:
      if isinstance(batch_size, ops.Tensor):
        min_after_dequeue = int(queue_capacity * 0.4)
      else:
        min_after_dequeue = max(queue_capacity - (3 * batch_size),
                                batch_size)
      examples = input_ops.shuffle_batch(
          [example_proto], batch_size, capacity=queue_capacity,
          num_threads=num_threads, min_after_dequeue=min_after_dequeue,
          name=scope)
    else:
      examples = input_ops.batch(
          [example_proto], batch_size, capacity=queue_capacity,
          num_threads=num_threads, name=scope)

    return examples

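# A hedged usage sketch for read_batch_examples, following its docstring: the
# reader argument is a callable returning an object with a read(queue)
# method, e.g. the TFRecordReader class. The file pattern and sizes below are
# illustrative, not taken from the library.
import tensorflow as tf

serialized_examples = read_batch_examples(
    file_pattern='/tmp/data/*.tfrecord',  # hypothetical path
    batch_size=128,
    reader=tf.TFRecordReader,
    randomize_input=True,
    queue_capacity=10000,
    num_threads=4)
# serialized_examples is a string Tensor of shape [128] holding serialized
# Example protos, ready to be decoded with tf.parse_example.
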
def skip_gram_sample(input_tensor,
                     min_skips=1,
                     max_skips=5,
                     start=0,
                     limit=-1,
                     emit_self_as_target=False,
                     vocab_freq_table=None,
                     vocab_min_count=None,
                     vocab_subsampling=None,
                     corpus_size=None,
                     batch_size=None,
                     batch_capacity=None,
                     seed=None,
                     name=None):
  """Generates skip-gram token and label paired Tensors from the input tensor.

  Generates skip-gram `("token", "label")` pairs using each element in the
  rank-1 `input_tensor` as a token. The window size used for each token will
  be randomly selected from the range specified by `[min_skips, max_skips]`,
  inclusive. See https://arxiv.org/abs/1301.3781 for more details about
  skip-gram.

  For example, given `input_tensor = ["the", "quick", "brown", "fox",
  "jumps"]`, `min_skips = 1`, `max_skips = 2`, `emit_self_as_target = False`,
  the output `(tokens, labels)` pairs for the token "quick" will be randomly
  selected from either `(tokens=["quick", "quick"], labels=["the", "brown"])`
  for 1 skip, or `(tokens=["quick", "quick", "quick"],
  labels=["the", "brown", "fox"])` for 2 skips.

  If `emit_self_as_target = True`, each token will also be emitted as a label
  for itself. From the previous example, the output will be either
  `(tokens=["quick", "quick", "quick"], labels=["the", "quick", "brown"])`
  for 1 skip, or `(tokens=["quick", "quick", "quick", "quick"],
  labels=["the", "quick", "brown", "fox"])` for 2 skips.

  The same process is repeated for each element of `input_tensor` and
  concatenated together into the two output rank-1 `Tensors` (one for all the
  tokens, another for all the labels).

  If `vocab_freq_table` is specified, tokens in `input_tensor` that are not
  present in the vocabulary are discarded. Tokens whose frequency counts are
  below `vocab_min_count` are also discarded. Tokens whose frequency
  proportions in the corpus exceed `vocab_subsampling` may be randomly
  down-sampled. See Eq. 5 in http://arxiv.org/abs/1310.4546 for more details
  about subsampling.

  Due to the random window sizes used for each token, the lengths of the
  outputs are non-deterministic, unless `batch_size` is specified to batch
  the outputs to always return `Tensors` of length `batch_size`.

  Args:
    input_tensor: A rank-1 `Tensor` from which to generate skip-gram
      candidates.
    min_skips: `int` or scalar `Tensor` specifying the minimum window size to
      randomly use for each token. Must be >= 0 and <= `max_skips`. If
      `min_skips` and `max_skips` are both 0, the only label outputted will
      be the token itself when `emit_self_as_target = True` - or no output
      otherwise.
    max_skips: `int` or scalar `Tensor` specifying the maximum window size to
      randomly use for each token. Must be >= 0.
    start: `int` or scalar `Tensor` specifying the position in `input_tensor`
      from which to start generating skip-gram candidates.
    limit: `int` or scalar `Tensor` specifying the maximum number of elements
      in `input_tensor` to use in generating skip-gram candidates. -1 means
      to use the rest of the `Tensor` after `start`.
    emit_self_as_target: `bool` or scalar `Tensor` specifying whether to emit
      each token as a label for itself.
    vocab_freq_table: (Optional) A lookup table (subclass of
      `lookup.InitializableLookupTableBase`) that maps tokens to their raw
      frequency counts. If specified, any token in `input_tensor` that is not
      found in `vocab_freq_table` will be filtered out before generating
      skip-gram candidates. While this will typically map to integer raw
      frequency counts, it could also map to float frequency proportions.
      `vocab_min_count` and `corpus_size` should be in the same units as
      this.
    vocab_min_count: (Optional) `int`, `float`, or scalar `Tensor` specifying
      minimum frequency threshold (from `vocab_freq_table`) for a token to be
      kept in `input_tensor`. If this is specified, `vocab_freq_table` must
      also be specified - and they should both be in the same units.
    vocab_subsampling: (Optional) `float` specifying frequency proportion
      threshold for tokens from `input_tensor`. Tokens that occur more
      frequently (based on the ratio of the token's `vocab_freq_table` value
      to the `corpus_size`) will be randomly down-sampled. Reasonable
      starting values may be around 1e-3 or 1e-5. If this is specified, both
      `vocab_freq_table` and `corpus_size` must also be specified. See Eq. 5
      in http://arxiv.org/abs/1310.4546 for more details.
    corpus_size: (Optional) `int`, `float`, or scalar `Tensor` specifying the
      total number of tokens in the corpus (e.g., sum of all the frequency
      counts of `vocab_freq_table`). Used with `vocab_subsampling` for
      down-sampling frequently occurring tokens. If this is specified,
      `vocab_freq_table` and `vocab_subsampling` must also be specified.
    batch_size: (Optional) `int` specifying batch size of returned `Tensors`.
    batch_capacity: (Optional) `int` specifying batch capacity for the queue
      used for batching returned `Tensors`. Only has an effect if
      `batch_size` > 0. Defaults to 100 * `batch_size` if not specified.
    seed: (Optional) `int` used to create a random seed for window size and
      subsampling. See `set_random_seed` docs for behavior.
    name: (Optional) A `string` name or a name scope for the operations.

  Returns:
    A `tuple` containing (token, label) `Tensors`. Each output `Tensor` is of
    rank-1 and has the same type as `input_tensor`. The `Tensors` will be of
    length `batch_size`; if `batch_size` is not specified, they will be of
    random length, though they will be in sync with each other as long as
    they are evaluated together.

  Raises:
    ValueError: If `vocab_freq_table` is not provided, but `vocab_min_count`,
      `vocab_subsampling`, or `corpus_size` is specified.
      If `vocab_subsampling` and `corpus_size` are not both present or
      both absent.
  """
  if vocab_freq_table is None and (vocab_min_count is not None or
                                   vocab_subsampling is not None or
                                   corpus_size is not None):
    raise ValueError(
        "vocab_freq_table is not provided, but vocab_min_count={}, "
        "vocab_subsampling={}, or corpus_size={} is not None. These settings "
        "are useless without a vocab_freq_table.".format(
            vocab_min_count, vocab_subsampling, corpus_size))

  if (vocab_subsampling is None) != (corpus_size is None):
    raise ValueError(
        "vocab_subsampling is {} while corpus_size is {} - both must be "
        "provided in order for subsampling to work.".format(
            vocab_subsampling, corpus_size))

  with ops.name_scope(
      name,
      "skip_gram_sample",
      values=[input_tensor, min_skips, max_skips, start, limit]):

    input_tensor = _filter_input(input_tensor=input_tensor,
                                 vocab_freq_table=vocab_freq_table,
                                 vocab_min_count=vocab_min_count,
                                 vocab_subsampling=vocab_subsampling,
                                 corpus_size=corpus_size,
                                 seed=seed)

    seed1, seed2 = random_seed.get_seed(seed)
    tokens, labels = skip_gram_ops.skip_gram_generate_candidates(
        input_tensor=input_tensor,
        min_skips=min_skips,
        max_skips=max_skips,
        start=start,
        limit=limit,
        emit_self_as_target=emit_self_as_target,
        # Note that seed here should be seed1! This is due to
        # GuardedPhiloxRandom's hard-coded attributes of "seed" and "seed2".
        seed=seed1,
        seed2=seed2)

    # TODO(weiho): If the need arises, add support for sparse input_tensor
    # that figures out sentence boundaries, then calls
    # skip_gram_generate_candidates() on each sentence.

    # Batches the (tokens, labels) outputs so that they will be of
    # deterministic batch_size, to facilitate feeding them into the rest of
    # the network.
    if batch_size is not None and batch_size > 0:
      batch_capacity = (batch_capacity
                        if (batch_capacity is not None and batch_capacity > 0)
                        else 100 * batch_size)
      return input_ops.batch([tokens, labels],
                             batch_size,
                             capacity=batch_capacity,
                             enqueue_many=True)

    return tokens, labels

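# A pure-Python sketch of the frequency subsampling cited in the docstring
# (Eq. 5 of http://arxiv.org/abs/1310.4546): a token whose corpus frequency
# proportion is f is discarded with probability 1 - sqrt(t / f), where t is
# the vocab_subsampling threshold. The library's _filter_input may use a
# variant of this formula; subsample here is a hypothetical helper.
import math
import random

def subsample(tokens, freq_proportion, t=1e-3, rng=random):
  kept = []
  for token in tokens:
    f = freq_proportion[token]
    discard_prob = max(0.0, 1.0 - math.sqrt(t / f))
    if rng.random() >= discard_prob:
      kept.append(token)
  return kept

# With t = 1e-3, a very common token (f = 0.05) is dropped roughly 86% of the
# time, while tokens at or below the threshold are always kept.
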
def stratified_sample(data, labels, init_probs, target_probs, batch_size,
                      enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    data: Tensor for data. Either one item or a batch, according to
        enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function.
    target_probs: Target class proportions in batch. An object whose type has
        a registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).

  Returns:
    (data_batch, label_batch)

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    data_batch, labels = tf.contrib.framework.sampling_ops.stratified_sample(
        data, label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
  with ops.op_scope([data, labels], name, 'stratified_sample'):
    data = ops.convert_to_tensor(data)
    labels = ops.convert_to_tensor(labels)
    init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)

    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      data = array_ops.expand_dims(data, 0)
      labels = array_ops.expand_dims(labels, 0)

    # Validate that input is consistent.
    data, labels, [init_probs, target_probs] = _verify_input(
        data, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = logging_ops.Assert(math_ops.reduce_all(math_ops.logical_or(
        math_ops.not_equal(init_probs, 0), math_ops.equal(target_probs, 0))),
                                   [init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs,
                                                       target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples.
    val, label = input_ops.batch([data, labels],
                                 batch_size=1,
                                 num_threads=threads_per_queue,
                                 capacity=queue_capacity,
                                 enqueue_many=True)
    val = array_ops.reshape(val, data.get_shape().with_rank_at_least(1)[1:])
    label = array_ops.reshape(
        label, labels.get_shape().with_rank_at_least(1)[1:])

    # Set up second queue containing batches that have the desired class
    # proportions.
    return _get_stratified_batch_from_tensors(
        val, label, accept_probs, batch_size, threads_per_queue)
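# `_calculate_acceptance_probabilities` is defined elsewhere in the library.
# A plausible sketch of the standard rejection-sampling construction its name
# suggests: accept class i with probability (t_i / p_i) / max_j (t_j / p_j),
# which turns a stream with class proportions p into one with proportions t
# while keeping the largest possible fraction of examples. Illustrative only,
# not the library's exact implementation:
import numpy as np

def acceptance_probabilities(init_probs, target_probs):
  init_probs = np.asarray(init_probs, dtype=np.float64)
  target_probs = np.asarray(target_probs, dtype=np.float64)
  with np.errstate(divide='ignore', invalid='ignore'):
    # Classes with zero initial probability contribute nothing, so their
    # acceptance probability is pinned to zero.
    ratio = np.where(init_probs > 0, target_probs / init_probs, 0.0)
  return ratio / ratio.max()

# E.g. data that is 90% class 0 / 10% class 1, rebalanced to 50/50:
# acceptance_probabilities([0.9, 0.1], [0.5, 0.5]) -> [1/9, 1.0]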
def stratified_sample(tensors, labels, target_probs, batch_size,
                      init_probs=None, enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on `enqueue_many`. It is not a one-hot vector.
    target_probs: Target class proportions in batch. An object whose type has
        a registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function, or `None` for estimating the
        initial distribution.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: If `tensors` isn't iterable.
    ValueError: `enqueue_many` is True and labels doesn't have a batch
        dimension, or if `enqueue_many` is False and labels isn't a scalar.
    ValueError: `enqueue_many` is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).

  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the
        same length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.training.stratified_sample(
        [data], label, target_probs)

    # Run batch through network.
    ...
  """
  with ops.name_scope(name, 'stratified_sample', list(tensors) + [labels]):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    labels = ops.convert_to_tensor(labels)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      tensor_list = [array_ops.expand_dims(tensor, 0)
                     for tensor in tensor_list]
      labels = array_ops.expand_dims(labels, 0)

    # If `init_probs` is `None`, set up online estimation of data
    # distribution.
    if init_probs is None:
      # We use `target_probs` to get the number of classes, so its shape must
      # be fully defined at graph construction time.
      target_probs.get_shape().assert_is_fully_defined()
      init_probs = _estimate_data_distribution(
          labels, target_probs.get_shape().num_elements())
    else:
      init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)

    # Validate that input is consistent.
    tensor_list, labels, [init_probs, target_probs] = _verify_input(
        tensor_list, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = control_flow_ops.Assert(
        math_ops.reduce_all(
            math_ops.logical_or(
                math_ops.not_equal(init_probs, 0),
                math_ops.equal(target_probs, 0))),
        ['All classes with zero initial probability must also have zero '
         'target probability: ', init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs,
                                                       target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples. Reshape output so examples
    # don't have singleton batch dimension.
    batched = input_ops.batch(tensor_list + [labels],
                              batch_size=1,
                              num_threads=threads_per_queue,
                              capacity=queue_capacity,
                              enqueue_many=True)
    val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
    label = array_ops.squeeze(batched[-1], [0])

    # Set up second queue containing batches that have the desired class
    # proportions.
    cur_prob = array_ops.gather(accept_probs, label)
    batched = input_ops.maybe_batch(
        val_list + [label],
        keep_input=random_ops.random_uniform([]) < cur_prob,
        batch_size=batch_size,
        num_threads=threads_per_queue)
    return batched[:-1], batched[-1]
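# Hedged usage sketch for the variant above, which can estimate the data
# distribution online: with `init_probs=None`, only the target proportions
# are needed. Toy tensors stand in for a real data provider (TF 1.x queue
# runners assumed):
import tensorflow as tf

data = tf.random_normal([4])  # stand-in for one example's features
label = tf.random_uniform([], maxval=2, dtype=tf.int32)  # stand-in class id

[data_batch], label_batch = tf.contrib.training.stratified_sample(
    [data], label,
    target_probs=[0.5, 0.5],  # rebalance to 50/50 over two classes
    batch_size=32)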
def rejection_sample(tensors, accept_prob_fn, batch_size, queue_threads=1,
                     enqueue_many=False, prebatch_capacity=16,
                     prebatch_threads=1, runtime_checks=False, name=None):
  """Stochastically creates batches by rejection sampling.

  Each list of non-batched tensors is evaluated by `accept_prob_fn`, to
  produce a scalar tensor between 0 and 1. This tensor corresponds to the
  probability of being accepted. When `batch_size` tensor groups have been
  accepted, the batch queue will return a mini-batch.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    accept_prob_fn: A python lambda that takes a non-batch tensor from each
        item in `tensors`, and produces a scalar tensor.
    batch_size: Size of batch to be returned.
    queue_threads: The number of threads for the queue that will hold the
        final batch.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    prebatch_capacity: Capacity for the large queue that is used to convert
        batched tensors to single examples.
    prebatch_threads: Number of threads for the large queue that is used to
        convert batched tensors to single examples.
    runtime_checks: Bool. If true, insert runtime checks on the output of
        `accept_prob_fn`. Using `True` might have a performance impact.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if a zero initial probability class has a nonzero target
        probability.

  Returns:
    A list of tensors of the same length as `tensors`, with batch dimension
    `batch_size`.

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to data tensor.
    accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2
    data_batch = tf.contrib.training.rejection_sample(
        [data, label], accept_prob_fn, 16)

    # Run batch through network.
    ...
  """
  with variable_scope.variable_scope(name, 'rejection_sample', tensors):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    # Reduce the case of a batched example to that of a batch of a single
    # example by taking a batch of size one.
    if enqueue_many:
      # Validate that batch dimension of the input is consistent.
      tensor_list = _verify_data_inputs(tensor_list)

      # Make a single queue to hold input examples. Reshape output so
      # examples don't have singleton batch dimension.
      batched = input_ops.batch(tensor_list,
                                batch_size=1,
                                num_threads=prebatch_threads,
                                capacity=prebatch_capacity,
                                enqueue_many=True)
      tensor_list = [array_ops.squeeze(x, [0]) for x in batched]

    # Set up a queue containing batches that have the distribution.
    cur_prob = accept_prob_fn(tensor_list)
    if runtime_checks:
      cur_prob = array_ops.identity(
          control_flow_ops.with_dependencies([
              check_ops.assert_less_equal(0.0, cur_prob),
              check_ops.assert_less_equal(cur_prob, 1.0)
          ], cur_prob),
          name='prob_with_checks')
    minibatch = input_ops.maybe_batch(
        tensor_list,
        keep_input=random_ops.random_uniform([]) < cur_prob,
        batch_size=batch_size,
        num_threads=queue_threads)

    # Queues return a single tensor if the list of enqueued tensors is one.
    # Since we want the type to always be the same, always return a list.
    if isinstance(minibatch, ops.Tensor):
      minibatch = [minibatch]

    return minibatch
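# Hedged usage sketch for `rejection_sample` (toy tensor; TF 1.x queue
# runners assumed). The acceptance probability is computed from the example
# itself, so any rule that yields a scalar in [0, 1] works:
import tensorflow as tf

value = tf.random_uniform([], minval=-3.0, maxval=3.0)

# Keep large values more often than small ones.
accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2

[value_batch] = tf.contrib.training.rejection_sample(
    [value], accept_prob_fn, batch_size=16)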
def stratified_sample(tensors, labels, init_probs, target_probs, batch_size,
                      enqueue_many=False, queue_capacity=16,
                      threads_per_queue=1, name=None):
  """Stochastically creates batches based on per-class probabilities.

  This method discards examples. Internally, it creates one queue to amortize
  the cost of disk reads, and one queue to hold the properly-proportioned
  batch. See `stratified_sample_unknown_dist` for a function that performs
  stratified sampling with one queue per class and doesn't require knowing the
  class data-distribution ahead of time.

  Args:
    tensors: List of tensors for data. All tensors are either one item or a
        batch, according to enqueue_many.
    labels: Tensor for label of data. Label is a single integer or a batch,
        depending on enqueue_many. It is not a one-hot vector.
    init_probs: Class proportions in the data. An object whose type has a
        registered Tensor conversion function.
    target_probs: Target class proportions in batch. An object whose type has
        a registered Tensor conversion function.
    batch_size: Size of batch to be returned.
    enqueue_many: Bool. If true, interpret input tensors as having a batch
        dimension.
    queue_capacity: Capacity of the large queue that holds input examples.
    threads_per_queue: Number of threads for the large queue that holds input
        examples and for the final queue with the proper class proportions.
    name: Optional prefix for ops created by this function.

  Raises:
    ValueError: enqueue_many is True and labels doesn't have a batch
        dimension, or if enqueue_many is False and labels isn't a scalar.
    ValueError: enqueue_many is True, and batch dimension on data and labels
        don't match.
    ValueError: if probs don't sum to one.
    ValueError: if a zero initial probability class has a nonzero target
        probability.
    TFAssertion: if labels aren't integers in [0, num classes).

  Returns:
    (data_batch, label_batch), where data_batch is a list of tensors of the
        same length as `tensors`

  Example:
    # Get tensor for a single data and label example.
    data, label = data_provider.Get(['data', 'label'])

    # Get stratified batch according to per-class probabilities.
    init_probs = [1.0/NUM_CLASSES for _ in range(NUM_CLASSES)]
    target_probs = [...distribution you want...]
    [data_batch], labels = tf.contrib.framework.sampling_ops.stratified_sample(
        [data], label, init_probs, target_probs)

    # Run batch through network.
    ...
  """
  with ops.op_scope(tensors + [labels], name, 'stratified_sample'):
    tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors)
    labels = ops.convert_to_tensor(labels)
    init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32)
    target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32)
    # Reduce the case of a single example to that of a batch of size 1.
    if not enqueue_many:
      tensor_list = [array_ops.expand_dims(tensor, 0)
                     for tensor in tensor_list]
      labels = array_ops.expand_dims(labels, 0)

    # Validate that input is consistent.
    tensor_list, labels, [init_probs, target_probs] = _verify_input(
        tensor_list, labels, [init_probs, target_probs])

    # Check that all zero initial probabilities also have zero target
    # probabilities.
    assert_op = logging_ops.Assert(math_ops.reduce_all(math_ops.logical_or(
        math_ops.not_equal(init_probs, 0), math_ops.equal(target_probs, 0))),
                                   [init_probs, target_probs])
    init_probs = control_flow_ops.with_dependencies([assert_op], init_probs)

    # Calculate acceptance sampling probabilities.
    accept_probs = _calculate_acceptance_probabilities(init_probs,
                                                       target_probs)
    proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs)
    accept_probs = control_flow_ops.cond(
        math_ops.less(proportion_rejected, .5),
        lambda: accept_probs,
        lambda: logging_ops.Print(  # pylint: disable=g-long-lambda
            accept_probs, [accept_probs],
            message='Proportion of examples rejected by sampler is high.',
            first_n=10))

    # Make a single queue to hold input examples. Reshape output so examples
    # don't have singleton batch dimension.
    batched = input_ops.batch(tensor_list + [labels],
                              batch_size=1,
                              num_threads=threads_per_queue,
                              capacity=queue_capacity,
                              enqueue_many=True)
    val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]]
    label = array_ops.squeeze(batched[-1], [0])

    # Set up second queue containing batches that have the desired class
    # proportions.
    batched = _get_stratified_batch_from_tensors(
        val_list, label, accept_probs, batch_size, threads_per_queue)
    return batched[:-1], batched[-1]
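# Sanity check of the rejection construction shared by the `stratified_sample`
# variants above, assuming acceptance probabilities of the standard form
# a_i = (t_i / p_i) / max_j (t_j / p_j): the post-rejection proportion of
# class i is p_i * a_i / sum_j (p_j * a_j) = t_i / sum_j t_j = t_i, since the
# common max_j factor cancels and the targets sum to one. Quick numeric check
# (illustrative only):
import numpy as np

p = np.array([0.9, 0.1])  # observed class proportions
t = np.array([0.5, 0.5])  # desired class proportions
a = (t / p) / (t / p).max()
post = p * a / (p * a).sum()
assert np.allclose(post, t)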