def rejection_sample(tensors, accept_prob_fn, batch_size, queue_threads=1, enqueue_many=False, prebatch_capacity=16, prebatch_threads=1, runtime_checks=False, name=None): """Stochastically creates batches by rejection sampling. Each list of non-batched tensors is evaluated by `accept_prob_fn`, to produce a scalar tensor between 0 and 1. This tensor corresponds to the probability of being accepted. When `batch_size` tensor groups have been accepted, the batch queue will return a mini-batch. Args: tensors: List of tensors for data. All tensors are either one item or a batch, according to enqueue_many. accept_prob_fn: A python lambda that takes a non-batch tensor from each item in `tensors`, and produces a scalar tensor. batch_size: Size of batch to be returned. queue_threads: The number of threads for the queue that will hold the final batch. enqueue_many: Bool. If true, interpret input tensors as having a batch dimension. prebatch_capacity: Capacity for the large queue that is used to convert batched tensors to single examples. prebatch_threads: Number of threads for the large queue that is used to convert batched tensors to single examples. runtime_checks: Bool. If true, insert runtime checks on the output of `accept_prob_fn`. Using `True` might have a performance impact. name: Optional prefix for ops created by this function. Raises: ValueError: enqueue_many is True and labels doesn't have a batch dimension, or if enqueue_many is False and labels isn't a scalar. ValueError: enqueue_many is True, and batch dimension on data and labels don't match. ValueError: if a zero initial probability class has a nonzero target probability. Returns: A list of tensors of the same length as `tensors`, with batch dimension `batch_size`. Example: # Get tensor for a single data and label example. data, label = data_provider.Get(['data', 'label']) # Get stratified batch according to data tensor. accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2 data_batch = tf.contrib.training.rejection_sample( [data, label], accept_prob_fn, 16) # Run batch through network. ... """ with variable_scope.variable_scope(name, 'rejection_sample', tensors): tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors) # Reduce the case of a batched example to that of a batch of a single # example by taking a batch of size one. if enqueue_many: # Validate that batch dimension of the input is consistent. tensor_list = _verify_data_inputs(tensor_list) # Make a single queue to hold input examples. Reshape output so examples # don't have singleton batch dimension. batched = input_ops.batch(tensor_list, batch_size=1, num_threads=prebatch_threads, capacity=prebatch_capacity, enqueue_many=True) tensor_list = [array_ops.squeeze(x, [0]) for x in batched] # Set up a queue containing batches that have the distribution. cur_prob = accept_prob_fn(tensor_list) if runtime_checks: cur_prob = array_ops.identity(control_flow_ops.with_dependencies([ check_ops.assert_less_equal(0.0, cur_prob), check_ops.assert_less_equal(cur_prob, 1.0) ], cur_prob), name='prob_with_checks') minibatch = input_ops.maybe_batch( tensor_list, keep_input=random_ops.random_uniform([]) < cur_prob, batch_size=batch_size, num_threads=queue_threads) # Queues return a single tensor if the list of enqueued tensors is one. Since # we want the type to always be the same, always return a list. if isinstance(minibatch, ops.Tensor): minibatch = [minibatch] return minibatch
def rejection_sample(tensors, accept_prob_fn, batch_size, queue_threads=1, enqueue_many=False, prebatch_capacity=16, prebatch_threads=1, runtime_checks=False, name=None): """Stochastically creates batches by rejection sampling. Each list of non-batched tensors is evaluated by `accept_prob_fn`, to produce a scalar tensor between 0 and 1. This tensor corresponds to the probability of being accepted. When `batch_size` tensor groups have been accepted, the batch queue will return a mini-batch. Args: tensors: List of tensors for data. All tensors are either one item or a batch, according to enqueue_many. accept_prob_fn: A python lambda that takes a non-batch tensor from each item in `tensors`, and produces a scalar tensor. batch_size: Size of batch to be returned. queue_threads: The number of threads for the queue that will hold the final batch. enqueue_many: Bool. If true, interpret input tensors as having a batch dimension. prebatch_capacity: Capacity for the large queue that is used to convert batched tensors to single examples. prebatch_threads: Number of threads for the large queue that is used to convert batched tensors to single examples. runtime_checks: Bool. If true, insert runtime checks on the output of `accept_prob_fn`. Using `True` might have a performance impact. name: Optional prefix for ops created by this function. Raises: ValueError: enqueue_many is True and labels doesn't have a batch dimension, or if enqueue_many is False and labels isn't a scalar. ValueError: enqueue_many is True, and batch dimension on data and labels don't match. ValueError: if a zero initial probability class has a nonzero target probability. Returns: A list of tensors of the same length as `tensors`, with batch dimension `batch_size`. Example: # Get tensor for a single data and label example. data, label = data_provider.Get(['data', 'label']) # Get stratified batch according to data tensor. accept_prob_fn = lambda x: (tf.tanh(x[0]) + 1) / 2 data_batch = tf.contrib.training.rejection_sample( [data, label], accept_prob_fn, 16) # Run batch through network. ... """ with variable_scope.variable_scope(name, 'rejection_sample', tensors): tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors) # Reduce the case of a batched example to that of a batch of a single # example by taking a batch of size one. if enqueue_many: # Validate that batch dimension of the input is consistent. tensor_list = _verify_data_inputs(tensor_list) # Make a single queue to hold input examples. Reshape output so examples # don't have singleton batch dimension. batched = input_ops.batch( tensor_list, batch_size=1, num_threads=prebatch_threads, capacity=prebatch_capacity, enqueue_many=True) tensor_list = [array_ops.squeeze(x, [0]) for x in batched] # Set up a queue containing batches that have the distribution. cur_prob = accept_prob_fn(tensor_list) if runtime_checks: cur_prob = array_ops.identity( control_flow_ops.with_dependencies([ check_ops.assert_less_equal(0.0, cur_prob), check_ops.assert_less_equal(cur_prob, 1.0) ], cur_prob), name='prob_with_checks') minibatch = input_ops.maybe_batch( tensor_list, keep_input=random_ops.random_uniform([]) < cur_prob, batch_size=batch_size, num_threads=queue_threads) # Queues return a single tensor if the list of enqued tensors is one. Since # we want the type to always be the same, always return a list. if isinstance(minibatch, ops.Tensor): minibatch = [minibatch] return minibatch
def stratified_sample(tensors, labels, target_probs, batch_size, init_probs=None, enqueue_many=False, queue_capacity=16, threads_per_queue=1, name=None): """Stochastically creates batches based on per-class probabilities. This method discards examples. Internally, it creates one queue to amortize the cost of disk reads, and one queue to hold the properly-proportioned batch. Args: tensors: List of tensors for data. All tensors are either one item or a batch, according to enqueue_many. labels: Tensor for label of data. Label is a single integer or a batch, depending on `enqueue_many`. It is not a one-hot vector. target_probs: Target class proportions in batch. An object whose type has a registered Tensor conversion function. batch_size: Size of batch to be returned. init_probs: Class proportions in the data. An object whose type has a registered Tensor conversion function, or `None` for estimating the initial distribution. enqueue_many: Bool. If true, interpret input tensors as having a batch dimension. queue_capacity: Capacity of the large queue that holds input examples. threads_per_queue: Number of threads for the large queue that holds input examples and for the final queue with the proper class proportions. name: Optional prefix for ops created by this function. Raises: ValueError: If `tensors` isn't iterable. ValueError: `enqueue_many` is True and labels doesn't have a batch dimension, or if `enqueue_many` is False and labels isn't a scalar. ValueError: `enqueue_many` is True, and batch dimension on data and labels don't match. ValueError: if probs don't sum to one. ValueError: if a zero initial probability class has a nonzero target probability. TFAssertion: if labels aren't integers in [0, num classes). Returns: (data_batch, label_batch), where data_batch is a list of tensors of the same length as `tensors` Example: # Get tensor for a single data and label example. data, label = data_provider.Get(['data', 'label']) # Get stratified batch according to per-class probabilities. target_probs = [...distribution you want...] [data_batch], labels = tf.contrib.training.stratified_sample( [data], label, target_probs) # Run batch through network. ... """ with ops.name_scope(name, 'stratified_sample', list(tensors) + [labels]): tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors) labels = ops.convert_to_tensor(labels) target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32) # Reduce the case of a single example to that of a batch of size 1. if not enqueue_many: tensor_list = [ array_ops.expand_dims(tensor, 0) for tensor in tensor_list ] labels = array_ops.expand_dims(labels, 0) # If `init_probs` is `None`, set up online estimation of data distribution. if init_probs is None: # We use `target_probs` to get the number of classes, so its shape must be # fully defined at graph construction time. target_probs.get_shape().assert_is_fully_defined() init_probs = _estimate_data_distribution( labels, target_probs.get_shape().num_elements()) else: init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32) # Validate that input is consistent. tensor_list, labels, [init_probs, target_probs ] = _verify_input(tensor_list, labels, [init_probs, target_probs]) # Check that all zero initial probabilities also have zero target # probabilities. assert_op = control_flow_ops.Assert( math_ops.reduce_all( math_ops.logical_or(math_ops.not_equal(init_probs, 0), math_ops.equal(target_probs, 0))), [ 'All classes with zero initial probability must also have zero target ' 'probability: ', init_probs, target_probs ]) init_probs = control_flow_ops.with_dependencies([assert_op], init_probs) # Calculate acceptance sampling probabilities. accept_probs = _calculate_acceptance_probabilities( init_probs, target_probs) proportion_rejected = math_ops.reduce_sum( (1 - accept_probs) * init_probs) accept_probs = control_flow_ops.cond( math_ops.less(proportion_rejected, .5), lambda: accept_probs, lambda: logging_ops.Print( # pylint: disable=g-long-lambda accept_probs, [accept_probs], message='Proportion of examples rejected by sampler is high.', first_n=10)) # Make a single queue to hold input examples. Reshape output so examples # don't have singleton batch dimension. batched = input_ops.batch(tensor_list + [labels], batch_size=1, num_threads=threads_per_queue, capacity=queue_capacity, enqueue_many=True) val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]] label = array_ops.squeeze(batched[-1], [0]) # Set up second queue containing batches that have the desired class # proportions. cur_prob = array_ops.gather(accept_probs, label) batched = input_ops.maybe_batch( val_list + [label], keep_input=random_ops.random_uniform([]) < cur_prob, batch_size=batch_size, num_threads=threads_per_queue) return batched[:-1], batched[-1]
def stratified_sample(tensors, labels, target_probs, batch_size, init_probs=None, enqueue_many=False, queue_capacity=16, threads_per_queue=1, name=None): """Stochastically creates batches based on per-class probabilities. This method discards examples. Internally, it creates one queue to amortize the cost of disk reads, and one queue to hold the properly-proportioned batch. Args: tensors: List of tensors for data. All tensors are either one item or a batch, according to enqueue_many. labels: Tensor for label of data. Label is a single integer or a batch, depending on enqueue_many. It is not a one-hot vector. target_probs: Target class proportions in batch. An object whose type has a registered Tensor conversion function. batch_size: Size of batch to be returned. init_probs: Class proportions in the data. An object whose type has a registered Tensor conversion function, or `None` for estimating the initial distribution. enqueue_many: Bool. If true, interpret input tensors as having a batch dimension. queue_capacity: Capacity of the large queue that holds input examples. threads_per_queue: Number of threads for the large queue that holds input examples and for the final queue with the proper class proportions. name: Optional prefix for ops created by this function. Raises: ValueError: enqueue_many is True and labels doesn't have a batch dimension, or if enqueue_many is False and labels isn't a scalar. ValueError: enqueue_many is True, and batch dimension on data and labels don't match. ValueError: if probs don't sum to one. ValueError: if a zero initial probability class has a nonzero target probability. TFAssertion: if labels aren't integers in [0, num classes). Returns: (data_batch, label_batch), where data_batch is a list of tensors of the same length as `tensors` Example: # Get tensor for a single data and label example. data, label = data_provider.Get(['data', 'label']) # Get stratified batch according to per-class probabilities. target_probs = [...distribution you want...] [data_batch], labels = tf.contrib.training.stratified_sample( [data], label, target_probs) # Run batch through network. ... """ with ops.name_scope(name, 'stratified_sample', tensors + [labels]): tensor_list = ops.convert_n_to_tensor_or_indexed_slices(tensors) labels = ops.convert_to_tensor(labels) target_probs = ops.convert_to_tensor(target_probs, dtype=dtypes.float32) # Reduce the case of a single example to that of a batch of size 1. if not enqueue_many: tensor_list = [array_ops.expand_dims(tensor, 0) for tensor in tensor_list] labels = array_ops.expand_dims(labels, 0) # If `init_probs` is `None`, set up online estimation of data distribution. if init_probs is None: # We use `target_probs` to get the number of classes, so its shape must be # fully defined at graph construction time. target_probs.get_shape().assert_is_fully_defined() init_probs = _estimate_data_distribution( labels, target_probs.get_shape().num_elements()) else: init_probs = ops.convert_to_tensor(init_probs, dtype=dtypes.float32) # Validate that input is consistent. tensor_list, labels, [init_probs, target_probs] = _verify_input( tensor_list, labels, [init_probs, target_probs]) # Check that all zero initial probabilities also have zero target # probabilities. assert_op = control_flow_ops.Assert( math_ops.reduce_all( math_ops.logical_or( math_ops.not_equal(init_probs, 0), math_ops.equal(target_probs, 0))), ['All classes with zero initial probability must also have zero target ' 'probability: ', init_probs, target_probs ]) init_probs = control_flow_ops.with_dependencies([assert_op], init_probs) # Calculate acceptance sampling probabilities. accept_probs = _calculate_acceptance_probabilities(init_probs, target_probs) proportion_rejected = math_ops.reduce_sum((1 - accept_probs) * init_probs) accept_probs = control_flow_ops.cond( math_ops.less(proportion_rejected, .5), lambda: accept_probs, lambda: logging_ops.Print( # pylint: disable=g-long-lambda accept_probs, [accept_probs], message='Proportion of examples rejected by sampler is high.', first_n=10)) # Make a single queue to hold input examples. Reshape output so examples # don't have singleton batch dimension. batched = input_ops.batch( tensor_list + [labels], batch_size=1, num_threads=threads_per_queue, capacity=queue_capacity, enqueue_many=True) val_list = [array_ops.squeeze(x, [0]) for x in batched[:-1]] label = array_ops.squeeze(batched[-1], [0]) # Set up second queue containing batches that have the desired class # proportions. cur_prob = array_ops.gather(accept_probs, label) batched = input_ops.maybe_batch( val_list + [label], keep_input=random_ops.random_uniform([]) < cur_prob, batch_size=batch_size, num_threads=threads_per_queue) return batched[:-1], batched[-1]