def recall_at_k(labels, predictions, k):
    """Compute recall at position k.

    A label counts as a true positive when it appears among the `k`
    highest-scoring classes of its example, and as a false negative
    otherwise. Counts are summed over all examples before dividing.

    :param labels: shape=(num_examples,), dtype=tf.int64
    :param predictions: logits of shape=(num_examples, num_classes)
    :param k: recall position
    :return: recall at position k

    Example:
        labels = tf.constant([0, 1, 1], dtype=tf.int64)
        predictions = tf.constant([[0.1, 0.2, 0.3],
                                   [3, 5, 2],
                                   [0.3, 0.4, 0.7]])
        recall_at_k(labels, predictions, 2)  # = 0.6667
    """
    # One label per row, so that each row can be treated as a set.
    label_sets = expand_dims(labels, axis=1)
    top_k_indices = math_ops.to_int64(nn.top_k(predictions, k)[1])

    # True positives: labels that are present in the top-k indices.
    hits = sets.set_intersection(top_k_indices, label_sets)
    true_positives = math_ops.reduce_sum(
        math_ops.to_double(sets.set_size(hits)))

    # False negatives: labels minus top-k indices (aminusb=False means b - a).
    misses = sets.set_difference(top_k_indices, label_sets, aminusb=False)
    false_negatives = math_ops.reduce_sum(
        math_ops.to_double(sets.set_size(misses)))

    return math_ops.div(
        true_positives,
        math_ops.add(true_positives, false_negatives),
        name='recall_at_k')
def _compute_accuracy(logits, targets, weights=None):
    """Build a streaming accuracy metric from raw logits.

    `self` is not a parameter; it is resolved from the enclosing scope and
    only `self._n_classes` is read.

    Args:
      logits: Model outputs passed to `nn.top_k` (more than two classes) or
        thresholded at zero (otherwise).
      targets: Ground-truth labels; flattened in the two-class branch.
      weights: Optional weights forwarded to the metric.

    Returns:
      Whatever `metrics_lib.streaming_accuracy` returns for the int32-cast
      predictions and targets.
    """
    if self._n_classes > 2:
        # Multi-class: the predicted class is the index of the largest logit.
        predicted = nn.top_k(logits, 1)[1]
    else:
        # Binary: a positive logit means the positive class.
        flat_logits = array_ops.reshape(logits, [-1])
        predicted = math_ops.greater(
            flat_logits, array_ops.zeros_like(flat_logits))
        targets = array_ops.reshape(targets, [-1])

    return metrics_lib.streaming_accuracy(
        math_ops.to_int32(predicted), math_ops.to_int32(targets), weights)
def sparsemax(logits, axis=1, number_dim=2, name=None):
    """Computes sparsemax activations [1] over the last axis of `logits`.

    For each row `i` (any combination of leading indices) and class `j`:

    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$

    All leading axes are collapsed into one batch axis, the 2-D algorithm is
    applied, and the result is reshaped back to the input shape. This replaces
    an earlier rank-3 attempt whose mean subtraction and `gather_nd` index
    construction did not broadcast correctly, and drops its debug printing.

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A `Tensor` of rank >= 1. Must be one of the following types:
        `half`, `float32`, `float64`.
      axis: Unused; kept for backward compatibility. The last axis is always
        the class axis.
      number_dim: Unused; kept for backward compatibility.
      name: A name for the operation (optional).

    Returns:
      A `Tensor` with the same type and shape as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]) as name:
        logits = ops.convert_to_tensor(logits, name="Matrix")
        original_shape = array_ops.shape(logits)
        dims = original_shape[-1]

        # Treat every leading index combination as one independent row.
        flat = array_ops.reshape(logits, [-1, dims])
        obs = array_ops.shape(flat)[0]

        # Centre each row; sparsemax is invariant to a per-row shift.
        z = flat - math_ops.reduce_mean(flat, axis=1)[:, array_ops.newaxis]

        # sort z (descending)
        z_sorted, _ = nn.top_k(z, k=dims)

        # calculate k(z)
        z_cumsum = math_ops.cumsum(z_sorted, axis=1)
        k = math_ops.range(
            1, math_ops.cast(dims, logits.dtype) + 1, dtype=logits.dtype)
        z_check = 1 + k * z_sorted > z_cumsum
        # because the z_check vector is always [1,1,...1,0,0,...0] finding the
        # (index + 1) of the last `1` is the same as just summing the number
        # of 1.
        k_z = math_ops.reduce_sum(
            math_ops.cast(z_check, dtypes.int32), axis=1)

        # calculate tau(z): pick z_cumsum[row, k_z - 1] for every row.
        indices = array_ops.stack([math_ops.range(0, obs), k_z - 1], axis=1)
        tau_sum = array_ops.gather_nd(z_cumsum, indices)
        tau_z = (tau_sum - 1) / math_ops.cast(k_z, logits.dtype)

        # calculate p
        p = math_ops.maximum(
            math_ops.cast(0, logits.dtype), z - tau_z[:, array_ops.newaxis])

        # Restore the caller's shape, including any static shape information.
        result = array_ops.reshape(p, original_shape)
        result.set_shape(logits.get_shape())
        return result
def tf_spmax(logits, name=None):
    """Computes sparsemax activations [1] and the associated `spmax` value.

    For each batch `i` and class `j` we have

    $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$

    In addition, for each row the scalar

    $$spmax[i] = 0.5 * (sum_{j in S(i)} z[i, j]^2 - |S(i)| * tau[i]^2) + 0.5$$

    is returned, where `S(i)` is the set of sorted positions that pass the
    support check and `tau[i]` is the sparsemax threshold of row `i`.

    Unlike the mean-centred variants, the logits are used as-is.

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A 2-D `Tensor`. Must be one of the following types: `half`,
        `float32`, `float64`.
      name: A name for the operation (optional).

    Returns:
      A tuple `(spmax_policy, spmax)`. `spmax_policy` has the same type and
      shape as `logits`; `spmax` has one entry per row of `logits` and the
      same type as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        obs = array_ops.shape(logits)[0]
        dims = array_ops.shape(logits)[1]

        z = logits

        # sort z
        z_sorted, _ = nn.top_k(z, k=dims)

        # calculate k(z)
        z_cumsum = math_ops.cumsum(z_sorted, axis=1)
        k = math_ops.range(
            1, math_ops.cast(dims, logits.dtype) + 1, dtype=logits.dtype)
        z_check = 1 + k * z_sorted > z_cumsum
        # because the z_check vector is always [1,1,...1,0,0,...0] finding the
        # (index + 1) of the last `1` is the same as just summing the number
        # of 1.
        k_z = math_ops.reduce_sum(
            math_ops.cast(z_check, dtypes.int32), axis=1)
        k_z_float = math_ops.cast(k_z, logits.dtype)

        # calculate tau(z)
        indices = array_ops.stack([math_ops.range(0, obs), k_z - 1], axis=1)
        tau_sum = array_ops.gather_nd(z_cumsum, indices)
        tau_z = (tau_sum - 1) / k_z_float

        spmax_policy = math_ops.maximum(
            math_ops.cast(0, logits.dtype), z - tau_z[:, array_ops.newaxis])

        # Cast the support mask to the logits dtype (not a fixed float32) so
        # that half / float64 inputs do not hit a dtype mismatch below.
        support = math_ops.cast(z_check, logits.dtype)
        z_square = math_ops.square(z_sorted)
        tau_square = math_ops.square(tau_z)
        spmax = 0.5 * (
            math_ops.reduce_sum(support * z_square, axis=1)
            - k_z_float * tau_square) + 0.5

        return spmax_policy, spmax
def sparsemax(logits, name=None):
    """Computes sparsemax activations [1] on mean-centred logits.

    For each batch `i` and class `j` we have
      sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A `Tensor`. Must be one of the following types: `half`,
        `float32`, `float64`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor`. Has the same type as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        dtype = logits.dtype
        num_rows = array_ops.shape(logits)[0]
        num_classes = array_ops.shape(logits)[1]

        # Shift every row by its mean before thresholding.
        row_mean = math_ops.reduce_mean(logits, axis=1)
        centred = logits - row_mean[:, array_ops.newaxis]

        # Requesting all classes from top_k yields a descending sort.
        descending, _ = nn.top_k(centred, k=num_classes)

        # Support size: number of leading positions r (1-based) for which
        # 1 + r * sorted[r] > cumsum[r].
        running_sum = math_ops.cumsum(descending, axis=1)
        ranks = math_ops.range(
            1, math_ops.cast(num_classes, dtype) + 1, dtype=dtype)
        in_support = 1 + ranks * descending > running_sum
        # The mask looks like [1, ..., 1, 0, ..., 0], so counting the ones
        # gives (index + 1) of the last one.
        support_size = math_ops.reduce_sum(
            math_ops.cast(in_support, dtypes.int32), axis=1)

        # Threshold tau: (cumsum at the last support position - 1) / size.
        row_ids = math_ops.range(0, num_rows)
        last_in_support = array_ops.stack([row_ids, support_size - 1], axis=1)
        support_sum = array_ops.gather_nd(running_sum, last_in_support)
        threshold = (support_sum - 1) / math_ops.cast(support_size, dtype)

        # Projection: clip the shifted logits at zero.
        zero = math_ops.cast(0, dtype)
        return math_ops.maximum(
            zero, centred - threshold[:, array_ops.newaxis])
def _sparsemax(logits, name=None):
    """Computes sparsemax activations [1] using the static shape of `logits`.

    For each batch `i` and class `j` we have
      sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)

    [1]: https://arxiv.org/abs/1602.02068

    :param logits: tensor; its row and class counts are read from
        `logits.shape` rather than from a dynamic shape op
        (NOTE(review): presumably both must be statically known - confirm).
    :param name: optional name for the operation.

    Returns:
      A `Tensor`. Has the same type as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        batch = logits.shape[0]
        width = logits.shape[1]
        out_dtype = logits.dtype

        # Centre each row on its mean.
        shifted = logits - math_ops.reduce_mean(
            logits, axis=1)[:, array_ops.newaxis]

        # Full-width top_k == descending sort of every row.
        ordered, _ = nn.top_k(shifted, k=width)

        # Determine the support size per row.
        prefix_sum = math_ops.cumsum(ordered, axis=1)
        position = math_ops.range(
            1, math_ops.cast(width, out_dtype) + 1, dtype=out_dtype)
        keeps = 1 + position * ordered > prefix_sum
        # keeps is always [1,1,...1,0,0,...0]; summing it equals
        # (index + 1) of the last `1`.
        n_keep = math_ops.reduce_sum(
            math_ops.cast(keeps, dtypes.int32), axis=1)

        # Threshold tau from the prefix sum at the last kept position.
        pick = array_ops.stack([math_ops.range(0, batch), n_keep - 1], axis=1)
        kept_sum = array_ops.gather_nd(prefix_sum, pick)
        tau = (kept_sum - 1) / math_ops.cast(n_keep, out_dtype)

        # Clip the thresholded values at zero.
        floor = math_ops.cast(0, out_dtype)
        return math_ops.maximum(floor, shifted - tau[:, array_ops.newaxis])
def prop_raw(x):
    """Apply the mean-centred sparsemax projection to each row of `x`.

    Args:
      x: A 2-D floating point `Tensor`; rows are processed independently.

    Returns:
      A `Tensor` of the same dtype as `x` holding
      `max(0, z - tau)` where `z` is the row-centred input and `tau` the
      per-row sparsemax threshold.
    """
    n_rows = array_ops.shape(x)[0]
    n_cols = array_ops.shape(x)[1]
    dtype = x.dtype

    centred = x - math_ops.reduce_mean(x, axis=1)[:, array_ops.newaxis]

    # Descending sort via a full-width top_k, then its running total.
    sorted_desc, _ = nn.top_k(centred, k=n_cols)
    cumulative = math_ops.cumsum(sorted_desc, axis=1)

    # Count how many leading sorted entries belong to the support.
    rank = math_ops.range(1, math_ops.cast(n_cols, dtype) + 1, dtype=dtype)
    support_mask = 1 + rank * sorted_desc > cumulative
    support_count = math_ops.reduce_sum(
        math_ops.cast(support_mask, dtypes.int32), axis=1)

    # Read the running total at the last support position of every row.
    gather_index = array_ops.stack(
        [math_ops.range(0, n_rows), support_count - 1], axis=1)
    support_total = array_ops.gather_nd(cumulative, gather_index)
    tau = (support_total - 1) / math_ops.cast(support_count, dtype)

    zero = math_ops.cast(0, dtype)
    return math_ops.maximum(zero, centred - tau[:, array_ops.newaxis])
def loop_fn(i):
    """Run `nn.top_k` (default arguments) on the `i`-th slice of `x`.

    `x` is not a parameter; it is resolved from the enclosing scope.
    """
    return nn.top_k(array_ops.gather(x, i))
def dnn_sampled_softmax_classifier_model_fn(features, target_indices,
                                            mode, params):
    """model_fn that trains with candidate sampling.

    The network is an input layer built from feature columns, a stack of
    fully connected hidden layers, and a logit layer whose weights and biases
    are created explicitly so they can be fed to the sampled softmax loss
    during training and to a full matmul during evaluation and inference.

    Args:
      features: Single Tensor or dict of Tensor (depends on data passed to
        `fit`).
      target_indices: A single Tensor of shape [batch_size, n_labels]
        containing the target indices.
      mode: Represents if this training, evaluation or prediction. See
        `ModeKeys`.
      params: A dict of hyperparameters that are listed below.
        hidden_units- List of hidden units per layer. All layers are fully
          connected. Ex. `[64, 32]` means first layer has 64 nodes and second
          one has 32.
        feature_columns- An iterable containing all the feature columns used
          by the model. All items in the set should be instances of classes
          derived from `FeatureColumn`.
        n_classes- number of target classes. It must be greater than 2.
        n_samples- number of sample target classes. Needs to be tuned - A
          good starting point could be 2% of n_classes.
        n_labels- number of labels in each example.
        top_k- The number of classes to predict.
        optimizer- An instance of `tf.Optimizer` used to train the model. If
          `None`, will use an Adagrad optimizer.
        dropout- When not `None`, the probability we will drop out a given
          coordinate.
        gradient_clip_norm- A float > 0. If provided, gradients are clipped
          to their global norm with this clipping ratio. See
          tf.clip_by_global_norm for more details.
        num_ps_replicas- The number of parameter server replicas.

    Returns:
      A `(predictions, loss, train_op)` tuple, with `None` in the slots that
      do not apply to `mode`:
        TRAIN: `(None, loss, train_op)`
        EVAL:  `(predictions, loss, None)`
        INFER: `(predictions, None, None)`
      For any other `mode` the function returns `None`.
    """
    hidden_units = params["hidden_units"]
    feature_columns = params["feature_columns"]
    n_classes = params["n_classes"]
    n_samples = params["n_samples"]
    n_labels = params["n_labels"]
    top_k = params["top_k"]
    optimizer = params["optimizer"]
    dropout = params["dropout"]
    gradient_clip_norm = params["gradient_clip_norm"]
    num_ps_replicas = params["num_ps_replicas"]

    scope_root = "dnn_ss"
    training = mode == estimator.ModeKeys.TRAIN

    # Input layer, with its own partitioner (minimum slice size 64 << 20).
    input_partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas, min_slice_size=64 << 20)
    with variable_scope.variable_scope(
            scope_root + "/input_from_feature_columns",
            features.values(),
            partitioner=input_partitioner) as input_scope:
        net = layers.input_from_feature_columns(
            features,
            feature_columns,
            weight_collections=[scope_root],
            scope=input_scope)

    # Hidden layers share one partitioner, which the logit layer reuses.
    hidden_partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas)
    use_dropout = dropout is not None and training
    last_hidden_dim = None
    for layer_id, layer_width in enumerate(hidden_units):
        with variable_scope.variable_scope(
                scope_root + "/hiddenlayer_%d" % layer_id, [net],
                partitioner=hidden_partitioner) as layer_scope:
            net = layers.fully_connected(
                net,
                layer_width,
                variables_collections=[scope_root],
                scope=layer_scope)
            last_hidden_dim = layer_width
            # Dropout is only applied while training.
            if use_dropout:
                net = layers.dropout(net, keep_prob=(1.0 - dropout))

    # Logit layer parameters, laid out as [n_classes, hidden_dim] so they can
    # be handed directly to the sampled softmax loss.
    with variable_scope.variable_scope(
            scope_root + "/logits", [net],
            partitioner=hidden_partitioner):
        dtype = net.dtype.base_dtype
        weights = variables.model_variable(
            "weights",
            shape=[n_classes, last_hidden_dim],
            dtype=dtype,
            initializer=initializers.xavier_initializer(),
            trainable=True,
            collections=[scope_root])
        biases = variables.model_variable(
            "biases",
            shape=[n_classes,],
            dtype=dtype,
            initializer=init_ops.zeros_initializer,
            trainable=True,
            collections=[scope_root])

    if training:
        # Draw candidate classes and compute the sampled softmax loss.
        sampled_values = nn.learned_unigram_candidate_sampler(
            true_classes=math_ops.to_int64(target_indices),
            num_true=n_labels,
            num_sampled=n_samples,
            unique=True,
            range_max=n_classes)
        per_example_loss = nn.sampled_softmax_loss(
            weights=weights,
            biases=biases,
            inputs=net,
            labels=math_ops.to_int64(target_indices),
            num_sampled=n_samples,
            num_classes=n_classes,
            num_true=n_labels,
            sampled_values=sampled_values)
        loss = math_ops.reduce_mean(per_example_loss, name="loss")
        train_op = optimizers.optimize_loss(
            loss=loss,
            global_step=contrib_framework.get_global_step(),
            learning_rate=_DEFAULT_LEARNING_RATE,
            optimizer=_get_optimizer(optimizer),
            clip_gradients=gradient_clip_norm,
            name=scope_root)
        return None, loss, train_op

    evaluating = mode == estimator.ModeKeys.EVAL
    if not (evaluating or mode == estimator.ModeKeys.INFER):
        # Unrecognised mode: nothing is built beyond the shared layers.
        return None

    # EVAL and INFER both score every class with the full logit layer.
    logits = nn.bias_add(
        standard_ops.matmul(net, array_ops.transpose(weights)), biases)
    predictions = {
        _PROBABILITIES: nn.softmax(logits),
        _CLASSES: math_ops.argmax(logits, 1),
        _TOP_K: nn.top_k(logits, top_k)[1],
    }
    if not evaluating:
        return predictions, None, None

    # Since the targets have multiple labels, give each label a target
    # probability of 1.0 / n_labels and sum the one-hot rows per example.
    target_one_hot = array_ops.one_hot(
        indices=target_indices, depth=n_classes, on_value=1.0 / n_labels)
    target_distribution = math_ops.reduce_sum(
        input_tensor=target_one_hot, reduction_indices=[1])
    loss = math_ops.reduce_mean(
        nn.softmax_cross_entropy_with_logits(logits, target_distribution))
    return predictions, loss, None
import tensorflow as tf
import tensorflow.contrib as tfc
from tensorflow.python.ops import nn

# Small comparison script: print the top-1 indices next to the output of two
# recall@k metric implementations for the same labels and scores.
# Eager execution is left disabled; the script drives everything through an
# explicit tf.Session.
with tf.Session() as sess:
    labels = tf.constant(value=[1, 0, 2], dtype=tf.int64)
    probs = tf.constant(value=[
        [0.8, 0.93, 0.2, 0.1],
        [0.82, 0, 0.1, 0.83],
        [0.92, 0.1, 0.90, 0.3],
    ])

    top1_indices = nn.top_k(probs, k=1)[1]
    core_recall = tf.metrics.recall_at_k(
        predictions=probs, labels=labels, k=1)
    contrib_recall = tfc.metrics.streaming_sparse_recall_at_k(
        predictions=probs, labels=labels, k=1)

    # Run both the global and the local variable initializers before
    # fetching the metrics.
    for init_op in (tf.global_variables_initializer(),
                    tf.local_variables_initializer()):
        sess.run(init_op)

    for fetch in (top1_indices, core_recall, contrib_recall):
        print(sess.run(fetch))
    print('done')
def create_batch(self):
    """Create queues to window and batch time series data.

    Features from `self._reader.read()` are batched into runs of
    `window_size * batch_size` rows (plus an optional jitter row), reshaped
    into `[batch_size, window_size, ...]` windows, then either filtered for
    out-of-order windows or re-ordered by time, and finally handed to
    `input_lib.maybe_shuffle_batch`.

    Returns:
      A tuple `(features_batched, None)`. `features_batched` is a dictionary
      of Tensors corresponding to the output of `self._reader` (from the
      `time_series_reader` constructor argument), each with shapes prefixed
      by [`batch_size`, `window_size`].
    """
    features = self._reader.read()
    if self._jitter:
        # TODO(agarwal, allenl): Figure out if more jitter is needed here.
        # Scalar int32 drawn with maxval=2; it is added to the batch size
        # below and the same number of leading rows is sliced off afterwards.
        jitter = random_ops.random_uniform(
            shape=[], maxval=2, dtype=dtypes.int32)
    else:
        jitter = 0
    # To keep things efficient, we pass from the windowing batcher to the
    # batch-of-windows batcher in batches. This avoids the need for huge
    # numbers of threads, but does mean that jitter is only applied
    # occasionally.
    # TODO(allenl): Experiment with different internal passing sizes.
    internal_passing_size = self._batch_size
    features_windowed = input_lib.batch(
        features,
        batch_size=self._window_size * internal_passing_size + jitter,
        enqueue_many=True,
        capacity=(self._queue_capacity_multiplier * internal_passing_size *
                  self._window_size),
        num_threads=self._num_threads)
    # Keep a handle on the un-sliced tensors: their static shapes are used
    # further down to restore the trailing dimensions after the reshape.
    raw_features_windowed = features_windowed
    if self._jitter:
        # Drop the first `jitter` rows so exactly
        # window_size * internal_passing_size rows remain.
        features_windowed = {
            key: value[jitter:]
            for key, value in features_windowed.items()}
    # Fold the flat run of rows into [internal_passing_size, window_size, ...].
    features_windowed = {
        key: array_ops.reshape(
            value,
            array_ops.concat(
                [[internal_passing_size, self._window_size],
                 array_ops.shape(value)[1:]],
                axis=0))
        for key, value in features_windowed.items()}
    # The reshape above uses a dynamic shape, so re-attach the static shape.
    batch_and_window_shape = tensor_shape.TensorShape(
        [internal_passing_size, self._window_size])
    for key in features_windowed.keys():
        features_windowed[key].set_shape(
            batch_and_window_shape.concatenate(
                raw_features_windowed[key].get_shape()[1:]))
    # When switching files, we may end up with windows where the time is not
    # non-decreasing, even if times within each file are sorted (and even if
    # those files are visited in order, when looping back around to the
    # beginning of the first file). This is hard for models to deal with, so
    # we either discard such examples, creating a bias where the beginning
    # and end of the series is under-sampled, or we sort the window, creating
    # large gaps.
    times = features_windowed[feature_keys.TrainEvalFeatures.TIMES]
    if self._discard_out_of_order:
        # One boolean per window: True when every step in the window has a
        # time >= the previous step.
        non_decreasing = math_ops.reduce_all(
            times[:, 1:] >= times[:, :-1], axis=1)
        # Ensure that no more than self._discard_limit complete batches are
        # discarded contiguously (resetting the count when we find a single
        # clean window). This prevents infinite looping when the dataset is
        # smaller than the window size.
        # TODO(allenl): Figure out a way to return informative errors from
        # count_up_to.
        discarded_windows_limiter = variable_scope.variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            name="discarded_windows_limiter",
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES])

        def _initialized_limit_check():
            # Reset the counter when any window is clean; otherwise count
            # one more fully discarded batch towards the limit.
            return control_flow_ops.cond(
                math_ops.reduce_any(non_decreasing),
                lambda: state_ops.assign(discarded_windows_limiter, 0),
                lambda: discarded_windows_limiter.count_up_to(
                    self._discard_limit))

        # Only touch the counter once it has been initialized; before that
        # a constant zero is produced instead.
        discard_limit_op = control_flow_ops.cond(
            state_ops.is_variable_initialized(discarded_windows_limiter),
            _initialized_limit_check,
            lambda: constant_op.constant(0, dtype=dtypes.int64))
        # Tie the counter update to the mask so it runs whenever the mask
        # is evaluated.
        with ops.control_dependencies([discard_limit_op]):
            non_decreasing = array_ops.identity(non_decreasing)
    else:
        # Full-width top_k gives, per window, the indices that sort the
        # times in descending order.
        _, indices_descending = nn.top_k(
            times, k=array_ops.shape(times)[-1], sorted=True)
        # NOTE(review): `reverse` acts on axis 0 and `gather` indexes axis 0
        # of each feature with these 2-D indices; confirm this yields the
        # intended per-window ascending-time ordering.
        indices = array_ops.reverse(indices_descending, axis=[0])
        features_windowed = {
            key: array_ops.gather(params=value, indices=indices)
            for key, value in features_windowed.items()
        }
        # Nothing is filtered in this branch: keep every window.
        non_decreasing = True
    features_batched = input_lib.maybe_shuffle_batch(
        features_windowed,
        num_threads=self._num_threads,
        seed=self._shuffle_seed,
        batch_size=self._batch_size,
        capacity=self._queue_capacity_multiplier * self._batch_size,
        min_after_dequeue=(self._shuffle_min_after_dequeue_multiplier *
                           self._batch_size),
        keep_input=non_decreasing,
        enqueue_many=True)
    return (features_batched, None)
def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
                   resampling_temperature, partition_strategy):
    """A helper function for rank_sampled_softmax_loss.

    Scores every class i in `sampled_values` with

        log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))

    where w_i and b_i are the weight and bias of class i and j ranges over
    the rows of `inputs`, then keeps the `num_resampled` best-scoring classes.

    The score is evaluated in the rearranged form

        log(sum_j exp(w_i * (x_j / resampling_temperature)))
            + b_i / resampling_temperature

    i.e. as the batched computation

        reduce_logsumexp(
            matmul(embeddings, transpose(inputs / resampling_temperature)))
        + biases / resampling_temperature

    The first term is colocated with the embeddings by passing it as
    `transform_fn` to `embedding_ops._embedding_lookup_and_transform`. The
    second term, not the bottleneck, is computed at the worker.

    Args:
      weights: From `rank_sampled_softmax_loss`.
      biases: From `rank_sampled_softmax_loss`.
      inputs: From `rank_sampled_softmax_loss`.
      sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler`
        function.
      num_resampled: An `int`. This many values are selected from
        `sampled_values` using the adaptive resampling algorithm. The caller
        must ensure that `num_resampled` is less than the size of
        `sampled_values`.
      resampling_temperature: A scalar `Tensor` with the temperature
        parameter for the adaptive resampling algorithm.
      partition_strategy: From `rank_sampled_softmax_loss`.

    Returns:
      A tuple of (`resampled_candidates`, `true_expected_count`,
      `resampled_expected_count`), similar to `sampled_values` but sampled
      down to `num_resampled` values.
    """
    # A Tensor would also work for num_resampled, but every caller passes an
    # int, so that is what the signature documents. If this function is ever
    # externalized, the docs should be widened to mention Tensor support.

    candidates, true_expected_count, sampled_expected_count = sampled_values
    # Sampler outputs are treated as constants for differentiation.
    candidates = math_ops.cast(
        array_ops.stop_gradient(candidates), dtypes.int64)
    true_expected_count = array_ops.stop_gradient(true_expected_count)
    sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)

    tempered_inputs = inputs / resampling_temperature

    def logsumexp_logit(embeddings):
        scores = math_ops.matmul(
            embeddings, tempered_inputs, transpose_b=True)
        return math_ops.reduce_logsumexp(scores, axis=1, keepdims=False)

    # The protected embedding lookup lets the logsumexp run next to the
    # partitioned weights, which yields a large speedup in practice.
    weight_scores = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
        weights, candidates, partition_strategy, transform_fn=logsumexp_logit)
    bias_scores = array_ops.reshape(
        embedding_ops.embedding_lookup(biases, candidates, partition_strategy),
        [-1])
    candidate_scores = weight_scores + bias_scores / resampling_temperature

    # Keep the best-scoring candidates; their order is not needed.
    _, keep_indices = nn.top_k(
        candidate_scores, k=num_resampled, sorted=False)
    resampled = array_ops.gather(candidates, indices=keep_indices)
    resampled_expected_count = array_ops.gather(
        sampled_expected_count, indices=keep_indices)

    return resampled, true_expected_count, resampled_expected_count
def model(a):
    """Return the indices component of `nn.top_k(a, topn)`.

    `topn` is not a parameter; it is resolved from the enclosing scope.
    """
    top = nn.top_k(a, topn)
    return top[1]
def create_batch(self):
    """Create queues to window and batch time series data.

    Features from `self._reader.read()` are batched into runs of
    `window_size * batch_size` rows (plus an optional jitter row), reshaped
    into `[batch_size, window_size, ...]` windows, then either filtered for
    out-of-order windows or re-ordered by time, and finally handed to
    `input_lib.maybe_shuffle_batch`.

    Returns:
      A tuple `(features_batched, None)`. `features_batched` is a dictionary
      of Tensors corresponding to the output of `self._reader` (from the
      `time_series_reader` constructor argument), each with shapes prefixed
      by [`batch_size`, `window_size`].
    """
    features = self._reader.read()
    if self._jitter:
        # TODO(agarwal, allenl): Figure out if more jitter is needed here.
        # Scalar int32 drawn with maxval=2; it is added to the batch size
        # below and the same number of leading rows is sliced off afterwards.
        jitter = random_ops.random_uniform(
            shape=[], maxval=2, dtype=dtypes.int32)
    else:
        jitter = 0
    # To keep things efficient, we pass from the windowing batcher to the
    # batch-of-windows batcher in batches. This avoids the need for huge
    # numbers of threads, but does mean that jitter is only applied
    # occasionally.
    # TODO(allenl): Experiment with different internal passing sizes.
    internal_passing_size = self._batch_size
    features_windowed = input_lib.batch(
        features,
        batch_size=self._window_size * internal_passing_size + jitter,
        enqueue_many=True,
        capacity=(self._queue_capacity_multiplier * internal_passing_size *
                  self._window_size),
        num_threads=self._num_threads)
    # Keep a handle on the un-sliced tensors: their static shapes are used
    # further down to restore the trailing dimensions after the reshape.
    raw_features_windowed = features_windowed
    if self._jitter:
        # Drop the first `jitter` rows so exactly
        # window_size * internal_passing_size rows remain.
        features_windowed = {
            key: value[jitter:]
            for key, value in features_windowed.items()
        }
    # Fold the flat run of rows into [internal_passing_size, window_size, ...].
    features_windowed = {
        key: array_ops.reshape(
            value,
            array_ops.concat([[internal_passing_size, self._window_size],
                              array_ops.shape(value)[1:]],
                             axis=0))
        for key, value in features_windowed.items()
    }
    # The reshape above uses a dynamic shape, so re-attach the static shape.
    batch_and_window_shape = tensor_shape.TensorShape(
        [internal_passing_size, self._window_size])
    for key in features_windowed.keys():
        features_windowed[key].set_shape(
            batch_and_window_shape.concatenate(
                raw_features_windowed[key].get_shape()[1:]))
    # When switching files, we may end up with windows where the time is not
    # non-decreasing, even if times within each file are sorted (and even if
    # those files are visited in order, when looping back around to the
    # beginning of the first file). This is hard for models to deal with, so
    # we either discard such examples, creating a bias where the beginning
    # and end of the series is under-sampled, or we sort the window, creating
    # large gaps.
    times = features_windowed[feature_keys.TrainEvalFeatures.TIMES]
    if self._discard_out_of_order:
        # One boolean per window: True when every step in the window has a
        # time >= the previous step.
        non_decreasing = math_ops.reduce_all(times[:, 1:] >= times[:, :-1],
                                             axis=1)
        # Ensure that no more than self._discard_limit complete batches are
        # discarded contiguously (resetting the count when we find a single
        # clean window). This prevents infinite looping when the dataset is
        # smaller than the window size.
        # TODO(allenl): Figure out a way to return informative errors from
        # count_up_to.
        discarded_windows_limiter = variable_scope.variable(
            initial_value=constant_op.constant(0, dtype=dtypes.int64),
            name="discarded_windows_limiter",
            trainable=False,
            collections=[ops.GraphKeys.LOCAL_VARIABLES])

        def _initialized_limit_check():
            # Reset the counter when any window is clean; otherwise count
            # one more fully discarded batch towards the limit.
            return control_flow_ops.cond(
                math_ops.reduce_any(non_decreasing),
                lambda: state_ops.assign(discarded_windows_limiter, 0),
                lambda: discarded_windows_limiter.count_up_to(
                    self._discard_limit))

        # Only touch the counter once it has been initialized; before that
        # a constant zero is produced instead.
        discard_limit_op = control_flow_ops.cond(
            state_ops.is_variable_initialized(discarded_windows_limiter),
            _initialized_limit_check,
            lambda: constant_op.constant(0, dtype=dtypes.int64))
        # Tie the counter update to the mask so it runs whenever the mask
        # is evaluated.
        with ops.control_dependencies([discard_limit_op]):
            non_decreasing = array_ops.identity(non_decreasing)
    else:
        # Full-width top_k gives, per window, the indices that sort the
        # times in descending order.
        _, indices_descending = nn.top_k(times,
                                         k=array_ops.shape(times)[-1],
                                         sorted=True)
        # NOTE(review): `reverse` acts on axis 0 and `gather` indexes axis 0
        # of each feature with these 2-D indices; confirm this yields the
        # intended per-window ascending-time ordering.
        indices = array_ops.reverse(indices_descending, axis=[0])
        features_windowed = {
            key: array_ops.gather(params=value, indices=indices)
            for key, value in features_windowed.items()
        }
        # Nothing is filtered in this branch: keep every window.
        non_decreasing = True
    features_batched = input_lib.maybe_shuffle_batch(
        features_windowed,
        num_threads=self._num_threads,
        seed=self._shuffle_seed,
        batch_size=self._batch_size,
        capacity=self._queue_capacity_multiplier * self._batch_size,
        min_after_dequeue=(self._shuffle_min_after_dequeue_multiplier *
                           self._batch_size),
        keep_input=non_decreasing,
        enqueue_many=True)
    return (features_batched, None)
def model(a):
    """Return the indices selected by `nn.top_k(a, topn)`.

    `topn` is read from the enclosing scope; the values half of the result
    is discarded.
    """
    return nn.top_k(a, topn)[1]
def sparsemax(logits, name=None):
    """Computes sparsemax activations [1], returning NaN rows for bad input.

    For each batch `i` and class `j` we have
      $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$

    Rows whose support size comes out as zero, or whose cumulative sum is
    NaN, are replaced entirely by NaN.

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A `Tensor`. Must be one of the following types: `half`,
        `float32`, `float64`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor`. Has the same type as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        dtype = logits.dtype
        num_rows = array_ops.shape(logits)[0]
        num_classes = array_ops.shape(logits)[1]

        # The paper calls the logits z. Subtracting the row mean would keep
        # the cumulative sum near zero and so be slightly more stable, but
        # in practice the instability is minor and the subtraction creates
        # extra problems with inf and nan input, so z is used unshifted.
        z = logits

        # Full-width top_k sorts each row in descending order.
        sorted_z, _ = nn.top_k(z, k=num_classes)

        # Support size k(z).
        running_sum = math_ops.cumsum(sorted_z, axis=1)
        ranks = math_ops.range(
            1, math_ops.cast(num_classes, dtype) + 1, dtype=dtype)
        in_support = 1 + ranks * sorted_z > running_sum
        # The mask is always [1,1,...1,0,0,...0], so its sum equals
        # (index + 1) of the last `1`.
        support_size = math_ops.reduce_sum(
            math_ops.cast(in_support, dtypes.int32), axis=1)

        # Threshold tau(z).
        # With inf values, or when every value is -inf, the support size is
        # zero. That is mathematically invalid and would make gather_nd
        # fail, so the gather index is clamped to a support size of at
        # least one; such rows are turned into NaN further down, which
        # results in the same behavior as softmax.
        clamped_size = math_ops.maximum(support_size, 1)
        row_ids = math_ops.range(0, num_rows)
        gather_index = array_ops.stack([row_ids, clamped_size - 1], axis=1)
        support_sum = array_ops.gather_nd(running_sum, gather_index)
        tau = (support_sum - 1) / math_ops.cast(support_size, dtype)

        # Projection p.
        zero = math_ops.cast(0, dtype)
        projected = math_ops.maximum(zero, z - tau[:, array_ops.newaxis])

        # A zero support size or a NaN total marks the row as invalid.
        invalid_row = math_ops.logical_or(
            math_ops.equal(support_size, 0),
            math_ops.is_nan(running_sum[:, -1]))
        all_nan = array_ops.fill(
            [num_rows, num_classes], math_ops.cast(float("nan"), dtype))
        return array_ops.where(invalid_row, all_nan, projected)
def model(a):
    """Return the `nn.top_k` result for `a` with `k=10`, sorted output."""
    top_ten = nn.top_k(a, k=10, sorted=True)
    return top_ten
def model(a, k):
    """Return the `nn.top_k` result for `a` with caller-supplied `k`.

    Sorted output is always requested.
    """
    top_entries = nn.top_k(a, k=k, sorted=True)
    return top_entries
def sparsemax(logits, name=None):
    """Computes sparsemax activations [1] with NaN output for invalid rows.

    For each batch `i` and class `j` we have
      $$sparsemax[i, j] = max(logits[i, j] - tau(logits[i, :]), 0)$$

    A row is reported as all-NaN when its support size evaluates to zero or
    when the last entry of its cumulative sum is NaN.

    [1]: https://arxiv.org/abs/1602.02068

    Args:
      logits: A `Tensor`. Must be one of the following types: `half`,
        `float32`, `float64`.
      name: A name for the operation (optional).

    Returns:
      A `Tensor`. Has the same type as `logits`.
    """
    with ops.name_scope(name, "sparsemax", [logits]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        out_dtype = logits.dtype
        batch = array_ops.shape(logits)[0]
        width = array_ops.shape(logits)[1]

        # z is the paper's name for the logits. They are deliberately not
        # mean-centred: centring would keep the cumulative sum close to zero
        # (a small stability gain), but the instability is very minor in
        # practice and centring causes extra issues with inf and nan input.
        z = logits

        # Sort every row, largest first.
        ordered, _ = nn.top_k(z, k=width)

        # k(z): how many leading sorted entries are in the support.
        prefix_sum = math_ops.cumsum(ordered, axis=1)
        position = math_ops.range(
            1, math_ops.cast(width, out_dtype) + 1, dtype=out_dtype)
        keeps = 1 + position * ordered > prefix_sum
        # keeps always has the form [1,1,...1,0,0,...0]; counting its ones
        # is the same as finding (index + 1) of the last `1`.
        n_keep = math_ops.reduce_sum(
            math_ops.cast(keeps, dtypes.int32), axis=1)

        # tau(z). inf values (or a row of only -inf) give n_keep == 0, which
        # is mathematically invalid and would break gather_nd. Index with a
        # count of at least one here; the affected rows are overwritten with
        # NaN below, matching what softmax does.
        n_keep_safe = math_ops.maximum(n_keep, 1)
        pick = array_ops.stack(
            [math_ops.range(0, batch), n_keep_safe - 1], axis=1)
        kept_sum = array_ops.gather_nd(prefix_sum, pick)
        tau = (kept_sum - 1) / math_ops.cast(n_keep, out_dtype)

        # p: thresholded logits clipped at zero.
        floor = math_ops.cast(0, out_dtype)
        p = math_ops.maximum(floor, z - tau[:, array_ops.newaxis])

        # Rows with n_keep == 0 or a NaN total are invalid input.
        row_is_invalid = math_ops.logical_or(
            math_ops.equal(n_keep, 0), math_ops.is_nan(prefix_sum[:, -1]))
        nan_rows = array_ops.fill(
            [batch, width], math_ops.cast(float("nan"), out_dtype))
        p_safe = array_ops.where(row_is_invalid, nan_rows, p)
        return p_safe
def _rank_resample(weights, biases, inputs, sampled_values, num_resampled,
                   resampling_temperature, partition_strategy):
    """A helper function for rank_sampled_softmax_loss.

    For each class i in `sampled_values` this computes

        log(sum_j exp((w_i * x_j + b_i) / resampling_temperature))

    with w_i, b_i the weight and bias of the i-th class and j ranging over
    the rows of `inputs`, and then retains the `num_resampled` classes with
    the largest value.

    For efficiency the expression is rearranged to

        log(sum_j exp(w_i * (x_j / resampling_temperature)))
            + b_i / resampling_temperature

    which maps onto these tensorflow ops:

        reduce_logsumexp(
            matmul(embeddings, transpose(inputs / resampling_temperature)))
        + biases / resampling_temperature

    The first term is colocated with the embeddings via the `transform_fn`
    argument of `embedding_ops._embedding_lookup_and_transform`. The second
    term, not the bottleneck, is computed at the worker.

    Args:
      weights: From `rank_sampled_softmax_loss`.
      biases: From `rank_sampled_softmax_loss`.
      inputs: From `rank_sampled_softmax_loss`.
      sampled_values: A tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler`
        function.
      num_resampled: An `int`. This many values are selected from
        `sampled_values` using the adaptive resampling algorithm. The caller
        must ensure that `num_resampled` is less than the size of
        `sampled_values`.
      resampling_temperature: A scalar `Tensor` with the temperature
        parameter for the adaptive resampling algorithm.
      partition_strategy: From `rank_sampled_softmax_loss`.

    Returns:
      A tuple of (`resampled_candidates`, `true_expected_count`,
      `resampled_expected_count`), similar to `sampled_values` but sampled
      down to `num_resampled` values.
    """
    # num_resampled could also be a Tensor; only ints are passed today, so
    # the argument list documents an int. If this function is ever
    # externalized, the doc should be changed to support Tensor.

    sampled_ids, true_expected_count, sampled_expected_count = sampled_values
    # No gradients flow back into the sampler outputs.
    sampled_ids = math_ops.cast(
        array_ops.stop_gradient(sampled_ids), dtypes.int64)
    true_expected_count = array_ops.stop_gradient(true_expected_count)
    sampled_expected_count = array_ops.stop_gradient(sampled_expected_count)

    scaled_inputs = inputs / resampling_temperature

    def logsumexp_logit(embeddings):
        pairwise = math_ops.matmul(
            embeddings, scaled_inputs, transpose_b=True)
        return math_ops.reduce_logsumexp(pairwise, axis=1, keepdims=False)

    # Using the protected lookup co-locates the logsumexp with the
    # partitioned weights, which yields a large speedup in practice.
    logit_term = embedding_ops._embedding_lookup_and_transform(  # pylint: disable=protected-access
        weights, sampled_ids, partition_strategy,
        transform_fn=logsumexp_logit)
    bias_term = array_ops.reshape(
        embedding_ops.embedding_lookup(
            biases, sampled_ids, partition_strategy),
        [-1])
    ranking_scores = logit_term + bias_term / resampling_temperature

    # Select the highest-scoring candidates; ordering is irrelevant.
    _, chosen = nn.top_k(ranking_scores, k=num_resampled, sorted=False)
    resampled = array_ops.gather(sampled_ids, indices=chosen)
    resampled_expected_count = array_ops.gather(
        sampled_expected_count, indices=chosen)

    return resampled, true_expected_count, resampled_expected_count