def testScalar(self):
  with self.session(use_gpu=True):
    with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"):
      nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=constant_op.constant(0), logits=constant_op.constant(1.0))
def xent_grad(f):
  if not context.executing_eagerly():
    return gradients_impl.gradients(
        nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=l, logits=f, name="xent"), [f])[0]
  with backprop_lib.GradientTape() as tape:
    tape.watch(f)
    return tape.gradient(
        nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=l, logits=f, name="xent"), [f])[0]
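# A minimal, hypothetical driver for the xent_grad helper above (not from the
# original source). `l` is a free variable that xent_grad captures from its
# enclosing scope, so this sketch binds it before calling.
l = constant_op.constant([1, 0])                    # labels, shape [2]
f = constant_op.constant([[0.1, 0.9], [0.4, 0.6]])  # logits, shape [2, 2]
grad = xent_grad(f)  # d(sum of per-example xent)/d(logits), same shape as f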
def generate_single_output(encoder_state, attention_states, sequence_length,
                           targets, num_classes, buckets,
                           use_mean_attention=False,
                           softmax_loss_function=None, per_example_loss=False,
                           name=None, use_attention=False):
  all_inputs = targets
  with tf.name_scope(name, "model_with_buckets", all_inputs):
    with variable_scope.variable_scope(
        variable_scope.get_variable_scope(), reuse=None):
      (bucket_attention_states, bucket_attn_weights, bucket_attns,
       bucket_outputs) = attention_single_output_decoder(
           encoder_state, attention_states, output_size=num_classes,
           num_heads=1, sequence_length=sequence_length,
           initial_state_attention=True, use_attention=use_attention)
      if softmax_loss_function is None:
        assert len(bucket_outputs) == len(targets) == 1
        # We need to make target an int64 tensor and set its shape.
        bucket_target = array_ops.reshape(math_ops.to_int64(targets[0]),
                                          [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=bucket_target, logits=bucket_outputs[0])
      else:
        assert len(bucket_outputs) == len(targets) == 1
        crossent = softmax_loss_function(bucket_outputs[0], targets[0])
      batch_size = array_ops.shape(targets[0])[0]
      loss = tf.reduce_sum(crossent) / math_ops.cast(batch_size,
                                                     dtypes.float32)
  return bucket_outputs, loss
def body(i, prev_c, prev_h, actions, log_probs):
  # pylint: disable=g-long-lambda
  signal = control_flow_ops.cond(
      math_ops.equal(i, 0),
      lambda: array_ops.tile(device_go_embedding,
                             [self.hparams.num_children, 1]),
      lambda: embedding_ops.embedding_lookup(device_embeddings,
                                             actions.read(i - 1)))
  if self.hparams.keep_prob is not None:
    signal = nn_ops.dropout(signal, rate=(1 - self.hparams.keep_prob))
  next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
  query = math_ops.matmul(next_h, attn_w_2)
  query = array_ops.reshape(
      query, [self.hparams.num_children, 1, self.hparams.hidden_size])
  query = math_ops.tanh(query + attn_mem)
  query = array_ops.reshape(query, [
      self.hparams.num_children * self.num_groups, self.hparams.hidden_size
  ])
  query = math_ops.matmul(query, attn_v)
  query = array_ops.reshape(query,
                            [self.hparams.num_children, self.num_groups])
  query = nn_ops.softmax(query)
  query = array_ops.reshape(query,
                            [self.hparams.num_children, self.num_groups, 1])
  query = math_ops.reduce_sum(attn_mem * query, axis=1)
  query = array_ops.concat([next_h, query], axis=1)
  logits = math_ops.matmul(query, device_softmax)
  logits /= self.hparams.temperature
  if self.hparams.tanh_constant > 0:
    logits = math_ops.tanh(logits) * self.hparams.tanh_constant
  if self.hparams.logits_std_noise > 0:
    num_in_logits = math_ops.cast(
        array_ops.size(logits), dtype=dtypes.float32)
    avg_norm = math_ops.divide(
        linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
    logits_noise = random_ops.random_normal(
        array_ops.shape(logits),
        stddev=self.hparams.logits_std_noise * avg_norm)
    logits = control_flow_ops.cond(
        self.global_step > self.hparams.stop_noise_step, lambda: logits,
        lambda: logits + logits_noise)
  if mode == "sample":
    next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
  elif mode == "greedy":
    next_y = math_ops.argmax(logits, 1)
  elif mode == "target":
    next_y = array_ops.slice(y, [0, i], [-1, 1])
  else:
    raise NotImplementedError
  next_y = math_ops.cast(next_y, dtypes.int32)
  next_y = array_ops.reshape(next_y, [self.hparams.num_children])
  actions = actions.write(i, next_y)
  log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=next_y)
  return i + 1, next_c, next_h, actions, log_probs
def __init__(self, args, training=True):
  self.args = args
  if not training:
    args.batch_size = 1
    args.seq_length = 1
  self.input_data = tf.placeholder(tf.int32,
                                   [args.batch_size, args.seq_length])
  self.targets = tf.placeholder(tf.int32,
                                [args.batch_size, args.seq_length])
  embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
  cells = []
  for _ in range(args.rnn_layers):
    cells.append(rnn.BasicLSTMCell(args.rnn_size))
  self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)
  dense_layer_w = tf.get_variable("dense_layer_w",
                                  [args.rnn_size, args.vocab_size])
  dense_layer_b = tf.get_variable("dense_layer_b", [args.vocab_size])
  inputs = tf.nn.embedding_lookup(embedding, self.input_data)
  inputs = tf.split(inputs, args.seq_length, 1)
  inputs = [tf.squeeze(ip, [1]) for ip in inputs]
  self.initial_state = cell.zero_state(args.batch_size, tf.float32)
  outputs, self.final_state = legacy_seq2seq.rnn_decoder(
      inputs, self.initial_state, cell)
  output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
  logits = tf.matmul(output, dense_layer_w) + dense_layer_b
  self.probs = tf.nn.softmax(logits)
  self.predicted_output = tf.reshape(
      tf.argmax(self.probs, 1), [args.batch_size, args.seq_length])
  self.lr = tf.Variable(0.0, trainable=False)
  loss = sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=tf.reshape(self.targets, [-1]))
  self.cost = tf.reduce_mean(loss)
  self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
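# Hypothetical instantiation of the model above. The class name `Model` and
# the argparse-style args container are assumptions; any object exposing
# these attributes would do.
import argparse
args = argparse.Namespace(batch_size=32, seq_length=50, vocab_size=65,
                          rnn_size=128, rnn_layers=2)
model = Model(args, training=True)  # builds placeholders, loss and optimizer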
def generate_single_output(encoder_state, attention_states, sequence_length,
                           targets, num_classes, buckets,
                           use_mean_attention=False,
                           softmax_loss_function=None, per_example_loss=False,
                           name=None, use_attention=False):
  all_inputs = targets
  with ops.op_scope(all_inputs, name, "model_with_buckets"):
    with variable_scope.variable_scope(
        variable_scope.get_variable_scope(), reuse=None):
      (bucket_attention_states, bucket_attn_weights, bucket_attns,
       bucket_outputs) = attention_single_output_decoder(
           encoder_state, attention_states, output_size=num_classes,
           num_heads=1, sequence_length=sequence_length,
           initial_state_attention=True, use_attention=use_attention)
      if softmax_loss_function is None:
        assert len(bucket_outputs) == len(targets) == 1
        # We need to make target an int64 tensor and set its shape.
        bucket_target = array_ops.reshape(math_ops.to_int64(targets[0]),
                                          [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=bucket_outputs[0], labels=bucket_target)
      else:
        assert len(bucket_outputs) == len(targets) == 1
        crossent = softmax_loss_function(bucket_outputs[0], targets[0])
      batch_size = array_ops.shape(targets[0])[0]
      loss = tf.reduce_sum(crossent) / math_ops.cast(batch_size,
                                                     dtypes.float32)
  return bucket_outputs, loss
def sequence_loss_tensor(logits, targets, weights, num_classes,
                         average_across_timesteps=True,
                         softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example)."""
  # if (logits.get_shape()[0:2]) != targets.get_shape() \
  #         or (logits.get_shape()[0:2]) != weights.get_shape():
  #     print(logits.get_shape()[0:2])
  #     print(targets.get_shape())
  #     print(weights.get_shape())
  #     raise ValueError("Shapes of logits, weights, and targets must be the "
  #                      "same")
  with ops.op_scope([logits, targets, weights], name,
                    "sequence_loss_by_example"):
    probs_flat = tf.reshape(logits, [-1, num_classes])
    targets = tf.reshape(targets, [-1])
    if softmax_loss_function is None:
      crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=probs_flat)
    else:
      crossent = softmax_loss_function(probs_flat, targets)
    crossent = crossent * tf.reshape(weights, [-1])
    crossent = tf.reduce_sum(crossent)
    total_size = math_ops.reduce_sum(weights)
    total_size += 1e-12  # to avoid division by zero
    crossent /= total_size
    return crossent
def _log_prob(self, k):
  k = ops.convert_to_tensor(k, name="k")
  if self.validate_args:
    k = distribution_util.embed_check_integer_casting_closed(
        k, target_dtype=dtypes.int32)
  if self.logits.get_shape()[:-1] == k.get_shape():
    logits = self.logits
  else:
    logits = self.logits * array_ops.ones_like(
        array_ops.expand_dims(k, -1), dtype=self.logits.dtype)
    logits_shape = array_ops.shape(logits)[:-1]
    k *= array_ops.ones(logits_shape, dtype=k.dtype)
    k.set_shape(tensor_shape.TensorShape(logits.get_shape()[:-1]))
  if k.dtype.is_integer:
    pass
  elif k.dtype.is_floating:
    # When `validate_args=True` we've already ensured int/float casting
    # is closed.
    k = math_ops.cast(k, dtype=dtypes.int32)
  else:
    raise TypeError("`value` should have integer `dtype` or "
                    "`self.dtype` ({})".format(self.dtype.base_dtype))
  return -nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=k, logits=logits)
def testInt32GPU(self):
  if not context.context().num_gpus():
    self.skipTest('No GPUs found')
  with ops.device('gpu:0'):
    xent = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=[[0.0, 0.0]], labels=[0])
  self.assertAllClose(xent, [0.69314718])
def MYsequence_loss_by_example(logits, targets, weights,
                               average_across_timesteps=True,
                               softmax_loss_function=None, name=None):
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the "
                     "same %d, %d, %d." %
                     (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        # TODO(irving,ebrevdo): This reshape is needed because
        # sequence_loss_by_example is called with scalars sometimes, which
        # violates our general scalar strictness policy.
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=target, logits=logit)
      else:
        crossent = softmax_loss_function(logit, target)
      print(crossent, weight)
      log_perp_list.append(crossent * weight)
    print(log_perp_list)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
def seq2seq_loss(logits, targets, seq_len_target):
  """Calculate the cross entropy loss w.r.t. given target.

  Args:
    logits: A 2-d tensor of shape (TxB)x|V| containing the logit score
      per output symbol.
    targets: 2-d tensor of shape TxB that contains the ground truth output
      symbols.
    seq_len_target: Sequence length of output sequences. Required to mask
      padding symbols in output sequences.
  """
  with ops.name_scope("sequence_loss", [logits, targets]):
    flat_targets = tf.reshape(targets, [-1])
    cost = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=flat_targets)
    # Mask this cost since the output sequence is padded
    batch_major_mask = tf.sequence_mask(seq_len_target, dtype=tf.float32)
    time_major_mask = tf.transpose(batch_major_mask, [1, 0])
    weights = tf.reshape(time_major_mask, [-1])
    mask_cost = weights * cost
    loss = tf.reshape(mask_cost, tf.shape(targets))
    # Average the loss for each example by the # of timesteps
    cost_per_example = tf.reduce_sum(loss, reduction_indices=0) / \
        tf.cast(seq_len_target, tf.float32)
    # Return the average cost over all examples
    return tf.reduce_mean(cost_per_example)
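# A minimal sketch (hypothetical data) of driving seq2seq_loss with
# time-major inputs: T=3 steps, B=2 sequences, |V|=5 symbols. The rows of the
# flat logits must follow the same (T x B) order that
# tf.reshape(targets, [-1]) produces.
logits = tf.random_normal([3 * 2, 5])                      # (TxB) x |V|
targets = tf.constant([[1, 2], [3, 0], [0, 0]], tf.int32)  # T x B
seq_len = tf.constant([3, 2])                              # length per sequence
loss = seq2seq_loss(logits, targets, seq_len)              # scalar, padding masked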
def testLabelsPlaceholderScalar(self):
  with self.session(use_gpu=True):
    labels = array_ops.placeholder(np.int32)
    y = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=[[7.]])
    with self.assertRaisesOpError("labels must be 1-D"):
      y.eval(feed_dict={labels: 0})
def testSecondGradient(self):
  with self.session() as sess:
    l = constant_op.constant([3, 0, 1], name="l")
    f = constant_op.constant(
        [0.3, 0.4, 0.1, 1.2, 0.1, 1.9, 0.1, 0.7, 0.8, 0.2, 1.3, 1.3],
        shape=[3, 4], dtype=dtypes.float64, name="f")
    x = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=l, logits=f, name="xent")
    gradients = gradients_impl.gradients(x, [f])[0]
    err = gradient_checker.compute_gradient_error(f, [3, 4], gradients,
                                                  [3, 4])
    # Check that the second derivative is calculated.
    # (It shows up as a `BatchMatMul` op in the graph because of the
    # implementation of the xentropy grad.)
    op_names = [
        op.op_def.name for op in sess.graph.get_operations() if op.op_def
    ]
    self.assertIn("BatchMatMulV2", op_names)
    self.assertLess(err, 5e-8)
def MMIloss(logits, targets, weights, lam, gam,
            average_across_timesteps=True, softmax_loss_function=None,
            name=None):
  """lam is the lambda value (diversity penalty) of the objective; gam is the
  gamma value (length penalty) of the objective (see section 4.5.1 of
  Li et al.)."""
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the "
                     "same %d, %d, %d." %
                     (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=target, logits=logit)
      else:
        crossent = softmax_loss_function(logit, target)
      log_perp_list.append(crossent * weight)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
    # Note: `lm_perps` (the language-model perplexities) is assumed to be
    # defined in the enclosing scope; it is not computed in this function.
    final_perps = log_perps - lam * lm_perps + gam * len(targets)
    return final_perps
def log_prob(self, k, name="log_prob"):
  """Log-probability of class `k`.

  Args:
    k: `int32` or `int64` Tensor. Must be broadcastable with a `batch_shape`
      `Tensor`.
    name: A name for this operation (optional).

  Returns:
    The log-probabilities of the classes indexed by `k`
  """
  with ops.name_scope(self.name):
    with ops.op_scope([k, self.logits], name):
      k = ops.convert_to_tensor(k, name="k")
      logits = self.logits * array_ops.ones_like(
          array_ops.expand_dims(k, -1), dtype=self.logits.dtype)
      k *= array_ops.ones(
          array_ops.slice(
              array_ops.shape(logits), [0], [array_ops.rank(logits) - 1]),
          dtype=k.dtype)
      k.set_shape(tensor_shape.TensorShape(logits.get_shape()[:-1]))
      return -nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=k, logits=logits)
def sequence_loss_by_example(logits, targets, weights,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example)."""
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the "
                     "same %d, %d, %d." %
                     (len(logits), len(weights), len(targets)))
  with ops.name_scope("sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=logit, labels=target)
      else:
        crossent = softmax_loss_function(logit, target)
      log_perp_list.append(crossent * weight)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
def sequence_loss_tensor(logits, targets, weights, num_classes,
                         average_across_timesteps=True,
                         softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Faster variant: takes a 3D logits tensor, flattens it, and applies the
  weighting in a single op, so no for loop is needed.
  """
  with ops.name_scope(name, "sequence_loss_by_example",
                      [logits, targets, weights]):
    probs_flat = tf.reshape(logits, [-1, num_classes])
    targets = tf.reshape(targets, [-1])
    if softmax_loss_function is None:
      crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=probs_flat)
    else:
      crossent = softmax_loss_function(probs_flat, targets)
    crossent = crossent * tf.reshape(weights, [-1])
    crossent = tf.reduce_sum(crossent)
    total_size = math_ops.reduce_sum(weights)
    total_size += 1e-12  # to avoid division by zero
    crossent /= total_size
    return crossent
def softmax_loss_function(logit, target):
  # loss function of seq2seq model
  logit = nn_ops.xw_plus_b(logit, output_projection[0],
                           output_projection[1])
  target = array_ops.reshape(target, [-1])
  return nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=target, logits=logit)
def cross_entropy(labels, logits, name=None):
  """Computes the cross entropy between the labels and logits.

  This is a safe version that adds epsilon to logits to prevent log(0).
  """
  return nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=ensure_finite(logits), labels=labels, name=name)
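# `ensure_finite` is not shown in the snippet above. A minimal sketch of what
# such a helper might look like (hypothetical implementation: zero out
# NaN/Inf entries and clamp to a large finite range so the softmax stays
# well-defined).
def ensure_finite(t, limit=1e10):
  finite = tf.where(tf.is_finite(t), t, tf.zeros_like(t))
  return tf.clip_by_value(finite, -limit, limit)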
def log_prob(self, k, name="log_prob"):
  """Log-probability of class `k`.

  Args:
    k: `int32` or `int64` Tensor. Must be broadcastable with a `batch_shape`
      `Tensor`.
    name: A name for this operation (optional).

  Returns:
    The log-probabilities of the classes indexed by `k`
  """
  with ops.name_scope(self.name):
    with ops.name_scope(name, values=[k, self.logits]):
      k = ops.convert_to_tensor(k, name="k")
      logits = self.logits * array_ops.ones_like(
          array_ops.expand_dims(k, -1), dtype=self.logits.dtype)
      k *= array_ops.ones(
          array_ops.slice(array_ops.shape(logits), [0],
                          [array_ops.rank(logits) - 1]),
          dtype=k.dtype)
      k.set_shape(tensor_shape.TensorShape(logits.get_shape()[:-1]))
      return -nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=k, logits=logits)
def testLabelsPlaceholderScalar(self):
  with self.test_session(use_gpu=True):
    labels = array_ops.placeholder(np.int32)
    y = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=[[7.]])
    with self.assertRaisesOpError("labels must be 1-D"):
      y.eval(feed_dict={labels: 0})
def _log_prob(self, k):
  k = ops.convert_to_tensor(k, name="k")
  logits = self.logits * array_ops.ones_like(
      array_ops.expand_dims(k, -1), dtype=self.logits.dtype)
  shape = array_ops.slice(array_ops.shape(logits), [0],
                          [array_ops.rank(logits) - 1])
  k *= array_ops.ones(shape, dtype=k.dtype)
  k.set_shape(tensor_shape.TensorShape(logits.get_shape()[:-1]))
  return -nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=k, logits=logits)
def sequence_loss_by_example(logits, targets, weights,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  See original tensorflow code:
  <https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/legacy_seq2seq/python/ops/seq2seq.py#L1057>

  Parameters
  ----------
  logits: List
      List of 2D Tensors of shape [batch_size x num_decoder_symbols].
  targets: List
      List of 1D batch-sized int32 Tensors of the same length as logits.
  weights: List
      List of 1D batch-sized float-Tensors of the same length as logits.
  average_across_timesteps: Boolean
      If set, divide the returned cost by the total label weight.
  softmax_loss_function: None or Function
      Function (labels, logits) -> loss-batch to be used instead of the
      standard softmax (the default if this is None).
      **Note that to avoid confusion, it is required for the function to
      accept named arguments.**
  name: None or str
      Optional name for this operation, default:
      "sequence_loss_by_example".

  Returns
  -------
  1D batch-sized float Tensor: The log-perplexity for each sequence.

  Raises
  ------
  ValueError: If len(logits) is different from len(targets) or len(weights).
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the "
                     "same %d, %d, %d." %
                     (len(logits), len(weights), len(targets)))
  with ops.name_scope(name, "sequence_loss_by_example",
                      logits + targets + weights):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        # TODO(irving,ebrevdo): This reshape is needed because
        # sequence_loss_by_example is called with scalars sometimes, which
        # violates our general scalar strictness policy.
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=target, logits=logit)
      else:
        crossent = softmax_loss_function(labels=target, logits=logit)
      log_perp_list.append(crossent * weight)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
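# Hypothetical usage of sequence_loss_by_example above: two timesteps, a
# batch of 2, and a vocabulary of 4 symbols. All tensors here are made up.
logits = [tf.random_normal([2, 4]) for _ in range(2)]  # list of 2D logits
targets = [tf.constant([1, 3]), tf.constant([0, 2])]   # list of 1D int32
weights = [tf.ones([2]), tf.ones([2])]                 # list of 1D floats
log_perp = sequence_loss_by_example(logits, targets, weights)  # shape [2]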
def sampled_loss(logit, target):
  # labels = tf.reshape(labels, [-1, 1])
  logit = nn_ops.xw_plus_b(logit, output_projection[0],
                           output_projection[1])
  # return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
  #                                   self.target_vocab_size)
  target = array_ops.reshape(target, [-1])
  return nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=target, logits=logit)
def __init__(self, args, training=True):
  self.args = args
  # When we don't train, we take in one character at a time and try to
  # predict the next one.
  if not training:
    args.batch_size = 1
    args.seq_length = 1
  # Assign the basic type of RNN unit
  if args.mtype == 'rnn':
    cell_fn = rnn.BasicRNNCell
  elif args.mtype == 'gru':
    cell_fn = rnn.GRUCell
  elif args.mtype == 'lstm':
    cell_fn = rnn.BasicLSTMCell
  elif args.mtype == 'nas':
    cell_fn = rnn.NASCell
  else:
    raise Exception("model type not supported: {}".format(args.mtype))
  cells = []
  for _ in range(args.num_layers):
    cell = cell_fn(args.rnn_size)
    cells.append(cell)
  self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)
  self.input_data = tf.placeholder(tf.int32,
                                   [args.batch_size, args.seq_length])
  self.targets = tf.placeholder(tf.int32,
                                [args.batch_size, args.seq_length])
  self.initial_state = cell.zero_state(args.batch_size, tf.float32)
  softmax_w = tf.get_variable("softmax_w", [args.rnn_size, args.vocab_size])
  softmax_b = tf.get_variable("softmax_b", [args.vocab_size])
  embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
  inputs = tf.nn.embedding_lookup(embedding, self.input_data)
  inputs = tf.split(inputs, args.seq_length, 1)
  inputs = [tf.squeeze(input_, [1]) for input_ in inputs]
  outputs, last_state = legacy_seq2seq.rnn_decoder(
      inputs, self.initial_state, cell)
  output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])
  self.logits = tf.matmul(output, softmax_w) + softmax_b
  self.probs = tf.nn.softmax(self.logits)
  self.predicted_output = tf.reshape(
      tf.argmax(self.probs, 1), [args.batch_size, args.seq_length])
  loss = sparse_softmax_cross_entropy_with_logits(
      logits=self.logits, labels=tf.reshape(self.targets, [-1]))
  self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
  self.final_state = last_state
  self.lr = tf.Variable(0.0, trainable=False)
  self.optimizer = tf.train.AdamOptimizer(self.lr).minimize(self.cost)
def body(i, prev_c, prev_h, actions, log_probs):
  # pylint: disable=g-long-lambda
  signal = control_flow_ops.cond(
      math_ops.equal(i, 0),
      lambda: array_ops.tile(device_go_embedding,
                             [self.hparams.num_children, 1]),
      lambda: embedding_ops.embedding_lookup(device_embeddings,
                                             actions.read(i - 1)))
  if self.hparams.keep_prob is not None:
    signal = nn_ops.dropout(signal, self.hparams.keep_prob)
  next_c, next_h = lstm(signal, prev_c, prev_h, w_lstm, forget_bias)
  query = math_ops.matmul(next_h, attn_w_2)
  query = array_ops.reshape(
      query, [self.hparams.num_children, 1, self.hparams.hidden_size])
  query = math_ops.tanh(query + attn_mem)
  query = array_ops.reshape(query, [
      self.hparams.num_children * self.num_groups, self.hparams.hidden_size
  ])
  query = math_ops.matmul(query, attn_v)
  query = array_ops.reshape(query,
                            [self.hparams.num_children, self.num_groups])
  query = nn_ops.softmax(query)
  query = array_ops.reshape(query,
                            [self.hparams.num_children, self.num_groups, 1])
  query = math_ops.reduce_sum(attn_mem * query, axis=1)
  query = array_ops.concat([next_h, query], axis=1)
  logits = math_ops.matmul(query, device_softmax)
  logits /= self.hparams.temperature
  if self.hparams.tanh_constant > 0:
    logits = math_ops.tanh(logits) * self.hparams.tanh_constant
  if self.hparams.logits_std_noise > 0:
    num_in_logits = math_ops.cast(
        array_ops.size(logits), dtype=dtypes.float32)
    avg_norm = math_ops.divide(
        linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
    logits_noise = random_ops.random_normal(
        array_ops.shape(logits),
        stddev=self.hparams.logits_std_noise * avg_norm)
    logits = control_flow_ops.cond(
        self.global_step > self.hparams.stop_noise_step, lambda: logits,
        lambda: logits + logits_noise)
  if mode == "sample":
    next_y = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
  elif mode == "greedy":
    next_y = math_ops.argmax(logits, 1)
  elif mode == "target":
    next_y = array_ops.slice(y, [0, i], [-1, 1])
  else:
    raise NotImplementedError
  next_y = math_ops.to_int32(next_y)
  next_y = array_ops.reshape(next_y, [self.hparams.num_children])
  actions = actions.write(i, next_y)
  log_probs += nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=next_y)
  return i + 1, next_c, next_h, actions, log_probs
def sequence_loss_by_example(logits, targets, weights,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: [batch_size, num_steps, num_decoder_symbols]. If
      softmax_loss_function is not None this is
      [batch_size, num_steps, emb_dim], i.e. just the outputs from
      dynamic_rnn. If softmax_loss_function is None the input may already be
      flattened to [-1, num_decoder_symbols]; either way is fine.
    targets: [batch_size, num_steps]
    weights: [batch_size, num_steps]
    average_across_timesteps: If set, divide the returned cost by the total
      label weight.
    softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is
      None).
    name: Optional name for this operation, default:
      "sequence_loss_by_example".

  Returns:
    1D batch-sized float Tensor: The log-perplexity for each sequence.
  """
  with ops.name_scope(name, "sequence_loss_by_example",
                      [logits, targets, weights]):
    logits_shape = array_ops.shape(logits)
    batch_size = logits_shape[0]
    if softmax_loss_function is None:
      # crossents: [batch_size, num_steps]. No reshape is needed here,
      # since sparse_softmax_cross_entropy_with_logits accepts both shapes.
      crossents = nn_ops.sparse_softmax_cross_entropy_with_logits(
          logits=logits, labels=targets)
      crossents = array_ops.reshape(crossents, [batch_size, -1])
      weights = array_ops.reshape(weights, [batch_size, -1])
    else:
      emb_dim = logits_shape[-1]
      # A reshape is needed because, unlike
      # sparse_softmax_cross_entropy_with_logits,
      # tf.nn.sampled_softmax_loss only accepts 2D [batch_size, dim] logits.
      logits = array_ops.reshape(logits, [-1, emb_dim])
      targets = array_ops.reshape(targets, [-1, 1])
      # crossents: [batch_size * num_steps]
      crossents = softmax_loss_function(logits, targets)
      # crossents: [batch_size, num_steps]
      crossents = array_ops.reshape(crossents, [batch_size, -1])
    log_perps = math_ops.reduce_sum(
        math_ops.multiply(crossents, weights), 1)
    if average_across_timesteps:
      total_size = math_ops.reduce_sum(weights, 1)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
    return log_perps
def sequence_loss(self):
  with ops.name_scope("sequence_loss_by_example"):
    weights = tf.to_float(tf.sign(tf.abs(self.labels), name="mask"))
    batch_size = array_ops.shape(self.labels)[0]
    crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=self.labels, logits=self.logits)
    log_perps = tf.reduce_sum(
        tf.reduce_sum(crossent * weights, reduction_indices=1))
    return log_perps / tf.to_float(batch_size)
def _log_prob(self, k):
  k = ops.convert_to_tensor(k, name="k")
  if self.validate_args:
    k = distribution_util.embed_check_integer_casting_closed(
        k, target_dtype=dtypes.int32)
  k, logits = _broadcast_cat_event_and_params(
      k, self.logits, base_dtype=self.dtype.base_dtype)
  return -nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=k, logits=logits)
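# These _log_prob implementations rely on the identity
#   log p(k) = logits[k] - logsumexp(logits),
# which is exactly the negated sparse softmax cross entropy. A quick NumPy
# sanity check with made-up values:
import numpy as np
logits = np.array([1.0, 2.0, 0.5])
k = 1
log_prob = logits[k] - np.log(np.sum(np.exp(logits)))
# log_prob equals
# -sparse_softmax_cross_entropy_with_logits(labels=[k], logits=[logits])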
def _testHighDim(self, features, labels):
  np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
  # manually reshape loss
  np_loss = np.reshape(np_loss, np.array(labels).shape)
  tf_loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=features)
  if not context.executing_eagerly():
    tf_backprop = tf_loss.op.inputs[0].op.outputs[1]
  else:
    with backprop_lib.GradientTape() as tape:
      features = constant_op.constant(features)
      tape.watch(features)
      tf_backprop = tape.gradient(
          nn_ops.sparse_softmax_cross_entropy_with_logits(
              labels=labels, logits=features), [features])[0]
    tf_backprop = array_ops.reshape(tf_backprop, np_backprop.shape)
  self.assertAllCloseAccordingToType(np_loss, tf_loss)
  self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
def sequence_loss_tensor(logits, targets, num_classes, weights=None,
                         average_across_timesteps=False,
                         softmax_loss_function=None, name="sequenceLoss"):
  """
  Weighted cross-entropy loss for a sequence of logits (per example).

  It is a modification of TensorFlow's own sequence_to_sequence_loss.
  TensorFlow's seq2seq loss works with a 2D list instead of a 3D tensor.

  :param tf.Tensor logits: Logits for each class for all samples.
      [batch_size, (sequence_length), num_classes]
  :param tf.Tensor targets: True classes of samples.
      [batch_size, (sequence_length)]
  :param int | tf.Tensor num_classes: The total number of classes.
  :param tf.Tensor weights: Weighting of each sample.
      [batch_size, (sequence_length)]
  :param bool average_across_timesteps: Average loss across time-dimension.
  :param Callable softmax_loss_function: Method used for computing loss.
  :param str name: Name of the loss-function's scope.
  :return: tf.Tensor
  """
  if average_across_timesteps:
    raise NotImplementedError(
        "Averaging across time-steps has not been implemented yet. ")
  with tf.variable_scope(name):
    # Flatten logits for the softmax operation, and targets for comparison.
    # logits_flat: [batch_size * (sequence_length), num_classes]
    # targets: [batch_size * (sequence_length)]
    logits_flat = tf.reshape(logits, [-1, num_classes])
    targets = tf.reshape(targets, [-1])
    # If a custom loss function is given, use that. Otherwise default.
    # cross_ent: [batch_size * (sequence_length)]
    if softmax_loss_function is None:
      cross_ent = nn_ops.sparse_softmax_cross_entropy_with_logits(
          logits=logits_flat, labels=targets)
    else:
      cross_ent = softmax_loss_function(logits_flat, targets)
    # Weigh cross-entropy if wanted
    if weights is not None:
      cross_ent = cross_ent * tf.reshape(weights, [-1])
    # Cross-entropy sum
    cross_ent = tf.reduce_sum(cross_ent)
    # Divide by total weighting
    # TODO: Couldn't you just normalize the weights first?
    if weights is not None:
      total_size = tf.reduce_sum(weights)
      total_size += 1e-12  # to avoid division by zero
      cross_ent /= total_size
  return cross_ent
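# Hypothetical usage of the sequence_loss_tensor variant above: a batch of 2
# sequences, 3 steps, 5 classes, with the second sequence padded after step 1.
logits = tf.random_normal([2, 3, 5])
targets = tf.constant([[1, 2, 0], [4, 0, 0]], tf.int32)
mask = tf.constant([[1., 1., 1.], [1., 0., 0.]])  # 0-weight the padding
loss = sequence_loss_tensor(logits, targets, num_classes=5, weights=mask)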
def _testHighDim(self, features, labels):
  np_loss, np_backprop = self._npXent(np.array(features), np.array(labels))
  # manually reshape loss
  np_loss = np.reshape(np_loss, np.array(labels).shape)
  with self.test_session(use_gpu=True) as sess:
    loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=labels, logits=features)
    backprop = loss.op.inputs[0].op.outputs[1]
    tf_loss, tf_backprop = sess.run([loss, backprop])
  self.assertAllCloseAccordingToType(np_loss, tf_loss)
  self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
def testScalarHandling(self):
  with self.test_session(use_gpu=False) as sess:
    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                 ".*labels must be 1-D.*"):
      labels = array_ops.placeholder(dtypes.int32, shape=[None, 1])
      logits = array_ops.placeholder(dtypes.float32, shape=[None, 3])
      ce = nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=array_ops.squeeze(labels), logits=logits)
      labels_v2 = np.zeros((1, 1), dtype=np.int32)
      logits_v2 = np.random.randn(1, 3)
      sess.run([ce], feed_dict={labels: labels_v2, logits: logits_v2})
def _sparse_vs_dense_xent_benchmark_sparse(labels, logits):
  # Using sparse_softmax_cross_entropy_with_logits
  labels = labels.astype(np.int64)
  labels = array_ops.identity(labels)
  logits = array_ops.identity(logits)
  crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=logits, labels=labels, name="SequenceLoss/CrossEntropy")
  crossent_sum = math_ops.reduce_sum(crossent)
  grads = gradients_impl.gradients([crossent_sum], [logits])[0]
  return (crossent_sum, grads)
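# Hypothetical benchmark inputs for the helper above: 64 examples over a
# 10000-class vocabulary (NumPy arrays, as the function expects).
labels = np.random.randint(0, 10000, size=64)
logits = np.random.randn(64, 10000).astype(np.float32)
crossent_sum, grads = _sparse_vs_dense_xent_benchmark_sparse(labels, logits)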
def testGradient(self):
  with self.test_session(use_gpu=True):
    l = constant_op.constant([3, 0, 1], name="l")
    f = constant_op.constant(
        [0.1, 0.2, 0.3, 0.4, 0.1, 0.4, 0.9, 1.6, 0.1, 0.8, 2.7, 6.4],
        shape=[3, 4], dtype=dtypes.float64, name="f")
    x = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=l, logits=f, name="xent")
    err = gradient_checker.compute_gradient_error(f, [3, 4], x, [3])
    print("cross entropy gradient err = ", err)
    self.assertLess(err, 5e-8)
def _log_prob(self, k):
  k = ops.convert_to_tensor(k, name="k")
  if self.logits.get_shape()[:-1] == k.get_shape():
    logits = self.logits
  else:
    logits = self.logits * array_ops.ones_like(
        array_ops.expand_dims(k, -1), dtype=self.logits.dtype)
    logits_shape = array_ops.shape(logits)[:-1]
    k *= array_ops.ones(logits_shape, dtype=k.dtype)
    k.set_shape(tensor_shape.TensorShape(logits.get_shape()[:-1]))
  return -nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=k, logits=logits)
def log_pmf(self, k, name="log_pmf"):
  """Log-probability of class `k`.

  Args:
    k: `int32` or `int64` Tensor with shape = `self.batch_shape()`.
    name: A name for this operation (optional).

  Returns:
    The log-probabilities of the classes indexed by `k`
  """
  with ops.name_scope(self.name):
    k = ops.convert_to_tensor(k, name="k")
    k.set_shape(self.get_batch_shape())
    return -nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=k, logits=self.logits, name=name)
def testSecondGradient(self):
  images_placeholder = array_ops.placeholder(dtypes.float32, shape=(3, 2))
  labels_placeholder = array_ops.placeholder(dtypes.int32, shape=(3))
  weights = variables.Variable(random_ops.truncated_normal([2], stddev=1.0))
  weights_with_zeros = array_ops.stack([array_ops.zeros([2]), weights],
                                       axis=1)
  logits = math_ops.matmul(images_placeholder, weights_with_zeros)
  cross_entropy = nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=labels_placeholder, logits=logits)
  loss = math_ops.reduce_mean(cross_entropy)
  # Taking the second gradient should fail, since it is not
  # yet supported.
  with self.assertRaisesRegexp(LookupError, "explicitly disabled"):
    _ = gradients_impl.hessians(loss, [weights])
def make_grouping_predictions(self, input_layer, reuse=None):
  """model that predicts grouping (grouping_actions).

  Args:
    input_layer: group_input_layer
    reuse: reuse

  Returns:
    grouping_actions: actions
    grouping_log_probs: log probabilities corresponding to actions
  """
  with variable_scope.variable_scope(self.hparams.name, reuse=True):
    # input_layer: tensor of size [1, num_ops, hidden_size]
    w_grouping_ff = variable_scope.get_variable("w_grouping_ff")
    w_grouping_softmax = variable_scope.get_variable("w_grouping_softmax")

    batch_size = array_ops.shape(input_layer)[0]
    embedding_dim = array_ops.shape(input_layer)[2]

    reshaped = array_ops.reshape(
        input_layer, [batch_size * self.num_ops, embedding_dim])
    ff_output = math_ops.matmul(reshaped, w_grouping_ff)
    logits = math_ops.matmul(ff_output, w_grouping_softmax)
    if self.hparams.logits_std_noise > 0:
      num_in_logits = math_ops.cast(
          array_ops.size(logits), dtype=dtypes.float32)
      avg_norm = math_ops.divide(
          linalg_ops.norm(logits), math_ops.sqrt(num_in_logits))
      logits_noise = random_ops.random_normal(
          array_ops.shape(logits),
          stddev=self.hparams.logits_std_noise * avg_norm)
      logits = control_flow_ops.cond(
          self.global_step > self.hparams.stop_noise_step, lambda: logits,
          lambda: logits + logits_noise)
    logits = array_ops.reshape(
        logits, [batch_size * self.num_ops, self.num_groups])
    actions = random_ops.multinomial(logits, 1, seed=self.hparams.seed)
    actions = math_ops.to_int32(actions)
    actions = array_ops.reshape(actions, [batch_size, self.num_ops])
    action_label = array_ops.reshape(actions, [-1])
    log_probs = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=action_label)
    log_probs = array_ops.reshape(log_probs, [batch_size, -1])
    log_probs = math_ops.reduce_sum(log_probs, 1)
    grouping_actions = actions
    grouping_log_probs = log_probs
  return grouping_actions, grouping_log_probs
def loss(self, data, labels):
  """The loss to minimize while training."""
  if self.is_regression:
    diff = self.training_inference_graph(data) - math_ops.to_float(labels)
    mean_squared_error = math_ops.reduce_mean(diff * diff)
    root_mean_squared_error = math_ops.sqrt(mean_squared_error, name="loss")
    loss = root_mean_squared_error
  else:
    loss = math_ops.reduce_mean(
        nn_ops.sparse_softmax_cross_entropy_with_logits(
            logits=self.training_inference_graph(data),
            labels=array_ops.squeeze(math_ops.to_int32(labels))),
        name="loss")
  if self.regularizer:
    loss += layers.apply_regularization(self.regularizer,
                                        variables.trainable_variables())
  return loss
def sequence_loss_by_example(logits, targets, weights,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: List of 2D Tensors of shape [batch_size x num_decoder_symbols].
    targets: List of 1D batch-sized int32 Tensors of the same length as
      logits.
    weights: List of 1D batch-sized float-Tensors of the same length as
      logits.
    average_across_timesteps: If set, divide the returned cost by the total
      label weight.
    softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is
      None).
    name: Optional name for this operation, default:
      "sequence_loss_by_example".

  Returns:
    1D batch-sized float Tensor: The log-perplexity for each sequence.

  Raises:
    ValueError: If len(logits) is different from len(targets) or
      len(weights).
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the "
                     "same %d, %d, %d." %
                     (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        # TODO(irving,ebrevdo): This reshape is needed because
        # sequence_loss_by_example is called with scalars sometimes, which
        # violates our general scalar strictness policy.
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            labels=target, logits=logit)
      else:
        crossent = softmax_loss_function(logit, target)
      log_perp_list.append(crossent * weight)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
def fn(x, y):
  return nn_ops.sparse_softmax_cross_entropy_with_logits(
      logits=x, labels=y)[0]
def sequence_loss(logits, targets, weights, average_across_timesteps=True,
                  average_across_batch=True, softmax_loss_function=None,
                  name=None):
  """Weighted cross-entropy loss for a sequence of logits.

  Depending on the values of `average_across_timesteps` and
  `average_across_batch`, the return Tensor will have rank 0, 1, or 2 as
  these arguments reduce the cross-entropy at each target, which has shape
  `[batch_size, sequence_length]`, over their respective dimensions. For
  example, if `average_across_timesteps` is `True` and
  `average_across_batch` is `False`, then the return Tensor will have shape
  `[batch_size]`.

  Args:
    logits: A Tensor of shape
      `[batch_size, sequence_length, num_decoder_symbols]` and dtype float.
      The logits correspond to the prediction across all classes at each
      timestep.
    targets: A Tensor of shape `[batch_size, sequence_length]` and dtype
      int. The target represents the true class at each timestep.
    weights: A Tensor of shape `[batch_size, sequence_length]` and dtype
      float. `weights` constitutes the weighting of each prediction in the
      sequence. When using `weights` as masking, set all valid timesteps to 1
      and all padded timesteps to 0, e.g. a mask returned by
      `tf.sequence_mask`.
    average_across_timesteps: If set, sum the cost across the sequence
      dimension and divide the cost by the total label weight across
      timesteps.
    average_across_batch: If set, sum the cost across the batch dimension and
      divide the returned cost by the batch size.
    softmax_loss_function: Function (labels, logits) -> loss-batch to be used
      instead of the standard softmax (the default if this is None).
      **Note that to avoid confusion, it is required for the function to
      accept named arguments.**
    name: Optional name for this operation, defaults to "sequence_loss".

  Returns:
    A float Tensor of rank 0, 1, or 2 depending on the
    `average_across_timesteps` and `average_across_batch` arguments. By
    default, it has rank 0 (scalar) and is the weighted average cross-entropy
    (log-perplexity) per symbol.

  Raises:
    ValueError: logits does not have 3 dimensions or targets does not have 2
      dimensions or weights does not have 2 dimensions.
  """
  if len(logits.get_shape()) != 3:
    raise ValueError("Logits must be a "
                     "[batch_size x sequence_length x logits] tensor")
  if len(targets.get_shape()) != 2:
    raise ValueError("Targets must be a [batch_size x sequence_length] "
                     "tensor")
  if len(weights.get_shape()) != 2:
    raise ValueError("Weights must be a [batch_size x sequence_length] "
                     "tensor")
  with ops.name_scope(name, "sequence_loss", [logits, targets, weights]):
    num_classes = array_ops.shape(logits)[2]
    logits_flat = array_ops.reshape(logits, [-1, num_classes])
    targets = array_ops.reshape(targets, [-1])
    if softmax_loss_function is None:
      crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=logits_flat)
    else:
      crossent = softmax_loss_function(labels=targets, logits=logits_flat)
    crossent *= array_ops.reshape(weights, [-1])
    if average_across_timesteps and average_across_batch:
      crossent = math_ops.reduce_sum(crossent)
      total_size = math_ops.reduce_sum(weights)
      total_size += 1e-12  # to avoid division by 0 for all-0 weights
      crossent /= total_size
    else:
      batch_size = array_ops.shape(logits)[0]
      sequence_length = array_ops.shape(logits)[1]
      crossent = array_ops.reshape(crossent, [batch_size, sequence_length])
    if average_across_timesteps and not average_across_batch:
      crossent = math_ops.reduce_sum(crossent, axis=[1])
      total_size = math_ops.reduce_sum(weights, axis=[1])
      total_size += 1e-12  # to avoid division by 0 for all-0 weights
      crossent /= total_size
    if not average_across_timesteps and average_across_batch:
      crossent = math_ops.reduce_sum(crossent, axis=[0])
      total_size = math_ops.reduce_sum(weights, axis=[0])
      total_size += 1e-12  # to avoid division by 0 for all-0 weights
      crossent /= total_size
    return crossent
def testVector(self):
  with self.session(use_gpu=True):
    loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=constant_op.constant(0), logits=constant_op.constant([1.0]))
    self.assertAllClose(0.0, self.evaluate(loss))
def sampled_sparse_softmax_loss(weights, biases, labels, inputs, num_sampled,
                                num_classes, sampled_values=None,
                                remove_accidental_hits=True,
                                partition_strategy="mod",
                                name="sampled_sparse_softmax_loss"):
  """Computes and returns the sampled sparse softmax training loss.

  This is a faster way to train a softmax classifier over a huge number of
  classes.

  This operation is for training only. It is generally an underestimate of
  the full softmax loss.

  A common use case is to use this method for training, and calculate the
  full softmax loss for evaluation or inference. In this case, you must set
  `partition_strategy="div"` for the two losses to be consistent, as in the
  following example:

  ```python
  if mode == "train":
    loss = tf.nn.sampled_sparse_softmax_loss(
        weights=weights,
        biases=biases,
        labels=labels,
        inputs=inputs,
        ...,
        partition_strategy="div")
  elif mode == "eval":
    logits = tf.matmul(inputs, tf.transpose(weights))
    logits = tf.nn.bias_add(logits, biases)
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.squeeze(labels),
        logits=logits)
  ```

  See our [Candidate Sampling Algorithms Reference]
  (https://www.tensorflow.org/extras/candidate_sampling.pdf)

  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
      objects whose concatenation along dimension 0 has shape
      [num_classes, dim]. The (possibly-sharded) class embeddings.
    biases: A `Tensor` of shape `[num_classes]`. The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size, 1]`. The
      index of the single target class for each row of logits. Note that
      this format differs from the `labels` argument of
      `nn.sparse_softmax_cross_entropy_with_logits`.
    inputs: A `Tensor` of shape `[batch_size, dim]`. The forward activations
      of the input network.
    num_sampled: An `int`. The number of classes to randomly sample per
      batch.
    num_classes: An `int`. The number of possible classes.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
      (if None, we default to `log_uniform_candidate_sampler`)
    remove_accidental_hits: A `bool`. Whether to remove "accidental hits"
      where a sampled class equals one of the target classes. Default is
      True.
    partition_strategy: A string specifying the partitioning strategy,
      relevant if `len(weights) > 1`. Currently `"div"` and `"mod"` are
      supported. Default is `"mod"`. See `tf.nn.embedding_lookup` for more
      details.
    name: A name for the operation (optional).

  Returns:
    A `batch_size` 1-D tensor of per-example sampled softmax losses.
  """
  logits, _ = nn_impl._compute_sampled_logits(
      weights=weights,
      biases=biases,
      labels=labels,
      inputs=inputs,
      num_sampled=num_sampled,
      num_classes=num_classes,
      num_true=1,
      sampled_values=sampled_values,
      subtract_log_q=True,
      remove_accidental_hits=remove_accidental_hits,
      partition_strategy=partition_strategy,
      name=name)

  # There is only one true label. _compute_sampled_logits puts the true logit
  # at index 0.
  labels = array_ops.zeros([array_ops.shape(logits)[0], 1],
                           dtype=dtypes.int64)

  sampled_losses = nn_ops.sparse_softmax_cross_entropy_with_logits(
      labels=array_ops.squeeze(labels), logits=logits)
  # sampled_losses is a [batch_size] tensor.
  return sampled_losses
def testInt32GPU(self):
  with ops.device('gpu:0'):
    xent = nn_ops.sparse_softmax_cross_entropy_with_logits(
        logits=[[0.0, 0.0]], labels=[0])
  self.assertAllClose(xent, [0.69314718])
def sequence_loss(logits, targets, weights, average_across_timesteps=True,
                  average_across_batch=True, softmax_loss_function=None,
                  name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: A 3D Tensor of shape
      [batch_size x sequence_length x num_decoder_symbols] and dtype float.
      The logits correspond to the prediction across all classes at each
      timestep.
    targets: A 2D Tensor of shape [batch_size x sequence_length] and dtype
      int. The target represents the true class at each timestep.
    weights: A 2D Tensor of shape [batch_size x sequence_length] and dtype
      float. Weights constitutes the weighting of each prediction in the
      sequence. When using weights as masking, set all valid timesteps to 1
      and all padded timesteps to 0.
    average_across_timesteps: If set, sum the cost across the sequence
      dimension and divide the cost by the total label weight across
      timesteps.
    average_across_batch: If set, sum the cost across the batch dimension and
      divide the returned cost by the batch size.
    softmax_loss_function: Function (labels-batch, inputs-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is
      None).
    name: Optional name for this operation, defaults to "sequence_loss".

  Returns:
    A scalar float Tensor: The average log-perplexity per symbol (weighted).

  Raises:
    ValueError: logits does not have 3 dimensions or targets does not have 2
      dimensions or weights does not have 2 dimensions.
  """
  if len(logits.get_shape()) != 3:
    raise ValueError("Logits must be a "
                     "[batch_size x sequence_length x logits] tensor")
  if len(targets.get_shape()) != 2:
    raise ValueError("Targets must be a [batch_size x sequence_length] "
                     "tensor")
  if len(weights.get_shape()) != 2:
    raise ValueError("Weights must be a [batch_size x sequence_length] "
                     "tensor")
  with ops.name_scope(name, "sequence_loss", [logits, targets, weights]):
    num_classes = array_ops.shape(logits)[2]
    probs_flat = array_ops.reshape(logits, [-1, num_classes])
    targets = array_ops.reshape(targets, [-1])
    if softmax_loss_function is None:
      crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=targets, logits=probs_flat)
    else:
      crossent = softmax_loss_function(targets, probs_flat)
    crossent = crossent * array_ops.reshape(weights, [-1])
    if average_across_timesteps and average_across_batch:
      crossent = math_ops.reduce_sum(crossent)
      total_size = math_ops.reduce_sum(weights)
      total_size += 1e-12  # to avoid division by 0 for all-0 weights
      crossent /= total_size
    else:
      batch_size = array_ops.shape(logits)[0]
      sequence_length = array_ops.shape(logits)[1]
      crossent = array_ops.reshape(crossent, [batch_size, sequence_length])
    if average_across_timesteps and not average_across_batch:
      crossent = math_ops.reduce_sum(crossent, axis=[1])
      total_size = math_ops.reduce_sum(weights, axis=[1])
      total_size += 1e-12  # to avoid division by 0 for all-0 weights
      crossent /= total_size
    if not average_across_timesteps and average_across_batch:
      crossent = math_ops.reduce_sum(crossent, axis=[0])
      total_size = math_ops.reduce_sum(weights, axis=[0])
      total_size += 1e-12  # to avoid division by 0 for all-0 weights
      crossent /= total_size
    return crossent
def testVector(self):
  with self.test_session(use_gpu=True):
    loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
        labels=constant_op.constant(0), logits=constant_op.constant([1.0]))
    self.assertAllClose(0.0, loss.eval())
def testScalar(self):
  with self.test_session(use_gpu=True):
    with self.assertRaisesRegexp(ValueError, ".*Logits cannot be scalars*"):
      nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=constant_op.constant(0), logits=constant_op.constant(1.0))
def testShapeMismatch(self):
  with self.test_session(use_gpu=True):
    with self.assertRaisesRegexp(ValueError, ".*Rank mismatch:*"):
      nn_ops.sparse_softmax_cross_entropy_with_logits(
          labels=[[0, 2]], logits=[[0., 1.], [2., 3.], [2., 3.]])