def testBasicMemory(self):
  """Make sure arguments can be passed correctly."""
  with test_util.device(use_gpu=False):
    a = constant_op.constant(10, name="a")
    b = constant_op.constant(20, name="b")
    c = math_ops.add_n([a, b], name="c")
    d = math_ops.add_n([b, c], name="d")
    train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
    train_op.append(d)
    mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
    report = cost_analyzer.GenerateMemoryReport(mg)

    # Print the report to make it easier to debug.
    print("{}".format(report))

    # Check the report.
    self.assertTrue(
        "Peak usage for device /job:localhost/replica:0/task:0/device:CPU:0: "
        "16 bytes" in report)
    self.assertTrue(" a:0 uses 4 bytes" in report)
    self.assertTrue(" b:0 uses 4 bytes" in report)
    self.assertTrue(" c:0 uses 4 bytes" in report)
    self.assertTrue(" d:0 uses 4 bytes" in report)
def sequence_loss_by_example(logits, targets, weights,
                             average_across_time=True, scope=None):
  """A simple version of weighted sequence loss measured per sequence.

  :param logits: list of 2D Tensors of shape [batch_size x num_symbols].
  :param targets: list of 1D int32 Tensors of the same length as logits.
  :param weights: list of 1D float Tensors of the same length as logits.
  :param average_across_time: if set, divide the summed loss by the total
      label weight.
  :param scope: optional variable scope name.
  :return: the (optionally time-averaged) weighted sequence loss.
  """
  if len(logits) != len(targets) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights and targets must be the same "
                     "%d, %d, %d" % (len(logits), len(weights), len(targets)))
  with tf.variable_scope(scope or "sequence_loss_by_example"):
    sequence_loss_list = []
    for logit, target, weight in zip(logits, targets, weights):
      # Keyword arguments avoid the logits/labels ordering pitfall.
      loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=target, logits=logit)
      sequence_loss_list.append(loss * weight)
    sequence_loss = math_ops.add_n(sequence_loss_list)
    if average_across_time:
      total_weight = math_ops.add_n(weights) + 1e-12
      final_loss = sequence_loss / total_weight
    else:
      final_loss = sequence_loss
    return final_loss
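# A minimal usage sketch for sequence_loss_by_example above, assuming TF 1.x
# graph mode; the shapes, vocabulary size, and dummy values are illustrative
# stand-ins, not taken from any original caller.
import tensorflow as tf

batch_size, vocab_size, num_steps = 4, 10, 3
logits = [tf.random_normal([batch_size, vocab_size]) for _ in range(num_steps)]
targets = [tf.zeros([batch_size], dtype=tf.int32) for _ in range(num_steps)]
weights = [tf.ones([batch_size]) for _ in range(num_steps)]
# Returns a [batch_size] tensor of per-example, time-averaged losses.
loss = sequence_loss_by_example(logits, targets, weights)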
def testAddN(self):
  devices = ["/cpu:0"]
  if test_util.is_gpu_available():
    devices.append("/gpu:0")
  for device in devices:
    with ops.device(device):
      # With value
      opt1 = optional_ops.Optional.from_value((1.0, 2.0))
      opt2 = optional_ops.Optional.from_value((3.0, 4.0))
      add_tensor = math_ops.add_n(
          [opt1._variant_tensor, opt2._variant_tensor])
      add_opt = optional_ops._OptionalImpl(add_tensor, opt1.value_structure)
      self.assertAllEqual(self.evaluate(add_opt.get_value()), (4.0, 6.0))

      # Without value
      opt_none1 = optional_ops.Optional.none_from_structure(
          opt1.value_structure)
      opt_none2 = optional_ops.Optional.none_from_structure(
          opt2.value_structure)
      add_tensor = math_ops.add_n(
          [opt_none1._variant_tensor, opt_none2._variant_tensor])
      add_opt = optional_ops._OptionalImpl(add_tensor,
                                           opt_none1.value_structure)
      self.assertFalse(self.evaluate(add_opt.has_value()))
def _MultiDeviceAddN(tensor_list, gradient_uid):
  """Adds tensors from potentially multiple devices."""
  # Basic function structure comes from control_flow_ops.group().
  # Sort tensors according to their devices.
  tensors_on_device = collections.defaultdict(lambda: [])
  for tensor in tensor_list:
    tensors_on_device[tensor.device].append(tensor)

  # For each device, add the tensors on that device first.
  # Then gather the partial sums from multiple devices.
  # TODO(sjhwang): Create hierarchical aggregation tree as pbar's suggestion.
  # E.g., aggregate per GPU, then per task, and so on.
  summands = []

  def DeviceKey(dev):
    return "" if dev is None else dev

  for dev in sorted(six.iterkeys(tensors_on_device), key=DeviceKey):
    tensors = tensors_on_device[dev]
    with ops._colocate_with_for_gradient(  # pylint: disable=protected-access
        tensors[0].op, gradient_uid, ignore_existing=True):
      summands.append(math_ops.add_n(tensors))

  return math_ops.add_n(summands)
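# A minimal sketch of the two-level aggregation that _MultiDeviceAddN
# performs: group summands by device, sum each group locally, then sum the
# partial sums. Plain Python stands in for tensors here, and the device
# strings are illustrative.
import collections

tensor_list = [("/gpu:0", 1.0), ("/gpu:1", 2.0), ("/gpu:0", 3.0)]
tensors_on_device = collections.defaultdict(list)
for dev, value in tensor_list:
  tensors_on_device[dev].append(value)
partial_sums = [sum(vals) for _, vals in sorted(tensors_on_device.items())]
total = sum(partial_sums)  # 6.0: same result, fewer cross-device transfers.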
def testSimpleSwap(self):
  """Check that the swap annotations are followed."""
  a = constant_op.constant(10, name='a')
  b = constant_op.constant(20, name='b')
  c = math_ops.add_n([a, b], name='c')
  d = math_ops.add_n([b, c], name='d')
  train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
  train_op.append(d)

  d.op.node_def.attr['_swap_to_host'].i = 0

  mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())

  rewriter_config = rewriter_config_pb2.RewriterConfig(
      memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
  graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)

  self.assertEqual(len(graph.node), 6)
  self.assertItemsEqual([node.name for node in graph.node], [
      'a',
      'b',
      'c',
      'd',
      'swap_in_d_0',
      'swap_out_d_0',
  ])
  for node in graph.node:
    if node.name == 'swap_in_d_0':
      self.assertEqual('swap_out_d_0', node.input[0])
      self.assertEqual('^b', node.input[1])
    elif node.name == 'swap_out_d_0':
      self.assertEqual('b', node.input[0])
    elif node.name == 'd':
      self.assertEqual('swap_in_d_0', node.input[0])
      self.assertEqual('c', node.input[1])
def MMIloss(logits, targets, weights, lm_perps, lam, gam,
            average_across_timesteps=True, softmax_loss_function=None,
            name=None):
  """MMI-scored sequence loss.

  lam is the lambda value (diversity penalty) of the objective and gam is the
  gamma value (length penalty) of the objective (see section 4.5.1 of Li et
  al.). lm_perps is the language-model log-perplexity of the targets (the
  p(T) term of the MMI objective) and must be computed by the caller.
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the same "
                     "%d, %d, %d." % (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logit, target)
      else:
        crossent = softmax_loss_function(logit, target)
      log_perp_list.append(crossent * weight)
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
    # MMI-bidi score: seq2seq loss minus the LM term, plus a length bonus.
    final_perps = log_perps - lam * lm_perps + gam * len(targets)
    return final_perps
def testSimpleSwap(self):
  """Check that the swap annotations are followed."""
  a = variables.Variable(10, name='a')
  b = variables.Variable(20, name='b')
  c = math_ops.add_n([a, b], name='c')
  d = math_ops.add_n([b, c], name='d')
  train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
  train_op.append(d)

  d.op.node_def.attr['_swap_to_host'].i = 0

  mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
  graph_size = len(mg.graph_def.node)

  rewriter_config = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
  graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)

  self.assertEqual(len(graph.node), graph_size + 2)
  self.assertTrue(
      set([node.name for node in graph.node]) > set(
          ['a', 'b', 'c', 'd', 'swap_in_d_0', 'swap_out_d_0']))
  for node in graph.node:
    if node.name == 'swap_in_d_0':
      self.assertEqual('swap_out_d_0', node.input[0])
      self.assertEqual('^b/read', node.input[1])
    elif node.name == 'swap_out_d_0':
      self.assertEqual('b/read', node.input[0])
    elif node.name == 'd':
      self.assertEqual('swap_in_d_0', node.input[0])
      self.assertEqual('c', node.input[1])
def surrogate_loss(sample_losses, stochastic_tensors=None, name="SurrogateLoss"): """Surrogate loss for stochastic graphs. This function will call `loss_fn` on each `StochasticTensor` upstream of `sample_losses`, passing the losses that it influenced. Note that currently `surrogate_loss` does not work with `StochasticTensor`s instantiated in `while_loop`s or other control structures. Args: sample_losses: a list or tuple of final losses. Each loss should be per example in the batch (and possibly per sample); that is, it should have dimensionality of 1 or greater. All losses should have the same shape. stochastic_tensors: a list of `StochasticTensor`s to add loss terms for. If None, defaults to all `StochasticTensor`s in the graph upstream of the `Tensor`s in `sample_losses`. name: the name with which to prepend created ops. Returns: `Tensor` loss, which is the sum of `sample_losses` and the `loss_fn`s returned by the `StochasticTensor`s. Raises: TypeError: if `sample_losses` is not a list or tuple, or if its elements are not `Tensor`s. ValueError: if any loss in `sample_losses` does not have dimensionality 1 or greater. """ with ops.op_scope(sample_losses, name): fixed_losses = [] if not isinstance(sample_losses, (list, tuple)): raise TypeError("sample_losses must be a list or tuple") for loss in sample_losses: if not isinstance(loss, ops.Tensor): raise TypeError("loss is not a Tensor: %s" % loss) ndims = loss.get_shape().ndims if not (ndims is not None and ndims >= 1): raise ValueError("loss must have dimensionality 1 or greater: %s" % loss) fixed_losses.append(array_ops.stop_gradient(loss)) stoch_dependencies_map = _stochastic_dependencies_map( fixed_losses, stochastic_tensors=stochastic_tensors) if not stoch_dependencies_map: logging.warn( "No collection of Stochastic Tensors found for current graph.") return math_ops.add_n(sample_losses) # Iterate through all of the stochastic dependencies, adding # surrogate terms where necessary. sample_losses = [ops.convert_to_tensor(loss) for loss in sample_losses] loss_terms = sample_losses for (stoch_node, dependent_losses) in stoch_dependencies_map.items(): loss_term = stoch_node.loss(list(dependent_losses)) if loss_term is not None: loss_terms.append(loss_term) return math_ops.add_n(loss_terms)
def MYsequence_loss_by_example(logits, targets, weights,
                               average_across_timesteps=True,
                               softmax_loss_function=None, name=None):
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the same "
                     "%d, %d, %d." % (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    log_perp_list = []
    for logit, target, weight in zip(logits, targets, weights):
      if softmax_loss_function is None:
        # TODO(irving,ebrevdo): This reshape is needed because
        # sequence_loss_by_example is called with scalars sometimes, which
        # violates our general scalar strictness policy.
        target = array_ops.reshape(target, [-1])
        crossent = nn_ops.sparse_softmax_cross_entropy_with_logits(
            logit, target)
      else:
        crossent = softmax_loss_function(logit, target)
      print(crossent, weight)  # Debug output.
      log_perp_list.append(crossent * weight)
    print(log_perp_list)  # Debug output.
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
def _full_batch_training_op(self, inputs, cluster_idx_list, cluster_centers):
  """Creates an op for training for full batch case.

  Args:
    inputs: list of input Tensors.
    cluster_idx_list: A vector (or list of vectors). Each element in the
      vector corresponds to an input row in 'inp' and specifies the cluster id
      corresponding to the input.
    cluster_centers: Tensor Ref of cluster centers.

  Returns:
    An op for doing an update of mini-batch k-means.
  """
  cluster_sums = []
  cluster_counts = []
  epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
  for inp, cluster_idx in zip(inputs, cluster_idx_list):
    with ops.colocate_with(inp):
      cluster_sums.append(
          math_ops.unsorted_segment_sum(inp, cluster_idx, self._num_clusters))
      cluster_counts.append(
          math_ops.unsorted_segment_sum(
              array_ops.reshape(
                  array_ops.ones(
                      array_ops.reshape(array_ops.shape(inp)[0], [-1])),
                  [-1, 1]), cluster_idx, self._num_clusters))
  with ops.colocate_with(cluster_centers):
    new_clusters_centers = math_ops.add_n(cluster_sums) / (math_ops.cast(
        math_ops.add_n(cluster_counts), cluster_sums[0].dtype) + epsilon)
    if self._clusters_l2_normalized():
      new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1)
  return state_ops.assign(cluster_centers, new_clusters_centers)
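# A standalone sketch of the centroid update used in the snippet above:
# per-cluster sums and counts via unsorted_segment_sum, then an
# epsilon-guarded divide. TF 1.x is assumed and the values are illustrative.
import tensorflow as tf

points = tf.constant([[1., 1.], [3., 3.], [5., 5.]])
assignments = tf.constant([0, 0, 1])
num_clusters = 2
sums = tf.unsorted_segment_sum(points, assignments, num_clusters)
counts = tf.unsorted_segment_sum(
    tf.ones_like(points[:, :1]), assignments, num_clusters)
new_centers = sums / (counts + 1e-6)  # epsilon guards empty clusters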
def testIndexedSlices(self):
  slc = ops.IndexedSlices(
      array_ops.constant([1, 2], shape=[1, 2]), array_ops.constant([1]),
      array_ops.constant([2, 2]))
  slc_as_dense = np.array([[0, 0], [1, 2]])
  with self.test_session(use_gpu=True):
    # add_n currently always converts IndexedSlices to dense
    self.assertAllEqual(slc_as_dense, math_ops.add_n([slc]).eval())
    self.assertAllEqual(2 * slc_as_dense,
                        math_ops.add_n([slc, slc]).eval())
def testFloat(self):
  np.random.seed(12345)
  for num_inputs in range(1, 10):
    x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(num_inputs)]
    tf_x = ops.convert_n_to_tensor(x)
    with self.test_session(use_gpu=True):
      self.assertAllClose(sum(x), math_ops.add_n(tf_x).eval())
      self.assertAllClose(x[0] * num_inputs,
                          math_ops.add_n([tf_x[0]] * num_inputs).eval())
def testPartials(self):
  """Test that previously revealed a bug in buffer forwarding for AddN."""
  partials = []
  for _ in range(98):
    partials.append(math_ops.add_n([constant_op.constant(1)]))
  partials.append(
      math_ops.add_n([constant_op.constant(1), constant_op.constant(1)]))
  res = math_ops.add_n(partials) + constant_op.constant(0)
  with self.test_session(use_gpu=True):
    self.assertAllEqual(res.eval(), 100)
def sequence_loss_by_example(logits, targets, weights, num_decoder_symbols,
                             average_across_timesteps=True,
                             softmax_loss_function=None, name=None):
  """Weighted cross-entropy loss for a sequence of logits (per example).

  Args:
    logits: list of 2D Tensors of shape [batch_size x num_decoder_symbols].
    targets: list of 1D batch-sized int32 Tensors of the same length as
      logits.
    weights: list of 1D batch-sized float-Tensors of the same length as
      logits.
    num_decoder_symbols: integer, number of decoder symbols (output classes).
    average_across_timesteps: If set, divide the returned cost by the total
      label weight.
    softmax_loss_function: function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is
      None).
    name: optional name for this operation, default:
      "sequence_loss_by_example".

  Returns:
    1D batch-sized float Tensor: the log-perplexity for each sequence.

  Raises:
    ValueError: if len(logits) is different from len(targets) or len(weights).
  """
  if len(targets) != len(logits) or len(weights) != len(logits):
    raise ValueError("Lengths of logits, weights, and targets must be the same "
                     "%d, %d, %d." % (len(logits), len(weights), len(targets)))
  with ops.op_scope(logits + targets + weights, name,
                    "sequence_loss_by_example"):
    batch_size = array_ops.shape(targets[0])[0]
    log_perp_list = []
    length = batch_size * num_decoder_symbols
    for i in xrange(len(logits)):
      if softmax_loss_function is None:
        # TODO(lukaszkaiser): There is no SparseCrossEntropy in TensorFlow, so
        # we need to first cast targets into a dense representation, and as
        # SparseToDense does not accept batched inputs, we need to do this by
        # re-indexing and re-sizing. When TensorFlow adds SparseCrossEntropy,
        # rewrite this method.
        indices = targets[i] + num_decoder_symbols * math_ops.range(batch_size)
        with ops.device("/cpu:0"):  # Sparse-to-dense must be on CPU for now.
          dense = sparse_ops.sparse_to_dense(
              indices, array_ops.expand_dims(length, 0), 1.0, 0.0)
        target = array_ops.reshape(dense, [-1, num_decoder_symbols])
        crossent = nn_ops.softmax_cross_entropy_with_logits(
            logits[i], target, name="SequenceLoss/CrossEntropy{0}".format(i))
      else:
        crossent = softmax_loss_function(logits[i], targets[i])
      log_perp_list.append(crossent * weights[i])
    log_perps = math_ops.add_n(log_perp_list)
    if average_across_timesteps:
      total_size = math_ops.add_n(weights)
      total_size += 1e-12  # Just to avoid division by 0 for all-0 weights.
      log_perps /= total_size
  return log_perps
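# A sketch of the re-indexing trick in the snippet above: (batch, class)
# pairs are flattened into one index vector so sparse_to_dense can build
# one-hot rows without batched input support. TF 1.x is assumed; the values
# are illustrative.
import tensorflow as tf

batch_size, num_symbols = 3, 5
targets = tf.constant([1, 4, 2])
indices = targets + num_symbols * tf.range(batch_size)  # flat one-hot slots
dense = tf.sparse_to_dense(indices, [batch_size * num_symbols], 1.0, 0.0)
one_hot = tf.reshape(dense, [-1, num_symbols])  # [batch_size, num_symbols]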
def _reduce(self, method_string, value, destinations):
  if not isinstance(value, values.MapOutput):
    return value
  l = value.get()
  assert l
  with ops.device(self._device):
    if method_string == "sum":
      return math_ops.add_n(l)
    elif method_string == "mean":
      return math_ops.add_n(l) / len(l)
    else:
      assert False
def testInt(self):
  np.random.seed(54321)
  for num_inputs in range(1, 10):
    x = [
        np.random.randint(-128, 128, (5, 4, 3, 2, 1))
        for _ in range(num_inputs)
    ]
    tf_x = ops.convert_n_to_tensor(x)
    with self.test_session(use_gpu=True):
      self.assertAllEqual(sum(x), math_ops.add_n(tf_x).eval())
      self.assertAllEqual(x[0] * num_inputs,
                          math_ops.add_n([tf_x[0]] * num_inputs).eval())
def _reduce(self, aggregation, value, destinations):
  if not isinstance(value, values.MapOutput):
    return value
  l = value.get()
  assert l
  with ops.device(self._device):
    if aggregation == vs.VariableAggregation.SUM:
      return math_ops.add_n(l)
    elif aggregation == vs.VariableAggregation.MEAN:
      return math_ops.add_n(l) / len(l)
    else:
      assert False
def _define_maximization_operation(self, num_batches): """Maximization operations.""" # TODO(xavigonzalvo): some of these operations could be moved to C++. # Compute the effective number of data points assigned to component k. with ops.control_dependencies(self._w): points_in_k = array_ops.squeeze( math_ops.add_n(self._points_in_k), axis=[0]) # Update alpha. if 'w' in self._params: final_points_in_k = points_in_k / num_batches num_examples = math_ops.cast(math_ops.reduce_sum(final_points_in_k), dtypes.float32) self._alpha_op = self._alpha.assign(final_points_in_k / (num_examples + MEPS)) else: self._alpha_op = control_flow_ops.no_op() self._train_ops = [self._alpha_op] # Update means. points_in_k_expanded = array_ops.reshape(points_in_k, [self._num_classes, 1, 1]) if 'm' in self._params: self._means_op = self._means.assign( math_ops.div( math_ops.add_n(self._w_mul_x), points_in_k_expanded + MEPS)) else: self._means_op = control_flow_ops.no_op() # means are (num_classes x 1 x dims) # Update covariances. with ops.control_dependencies([self._means_op]): b = math_ops.add_n(self._w_mul_x2) / (points_in_k_expanded + MEPS) new_covs = [] for k in range(self._num_classes): mean = self._means.value()[k, :, :] square_mean = math_ops.matmul(mean, mean, transpose_a=True) new_cov = b[k, :, :] - square_mean + self._min_var if self._covariance_type == FULL_COVARIANCE: new_covs.append(array_ops.expand_dims(new_cov, 0)) elif self._covariance_type == DIAG_COVARIANCE: new_covs.append( array_ops.expand_dims(array_ops.diag_part(new_cov), 0)) new_covs = array_ops.concat(new_covs, 0) if 'c' in self._params: # Train operations don't need to take care of the means # because covariances already depend on it. with ops.control_dependencies([self._means_op, new_covs]): self._train_ops.append( state_ops.assign( self._covs, new_covs, validate_shape=False))
def _testAllReduce(self, num_workers, num_gpus, shape, build_f):
  # Use local CPU as device for all inputs.
  num_devices = num_workers * num_gpus
  dev_list = ["/replica:0/task:0/device:CPU:0" for _ in range(num_devices)]
  with self.cached_session():
    input_tensors = self._buildInitialVars(shape, dev_list)
    un_op = lambda x: math_ops.div(
        x, constant_op.constant(num_devices, dtype=types_pb2.DT_FLOAT))
    simple_sum = math_ops.add_n(input_tensors)
    simple_sum.op.run()
    output_tensors = build_f(input_tensors, un_op)
    sum_reduced = math_ops.add_n(output_tensors)
    sum_reduced.op.run()
    self.assertAllClose(sum_reduced.eval(), self.evaluate(simple_sum))
def _get_cross_tower(self):
  all_components = tuple(self._index.values())
  # TODO(josh11b): Use a strategy-specific method.
  total = math_ops.add_n(all_components)
  if self._aggregation == vs.VariableAggregation.MEAN:
    return total * (1. / len(all_components))
  return total
def _reduce(self, aggregation, value, destinations): if values._enclosing_tpu_context() is not None: # pylint: disable=protected-access if aggregation == vs.VariableAggregation.MEAN: # TODO(jhseu): Revisit once we support model-parallelism. value *= (1. / self.num_towers) elif aggregation != vs.VariableAggregation.SUM: raise NotImplementedError( "Currently only support sum & mean in TPUStrategy.") return tpu_ops.cross_replica_sum(value) # Validate that the destination is same as the host device # Note we don't do this when in replicate context as the reduction is # performed on the TPU device itself. devices = cross_tower_ops_lib.get_devices_from(destinations) if len(devices) == 1: assert device_util.canonicalize(devices[0]) == device_util.canonicalize( self.get_host_cpu_device(0)) else: raise ValueError('Multiple devices are not supported for TPUStrategy') if aggregation == vs.VariableAggregation.ONLY_FIRST_TOWER: return value[0] output = math_ops.add_n(value) if aggregation == vs.VariableAggregation.MEAN: return output * (1. / len(value)) return output
def testAddN(self):
  l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
  l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
  l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
  result = math_ops.add_n((l1, l2, l3))
  result_t = list_ops.tensor_list_stack(result, element_dtype=dtypes.float32)
  self.assertAllEqual(self.evaluate(result_t), [9., 12.])
def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
                                         check_inf_nan):
  """Calculate the average gradient for a shared variable across all replicas.

  Note that this function provides a synchronization point across all
  replicas.

  Args:
    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
      (gradient, variable) pair within the outer list represents the gradient
      of the variable calculated for a single replica, and the number of pairs
      equals the number of replicas.
    use_mean: if True, mean is taken, else sum of gradients is taken.
    check_inf_nan: check grads for nans and infs.

  Returns:
    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
    gradient has been averaged across all replicas. The variable is chosen
    from the first replica. has_nan_or_inf indicates whether the gradients
    contain nan or inf.
  """
  grads = [g for g, _ in grad_and_vars]
  grad = math_ops.add_n(grads)

  if use_mean and len(grads) > 1:
    grad = array_ops.multiply(grad, 1.0 / len(grads))

  v = grad_and_vars[0][1]
  if check_inf_nan:
    has_nan_or_inf = math_ops.logical_not(
        math_ops.reduce_all(math_ops.is_finite(grads)))
    return (grad, v), has_nan_or_inf
  else:
    return (grad, v), None
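# A hedged usage sketch for aggregate_single_gradient_using_copy; the
# variable and the two per-replica gradients are illustrative stand-ins.
import tensorflow as tf

v = tf.Variable([0.0, 0.0])
grad_and_vars = [(tf.constant([1.0, 2.0]), v), (tf.constant([3.0, 4.0]), v)]
(avg_grad, var), has_bad = aggregate_single_gradient_using_copy(
    grad_and_vars, use_mean=True, check_inf_nan=False)
# avg_grad evaluates to [2.0, 3.0]; has_bad is None since check_inf_nan=False.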
def sequence_classifier(decoding, labels, sampling_decoding=None, name=None):
  """Returns predictions and loss for sequence of predictions.

  Args:
    decoding: List of Tensors with predictions.
    labels: List of Tensors with labels.
    sampling_decoding: Optional, List of Tensors with predictions to be used
      in sampling. E.g. they shouldn't have a dependency on outputs.
      If not provided, decoding is used.
    name: Operation name.

  Returns:
    Predictions and losses tensors.
  """
  with ops.op_scope([decoding, labels], name, "sequence_classifier"):
    predictions, xent_list = [], []
    for i, pred in enumerate(decoding):
      xent_list.append(nn.softmax_cross_entropy_with_logits(
          pred, labels[i], name="sequence_loss/xent_raw{0}".format(i)))
      if sampling_decoding:
        predictions.append(nn.softmax(sampling_decoding[i]))
      else:
        predictions.append(nn.softmax(pred))
    xent = math_ops.add_n(xent_list, name="sequence_loss/xent")
    loss = math_ops.reduce_sum(xent, name="sequence_loss")
    return array_ops.expand_concat(1, predictions), loss
def __init__(self, inputs, num_clusters, initial_clusters, distance_metric, random_seed, kmeans_plus_plus_num_retries, cluster_centers, cluster_centers_updated, cluster_centers_initialized): """Creates an op factory. Args: inputs: See KMeans constructor. num_clusters: An integer Tensor providing the number of clusters. initial_clusters: See KMeans constructor. distance_metric: See KMeans constructor. random_seed: See KMeans constructor. kmeans_plus_plus_num_retries: See KMeans constructor. cluster_centers: The TF variable holding the initial centers. It may already contain some centers when the op is executed. cluster_centers_updated: A second TF variable to hold a copy of the initial centers, used for full-batch mode. In mini-batch mode, cluster_centers_updated is the same variable as cluster_centers. cluster_centers_initialized: A boolean TF variable that will be set to true when all the initial centers have been chosen. """ # All of these instance variables are constants. self._inputs = inputs self._num_clusters = num_clusters self._initial_clusters = initial_clusters self._distance_metric = distance_metric self._random_seed = random_seed self._kmeans_plus_plus_num_retries = kmeans_plus_plus_num_retries self._cluster_centers = cluster_centers self._cluster_centers_updated = cluster_centers_updated self._cluster_centers_initialized = cluster_centers_initialized self._num_selected = array_ops.shape(self._cluster_centers)[0] self._num_remaining = self._num_clusters - self._num_selected self._num_data = math_ops.add_n( [array_ops.shape(i)[0] for i in self._inputs])
def test_distributive_property(self): """Verifies the distributive property of matrix multiplication.""" with self.cached_session(): params = constant_op.constant([.1, .2, .3]) sp_values_a = sparse_tensor_lib.SparseTensor( values=["a"], indices=[[0, 0]], dense_shape=[3, 1]) sp_values_b = sparse_tensor_lib.SparseTensor( values=["b"], indices=[[2, 0]], dense_shape=[3, 1]) sp_values_c = sparse_tensor_lib.SparseTensor( values=["c"], indices=[[2, 0]], dense_shape=[3, 1]) sp_values = sparse_tensor_lib.SparseTensor( values=["a", "b", "c"], indices=[[0, 0], [2, 0], [2, 1]], dense_shape=[3, 2]) result_a = embedding_ops._sampled_scattered_embedding_lookup_sparse( params, sp_values_a, dimension=4, hash_key=self._hash_key) result_b = embedding_ops._sampled_scattered_embedding_lookup_sparse( params, sp_values_b, dimension=4, hash_key=self._hash_key) result_c = embedding_ops._sampled_scattered_embedding_lookup_sparse( params, sp_values_c, dimension=4, hash_key=self._hash_key) result = embedding_ops._sampled_scattered_embedding_lookup_sparse( params, sp_values, dimension=4, hash_key=self._hash_key) result_abc = math_ops.add_n([result_a, result_b, result_c]) self.assertAllClose(result.eval(), result_abc.eval())
def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn, method_string): # pylint: disable=g-missing-docstring all_values = [] count = 0 for v in per_device_value._index.values(): # pylint: disable=protected-access if isinstance(v, value_lib.MapOutput): v_list = v.get() if not v_list: continue count += len(v_list) # Sum within each device before aggregating across devices. v = math_ops.add_n(v_list) else: count += 1 all_values.append(v) if not all_values: raise ValueError("`per_device_value` must be non-empty") with ops.device(reduce_to_device): with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): if method_string == "sum": reduced = accumulation_fn(all_values) elif method_string == "mean": reduced = accumulation_fn(all_values) / count else: raise ValueError("`method_string` must be 'sum' or 'mean'") return reduced
def testVariant(self): def create_constant_variant(value): return constant_op.constant( tensor_pb2.TensorProto( dtype=dtypes.variant.as_datatype_enum, tensor_shape=tensor_shape.TensorShape([]).as_proto(), variant_val=[ tensor_pb2.VariantTensorDataProto( # Match registration in variant_op_registry.cc type_name=b"int", metadata=np.array(value, dtype=np.int32).tobytes()) ])) # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant # copying between CPU and GPU is supported. with self.session(use_gpu=False): variant_const_3 = create_constant_variant(3) variant_const_4 = create_constant_variant(4) variant_const_5 = create_constant_variant(5) # 3 + 3 + 5 + 4 = 15. result = math_ops.add_n((variant_const_3, variant_const_3, variant_const_5, variant_const_4)) # Smoke test -- ensure this executes without trouble. # Right now, non-numpy-compatible objects cannot be returned from a # session.run call; similarly, objects that can't be converted to # native numpy types cannot be passed to ops.convert_to_tensor. # For now, run the test and examine the output to see that the result is # equal to 15. result_op = logging_ops.Print( result, [variant_const_3, variant_const_4, variant_const_5, result], message=("Variants stored an int: c(3), c(4), c(5), " "add_n(c(3), c(3), c(5), c(4)): ")).op result_op.run()
def _reduce(self, aggregation, value, destinations): graph = ops.get_default_graph() cf_context = graph._get_control_flow_context() # pylint: disable=protected-access # If we're inside the ReplicateContext, reduction should be done using # CrossReplicaSum while outside we can directly use an add_n op. while cf_context: if isinstance(cf_context, tpu.TPUReplicateContext): if aggregation == vs.VariableAggregation.MEAN: # TODO(jhseu): Revisit once we support model-parallelism. value *= (1. / self.num_towers) return tpu_ops.cross_replica_sum(value) cf_context = cf_context.outer_context # Validate that the destination is same as the host device # Note we don't do this when in replicate context as the reduction is # performed on the TPU device itself. devices = cross_tower_ops_lib.get_devices_from(destinations) if len(devices) == 1: assert device_util.canonicalize(devices[0]) == device_util.canonicalize( self._host) else: raise ValueError('Multiple devices are not supported for TPUStrategy') output = math_ops.add_n(value) if aggregation == vs.VariableAggregation.MEAN: return output * (1. / len(value)) return output
def approximate_duality_gap(self): """Add operations to compute the approximate duality gap. Returns: An Operation that computes the approximate duality gap over all examples. """ with name_scope('sdca/approximate_duality_gap'): _, values_list = self._hashtable.export_sharded() shard_sums = [] for values in values_list: with ops.device(values.device): # For large tables to_double() below allocates a large temporary # tensor that is freed once the sum operation completes. To reduce # peak memory usage in cases where we have multiple large tables on a # single device, we serialize these operations. # Note that we need double precision to get accurate results. with ops.control_dependencies(shard_sums): shard_sums.append( math_ops.reduce_sum(math_ops.to_double(values), 0)) summed_values = math_ops.add_n(shard_sums) primal_loss = summed_values[1] dual_loss = summed_values[2] example_weights = summed_values[3] # Note: we return NaN if there are no weights or all weights are 0, e.g. # if no examples have been processed return (primal_loss + dual_loss + self._l1_loss() + (2.0 * self._l2_loss(self._symmetric_l2_regularization())) ) / example_weights
def __call__(self, y_true, y_pred, sample_weight=None, regularization_losses=None): """Computes the overall loss. Arguments: y_true: An arbitrary structure of Tensors representing the ground truth. y_pred: An arbitrary structure of Tensors representing a Model's outputs. sample_weight: An arbitrary structure of Tensors representing the per-sample loss weights. If one Tensor is passed, it is used for all losses. If multiple Tensors are passed, the structure should match `y_pred`. regularization_losses: Additional losses to be added to the total loss. Returns: Tuple of `(total_loss, per_output_loss_list)` """ y_true = map_to_output_names(y_pred, self._output_names, y_true) sample_weight = map_to_output_names(y_pred, self._output_names, sample_weight) if not self._built: self._build(y_pred) y_true = nest.flatten(y_true) if y_true is not None else [] y_pred = nest.flatten(y_pred) # TODO(omalleyt): Remove ambiguity here. # This is currently needed to support passing only 1 loss and 1 target # to a Functional Model with multiple outputs. However, this is # ambiguous, especially with subclass, and we should reconsider how we # support this. if len(y_true) == 1 and len(y_pred) > 1: y_true = y_true * len(y_pred) sample_weight = nest.flatten(sample_weight) # Allows passing one sample-weight array for all outputs. if len(sample_weight) == 1 and len(y_pred) > 1: sample_weight = sample_weight * len(y_pred) loss_values = [] # Used for gradient calculation. loss_metric_values = [] # Used for loss metric calculation. zip_args = (y_true, y_pred, sample_weight, self._losses, self._loss_weights, self._per_output_metrics) for y_t, y_p, sw, loss_obj, loss_weight, metric_obj in zip(*zip_args): if loss_obj is None: # Ok to have no loss for an output. continue y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw) sw = apply_mask(y_p, sw) loss_value = loss_obj(y_t, y_p, sample_weight=sw) loss_metric_value = loss_value # Correct for the `Mean` loss metrics counting each replica as a batch. if loss_obj.reduction == losses_utils.ReductionV2.SUM: loss_metric_value *= ds_context.get_strategy( ).num_replicas_in_sync if metric_obj is not None: metric_obj.update_state(loss_metric_value) if loss_weight is not None: loss_value *= loss_weight loss_metric_value *= loss_weight if (loss_obj.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE or loss_obj.reduction == losses_utils.ReductionV2.AUTO): loss_value = losses_utils.scale_loss_for_distribution( loss_value) loss_values.append(loss_value) loss_metric_values.append(loss_metric_value) if regularization_losses: reg_loss = math_ops.add_n(regularization_losses) loss_metric_values.append(reg_loss) loss_values.append( losses_utils.scale_loss_for_distribution(reg_loss)) if loss_values: total_loss_metric_value = math_ops.add_n(loss_metric_values) self._loss_metric.update_state(total_loss_metric_value) total_loss = math_ops.add_n(loss_values) return total_loss else: # Ok for a model to have no compiled loss. return array_ops.zeros(shape=())
def _model_loss(model, inputs, targets, output_loss_metrics=None, sample_weights=None, training=False): """Calculates the loss for a given model. Arguments: model: The model on which metrics are being calculated. inputs: Either a dictionary of inputs to the model or a list of input arrays. targets: List of target arrays. output_loss_metrics: List of metrics that are used to aggregated output loss values. sample_weights: Optional list of sample weight arrays. training: Whether the model should be run in inference or training mode. Returns: Returns the model output, total loss, loss value calculated using the specified loss function and masks for each output. The total loss includes regularization losses and applies masking and sample weighting to the loss value. """ # TODO(psv): Dedup code here with graph mode prepare_total_loss() fn. # Used to keep track of the total loss value (stateless). # eg., total_loss = loss_weight_1 * output_1_loss_fn(...) + # loss_weight_2 * output_2_loss_fn(...) + # layer losses. total_loss = 0 kwargs = {} if model._expects_training_arg: kwargs['training'] = training if len(inputs) == 1 and not isinstance(inputs, dict): inputs = inputs[0] # Allow mixed `NumPy` and `EagerTensor` input here. if any( isinstance(input_t, (np.ndarray, float, int)) for input_t in nest.flatten(inputs)): inputs = nest.map_structure(ops.convert_to_tensor, inputs) outs = model(inputs, **kwargs) outs = nest.flatten(outs) masks = [getattr(t, '_keras_mask', None) for t in outs] targets = nest.flatten(targets) # Used to keep track of individual output losses. output_losses = [] with backend.name_scope('loss'): loss_fns = [ loss_fn for loss_fn in model.loss_functions if loss_fn is not None ] for i, loss_fn in enumerate(loss_fns): weights = sample_weights[i] if sample_weights else None mask = masks[i] with backend.name_scope(model.output_names[i] + '_loss'): if mask is not None: mask = math_ops.cast(mask, outs[i].dtype) # Update weights with mask. if weights is None: weights = mask else: # Update dimensions of weights to match with mask if possible. mask, _, weights = ( tf_losses_utils.squeeze_or_expand_dimensions( mask, sample_weight=weights)) weights *= mask if hasattr(loss_fn, 'reduction'): per_sample_losses = loss_fn.call(targets[i], outs[i]) weighted_losses = losses_utils.compute_weighted_loss( per_sample_losses, sample_weight=weights, reduction=losses_utils.ReductionV2.NONE) loss_reduction = loss_fn.reduction # `AUTO` loss reduction defaults to `SUM_OVER_BATCH_SIZE` for all # compile use cases. if loss_reduction == losses_utils.ReductionV2.AUTO: loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE # Compute the stateless loss value. output_loss = losses_utils.reduce_weighted_loss( weighted_losses, reduction=loss_reduction) else: # Compute the stateless loss value for a custom loss class. # Here we assume that the class takes care of loss reduction # because if this class returns a vector value we cannot # differentiate between use case where a custom optimizer # expects a vector loss value vs unreduced per-sample loss value. output_loss = loss_fn(targets[i], outs[i], sample_weight=weights) loss_reduction = losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE # If the number of outputs is 1 then we don't append the loss metric # associated with each model output. When there are multiple outputs # associated with a model, each output's loss is calculated and returned # as part of the loss_metrics. if len(model.outputs) > 1: # Keep track of the stateful output loss result. 
output_losses.append(output_loss_metrics[i](output_loss)) # Scale output loss for distribution. For custom losses we assume # reduction was mean. if loss_reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE: output_loss = losses_utils.scale_loss_for_distribution( output_loss) total_loss += model._loss_weights_list[i] * output_loss # Add regularization losses custom_losses = model.losses if custom_losses: total_loss += losses_utils.scale_loss_for_distribution( math_ops.add_n(custom_losses)) return outs, total_loss, output_losses, masks
def _inner_product_list(list1, list2):
  return math_ops.add_n(
      [math_ops.reduce_sum(elt1 * elt2) for elt1, elt2 in zip(list1, list2)])
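# Usage sketch: _inner_product_list computes the summed elementwise inner
# product across two parallel lists of tensors. Values are illustrative.
import tensorflow as tf

xs = [tf.constant([1.0, 2.0]), tf.constant([[1.0]])]
ys = [tf.constant([3.0, 4.0]), tf.constant([[5.0]])]
ip = _inner_product_list(xs, ys)  # 1*3 + 2*4 + 1*5 = 16.0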
def testExecuteIntAttr(self):
  three = constant_op.constant(3)
  four = constant_op.constant(4)
  total = math_ops.add_n([three, four])
  self.assertAllEqual(7, total)
def size(self, name=None):
  with ops.name_scope(name, 'sharded_mutable_hash_table_size'):
    sizes = [
        self._table_shards[i].size() for i in range(self._num_shards)
    ]
    return math_ops.add_n(sizes)
def _scaled_dot_product(scale, xs, ys, name=None):
  """Calculate a scaled, vector inner product between lists of Tensors."""
  # Terms whose factor is a structural zero are skipped to avoid wasted
  # computation.
  return math_ops.add_n([(scale * x) * y
                         for x, y in zip(xs, ys)
                         if _possibly_nonzero(x) and _possibly_nonzero(y)],
                        name=name or 'scaled_dot_product')
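# A sketch of the zero-skipping behavior _scaled_dot_product relies on: terms
# with a structurally zero factor are dropped before add_n. The predicate
# below is a simplified stand-in for the module's _possibly_nonzero helper,
# and the numbers are illustrative.
import tensorflow as tf

def _possibly_nonzero_sketch(x):
  return not (isinstance(x, (int, float)) and x == 0)

xs, ys = [0, 1.0, 2.0], [5.0, 6.0, 7.0]
terms = [tf.constant((2.0 * x) * y) for x, y in zip(xs, ys)
         if _possibly_nonzero_sketch(x) and _possibly_nonzero_sketch(y)]
sdp = tf.add_n(terms)  # 12.0 + 28.0 = 40.0; the x == 0 term never appears.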
def create_estimator_spec( self, features, mode, logits, labels=None, train_op_fn=None, regularization_losses=None): """Returns an `EstimatorSpec`. Args: features: Input `dict` of `Tensor` or `SparseTensor` objects. mode: Estimator's `ModeKeys`. logits: logits `Tensor` with shape `[D0, D1, ... DN, n_classes]`. For many applications, the shape is `[batch_size, n_classes]`. labels: Labels with shape matching `logits`. Can be multi-hot `Tensor` with shape `[D0, D1, ... DN, n_classes]` or `SparseTensor` with `dense_shape` `[D0, D1, ... DN, ?]`. `labels` is required argument when `mode` equals `TRAIN` or `EVAL`. train_op_fn: Function that takes a scalar loss `Tensor` and returns `train_op`. Required in TRAIN mode. regularization_losses: A list of additional scalar losses to be added to the training loss, such as regularization losses. These losses are usually expressed as a batch average, so for best results users need to set `loss_reduction=SUM_OVER_BATCH_SIZE` or `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to avoid scaling errors. Returns: `EstimatorSpec`. Raises: ValueError: If `train_op_fn` is `None` in TRAIN mode. """ with ops.name_scope(self._name, 'head'): logits = head_lib._check_logits_final_dim(logits, self.logits_dimension) # pylint:disable=protected-access # Predict. pred_keys = prediction_keys.PredictionKeys with ops.name_scope(None, 'predictions', (logits,)): probabilities = math_ops.sigmoid(logits, name=pred_keys.PROBABILITIES) predictions = { pred_keys.LOGITS: logits, pred_keys.PROBABILITIES: probabilities, } if mode == model_fn.ModeKeys.PREDICT: classifier_output = head_lib._classification_output( # pylint:disable=protected-access scores=probabilities, n_classes=self._n_classes, label_vocabulary=self._label_vocabulary) return model_fn.EstimatorSpec( mode=model_fn.ModeKeys.PREDICT, predictions=predictions, export_outputs={ _DEFAULT_SERVING_KEY: classifier_output, head_lib._CLASSIFY_SERVING_KEY: classifier_output, # pylint:disable=protected-access head_lib._PREDICT_SERVING_KEY: ( # pylint:disable=protected-access export_output.PredictOutput(predictions)) }) (training_loss, unreduced_loss, weights, processed_labels) = self.create_loss( features=features, mode=mode, logits=logits, labels=labels) if regularization_losses: regularization_loss = math_ops.add_n(regularization_losses) regularized_training_loss = math_ops.add_n( [training_loss, regularization_loss]) else: regularization_loss = None regularized_training_loss = training_loss # Eval. if mode == model_fn.ModeKeys.EVAL: return model_fn.EstimatorSpec( mode=model_fn.ModeKeys.EVAL, predictions=predictions, loss=regularized_training_loss, eval_metric_ops=self._eval_metric_ops( labels=processed_labels, probabilities=probabilities, weights=weights, unreduced_loss=unreduced_loss, regularization_loss=regularization_loss)) # Train. if train_op_fn is None: raise ValueError('train_op_fn can not be None.') # Only summarize mean_loss for SUM reduction to preserve backwards # compatibility. Otherwise skip it to avoid unnecessary computation. 
if self._loss_reduction == losses.Reduction.SUM: example_weight_sum = math_ops.reduce_sum( weights * array_ops.ones_like(unreduced_loss)) mean_loss = training_loss / example_weight_sum else: mean_loss = None with ops.name_scope(''): keys = metric_keys.MetricKeys summary.scalar( head_lib._summary_key(self._name, keys.LOSS), # pylint:disable=protected-access regularized_training_loss) if mean_loss is not None: summary.scalar( head_lib._summary_key(self._name, keys.LOSS_MEAN), # pylint:disable=protected-access mean_loss) if regularization_loss is not None: summary.scalar( head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION), # pylint:disable=protected-access regularization_loss) return model_fn.EstimatorSpec( mode=model_fn.ModeKeys.TRAIN, predictions=predictions, loss=regularized_training_loss, train_op=train_op_fn(regularized_training_loss))
def variational_beam_decoder_with_buckets(means, logvars, decoder_inputs,
                                          targets, weights, buckets, decoder,
                                          latent_dec, kl_f, sample, iaf=False,
                                          softmax_loss_function=None,
                                          per_example_loss=False, name=None):
  """Create a sequence-to-sequence model with support for bucketing."""
  if len(targets) < buckets[-1][1]:
    raise ValueError("Length of targets (%d) must be at least that of last "
                     "bucket (%d)." % (len(targets), buckets[-1][1]))
  if len(weights) < buckets[-1][1]:
    raise ValueError("Length of weights (%d) must be at least that of last "
                     "bucket (%d)." % (len(weights), buckets[-1][1]))

  all_inputs = decoder_inputs + targets + weights
  losses = []
  outputs = []
  beam_paths = []
  beam_symbols = []
  KL_divergences = []
  with ops.name_scope(name, "variational_decoder_with_buckets", all_inputs):
    for j, bucket in enumerate(buckets):
      with variable_scope.variable_scope(
          variable_scope.get_variable_scope(), reuse=True if j > 0 else None):
        latent_vector, kl_cost = sample(means[j], logvars[j])
        decoder_initial_state = latent_dec(latent_vector)
        bucket_outputs, _, beam_path, beam_symbol = decoder(
            decoder_initial_state, decoder_inputs[:bucket[1]])
        outputs.append(bucket_outputs)
        beam_paths.append(beam_path)
        beam_symbols.append(beam_symbol)
        total_size = math_ops.add_n(weights[:bucket[1]])
        total_size += 1e-12
        KL_divergences.append(tf.reduce_mean(kl_cost / total_size))

        if per_example_loss:
          losses.append(
              sequence_loss_by_example(
                  outputs[-1], targets[:bucket[1]], weights[:bucket[1]],
                  softmax_loss_function=softmax_loss_function))
        else:
          losses.append(
              sequence_loss(
                  outputs[-1], targets[:bucket[1]], weights[:bucket[1]],
                  softmax_loss_function=softmax_loss_function))

  return outputs, losses, beam_paths, beam_symbols, KL_divergences
def __init__(self, config, name_scope, dtype=tf.float32): # with tf.variable_scope(name_or_scope=scope_name): with tf.device("/gpu:0"): emb_dim = config.embed_dim word_embedding = config.word_embedding num_layers = config.num_layers vocab_size = config.vocab_size buckets = config.buckets self.learning_rate = tf.Variable(float(config.learning_rate), trainable=False, dtype=dtype) self.global_step = tf.Variable(initial_value=0, trainable=False) self.query = [] self.answer = [] self.weight = [] for i in range(buckets[-1][0]): self.query.append(tf.placeholder(dtype=tf.int32, shape=[None], name="query{0}".format(i))) for i in xrange(buckets[-1][1]): self.answer.append(tf.placeholder(dtype=tf.int32, shape=[None], name="answer{0}".format(i))) for i in xrange(buckets[-1][1]): self.weight.append(tf.placeholder(dtype=tf.float32, shape=[None], name="weight{0}".format(i))) self.traj_ip_weight = tf.placeholder(dtype=tf.float32, shape=[None], name="traj_weight") # self.target = tf.placeholder(dtype=tf.int64, shape=[None], name="target") def create_rnn_cell(): # encoDecoCell = tf.contrib.rnn.GRUCell( # Or GRUCell, LSTMCell(args.hiddenSize) encoDecoCell = tf.nn.rnn_cell.GRUCell( # Or GRUCell, LSTMCell(args.hiddenSize) emb_dim, ) encoDecoCell = tf.contrib.rnn.DropoutWrapper( encoDecoCell, input_keep_prob=1.0, output_keep_prob=config.keep_prob ) return encoDecoCell # ''' encoder_mutil = tf.contrib.rnn.MultiRNNCell( [create_rnn_cell() for _ in range(num_layers)], ) # ''' query_encoder_emb = EmbeddingWrapper_GPU(encoder_mutil, embedding_classes=vocab_size, embedding_size=word_embedding) context_multi = tf.contrib.rnn.MultiRNNCell( [create_rnn_cell() for _ in range(1)], ) self.b_query_state = [] self.b_answer_state = [] self.b_state = [] self.b_reward = [] self.b_loss = [] self.b_train_op = [] self.b_traj_reward = [] # with tf.name_scope('structure'): for i, bucket in enumerate(buckets): state_list = [] reward_list = [] with tf.variable_scope(name_or_scope="Hier_RNN_encoder", reuse=True if i > 0 else None) as scope: query_output, query_state = tf.contrib.rnn.static_rnn(query_encoder_emb, inputs=self.query[:bucket[0]], dtype=tf.float32) self.b_query_state.append(query_state) with tf.variable_scope("Hier_RNN_encoder/rnn/embedding_wrapper", reuse=True): embed_in = tf.get_variable("embedding") emb_answer = [ embedding_ops.embedding_lookup(embed_in, ix) for ix in self.answer[:bucket[1]]] with tf.variable_scope(name_or_scope="Hier_RNN_context", reuse=True if i > 0 else None) as var_scope: ''' utilize the state from last step which record the hidden state of each encoding step ''' query_state_history = query_state[-1] context_action_history = [] for j in range(0, bucket[1]): if j > 0: var_scope.reuse_variables() action = emb_answer[j] emb_proj_w = tf.get_variable("embd_project_w", [word_embedding, emb_dim], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) emb_proj_b = tf.get_variable("embd_project_b", [emb_dim], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) projected = tf.matmul(action, emb_proj_w) + emb_proj_b context_action_history.append(projected) with tf.variable_scope(name_or_scope="Reward_concat_layer", reuse=True if i > 0 else None) as var_scope: context_input = [query_state_history] + context_action_history output, state = tf.contrib.rnn.static_rnn(context_multi, context_input, dtype=tf.float32) for j in range(0, bucket[1]): state_action_pair = [output[j], context_action_history[j]] state_list.append(state_action_pair) self.b_state.append(state_list) with 
tf.variable_scope("Softmax_layer_and_output", reuse=True if i > 0 else None) as var_scope: for j in range(0, bucket[1]): if j > 0: var_scope.reuse_variables() softmax_w1_s = tf.get_variable("softmax_w1_s", [emb_dim, 100], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_b1_s = tf.get_variable("softmax_b1_s", 100, dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_w1_a = tf.get_variable("softmax_w1_a", [emb_dim, 100], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_b1_a = tf.get_variable("softmax_b1_a", 100, dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) s_1 = tf.matmul(state_list[j][0], softmax_w1_s) + softmax_b1_s a_1 = tf.matmul(state_list[j][1], softmax_w1_a) + softmax_b1_a s_a_1 = tf.concat([s_1, a_1], 1) softmax_w3 = tf.get_variable("softmax_w3", [200, 100], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_b3 = tf.get_variable("softmax_b3", 100, dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_w4 = tf.get_variable("softmax_w4", [100, 50], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_b4 = tf.get_variable("softmax_b4", 50, dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_w5 = tf.get_variable("softmax_w5", [50, 1], dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) softmax_b5 = tf.get_variable("softmax_b5", 1, dtype=tf.float32, initializer=tf.random_normal_initializer(stddev=0.1)) logits_mid1 = tf.matmul(s_a_1, softmax_w3) + softmax_b3 logits_mid2 = tf.matmul(logits_mid1, softmax_w4) + softmax_b4 logits = tf.matmul(logits_mid2, softmax_w5) + softmax_b5 reward = tf.nn.sigmoid(logits) reward = tf.reshape(reward, [-1]) # print(reward.get_shape()) reward = tf.multiply(reward, self.weight[j]) # print(self.weight[j].get_shape()) reward_list.append(reward) self.b_reward.append(reward_list) with tf.name_scope("loss"): traj_reward = math_ops.add_n(reward_list) loss = tf.multiply(traj_reward, self.traj_ip_weight) # mean_loss = tf.reduce_mean(loss) mean_loss = tf.reduce_sum(loss) self.b_loss.append(mean_loss) self.b_traj_reward.append(traj_reward) with tf.name_scope("gradient_descent"): # ''' optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate) ''' optimizer = tf.train.AdamOptimizer( learning_rate=self.learning_rate, beta1=0.9, beta2=0.999, epsilon=1e-08 ) ''' gradients, variables = zip(*optimizer.compute_gradients(mean_loss)) gradients, _ = tf.clip_by_global_norm(gradients, 5.0) train_op = optimizer.apply_gradients(zip(gradients, variables), global_step=self.global_step) self.b_train_op.append(train_op) all_variables = [v for v in tf.global_variables() if name_scope in v.name] self.saver = tf.train.Saver(all_variables)
def _AggregatedGrads(grads, op, loop_state, aggregation_method=None): """Get the aggregated gradients for op. Args: grads: The map of memoized gradients. op: The op to get gradients for. loop_state: An object for maintaining the state of the while loops in the graph. It is of type ControlFlowState. None if the graph contains no while loops. aggregation_method: Specifies the method used to combine gradient terms. Accepted values are constants defined in the class `AggregationMethod`. Returns: A list of gradients, one per each output of `op`. If the gradients for a particular output is a list, this function aggregates it before returning. Raises: TypeError: if the incoming grads are not Tensors or IndexedSlices. ValueError: if the arguments are invalid. """ if aggregation_method is None: aggregation_method = AggregationMethod.DEFAULT if aggregation_method not in [ AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE, AggregationMethod.EXPERIMENTAL_ACCUMULATE_N ]: raise ValueError("Invalid aggregation_method specified %s." % aggregation_method) out_grads = _GetGrads(grads, op) for i, out_grad in enumerate(out_grads): if loop_state: if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)): assert control_flow_ops.IsLoopSwitch(op) continue # Grads have to be Tensors or IndexedSlices if (isinstance(out_grad, collections.Sequence) and not all([ isinstance(g, (ops.Tensor, ops.IndexedSlices)) for g in out_grad if g is not None ])): raise TypeError("gradients have to be either all Tensors " "or all IndexedSlices") # Aggregate multiple gradients, and convert [] to None. if out_grad: if len(out_grad) < 2: used = "nop" out_grads[i] = out_grad[0] elif all([isinstance(g, ops.Tensor) for g in out_grad if g is not None]): tensor_shape = _AccumulatorShape(out_grad) if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N and len(out_grad) > 2 and tensor_shape.is_fully_defined()): # The benefit of using AccumulateN is that its inputs can be combined # in any order and this can allow the expression to be evaluated with # a smaller memory footprint. When used with gpu_allocator_retry, # it is possible to compute a sum of terms which are much larger than # total GPU memory. # AccumulateN can currently only be used if we know the shape for # an accumulator variable. If this is not known, or if we only have # 2 grads then we fall through to the "tree" case below. used = "accumulate_n" out_grads[i] = math_ops.accumulate_n(out_grad) elif aggregation_method in [ AggregationMethod.EXPERIMENTAL_TREE, AggregationMethod.EXPERIMENTAL_ACCUMULATE_N ]: # Aggregate all gradients by doing pairwise sums: this may # reduce performance, but it can improve memory because the # gradients can be released earlier. # # TODO(vrv): Consider replacing this with a version of # tf.AddN() that eagerly frees its inputs as soon as they are # ready, so the order of this tree does not become a problem. used = "tree" with ops.name_scope(op.name + "_gradient_sum"): running_sum = out_grad[0] for grad in out_grad[1:]: running_sum = math_ops.add_n([running_sum, grad]) out_grads[i] = running_sum else: used = "add_n" out_grads[i] = _MultiDeviceAddN(out_grad) logging.vlog(2, " _AggregatedGrads %d x %s using %s", len(out_grad), tensor_shape, used) else: out_grad = math_ops._as_indexed_slices_list( [g for g in out_grad if g is not None]) out_grad = [_HandleNestedIndexedSlices(x) for x in out_grad] # Form IndexedSlices out of the concatenated values and # indices. 
out_grads[i] = ops.IndexedSlices( array_ops.concat([x.values for x in out_grad], 0), array_ops.concat([x.indices for x in out_grad], 0), out_grad[0].dense_shape) else: # not out_grad # out_grads[i] is [], thus its aggregation is simply None. out_grads[i] = None return out_grads
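# A sketch contrasting the three dense aggregation paths chosen in
# _AggregatedGrads above, applied to a plain list of gradients (TF 1.x graph
# mode assumed; the values are illustrative).
import tensorflow as tf

grads = [tf.constant([1.0, 2.0]) for _ in range(4)]
add_n_sum = tf.add_n(grads)       # the "add_n" path: one fused op
acc_sum = tf.accumulate_n(grads)  # the "accumulate_n" path: order-free sums
tree_sum = grads[0]               # the "tree" path: pairwise running sums
for g in grads[1:]:
  tree_sum = tf.add_n([tree_sum, g])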
def weighted_sum_from_feature_columns(columns_to_tensors, feature_columns, num_outputs, weight_collections=None, trainable=True, scope=None): """A tf.contrib.layers style linear prediction builder based on FeatureColumn. Generally a single example in training data is described with feature columns. This function generates weighted sum for each num_outputs. Weighted sum refers to logits in classification problems. It refers to prediction itself for linear regression problems. Example: ``` # Building model for training feature_columns = ( real_valued_column("my_feature1"), ... ) columns_to_tensor = tf.parse_example(...) logits = weighted_sum_from_feature_columns( columns_to_tensors=columns_to_tensor, feature_columns=feature_columns, num_outputs=1) loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits) ``` Args: columns_to_tensors: A mapping from feature column to tensors. 'string' key means a base feature (not-transformed). It can have FeatureColumn as a key too. That means that FeatureColumn is already transformed by input pipeline. For example, `inflow` may have handled transformations. feature_columns: A set containing all the feature columns. All items in the set should be instances of classes derived from FeatureColumn. num_outputs: An integer specifying number of outputs. Default value is 1. weight_collections: List of graph collections to which weights are added. trainable: If `True` also add variables to the graph collection `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable). scope: Optional scope for variable_scope. Returns: A tuple containing: * A Tensor which represents predictions of a linear model. * A dictionary which maps feature_column to corresponding Variable. * A Variable which is used for bias. Raises: ValueError: if FeatureColumn cannot be used for linear predictions. 
""" columns_to_tensors = columns_to_tensors.copy() check_feature_columns(feature_columns) with variable_scope.variable_scope( scope, default_name='weighted_sum_from_feature_columns', values=columns_to_tensors.values()): output_tensors = [] column_to_variable = dict() transformer = _Transformer(columns_to_tensors) # pylint: disable=protected-access for column in sorted(set(feature_columns), key=lambda x: x.key): transformed_tensor = transformer.transform(column) try: embedding_lookup_arguments = column._wide_embedding_lookup_arguments( transformed_tensor) variable, predictions = _create_embedding_lookup( column, columns_to_tensors, embedding_lookup_arguments, num_outputs, trainable, weight_collections) except NotImplementedError: with variable_scope.variable_scope( None, default_name=column.name, values=columns_to_tensors.values()): tensor = column._to_dense_tensor(transformed_tensor) tensor = _maybe_reshape_input_tensor(tensor, column.name, output_rank=2) variable = [ contrib_variables.model_variable( name='weight', shape=[tensor.get_shape()[1], num_outputs], initializer=init_ops.zeros_initializer(), trainable=trainable, collections=weight_collections) ] predictions = math_ops.matmul(tensor, variable[0], name='matmul') except ValueError as ee: raise ValueError( 'Error creating weighted sum for column: {}.\n' '{}'.format(column.name, ee)) output_tensors.append( array_ops.reshape(predictions, shape=(-1, num_outputs))) column_to_variable[column] = variable _log_variable(variable) _maybe_restore_from_checkpoint(column._checkpoint_path(), variable) # pylint: enable=protected-access predictions_no_bias = math_ops.add_n(output_tensors) bias = contrib_variables.model_variable( 'bias_weight', shape=[num_outputs], initializer=init_ops.zeros_initializer(), trainable=trainable, collections=_add_variable_collection(weight_collections)) _log_variable(bias) predictions = nn_ops.bias_add(predictions_no_bias, bias) return predictions, column_to_variable, bias
def add_all(*args):
  # add_n takes a single list of tensors; unpacking with `*args` would pass a
  # second tensor as add_n's `name` argument, so collect the varargs instead.
  return math_ops.add_n(list(args))
def _dot_product(xs, ys, name=None):
  """Calculate the vector inner product between two lists of Tensors."""
  return math_ops.add_n([x * y for x, y in zip(xs, ys)], name=name)
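# Usage sketch for _dot_product, assuming eager execution and
# `import tensorflow as tf`. Note that the "inner product" runs over the
# *list* dimension: the result keeps the shape of the individual tensors,
# so all products x * y must share one shape for add_n to accept them.
xs = [tf.constant([1.0, 2.0]), tf.constant([3.0, 4.0])]
ys = [tf.constant([5.0, 6.0]), tf.constant([7.0, 8.0])]
print(_dot_product(xs, ys).numpy())  # [1*5 + 3*7, 2*6 + 4*8] = [26. 44.]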
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Arguments:
    model: The model on which metrics are being calculated.
    inputs: Either a dictionary of inputs to the model or a list of input
      arrays.
    targets: List of target arrays.
    output_loss_metrics: List of metrics that are used to aggregate output
      loss values.
    sample_weights: Optional list of sample weight arrays.
    training: Whether the model should be run in inference or training mode.

  Returns:
    The model output, total loss, the loss value calculated using the
    specified loss function, and the masks for each output. The total loss
    includes regularization losses and applies masking and sample weighting
    to the loss value.
  """
  # Used to keep track of the total loss value (stateless).
  # e.g., total_loss = loss_weight_1 * output_1_loss_fn(...) +
  #                    loss_weight_2 * output_2_loss_fn(...) +
  #                    layer losses.
  total_loss = 0
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  # Allow mixed `NumPy` and `EagerTensor` input here.
  if any(
      isinstance(input_t, (np.ndarray, float, int))
      for input_t in nest.flatten(inputs)):
    inputs = nest.map_structure(ops.convert_to_tensor, inputs)

  outs = model(inputs, **kwargs)
  outs = nest.flatten(outs)
  # `None` by default for `EagerTensors`.
  masks = [t._keras_mask for t in outs]
  targets = nest.flatten(targets)

  # Used to keep track of individual output losses.
  output_losses = []

  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      weights = sample_weights[i] if sample_weights else None
      mask = masks[i]
      with backend.name_scope(model.output_names[i] + '_loss'):
        if mask is not None:
          mask = math_ops.cast(mask, outs[i].dtype)
          # Update weights with mask.
          if weights is None:
            weights = mask
          else:
            # Update dimensions of weights to match with mask if possible.
            mask, _, weights = (
                losses_utils.squeeze_or_expand_dimensions(mask, None, weights))
            weights *= mask

        # Reset reduction on the loss so that we can get the per sample loss
        # value. We use this to get both the stateless and stateful loss
        # values without having to compute the underlying loss function
        # twice.
        weighted_losses = None
        if hasattr(loss_fn, 'reduction'):
          current_loss_reduction = loss_fn.reduction
          loss_fn.reduction = losses_utils.ReductionV2.NONE
          weighted_losses = loss_fn(targets[i], outs[i], sample_weight=weights)
          loss_fn.reduction = current_loss_reduction

          # Compute the stateless loss value.
          output_loss = losses_utils.reduce_weighted_loss(weighted_losses)
        else:
          # Compute the stateless loss value for a custom loss class.
          # Here we assume that the class takes care of loss reduction
          # because if this class returns a vector value we cannot
          # differentiate between use case where a custom optimizer
          # expects a vector loss value vs unreduced per-sample loss value.
          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)

        # If the number of outputs is 1 then we don't append the loss metric
        # associated with each model output. When there are multiple outputs
        # associated with a model, each output's loss is calculated and
        # returned as part of the loss_metrics.
        if len(model.outputs) > 1:
          # Compute the stateful loss value.
          if weighted_losses is not None:
            aggregated_output_loss = output_loss_metrics[i](weighted_losses)
          else:
            # Custom loss class.
            aggregated_output_loss = training_utils.call_metric_function(
                output_loss_metrics[i], targets[i], outs[i], weights=weights)
          # Keep track of the stateful output loss result.
          output_losses.append(aggregated_output_loss)

        total_loss += model.loss_weights_list[i] * output_loss

  total_loss = backend.mean(total_loss)
  # Add regularization losses.
  custom_losses = model.losses
  if custom_losses:
    total_loss += losses_utils.scale_loss_for_distribution(
        math_ops.add_n(custom_losses))

  return outs, total_loss, output_losses, masks
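# A distilled public-API sketch of the aggregation performed above, with
# hypothetical loss values (scale_loss_for_distribution is omitted here; it
# only rescales by the replica count under tf.distribute):
import tensorflow as tf

per_output_losses = [tf.constant(0.25), tf.constant(0.75)]
loss_weights = [1.0, 0.5]
total = tf.add_n([w * l for w, l in zip(loss_weights, per_output_losses)])
regularizers = [tf.constant(0.01), tf.constant(0.02)]
total += tf.add_n(regularizers)  # same role as math_ops.add_n(custom_losses)
print(float(total))  # 0.25 + 0.375 + 0.03 = 0.655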
def read_var(self, tower_local_var):
  """Read the aggregate value of a tower-local variable."""
  if isinstance(tower_local_var, values.TowerLocalVariable):
    return math_ops.add_n(self.unwrap(tower_local_var))
  assert isinstance(tower_local_var, values.Mirrored)
  return array_ops.identity(tower_local_var.get())
def _calculate_t1_cond_values(self, alpha, beta, eta, lambd, scaled_s_dg_db,
                              scaled_s_dg_dl, graph, t):
  """Calculates the values used on the t > 1 branch of the update.

  :return: Assignments made and the values needed by later calculations.
  """
  assignments = []
  cond_t1 = math_ops.greater(t, 1.0)
  s_dt_db_norm = math_ops.sqrt(
      math_ops.add_n([
          math_ops.reduce_sum(self.get_slot(v, "s_dt_db")**2.0)
          for v in self._vars
      ]))
  s_dt_dl_norm = math_ops.sqrt(
      math_ops.add_n([
          math_ops.reduce_sum(self.get_slot(v, "s_dt_dl")**2.0)
          for v in self._vars
      ]))
  alpha0 = control_flow_ops.cond(
      cond_t1,
      lambda: math_ops.log(0.5 * math_ops.minimum(
          s_dt_db_norm / math_ops.sqrt(
              math_ops.add_n(
                  [math_ops.reduce_sum(sgb**2.0) for sgb in scaled_s_dg_db])),
          s_dt_dl_norm / math_ops.sqrt(
              math_ops.add_n(
                  [math_ops.reduce_sum(sgl**2.0) for sgl in scaled_s_dg_dl])))),
      lambda: float('Inf'))
  cond_a0 = math_ops.greater(alpha, alpha0)
  alpha = control_flow_ops.cond(
      math_ops.logical_and(cond_t1, cond_a0),
      lambda: alpha - 2.0 * self._delta_t, lambda: alpha)
  beta = control_flow_ops.cond(
      math_ops.logical_and(cond_t1, cond_a0),
      lambda: math_ops.exp(alpha0), lambda: beta)
  eg2 = self._get_non_slot_variable("e_g2", graph)
  em2 = self._get_non_slot_variable("e_m2", graph)
  gamma = control_flow_ops.cond(
      cond_t1,
      lambda: math_ops.minimum(
          1.0,
          math_ops.minimum(self._C_t * eg2 / s_dt_db_norm**2.0,
                           self._C_t * em2 / s_dt_dl_norm**2.0)),
      lambda: self._get_non_slot_variable("gamma", graph))
  cond_gl = math_ops.greater(lambd, gamma)
  eta = control_flow_ops.cond(
      math_ops.logical_and(cond_t1, cond_gl),
      lambda: eta - 2.0 * self._delta_t, lambda: eta)
  lambd = control_flow_ops.cond(
      math_ops.logical_and(cond_t1, cond_gl), lambda: gamma, lambda: lambd)
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("alpha", graph), alpha))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("beta", graph), beta))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("eta", graph), eta))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("lambda", graph), lambd))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("gamma", graph), gamma))
  return assignments, beta, lambd, gamma
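# The s_dt_db_norm / s_dt_dl_norm expressions above compute a global L2 norm
# over a list of tensors. A minimal public-API equivalent with hypothetical
# inputs; tf.linalg.global_norm computes the same quantity:
import tensorflow as tf

ts = [tf.constant([3.0, 0.0]), tf.constant([[0.0], [4.0]])]
norm = tf.sqrt(tf.add_n([tf.reduce_sum(t ** 2.0) for t in ts]))
print(float(norm))                       # 5.0
print(float(tf.linalg.global_norm(ts)))  # 5.0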
def _model_loss(model,
                inputs,
                targets,
                output_loss_metrics=None,
                sample_weights=None,
                training=False):
  """Calculates the loss for a given model.

  Arguments:
    model: The model on which metrics are being calculated.
    inputs: Either a dictionary of inputs to the model or a list of input
      arrays.
    targets: List of target arrays.
    output_loss_metrics: List of metrics that are used to aggregate output
      loss values.
    sample_weights: Optional list of sample weight arrays.
    training: Whether the model should be run in inference or training mode.

  Returns:
    The model output, total loss, the loss value calculated using the
    specified loss function, and the masks for each output. The total loss
    includes regularization losses and applies masking and sample weighting
    to the loss value.
  """
  # Start from None so the accumulation branch below is reachable.
  total_loss = None
  kwargs = {}
  if model._expects_training_arg:
    kwargs['training'] = training
  if len(inputs) == 1 and not isinstance(inputs, dict):
    inputs = inputs[0]

  if model._compute_output_and_mask_jointly:
    outs, masks = model._call_and_compute_mask(inputs, **kwargs)
    masks = generic_utils.to_list(masks)
  else:
    outs = model.call(inputs, **kwargs)
    masks = None

  outs = generic_utils.to_list(outs)
  if masks is None:
    masks = [None for _ in outs]
  targets = generic_utils.to_list(targets)

  loss_metrics = []
  aggregated_loss_metrics = []
  with backend.name_scope('loss'):
    for i, loss_fn in enumerate(model.loss_functions):
      if sample_weights:
        weights = sample_weights[i]
      else:
        weights = None
      mask = masks[i]

      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
      with backend.name_scope(model.output_names[i] + '_loss'):
        output_loss = weighted_masked_fn(targets[i], outs[i], weights,
                                         mask=mask)
      # If the number of outputs is 1 then we don't append the loss metric
      # associated with each model output. When there are multiple outputs
      # associated with a model, each output's loss is calculated and
      # returned as part of the loss_metrics.
      if len(model.outputs) > 1:
        loss_metrics.append(backend.mean(output_loss))

        if output_loss_metrics is not None:
          # Keep track of the stateful loss result.
          aggregated_loss_metrics.append(
              training_utils.call_metric_function(
                  output_loss_metrics[i],
                  targets[i],
                  outs[i],
                  weights=weights,
                  mask=mask))

      loss_weight = model.loss_weights_list[i]
      if total_loss is None:
        total_loss = loss_weight * output_loss
      else:
        total_loss += loss_weight * output_loss

  total_loss = backend.mean(total_loss)
  # Add regularization losses.
  custom_losses = model.losses
  if custom_losses:
    total_loss += math_ops.add_n(custom_losses)
  model._clear_losses()

  return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
def _update_vars_and_estimators(self, scaled_g, beta, prev_lambd, lambd,
                                gamma, mu, scaled_s_dg_db, scaled_s_dg_dl,
                                dq_db, dq_dl, dbj_dmu, w_t, graph):
  """Updates variables and estimator values.

  :return: Assignments of new values.
  """
  assignments = []
  momentum_values = []
  for v, ov, g, sgb, sgl in zip(self._vars, self._original_vars, scaled_g,
                                scaled_s_dg_db, scaled_s_dg_dl):
    sbtb = self.get_slot(v, "s_dbt_db")
    sbtl = self.get_slot(v, "s_dbt_dl")
    p = self.get_slot(v, "phi")
    btmu = self.get_slot(v, "dbt_dmu")
    stb = self.get_slot(v, "s_dt_db")
    stl = self.get_slot(v, "s_dt_dl")
    smb = self.get_slot(v, "s_dm_db")
    sml = self.get_slot(v, "s_dm_dl")
    m = self.get_slot(v, "momentum")
    if not self._use_ag:
      s_dbt_db = mu * gamma * sbtb + (1.0 - mu) * gamma * stb
      s_dbt_dl = mu * gamma * sbtl + (1.0 - mu) * gamma * stl
      dbt_dmu = -p + mu * btmu
      s_dm_db = lambd * gamma * smb - g - beta * gamma * sgb
      s_dm_dl = m + lambd * gamma * sml - beta * gamma * sgl
      momentum = lambd * m - beta * g
      phi = mu * p + momentum
      new_v = ov + momentum - phi
      s_dt_db = gamma * stb + s_dm_db
      s_dt_dl = gamma * stl + s_dm_dl
    else:
      s_dbt_db = mu * gamma * sbtb + (1.0 - mu) * gamma * (stb - lambd * smb)
      s_dbt_dl = (mu * gamma * sbtl +
                  (1.0 - mu) * (gamma * stl - m - lambd * gamma * sml))
      dbt_dmu = -p + mu * btmu
      s_dm_db = prev_lambd * gamma * smb - g - beta * gamma * sgb
      s_dm_dl = m + prev_lambd * gamma * sml - beta * sgl
      momentum = prev_lambd * m - beta * g
      phi = mu * p + momentum
      new_v = ov - beta * g - phi
      s_dt_db = gamma * stb + lambd * s_dm_db - g - beta * gamma * sgb
      s_dt_dl = (gamma * stl + momentum + lambd * gamma * s_dm_dl -
                 beta * gamma * sgl)
    momentum_values.append(momentum)
    assignments.append(
        state_ops.assign(self.get_slot(v, "s_dbt_db"), s_dbt_db))
    assignments.append(
        state_ops.assign(self.get_slot(v, "s_dbt_dl"), s_dbt_dl))
    assignments.append(
        state_ops.assign(self.get_slot(v, "dbt_dmu"), dbt_dmu))
    assignments.append(
        state_ops.assign(self.get_slot(v, "s_dm_db"), s_dm_db))
    assignments.append(
        state_ops.assign(self.get_slot(v, "s_dm_dl"), s_dm_dl))
    assignments.append(
        state_ops.assign(self.get_slot(v, "momentum"), momentum))
    assignments.append(state_ops.assign(v, new_v))
    assignments.append(state_ops.assign(self.get_slot(v, "phi"), phi))
    assignments.append(
        state_ops.assign(self.get_slot(v, "s_dt_db"), s_dt_db))
    assignments.append(
        state_ops.assign(self.get_slot(v, "s_dt_dl"), s_dt_dl))
  e_dq_db2 = (w_t * self._get_non_slot_variable("e_dq_db2", graph) +
              (1.0 - w_t) * (dq_db ** 2.0))
  e_dq_dl2 = (w_t * self._get_non_slot_variable("e_dq_dl2", graph) +
              (1.0 - w_t) * (dq_dl ** 2.0))
  e_dbj_dmu2 = (w_t * self._get_non_slot_variable("e_dbj_dmu2", graph) +
                (1.0 - w_t) * (dbj_dmu ** 2.0))
  e_g2 = (w_t * self._get_non_slot_variable("e_g2", graph) +
          (1.0 - w_t) * math_ops.add_n(
              [math_ops.reduce_sum(g ** 2.0) for g in scaled_g]))
  e_m2 = (w_t * self._get_non_slot_variable("e_m2", graph) +
          (1.0 - w_t) * math_ops.add_n(
              [math_ops.reduce_sum(m ** 2.0) for m in momentum_values]))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("e_dq_db2", graph),
                       e_dq_db2))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("e_dq_dl2", graph),
                       e_dq_dl2))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("e_dbj_dmu2", graph),
                       e_dbj_dmu2))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("e_g2", graph), e_g2))
  assignments.append(
      state_ops.assign(self._get_non_slot_variable("e_m2", graph), e_m2))
  return assignments
def add(self, x):
  return x + math_ops.add_n(self.a) + self.b["a"]
def sum_reg(weights, name=None):
  """Applies the sum of all the input regularizers."""
  # Note: `regularizer_list` is a free variable here; this function is the
  # closure returned by an enclosing sum_regularizer(regularizer_list).
  with ops.op_scope([weights], name, 'sum_regularizer') as scope:
    regularizer_tensors = [reg(weights) for reg in regularizer_list]
    return math_ops.add_n(regularizer_tensors, name=scope)
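# A sketch of the enclosing factory sum_reg was presumably returned from,
# modeled on tf.contrib.layers.sum_regularizer but written against the newer
# tf.name_scope API; None regularizers are filtered out first:
import tensorflow as tf

def sum_regularizer(regularizer_list):
  regularizer_list = [r for r in regularizer_list if r is not None]
  if not regularizer_list:
    return None

  def sum_reg(weights):
    with tf.name_scope('sum_regularizer'):
      return tf.add_n([reg(weights) for reg in regularizer_list])

  return sum_reg

l1 = lambda w: 0.1 * tf.reduce_sum(tf.abs(w))
l2 = lambda w: 0.1 * tf.reduce_sum(tf.square(w))
reg = sum_regularizer([l1, l2])
print(float(reg(tf.ones([2, 2]))))  # 0.1 * 4 + 0.1 * 4 = 0.8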
def loop_fn(i):
  x1 = array_ops.gather(x, i)
  return math_ops.add_n([x1, y, z])
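# loop_fn is shaped for the parallel-for (pfor) machinery. A hedged usage
# sketch, assuming x has a leading dimension equal to the iteration count and
# y, z broadcast against one row of x; tf.vectorized_map is the public
# wrapper around the same machinery:
import tensorflow as tf
from tensorflow.python.ops.parallel_for import control_flow_ops as pfor_ops

x = tf.reshape(tf.range(6.0), [3, 2])
y = tf.constant([10.0, 20.0])
z = tf.constant([1.0, 1.0])

def loop_fn(i):
  x1 = tf.gather(x, i)
  return tf.add_n([x1, y, z])

print(pfor_ops.pfor(loop_fn, 3).numpy())  # each row of x, plus y, plus z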
def fn():
  outputs = []
  for _ in range(20):
    outputs.append(v * constant_op.constant(2.0))
  return math_ops.add_n(outputs)
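# fn sums 20 copies of v * 2, so it evaluates to 40 * v. A quick eager check,
# assuming v is a tf.Variable in the enclosing scope; wrapping in tf.function
# traces the 20 multiplies and the add_n into a single graph:
import tensorflow as tf

v = tf.Variable(1.0)

def fn():
  outputs = []
  for _ in range(20):
    outputs.append(v * tf.constant(2.0))
  return tf.add_n(outputs)

print(float(fn()))               # 40.0
print(float(tf.function(fn)()))  # 40.0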
def __call__(self, y_true, y_pred, sample_weight=None):
  """Computes the overall loss.

  Arguments:
    y_true: An arbitrary structure of Tensors representing the ground truth.
    y_pred: An arbitrary structure of Tensors representing a Model's outputs.
    sample_weight: An arbitrary structure of Tensors representing the
      per-sample loss weights. If one Tensor is passed, it is used for all
      losses. If multiple Tensors are passed, the structure should match
      `y_pred`.

  Returns:
    Tuple of `(total_loss, per_output_loss_list)`
  """
  if not self._built:
    self._build(y_pred)

  y_true = nest.flatten(y_true)
  y_pred = nest.flatten(y_pred)

  # TODO(omalleyt): Remove ambiguity here.
  # This is currently needed to support passing only 1 loss and 1 target
  # to a Functional Model with multiple outputs. However, this is
  # ambiguous, especially with subclass, and we should reconsider how we
  # support this.
  if len(y_true) == 1 and len(y_pred) > 1:
    y_true = y_true * len(y_pred)

  sample_weight = nest.flatten(sample_weight)
  # Allows passing one sample-weight array for all outputs.
  if len(sample_weight) == 1 and len(y_pred) > 1:
    sample_weight = sample_weight * len(y_pred)

  loss_values = []
  metric_loss_values = []  # The loss value passed on to `Mean` metrics.
  zip_args = (y_true, y_pred, sample_weight, self._losses, self._loss_weights)
  for y_t, y_p, sw, loss_obj, loss_weight in zip(*zip_args):
    if loss_obj is None:  # Ok to have no loss for an output.
      continue

    y_t = math_ops.cast(y_t, y_p.dtype)
    if sw is not None:
      sw = math_ops.cast(sw, y_p.dtype)

    # Handle Keras mask on outputs.
    mask = getattr(y_p, '_keras_mask', None)
    if mask is not None:
      mask = math_ops.cast(mask, y_p.dtype)
      if sw is not None:
        mask, _, sw = (
            tf_losses_utils.squeeze_or_expand_dimensions(
                mask, sample_weight=sw))
        sw *= mask
      else:
        sw = mask

    loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    if loss_weight is not None:
      loss_value *= loss_weight
    metric_loss_values.append(loss_value)

    # TODO(omalleyt): Should this be in the `Loss` class?
    if (loss_obj.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE or
        loss_obj.reduction == losses_utils.ReductionV2.AUTO):
      loss_value = losses_utils.scale_loss_for_distribution(loss_value)
    loss_values.append(loss_value)

  # Ok for a model to have no compiled loss.
  total_loss = math_ops.add_n(
      loss_values) if loss_values else array_ops.zeros((1,))

  # TODO(omalleyt): Don't return per-output losses once MetricsContainer
  # handles this.
  return total_loss, metric_loss_values
def surrogate_loss(sample_losses,
                   stochastic_tensors=None,
                   name="SurrogateLoss"):
  """Surrogate loss for stochastic graphs.

  This function will call `loss_fn` on each `StochasticTensor` upstream of
  `sample_losses`, passing the losses that it influenced.

  Note that currently `surrogate_loss` does not work with `StochasticTensor`s
  instantiated in `while_loop`s or other control structures.

  Args:
    sample_losses: a list or tuple of final losses. Each loss should be per
      example in the batch (and possibly per sample); that is, it should have
      dimensionality of 1 or greater. All losses should have the same shape.
    stochastic_tensors: a list of `StochasticTensor`s to add loss terms for.
      If None, defaults to all `StochasticTensor`s in the graph upstream of
      the `Tensor`s in `sample_losses`.
    name: the name with which to prepend created ops.

  Returns:
    `Tensor` loss, which is the sum of `sample_losses` and the `loss_fn`s
    returned by the `StochasticTensor`s.

  Raises:
    TypeError: if `sample_losses` is not a list or tuple, or if its elements
      are not `Tensor`s.
    ValueError: if any loss in `sample_losses` does not have dimensionality 1
      or greater.
  """
  with ops.name_scope(name, values=sample_losses):
    if not isinstance(sample_losses, (list, tuple)):
      raise TypeError("sample_losses must be a list or tuple")
    for loss in sample_losses:
      if not isinstance(loss, ops.Tensor):
        raise TypeError("loss is not a Tensor: %s" % loss)
      ndims = loss.get_shape().ndims
      if not (ndims is not None and ndims >= 1):
        raise ValueError("loss must have dimensionality 1 or greater: %s" %
                         loss)

    stoch_dependencies_map = _stochastic_dependencies_map(
        sample_losses, stochastic_tensors=stochastic_tensors)
    if not stoch_dependencies_map:
      logging.warn(
          "No collection of Stochastic Tensors found for current graph.")
      return math_ops.add_n(sample_losses)

    # Iterate through all of the stochastic dependencies, adding
    # surrogate terms where necessary.
    sample_losses = [ops.convert_to_tensor(loss) for loss in sample_losses]
    loss_terms = sample_losses
    for (stoch_node, dependent_losses) in stoch_dependencies_map.items():
      dependent_losses = list(dependent_losses)

      logging.info("Losses influenced by StochasticTensor %s: [%s]",
                   stoch_node.name,
                   ", ".join([loss.name for loss in dependent_losses]))

      # Sum up the downstream losses for this ST
      influenced_loss = _add_n_or_sum(dependent_losses)

      # Compute surrogate loss term
      loss_term = stoch_node.loss(array_ops.stop_gradient(influenced_loss))
      if loss_term is not None:
        loss_terms.append(loss_term)

    return _add_n_or_sum(loss_terms)
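# _add_n_or_sum is referenced above but not shown. A plausible reconstruction
# (not the verbatim original) consistent with its usage: add_n requires all
# inputs to share one shape and dtype, so fall back to Python `sum`, which
# broadcasts, when the static shapes are unknown or differ:
def _add_n_or_sum(terms):
  shape = terms[0].get_shape()
  if shape.is_fully_defined() and all(
      term.get_shape().is_fully_defined() and term.get_shape() == shape
      for term in terms):
    return math_ops.add_n(terms)
  return sum(terms)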
def __call__(self, y_true, y_pred, sample_weight=None,
             regularization_losses=None):
  """Computes the overall loss.

  Arguments:
    y_true: An arbitrary structure of Tensors representing the ground truth.
    y_pred: An arbitrary structure of Tensors representing a Model's outputs.
    sample_weight: An arbitrary structure of Tensors representing the
      per-sample loss weights. If one Tensor is passed, it is used for all
      losses. If multiple Tensors are passed, the structure should match
      `y_pred`.
    regularization_losses: Additional losses to be added to the total loss.

  Returns:
    Tuple of `(total_loss, per_output_loss_list)`
  """
  y_true = self._conform_to_outputs(y_pred, y_true)
  sample_weight = self._conform_to_outputs(y_pred, sample_weight)

  if not self._built:
    self._build(y_pred)

  y_pred = nest.flatten(y_pred)
  y_true = nest.flatten(y_true)
  sample_weight = nest.flatten(sample_weight)

  loss_values = []  # Used for gradient calculation.
  loss_metric_values = []  # Used for loss metric calculation.
  batch_dim = None
  zip_args = (y_true, y_pred, sample_weight, self._losses,
              self._loss_weights, self._per_output_metrics)
  for y_t, y_p, sw, loss_obj, loss_weight, metric_obj in zip(*zip_args):
    if y_t is None or loss_obj is None:  # Ok to have no loss for an output.
      continue

    y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    sw = apply_mask(y_p, sw, get_mask(y_p))
    loss_value = loss_obj(y_t, y_p, sample_weight=sw)

    loss_metric_value = loss_value
    # Correct for the `Mean` loss metrics counting each replica as a batch.
    if loss_obj.reduction == losses_utils.ReductionV2.SUM:
      loss_metric_value *= ds_context.get_strategy().num_replicas_in_sync

    if batch_dim is None:
      batch_dim = array_ops.shape(y_t)[0]
    if metric_obj is not None:
      metric_obj.update_state(loss_metric_value, sample_weight=batch_dim)

    if loss_weight is not None:
      loss_value *= loss_weight
      loss_metric_value *= loss_weight

    if (loss_obj.reduction == losses_utils.ReductionV2.SUM_OVER_BATCH_SIZE or
        loss_obj.reduction == losses_utils.ReductionV2.AUTO):
      loss_value = losses_utils.scale_loss_for_distribution(loss_value)

    loss_values.append(loss_value)
    loss_metric_values.append(loss_metric_value)

  if regularization_losses:
    regularization_losses = losses_utils.cast_losses_to_common_dtype(
        regularization_losses)
    reg_loss = math_ops.add_n(regularization_losses)
    loss_metric_values.append(reg_loss)
    loss_values.append(losses_utils.scale_loss_for_distribution(reg_loss))

  if loss_values:
    loss_metric_values = losses_utils.cast_losses_to_common_dtype(
        loss_metric_values)
    total_loss_metric_value = math_ops.add_n(loss_metric_values)
    self._loss_metric.update_state(
        total_loss_metric_value, sample_weight=batch_dim)

    loss_values = losses_utils.cast_losses_to_common_dtype(loss_values)
    total_loss = math_ops.add_n(loss_values)
    return total_loss
  else:
    # Ok for a model to have no compiled loss.
    return array_ops.zeros(shape=())
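# add_n requires every input to share a single dtype, which is why the code
# above funnels losses through cast_losses_to_common_dtype first. A minimal
# public-API illustration of the same idea with hypothetical values:
import tensorflow as tf

losses = [tf.constant(0.5, dtype=tf.float16),
          tf.constant(0.25, dtype=tf.float32)]
# tf.add_n(losses) would fail here because the dtypes differ.
print(float(tf.add_n([tf.cast(l, tf.float32) for l in losses])))  # 0.75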
def _AggregatedGrads(grads,
                     op,
                     gradient_uid,
                     loop_state,
                     aggregation_method=None):
  """Get the aggregated gradients for op.

  Args:
    grads: The map of memoized gradients.
    op: The op to get gradients for.
    gradient_uid: A unique identifier within the graph indicating which
      invocation of gradients is being executed. Used to cluster ops for
      compilation.
    loop_state: An object for maintaining the state of the while loops in the
      graph. It is of type ControlFlowState. None if the graph contains no
      while loops.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.

  Returns:
    A list of gradients, one per each output of `op`. If the gradients for a
    particular output is a list, this function aggregates it before returning.

  Raises:
    TypeError: if the incoming grads are not Tensors or IndexedSlices.
    ValueError: if the arguments are invalid.
  """
  if aggregation_method is None:
    aggregation_method = AggregationMethod.DEFAULT
  valid_aggregation_methods = [
      AggregationMethod.ADD_N, AggregationMethod.EXPERIMENTAL_TREE,
      AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
  ]
  if aggregation_method not in valid_aggregation_methods:
    raise ValueError(
        f"Invalid `aggregation_method` specified {aggregation_method}. "
        f"Accepted values are {valid_aggregation_methods}.")
  out_grads = _GetGrads(grads, op)
  for i, out_grad in enumerate(out_grads):
    if loop_state:
      if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
        assert control_flow_util.IsLoopSwitch(op)
        continue
    # Grads have to be Tensors or IndexedSlices
    if (isinstance(out_grad, collections_abc.Sequence) and not all(
        isinstance(g, (ops.Tensor, ops.IndexedSlices))
        for g in out_grad
        if g is not None)):
      raise TypeError(f"Invalid gradient {out_grad} [index = {i}]. Gradients "
                      "have to be either all Tensors or all IndexedSlices")
    # Aggregate multiple gradients, and convert [] to None.
    if out_grad:
      if len(out_grad) < 2:
        used = "nop"
        out_grads[i] = out_grad[0]
      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
        tensor_shape = _AccumulatorShape(out_grad)
        if aggregation_method in [
            AggregationMethod.EXPERIMENTAL_TREE,
            AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
        ]:
          # Aggregate all gradients by doing pairwise sums: this may
          # reduce performance, but it can improve memory because the
          # gradients can be released earlier.
          #
          # TODO(vrv): Consider replacing this with a version of
          # tf.AddN() that eagerly frees its inputs as soon as they are
          # ready, so the order of this tree does not become a problem.
          used = "tree"
          with ops.name_scope(op.name + "_gradient_sum"):
            running_sum = out_grad[0]
            for grad in out_grad[1:]:
              running_sum = math_ops.add_n([running_sum, grad])
            out_grads[i] = running_sum
        else:
          used = "add_n"
          out_grads[i] = _MultiDeviceAddN(out_grad, gradient_uid)
        logging.vlog(2, "  _AggregatedGrads %d x %s using %s", len(out_grad),
                     tensor_shape, used)
      else:
        out_grads[i] = backprop.aggregate_indexed_slices_gradients(out_grad)  # pylint: disable=protected-access
    else:  # not out_grad
      # out_grads[i] is [], thus its aggregation is simply None.
      out_grads[i] = None
  return out_grads
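# The aggregation strategy is selectable through the public gradients API.
# A hedged graph-mode sketch: three consumers of x produce three gradient
# terms, combined either with a single multi-device add_n (the default path
# above) or with the pairwise tree used by EXPERIMENTAL_TREE:
import tensorflow as tf

g = tf.Graph()
with g.as_default():
  x = tf.compat.v1.placeholder(tf.float32, [2])
  ys = [x * float(i) for i in range(1, 4)]  # three consumers of x
  grad_default = tf.gradients(ys, x)  # default: aggregated via add_n
  grad_tree = tf.gradients(
      ys, x, aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)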
def _total_loss(self):
  return math_ops.add_n(tuple(loss.evaluate() for loss in self._losses))
def total_sampled_loss(self):
  return math_ops.add_n(
      tuple(loss.evaluate_on_sample() for loss in self.losses))
def _add_n():
  inputs = keras.Input(shape=(10,))
  outputs = math_ops.add_n([inputs, inputs, inputs])
  return keras.Model(inputs, outputs)
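# Usage check for _add_n, assuming `import tensorflow as tf` and
# `from tensorflow import keras`: the functional model simply triples its
# input elementwise.
model = _add_n()
print(model(tf.ones((2, 10))).numpy()[0, 0])  # 3.0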
def cond_body():
  reduced = collective.reduce(reduce_util.ReduceOp.SUM, value, value, options)
  return math_ops.add_n(self.as_list(reduced)) / len(devices)