def _is_finite(arg1, *args):
  """Checks if the supplied tensors are finite.

  Args:
    arg1: A numeric `Tensor`.
    *args: (Optional) Other `Tensors` to check for finiteness.

  Returns:
    is_finite: Scalar boolean `Tensor` indicating whether all the supplied
      tensors are finite.
  """
  # Reduce each tensor to a scalar "is entirely finite" flag, then AND them.
  flags = [tf.reduce_all(tf.is_finite(t)) for t in (arg1,) + args]
  all_finite = flags[0]
  for flag in flags[1:]:
    all_finite = all_finite & flag
  return all_finite
def aggregate_single_gradient(grad_and_vars, use_mean, check_inf_nan):
  """Calculate the average gradient for a shared variable across all towers.

  Note that this function provides a synchronization point across all towers.

  Args:
    grad_and_vars: A list or tuple of (gradient, variable) tuples. Each
      (gradient, variable) pair within the outer list represents the gradient
      of the variable calculated for a single tower, and the number of pairs
      equals the number of towers.
    use_mean: if True, mean is taken, else sum of gradients is taken.
    check_inf_nan: check grads for nans and infs.

  Returns:
    The tuple ([(average_gradient, variable),], has_nan_or_inf) where the
      gradient has been averaged across all towers. The variable is chosen from
      the first tower. The has_nan_or_inf indicates the grads has nan or inf.
  """
  tower_grads = [g for g, _ in grad_and_vars]
  aggregated = tf.add_n(tower_grads)
  if use_mean and len(tower_grads) > 1:
    aggregated = tf.multiply(aggregated, 1.0 / len(tower_grads))

  # The variable is shared; take it from the first tower.
  shared_var = grad_and_vars[0][1]

  if not check_inf_nan:
    return (aggregated, shared_var), None
  has_nan_or_inf = tf.logical_not(tf.reduce_all(tf.is_finite(tower_grads)))
  return (aggregated, shared_var), has_nan_or_inf
def correlation_loss(source_samples, target_samples, weight, scope=None):
  """Adds a similarity loss term, the correlation between two representations.

  Args:
    source_samples: a tensor of shape [num_samples, num_features]
    target_samples: a tensor of shape [num_samples, num_features]
    weight: a scalar weight for the loss.
    scope: optional name scope for summary tags.

  Returns:
    a scalar tensor representing the correlation loss value.
  """
  with tf.name_scope('corr_loss'):
    # Center each representation, then L2-normalize every sample row.
    source_samples -= tf.reduce_mean(source_samples, 0)
    target_samples -= tf.reduce_mean(target_samples, 0)
    source_samples = tf.nn.l2_normalize(source_samples, 1)
    target_samples = tf.nn.l2_normalize(target_samples, 1)

    # Feature-by-feature covariance matrices of each domain.
    cov_source = tf.matmul(tf.transpose(source_samples), source_samples)
    cov_target = tf.matmul(tf.transpose(target_samples), target_samples)

    # Penalize the squared elementwise gap between the two covariances.
    corr_loss = weight * tf.reduce_mean(tf.square(cov_source - cov_target))

    assert_op = tf.Assert(tf.is_finite(corr_loss), [corr_loss])
    with tf.control_dependencies([assert_op]):
      tag = 'Correlation Loss'
      if scope:
        tag = scope + tag
      tf.summary.scalar(tag, corr_loss)
      tf.losses.add_loss(corr_loss)

  return corr_loss
def difference_loss(private_samples, shared_samples, weight=1.0, name=''):
  """Adds the difference loss between the private and shared representations.

  Args:
    private_samples: a tensor of shape [num_samples, num_features].
    shared_samples: a tensor of shape [num_samples, num_features].
    weight: the weight of the incoherence loss.
    name: the name of the tf summary.
  """
  # Center, then L2-normalize each sample of both representations.
  private_samples -= tf.reduce_mean(private_samples, 0)
  shared_samples -= tf.reduce_mean(shared_samples, 0)
  private_samples = tf.nn.l2_normalize(private_samples, 1)
  shared_samples = tf.nn.l2_normalize(shared_samples, 1)

  # Cross-correlation between the two representations; we want it near zero.
  overlap = tf.matmul(private_samples, shared_samples, transpose_a=True)
  cost = weight * tf.reduce_mean(tf.square(overlap))
  # Clamp at zero (guards against tiny negative values).
  cost = tf.where(cost > 0, cost, 0, name='value')

  tf.summary.scalar('losses/Difference Loss {}'.format(name), cost)
  assert_op = tf.Assert(tf.is_finite(cost), [cost])
  with tf.control_dependencies([assert_op]):
    tf.losses.add_loss(cost)
def mmd_loss(source_samples, target_samples, weight, scope=None):
  """Adds a similarity loss term, the MMD between two representations.

  This Maximum Mean Discrepancy (MMD) loss is calculated with a number of
  different Gaussian kernels.

  Args:
    source_samples: a tensor of shape [num_samples, num_features].
    target_samples: a tensor of shape [num_samples, num_features].
    weight: the weight of the MMD loss.
    scope: optional name scope for summary tags.

  Returns:
    a scalar tensor representing the MMD loss value.
  """
  # A broad range of bandwidths so some kernel is informative at whatever
  # scale the representations happen to live on.
  kernel_sigmas = [
      1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 5, 10, 15, 20, 25, 30, 35, 100,
      1e3, 1e4, 1e5, 1e6
  ]
  multi_scale_kernel = partial(
      utils.gaussian_kernel_matrix, sigmas=tf.constant(kernel_sigmas))

  loss_value = maximum_mean_discrepancy(
      source_samples, target_samples, kernel=multi_scale_kernel)
  # Floor the raw MMD before weighting.
  loss_value = tf.maximum(1e-4, loss_value) * weight

  assert_op = tf.Assert(tf.is_finite(loss_value), [loss_value])
  with tf.control_dependencies([assert_op]):
    tag = 'MMD Loss'
    if scope:
      tag = scope + tag
    tf.summary.scalar(tag, loss_value)
    tf.losses.add_loss(loss_value)

  return loss_value
def _quaternion_loss(labels, predictions, weight, batch_size, domain,
                     add_summaries):
  """Creates a Quaternion Loss.

  Args:
    labels: The true quaternions.
    predictions: The predicted quaternions.
    weight: A scalar weight.
    batch_size: The size of the batches.
    domain: The name of the domain from which the labels were taken.
    add_summaries: Whether or not to add summaries for the losses.

  Returns:
    A `Tensor` representing the loss.
  """
  assert domain in ['Source', 'Transferred']

  loss_params = {'use_logging': False, 'batch_size': batch_size}
  loss = weight * log_quaternion_loss(labels, predictions, loss_params)

  if not add_summaries:
    return loss

  # Only record summaries once the loss is confirmed finite.
  assert_op = tf.Assert(tf.is_finite(loss), [loss])
  with tf.control_dependencies([assert_op]):
    tf.summary.histogram(
        'Log_Quaternion_Loss_%s' % domain, loss, collections='losses')
    tf.summary.scalar(
        'Task_Quaternion_Loss_%s' % domain, loss, collections='losses')
  return loss
def safe_sum(x, alt_value=-np.inf, name=None):
  """Elementwise adds list members, replacing non-finite results with alt_value.

  Args:
    x: Python `list` of `Tensors` to elementwise add.
    alt_value: Python scalar used to replace any elementwise sums which would
      otherwise be non-finite.
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., "safe_sum").

  Returns:
    safe_sum: `Tensor` representing the elementwise sum of list of `Tensor`s
      `x` or `alt_value` where sums are non-finite.

  Raises:
    TypeError: if `x` is not list-like.
    ValueError: if `x` is empty.
  """
  with tf.name_scope(name, 'safe_sum', [x, alt_value]):
    if not is_list_like(x):
      raise TypeError('Expected list input.')
    if not x:
      raise ValueError('Input should not be empty.')
    n = np.int32(len(x))
    in_shape = x[0].shape
    # Stack list members along a new trailing axis so the sum (and the
    # finiteness checks below) can run along axis=-1.
    x = tf.stack(x, axis=-1)
    # The sum is NaN if any element is NaN or we see both +Inf and -Inf. Thus
    # we will replace such rows with the `alt_value`. Typically the
    # `alt_value` is chosen so the `MetropolisHastings` `TransitionKernel`
    # always rejects the proposal.
    # Regarding the following float-comparisons, recall comparing with NaN is
    # always False, i.e., we're implicitly capturing NaN and explicitly
    # capturing +/- Inf.
    # A row's sum is determinate iff every non-finite element shares one
    # sign: "all finite-or-nonnegative" AND "all finite-or-nonpositive".
    is_sum_determinate = (
        tf.reduce_all(tf.is_finite(x) | (x >= 0.), axis=-1) &
        tf.reduce_all(tf.is_finite(x) | (x <= 0.), axis=-1))
    # Broadcast the per-row verdict back across the stacked axis so tf.where
    # can act elementwise.
    is_sum_determinate = tf.tile(
        is_sum_determinate[..., tf.newaxis],
        multiples=tf.concat([tf.ones(tf.rank(x) - 1, dtype=tf.int32), [n]],
                            axis=0))
    alt_value = np.array(alt_value, x.dtype.as_numpy_dtype)
    x = tf.where(is_sum_determinate, x, tf.fill(tf.shape(x), value=alt_value))
    x = tf.reduce_sum(x, axis=-1)
    # Summing over the stacked axis restores the input shape; re-attach the
    # static shape info lost through stack/tile/where.
    x.set_shape(x.shape.merge_with(in_shape))
    return x
def exponential_moving_average(self, var, avg_var=None, decay=0.999,
                               ignore_nan=False):
  """Calculates the exponential moving average.

  TODO(): check if this implementation of moving average can now be replaced
  by tensorflows implementation.

  Adds a variable to keep track of the exponential moving average and adds an
  update operation to the bookkeeper. The name of the variable is
  '%s_average' % name prefixed with the current variable scope.

  Args:
    var: The variable for which a moving average should be computed.
    avg_var: The variable to set the average into, if None create a zero
      initialized one.
    decay: How much history to use in the moving average.
      Higher, means more history values [0, 1) accepted.
    ignore_nan: If the value is NaN or Inf, skip it.

  Returns:
    The update op for the moving average (note: despite the original wording
    "averaged variable", the code returns `avg_update`, the assign op; its
    result is the freshly updated average value).

  Raises:
    ValueError: if decay is not in [0, 1).
  """
  with self._g.as_default():
    if decay < 0 or decay >= 1.0:
      raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
    if avg_var is None:
      avg_name = '%s_average' % _bare_var_name(var)
      # control_dependencies(None) lifts the initializer out of any pending
      # control-dependency context (e.g. a while loop).
      with tf.control_dependencies(None):
        with tf.name_scope(avg_name + '/Initializer/'):
          if isinstance(var, tf.Variable):
            init_val = var.initialized_value()
          elif var.get_shape().is_fully_defined():
            init_val = tf.constant(0,
                                   shape=var.get_shape(),
                                   dtype=var.dtype.base_dtype)
          else:
            # Unknown shape: a scalar zero that broadcasts on first update.
            init_val = tf.constant(0, dtype=var.dtype.base_dtype)
        avg_var = tf.Variable(init_val, name=avg_name, trainable=False)
    # Ramp the effective decay: while global_step is small, the ratio term
    # dominates so the average tracks early values faster; it is floored at
    # 0.9 and never exceeds the requested `decay`.
    num_updates = tf.cast(self.global_step, tf.float32)
    decay = tf.minimum(decay,
                       tf.maximum(0.9,
                                  (1.0 + num_updates) / (10.0 + num_updates)))
    with tf.device(avg_var.device):
      if ignore_nan:
        # Replace non-finite entries with the current average, i.e. skip them.
        var = tf.where(tf.is_finite(var), var, avg_var)
      if var.get_shape().is_fully_defined():
        avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
      else:
        # assign_sub requires a static shape; fall back to a full assign.
        avg_update = tf.assign(avg_var,
                               avg_var - (1 - decay) * (avg_var - var),
                               validate_shape=False)
    self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
    return avg_update
def _compute_accept_prob(self, position, momentum, position_post,
                         momentum_post, sumlogdet):
  """Compute the prob of accepting the proposed state given old state."""
  hamil_old = self.hamiltonian(position, momentum)
  hamil_new = self.hamiltonian(position_post, momentum_post)
  # Metropolis-Hastings ratio, capped at 1 by clamping the log to 0.
  log_accept = tf.minimum(hamil_old - hamil_new + sumlogdet, 0.)
  prob = tf.exp(log_accept)
  # Zero out non-finite probabilities for numerical stability and to keep
  # gradients well-defined.
  return tf.where(tf.is_finite(prob), prob, tf.zeros_like(prob))
def _compare(self, x, use_gpu):
  """Checks tf.is_finite / tf.is_inf / tf.is_nan against NumPy on `x`."""
  # NumPy reference results.
  np_finite = np.isfinite(x)
  np_inf = np.isinf(x)
  np_nan = np.isnan(x)
  with self.test_session(use_gpu=use_gpu) as sess:
    inx = tf.convert_to_tensor(x)
    ofinite = tf.is_finite(inx)
    oinf = tf.is_inf(inx)
    onan = tf.is_nan(inx)
    tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])
    # Values must match the NumPy references ...
    self.assertAllEqual(np_inf, tf_inf)
    self.assertAllEqual(np_nan, tf_nan)
    self.assertAllEqual(np_finite, tf_finite)
    # ... and so must the static output shapes.
    self.assertShapeEqual(np_inf, oinf)
    self.assertShapeEqual(np_nan, onan)
    self.assertShapeEqual(np_finite, ofinite)
def _loss(self, predictions):
  """Builds the MSE loss between predictions and the (possibly cropped) target.

  When training, no padding was applied by the conv stack, so the target
  `self.y_norm` must be center-cropped to match the prediction size; at
  inference padding was applied and the target is used as-is. NaN entries in
  the squared error are filled with zero and excluded from the mean's
  denominator.

  Args:
    predictions: Predicted tensor, same spatial size as the cropped target.

  Returns:
    Scalar mean squared error over the finite error entries.
  """
  with tf.name_scope("loss"):
    # If training then crop center of y, else padding was applied.
    # Use integer division: under Python 3, `/` yields a float and a float
    # slice index raises TypeError. `//` is identical for Python 2 ints.
    slice_amt = (np.sum(self.filter_sizes) - len(self.filter_sizes)) // 2
    slice_y = self.y_norm[:, slice_amt:-slice_amt, slice_amt:-slice_amt]
    _y = tf.cond(self.is_training, lambda: slice_y, lambda: self.y_norm)

    # (Removed a dead `tf.subtract(predictions, _y)` whose result was unused.)
    err = tf.square(predictions - _y)
    err_filled = utils.fill_na(err, 0)
    # Average only over finite entries.
    finite_count = tf.reduce_sum(tf.cast(tf.is_finite(err), tf.float32))
    mse = tf.reduce_sum(err_filled) / finite_count
    return mse
def dann_loss(source_samples, target_samples, weight, scope=None):
  """Adds the domain adversarial (DANN) loss.

  Args:
    source_samples: a tensor of shape [num_samples, num_features].
    target_samples: a tensor of shape [num_samples, num_features].
    weight: the weight of the loss.
    scope: optional name scope for summary tags.

  Returns:
    a scalar tensor representing the correlation loss value.
  """
  with tf.variable_scope('dann'):
    batch_size = tf.shape(source_samples)[0]
    samples = slim.flatten(
        tf.concat(axis=0, values=[source_samples, target_samples]))

    # Domain labels: 0 for source rows, 1 for target rows.
    domain_selection_mask = tf.concat(
        axis=0, values=[tf.zeros((batch_size, 1)), tf.ones((batch_size, 1))])

    # Perform the gradient reversal and be careful with the shape.
    grl = grl_ops.gradient_reversal(samples)
    grl = tf.reshape(grl, (-1, samples.get_shape().as_list()[1]))

    # Small domain classifier on top of the reversed features.
    hidden = slim.fully_connected(grl, 100, scope='fc1')
    logits = slim.fully_connected(hidden, 1, activation_fn=None, scope='fc2')
    domain_predictions = tf.sigmoid(logits)

    domain_loss = tf.losses.log_loss(
        domain_selection_mask, domain_predictions, weights=weight)
    domain_accuracy = utils.accuracy(
        tf.round(domain_predictions), domain_selection_mask)

    assert_op = tf.Assert(tf.is_finite(domain_loss), [domain_loss])
    with tf.control_dependencies([assert_op]):
      tag_loss = 'losses/domain_loss'
      tag_accuracy = 'losses/domain_accuracy'
      if scope:
        tag_loss = scope + tag_loss
        tag_accuracy = scope + tag_accuracy
      tf.summary.scalar(tag_loss, domain_loss)
      tf.summary.scalar(tag_accuracy, domain_accuracy)

  return domain_loss
def _create_autosummary_var(name, value_expr):
  """Creates the accumulator variable and update op for one autosummary.

  Reduces `value_expr` to a [numerator, denominator] pair (sum of elements
  and element count), zeroed out entirely if the numerator is non-finite,
  and registers the accumulator under `name`.

  Args:
    name: Autosummary key under which the variable is registered.
    value_expr: Tensor expression to accumulate (any rank).

  Returns:
    The op that folds the current value into the accumulator.
  """
  assert not _autosummary_finalized
  v = tf.cast(value_expr, tf.float32)
  # Fixed: compare with `==`, not `is`. Identity comparison with int
  # literals relies on CPython small-int interning and is not a correctness
  # guarantee (ndims may also be None for unknown rank; both branches then
  # correctly fall through to the generic case).
  if v.shape.ndims == 0:
    v = [v, np.float32(1.0)]
  elif v.shape.ndims == 1:
    v = [tf.reduce_sum(v), tf.cast(tf.shape(v)[0], tf.float32)]
  else:
    v = [tf.reduce_sum(v), tf.reduce_prod(tf.cast(tf.shape(v), tf.float32))]
  # Drop the whole contribution if the numerator is NaN/Inf.
  v = tf.cond(tf.is_finite(v[0]), lambda: tf.stack(v), lambda: tf.zeros(2))
  # Create the accumulator outside any control-dependency context.
  with tf.control_dependencies(None):
    var = tf.Variable(tf.zeros(2))  # [numerator, denominator]
  update_op = tf.cond(tf.is_variable_initialized(var),
                      lambda: tf.assign_add(var, v),
                      lambda: tf.assign(var, v))
  if name in _autosummary_vars:
    _autosummary_vars[name].append(var)
  else:
    _autosummary_vars[name] = [var]
  return update_op
def calc_loss(predictions, true_y, additional_mask=None):
  """Calculates losses, ignoring NaN true values (assigning zero loss to them).

  :param predictions: Predicted values
  :param true_y: True values
  :param additional_mask:
  :return: MAE loss, differentiable SMAPE loss, competition SMAPE loss
  """
  # Finite entries of the ground truth; NaNs get zero weight below.
  finite_mask = tf.is_finite(true_y)
  # Fill NaNs with zeros (any value would do — their weight is zero).
  filled_y = tf.where(finite_mask, true_y, tf.zeros_like(true_y))
  weights = tf.to_float(finite_mask)
  if additional_mask is not None:
    weights = weights * tf.expand_dims(additional_mask, axis=0)

  mae_loss = tf.losses.absolute_difference(
      labels=filled_y, predictions=predictions, weights=weights)
  return (mae_loss,
          smape_loss(filled_y, predictions, weights),
          calc_smape_rounded(filled_y, predictions, weights),
          tf.size(filled_y))
def log_blend(inputs, weights):
  """Blends state in the log space.

  Args:
    inputs: A set of scalar states, one for each particle in each particle
      filter. Should be [num_samples, batch_size].
    weights: A set of weights used to blend the state. Each set of weights
      should be of dimension [num_samples] (one weight for each previous
      particle). There should be one set of weights for each new particle in
      each particle filter. Thus the shape should be
      [num_samples, batch_size, num_samples] where the first axis indexes new
      particle and the last axis indexes old particles.

  Returns:
    blended: The blended states, a tensor of shape [num_samples, batch_size].
  """
  raw_max = tf.reduce_max(inputs, axis=0, keepdims=True)
  # Non-finite maxima (e.g. an all -inf column) are replaced by 0 so the
  # shift stays well-defined; stop_gradient keeps the max out of backprop.
  safe_max = tf.stop_gradient(
      tf.where(tf.is_finite(raw_max), raw_max, tf.zeros_like(raw_max)))
  # Standard log-sum-exp trick: shift, blend in probability space, shift back.
  shifted = tf.exp(inputs - raw_max)
  blended = tf.log(tf.einsum("ijk,kj->ij", weights, shifted)) + safe_max
  return blended
def loop_cond(itr, obj_accum, unused_params, unused_attend_params,
              unused_flattened_states, unused_global_state, all_obj,
              init_obj, *args):
  """Termination conditions of the sub-problem optimization loop."""
  del args  # unused

  below_max_iter = tf.less(itr, num_iter)  # We've run < num_iter times
  obj_is_finite = tf.is_finite(obj_accum)  # The objective is still finite

  if self.obj_train_max_multiplier <= 0:
    return tf.logical_and(below_max_iter, obj_is_finite,
                          name="training_loop_cond")

  current_obj = tf.gather(all_obj, itr)
  # Account for negative init_obj too.
  max_diff = (self.obj_train_max_multiplier - 1) * tf.abs(init_obj)
  max_obj = init_obj + max_diff
  # The objective is a reasonable multiplier of the original objective.
  obj_in_bounds = tf.less(current_obj, max_obj)
  return tf.logical_and(tf.logical_and(below_max_iter, obj_is_finite),
                        obj_in_bounds,
                        name="training_loop_cond")
def exponential_moving_average(
    self, var, avg_var=None, decay=0.999, ignore_nan=False):
  """Calculates the exponential moving average.

  Adds a variable to keep track of the exponential moving average and adds an
  update operation to the bookkeeper. The name of the variable is
  '%s_average' % name prefixed with the current variable scope.

  Args:
    var: The variable for which a moving average should be computed.
    avg_var: The variable to set the average into, if None create a zero
      initialized one.
    decay: How much history to use in the moving average.
      Higher, means more history values [0, 1) accepted.
    ignore_nan: If the value is NaN or Inf, skip it.

  Returns:
    The averaged variable.

  Raises:
    ValueError: if decay is not in [0, 1).
  """
  # NOTE(review): this method reads `self.g` here but registers the update op
  # on `self._g` below — confirm both refer to the same graph.
  with self.g.as_default():
    if decay < 0 or decay >= 1.0:
      raise ValueError('Decay is %5.2f, but has to be in [0, 1).' % decay)
    if not avg_var:
      shape = var.get_shape()
      avg_name = '%s_average' % _bare_var_name(var)
      avg_var = tf.Variable(
          tf.zeros_initializer(shape=shape, dtype=var.dtype),
          name=avg_name,
          trainable=False)
    # Effective decay ramps up from 0.9 while global_step is small, capped at
    # the requested decay.
    num_updates = tf.cast(self.global_step, tf.float32)
    decay = tf.maximum(
        0.9, tf.minimum(decay, (1.0 + num_updates) / (10.0 + num_updates)))
    with tf.device(avg_var.device):
      if ignore_nan:
        # Fixed: `tf.select` was removed in TensorFlow 1.0; `tf.where` is the
        # drop-in replacement and is what the rest of this file already uses.
        var = tf.where(tf.is_finite(var), var, avg_var)
      avg_update = tf.assign_sub(avg_var, (1 - decay) * (avg_var - var))
    self._g.add_to_collection(GraphKeys.UPDATE_OPS, avg_update)
    return avg_var
def apply_updates(self):
  """Builds the op that applies all accumulated per-device gradients.

  Casts gradients to FP32, sums them within and across devices (via NCCL
  when more than one device is present), optionally rescales them to undo
  loss scaling, skips the update entirely when any gradient is non-finite,
  and adjusts the dynamic loss-scaling variable. May only be called once.

  Returns:
    A single grouped op ('TrainingOp') that performs the whole update.
  """
  assert not self._updates_applied
  self._updates_applied = True
  devices = list(self._dev_grads.keys())
  total_grads = sum(len(grads) for grads in self._dev_grads.values())
  assert len(devices) >= 1 and total_grads >= 1
  ops = []
  with absolute_name_scope(self.scope):
    # Cast gradients to FP32 and calculate partial sum within each device.
    dev_grads = OrderedDict()  # device => [(grad, var), ...]
    for dev_idx, dev in enumerate(devices):
      with tf.name_scope('ProcessGrads%d' % dev_idx), tf.device(dev):
        sums = []
        for gv in zip(*self._dev_grads[dev]):
          # All entries in this group must refer to the same variable.
          assert all(v is gv[0][1] for g, v in gv)
          g = [tf.cast(g, tf.float32) for g, v in gv]
          g = g[0] if len(g) == 1 else tf.add_n(g)
          sums.append((g, gv[0][1]))
        dev_grads[dev] = sums

    # Sum gradients across devices.
    if len(devices) > 1:
      with tf.name_scope('SumAcrossGPUs'), tf.device(None):
        for var_idx, grad_shape in enumerate(self._grad_shapes):
          g = [dev_grads[dev][var_idx][0] for dev in devices]
          if np.prod(grad_shape):  # nccl does not support zero-sized tensors
            g = tf.contrib.nccl.all_sum(g)
          for dev, gg in zip(devices, g):
            dev_grads[dev][var_idx] = (gg, dev_grads[dev][var_idx][1])

    # Apply updates separately on each device.
    for dev_idx, (dev, grads) in enumerate(dev_grads.items()):
      with tf.name_scope('ApplyGrads%d' % dev_idx), tf.device(dev):
        # Scale gradients as needed (average over accumulations, and undo
        # loss scaling if it is enabled).
        if self.use_loss_scaling or total_grads > 1:
          with tf.name_scope('Scale'):
            coef = tf.constant(np.float32(1.0 / total_grads), name='coef')
            coef = self.undo_loss_scaling(coef)
            grads = [(g * coef, v) for g, v in grads]

        # Check for overflows: a single scalar bool, True iff every gradient
        # on this device is entirely finite.
        with tf.name_scope('CheckOverflow'):
          grad_ok = tf.reduce_all(
              tf.stack([tf.reduce_all(tf.is_finite(g)) for g, v in grads]))

        # Update weights and adjust loss scaling. On overflow the update is
        # skipped; with dynamic loss scaling, success grows ls_var and
        # overflow shrinks it.
        with tf.name_scope('UpdateWeights'):
          opt = self._dev_opt[dev]
          ls_var = self.get_loss_scaling_var(dev)
          if not self.use_loss_scaling:
            ops.append(
                tf.cond(grad_ok, lambda: opt.apply_gradients(grads),
                        tf.no_op))
          else:
            ops.append(
                tf.cond(
                    grad_ok,
                    lambda: tf.group(
                        tf.assign_add(ls_var, self.loss_scaling_inc),
                        opt.apply_gradients(grads)),
                    lambda: tf.group(
                        tf.assign_sub(ls_var, self.loss_scaling_dec))))

        # Report statistics on the last device.
        if dev == devices[-1]:
          with tf.name_scope('Statistics'):
            ops.append(
                autosummary(self.id + '/learning_rate', self.learning_rate))
            ops.append(
                autosummary(self.id + '/overflow_frequency',
                            tf.where(grad_ok, 0, 1)))
            if self.use_loss_scaling:
              ops.append(
                  autosummary(self.id + '/loss_scaling_log2', ls_var))

    # Initialize variables and group everything into a single op.
    self.reset_optimizer_state()
    init_uninited_vars(list(self._dev_ls_var.values()))
    return tf.group(*ops, name='TrainingOp')
def whether_proceed(grads):
  """Returns a scalar bool tensor: True iff every gradient is entirely finite.

  Args:
    grads: Gradient tensors; `_map` applies the per-tensor finiteness check
      to each of them.  # NOTE(review): `_map` is defined elsewhere in this
      # file — presumably a (possibly structure-aware) map; confirm.

  Returns:
    Scalar boolean `Tensor`; True only if no gradient contains NaN/Inf.
  """
  finits = _map(lambda grad: tf.reduce_all(tf.is_finite(grad)), grads)
  return tf.reduce_all(finits)
def _process(self, grads):
  """Checks that every variable in `grads` holds only finite values.

  The previous version created a bare `tf.Assert` op that was never wired
  into the graph, so it never executed (hence the old "TODO make assert
  work"). Here each assert is attached as a control dependency of an
  identity-wrapped gradient, so fetching the returned gradients triggers
  the check.

  Args:
    grads: List of (gradient, variable) pairs.

  Returns:
    List of (gradient, variable) pairs; gradients carry the finiteness
    asserts as control dependencies.
  """
  checked = []
  for grad, var in grads:
    assert_op = tf.Assert(tf.reduce_all(tf.is_finite(var)), [var])
    if grad is None:
      # Nothing to hang the dependency on; keep the pair unchanged.
      checked.append((grad, var))
    else:
      with tf.control_dependencies([assert_op]):
        checked.append((tf.identity(grad), var))
  return checked
def make_finite(t, replacement):
  """Replaces non-finite tensor values with the replacement value."""
  finite_mask = tf.is_finite(t)
  return tf.where(finite_mask, t, replacement)
def minimize(value_and_gradients_function,
             initial_position,
             tolerance=1e-8,
             x_tolerance=0,
             f_relative_tolerance=0,
             initial_inverse_hessian_estimate=None,
             max_iterations=50,
             parallel_iterations=1,
             name=None):
  """Applies the BFGS algorithm to minimize a differentiable function.

  Performs unconstrained minimization of a differentiable function using the
  BFGS scheme. For details of the algorithm, see
  [Nocedal and Wright(2006)][1].

  ### Usage:

  The following example demonstrates the BFGS optimizer attempting to find
  the minimum for a simple two dimensional quadratic objective function.

  ```python
    minimum = np.array([1.0, 1.0])  # The center of the quadratic bowl.
    scales = np.array([2.0, 3.0])  # The scales along the two axes.

    # The objective function and the gradient.
    def quadratic(x):
      value = tf.reduce_sum(scales * (x - minimum) ** 2)
      return value, tf.gradients(value, x)[0]

    start = tf.constant([0.6, 0.8])  # Starting point for the search.
    optim_results = tfp.optimizer.bfgs_minimize(
        quadratic, initial_position=start, tolerance=1e-8)

    with tf.Session() as session:
      results = session.run(optim_results)
      # Check that the search converged
      assert(results.converged)
      # Check that the argmin is close to the actual value.
      np.testing.assert_allclose(results.position, minimum)
      # Print out the total number of function evaluations it took. Should be 6.
      print ("Function evaluations: %d" % results.num_objective_evaluations)
  ```

  ### References:
  [1]: Jorge Nocedal, Stephen Wright. Numerical Optimization. Springer Series
    in Operations Research. pp 136-140. 2006
    http://pages.mtu.edu/~struther/Courses/OLD/Sp2013/5630/Jorge_Nocedal_Numerical_optimization_267490.pdf

  Args:
    value_and_gradients_function: A Python callable that accepts a point as a
      real `Tensor` and returns a tuple of `Tensor`s of real dtype containing
      the value of the function and its gradient at that point. The function
      to be minimized. The first component of the return value should be a
      real scalar `Tensor`. The second component (the gradient) should have
      the same shape as the input value to the function.
    initial_position: `Tensor` of real dtype. The starting point of the
      search procedure. Should be a point at which the function value and the
      gradient norm are finite.
    tolerance: Scalar `Tensor` of real dtype. Specifies the gradient
      tolerance for the procedure. If the supremum norm of the gradient
      vector is below this number, the algorithm is stopped.
    x_tolerance: Scalar `Tensor` of real dtype. If the absolute change in the
      position between one iteration and the next is smaller than this
      number, the algorithm is stopped.
    f_relative_tolerance: Scalar `Tensor` of real dtype. If the relative
      change in the objective value between one iteration and the next is
      smaller than this value, the algorithm is stopped.
    initial_inverse_hessian_estimate: Optional `Tensor` of the same dtype as
      the components of the output of the `value_and_gradients_function`. If
      specified, the shape should be `initial_position.shape` * 2. For
      example, if the shape of `initial_position` is `[n]`, then the
      acceptable shape of `initial_inverse_hessian_estimate` is as a square
      matrix of shape `[n, n]`. If the shape of `initial_position` is
      `[n, m]`, then the required shape is `[n, m, n, m]`. For the
      correctness of the algorithm, it is required that this parameter be
      symmetric and positive definite. Specifies the starting estimate for
      the inverse of the Hessian at the initial point. If not specified, the
      identity matrix is used as the starting estimate for the inverse
      Hessian.
    max_iterations: Scalar positive int32 `Tensor`. The maximum number of
      iterations for BFGS updates.
    parallel_iterations: Positive integer. The number of iterations allowed
      to run in parallel.
    name: (Optional) Python str. The name prefixed to the ops created by this
      function. If not supplied, the default name 'minimize' is used.

  Returns:
    optimizer_results: A namedtuple containing the following items:
      converged: Scalar boolean tensor indicating whether the minimum was
        found within tolerance.
      failed: Scalar boolean tensor indicating whether a line search step
        failed to find a suitable step size satisfying Wolfe conditions. In
        the absence of any constraints on the number of objective evaluations
        permitted, this value will be the complement of `converged`. However,
        if there is a constraint and the search stopped due to available
        evaluations being exhausted, both `failed` and `converged` will be
        simultaneously False.
      num_objective_evaluations: The total number of objective evaluations
        performed.
      position: A tensor containing the last argument value found during the
        search. If the search converged, then this value is the argmin of the
        objective function.
      objective_value: A tensor containing the value of the objective
        function at the `position`. If the search converged, then this is the
        (local) minimum of the objective function.
      objective_gradient: A tensor containing the gradient of the objective
        function at the `position`. If the search converged the max-norm of
        this tensor should be below the tolerance.
      inverse_hessian_estimate: A tensor containing the inverse of the
        estimated Hessian.
  """
  with tf.name_scope(name, 'minimize', [initial_position, tolerance,
                                        initial_inverse_hessian_estimate]):
    initial_position = tf.convert_to_tensor(initial_position,
                                            name='initial_position')
    dtype = initial_position.dtype.base_dtype
    tolerance = tf.convert_to_tensor(tolerance, dtype=dtype,
                                     name='grad_tolerance')
    f_relative_tolerance = tf.convert_to_tensor(f_relative_tolerance,
                                                dtype=dtype,
                                                name='f_relative_tolerance')
    x_tolerance = tf.convert_to_tensor(x_tolerance,
                                       dtype=dtype,
                                       name='x_tolerance')
    max_iterations = tf.convert_to_tensor(max_iterations,
                                          name='max_iterations')

    domain_shape = distribution_util.prefer_static_shape(initial_position)

    if initial_inverse_hessian_estimate is None:
      # Default: identity, reshaped to domain_shape x domain_shape.
      inv_hessian_shape = tf.concat([domain_shape, domain_shape], 0)
      initial_inv_hessian = tf.eye(tf.size(initial_position), dtype=dtype)
      initial_inv_hessian = tf.reshape(initial_inv_hessian,
                                       inv_hessian_shape,
                                       name='initial_inv_hessian')
    else:
      initial_inv_hessian = tf.convert_to_tensor(
          initial_inverse_hessian_estimate,
          dtype=dtype,
          name='initial_inv_hessian')

    # If an initial inverse Hessian is supplied, ensure that it is positive
    # definite. The easiest way to validate this is to compute the Cholesky
    # decomposition. However, it seems that simply adding a control
    # dependency on the decomposition result is not enough to trigger it. We
    # need to add an assert on the result.
    if initial_inverse_hessian_estimate is not None:
      # The supplied Hessian may not be of rank 2. Reshape it so it is.
      initial_inv_hessian_sqr_mat = tf.reshape(
          initial_inverse_hessian_estimate,
          tf.stack([tf.size(initial_position),
                    tf.size(initial_position)], axis=0))
      # If the matrix is not positive definite, the Cholesky decomposition
      # will fail. Adding an assert on it ensures it will be triggered.
      cholesky_factor = tf.cholesky(initial_inv_hessian_sqr_mat)
      is_positive_definite = tf.reduce_all(tf.is_finite(cholesky_factor))
      # Max-norm of (H - H^T): zero iff the supplied estimate is symmetric.
      asymmetry = tf.norm(initial_inv_hessian_sqr_mat -
                          tf.transpose(initial_inv_hessian_sqr_mat), np.inf)
      is_symmetric = tf.equal(asymmetry, 0)
      with tf.control_dependencies(
          [tf.Assert(is_positive_definite,
                     ['Initial inverse Hessian is not positive definite.',
                      initial_inverse_hessian_estimate]),
           tf.Assert(is_symmetric,
                     ['Initial inverse Hessian is not symmetric',
                      initial_inverse_hessian_estimate])]):
        f0, df0 = value_and_gradients_function(initial_position)
    else:
      f0, df0 = value_and_gradients_function(initial_position)

    # Maybe the starting point already satisfies the gradient tolerance.
    initial_convergence = _initial_convergence_test(df0, tolerance)

    # The `state` here is a BfgsOptimizerResults tuple with values for the
    # current state of the algorithm computation.
    def _cond(state):
      """Stopping condition for the algorithm."""
      keep_going = tf.logical_not(state.converged | state.failed |
                                  (state.num_iterations >= max_iterations))
      return keep_going

    def _body(state):
      """Main optimization loop."""
      search_direction = _get_search_direction(
          state.inverse_hessian_estimate, state.objective_gradient)
      derivative_at_start_pt = tf.reduce_sum(state.objective_gradient *
                                             search_direction)
      # If the derivative at the start point is not negative, reset the
      # Hessian estimate and recompute the search direction.
      needs_reset = derivative_at_start_pt >= 0

      def _reset_search_dirn():
        search_direction = _get_search_direction(initial_inv_hessian,
                                                 state.objective_gradient)
        return search_direction, initial_inv_hessian

      search_direction, inv_hessian_estimate = (
          tf.contrib.framework.smart_cond(
              needs_reset,
              true_fn=_reset_search_dirn,
              false_fn=lambda: (search_direction,
                                state.inverse_hessian_estimate)))
      # Restrict the objective to the 1-D ray from the current position
      # along the search direction, then line-search along it.
      line_search_value_grad_func = _restrict_along_direction(
          value_and_gradients_function, state.position, search_direction)
      derivative_at_start_pt = tf.reduce_sum(state.objective_gradient *
                                             search_direction)
      ls_result = linesearch.hager_zhang(
          line_search_value_grad_func,
          initial_step_size=tf.convert_to_tensor(1, dtype=dtype),
          objective_at_zero=state.objective_value,
          grad_objective_at_zero=derivative_at_start_pt)
      state_after_ls = _update_state(
          state,
          failed=~ls_result.converged,  # Fail if line search failed.
          num_iterations=state.num_iterations + 1,
          num_objective_evaluations=(
              state.num_objective_evaluations + ls_result.func_evals),
          inverse_hessian_estimate=inv_hessian_estimate)

      def _do_bfgs_update():
        state_updated = _update_position(
            value_and_gradients_function,
            state_after_ls,
            search_direction * ls_result.left_pt,
            tolerance, f_relative_tolerance, x_tolerance)
        # If not converged, update the Hessian.
        return tf.contrib.framework.smart_cond(
            state_updated.converged,
            lambda: state_updated,
            lambda: _update_inv_hessian(state_after_ls, state_updated))

      next_state = tf.contrib.framework.smart_cond(
          state_after_ls.failed,
          true_fn=lambda: state_after_ls,
          false_fn=_do_bfgs_update)
      return [next_state]

    initial_state = BfgsOptimizerResults(
        converged=initial_convergence,
        failed=False,
        num_iterations=tf.convert_to_tensor(0),
        num_objective_evaluations=1,
        position=initial_position,
        objective_value=f0,
        objective_gradient=df0,
        inverse_hessian_estimate=initial_inv_hessian)

    return tf.while_loop(_cond, _body, [initial_state],
                         parallel_iterations=parallel_iterations)[0]
def fit_one_step(
    model_matrix,
    response,
    model,
    model_coefficients_start=None,
    predicted_linear_response_start=None,
    l2_regularizer=None,
    dispersion=None,
    offset=None,
    learning_rate=None,
    fast_unsafe_numerics=True,
    name=None):
  """Runs one step of Fisher scoring.

  Args:
    model_matrix: (Batch of) `float`-like, matrix-shaped `Tensor` where each
      row represents a sample's features.
    response: (Batch of) vector-shaped `Tensor` where each element represents a
      sample's observed response (to the corresponding row of features). Must
      have same `dtype` as `model_matrix`.
    model: `tfp.glm.ExponentialFamily`-like instance used to construct the
      negative log-likelihood loss, gradient, and expected Hessian (i.e., the
      Fisher information matrix).
    model_coefficients_start: Optional (batch of) vector-shaped `Tensor`
      representing the initial model coefficients, one for each column in
      `model_matrix`. Must have same `dtype` as `model_matrix`.
      Default value: Zeros.
    predicted_linear_response_start: Optional `Tensor` with `shape`, `dtype`
      matching `response`; represents `offset` shifted initial linear
      predictions based on `model_coefficients_start`.
      Default value: `offset` if `model_coefficients is None`, and
      `tf.linalg.matvec(model_matrix, model_coefficients_start) + offset`
      otherwise.
    l2_regularizer: Optional scalar `Tensor` representing L2 regularization
      penalty, i.e.,
      `loss(w) = sum{-log p(y[i]|x[i],w) : i=1..n} + l2_regularizer ||w||_2^2`.
      Default value: `None` (i.e., no L2 regularization).
    dispersion: Optional (batch of) `Tensor` representing `response` dispersion,
      i.e., as in, `p(y|theta) := exp((y theta - A(theta)) / dispersion)`.
      Must broadcast with rows of `model_matrix`.
      Default value: `None` (i.e., "no dispersion").
    offset: Optional `Tensor` representing constant shift applied to
      `predicted_linear_response`. Must broadcast to `response`.
      Default value: `None` (i.e., `tf.zeros_like(response)`).
    learning_rate: Optional (batch of) scalar `Tensor` used to dampen iterative
      progress. Typically only needed if optimization diverges, should be no
      larger than `1` and typically very close to `1`.
      Default value: `None` (i.e., `1`).
    fast_unsafe_numerics: Optional Python `bool` indicating if solve should be
      based on Cholesky or QR decomposition.
      Default value: `True` (i.e., "prefer speed via Cholesky decomposition").
    name: Python `str` used as name prefix to ops created by this function.
      Default value: `"fit_one_step"`.

  Returns:
    model_coefficients: (Batch of) vector-shaped `Tensor`; represents the
      next estimate of the model coefficients, one for each column in
      `model_matrix`.
    predicted_linear_response: `response`-shaped `Tensor` representing linear
      predictions based on new `model_coefficients`, i.e.,
      `tf.linalg.matvec(model_matrix, model_coefficients_next) + offset`.
  """
  graph_deps = [model_matrix, response, model_coefficients_start,
                predicted_linear_response_start, dispersion, learning_rate]
  with tf.name_scope(name, 'fit_one_step', graph_deps):

    # Canonicalize shapes/dtypes of all inputs (project helper).
    [
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset,
    ] = prepare_args(
        model_matrix,
        response,
        model_coefficients_start,
        predicted_linear_response_start,
        offset)

    # Compute: mean, grad(mean, predicted_linear_response_start), and variance.
    mean, variance, grad_mean = model(predicted_linear_response_start)

    # If either `grad_mean` or `variance` is non-finite or zero, then we'll
    # replace it with a value such that the row is zeroed out. Although this
    # procedure may seem circuitous, it is necessary to ensure this algorithm is
    # itself differentiable.
    is_valid = (tf.is_finite(grad_mean) & tf.not_equal(grad_mean, 0.)
                & tf.is_finite(variance) & (variance > 0.))

    def mask_if_invalid(x, mask):
      # Elementwise: keep `x` where `is_valid`, else substitute constant `mask`
      # (cast to x's dtype). The substitutes are chosen so invalid rows get
      # zero weight below, rather than propagating NaN/Inf.
      mask = tf.fill(
          tf.shape(x),
          value=np.array(mask, x.dtype.as_numpy_dtype))
      return tf.where(is_valid, x, mask)

    # Run one step of iteratively reweighted least-squares.
    # Compute "`z`", the adjusted predicted linear response.
    # z = predicted_linear_response_start
    #     + learning_rate * (response - mean) / grad_mean
    z = (response - mean) / mask_if_invalid(grad_mean, 1.)
    # TODO(jvdillon): Rather than use learning rate, we should consider using
    # backtracking line search.
    if learning_rate is not None:
      z *= learning_rate[..., tf.newaxis]
    z += predicted_linear_response_start

    # Compute "`w`", the per-sample weight. Invalid rows get grad_mean -> 0 and
    # variance -> inf, hence w -> 0 (the row contributes nothing to the solve).
    if dispersion is not None:
      # For convenience, we'll now scale the variance by the dispersion factor.
      variance *= dispersion
    w = (mask_if_invalid(grad_mean, 0.)
         * tf.rsqrt(mask_if_invalid(variance, np.inf)))

    a = model_matrix * w[..., tf.newaxis]
    b = z * w
    # Solve `min{ || A @ model_coefficients - b ||_2**2 : model_coefficients }`
    # where `@` denotes `matmul`.

    if l2_regularizer is None:
      l2_regularizer = np.array(0, a.dtype.as_numpy_dtype)
    else:
      # Prefer a static (graph-construction-time) value when one is available,
      # so the smart_cond below can be resolved statically.
      l2_regularizer_ = distribution_util.maybe_get_static_value(
          l2_regularizer, a.dtype.as_numpy_dtype)
      if l2_regularizer_ is not None:
        l2_regularizer = l2_regularizer_

    def _embed_l2_regularization():
      """Adds synthetic observations to implement L2 regularization."""
      # `tf.matrix_solve_ls` does not respect the `l2_regularization` argument
      # when `fast_unsafe_numerics` is `False`. This function  adds synthetic
      # observations to the data to implement the regularization instead.
      # Adding observations `sqrt(l2_regularizer) * I` is mathematically
      # equivalent to adding the term
      # `-l2_regularizer ||coefficients||_2**2` to the log-likelihood.
      num_model_coefficients = num_cols(model_matrix)
      batch_shape = tf.shape(model_matrix)[:-2]
      eye = tf.eye(
          num_model_coefficients, batch_shape=batch_shape, dtype=a.dtype)
      a_ = tf.concat([a, tf.sqrt(l2_regularizer) * eye], axis=-2)
      b_ = distribution_util.pad(
          b, count=num_model_coefficients, axis=-1, back=True)
      # Return l2_regularizer=0 since its now embedded.
      l2_regularizer_ = np.array(0, a.dtype.as_numpy_dtype)
      return a_, b_, l2_regularizer_

    # Only embed when the slow (QR) path is requested AND regularization is
    # actually non-zero; otherwise pass l2_regularizer straight through.
    a, b, l2_regularizer = tf.contrib.framework.smart_cond(
        smart_reduce_all([not(fast_unsafe_numerics),
                          l2_regularizer > 0.]),
        _embed_l2_regularization,
        lambda: (a, b, l2_regularizer))

    model_coefficients_next = tf.matrix_solve_ls(
        a,
        b[..., tf.newaxis],
        fast=fast_unsafe_numerics,
        l2_regularizer=l2_regularizer,
        name='model_coefficients_next')
    model_coefficients_next = model_coefficients_next[..., 0]

    # TODO(b/79122261): The approach used in `matrix_solve_ls` could be made
    # faster by avoiding explicitly forming Q and instead keeping the
    # factorization in 'implicit' form with stacked (rescaled) Householder
    # vectors underneath the 'R' and then applying the (accumulated)
    # reflectors in the appropriate order to apply Q'. However, we don't
    # presently do this because we lack core TF functionality. For reference,
    # the vanilla QR approach is:
    #   q, r = tf.linalg.qr(a)
    #   c = tf.matmul(q, b, adjoint_a=True)
    #   model_coefficients_next = tf.matrix_triangular_solve(
    #       r, c, lower=False, name='model_coefficients_next')

    predicted_linear_response_next = calculate_linear_predictor(
        model_matrix,
        model_coefficients_next,
        offset,
        name='predicted_linear_response_next')

    return model_coefficients_next, predicted_linear_response_next
def mask_nans(x):
  """Returns a copy of `x` with every non-finite entry (NaN/Inf) set to zero."""
  finite = tf.is_finite(x)
  return tf.where(finite, x, tf.zeros_like(x))
def process(self, grads):
    """Validates a list of (gradient, variable) pairs.

    Checks (at graph-construction time) that no gradient is `None`, and wires
    a runtime finiteness check on each variable into the graph.

    Args:
      grads: Iterable of `(gradient, variable)` tuples, as produced by
        `compute_gradients`.

    Returns:
      A list of `(gradient, variable)` tuples with the same structure and
      values as the input; each returned gradient carries a control dependency
      on a finiteness assert for its variable.
    """
    checked_grads = []
    for grad, var in grads:
        assert grad is not None, "Grad is None for variable {}".format(var.name)
        # Bug fix for the old "# TODO make assert work": a bare tf.Assert(...)
        # whose returned op is discarded never executes in graph mode. Attach
        # it as a control dependency of the gradient so the check fires
        # whenever the gradient is evaluated.
        assert_op = tf.Assert(tf.reduce_all(tf.is_finite(var)), [var])
        with tf.control_dependencies([assert_op]):
            grad = tf.identity(grad)
        checked_grads.append((grad, var))
    return checked_grads
def apply_updates(self, allow_no_op: bool = False) -> tf.Operation:
    """Construct training op to update the registered variables based on their gradients.

    Pipeline: (1) per device, filter/cast/sum/scale raw gradients; (2) all-sum
    gradients across devices via NCCL; (3) per device, optionally accumulate
    over minibatch_multiplier steps, then apply gradients only when every
    accumulated gradient is finite; (4) adjust the dynamic loss-scaling factor.

    Args:
      allow_no_op: If True and no devices are registered, return a no-op
        instead of failing.

    Returns:
      A single grouped tf.Operation that performs one training step.
    """
    tfutil.assert_tf_initialized()
    # This op may only be constructed once per Optimizer instance.
    assert not self._updates_applied
    self._updates_applied = True
    all_ops = []

    # Check for no-op.
    if allow_no_op and len(self._devices) == 0:
        with tfutil.absolute_name_scope(self.scope):
            return tf.no_op(name='TrainingOp')

    # Clean up gradients.
    for device_idx, device in enumerate(self._devices.values()):
        with tfutil.absolute_name_scope(self.scope + "/Clean%d" % device_idx), tf.device(
                device.name):
            for var, grad in device.grad_raw.items():

                # Filter out disconnected gradients and convert to float32.
                grad = [g for g in grad if g is not None]
                grad = [tf.cast(g, tf.float32) for g in grad]

                # Sum within the device.
                if len(grad) == 0:
                    grad = tf.zeros(var.shape)  # No gradients => zero.
                elif len(grad) == 1:
                    grad = grad[0]  # Single gradient => use as is.
                else:
                    grad = tf.add_n(grad)  # Multiple gradients => sum.

                # Scale as needed.
                # Average over the raw gradient terms and over devices; note the
                # denominator uses the pre-filter count len(device.grad_raw[var]),
                # not the post-filter count -- presumably intentional so
                # disconnected terms count as zero. TODO(review): confirm.
                scale = 1.0 / len(device.grad_raw[var]) / len(
                    self._devices)
                scale = tf.constant(scale, dtype=tf.float32, name="scale")
                if self.minibatch_multiplier is not None:
                    scale /= tf.cast(self.minibatch_multiplier, tf.float32)
                # Remove the loss-scaling factor that was multiplied into the loss.
                scale = self.undo_loss_scaling(scale)
                device.grad_clean[var] = grad * scale

    # Sum gradients across devices.
    if len(self._devices) > 1:
        with tfutil.absolute_name_scope(self.scope + "/Broadcast"), tf.device(None):
            for all_vars in zip(*[
                    device.grad_clean.keys()
                    for device in self._devices.values()
            ]):
                if len(all_vars) > 0 and all(
                        dim > 0 for dim in all_vars[0].shape.as_list()
                ):  # NCCL does not support zero-sized tensors.
                    all_grads = [
                        device.grad_clean[var] for device, var in zip(
                            self._devices.values(), all_vars)
                    ]
                    # In-place all-reduce: every device ends up with the sum.
                    all_grads = nccl_ops.all_sum(all_grads)
                    for device, var, grad in zip(self._devices.values(),
                                                 all_vars, all_grads):
                        device.grad_clean[var] = grad

    # Apply updates separately on each device.
    for device_idx, device in enumerate(self._devices.values()):
        with tfutil.absolute_name_scope(self.scope + "/Apply%d" % device_idx), tf.device(
                device.name):
            # pylint: disable=cell-var-from-loop
            # (The lambdas below close over loop variables, but each is consumed
            # immediately by tf.cond within the same iteration, so the late-binding
            # pitfall does not apply.)

            # Accumulate gradients over time.
            if self.minibatch_multiplier is None:
                acc_ok = tf.constant(True, name='acc_ok')
                device.grad_acc = OrderedDict(device.grad_clean)
            else:
                # Create variables.
                # control_dependencies(None) clears any surrounding control deps so
                # the accumulator variables are created cleanly.
                with tf.control_dependencies(None):
                    for var in device.grad_clean.keys():
                        device.grad_acc_vars[var] = tf.Variable(
                            tf.zeros(var.shape),
                            trainable=False,
                            name="grad_acc_var")
                    device.grad_acc_count = tf.Variable(
                        tf.zeros([]), trainable=False, name="grad_acc_count")

                # Track counter.
                count_cur = device.grad_acc_count + 1.0
                count_inc_op = lambda: tf.assign(device.grad_acc_count, count_cur)
                count_reset_op = lambda: tf.assign(device.grad_acc_count, tf.zeros([]))
                # acc_ok: this step completes a full accumulation window.
                acc_ok = (count_cur >= tf.cast(self.minibatch_multiplier,
                                               tf.float32))
                all_ops.append(
                    tf.cond(acc_ok, count_reset_op, count_inc_op))

                # Track gradients.
                for var, grad in device.grad_clean.items():
                    acc_var = device.grad_acc_vars[var]
                    acc_cur = acc_var + grad
                    device.grad_acc[var] = acc_cur
                    with tf.control_dependencies([acc_cur]):
                        acc_inc_op = lambda: tf.assign(acc_var, acc_cur)
                        acc_reset_op = lambda: tf.assign(
                            acc_var, tf.zeros(var.shape))
                        # Reset after a full window, otherwise keep accumulating.
                        all_ops.append(
                            tf.cond(acc_ok, acc_reset_op, acc_inc_op))

            # No overflow => apply gradients.
            # Apply only when the window is complete AND every accumulated
            # gradient is finite (overflow under loss scaling produces Inf/NaN).
            all_ok = tf.reduce_all(
                tf.stack([acc_ok] + [
                    tf.reduce_all(tf.is_finite(g))
                    for g in device.grad_acc.values()
                ]))
            apply_op = lambda: device.optimizer.apply_gradients(
                [(tf.cast(grad, var.dtype), var)
                 for var, grad in device.grad_acc.items()])
            all_ops.append(tf.cond(all_ok, apply_op, tf.no_op))

            # Adjust loss scaling.
            # Dynamic loss scaling: grow on success, shrink on overflow; only
            # updated at the end of an accumulation window (acc_ok).
            if self.use_loss_scaling:
                ls_inc_op = lambda: tf.assign_add(device.loss_scaling_var,
                                                  self.loss_scaling_inc)
                ls_dec_op = lambda: tf.assign_sub(device.loss_scaling_var,
                                                  self.loss_scaling_dec)
                ls_update_op = lambda: tf.group(
                    tf.cond(all_ok, ls_inc_op, ls_dec_op))
                all_ops.append(tf.cond(acc_ok, ls_update_op, tf.no_op))

            # Last device => report statistics.
            if device_idx == len(self._devices) - 1:
                all_ops.append(
                    autosummary.autosummary(self.id + "/learning_rate",
                                            self.learning_rate))
                all_ops.append(
                    autosummary.autosummary(self.id + "/overflow_frequency",
                                            tf.where(all_ok, 0, 1),
                                            condition=acc_ok))
                if self.use_loss_scaling:
                    all_ops.append(
                        autosummary.autosummary(
                            self.id + "/loss_scaling_log2",
                            device.loss_scaling_var))

    # Initialize variables.
    self.reset_optimizer_state()
    if self.use_loss_scaling:
        tfutil.init_uninitialized_vars(
            [device.loss_scaling_var for device in self._devices.values()])
    if self.minibatch_multiplier is not None:
        tfutil.run([
            var.initializer for device in self._devices.values()
            for var in list(device.grad_acc_vars.values()) +
            [device.grad_acc_count]
        ])

    # Group everything into a single op.
    with tfutil.absolute_name_scope(self.scope):
        return tf.group(*all_ops, name="TrainingOp")