def _measure_valued_normal_scale_grad(function, dist_samples, dist,
                                      coupling=True):
  """Measure valued gradient estimate wrt the scale of the Normal `dist`.

  For details, see Section 6 of "Monte Carlo Gradient Estimation in Machine
  learning".

  Each dimension of every sample is replaced in turn by a "positive" and a
  "negative" perturbed value; the estimate is the difference of `function`
  evaluated at the two perturbed inputs, divided by the scale.

  Args:
    function: A function for which to compute the stochastic gradient.
    dist_samples: a tf.Tensor of samples from `dist` (N x D according to the
      shape comments below).
    dist: A tfp.distributions.Distribution instance. The code here assumes
      this distribution is from the Normal family; it reads `dist.loc` and
      `dist.stddev()`, both of which must have rank 1.
    coupling: A boolean. Whether or not to use coupling for the positive and
      negative samples. Recommended: True, as this reduces variance.

  Returns:
    A tf.Tensor whose shape is asserted to be compatible with the shape of
    `dist_samples`: the gradient of function(dist_samples) wrt the scale of
    `dist`.
  """
  loc = dist.loc
  # We will rely on backprop to compute the right gradient with respect
  # to the log scale.
  stddev = dist.stddev()
  for param in (loc, stddev):
    utils.assert_rank(param, 1)

  # Duplicate the D dimension - N x D x D.
  tiled_samples = utils.tile_second_to_last_dim(dist_samples)
  sample_shape = dist_samples.shape

  # Standard (loc 0, scale 1) noise for the two measures - N x D each.
  pos_noise = dist_utils.sample_ds_maxwell(sample_shape, loc=0., scale=1.0)
  neg_noise = (
      dist_utils.std_gaussian_from_std_dsmaxwell(pos_noise)
      if coupling else tf.random.normal(sample_shape))

  def _with_perturbed_diag(noise):
    """Returns `tiled_samples` with its diagonal set to loc + stddev * noise."""
    # N x D
    diag = loc + stddev * noise
    diag.shape.assert_is_compatible_with(sample_shape)
    # N x D x D
    return tf.linalg.set_diag(tiled_samples, diag)

  positive = _with_perturbed_diag(pos_noise)
  negative = _with_perturbed_diag(neg_noise)

  pos_values = _apply_f(function, positive)
  neg_values = _apply_f(function, negative)

  # The normalising constant is the scale (D); the division broadcasts.
  grads = (pos_values - neg_values) / stddev
  # grads - N x D
  grads.shape.assert_is_compatible_with(sample_shape)
  return grads
def score_function_loss(function, dist_samples, dist):
  """Computes the score function surrogate loss.

  The surrogate is stop_gradient(function(dist_samples)) multiplied by
  dist.log_prob(stop_gradient(dist_samples)), so gradients only flow through
  the log probability of the (fixed) samples.

  Args:
    function: Callable evaluated on `dist_samples`; its output is treated as
      a constant.
    dist_samples: Samples from `dist`.
    dist: Object exposing `log_prob`; the log probabilities of the samples
      must have rank 1 (one entry per sample).

  Returns:
    The product of the constant function values and the log probabilities.
  """
  frozen_samples = tf.stop_gradient(dist_samples)
  log_probs = dist.log_prob(frozen_samples)

  # One log probability per sample.
  utils.assert_rank(log_probs, 1)

  # The function values broadcast against the log probabilities.
  function_values = tf.stop_gradient(function(dist_samples))
  return function_values * log_probs
def control_variates_surrogate_loss(
    dist, dist_samples, dist_vars,
    model_loss_fn, grad_loss_fn, control_variate_fn,
    estimate_cv_coeff=True, num_posterior_samples_cv_coeff=20):
  r"""Computes a surrogate loss by computing the gradients manually.

  The loss function returned is:
     \sum_i stop_grad(grad_i) * var_i,
  where grad_i was computed from stochastic_loss and control variate.

  This function uses `compute_control_variate_coeff` to compute the control
  variate coefficients and should be used only in conjunction with control
  variates.

  Args:
    dist: a tfp.distributions.Distribution instance.
    dist_samples: samples from dist.
    dist_vars: the variables for which we are interested in computing
      gradients. The distribution samples should depend on these variables.
      Must be iterable more than once.
    model_loss_fn: A function with signature: lambda samples: f(samples).
      The model loss function.
    grad_loss_fn: The gradient estimator function.
      Needs to return both a surrogate loss and a dictionary of jacobians.
    control_variate_fn: The surrogate control variate function. Its gradient
      will be used as a control variate.
    estimate_cv_coeff: Boolean. Whether or not to use a coefficient
      for the control variate to minimize variance of the surrogate loss
      estimate. If False, the control variate coefficient is set to 1.
      If True, uses `compute_control_variate_coeff` to compute the
      coefficient.
    num_posterior_samples_cv_coeff: The number of posterior samples used
      to compute the cv coeff. Only used if `estimate_cv_coeff` is True.

  Returns:
    A tuple containing two elements:
      * the surrogate loss - a tf.Tensor [num_samples].
      * the jacobians wrt dist_vars - a dict keyed by the entries of
        `dist_vars`.
  """
  _, expected_control_variate, _, cv_jacobians = control_variate_fn(
      dist, dist_samples, model_loss_fn, grad_loss_fn=grad_loss_fn)
  _, loss_jacobians = grad_loss_fn(model_loss_fn, dist_samples, dist)

  jacobians = {}
  for dist_var in dist_vars:
    if estimate_cv_coeff:
      cv_coeff = compute_control_variate_coeff(
          dist, dist_var,
          model_loss_fn=model_loss_fn,
          grad_loss_fn=grad_loss_fn,
          control_variate_fn=control_variate_fn,
          num_samples=num_posterior_samples_cv_coeff)
    else:
      cv_coeff = 1.

    var_jacobians = (
        loss_jacobians[dist_var] - cv_coeff * cv_jacobians[dist_var])
    # Num samples x num_variables
    utils.assert_rank(var_jacobians, 2)
    jacobians[dist_var] = var_jacobians

    # NOTE(review): presumably folds the gradient of the (scaled) expected
    # control variate back into jacobians[dist_var] - confirm against
    # utils.add_grads_to_jacobians.
    utils.add_grads_to_jacobians(
        jacobians, expected_control_variate * cv_coeff, [dist_var])

  surrogate_loss = 0.0
  for dist_var in dist_vars:
    per_var_terms = tf.stop_gradient(jacobians[dist_var]) * dist_var
    # Sum over this variable's dimensions before accumulating. Adding the
    # [num_samples, num_variables] terms of different variables first would
    # fail (or silently broadcast and over-count) whenever the variables do
    # not all have the same number of elements.
    surrogate_loss += tf.reduce_sum(per_var_terms, axis=1)

  return surrogate_loss, jacobians
def compute_control_variate_coeff(
    dist, dist_var, model_loss_fn, grad_loss_fn, control_variate_fn,
    num_samples, moving_averages=False, eps=1e-3):
  r"""Computes the control variate coefficient for the given variable.

  The coefficient is given by:
    \sum_k cov(df/d var_k, dcv/d var_k) / (\sum var(dcv/d var_k) + eps)

  Where var_k is the k'th element of the variable dist_var.
  The covariance and variance estimates are computed from `num_samples` new
  samples drawn from the distribution `dist`.

  Args:
    dist: a tfp.distributions.Distribution instance.
    dist_var: the variable for which we are interested in computing the
      coefficient. The distribution samples should depend on this variable.
      It is used as the key into the jacobian dictionaries.
    model_loss_fn: A function with signature: lambda samples: f(samples).
      The model loss function.
    grad_loss_fn: The gradient estimator function.
      Needs to return both a surrogate loss and a dictionary of jacobians;
      the jacobians are read from the last element of its return value.
    control_variate_fn: The surrogate control variate function. Its gradient
      will be used as a control variate. The jacobians are read from the
      last element of its return value.
    num_samples: Int. The number of samples to use for the cov/var
      calculation.
    moving_averages: Bool. Whether or not to pass the coefficient through a
      `snt.MovingAverage` (decay 0.9).
    eps: Float. Added to the denominator to stabilize the division.

  Returns:
    a tf.Tensor of rank 0. The coefficient for the input variable, with
    gradients stopped.
  """
  # Resample to avoid biased gradients.
  fresh_samples = dist.sample(num_samples)
  cv_outputs = control_variate_fn(
      dist, fresh_samples, model_loss_fn, grad_loss_fn=grad_loss_fn)
  loss_outputs = grad_loss_fn(model_loss_fn, fresh_samples, dist)

  # Both jacobians are Num samples x num_variables.
  cv_jac = cv_outputs[-1][dist_var]
  loss_jac = loss_outputs[-1][dist_var]
  utils.assert_rank(loss_jac, 2)
  utils.assert_rank(cv_jac, 2)

  # Per-dimension statistics, taken over the sample axis.
  loss_mean = tf.reduce_mean(loss_jac, axis=0)
  cv_mean, cv_variance = tf.nn.moments(cv_jac, axes=[0])

  centered_loss = loss_jac - loss_mean
  centered_cv = cv_jac - cv_mean
  covariance = tf.reduce_mean(centered_loss * centered_cv, axis=0)

  utils.assert_rank(cv_variance, 1)
  utils.assert_rank(covariance, 1)

  # Compute the coefficient which minimizes variance.
  # Since we want to minimize the variance across parameter dimensions,
  # the optimal coefficient is given by the sum of covariances per dimension
  # over the sum of variances per dimension.
  total_covariance = tf.reduce_sum(covariance)
  total_variance = tf.reduce_sum(cv_variance) + eps
  cv_coeff = tf.stop_gradient(total_covariance / total_variance)
  utils.assert_rank(cv_coeff, 0)

  if not moving_averages:
    return cv_coeff

  return tf.stop_gradient(snt.MovingAverage(decay=0.9)(cv_coeff))