def _phi(r, order):
    """Coordinate-wise nonlinearity that defines the interpolation order.

    See https://en.wikipedia.org/wiki/Polyharmonic_spline for the definition.

    Args:
      r: input op
      order: interpolation order

    Returns:
      phi_k evaluated coordinate-wise on r, for k = order
    """
    # Clamping with EPSILON prevents log(0), sqrt(0), etc.
    # sqrt(0) is well-defined, but its gradient is not.
    with ops.name_scope('phi'):
        if order == 1:
            return math_ops.sqrt(math_ops.maximum(r, EPSILON))
        if order == 2:
            # Only the argument of the log is clamped; the prefactor uses r.
            return 0.5 * r * math_ops.log(math_ops.maximum(r, EPSILON))
        if order == 4:
            return 0.5 * math_ops.square(r) * math_ops.log(
                math_ops.maximum(r, EPSILON))

        # Generic orders: both the power and the log see the clamped input.
        clamped = math_ops.maximum(r, EPSILON)
        if order % 2 == 0:
            return 0.5 * math_ops.pow(clamped, 0.5 * order) * math_ops.log(
                clamped)
        return math_ops.pow(clamped, 0.5 * order)
def _sample_n(self, n, seed=None):
    """Draw `n` samples as `loc + scale * (-log(-log(U)))` with `U` uniform.

    Args:
      n: number of samples; prepended to the shape of `self.mean()`.
      seed: optional seed forwarded to `random_ops.random_uniform`.

    Returns:
      Tensor of samples with shape `[n] + shape(self.mean())`.
    """
    shape = array_ops.concat(0, ([n], array_ops.shape(self.mean())))
    # `as_numpy_dtype` is a property that already yields the NumPy scalar
    # type; calling its result would produce a scalar instance that cannot
    # be used as a constructor or passed to `np.finfo` as a type.
    np_dtype = self.dtype.as_numpy_dtype
    # The uniform variates go through log() twice, so they must never be 0.
    # `np.finfo(np_dtype).tiny` is the smallest positive *normal* number;
    # a subnormal lower bound such as np.nextafter(0, 1) can still let the
    # sampler return exactly 0.
    uniform = random_ops.random_uniform(shape=shape,
                                        minval=np.finfo(np_dtype).tiny,
                                        maxval=1,
                                        dtype=self.dtype,
                                        seed=seed)
    sampled = -math_ops.log(-math_ops.log(uniform))
    return sampled * self.scale + self.loc
def logloss(y_true, y_pred):
    """Binary log loss averaged over the last axis.

    Args:
      y_true: targets; cast to the dtype of `y_pred`.
      y_pred: predictions; converted to a tensor.

    Returns:
      Mean over the last axis of
      `-(y_true * log(y_pred + eps) + (1 - y_true) * log(1 - y_pred + eps))`
      with `eps = K.epsilon()`.
    """
    y_pred = ops.convert_to_tensor(y_pred)
    y_true = math_ops.cast(y_true, y_pred.dtype)
    # K.epsilon() keeps both logarithms away from log(0).
    log_lik_ones = math_ops.multiply(
        y_true, math_ops.log(y_pred + K.epsilon()))
    log_lik_zeros = math_ops.multiply(
        (1 - y_true), math_ops.log(1 - y_pred + K.epsilon()))
    total_log_lik = log_lik_ones + log_lik_zeros
    return K.mean(-total_log_lik, axis=-1)
def _inverse(self, y):
    """Return `log(y)` when `power == 0`, else `(y**power - 1) / power`.

    The non-zero branch is evaluated as `expm1(log(y) * power) / power`.

    Args:
      y: input, first passed through `self._maybe_assert_valid_y`.

    Returns:
      The inverse-transformed tensor.
    """
    y = self._maybe_assert_valid_y(y)
    log_y = math_ops.log(y)
    if self.power == 0.:
        return log_y
    # If large y accuracy is an issue, consider using:
    # (y**self.power - 1.) / self.power when y >> 1.
    return math_ops.expm1(log_y * self.power) / self.power
def _forward_log_det_jacobian(self, x):
    """Element-wise forward log-det-Jacobian.

    Computes, with `k = self.concentration` and `s = self.scale`:
    `-(x / s)**k + (k - 1) * log(x) + log(k) - k * log(s)`.

    Args:
      x: input, first passed through `self._maybe_assert_valid_x`.

    Returns:
      Tensor holding the expression above (no reduction is applied).
    """
    x = self._maybe_assert_valid_x(x)
    neg_scaled_power = -(x / self.scale) ** self.concentration
    log_x_term = (self.concentration - 1) * math_ops.log(x)
    log_concentration = math_ops.log(self.concentration)
    log_scale_term = -self.concentration * math_ops.log(self.scale)
    return neg_scaled_power + log_x_term + log_concentration + log_scale_term
def log_prob(self, counts, name="log_prob"):
    """`Log(P[counts])`, computed for every batch member.

    For each batch member of counts `k`, `P[counts]` is the probability that
    after sampling `n` draws from this Binomial distribution, the number of
    successes is `k`. Different sequences of draws can produce the same
    counts, so the probability includes a combinatorial coefficient.

    Args:
      counts: Non-negative tensor with dtype `dtype` and whose shape can be
        broadcast with `self.p` and `self.n`. `counts` is only legal if it is
        less than or equal to `n` and its components are equal to integer
        values.
      name: Name to give this Op, defaults to "log_prob".

    Returns:
      Log probabilities for each record, shape `[N1,...,Nm]`.
    """
    n = self._n
    p = self._p
    with ops.name_scope(self.name), ops.name_scope(
            name, values=[self._n, self._p, counts]):
        counts = self._check_counts(counts)
        # k * log(p) + (n - k) * log(1 - p)
        log_success = counts * math_ops.log(p)
        log_failure = (n - counts) * math_ops.log(1 - p)
        log_unnormalized = log_success + log_failure
        # log C(n, k) expressed through log-gamma functions.
        log_n_choose_k = (math_ops.lgamma(n + 1)
                          - math_ops.lgamma(counts + 1)
                          - math_ops.lgamma(n - counts + 1))
        return log_unnormalized + log_n_choose_k
def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
    """Adds a Log Loss term to the training procedure.

    `weights` acts as a coefficient for the loss. If a scalar is provided,
    then the loss is simply scaled by the given value. If `weights` is a
    tensor of size [batch_size], then the total loss for each sample of the
    batch is rescaled by the corresponding element in the `weights` vector.
    If the shape of `weights` matches the shape of `predictions`, then the
    loss of each measurable element of `predictions` is scaled by the
    corresponding value of `weights`.

    Args:
      predictions: The predicted outputs.
      labels: The ground truth output tensor, same dimensions as
        'predictions'. Although the parameter defaults to `None`, a value is
        required.
      weights: Coefficients for the loss: a scalar, a tensor of shape
        [batch_size] or a tensor whose shape matches `predictions`.
      epsilon: A small increment to add to avoid taking a log of zero.
      scope: The scope for the operations performed in computing the loss.

    Returns:
      A scalar `Tensor` representing the loss value.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of
        `labels` or if the shape of `weights` is invalid. Also if `labels`
        or `predictions` is None.
    """
    # `labels` defaults to None only for signature reasons; fail with the
    # documented ValueError instead of an AttributeError on `.get_shape()`.
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")
    with ops.name_scope(scope, "log_loss",
                        [predictions, labels, weights]) as scope:
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        predictions = math_ops.to_float(predictions)
        labels = math_ops.to_float(labels)
        # epsilon keeps both logarithms finite at predictions of 0 or 1.
        losses = -math_ops.multiply(
            labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
                (1 - labels), math_ops.log(1 - predictions + epsilon))
        return compute_weighted_loss(losses, weights, scope=scope)
def compute_step(x_val, geometric=False):
    """Split `x_val` into a restart index and a position inside that restart.

    `c_one` and `t_mul` are not defined in this block; they are read from the
    surrounding scope.

    Args:
      x_val: progress value to decompose.
      geometric: if True, restart lengths form a geometric series with ratio
        `t_mul`; otherwise every restart has unit length.

    Returns:
      Tuple `(i_restart, x_val)`: the number of completed restarts and the
      position within the current restart.
    """
    if not geometric:
        # Unit-length restarts: integer part is the restart count, the
        # fractional part is the position within the current restart.
        i_restart = math_ops.floor(x_val)
        return i_restart, x_val - i_restart

    # Consider geometric series where t_mul != 1:
    #   1 + t_mul + t_mul^2 ... = (1 - t_mul^i_restart) / (1 - t_mul)
    # The number of restarts performed for a given x_val is the largest
    # integer i_restart with
    #   t_mul^i_restart <= 1 - x_val * (1 - t_mul)
    # tensorflow allows only log with base e, hence
    #   i_restart = floor(log(1 - x_val * (1 - t_mul)) / log(t_mul))
    i_restart = math_ops.floor(
        math_ops.log(c_one - x_val * (c_one - t_mul)) / math_ops.log(t_mul))
    # Total length of all restarts completed before the current one.
    completed_length = (c_one - t_mul ** i_restart) / (c_one - t_mul)
    # Position within the current restart, normalised by its length.
    position = (x_val - completed_length) / t_mul ** i_restart
    return i_restart, position
def log_prob(self, x, name="log_prob"):
    """`Log(P[x])`, computed for every batch member.

    Args:
      x: Non-negative floating point tensor whose shape can be broadcast
        with `self.a` and `self.b`. `x` is only legal if 0 < x < 1.
      name: Name to give this Op, defaults to "log_prob".

    Returns:
      Log probabilities for each record, shape `[N1,...,Nm]`.
    """
    a = self._a
    b = self._b
    with ops.name_scope(self.name), ops.name_scope(name, values=[a, x]):
        x = self._check_x(x)
        # (a - 1) * log(x) + (b - 1) * log(1 - x)
        log_unnormalized = (a - 1) * math_ops.log(x) + (
            b - 1) * math_ops.log(1 - x)
        # Negative log of the Beta function B(a, b).
        neg_log_beta = -(math_ops.lgamma(a) + math_ops.lgamma(b)
                         - math_ops.lgamma(a + b))
        return log_unnormalized + neg_log_beta
def _kl_gamma_gamma(g0, g1, name=None):
    """Calculate the batched KL divergence KL(g0 || g1) with g0 and g1 Gamma.

    Args:
      g0: instance of a Gamma distribution object.
      g1: instance of a Gamma distribution object.
      name: (optional) Name to use for created operations.
        Default is "kl_gamma_gamma".

    Returns:
      kl_gamma_gamma: `Tensor`. The batchwise KL(g0 || g1).
    """
    scope_values = [g0.concentration, g0.rate, g1.concentration, g1.rate]
    with ops.name_scope(name, "kl_gamma_gamma", values=scope_values):
        # Result from:
        #   http://www.fil.ion.ucl.ac.uk/~wpenny/publications/densities.ps
        # For derivation see:
        #   http://stats.stackexchange.com/questions/11646/kullback-leibler-divergence-between-two-gamma-distributions   pylint: disable=line-too-long
        digamma_term = ((g0.concentration - g1.concentration)
                        * math_ops.digamma(g0.concentration))
        lgamma_g1 = math_ops.lgamma(g1.concentration)
        lgamma_g0 = math_ops.lgamma(g0.concentration)
        log_rate_g0_term = g1.concentration * math_ops.log(g0.rate)
        log_rate_g1_term = g1.concentration * math_ops.log(g1.rate)
        rate_ratio_term = g0.concentration * (g1.rate / g0.rate - 1.)
        return (digamma_term
                + lgamma_g1
                - lgamma_g0
                + log_rate_g0_term
                - log_rate_g1_term
                + rate_ratio_term)
def log_prob(self, x, name="log_prob"):
    """Log prob of observations in `x` under these Gamma distribution(s).

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `alpha` and
        `beta`.
      name: The name to give this op.

    Returns:
      log_prob: tensor of dtype `dtype`, the log-PDFs of `x`.

    Raises:
      TypeError: if `x` and `alpha` are different dtypes.
    """
    with ops.name_scope(self.name), ops.op_scope(
            [self._alpha, self._beta, x], name):
        alpha = self._alpha
        beta = self._beta
        x = ops.convert_to_tensor(x)
        # The positivity assertion is only attached in strict mode.
        positivity_checks = []
        if self.strict:
            positivity_checks.append(check_ops.assert_positive(x))
        x = control_flow_ops.with_dependencies(positivity_checks, x)
        contrib_tensor_util.assert_same_float_dtype(tensors=[x],
                                                    dtype=self.dtype)
        # alpha * log(beta) + (alpha - 1) * log(x) - beta * x - lgamma(alpha)
        log_rate_term = alpha * math_ops.log(beta)
        log_x_term = (alpha - 1) * math_ops.log(x)
        linear_term = beta * x
        return (log_rate_term + log_x_term - linear_term
                - math_ops.lgamma(self._alpha))
def _sample_n(self, n, seed=None):
    """Draw `n` samples as the log-softmax of Gumbel-perturbed logits.

    Args:
      n: number of samples; prepended to the shape of `self.logits`.
      seed: optional seed forwarded to `random_ops.random_uniform`.

    Returns:
      Tensor of shape `[n] + shape(self.logits)` holding
      `log_softmax((gumbel + logits) / temperature)`.
    """
    sample_shape = array_ops.concat(([n], array_ops.shape(self.logits)), 0)
    # Broadcast the logits to the sample shape, then flatten every leading
    # dimension so that each row is a single event.
    broadcast_logits = self.logits * array_ops.ones(sample_shape)
    flat_logits = array_ops.reshape(broadcast_logits, [-1, self.event_size])

    # Uniform variates must be sampled from the interval (0,1] rather than
    # [0,1], as they are passed through log() to compute Gumbel variates.
    # We need to use np.finfo(np_dtype).tiny because it is the smallest,
    # positive, "normal" number. A "normal" number is such that the mantissa
    # has an implicit leading 1. Normal, positive numbers x, y have the
    # reasonable property that: x + y >= max(x, y).
    # minval=np.nextafter(np.float32(0),1)) can cause
    # tf.random_uniform(dtype=tf.float32) to sample 0.
    np_dtype = self.dtype.as_numpy_dtype
    smallest_normal = np.finfo(np_dtype).tiny
    uniform = random_ops.random_uniform(shape=array_ops.shape(flat_logits),
                                        minval=smallest_normal,
                                        maxval=1,
                                        dtype=self.dtype,
                                        seed=seed)
    gumbel_noise = -math_ops.log(-math_ops.log(uniform))
    tempered = math_ops.div(gumbel_noise + flat_logits, self._temperature_2d)
    log_probs = nn_ops.log_softmax(tempered)
    return array_ops.reshape(log_probs, sample_shape)
def _BetaincGrad(op, grad):
    """Returns gradient of betainc(a, b, x) with respect to x.

    Args:
      op: the op whose three inputs are `a`, `b` and `x`.
      grad: incoming gradient with respect to the op's output.

    Returns:
      Tuple `(None, None, dx)`; no gradient is produced for `a` or `b`.
    """
    # TODO(ebrevdo): Perhaps add the derivative w.r.t. a, b
    a, b, x = op.inputs

    # Two cases: x is a scalar and a/b are same-shaped tensors, or vice
    # versa; so it is sufficient to check against shape(a).
    shape_a = array_ops.shape(a)
    shape_x = array_ops.shape(x)
    # pylint: disable=protected-access
    _, reduce_axes_x = gen_array_ops._broadcast_gradient_args(shape_a, shape_x)
    # pylint: enable=protected-access

    # Perform operations in log space before summing, because terms
    # can grow large.
    log_beta = (gen_math_ops.lgamma(a) + gen_math_ops.lgamma(b)
                - gen_math_ops.lgamma(a + b))
    # x**(a - 1) * (1 - x)**(b - 1) / B(a, b), evaluated via exp(log(...)).
    log_partial_x = ((b - 1) * math_ops.log(1 - x)
                     + (a - 1) * math_ops.log(x) - log_beta)
    partial_x = math_ops.exp(log_partial_x)

    grad_x = array_ops.reshape(
        math_ops.reduce_sum(partial_x * grad, reduce_axes_x), shape_x)
    # TODO(b/36815900): Mark None return values as NotImplemented
    return (
        None,  # da
        None,  # db
        grad_x)
def _log_prob(self, x):
    """Log-density at `x` using `self.df`, `self.mu` and `self.sigma`.

    With `y = (x - mu) / sigma` the result is
    `lgamma((df + 1) / 2) - lgamma(df / 2) - 0.5 * log(df) - 0.5 * log(pi)
     - log(sigma) - ((df + 1) / 2) * log(1 + y**2 / df)`.

    Args:
      x: points at which to evaluate the log-density.

    Returns:
      Tensor of log-density values.
    """
    standardized = (x - self.mu) / self.sigma
    half_df = 0.5 * self.df
    lgamma_upper = math_ops.lgamma(0.5 + half_df)
    lgamma_lower = math_ops.lgamma(half_df)
    half_log_df = 0.5 * math_ops.log(self.df)
    half_log_pi = 0.5 * math.log(math.pi)
    log_sigma = math_ops.log(self.sigma)
    tail_term = (0.5 + half_df) * math_ops.log(
        1. + math_ops.square(standardized) / self.df)
    return (lgamma_upper - lgamma_lower - half_log_df - half_log_pi
            - log_sigma - tail_term)
def _inverse_log_det_jacobian(self, y, use_saved_statistics=False):
    """Return `sum(log(gamma)) - 0.5 * sum(log(variance + epsilon))`.

    Args:
      y: input tensor; its static shape must be fully defined.
      use_saved_statistics: if True, use `self.batchnorm.moving_variance`
        even in training mode.

    Returns:
      Scalar tensor with the inverse log-det-Jacobian.

    Raises:
      ValueError: if the shape of `y` is not fully defined.
    """
    if not y.shape.is_fully_defined():
        raise ValueError("Input must have shape known at graph construction.")
    input_shape = np.int32(y.shape.as_list())

    if not self.batchnorm.built:
        # Create variables.
        self.batchnorm.build(input_shape)

    event_dims = self.batchnorm.axis
    reduction_axes = []
    for axis in range(len(input_shape)):
        if axis not in event_dims:
            reduction_axes.append(axis)

    if use_saved_statistics or not self._training:
        variance = self.batchnorm.moving_variance
    else:
        # At training-time, ildj is computed from the mean and log-variance
        # across the current minibatch.
        _, variance = nn.moments(y, axes=reduction_axes, keepdims=True)
    log_variance = math_ops.log(variance + self.batchnorm.epsilon)

    # `gamma` and `log Var(y)` reductions over event_dims.
    # Log(total change in area from gamma term).
    log_total_gamma = math_ops.reduce_sum(math_ops.log(self.batchnorm.gamma))

    # Log(total change in area from log-variance term).
    log_total_variance = math_ops.reduce_sum(log_variance)

    # The ildj is scalar, as it does not depend on the values of x and is
    # constant across minibatch elements.
    return log_total_gamma - 0.5 * log_total_variance
def __init__(self, logits=None, p=None, dtype=dtypes.int32,
             validate_args=True, allow_nan_stats=False, name="Bernoulli"):
    """Construct Bernoulli distributions.

    Exactly one of `logits` and `p` must be given; the other quantity is
    derived from it.

    Args:
      logits: An N-D `Tensor` representing the log-odds of a positive event.
        Each entry in the `Tensor` parametrizes an independent Bernoulli
        distribution where the probability of an event is sigmoid(logits).
      p: An N-D `Tensor` representing the probability of a positive event.
        Each entry in the `Tensor` parameterizes an independent Bernoulli
        distribution.
      dtype: dtype for samples.
      validate_args: Whether to assert that `0 <= p <= 1`. If not
        validate_args, `log_pmf` may return nans.
      allow_nan_stats: Boolean, default False. If False, raise an exception
        if a statistic (e.g. mean/mode/etc...) is undefined for any batch
        member. If True, batch members with valid parameters leading to
        undefined statistics will return NaN for this statistic.
      name: A name for this distribution.

    Raises:
      ValueError: If p and logits are passed, or if neither are passed.
    """
    self._allow_nan_stats = allow_nan_stats
    self._name = name
    self._dtype = dtype
    self._validate_args = validate_args
    check_op = check_ops.assert_less_equal

    has_p = p is not None
    has_logits = logits is not None
    if not has_p and not has_logits:
        raise ValueError("Must pass p or logits.")
    if has_p and has_logits:
        raise ValueError("Must pass either p or logits, not both.")

    if has_logits:
        # Parameterised by log-odds: p = sigmoid(logits).
        with ops.op_scope([logits], name):
            self._logits = array_ops.identity(logits, name="logits")
        with ops.name_scope(name), ops.name_scope("p"):
            self._p = math_ops.sigmoid(self._logits)
    else:
        # Parameterised by probability: logits = log(p) - log(1 - p).
        with ops.name_scope(name):
            with ops.name_scope("p"):
                p = array_ops.identity(p)
                one = constant_op.constant(1., p.dtype)
                zero = constant_op.constant(0., p.dtype)
                if validate_args:
                    range_checks = [check_op(p, one), check_op(zero, p)]
                else:
                    range_checks = []
                self._p = control_flow_ops.with_dependencies(range_checks, p)
            with ops.name_scope("logits"):
                self._logits = math_ops.log(self._p) - math_ops.log(
                    1. - self._p)

    with ops.name_scope(name), ops.name_scope("q"):
        self._q = 1. - self._p
    self._batch_shape = array_ops.shape(self._logits)
    self._event_shape = array_ops.constant([], dtype=dtypes.int32)
def _log_prob(self, x):
    """Log-density at `x` using `self.alpha` and `self.beta`.

    Computes
    `alpha * log(beta) - lgamma(alpha) - (alpha + 1) * log(x) - beta / x`.

    Args:
      x: points at which to evaluate; asserted positive when
        `self.validate_args` is set.

    Returns:
      Tensor of log-density values.
    """
    positivity_checks = []
    if self.validate_args:
        positivity_checks.append(check_ops.assert_positive(x))
    x = control_flow_ops.with_dependencies(positivity_checks, x)

    log_rate_term = self.alpha * math_ops.log(self.beta)
    log_gamma_alpha = math_ops.lgamma(self.alpha)
    log_x_term = (self.alpha + 1.0) * math_ops.log(x)
    reciprocal_term = self.beta / x
    return log_rate_term - log_gamma_alpha - log_x_term - reciprocal_term
def _log_prob(self, x):
    """Log-density at `x` using `self.a`, `self.b` and `self.a_b_sum`.

    Computes `(a - 1) * log(x) + (b - 1) * log(1 - x)` minus the log
    normalizer `lgamma(a) + lgamma(b) - lgamma(a_b_sum)`.

    Args:
      x: sample, first passed through `self._assert_valid_sample`.

    Returns:
      Tensor of log-density values.
    """
    x = self._assert_valid_sample(x)
    log_x_term = (self.a - 1.) * math_ops.log(x)
    log_one_minus_x_term = (self.b - 1.) * math_ops.log(1. - x)
    unnormalized = log_x_term + log_one_minus_x_term
    normalizer = (math_ops.lgamma(self.a) + math_ops.lgamma(self.b)
                  - math_ops.lgamma(self.a_b_sum))
    return unnormalized - normalizer
def _inverse_log_det_jacobian(self, y):
    """Inverse log-det-Jacobian, summed over the event dimensions.

    With `a = self.concentration1` and `b = self.concentration0`, each
    element contributes
    `log(a) + log(b) + (a - 1) * log(y) + (b - 1) * log1p(-y**a)`.

    Args:
      y: input, first passed through `self._maybe_assert_valid`.

    Returns:
      The element-wise expression reduced with `reduce_sum` over
      `self._event_dims_tensor(y)`.
    """
    y = self._maybe_assert_valid(y)
    event_dims = self._event_dims_tensor(y)
    log_a = math_ops.log(self.concentration1)
    log_b = math_ops.log(self.concentration0)
    log_y_term = (self.concentration1 - 1) * math_ops.log(y)
    log1p_term = (self.concentration0 - 1) * math_ops.log1p(
        -y**self.concentration1)
    per_element = log_a + log_b + log_y_term + log1p_term
    return math_ops.reduce_sum(per_element, axis=event_dims)
def _inverse_log_det_jacobian(self, y):
    """Inverse log-det-Jacobian, summed over the event dimensions.

    With `k = self.concentration` and `s = self.scale`, each element
    contributes
    `-log1p(-y) + (1 / k - 1) * log(-log1p(-y)) + log(s / k)`.

    Args:
      y: input, first passed through `self._maybe_assert_valid_y`.

    Returns:
      The element-wise expression reduced with `reduce_sum` over
      `self._event_dims_tensor(y)`.
    """
    y = self._maybe_assert_valid_y(y)
    event_dims = self._event_dims_tensor(y)
    neg_log1p = -math_ops.log1p(-y)
    log_log_term = (1 / self.concentration - 1) * math_ops.log(
        -math_ops.log1p(-y))
    log_ratio = math_ops.log(self.scale / self.concentration)
    per_element = neg_log1p + log_log_term + log_ratio
    return math_ops.reduce_sum(per_element, axis=event_dims)
def _forward_log_det_jacobian(self, x):
    """Forward log-det-Jacobian, summed over the event dimensions.

    With `k = self.concentration` and `s = self.scale`, each element
    contributes `-(x / s)**k + (k - 1) * log(x) + log(k) - k * log(s)`.

    Args:
      x: input, first passed through `self._maybe_assert_valid_x`.

    Returns:
      The element-wise expression reduced with `reduce_sum` over
      `self._event_dims_tensor(x)`.
    """
    x = self._maybe_assert_valid_x(x)
    event_dims = self._event_dims_tensor(x)
    neg_scaled_power = -(x / self.scale) ** self.concentration
    log_x_term = (self.concentration - 1) * math_ops.log(x)
    log_concentration = math_ops.log(self.concentration)
    log_scale_term = -self.concentration * math_ops.log(self.scale)
    per_element = (neg_scaled_power + log_x_term + log_concentration
                   + log_scale_term)
    return math_ops.reduce_sum(per_element, axis=event_dims)
def _log_prob(self, counts):
    """Log-probability of `counts` successes using `self.n` and `self.p`.

    Computes `k * log(p) + (n - k) * log(1 - p)` plus the log binomial
    coefficient `lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1)`.

    Args:
      counts: success counts, first passed through `self._check_counts`.

    Returns:
      Tensor of log-probabilities.
    """
    counts = self._check_counts(counts)
    log_success = counts * math_ops.log(self.p)
    log_failure = (self.n - counts) * math_ops.log(1. - self.p)
    log_unnormalized = log_success + log_failure
    log_n_choose_k = (math_ops.lgamma(self.n + 1)
                      - math_ops.lgamma(counts + 1)
                      - math_ops.lgamma(self.n - counts + 1))
    return log_unnormalized + log_n_choose_k
def _log_prob(self, x):
    """Log-density at `x` using `self.alpha` and `self.beta`.

    Computes
    `alpha * log(beta) + (alpha - 1) * log(x) - beta * x - lgamma(alpha)`.

    Args:
      x: points at which to evaluate; asserted positive when
        `self.validate_args` is set, and checked against `self.dtype`.

    Returns:
      Tensor of log-density values.
    """
    positivity_checks = []
    if self.validate_args:
        positivity_checks.append(check_ops.assert_positive(x))
    x = control_flow_ops.with_dependencies(positivity_checks, x)
    contrib_tensor_util.assert_same_float_dtype(tensors=[x],
                                                dtype=self.dtype)

    log_rate_term = self.alpha * math_ops.log(self.beta)
    log_x_term = (self.alpha - 1.) * math_ops.log(x)
    linear_term = self.beta * x
    return (log_rate_term + log_x_term - linear_term
            - math_ops.lgamma(self.alpha))
def _log_abs_determinant(self):
    """Default `log|det|`; always logs a warning that it may be slow.

    Returns:
      `2 * sum(log(diag(chol)))` over the last axis when
      `self._can_use_cholesky()` is true, otherwise
      `log(abs(self.determinant()))`.
    """
    logging.warn(
        "Using (possibly slow) default implementation of determinant."
        " Requires conversion to a dense matrix and O(N^3) operations.")
    if not self._can_use_cholesky():
        return math_ops.log(math_ops.abs(self.determinant()))
    # The determinant is the squared product of the Cholesky diagonal.
    chol_diag = array_ops.matrix_diag_part(self._get_cached_chol())
    log_diag_sum = math_ops.reduce_sum(math_ops.log(chol_diag),
                                       reduction_indices=[-1])
    return 2 * log_diag_sum
def _entropy(self):
    """Entropy computed from `self.df` and `self.sigma`.

    Returns:
      `((df + 1) / 2) * (digamma((df + 1) / 2) - digamma(df / 2))
       + 0.5 * log(df) + lbeta([df / 2, 1 / 2]) + log(sigma)`.
    """
    # Stack df and 1 along a new trailing axis, then halve, to build the
    # argument [df / 2, 1 / 2] for the log-Beta function.
    df_col = array_ops.expand_dims(self.df * self._ones(), -1)
    ones_col = array_ops.expand_dims(self._ones(), -1)
    last_axis = len(df_col.get_shape()) - 1
    beta_arg = array_ops.concat_v2([df_col, ones_col], last_axis) / 2

    half_df = 0.5 * self.df
    digamma_term = (0.5 + half_df) * (
        math_ops.digamma(0.5 + half_df) - math_ops.digamma(half_df))
    half_log_df = 0.5 * math_ops.log(self.df)
    log_beta = special_math_ops.lbeta(beta_arg)
    log_sigma = math_ops.log(self.sigma)
    return digamma_term + half_log_df + log_beta + log_sigma
def _entropy(self):
    """Entropy computed from `self.df` and `self.scale`.

    Returns:
      `log|scale| + 0.5 * log(df) + lbeta([df / 2, 1 / 2])
       + 0.5 * (df + 1) * (digamma((df + 1) / 2) - digamma(df / 2))`.
    """
    # Build [df / 2, 1 / 2] along a new trailing axis for the log-Beta term.
    ones_col = array_ops.ones(self.batch_shape_tensor(),
                              dtype=self.dtype)[..., None]
    df_col = ones_col * self.df[..., None]
    beta_arg = array_ops.concat([df_col, ones_col], -1) / 2.

    log_abs_scale = math_ops.log(math_ops.abs(self.scale))
    half_log_df = 0.5 * math_ops.log(self.df)
    log_beta = special_math_ops.lbeta(beta_arg)
    digamma_term = 0.5 * (self.df + 1.) * (
        math_ops.digamma(0.5 * (self.df + 1.))
        - math_ops.digamma(0.5 * self.df))
    return log_abs_scale + half_log_df + log_beta + digamma_term
def _log_abs_determinant(self):
    """Log of the absolute value of the determinant.

    Returns:
      `2 * sum(log(diag(self._chol)))` over the last axis when
      `self._is_spd` is set; otherwise the log of the magnitude of
      `self.determinant()`, using `complex_abs` for complex dtypes.
    """
    if self._is_spd:
        # The determinant is the squared product of the Cholesky diagonal.
        chol_diag = array_ops.matrix_diag_part(self._chol)
        return 2 * math_ops.reduce_sum(math_ops.log(chol_diag),
                                       reduction_indices=[-1])

    magnitude = (math_ops.complex_abs if self.dtype.is_complex
                 else math_ops.abs)
    return math_ops.log(magnitude(self.determinant()))
def _enclosing_power_of_two(value):
    """Return 2**N for integer N such that 2**N >= value.

    Args:
      value: tensor whose enclosing power of two is wanted.

    Returns:
      A tensor of dtype `value.dtype`. When `value` is statically known the
      result is a constant computed exactly; otherwise it is computed in the
      graph as `2**ceil(log(value) / log(2))` in floating point.
    """
    value_static = tensor_util.constant_value(value)
    if value_static is not None:
        # frexp splits value into mantissa * 2**exponent with
        # 0.5 <= mantissa < 1 exactly. The quotient log(v) / log(2) is not
        # exact for powers of two (2**29 yields 29.000000000000004), so
        # taking its ceiling could return twice the correct answer.
        mantissa, exponent = np.frexp(value_static)
        if mantissa > 0:
            exponent = int(exponent)
            if mantissa == 0.5:
                # value is itself an exact power of two: 2**(exponent - 1).
                exponent -= 1
            power = int(2.0 ** exponent)
        else:
            # Non-positive or NaN input: keep the legacy expression so that
            # 0 still maps to 0 and negative input still raises ValueError.
            power = int(2**np.ceil(np.log(value_static) / np.log(2.0)))
        return constant_op.constant(power, value.dtype)
    return math_ops.cast(
        math_ops.pow(2.0, math_ops.ceil(
            math_ops.log(math_ops.to_float(value)) / math_ops.log(2.0))),
        value.dtype)
def composed_sampler(logits, num_samples):
    """Sample class indices by adding `-log(-log(U))` noise and taking argmax.

    Args:
      logits: tensor with a static shape; the inline shape comments describe
        it as [batch size, num classes].
      num_samples: number of samples, appended as a trailing dimension of
        the noise.

    Returns:
      `argmax(logits + noise, axis=1)`.
    """
    # [batch size, num classes, num samples]
    noise_shape = logits.get_shape().concatenate(
        tensor_shape.TensorShape([num_samples]))
    unif = random_ops.random_uniform(noise_shape)
    noise = -math_ops.log(-math_ops.log(unif))
    # [batch size, num classes, 1]
    expanded_logits = array_ops.expand_dims(logits, -1)
    # [batch size, num samples]
    return math_ops.argmax(expanded_logits + noise, axis=1)
def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
             loss_collection=ops.GraphKeys.LOSSES,
             reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
    """Adds a Log Loss term to the training procedure.

    `weights` acts as a coefficient for the loss. If a scalar is provided,
    then the loss is simply scaled by the given value. If `weights` is a
    tensor of size `[batch_size]`, then the total loss for each sample of the
    batch is rescaled by the corresponding element in the `weights` vector.
    If the shape of `weights` matches the shape of `predictions`, then the
    loss of each measurable element of `predictions` is scaled by the
    corresponding value of `weights`.

    Args:
      labels: The ground truth output tensor, same dimensions as
        'predictions'.
      predictions: The predicted outputs.
      weights: Optional `Tensor` whose rank is either 0, or the same rank as
        `labels`, and must be broadcastable to `labels` (i.e., all dimensions
        must be either `1`, or the same as the corresponding `losses`
        dimension).
      epsilon: A small increment to add to avoid taking a log of zero.
      scope: The scope for the operations performed in computing the loss.
      loss_collection: collection to which the loss will be added.
      reduction: Type of reduction to apply to loss.

    Returns:
      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the
      same shape as `labels`; otherwise, it is scalar.

    Raises:
      ValueError: If the shape of `predictions` doesn't match that of
        `labels` or if the shape of `weights` is invalid. Also if `labels`
        or `predictions` is None.

    @compatibility(eager)
    The `loss_collection` argument is ignored when executing eagerly.
    Consider holding on to the return value or collecting losses via a
    `tf.keras.Model`.
    @end_compatibility
    """
    if labels is None:
        raise ValueError("labels must not be None.")
    if predictions is None:
        raise ValueError("predictions must not be None.")
    with ops.name_scope(scope, "log_loss",
                        (predictions, labels, weights)) as scope:
        predictions = math_ops.to_float(predictions)
        labels = math_ops.to_float(labels)
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        # epsilon keeps both logarithms finite at predictions of 0 or 1.
        log_lik_ones = math_ops.multiply(
            labels, math_ops.log(predictions + epsilon))
        log_lik_zeros = math_ops.multiply(
            (1 - labels), math_ops.log(1 - predictions + epsilon))
        losses = -log_lik_ones - log_lik_zeros
        return compute_weighted_loss(
            losses, weights, scope, loss_collection, reduction=reduction)
def poisson(y_true, y_pred):
    """Mean over the last axis of `y_pred - y_true * log(y_pred + eps)`.

    `eps` is `K.epsilon()`, added so the logarithm never sees zero.
    """
    log_pred = math_ops.log(y_pred + K.epsilon())
    per_element = y_pred - y_true * log_pred
    return K.mean(per_element, axis=-1)
def generate_nan(x):
    """Intentionally generates NaNs by taking log of negative number.

    Args:
      x: value cast to float32 and added to the log of a fixed 2x2 matrix
        whose first entry is negative.

    Returns:
      `log([[-1.0, 1.0], [3.0, 5.0]]) + cast(x, float32)`.
    """
    x_as_float = math_ops.cast(x, dtypes.float32)
    log_of_fixed = math_ops.log([[-1.0, 1.0], [3.0, 5.0]])
    return log_of_fixed + x_as_float
def _entropy(self):
    """Entropy computed from `self.concentration` and `self.rate`.

    Returns:
      `concentration - log(rate) + lgamma(concentration)
       + (1 - concentration) * digamma(concentration)`.
    """
    log_rate = math_ops.log(self.rate)
    log_gamma = math_ops.lgamma(self.concentration)
    digamma_term = ((1. - self.concentration)
                    * math_ops.digamma(self.concentration))
    return self.concentration - log_rate + log_gamma + digamma_term
def CompiledFunction(x):
    """Return the element-wise natural logarithm of `x`."""
    log_x = math_ops.log(x)
    return log_x
def log_huber(x, m):
    """Quadratic for `|x| <= m`, logarithmic growth beyond that.

    Args:
      x: input value.
      m: threshold separating the two regimes.

    Returns:
      `x**2` when `abs(x) <= m`, otherwise
      `m**2 * (1 - 2 * log(m) + log(x**2))`.
    """
    within_threshold = math_ops.abs(x) <= m
    if within_threshold:
        loss = x**2
    else:
        loss = m**2 * (1 - 2 * math_ops.log(m) + math_ops.log(x**2))
    return loss
def my_conditional(x):
    """Return `log(x)` if `sum(x) < 0`, otherwise `log(-x)`.

    NOTE(review): both branches take the log of values that tend to be
    negative; this looks deliberate (e.g. to provoke NaNs) - confirm before
    changing.
    """
    sum_is_negative = math_ops.less(math_ops.reduce_sum(x), 0.0)
    if sum_is_negative:
        result = math_ops.log(x)
    else:
        result = math_ops.log(-x)
    return result
def log1p(x):
    """Return `log(1.0 + x)`, computed as an explicit add followed by a log."""
    return math_ops.log(1.0 + x)
def log_zero():
    """Computes `log(0.0)`."""
    zero = constant_op.constant(0.)
    return math_ops.log(zero)
def _entropy(self):
    """Entropy computed from `self.alpha` and `self.beta`.

    Returns:
      `alpha - log(beta) + lgamma(alpha) + (1 - alpha) * digamma(alpha)`.
    """
    log_beta = math_ops.log(self.beta)
    log_gamma_alpha = math_ops.lgamma(self.alpha)
    digamma_term = (1. - self.alpha) * math_ops.digamma(self.alpha)
    return self.alpha - log_beta + log_gamma_alpha + digamma_term
def _log_normalization(self):
    """Log normalizer computed from `self.scale` and `self.df`.

    Returns:
      `log|scale| + 0.5 * log(df) + 0.5 * log(pi) + lgamma(df / 2)
       - lgamma((df + 1) / 2)`.
    """
    log_abs_scale = math_ops.log(math_ops.abs(self.scale))
    half_log_df = 0.5 * math_ops.log(self.df)
    half_log_pi = 0.5 * np.log(np.pi)
    lgamma_half_df = math_ops.lgamma(0.5 * self.df)
    lgamma_half_df_plus_one = math_ops.lgamma(0.5 * (self.df + 1.))
    return (log_abs_scale + half_log_df + half_log_pi + lgamma_half_df
            - lgamma_half_df_plus_one)
def mean_squared_logarithmic_error(y_true, y_pred):
    """Mean over the last axis of the squared difference of `log(1 + v)`.

    Both inputs are clipped from below at `K.epsilon()` before the log.

    Args:
      y_true: target values.
      y_pred: predicted values.

    Returns:
      `mean(square(log(clip(y_pred) + 1) - log(clip(y_true) + 1)), axis=-1)`.
    """
    log_pred = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
    log_true = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
    squared_diff = math_ops.square(log_pred - log_true)
    return K.mean(squared_diff, axis=-1)
def _list_mle_loss(labels, logits, weights=None, lambda_weight=None,
                   reduction=core_losses.Reduction.SUM_BY_NONZERO_WEIGHTS,
                   name=None, seed=None):
    """Computes the ListMLE loss [Xia et al. 2008] for a list.

    Given the labels of graded relevance l_i and the logits s_i, we calculate
    the ListMLE loss for the given list.

    The `lambda_weight` re-weights examples based on l_i and r_i.
    The recommended weighting scheme is the formulation presented in the
    "Position-Aware ListMLE" paper (Lan et. al) and available using
    create_p_list_mle_lambda_weight() factory function above.

    Args:
      labels: A `Tensor` of the same shape as `logits` representing graded
        relevance.
      logits: A `Tensor` with shape [batch_size, list_size]. Each value is
        the ranking score of the corresponding item.
      weights: A scalar, a `Tensor` with shape [batch_size, 1] for list-wise
        weights, or a `Tensor` with shape [batch_size, list_size] for
        item-wise weights.
      lambda_weight: A `DCGLambdaWeight` instance. Only applied when it is a
        `ListMLELambdaWeight`.
      reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
        to reduce training loss over batch.
      name: A string used as the name for this loss.
      seed: A randomization seed used when shuffling ground truth
        permutations.

    Returns:
      An op for the ListMLE loss.
    """
    with ops.name_scope(name, 'list_mle_loss', (labels, logits, weights)):
        valid_mask = utils.is_label_valid(labels)
        # Reset the invalid labels to 0 and reset the invalid logits to a
        # logit with ~= 0 contribution.
        labels = array_ops.where(valid_mask, labels,
                                 array_ops.zeros_like(labels))
        logits = array_ops.where(
            valid_mask, logits,
            math_ops.log(_EPSILON) * array_ops.ones_like(logits))
        if weights is None:
            weights = 1.0
        else:
            weights = ops.convert_to_tensor(weights)
        weights = array_ops.squeeze(weights)

        # Shuffle labels and logits to add randomness to sort.
        shuffled_indices = utils.shuffle_valid_indices(valid_mask, seed)
        shuffled_labels = array_ops.gather_nd(labels, shuffled_indices)
        shuffled_logits = array_ops.gather_nd(logits, shuffled_indices)
        sorted_labels, sorted_logits = utils.sort_by_scores(
            shuffled_labels, [shuffled_labels, shuffled_logits])

        # Subtract the row maximum before exponentiating.
        row_max = math_ops.reduce_max(sorted_logits, axis=1, keepdims=True)
        sorted_logits = sorted_logits - row_max
        # Reverse cumulative sum: entry i holds sum_{j >= i} exp(s_j).
        tail_sums = math_ops.cumsum(math_ops.exp(sorted_logits), axis=1,
                                    reverse=True)
        per_item_nll = math_ops.log(tail_sums) - sorted_logits

        # isinstance() is already False for None.
        if isinstance(lambda_weight, ListMLELambdaWeight):
            per_item_nll *= lambda_weight.individual_weights(sorted_labels)

        negative_log_likelihood = math_ops.reduce_sum(per_item_nll, 1)
        return core_losses.compute_weighted_loss(negative_log_likelihood,
                                                 weights=weights,
                                                 reduction=reduction)
def _log_unnormalized_prob(self, x):
    """Return `(concentration - 1) * log(x) - rate * x`.

    Args:
      x: sample, first passed through `self._maybe_assert_valid_sample`.

    Returns:
      Tensor of unnormalized log-density values.
    """
    x = self._maybe_assert_valid_sample(x)
    log_x_term = (self.concentration - 1.) * math_ops.log(x)
    linear_term = self.rate * x
    return log_x_term - linear_term
def _log_cdf(self, x):
    """Return the log of `self._cdf(x)`."""
    cdf = self._cdf(x)
    return math_ops.log(cdf)
def kullback_leibler_divergence(y_true, y_pred):
    """Sum over the last axis of `y_true * log(y_true / y_pred)`.

    Both inputs are clipped to `[K.epsilon(), 1]` first, which keeps the
    ratio and its logarithm finite.
    """
    clipped_true = K.clip(y_true, K.epsilon(), 1)
    clipped_pred = K.clip(y_pred, K.epsilon(), 1)
    log_ratio = math_ops.log(clipped_true / clipped_pred)
    return math_ops.reduce_sum(clipped_true * log_ratio, axis=-1)
def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled,
                            num_classes, num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            name=None):
    """Helper function for nce_loss and sampled_softmax_loss functions.

    Computes sampled output training logits and labels suitable for
    implementing e.g. noise-contrastive estimation (see nce_loss) or sampled
    softmax (see sampled_softmax_loss).

    Note: In the case where num_true > 1, we assign to each target class
    the target probability 1 / num_true so that the target probabilities
    sum to 1 per-example.

    Args:
      weights: tensor of label embeddings with shape = [num_classes, dim]
      biases: tensor of num_classes label biases
      inputs: tensor with shape = [batch_size, dim] corresponding to forward
        activations of the input network
      labels: int tensor with shape [batch_size, num_true]
      num_sampled: number of label classes to sample per batch
      num_classes: number of possible label classes in the data (e.g. vocab
        size)
      num_true: number of target classes per example (default: 1)
      sampled_values: a tuple of (sampled_candidates, true_expected_count,
        sampled_expected_count) returned by a *CandidateSampler function to
        use (if None, we default to LogUniformCandidateSampler)
      subtract_log_q: subtract the log expected count of the labels in the
        sample to get the logits of the true labels (default: True)
        Turn off for Negative Sampling.
      remove_accidental_hits: whether to remove "accidental hits" where a
        sampled label equals the true labels (bool, default: False)
      name: name for this op

    Returns:
      out_logits, out_labels: tensors with shape
        [batch_size, num_true + num_sampled] for passing to either
        SigmoidCrossEntropyWithLogits (NCE) or
        SoftmaxCrossEntropyWithLogits (sampled softmax).
    """
    with ops.op_scope(
            [weights, biases, inputs, labels], name,
            "compute_sampled_logits"):
        if labels.dtype != types.int64:
            labels = math_ops.cast(labels, types.int64)
        flat_labels = array_ops.reshape(labels, [-1])

        # Sample the negative labels.
        #   sampled shape: num_sampled vector
        #   true_expected_count shape = [batch_size, 1]
        #   sampled_expected_count shape = num_sampled vector
        if sampled_values is None:
            sampled_values = (
                candidate_sampling_ops.log_uniform_candidate_sampler(
                    true_classes=labels,
                    num_true=num_true,
                    num_sampled=num_sampled,
                    unique=True,
                    range_max=num_classes))
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = sampled_values
        # pylint: enable=unpacking-non-sequence

        # weights shape is [num_classes, dim]
        # flat_labels is a [batch_size * num_true] vector
        # true_weights shape is [batch_size * num_true, dim]
        # true_biases is a [batch_size * num_true] vector
        true_weights = embedding_ops.embedding_lookup(weights, flat_labels)
        true_biases = embedding_ops.embedding_lookup(biases, flat_labels)

        # inputs shape is [batch_size, dim]
        # true_weights shape is [batch_size * num_true, dim]
        # products is [batch_size, num_true, dim]
        embed_dim = array_ops.shape(true_weights)[1:2]
        per_example_shape = array_ops.concat(0, [[-1, num_true], embed_dim])
        products = math_ops.mul(
            array_ops.expand_dims(inputs, 1),
            array_ops.reshape(true_weights, per_example_shape))
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        flat_products = array_ops.reshape(
            products, array_ops.concat(0, [[-1], embed_dim]))
        true_logits = array_ops.reshape(_sum_rows(flat_products),
                                        [-1, num_true])
        true_biases = array_ops.reshape(true_biases, [-1, num_true])
        true_logits += true_biases

        # Lookup weights and biases for sampled labels.
        #   sampled is a num_sampled int vector
        #   sampled_weights shape is [num_sampled, dim]
        #   sampled_biases is a num_sampled float vector
        sampled_weights = embedding_ops.embedding_lookup(weights, sampled)
        sampled_biases = embedding_ops.embedding_lookup(biases, sampled)

        # inputs has shape [batch_size, dim]
        # sampled_weights has shape [num_sampled, dim]
        # sampled_biases has shape [num_sampled]
        # Apply X*W'+B, which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(
            inputs, sampled_weights, transpose_b=True) + sampled_biases

        if remove_accidental_hits:
            hit_indices, hit_ids, hit_weights = (
                candidate_sampling_ops.compute_accidental_hits(
                    labels, sampled, num_true=num_true))

            # This is how SparseToDense expects the indices.
            hit_rows = array_ops.reshape(hit_indices, [-1, 1])
            hit_cols = array_ops.reshape(
                math_ops.cast(hit_ids, types.int32), [-1, 1])
            sparse_indices = array_ops.concat(
                1, [hit_rows, hit_cols], "sparse_indices")
            # Create sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(
                0,
                [array_ops.shape(labels)[:1],
                 array_ops.expand_dims(num_sampled, 0)])
            sampled_logits += sparse_ops.sparse_to_dense(
                sparse_indices, sampled_logits_shape, hit_weights, 0.0)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in
            # sampled.
            true_logits -= math_ops.log(true_expected_count)
            sampled_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start
        # at col 0.
        out_logits = array_ops.concat(1, [true_logits, sampled_logits])
        # true_logits is a float tensor, ones_like(true_logits) is a float
        # tensor of ones. We then divide by num_true to ensure the
        # per-example labels sum to 1.0, i.e. form a proper probability
        # distribution.
        true_targets = array_ops.ones_like(true_logits) / num_true
        sampled_targets = array_ops.zeros_like(sampled_logits)
        out_labels = array_ops.concat(1, [true_targets, sampled_targets])

    return out_logits, out_labels
def map_fn(x):
    """Return `log(x**2 + 1)` element-wise."""
    squared_plus_one = math_ops.square(x) + 1
    return math_ops.log(squared_plus_one)
def expectation_importance_sampler(f,
                                   log_p,
                                   sampling_dist_q,
                                   z=None,
                                   n=None,
                                   seed=None,
                                   name='expectation_importance_sampler'):
  r"""Importance-sampling estimate of `E_p[f(Z)] = E_q[f(Z) p(Z) / q(Z)]`.

  With `p(z) := exp{log_p(z)}`, this `Op` returns

  ```
  n^{-1} sum_{i=1}^n [ f(z_i) p(z_i) / q(z_i) ],  z_i ~ q,
  \approx E_q[ f(Z) p(Z) / q(Z) ]
  =       E_p[f(Z)]
  ```

  The averaging is carried out in log-space (through `_logspace_mean`) to
  better handle the often extreme values that `f(z) p(z) / q(z)` can take on.
  Because `f` may be negative, it is split into its positive and negative
  parts, each shifted by one, and the two log-space averages are exponentiated
  and subtracted.

  If `f >= 0`, it is up to 2x more efficient to exponentiate the result of
  `expectation_importance_sampler_logspace` applied to `Log[f]`.

  The caller supplies either a `Tensor` of samples `z`, or the number of
  samples to draw `n`.

  Args:
    f: Callable mapping samples from `sampling_dist_q` to `Tensors` with shape
      broadcastable to `q.batch_shape`.
      For example, `f` works "just like" `q.log_prob`.
    log_p: Callable mapping samples from `sampling_dist_q` to `Tensors` with
      shape broadcastable to `q.batch_shape`.
      For example, `log_p` works "just like" `sampling_dist_q.log_prob`.
    sampling_dist_q: The sampling distribution.
      `tf.contrib.distributions.Distribution`.
      `float64` `dtype` recommended.
      `log_p` and `q` should be supported on the same set.
    z: `Tensor` of samples from `q`, produced by `q.sample_n`.
    n: Integer `Tensor`. Number of samples to generate if `z` is not provided.
    seed: Python integer to seed the random number generator.
    name: A name to give this `Op`.

  Returns:
    The importance sampling estimate. `Tensor` with `shape` equal
      to batch shape of `q`, and `dtype` = `q.dtype`.
  """
  q = sampling_dist_q
  with ops.name_scope(name, values=[z, n]):
    z = _get_samples(q, z, n, seed)

    # Evaluate both log-densities once so the same samples serve both halves.
    log_p_z = log_p(z)
    q_log_prob_z = q.log_prob(z)

    # With f_plus(z) = max(0, f(z)), f_minus(z) = max(0, -f(z)),
    #   E_p[f(Z)] = E_p[f_plus(Z)] - E_p[f_minus(Z)]
    #             = E_p[f_plus(Z) + 1] - E_p[f_minus(Z) + 1]
    # Adding 1 to each part keeps zeros out of the logarithm; the shift
    # cancels in the final difference.
    f_z = f(z)
    log_f_plus_z = math_ops.log(nn.relu(f_z) + 1.)
    log_f_minus_z = math_ops.log(nn.relu(-1. * f_z) + 1.)

    # Log of the importance-weighted mean of each (shifted) part.
    log_plus_mean = _logspace_mean(log_f_plus_z + log_p_z - q_log_prob_z)
    log_minus_mean = _logspace_mean(log_f_minus_z + log_p_z - q_log_prob_z)

    return math_ops.exp(log_plus_mean) - math_ops.exp(log_minus_mean)
def _log_normalization(self):
  """Returns `lgamma(concentration) - concentration * log(rate)`."""
  lgamma_term = math_ops.lgamma(self.concentration)
  scaled_log_rate = self.concentration * math_ops.log(self.rate)
  return lgamma_term - scaled_log_rate
def grad(dy):
  """Returns `log(-dy)`; see the note below on why this yields NaN."""
  # `dy` will come in as 1.0. Taking log of -1.0 leads to NaN.
  negated_dy = -dy
  return math_ops.log(negated_dy)
def get_logits_and_prob(logits=None,
                        p=None,
                        multidimensional=False,
                        validate_args=False,
                        name="GetLogitsAndProb"):
  """Derives probabilities from logits (or logits from probabilities).

  Exactly one of `logits` and `p` must be supplied; the missing one is
  computed from the other and both are returned.

  Args:
    logits: Numeric `Tensor` representing log-odds.
    p: Numeric `Tensor` representing probabilities.
    multidimensional: `Boolean`, default `False`. If `True`, the last
      dimension of `logits` or `p` (a [N1, N2, ... k] dimensional tensor) is
      treated as the logits / probabilities over k classes: `p` is obtained
      with a softmax, `logits` with a plain `log(p)`, and validation checks
      that the last dimension of `p` sums to one. If `False`, `p` is obtained
      with a sigmoid, `logits` with `log(p) - log(1 - p)`, and validation
      checks that every value of `p` is at most one.
    validate_args: `Boolean`, default `False`. When `True` and `p` is given,
      asserts `p` is non-negative plus the `multidimensional`-dependent check
      described above. No validation is applied when `logits` is given.
    name: A name for this operation (optional).

  Returns:
    Tuple with `logits` and `p`. If `p` has an entry that is `0` or `1`, then
    the corresponding entry in the returned logits will be `-Inf` and `Inf`
    respectively.

  Raises:
    ValueError: if neither `p` nor `logits` were passed in, or both were.
  """
  with ops.name_scope(name, values=[p, logits]):
    if p is None and logits is None:
      raise ValueError("Must pass p or logits.")
    if p is not None and logits is not None:
      raise ValueError("Must pass either p or logits, not both.")

    if p is None:
      # Only logits were given: probabilities follow directly.
      logits = array_ops.identity(logits, name="logits")
      with ops.name_scope("p"):
        p = nn.softmax(logits) if multidimensional else math_ops.sigmoid(logits)
      return (logits, p)

    # Only probabilities were given: optionally validate, then take logs.
    with ops.name_scope("p"):
      p = array_ops.identity(p)
      if validate_args:
        one = constant_op.constant(1., p.dtype)
        non_negative_check = check_ops.assert_non_negative(p)
        if multidimensional:
          upper_check = assert_close(
              math_ops.reduce_sum(p, reduction_indices=[-1]),
              one,
              message="p does not sum to 1.")
        else:
          upper_check = check_ops.assert_less_equal(
              p, one, message="p has components greater than 1.")
        p = control_flow_ops.with_dependencies(
            [non_negative_check, upper_check], p)

    with ops.name_scope("logits"):
      if multidimensional:
        # Here we don't compute the multidimensional case, in a manner
        # consistent with respect to the unidimensional case. We do so
        # following the TF convention. Typically, you might expect to see
        # logits = log(p) - log(gather(p, pivot)). A side-effect of being
        # consistent with the TF approach is that the unidimensional case
        # implicitly handles the second dimension but the multidimensional
        # case explicitly keeps the pivot dimension.
        logits = math_ops.log(p)
      else:
        logits = math_ops.log(p) - math_ops.log(1. - p)
    return (logits, p)
def _entropy(self):
  """Returns the log of `self.range()`."""
  support_width = self.range()
  return math_ops.log(support_width)
def _logcosh(x):
  """Computes `log(cosh(x))` as `x + softplus(-2x) - log(2)`.

  Args:
    x: Floating-point tensor (or value convertible to one).

  Returns:
    A tensor holding `x + softplus(-2 * x) - log(2)`.
  """
  softplus_term = nn.softplus(-2. * x)
  # `math_ops.log(2.)` is built from a Python float and therefore carries the
  # default float dtype; cast it to the dtype actually in use so non-default
  # inputs (e.g. float64) do not hit a dtype mismatch in the subtraction.
  log_two = math_ops.cast(math_ops.log(2.), softplus_term.dtype)
  return x + softplus_term - log_two
def __init__(self,
             mix_loc,
             temperature,
             distribution,
             loc=None,
             scale=None,
             quadrature_size=8,
             quadrature_fn=quadrature_scheme_softmaxnormal_quantiles,
             validate_args=False,
             allow_nan_stats=True,
             name="VectorDiffeomixture"):
  """Constructs the VectorDiffeomixture on `R^d`.

  The vector diffeomixture (VDM) approximates the compound distribution

  ```none
  p(x) = int p(x | z) p(z) dz,
  where z is in the K-simplex, and
  p(x | z) := p(x | loc=sum_k z[k] loc[k], scale=sum_k z[k] scale[k])
  ```

  Args:
    mix_loc: `float`-like `Tensor` with shape `[b1, ..., bB, K-1]`.
      In terms of samples, larger `mix_loc[..., k]` ==>
      `Z` is more likely to put more weight on its `kth` component.
    temperature: `float`-like `Tensor`. Broadcastable with `mix_loc`.
      In terms of samples, smaller `temperature` means one component is more
      likely to dominate.  I.e., smaller `temperature` makes the VDM look more
      like a standard mixture of `K` components.
    distribution: `tf.Distribution`-like instance. Distribution from which `d`
      iid samples are used as input to the selected affine transformation.
      Must be a scalar-batch, scalar-event distribution.  Typically
      `distribution.reparameterization_type = FULLY_REPARAMETERIZED` or it is
      a function of non-trainable parameters. WARNING: If you backprop through
      a VectorDiffeomixture sample and the `distribution` is not
      `FULLY_REPARAMETERIZED` yet is a function of trainable variables, then
      the gradient will be incorrect!
    loc: Length-`K` list of `float`-type `Tensor`s. The `k`-th element
      represents the `shift` used for the `k`-th affine transformation.  If
      the `k`-th item is `None`, `loc` is implicitly `0`.  When specified,
      must have shape `[B1, ..., Bb, d]` where `b >= 0` and `d` is the event
      size. If `loc` itself is `None`, every component's shift is `None`.
    scale: Length-`K` list of `LinearOperator`s. Each should be
      positive-definite and operate on a `d`-dimensional vector space. The
      `k`-th element represents the `scale` used for the `k`-th affine
      transformation. `LinearOperator`s must have shape `[B1, ..., Bb, d, d]`,
      `b >= 0`, i.e., characterizes `b`-batches of `d x d` matrices.
    quadrature_size: Python `int` scalar representing number of quadrature
      points.  Larger `quadrature_size` means `q_N(x)` better approximates
      `p(x)`.
    quadrature_fn: Python callable taking `normal_loc`, `normal_scale`,
      `quadrature_size`, `validate_args` and returning `tuple(grid, probs)`
      representing the SoftmaxNormal grid and corresponding normalized weight.
      Default value: `quadrature_scheme_softmaxnormal_quantiles`.
    validate_args: Python `bool`, default `False`. When `True` distribution
      parameters are checked for validity despite possibly degrading runtime
      performance. When `False` invalid inputs may silently render incorrect
      outputs.
    allow_nan_stats: Python `bool`, default `True`. When `True`,
      statistics (e.g., mean, mode, variance) use the value "`NaN`" to
      indicate the result is undefined. When `False`, an exception is raised
      if one or more of the statistic's batch members are undefined.
    name: Python `str` name prefixed to Ops created by this class.

  Raises:
    ValueError: if `not scale or len(scale) < 2`.
    ValueError: if `len(loc) != len(scale)`
    ValueError: if `validate_args` and any not scale.is_positive_definite.
    TypeError: if any scale.dtype != scale[0].dtype.
    TypeError: if any loc.dtype != scale[0].dtype.
    NotImplementedError: if `len(scale) != 2`.
    ValueError: if `not distribution.is_scalar_batch`.
    ValueError: if `not distribution.is_scalar_event`.
  """
  # Captured as the very first statement so it holds only the constructor
  # arguments; it is forwarded to the base class as `parameters`.
  parameters = locals()
  with ops.name_scope(name, values=[mix_loc, temperature]):
    if not scale or len(scale) < 2:
      raise ValueError("Must specify list (or list-like object) of scale "
                       "LinearOperators, one for each component with "
                       "num_component >= 2.")

    # A missing `loc` means "no shift" for every component.
    if loc is None:
      loc = [None]*len(scale)

    if len(loc) != len(scale):
      raise ValueError("loc/scale must be same-length lists "
                       "(or same-length list-like objects).")

    # The first scale operator's dtype is the reference dtype for everything.
    dtype = scale[0].dtype.base_dtype

    # Convert each provided shift to a tensor of `dtype`; `None` stays `None`.
    loc = [ops.convert_to_tensor(loc_, dtype=dtype, name="loc{}".format(k))
           if loc_ is not None else None
           for k, loc_ in enumerate(loc)]

    for k, scale_ in enumerate(scale):
      if validate_args and not scale_.is_positive_definite:
        raise ValueError("scale[{}].is_positive_definite = {} != True".format(
            k, scale_.is_positive_definite))
      if scale_.dtype.base_dtype != dtype:
        raise TypeError(
            "dtype mismatch; scale[{}].base_dtype=\"{}\" != \"{}\"".format(
                k, scale_.dtype.base_dtype.name, dtype.name))

    # One affine transformation per (loc, scale) endpoint.
    self._endpoint_affine = [
        AffineLinearOperator(shift=loc_,
                             scale=scale_,
                             event_ndims=1,
                             validate_args=validate_args,
                             name="endpoint_affine_{}".format(k))
        for k, (loc_, scale_) in enumerate(zip(loc, scale))]

    # TODO(jvdillon): Remove once we support k-mixtures.
    # We make this assertion here because otherwise `grid` would need to be a
    # vector not a scalar.
    if len(scale) != 2:
      raise NotImplementedError("Currently only bimixtures are supported; "
                                "len(scale)={} is not 2.".format(len(scale)))

    mix_loc = ops.convert_to_tensor(
        mix_loc, dtype=dtype, name="mix_loc")
    temperature = ops.convert_to_tensor(
        temperature, dtype=dtype, name="temperature")
    # `quadrature_fn` is called with location `mix_loc / temperature` and
    # scale `1. / temperature`; it yields the grid and its weights.
    self._grid, probs = tuple(quadrature_fn(
        mix_loc / temperature,
        1. / temperature,
        quadrature_size,
        validate_args))

    # Note: by creating the logits as `log(prob)` we ensure that
    # `self.mixture_distribution.logits` is equivalent to
    # `math_ops.log(self.mixture_distribution.probs)`.
    self._mixture_distribution = categorical_lib.Categorical(
        logits=math_ops.log(probs),
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats)

    asserts = distribution_util.maybe_check_scalar_distribution(
        distribution, dtype, validate_args)
    # Attach any returned assertions to `self._grid` as control dependencies
    # so they run whenever the grid is used.
    if asserts:
      self._grid = control_flow_ops.with_dependencies(
          asserts, self._grid)
    self._distribution = distribution

    # Affine transformations built from the loc/scale interpolated on the grid.
    self._interpolated_affine = [
        AffineLinearOperator(shift=loc_,
                             scale=scale_,
                             event_ndims=1,
                             validate_args=validate_args,
                             name="interpolated_affine_{}".format(k))
        for k, (loc_, scale_) in enumerate(zip(
            interpolate_loc(self._grid, loc),
            interpolate_scale(self._grid, scale)))]

    [
        self._batch_shape_,
        self._batch_shape_tensor_,
        self._event_shape_,
        self._event_shape_tensor_,
    ] = determine_batch_event_shapes(self._grid, self._endpoint_affine)

    super(VectorDiffeomixture, self).__init__(
        dtype=dtype,
        # We hard-code `FULLY_REPARAMETERIZED` because when
        # `validate_args=True` we verify that indeed
        # `distribution.reparameterization_type == FULLY_REPARAMETERIZED`. A
        # distribution which is a function of only non-trainable parameters
        # also implies we can use `FULLY_REPARAMETERIZED`. However, we cannot
        # easily test for that possibility thus we use `validate_args=False`
        # as a "back-door" to allow users a way to use non
        # `FULLY_REPARAMETERIZED` distribution. In such cases IT IS THE USERS
        # RESPONSIBILITY to verify that the base distribution is a function of
        # non-trainable parameters.
        reparameterization_type=distribution_lib.FULLY_REPARAMETERIZED,
        validate_args=validate_args,
        allow_nan_stats=allow_nan_stats,
        parameters=parameters,
        graph_parents=(
            distribution._graph_parents  # pylint: disable=protected-access
            + [loc_ for loc_ in loc if loc_ is not None]
            + [p for scale_ in scale for p in scale_.graph_parents]),
        name=name)
def logloss(y_true, y_pred):
  """Binary log loss averaged over the last axis.

  Computes `-(y_true * log(y_pred + eps) + (1 - y_true) * log(1 - y_pred +
  eps))` with `eps = K.epsilon()` and takes the mean over the last axis.

  Args:
    y_true: Ground-truth values; cast to the dtype of `y_pred`.
    y_pred: Predicted values; converted to a tensor.

  Returns:
    The mean of the negated per-element log-likelihood along `axis=-1`.
  """
  # Normalise inputs the same way the sibling losses do, so non-tensor
  # predictions and integer labels do not trip a dtype mismatch in `multiply`.
  y_pred = ops.convert_to_tensor(y_pred)
  y_true = math_ops.cast(y_true, y_pred.dtype)
  losses = math_ops.multiply(y_true, math_ops.log(y_pred + K.epsilon()))
  losses += math_ops.multiply((1 - y_true),
                              math_ops.log(1 - y_pred + K.epsilon()))
  return K.mean(-losses, axis=-1)
def mean_squared_logarithmic_error(y_true, y_pred):
  """Mean over the last axis of the squared difference of shifted logs.

  Each input is clipped from below at `K.epsilon()`, shifted by one and passed
  through `log`; the squared difference of the two results is averaged along
  `axis=-1`.

  Args:
    y_true: Ground-truth values; cast to the dtype of `y_pred`.
    y_pred: Predicted values; converted to a tensor.

  Returns:
    The mean squared difference along the last axis.
  """
  y_pred = ops.convert_to_tensor(y_pred)
  y_true = math_ops.cast(y_true, y_pred.dtype)

  def _log_of_clipped_plus_one(values):
    # Clip from below only (upper bound `None`) before the `+ 1.` shift.
    return math_ops.log(K.clip(values, K.epsilon(), None) + 1.)

  squared_gap = math_ops.squared_difference(
      _log_of_clipped_plus_one(y_pred), _log_of_clipped_plus_one(y_true))
  return K.mean(squared_gap, axis=-1)
def kernel(target_log_prob_fn,
           current_state,
           step_size,
           num_leapfrog_steps,
           seed=None,
           current_target_log_prob=None,
           current_grads_target_log_prob=None,
           name=None):
  """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes that all
  leftmost dimensions of `current_state` index independent chain states (and
  are therefore updated independently). The output of `target_log_prob_fn()`
  should sum log-probabilities across all event dimensions. Slices along the
  rightmost dimensions may have different target distributions; for example,
  `current_state[0, :]` could have a different target distribution from
  `current_state[1, :]`. This is up to `target_log_prob_fn()`. (The number of
  independent chains is `tf.size(target_log_prob_fn(*current_state))`.)

  #### Examples:

  ##### Simple chain with warm-up.

  ```python
  tfd = tf.contrib.distributions

  # Tuning acceptance rates:
  dtype = np.float32
  target_accept_rate = 0.631
  num_warmup_iter = 500
  num_chain_iter = 500

  x = tf.get_variable(name="x", initializer=dtype(1))
  step_size = tf.get_variable(name="step_size", initializer=dtype(1))

  target = tfd.Normal(loc=dtype(0), scale=dtype(1))

  next_x, other_results = hmc.kernel(
      target_log_prob_fn=target.log_prob,
      current_state=x,
      step_size=step_size,
      num_leapfrog_steps=3)

  x_update = x.assign(next_x)

  step_size_update = step_size.assign_add(
      step_size * tf.where(
          tf.exp(tf.minimum(other_results.log_accept_ratio, 0.)) >
              target_accept_rate,
          0.01, -0.01))

  warmup = tf.group([x_update, step_size_update])

  tf.global_variables_initializer().run()

  sess.graph.finalize()  # No more graph building.

  # Warm up the sampler and adapt the step size
  for _ in xrange(num_warmup_iter):
    sess.run(warmup)

  # Collect samples without adapting step size
  samples = np.zeros([num_chain_iter])
  for i in xrange(num_chain_iter):
    _, x_, target_log_prob_, grad_ = sess.run([
        x_update,
        x,
        other_results.target_log_prob,
        other_results.grads_target_log_prob])
    samples[i] = x_

  print(samples.mean(), samples.std())
  ```

  ##### Sample from more complicated posterior.

  I.e.,

  ```none
    W ~ MVN(loc=0, scale=sigma * eye(dims))
    for i=1...num_samples:
        X[i] ~ MVN(loc=0, scale=eye(dims))
      eps[i] ~ Normal(loc=0, scale=1)
        Y[i] = X[i].T * W + eps[i]
  ```

  ```python
  tfd = tf.contrib.distributions

  def make_training_data(num_samples, dims, sigma):
    dt = np.asarray(sigma).dtype
    zeros = tf.zeros(dims, dtype=dt)
    x = tfd.MultivariateNormalDiag(
        loc=zeros).sample(num_samples, seed=1)
    w = tfd.MultivariateNormalDiag(
        loc=zeros,
        scale_identity_multiplier=sigma).sample(seed=2)
    noise = tfd.Normal(
        loc=dt(0),
        scale=dt(1)).sample(num_samples, seed=3)
    y = tf.tensordot(x, w, axes=[[1], [0]]) + noise
    return y, x, w

  def make_prior(sigma, dims):
    # p(w | sigma)
    return tfd.MultivariateNormalDiag(
        loc=tf.zeros([dims], dtype=sigma.dtype),
        scale_identity_multiplier=sigma)

  def make_likelihood(x, w):
    # p(y | x, w)
    return tfd.MultivariateNormalDiag(
        loc=tf.tensordot(x, w, axes=[[1], [0]]))

  # Setup assumptions.
  dtype = np.float32
  num_samples = 150
  dims = 10
  num_iters = int(5e3)

  true_sigma = dtype(0.5)
  y, x, true_weights = make_training_data(num_samples, dims, true_sigma)

  # Estimate of `log(true_sigma)`.
  log_sigma = tf.get_variable(name="log_sigma", initializer=dtype(0))
  sigma = tf.exp(log_sigma)

  # State of the Markov chain.
  weights = tf.get_variable(
      name="weights",
      initializer=np.random.randn(dims).astype(dtype))

  prior = make_prior(sigma, dims)

  def joint_log_prob_fn(w):
    # f(w) = log p(w, y | x)
    return prior.log_prob(w) + make_likelihood(x, w).log_prob(y)

  weights_update = weights.assign(
      hmc.kernel(target_log_prob_fn=joint_log_prob_fn,
                 current_state=weights,
                 step_size=0.1,
                 num_leapfrog_steps=5)[0])

  with tf.control_dependencies([weights_update]):
    loss = -prior.log_prob(weights)

  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  log_sigma_update = optimizer.minimize(loss, var_list=[log_sigma])

  sess.graph.finalize()  # No more graph building.

  tf.global_variables_initializer().run()

  sigma_history = np.zeros(num_iters, dtype)
  weights_history = np.zeros([num_iters, dims], dtype)

  for i in xrange(num_iters):
    _, sigma_, weights_ = sess.run([log_sigma_update, sigma, weights])
    weights_history[i, :] = weights_
    sigma_history[i] = sigma_

  true_weights_ = sess.run(true_weights)

  # Should converge to something close to true_sigma.
  plt.plot(sigma_history);
  plt.ylabel("sigma");
  plt.xlabel("iteration");
  ```

  Args:
    target_log_prob_fn: Python callable which takes an argument like
      `current_state` (or `*current_state` if it's a list) and returns its
      (possibly unnormalized) log-density under the target distribution.
    current_state: `Tensor` or Python `list` of `Tensor`s representing the
      current state(s) of the Markov chain(s). The first `r` dimensions index
      independent chains, `r = tf.rank(target_log_prob_fn(*current_state))`.
    step_size: `Tensor` or Python `list` of `Tensor`s representing the step
      size for the leapfrog integrator. Must broadcast with the shape of
      `current_state`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely. When
      possible, it's often helpful to match per-variable step sizes to the
      standard deviations of the target distribution in each variable.
    num_leapfrog_steps: Integer number of steps to run the leapfrog integrator
      for. Total progress per HMC step is roughly proportional to
      `step_size * num_leapfrog_steps`.
    seed: Python integer to seed the random number generator.
    current_target_log_prob: (Optional) `Tensor` representing the value of
      `target_log_prob_fn` at the `current_state`. The only reason to
      specify this argument is to reduce TF graph size.
      Default value: `None` (i.e., compute as needed).
    current_grads_target_log_prob: (Optional) Python list of `Tensor`s
      representing gradient of `current_target_log_prob` at the
      `current_state` and wrt the `current_state`. Must have same shape as
      `current_state`. The only reason to specify this argument is to reduce
      TF graph size.
      Default value: `None` (i.e., compute as needed).
    name: Python `str` name prefixed to Ops created by this function.
      Default value: `None` (i.e., "hmc_kernel").

  Returns:
    A two-element Python `list` `[next_state, kernel_results]`:
    next_state: Tensor or Python list of `Tensor`s representing the state(s)
      of the Markov chain(s) at each result step. Has same shape as
      `current_state`.
    kernel_results: `collections.namedtuple` of internal calculations used to
      advance the chain.

  Raises:
    ValueError: if there isn't one `step_size` or a list with same length as
      `current_state`.
  """
  with ops.name_scope(name, "hmc_kernel", [
      current_state, step_size, num_leapfrog_steps, seed,
      current_target_log_prob, current_grads_target_log_prob
  ]):
    with ops.name_scope("initialize"):
      [
          current_state_parts, step_sizes, current_target_log_prob,
          current_grads_target_log_prob
      ] = _prepare_args(target_log_prob_fn,
                        current_state,
                        step_size,
                        current_target_log_prob,
                        current_grads_target_log_prob,
                        maybe_expand=True)
      independent_chain_ndims = distributions_util.prefer_static_rank(
          current_target_log_prob)
      # Draw one standard-normal momentum per state part, deriving a new seed
      # after each draw so the parts do not share a seed.
      current_momentums = []
      for s in current_state_parts:
        current_momentums.append(
            random_ops.random_normal(shape=array_ops.shape(s),
                                     dtype=s.dtype.base_dtype,
                                     seed=seed))
        seed = distributions_util.gen_new_seed(
            seed, salt="hmc_kernel_momentums")

      num_leapfrog_steps = ops.convert_to_tensor(
          num_leapfrog_steps,
          dtype=dtypes.int32,
          name="num_leapfrog_steps")

    [
        proposed_momentums,
        proposed_state_parts,
        proposed_target_log_prob,
        proposed_grads_target_log_prob,
    ] = _leapfrog_integrator(current_momentums,
                             target_log_prob_fn,
                             current_state_parts,
                             step_sizes,
                             num_leapfrog_steps,
                             current_target_log_prob,
                             current_grads_target_log_prob)

    energy_change = _compute_energy_change(current_target_log_prob,
                                           current_momentums,
                                           proposed_target_log_prob,
                                           proposed_momentums,
                                           independent_chain_ndims)
    log_accept_ratio = -energy_change

    # u < exp(log_accept_ratio),  where u~Uniform[0,1)
    # ==> log(u) < log_accept_ratio
    random_value = random_ops.random_uniform(
        shape=array_ops.shape(energy_change),
        dtype=energy_change.dtype,
        seed=seed)
    random_negative = math_ops.log(random_value)
    is_accepted = random_negative < log_accept_ratio

    # Where the proposal is accepted keep the proposed quantities, otherwise
    # fall back to the current ones.
    accepted_target_log_prob = array_ops.where(is_accepted,
                                               proposed_target_log_prob,
                                               current_target_log_prob)

    next_state_parts = [
        _choose(is_accepted,
                proposed_state_part,
                current_state_part,
                independent_chain_ndims)
        for current_state_part, proposed_state_part in zip(
            current_state_parts, proposed_state_parts)
    ]

    accepted_grads_target_log_prob = [
        _choose(is_accepted,
                proposed_grad,
                grad,
                independent_chain_ndims)
        for proposed_grad, grad in zip(proposed_grads_target_log_prob,
                                       current_grads_target_log_prob)
    ]

    # Return a list when the caller passed a list-like state, else the single
    # element.
    maybe_flatten = lambda x: x if _is_list_like(current_state) else x[0]
    return [
        maybe_flatten(next_state_parts),
        KernelResults(
            log_accept_ratio=log_accept_ratio,
            current_grads_target_log_prob=accepted_grads_target_log_prob,
            current_target_log_prob=accepted_target_log_prob,
            is_accepted=is_accepted,
            proposed_grads_target_log_prob=proposed_grads_target_log_prob,
            proposed_state=maybe_flatten(proposed_state_parts),
            proposed_target_log_prob=proposed_target_log_prob,
        ),
    ]
def _log_sum_sq(x, axis=None):
  """Computes log(sum(x**2)) as a log-sum-exp of `2 * log(|x|)` over `axis`."""
  log_abs_x = math_ops.log(math_ops.abs(x))
  return math_ops.reduce_logsumexp(2. * log_abs_x, axis)
def Forward(x):
  """Applies `math_ops.log` to `x` and returns the result."""
  log_x = math_ops.log(x)
  return log_x
def sample_from_datasets(datasets, weights=None, seed=None):
  """Samples elements at random from the datasets in `datasets`.

  Args:
    datasets: A non-empty list of `tf.data.Dataset` objects with compatible
      structure.
    weights: (Optional.) A list of `len(datasets)` floating-point values where
      `weights[i]` represents the probability with which an element should be
      sampled from `datasets[i]`, or a `tf.data.Dataset` object where each
      element is such a list. Defaults to a uniform distribution across
      `datasets`.
    seed: (Optional.) A `tf.int64` scalar `tf.Tensor`, representing the
      random seed that will be used to create the distribution. See
      `tf.set_random_seed` for behavior.

  Returns:
    A dataset that interleaves elements from `datasets` at random, according to
    `weights` if provided, otherwise with uniform probability. When `weights`
    is not a `Dataset` and there is exactly one input, that input is returned
    unchanged.

  Raises:
    TypeError: If `weights` is not convertible to a `tf.float32` or
      `tf.float64` tensor.
    ValueError: If `datasets` is empty, or if the `weights` argument is
      specified and does not match the length of the `datasets` element.
  """
  num_datasets = len(datasets)
  # Fail fast: with no inputs there is nothing to sample from, and letting the
  # call proceed only defers the failure to a less obvious place.
  if num_datasets == 0:
    raise ValueError(
        "`datasets` must be a non-empty list of `tf.data.Dataset` objects.")

  if not isinstance(weights, dataset_ops.Dataset):
    if weights is None:
      # Select inputs with uniform probability.
      logits = [[1.0] * num_datasets]

    else:
      # Use the given `weights` as the probability of choosing the respective
      # input.
      weights = ops.convert_to_tensor(weights, name="weights")
      if weights.dtype not in (dtypes.float32, dtypes.float64):
        raise TypeError("`weights` must be convertible to a tensor of "
                        "`tf.float32` or `tf.float64` elements.")
      if not weights.shape.is_compatible_with([num_datasets]):
        raise ValueError(
            "`weights` must be a vector of length `len(datasets)`.")

      # The `stateless_multinomial()` op expects log-probabilities, as opposed
      # to weights.
      logits = array_ops.expand_dims(
          math_ops.log(weights, name="logits"), 0)

    # NOTE(mrry): We only specialize when `weights` is not a `Dataset`. When it
    # is a `Dataset`, it is possible that evaluating it has a side effect the
    # user depends on.
    if num_datasets == 1:
      return datasets[0]

    def select_dataset_constant_logits(seed):
      return array_ops.squeeze(
          stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])

    # Each element of the batched random dataset is a pair of values used as
    # the stateless seed for one draw.
    selector_input = dataset_ops.MapDataset(
        random_ops.RandomDataset(seed).batch(2),
        select_dataset_constant_logits,
        use_inter_op_parallelism=False)

  else:
    # Use each element of the given `weights` dataset as the probability of
    # choosing the respective input.

    # The `stateless_multinomial()` op expects log-probabilities, as opposed to
    # weights.
    logits_ds = weights.map(lambda *p: math_ops.log(p, name="logits"))

    def select_dataset_varying_logits(logits, seed):
      return array_ops.squeeze(
          stateless.stateless_multinomial(logits, 1, seed=seed), axis=[0, 1])

    logits_and_seeds = dataset_ops.Dataset.zip(
        (logits_ds, random_ops.RandomDataset(seed).batch(2)))
    selector_input = dataset_ops.MapDataset(
        logits_and_seeds,
        select_dataset_varying_logits,
        use_inter_op_parallelism=False)

  return _DirectedInterleaveDataset(selector_input, datasets)