def log_jac_det(self, value, *inputs):
    """Sum the entries of ``value`` selected by ``self.diag_idxs`` over the last axis.

    ``inputs`` is accepted for interface compatibility and is not used.
    """
    selected_entries = value[..., self.diag_idxs]
    return at.sum(selected_entries, axis=-1)
def logdet(self):
    """Return ``sum(log(self.scale))`` repeated ``self.z0.shape[0]`` times."""
    total_log_scale = at.sum(at.log(self.scale))
    n_repeats = self.z0.shape[0]
    return at.repeat(total_log_scale, n_repeats)
def log_jac_det(self, value, *inputs):
    """Return zeros shaped like ``value`` with the last axis summed away.

    ``inputs`` is accepted for interface compatibility and is not used.
    """
    zero_terms = at.zeros(value.shape)
    return at.sum(zero_terms, axis=-1)
def MvNormalLogp():
    """Build an op computing the log pdf of a multivariate normal distribution.

    This should be used in MvNormal.logp once Theano#5908 is released.

    The function itself takes no arguments. It returns an ``OpFromGraph``
    with a custom gradient (``grad_overrides=dlogp``) that is inlined into
    the calling graph (``inline=True``). The returned op takes these inputs:

    cov: aet.matrix
        The covariance matrix.
    delta: aet.matrix
        Array of deviations from the mean.

    The op's output is ``-inf`` whenever some diagonal entry of the Cholesky
    factor of ``cov`` is not strictly positive.
    """
    # Symbolic inputs of the op; test values are attached to both.
    cov = aet.matrix("cov")
    cov.tag.test_value = floatX(np.eye(3))
    delta = aet.matrix("delta")
    delta.tag.test_value = floatX(np.zeros((2, 3)))

    solve_lower = Solve(A_structure="lower_triangular")
    solve_upper = Solve(A_structure="upper_triangular")
    # on_error="nan": a failed factorisation is detected through `ok` below.
    cholesky = Cholesky(lower=True, on_error="nan")

    n, k = delta.shape
    # NOTE(review): `f` is defined outside this block; presumably a float
    # cast such as `floatX` -- confirm against the module.
    n, k = f(n), f(k)
    chol_cov = cholesky(cov)
    diag = aet.diag(chol_cov)
    ok = aet.all(diag > 0)

    # Replace an unusable factor by a matrix of ones so the solve below stays
    # well defined; the final result is masked with `ok` anyway.
    chol_cov = aet.switch(ok, chol_cov, aet.fill(chol_cov, 1))
    delta_trans = solve_lower(chol_cov, delta.T).T

    # result = -0.5 * (n*k*log(2*pi) + 2*n*sum(log(diag)) + sum(delta_trans**2))
    result = n * k * aet.log(f(2) * np.pi)
    result += f(2) * n * aet.sum(aet.log(diag))
    result += (delta_trans ** f(2)).sum()
    result = f(-0.5) * result
    logp = aet.switch(ok, result, -np.inf)

    def dlogp(inputs, gradients):
        """Gradient override: return the gradients w.r.t. ``cov`` and ``delta``."""
        (g_logp,) = gradients
        cov, delta = inputs

        g_logp.tag.test_value = floatX(1.0)
        n, k = delta.shape

        # Same factorisation and validity mask as in the forward graph.
        chol_cov = cholesky(cov)
        diag = aet.diag(chol_cov)
        ok = aet.all(diag > 0)

        chol_cov = aet.switch(ok, chol_cov, aet.fill(chol_cov, 1))
        delta_trans = solve_lower(chol_cov, delta.T).T

        inner = n * aet.eye(k) - aet.dot(delta_trans.T, delta_trans)
        g_cov = solve_upper(chol_cov.T, inner)
        g_cov = solve_upper(chol_cov.T, g_cov.T)

        tau_delta = solve_upper(chol_cov.T, delta_trans.T)
        g_delta = tau_delta.T

        # Gradients are NaN when the factorisation was not usable.
        g_cov = aet.switch(ok, g_cov, -np.nan)
        g_delta = aet.switch(ok, g_delta, -np.nan)

        return [-0.5 * g_cov * g_logp, -g_delta * g_logp]

    return OpFromGraph([cov, delta], [logp], grad_overrides=dlogp, inline=True)
def logsumexp(x, axis=None, keepdims=True):
    """Compute ``log(sum(exp(x)))`` over ``axis`` with a max-shift for stability.

    Parameters
    ----------
    x
        Tensor to reduce.
    axis
        ``None`` (reduce over all axes), an int, or a list/tuple of ints.
    keepdims
        If true (the default) the reduced axes are kept with length one.
        If false only the reduced axes are removed; other length-one
        dimensions of ``x`` are preserved.

    Returns
    -------
    The reduced tensor.
    """
    # Adapted from https://github.com/Theano/Theano/issues/1563
    x_max = at.max(x, axis=axis, keepdims=True)
    # Shift by 0 instead of an infinite maximum so that ``x - x_max`` does not
    # evaluate ``inf - inf``.
    x_max = at.switch(at.isinf(x_max), 0, x_max)
    res = at.log(at.sum(at.exp(x - x_max), axis=axis, keepdims=True)) + x_max
    if keepdims:
        return res
    if axis is None:
        # Every axis was reduced, so every dimension is a length-one leftover.
        return res.squeeze()
    # Drop only the axes that were reduced. A blanket ``squeeze()`` would also
    # remove unrelated length-one dimensions that belong to the input.
    axes = axis if isinstance(axis, (list, tuple)) else (axis,)
    reduced = {int(a) % res.ndim for a in axes}
    return res.dimshuffle([i for i in range(res.ndim) if i not in reduced])
def total_norm_constraint(tensor_vars, max_norm, epsilon=1e-7, return_norm=False):
    """Rescale a list of tensors according to their combined norm.

    When the combined norm of the inputs exceeds ``max_norm`` all tensors are
    scaled by a common factor so that the combined norm equals the threshold.
    Scaling gradient norms this way is often used when training recurrent
    neural networks [1]_.

    Parameters
    ----------
    tensor_vars: List of TensorVariables.
        Tensors to be rescaled.
    max_norm: float
        Threshold value for total norm.
    epsilon: scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.
    return_norm: bool
        If true the total norm is also returned.

    Returns
    -------
    tensor_vars_scaled: list of TensorVariables
        The scaled tensor variables.
    norm: Aesara scalar
        The combined norm of the input variables prior to rescaling,
        only returned if ``return_norm=True``.

    Examples
    --------
    >>> from lasagne.layers import InputLayer, DenseLayer
    >>> import lasagne
    >>> from lasagne.updates import sgd, total_norm_constraint
    >>> x = at.matrix()
    >>> y = at.ivector()
    >>> l_in = InputLayer((5, 10))
    >>> l1 = DenseLayer(l_in, num_units=7, nonlinearity=at.nnet.softmax)
    >>> output = lasagne.layers.get_output(l1, x)
    >>> cost = at.mean(at.nnet.categorical_crossentropy(output, y))
    >>> all_params = lasagne.layers.get_all_params(l1)
    >>> all_grads = at.grad(cost, all_params)
    >>> scaled_grads = total_norm_constraint(all_grads, 5)
    >>> updates = sgd(scaled_grads, all_params, learning_rate=0.1)

    Notes
    -----
    The total norm can be used to monitor training.

    References
    ----------
    .. [1] Sutskever, I., Vinyals, O., & Le, Q. V. (2014): Sequence to sequence
       learning with neural networks. In Advances in Neural Information
       Processing Systems (pp. 3104-3112).
    """
    squared_sums = (at.sum(tensor**2) for tensor in tensor_vars)
    norm = at.sqrt(sum(squared_sums))

    float_type = np.dtype(aesara.config.floatX).type
    # Clipping leaves norms below the threshold untouched, so the factor is
    # close to one for them (up to ``epsilon``).
    clipped_norm = at.clip(norm, 0, float_type(max_norm))
    scale = clipped_norm / (float_type(epsilon) + norm)
    tensor_vars_scaled = [tensor * scale for tensor in tensor_vars]

    if not return_norm:
        return tensor_vars_scaled
    return tensor_vars_scaled, norm
def norm_constraint(tensor_var, max_norm, norm_axes=None, epsilon=1e-7):
    """Max weight norm constraints and gradient clipping

    This takes a TensorVariable and rescales it so that incoming weight
    norms are below a specified constraint value. Vectors violating the
    constraint are rescaled so that they are within the allowed range.

    Parameters
    ----------
    tensor_var: TensorVariable
        Aesara expression for update, gradient, or other quantity.
    max_norm: scalar
        This value sets the maximum allowed value of any norm in
        `tensor_var`.
    norm_axes: sequence (list or tuple)
        The axes over which to compute the norm.  This overrides the
        default norm axes defined for the number of dimensions
        in `tensor_var`. When this is not specified and `tensor_var` is a
        matrix (2D), this is set to `(0,)`. If `tensor_var` is a 3D, 4D or
        5D tensor, it is set to a tuple listing all axes but axis 0. The
        former default is useful for working with dense layers, the latter
        is useful for 1D, 2D and 3D convolutional layers.
        (Optional)
    epsilon: scalar, optional
        Value used to prevent numerical instability when dividing by
        very small or zero norms.

    Returns
    -------
    TensorVariable
        Input `tensor_var` with rescaling applied to weight vectors
        that violate the specified constraints.

    Raises
    ------
    ValueError
        If `norm_axes` is not given and `tensor_var` is not 2D to 5D.

    Examples
    --------
    >>> param = aesara.shared(
    ...     np.random.randn(100, 200).astype(aesara.config.floatX))
    >>> update = param + 100
    >>> update = norm_constraint(update, 10)
    >>> func = aesara.function([], [], updates=[(param, update)])
    >>> # Apply constrained update
    >>> _ = func()
    >>> from lasagne.utils import compute_norms
    >>> norms = compute_norms(param.get_value())
    >>> np.isclose(np.max(norms), 10)
    True

    Notes
    -----
    When `norm_axes` is not specified, the axes over which the norm is
    computed depend on the dimensionality of the input variable. If it is
    2D, it is assumed to come from a dense layer, and the norm is computed
    over axis 0. If it is 3D, 4D or 5D, it is assumed to come from a
    convolutional layer and the norm is computed over all trailing axes
    beyond axis 0. For other uses, you should explicitly specify the axes
    over which to compute the norm using `norm_axes`.
    """
    ndim = tensor_var.ndim

    if norm_axes is not None:
        sum_over = tuple(norm_axes)
    elif ndim == 2:  # DenseLayer
        sum_over = (0,)
    elif ndim in [3, 4, 5]:  # Conv{1,2,3}DLayer
        sum_over = tuple(range(1, ndim))
    else:
        # The two sentences used to be joined without a separating space.
        raise ValueError(
            f"Unsupported tensor dimensionality {ndim}. Must specify `norm_axes`"
        )

    dtype = np.dtype(aesara.config.floatX).type
    norms = at.sqrt(at.sum(at.sqr(tensor_var), axis=sum_over, keepdims=True))
    # Norms already below the threshold are left (almost) unchanged.
    target_norms = at.clip(norms, 0, dtype(max_norm))
    constrained_output = tensor_var * (target_norms / (dtype(epsilon) + norms))

    return constrained_output
def log_jac_det(self, value, *inputs):
    """Sum all but the first entry of ``value`` along the last axis.

    ``inputs`` is accepted for interface compatibility and is not used.
    """
    trailing_entries = value[..., 1:]
    return at.sum(trailing_entries, axis=-1)
def backward(self, value, *inputs):
    """Append ``1 - sum(value)`` (over the last axis) as a final entry of ``value``.

    ``inputs`` is accepted for interface compatibility and is not used.
    """
    given = value[..., :]
    remaining = 1 - at.sum(given, axis=-1, keepdims=True)
    return at.concatenate([given, remaining], axis=-1)
def diag(self, X):
    """Return the sum over axis 1 of the squared second output of ``self._common(X, None)``."""
    _, centered, _ = self._common(X, None)
    squared = at.square(centered)
    return at.sum(squared, 1)
def log_jac_det(self, value, *inputs):
    """Sum the entries of ``value`` at the indices returned by ``self.param_extract_fn(inputs)``."""
    selected_idxs = self.param_extract_fn(inputs)
    selected_entries = value[selected_idxs]
    return at.sum(selected_entries)
] p2['f'] = 100 * (x_dev[1] - x_dev[0]**2)**2 + (1 - x_dev[0])**2 p2['ce'] = None p2['neq'] = 0 p2['ci'] = None p2['nineq'] = 0 p2['init'] = np.random.randn(2).astype(float_dtype) p2['ground_truth'] = [np.array([1.0, 1.0], dtype=float_dtype)] #test_problems.append(p2) p3 = dict() p3['text_statements'] = [ 'maximize f(x, y) = x + y subject to x**2 + y**2 = 1' ] p3['f'] = -T.sum(x_dev) p3['ce'] = T.sum(x_dev**2) - 1.0 p3['neq'] = 1 p3['ci'] = None p3['nineq'] = 0 p3['init'] = np.random.randn(2).astype(float_dtype) p3['ground_truth'] = [ np.array([np.sqrt(2.0) / 2.0, np.sqrt(2.0) / 2.0], dtype=float_dtype) ] #test_problems.append(p3) p4 = dict() p4['text_statements'] = [ 'maximize f(x, y) = (x**2)*y subject to x**2 + y**2 = 3' ]
def logpt(
    var: Union[TensorVariable, List[TensorVariable]],
    rv_values: Optional[Union[TensorVariable, Dict[TensorVariable, TensorVariable]]] = None,
    *,
    jacobian: bool = True,
    scaling: bool = True,
    transformed: bool = True,
    sum: bool = True,
    **kwargs,
) -> Union[TensorVariable, List[TensorVariable]]:
    """Create a measure-space (i.e. log-likelihood) graph for a random variable
    or a list of random variables at a given point.

    The input `var` determines which log-likelihood graph is used and
    `rv_values` is that graph's input parameter.  For example, if `var` is
    the output of a ``NormalRV`` ``Op``, then the output is a graph of the
    density function for `var` set to the value `rv_values`.

    Parameters
    ==========
    var
        The `RandomVariable` output that determines the log-likelihood graph.
        Can also be a list (or tuple) of variables.
    rv_values
        A variable, or ``dict`` of variables, that represents the value of
        `var` in its log-likelihood.  If nothing is provided,
        ``var.tag.observations`` and then ``var.tag.value_var`` are checked
        and, when available, used.  A bare (non-mapping) value is only
        accepted when a single variable is requested.
    jacobian
        Whether or not to include the Jacobian term.
    scaling
        A scaling term to apply to the generated log-likelihood graph.
    transformed
        Apply transforms.
    sum
        Sum the log-likelihood or return each term as a separate list item.
    **kwargs
        Forwarded to ``factorized_joint_logprob``.

    Returns
    =======
    The summed log-likelihood graph when `sum` is true.  Otherwise a list
    with one term per requested value variable, or the bare term when there
    is exactly one.

    Raises
    ======
    ValueError
        If no value variable can be found for a requested variable, or if a
        bare value is given together with more than one variable.
    """
    # TODO: In future when we drop support for tag.value_var most of the following
    # logic can be removed and logpt can just be a wrapper function that calls aeppl's
    # joint_logprob directly.

    def _tagged_value(variable):
        # Observations take precedence over the tagged value variable.
        tag = variable.tag
        return getattr(tag, "observations", getattr(tag, "value_var", None))

    # Always work on a sequence of variables.
    if not isinstance(var, (list, tuple)):
        var = [var]

    if rv_values is None:
        # No explicit values: fall back to what is tagged on each variable.
        rv_values = {}
        for rv in var:
            tagged = _tagged_value(rv)
            if tagged is None:
                raise ValueError(f"No value variable found for var {rv}")
            rv_values[rv] = tagged
    elif not isinstance(rv_values, Mapping):
        # A bare value is only meaningful for a single requested variable.
        if len(var) != 1:
            raise ValueError("rv_values must be a dict if more than one var is requested")
        rv_values = {var[0]: at.as_tensor_variable(rv_values).astype(var[0].type)}

    if scaling:
        rv_scalings = {
            value_var: _get_scaling(
                getattr(rv.tag, "total_size", None), value_var.shape, value_var.ndim
            )
            for rv, value_var in rv_values.items()
        }

    # Aeppl needs all rv-values pairs, not just that of the requested var.
    # Hence we iterate through the graph to collect them.
    tmp_rvs_to_values = rv_values.copy()
    for node in io_toposort(graph_inputs(var), var):
        try:
            candidates = [node.default_output()]
        except ValueError:
            candidates = node.outputs
        for candidate in candidates:
            if candidate not in tmp_rvs_to_values:
                tagged = _tagged_value(candidate)
                if tagged is not None:
                    tmp_rvs_to_values[candidate] = tagged

    # After collecting all necessary rvs and values, we check for any value transforms
    transform_map = {}
    if transformed:
        for rv, value_var in tmp_rvs_to_values.items():
            if hasattr(value_var.tag, "transform"):
                transform_map[value_var] = value_var.tag.transform
                continue
            # If the provided value_variable does not have transform information, we
            # check if the original `rv.tag.value_var` does.
            # TODO: This logic should be replaced by an explicit dict of
            #  `{value_var: transform}` similar to `rv_values`.
            original_value_var = getattr(rv.tag, "value_var", None)
            if original_value_var is not None and hasattr(original_value_var.tag, "transform"):
                transform_map[value_var] = original_value_var.tag.transform

    temp_logp_var_dict = factorized_joint_logprob(
        tmp_rvs_to_values,
        extra_rewrites=TransformValuesOpt(transform_map),
        use_jacobian=jacobian,
        **kwargs,
    )

    # aeppl returns the logpt for every single value term we provided to it. This includes
    # the extra values we plugged in above, so we filter those we actually wanted in the
    # same order they were given in.
    logp_var_dict = {
        value_var: temp_logp_var_dict[value_var] for value_var in rv_values.values()
    }

    if scaling:
        for value_var in logp_var_dict:
            if value_var in rv_scalings:
                logp_var_dict[value_var] *= rv_scalings[value_var]

    if sum:
        return at.sum([at.sum(factor) for factor in logp_var_dict.values()])

    # TODO: deprecate special behavior when only one variable is requested and
    # always return a list. This is here for backwards compatibility as logpt
    # started as a replacement to factor.logpt, but it should now be considered an
    # internal function reached only via model.logp* methods.
    logp_terms = list(logp_var_dict.values())
    if len(logp_terms) == 1:
        return logp_terms[0]
    return logp_terms
def logcdfpt(
    var: TensorVariable,
    rv_values: Optional[Union[TensorVariable, Dict[TensorVariable, TensorVariable]]] = None,
    *,
    scaling: bool = True,
    sum: bool = True,
    **kwargs,
) -> TensorVariable:
    """Create a measure-space (i.e. log-cdf) graph for a random variable at a given point.

    Parameters
    ==========
    var
        The `RandomVariable` output that determines the log-cdf graph.
    rv_values
        A variable, or ``dict`` of variables, that represents the value of
        `var` in its log-cdf.  If no value is provided, the value variable
        returned by ``extract_rv_and_value_vars(var)`` is used when available.
    scaling
        A scaling term to apply to the generated log-cdf graph.
    sum
        Sum the log-cdf.
    **kwargs
        Forwarded to ``_logcdf``.

    Returns
    =======
    The log-cdf graph, named ``__logp_<name>`` when the random variable has a
    name.

    Raises
    ======
    ValueError
        If no random variable can be extracted from `var`, or if no value
        variable is specified or associated with it.
    """
    if not isinstance(rv_values, Mapping):
        rv_values = {var: rv_values} if rv_values is not None else {}

    rv_var, rv_value_var = extract_rv_and_value_vars(var)

    # Everything below dereferences ``rv_var``; fail with a clear message
    # instead of an ``AttributeError`` on ``None``.
    if rv_var is None:
        raise ValueError(f"No random variable could be extracted from {var}")

    rv_value = rv_values.get(rv_var, rv_value_var)

    if rv_value is None:
        raise ValueError(f"No value variable specified or associated with {rv_var}")

    rv_value = at.as_tensor(rv_value)
    # Make sure that the value is compatible with the random variable
    rv_value = rv_var.type.filter_variable(rv_value.astype(rv_var.dtype))

    if rv_value_var is None:
        rv_value_var = rv_value

    rv_node = rv_var.owner

    # The first three inputs (bound to rng, size and dtype in the original
    # code) are not needed here; the rest are the distribution parameters.
    _, _, _, *dist_params = rv_node.inputs

    # Here, we plug the actual random variable into the log-likelihood graph,
    # because we want a log-likelihood graph that only contains
    # random variables.  This is important, because a random variable's
    # parameters can contain random variables themselves.
    # Ultimately, with a graph containing only random variables and
    # "deterministics", we can simply replace all the random variables with
    # their value variables and be done.
    tmp_rv_values = rv_values.copy()
    tmp_rv_values[rv_var] = rv_var

    logp_var = _logcdf(rv_node.op, rv_var, tmp_rv_values, *dist_params, **kwargs)

    # Replace random variables with their value variables
    replacements = rv_values.copy()
    replacements.update({rv_var: rv_value, rv_value_var: rv_value})

    (logp_var,), _ = rvs_to_value_vars(
        (logp_var,),
        apply_transforms=False,
        initial_replacements=replacements,
    )

    if sum:
        logp_var = at.sum(logp_var)

    if scaling:
        logp_var *= _get_scaling(
            getattr(rv_var.tag, "total_size", None), rv_value.shape, rv_value.ndim
        )

    # Recompute test values for the changes introduced by the replacements
    # above.
    if config.compute_test_value != "off":
        for node in io_toposort(graph_inputs((logp_var,)), (logp_var,)):
            compute_test_value(node)

    if rv_var.name is not None:
        logp_var.name = f"__logp_{rv_var.name}"

    return logp_var
def logpt(
    var: TensorVariable,
    rv_values: Optional[Union[TensorVariable, Dict[TensorVariable, TensorVariable]]] = None,
    *,
    jacobian: bool = True,
    scaling: bool = True,
    transformed: bool = True,
    sum: bool = True,
    **kwargs,
) -> Optional[TensorVariable]:
    """Create a measure-space (i.e. log-likelihood) graph for a random variable
    or a list of random variables at a given point.

    The input `var` determines which log-likelihood graph is used and
    `rv_values` is that graph's input parameter.  For example, if `var` is
    the output of a ``NormalRV`` ``Op``, then the output is a graph of the
    density function for `var` set to the value `rv_values`.

    Parameters
    ==========
    var
        The `RandomVariable` output that determines the log-likelihood graph.
        Can also be a list of variables.  The final log-likelihood graph will
        combine the individual log-likelihood graphs of the variables in the
        list.
    rv_values
        A variable, or ``dict`` of variables, that represents the value of
        `var` in its log-likelihood.  If nothing is provided,
        ``var.tag.observations`` and then ``var.tag.value_var`` are checked
        and, when available, used.
    jacobian
        Whether or not to include the Jacobian term.
    scaling
        A scaling term to apply to the generated log-likelihood graph.
    transformed
        Apply transforms.
    sum
        Sum the log-likelihood.
    **kwargs
        Forwarded to ``factorized_joint_logprob``.

    Returns
    =======
    The log-likelihood graph, or ``None`` when no log-likelihood term matches
    the requested value variables.
    """
    # TODO: In future when we drop support for tag.value_var most of the following
    # logic can be removed and logpt can just be a wrapper function that calls aeppl's
    # joint_logprob directly.

    # If var is not a list make it one.
    if not isinstance(var, list):
        var = [var]

    # If logpt isn't provided values and the variable (provided in var)
    # is an RV, it is assumed that the tagged value var or observation is
    # the value variable for that particular RV.
    if rv_values is None:
        rv_values = {}
        for _var in var:
            if isinstance(_var.owner.op, RandomVariable):
                rv_value_var = getattr(
                    _var.tag, "observations", getattr(_var.tag, "value_var", _var)
                )
                # Accumulate one entry per RV.  Rebinding the whole dict here
                # would keep only the last variable of a list.
                rv_values[_var] = rv_value_var
    elif not isinstance(rv_values, Mapping):
        # Else if we're given a single value and a single variable we assume a mapping among them.
        rv_values = (
            {var[0]: at.as_tensor_variable(rv_values).astype(var[0].type)} if len(var) == 1 else {}
        )

    # Since the filtering of logp graph is based on value variables
    # provided to this function
    if not rv_values:
        warnings.warn("No value variables provided the logp will be an empty graph")

    if scaling:
        rv_scalings = {}
        for _var in var:
            rv_value_var = getattr(_var.tag, "observations", getattr(_var.tag, "value_var", _var))
            rv_scalings[rv_value_var] = _get_scaling(
                getattr(_var.tag, "total_size", None), rv_value_var.shape, rv_value_var.ndim
            )

    # Unlike aeppl, PyMC's logpt is expected to plug in the values variables to corresponding
    # RVs automatically unless the values are explicity set to None. Hence we iterate through
    # the graph to find RVs and construct a new RVs to values dictionary.
    tmp_rvs_to_values = rv_values.copy()
    transform_map = {}
    for node in io_toposort(graph_inputs(var), var):
        if isinstance(node.op, RandomVariable):
            curr_var = node.out
            rv_value_var = getattr(
                curr_var.tag, "observations", getattr(curr_var.tag, "value_var", curr_var)
            )
            rv_value = rv_values.get(curr_var, rv_value_var)
            tmp_rvs_to_values[curr_var] = rv_value
            # Along with value variables we also check for transforms if any.
            if hasattr(rv_value_var.tag, "transform") and transformed:
                transform_map[rv_value] = rv_value_var.tag.transform
        # The condition below is a hackish way of excluding the value variable for the
        # RV being indexed in case of Advanced Indexing of RVs. It gets added by the
        # logic above but aeppl does not expect us to include it in the dictionary of
        # {RV:values} given to it.
        if isinstance(node.op, subtensor_types):
            curr_var = node.out
            if (
                curr_var in tmp_rvs_to_values
                and curr_var.owner.inputs[0] in tmp_rvs_to_values
            ):
                tmp_rvs_to_values.pop(curr_var.owner.inputs[0])

    transform_opt = TransformValuesOpt(transform_map)
    temp_logp_var_dict = factorized_joint_logprob(
        tmp_rvs_to_values, extra_rewrites=transform_opt, use_jacobian=jacobian, **kwargs
    )

    # aeppl returns the logpt for every single value term we provided to it. This includes
    # the extra values we plugged in above so we need to filter those out.
    logp_var_dict = {}
    for value_var, _logp in temp_logp_var_dict.items():
        if value_var in rv_values.values():
            logp_var_dict[value_var] = _logp

    # If it's an empty dictionary the logp is None
    if not logp_var_dict:
        return None

    # Otherwise apply appropriate scalings and at.add and/or at.sum the
    # graphs accordingly.
    if scaling:
        for _value in logp_var_dict:
            if _value in rv_scalings:
                logp_var_dict[_value] *= rv_scalings[_value]

    if len(logp_var_dict) == 1:
        single_logp = tuple(logp_var_dict.values())[0]
        logp_var = at.sum(single_logp) if sum else single_logp
    elif sum:
        logp_var = at.sum([at.sum(factor) for factor in logp_var_dict.values()])
    else:
        logp_var = at.add(*logp_var_dict.values())

    # Recompute test values for the changes introduced by the replacements
    # above.
    if config.compute_test_value != "off":
        for node in io_toposort(graph_inputs((logp_var,)), (logp_var,)):
            compute_test_value(node)

    return logp_var
def logpt(
    var: TensorVariable,
    rv_values: Optional[Union[TensorVariable, Dict[TensorVariable, TensorVariable]]] = None,
    *,
    jacobian: bool = True,
    scaling: bool = True,
    transformed: bool = True,
    cdf: bool = False,
    sum: bool = False,
    **kwargs,
) -> TensorVariable:
    """Create a measure-space (i.e. log-likelihood) graph for a random variable at a given point.

    The input `var` determines which log-likelihood graph is used and
    `rv_value` is that graph's input parameter.  For example, if `var` is
    the output of a ``NormalRV`` ``Op``, then the output is a graph of the
    density function for `var` set to the value `rv_value`.

    Parameters
    ==========
    var
        The `RandomVariable` output that determines the log-likelihood graph.
    rv_values
        A variable, or ``dict`` of variables, that represents the value of
        `var` in its log-likelihood.  If no `rv_value` is provided,
        ``var.tag.value_var`` will be checked and, when available, used.
    jacobian
        Whether or not to include the Jacobian term.
    scaling
        A scaling term to apply to the generated log-likelihood graph.
    transformed
        Apply transforms.
    cdf
        Return the log cumulative distribution.
    sum
        Sum the log-likelihood.
    **kwargs
        Forwarded to ``_logp`` / ``_logcdf`` when a random variable is found.

    Returns
    =======
    The log-likelihood (or log-cdf) graph.  When no random variable can be
    extracted from `var`, the result of dispatching ``_logp`` on ``var``'s
    owner op is returned, or zeros shaped like `var` if it has no owner.

    Raises
    ======
    ValueError
        If a random variable is found but no value is specified or
        associated with it.
    """
    # A bare value is interpreted as the value of `var` itself.
    if not isinstance(rv_values, Mapping):
        rv_values = {var: rv_values} if rv_values is not None else {}

    rv_var, rv_value_var = extract_rv_and_value_vars(var)

    # An explicitly provided value takes precedence over the extracted one.
    rv_value = rv_values.get(rv_var, rv_value_var)

    if rv_var is not None and rv_value is None:
        raise ValueError(
            f"No value variable specified or associated with {rv_var}")

    if rv_value is not None:
        rv_value = at.as_tensor(rv_value)

        if rv_var is not None:
            # Make sure that the value is compatible with the random variable
            rv_value = rv_var.type.filter_variable(
                rv_value.astype(rv_var.dtype))

        if rv_value_var is None:
            rv_value_var = rv_value

    # No random variable could be extracted: dispatch on the op that produced
    # `var`, or contribute nothing when `var` has no owner.
    if rv_var is None:
        if var.owner is not None:
            return _logp(
                var.owner.op,
                var,
                rv_values,
                *var.owner.inputs,
                jacobian=jacobian,
                scaling=scaling,
                transformed=transformed,
                cdf=cdf,
                sum=sum,
            )

        return at.zeros_like(var)

    rv_node = rv_var.owner

    # Only the trailing inputs (the distribution parameters) are used below.
    rng, size, dtype, *dist_params = rv_node.inputs

    # Here, we plug the actual random variable into the log-likelihood graph,
    # because we want a log-likelihood graph that only contains
    # random variables.  This is important, because a random variable's
    # parameters can contain random variables themselves.
    # Ultimately, with a graph containing only random variables and
    # "deterministics", we can simply replace all the random variables with
    # their value variables and be done.
    tmp_rv_values = rv_values.copy()
    tmp_rv_values[rv_var] = rv_var

    if not cdf:
        logp_var = _logp(rv_node.op, rv_var, tmp_rv_values, *dist_params, **kwargs)
    else:
        logp_var = _logcdf(rv_node.op, rv_var, tmp_rv_values, *dist_params, **kwargs)

    transform = getattr(rv_value_var.tag, "transform", None) if rv_value_var else None

    # The Jacobian term is only added for densities (not for the log-cdf).
    if transform and transformed and not cdf and jacobian:
        transformed_jacobian = transform.jacobian_det(rv_var, rv_value)
        if transformed_jacobian:
            # Reduce the last axis of the log-density when it has more
            # dimensions than the Jacobian term, before adding the two.
            if logp_var.ndim > transformed_jacobian.ndim:
                logp_var = logp_var.sum(axis=-1)
            logp_var += transformed_jacobian

    # Replace random variables with their value variables
    replacements = rv_values.copy()
    replacements.update({rv_var: rv_value, rv_value_var: rv_value})

    (logp_var, ), _ = rvs_to_value_vars(
        (logp_var, ),
        apply_transforms=transformed and not cdf,
        initial_replacements=replacements,
    )

    if sum:
        logp_var = at.sum(logp_var)

    if scaling:
        logp_var *= _get_scaling(getattr(rv_var.tag, "total_size", None),
                                 rv_value.shape, rv_value.ndim)

    # Recompute test values for the changes introduced by the replacements
    # above.
    if config.compute_test_value != "off":
        for node in io_toposort(graph_inputs((logp_var, )), (logp_var, )):
            compute_test_value(node)

    # The "__logp_" prefix is used for the log-cdf graph as well.
    if rv_var.name is not None:
        logp_var.name = "__logp_%s" % rv_var.name

    return logp_var