def marginal_maxent_generic(dist, rvs, **kwargs):
    from cvxopt import matrix

    verbose = kwargs.get('verbose', False)
    logger = basic_logger('dit.maxentropy', verbose)

    rv_mode = kwargs.pop('rv_mode', None)
    A, b = marginal_constraints_generic(dist, rvs, rv_mode)

    # Reduce the size of A so that only nonzero elements are searched.
    # Also make it full rank.
    variables = isolate_zeros_generic(dist, rvs)
    Asmall = A[:, variables.nonzero]  # pylint: disable=no-member
    Asmall, b, rank = as_full_rank(Asmall, b)
    Asmall = matrix(Asmall)
    b = matrix(b)

    # Set the cvxopt info level based on the logging.INFO level.
    if logger.isEnabledFor(logging.INFO):
        show_progress = True
    else:
        show_progress = False

    logger.info("Finding initial distribution.")
    initial_x, _ = initial_point_generic(dist, rvs, A=Asmall, b=b,
                                         isolated=variables,
                                         show_progress=show_progress)
    initial_x = matrix(initial_x)
    objective = negentropy

    # We optimize the reduced problem.
    #
    # For the gradient, we keep the elements we know to be zero at zero.
    # In general, the gradient is: log2(x_i) + 1 / ln(b), where b is the
    # log base (here, 2).
    nonzero = variables.nonzero  # pylint: disable=no-member

    ln2 = np.log(2)

    def gradient(x):
        # This operates only on nonzero elements.
        xarr = np.asarray(x)
        # All of the optimization elements should be greater than zero,
        # but occasionally they might go slightly negative or hit zero.
        # In those cases, we set the gradient to zero and keep the value
        # fixed from that point forward.
        bad_x = xarr <= 0
        grad = np.log2(xarr) + 1 / ln2
        grad[bad_x] = 0
        return matrix(grad)

    logger.info("Finding maximum entropy distribution.")
    x, obj = frank_wolfe(objective, gradient, Asmall, b, initial_x, **kwargs)
    x = np.asarray(x).transpose()[0]

    # Rebuild the full distribution.
    xfinal = np.zeros(A.shape[1])
    xfinal[nonzero] = x

    return xfinal, obj  # , Asmall, b, variables
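# --- Usage sketch (not part of the library) --------------------------------
# A minimal, hypothetical illustration of how marginal_maxent_generic might
# be driven. Treating `rvs` as a list of index lists naming the marginals to
# match is an assumption based on the call to marginal_constraints_generic
# above; the XOR distribution is just a convenient example.
def _example_marginal_maxent():
    import dit

    # XOR: the third bit is the parity of the first two.
    d = dit.Distribution(['000', '011', '101', '110'], [0.25] * 4)

    # Maxent distribution matching the pairwise marginals that include the
    # target-like variable (index 2).
    pmf, neg_h = marginal_maxent_generic(d, [[0, 2], [1, 2]])

    # `pmf` lives on the full outcome space of `d`; the achieved entropy in
    # bits is the negated objective value.
    return pmf, -neg_h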
def frank_wolfe(objective, gradient, A, b, initial_x,
                maxiters=2000, tol=1e-4, clean=True, verbose=None):
    """
    Uses the Frank--Wolfe algorithm to minimize the convex objective.

    Minimization is subject to the linear equality constraint: A x = b.

    Assumes x should be nonnegative.

    Parameters
    ----------
    objective : callable
        The objective function. It should receive a ``cvxopt`` matrix for the
        input `x` and return the value of the objective function evaluated
        at `x`.
    gradient : callable
        The gradient function. It should receive a ``cvxopt`` matrix for the
        input `x` and return the value of the gradient evaluated at `x`.
    A : matrix
        A ``cvxopt`` matrix specifying the LHS of the linear equality
        constraints.
    b : matrix
        A ``cvxopt`` matrix specifying the RHS of the linear equality
        constraints.
    initial_x : matrix
        A ``cvxopt`` matrix specifying the initial `x` to use.
    maxiters : int
        The maximum number of iterations to perform. If convergence was not
        reached after the last iteration, a warning is issued and the current
        value of `x` is returned.
    tol : float
        The tolerance used to determine when we have converged to the optimum.
    clean : bool
        Occasionally, the iteration process will take nonnegative values to
        be ever so slightly negative. If ``True``, then we forcibly set such
        values to zero and renormalize the vector. This is an
        application-specific decision and is probably not more generally
        useful.
    verbose : int
        An integer representing the logging level a la the ``logging``
        module. If `None`, then (effectively) the log level is set to
        `WARNING`. For a bit more information, set this to `logging.INFO`.
        For a bit less, set this to `logging.ERROR`, or perhaps 100.

    """
    # Function-level import to avoid a circular import.
    from dit.algorithms.optutil import op_runner

    # Function-level import to keep the cvxopt dependency optional.
    # All variables should be cvxopt variables, not NumPy arrays.
    from cvxopt import matrix
    from cvxopt.modeling import variable

    # Set up a custom logger.
    logger = basic_logger('dit.frankwolfe', verbose)

    # Set the cvxopt info level based on the logging.DEBUG level.
    if logger.isEnabledFor(logging.DEBUG):
        show_progress = True
    else:
        show_progress = False

    assert A.size[1] == initial_x.size[0]

    n = initial_x.size[0]
    x = initial_x
    xdiff = 0

    TOL = 1e-7
    verbosechunk = maxiters / 10
    for i in range(maxiters):
        obj = objective(x)
        grad = gradient(x)

        # Linearized subproblem: minimize grad^T xbar over the feasible set.
        xbar = variable(n)
        new_objective = grad.T * xbar
        constraints = []
        constraints.append((xbar >= 0))
        constraints.append((-TOL <= A * xbar - b))
        constraints.append((A * xbar - b <= TOL))

        logger.debug('FW Iteration: {}'.format(i))

        opt = op_runner(new_objective, constraints,
                        show_progress=show_progress)
        if opt.status != 'optimal':
            msg = '\tFrank-Wolfe: Did not find optimal direction on '
            msg += 'iteration {}: {}'
            msg = msg.format(i, opt.status)
            logger.info(msg)

        # Calculate the optimality gap.
        xbar_opt = opt.variables()[0].value
        opt_bd = grad.T * (xbar_opt - x)

        msg = "i={:6} obj={:10.7f} opt_bd={:10.7f} xdiff={:12.10f}"
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug(msg.format(i, obj, opt_bd[0, 0], xdiff))
            logger.debug("")
        elif i % verbosechunk == 0:
            logger.info(msg.format(i, obj, opt_bd[0, 0], xdiff))

        # Standard Frank--Wolfe step toward the linear minimizer.
        xnew = (i * x + 2 * xbar_opt) / (i + 2)
        xdiff = np.linalg.norm(xnew - x)
        x = xnew

        if xdiff < tol:
            obj = objective(x)
            break
    else:
        msg = "Only converged to xdiff={:12.10f} after {} iterations. "
        msg += "Desired: {}"
        logger.warning(msg.format(xdiff, maxiters, tol))

    xopt = np.array(x)
    if clean:
        xopt[np.abs(xopt) < tol] = 0
        xopt /= xopt.sum()

    return xopt, obj
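# --- Usage sketch (not part of the library) --------------------------------
# A small, hypothetical check of frank_wolfe on the easiest maxent problem:
# maximize entropy over the probability simplex, whose only equality
# constraint is that the entries sum to one. The objective/gradient pair
# mirrors the one constructed in marginal_maxent_generic.
def _example_frank_wolfe():
    from cvxopt import matrix

    def neg_entropy(x):
        # Negative entropy in bits, ignoring zero entries.
        p = np.asarray(x).ravel()
        p = p[p > 0]
        return float(np.sum(p * np.log2(p)))

    def neg_entropy_grad(x):
        # Gradient log2(p_i) + 1/ln(2), with zero entries held fixed.
        p = np.asarray(x)
        with np.errstate(divide='ignore', invalid='ignore'):
            g = np.log2(p) + 1 / np.log(2)
        g[p <= 0] = 0
        return matrix(g)

    n = 4
    A = matrix(np.ones((1, n)))   # sum_i x_i == 1
    b = matrix(np.ones(1))
    x0 = matrix(np.array([0.7, 0.1, 0.1, 0.1]).reshape(n, 1))

    xopt, obj = frank_wolfe(neg_entropy, neg_entropy_grad, A, b, x0, tol=1e-3)

    # xopt should be approximately uniform and obj close to -2
    # (i.e., 2 bits of entropy).
    return xopt, obj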
def __init__(self, dist, sources, target, k=2, rv_mode=None,
             extra_constraints=True, source_marginal=False, tol=None,
             prng=None, verbose=None):
    """
    Initialize an optimizer for the partial information framework.

    Parameters
    ----------
    dist : distribution
        The distribution used to calculate the partial information.
    sources : list of lists
        The source random variables. Each element is a list of random
        variables in `dist` that together define one source.
    target : list
        The random variables in `dist` that define the target.
    k : int
        The size of the marginals that are constrained to equal marginals
        from `dist`. For the calculation of unique information, we use k=2.
        Note that these marginals include the target random variable.
    rv_mode : str, None
        Specifies how to interpret the elements of each source and the
        target. Valid options are: {'indices', 'names'}. If equal to
        'indices', then the elements of each source and the target are
        interpreted as random variable indices. If equal to 'names', the
        elements are interpreted as random variable names. If `None`, then
        the value of `dist._rv_mode` is consulted.
    extra_constraints : bool
        When possible, additional constraints beyond the required marginal
        constraints are added to the optimization problem. These exist when
        there are values of the inputs and outputs that satisfy
        p(inputs | outputs) = 1. In that case, p(inputs, outputs) is equal
        to q(inputs, outputs) for every q in the feasible set.
    source_marginal : bool
        If `True`, also require that the source marginal distribution
        p(X_1, ..., X_n) is matched. This yields a distribution such that
        S^k := H(q) - H(p) is the information that is not captured by
        matching the k-way marginals that include the target. With k=1,
        this is the mutual information between the sources and the target.
    tol : float | None
        The desired convergence tolerance.
    prng : RandomState
        A NumPy-compatible pseudorandom number generator.
    verbose : int
        An integer representing the logging level a la the ``logging``
        module. If `None`, then (effectively) the log level is set to
        `WARNING`. For a bit more information, set this to `logging.INFO`.
        For a bit less, set this to `logging.ERROR`, or perhaps 100.

    """
    self.logger = basic_logger('dit.pid_broja', verbose)

    # Store the original parameters in case we want to construct an
    # "uncoalesced" distribution from the optimal distribution.
    self.dist_original = dist
    self._params = Bunch(sources=sources, target=target, rv_mode=rv_mode)

    self.dist = prepare_dist(dist, sources, target, rv_mode=rv_mode)
    self.k = k
    self.extra_constraints = extra_constraints
    self.source_marginal = source_marginal
    self.verbose = verbose

    super(MaximumConditionalEntropy, self).__init__(self.dist, tol=tol,
                                                    prng=prng)
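# --- Usage sketch (not part of the library) --------------------------------
# A hypothetical illustration of constructing the optimizer for the classic
# AND-gate example. The class name MaximumConditionalEntropy is taken from
# the super() call above; the convention that sources and target are lists of
# random-variable indices is an assumption based on the docstring.
def _example_pid_broja_setup():
    import dit

    # AND gate: two uniform input bits, the target is their logical AND.
    d = dit.Distribution(['000', '010', '100', '111'], [0.25] * 4)

    # Two single-variable sources (indices 0 and 1) and the target (index 2),
    # constraining the pairwise (k=2) marginals that include the target.
    return MaximumConditionalEntropy(d, sources=[[0], [1]], target=[2], k=2)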