def _get_grad_log_post(W1D, Wprior, H, y, X, testing=False):
    """Returns multinomial gradient of the negative log posterior probability with C classes.

   Parameters
   ----------
   W1D : array-like, shape (C*p, )
       Flattened vector of parameters at which the negative log posterior is to be evaluated
   Wprior : array-like, shape (C, p)
       vector of prior means on the parameters to be fit
   H : array-like, shape (C*p, C*p) or independent between classes (C, p, p)
       Array of prior Hessian (inverse covariance of prior distribution of parameters)
   y : array-like, shape (N, ) starting at 0
       vector of binary ({0, 1, ... C} possible responses)
   X : array-like, shape (N, p)
       array of features

   Returns
   -------
    grad_log_post1D : array-like, shape (C*p, )
            Flattened gradient of negative log posterior

   References
   ----------
   Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
   Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)
    """

    # calculate gradient log posterior
    C, p = Wprior.shape
    W = W1D.reshape(C, p)

    mu = _get_softmax_probs(X, W)  # shape (N, C)
    grad_log_likelihood = np.zeros_like(W)
    grad_log_prior = np.zeros_like(W)

    # likelihood gradient is computed per class, regardless of the prior Hessian layout
    per_class_prior = H.shape == (C, p, p)
    for c in range(C):
        grad_log_likelihood[c] = X.T @ (mu[:, c] - np.int32(y == c))
        if per_class_prior:
            # block-diagonal prior: each class has its own (p, p) Hessian
            grad_log_prior[c] = H[c] @ (W[c] - Wprior[c])

    if H.shape == (C * p, C * p):
        # full prior Hessian couples classes: work in the flattened space
        K = (W - Wprior).reshape(-1)  # flatten to shape (C*p, )
        grad_log_prior = (H @ K).reshape(C, p)  # back to shape (C, p)

    grad_log_posterior = grad_log_likelihood + grad_log_prior
    grad_log_post1D = grad_log_posterior.reshape(-1)

    if testing:
        return [grad_log_post1D, grad_log_likelihood.reshape(-1), grad_log_prior.reshape(-1)]
    else:
        return grad_log_post1D
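
# Illustrative gradient check for _get_grad_log_post (a sketch, not part of the
# original module): compares the analytic gradient against a finite-difference
# approximation of _get_f_log_posterior, which is assumed to be defined
# alongside this function with the signature (W1D, Wprior, H, y, X).
import numpy as np
from scipy.optimize import check_grad

rng = np.random.default_rng(0)
N, C, p = 50, 3, 4
X_demo = rng.normal(size=(N, p))
y_demo = rng.integers(0, C, size=N)
Wprior_demo = np.zeros((C, p))
H_demo = np.eye(C * p)  # unit-variance Gaussian prior, shape (C*p, C*p)

# check_grad returns the norm of the difference between the analytic and
# finite-difference gradients; it should be close to zero.
err = check_grad(_get_f_log_posterior, _get_grad_log_post,
                 Wprior_demo.reshape(-1), Wprior_demo, H_demo, y_demo, X_demo)
print('gradient check error:', err)
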
def plot_images(images,
                ax,
                ims_per_row=5,
                padding=5,
                digit_dimensions=(28, 28),
                cmap=matplotlib.cm.binary,
                vmin=None,
                vmax=None):
    """Images should be a (N_images x pixels) matrix."""
    N_images = images.shape[0]
    N_rows = np.int32(np.ceil(float(N_images) / ims_per_row))
    pad_value = np.min(images.ravel())
    concat_images = np.full(
        ((digit_dimensions[0] + padding) * N_rows + padding,
         (digit_dimensions[1] + padding) * ims_per_row + padding), pad_value)
    for i in range(N_images):
        cur_image = np.reshape(images[i, :], digit_dimensions)
        row_ix = i // ims_per_row
        col_ix = i % ims_per_row
        row_start = padding + (padding + digit_dimensions[0]) * row_ix
        col_start = padding + (padding + digit_dimensions[1]) * col_ix
        concat_images[row_start:row_start + digit_dimensions[0],
                      col_start:col_start + digit_dimensions[1]] = cur_image
    cax = ax.matshow(concat_images, cmap=cmap, vmin=vmin, vmax=vmax)
    plt.xticks(np.array([]))
    plt.yticks(np.array([]))
    return cax
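
# Minimal usage sketch for plot_images (illustrative only): tiles 10 random
# 28x28 "digit" images into one grid; random pixels stand in for real data.
import matplotlib
import matplotlib.pyplot as plt
import numpy as np

demo_images = np.random.rand(10, 28 * 28)
fig, ax = plt.subplots(figsize=(6, 3))
plot_images(demo_images, ax, ims_per_row=5)
plt.show()
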
def get_binary_monte_carlo_probs(X, w, H, num_samples=100):
    """ Uses monte carlo approximation to get posterior predictive logistic regression probability with C classes.

    Parameters
    ----------
    X : array-like, shape (N, p)
       array of covariates
    w : array-like, shape (p, )
       array of fitted MAP parameters
    H : array-like, shape (p, p) or (p, )
       array of log posterior Hessian (covariance matrix of fitted MAP parameters)
    num_samples: number of samples to approximate the posterior


    Returns
    -------
    probs : array-like, shape (N, C)
       moderated (by full distribution) logistic probability
    preds : array-like, shape (N, )
        predicted classes ({0,1, ..., C})

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)
    """

    N, _ = X.shape

    if H.ndim == 2:
        # full Hessian: the posterior covariance is its inverse
        w_sample = np.random.multivariate_normal(w, np.linalg.inv(H), num_samples)
    elif H.ndim == 1:
        # diagonal Hessian: invert elementwise (EPS guards against division by zero)
        w_sample = np.random.multivariate_normal(w, np.diag(1 / (H + EPS)), num_samples)
    else:
        raise ValueError('Incompatible Hessian: expected shape (p, p) or (p, )')

    probs = np.mean(expit(X @ w_sample.T), axis=1)
    preds = np.int32(probs > 0.5)

    return probs, preds
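
# Illustrative call on synthetic binary data (a sketch; in practice w and H
# would come from a Laplace-approximated fit rather than being hand-picked).
import numpy as np

np.random.seed(0)
N_demo, p_demo = 200, 3
X_demo = np.random.randn(N_demo, p_demo)
w_map = np.array([0.5, -1.0, 2.0])  # stand-in MAP estimate
H_map = 10.0 * np.eye(p_demo)       # stand-in posterior Hessian

probs_demo, preds_demo = get_binary_monte_carlo_probs(X_demo, w_map, H_map, num_samples=500)
print(probs_demo[:5], preds_demo[:5])
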
# -------------- LOADING DATASET ------------------------
# load the images (load_mnist and npr are defined/imported elsewhere in the original module)
npr.seed(0)
_, train_images, train_labels, test_images, test_labels = load_mnist()

# shuffle the training set
rand_idx = np.arange(train_images.shape[0])
npr.shuffle(rand_idx)
train_images = train_images[rand_idx]
train_labels = train_labels[rand_idx]

# uniformly sample 30 training examples across the 10 digit classes
cls_labels = train_labels.argmax(axis=1)
cls_images = [train_images[cls_labels == i] for i in range(10)]
rand_cls = np.int32(npr.random(30) / 0.1)  # 30 class labels drawn uniformly from {0, ..., 9}
rand_idx = [npr.randint(cls_images[cls].shape[0]) for cls in rand_cls]  # random example within each chosen class

train_images = np.vstack(
    [cls_images[rand_cls[i]][rand_idx[i]] for i in range(30)])
train_labels = np.vstack([
    train_labels[cls_labels == rand_cls[i]][rand_idx[i]] for i in range(30)
])

# binarize pixel intensities to {0, 1}
train_images = np.round(train_images)
test_images = np.round(test_images)

# -------------- LOADING DATASET ------------------------
print('LOADED DATASET')
def fit(y, X, Wprior, H, solver='BFGS', use_autograd=True, bounds=None, maxiter=10000, disp=False):
    """ Bayesian Logistic Regression Solver.  Assumes Laplace (Gaussian) Approximation
    to the posterior of the fitted parameter vector. Uses scipy.optimize.minimize

    Parameters
    ----------
    y : array-like, shape (N, )
        vector of class labels ({0, 1, ..., C-1})
    X : array-like, shape (N, p)
        array of features
    Wprior : array-like, shape (C, p)
        array of prior means on the parameters to be fit
    H : array-like, shape (C*p, C*p), or (C, p, p) if independent between classes
        array of prior Hessian (inverse covariance of prior distribution of parameters)
    solver : string
        scipy.optimize solver to use; one of 'Newton-CG', 'BFGS' or 'L-BFGS-B'.
        The default is 'BFGS'.
    use_autograd : bool
        whether to use autograd's jacobian and hessian functions instead of the
        manually coded gradient and Hessian
    bounds : iterable of length C*p
        a list (or tuple) of tuples, each of length 2, giving
        (lower_bound, upper_bound) floats for each flattened parameter.
        Only used when the solver is 'L-BFGS-B'. See the
        scipy.optimize.minimize docs for further information.
    maxiter : int
        maximum number of iterations for the scipy.optimize.minimize solver.
    disp : bool
        whether to print convergence messages and additional information

    Returns
    -------
    W_results : array-like, shape (C, p)
        posterior parameters (MAP estimate)
    H_results : array-like, shape like `H`
        posterior Hessian  (Hessian of negative log posterior evaluated at MAP parameters)

    References
    ----------
    Chapter 8 of Murphy, K. 'Machine Learning a Probabilistic Perspective', MIT Press (2012)
    Chapter 4 of Bishop, C. 'Pattern Recognition and Machine Learning', Springer (2006)
    """

    # Check dimensionalities and data types

    # check X
    if len(X.shape) != 2:
        raise ValueError('X should be a matrix of shape (N, p)')
    (nX, pX) = X.shape
    if not np.issubdtype(X.dtype, np.floating):
        X = np.float32(X)

    # check y
    if len(y.shape) > 1:
        raise ValueError('y should be a vector of shape (N, )')
    if len(y) != nX:
        raise ValueError('y and X should have the same number of examples')
    if not np.issubdtype(y.dtype, np.integer):
        y = np.int32(y)

    # check Wprior
    if len(Wprior.shape) != 2:
        raise ValueError('prior mean should be a vector of shape (C, p)')
    cW, pW = Wprior.shape
    if cW == 1:
        raise ValueError('please use the binary logistic regression solver when the number of classes is 1')
    if pW != pX:
        raise ValueError('prior mean should have the same number of features as X')
    if not np.issubdtype(Wprior.dtype, np.floating):
        Wprior = np.float32(Wprior)

    # check H
    if len(H.shape) == 3:
        cH, pH1, pH2 = H.shape
        if cH != cW:
            raise ValueError('prior Hessian does not have the same number of classes as prior mean')
        if pH1 != pX:
            raise ValueError('prior Hessian does not have the same number of features as X')
        if pH1 != pH2:
            raise ValueError('prior Hessian should be a square matrix of shape (C, p, p)')
    elif len(H.shape) == 2:
        cpH1, cpH2 = H.shape
        if cpH1 != cpH2:
            raise ValueError('prior Hessian should be a square matrix of shape (C*p, C*p)')
        if cpH1 != pX * cW:
            raise ValueError('prior Hessian should have shape (C*p, C*p), matching the number of classes and features')
    else:
        raise ValueError('prior Hessian should be of shape (C*p, C*p) or (C, p, p)')
    if not np.issubdtype(H.dtype, np.floating):
        H = np.float32(H)

    if not has_autograd:
        use_autograd = False

    # choose between manually coded or autograd's jacobian and hessian functions
    # and use hessian product rather than hessian for newton-cg solver
    if use_autograd:
        jac_f = jacobian(_get_f_log_posterior)
        hess_f = hessian(_get_f_log_posterior)

    else:
        jac_f = _get_grad_log_post
        hess_f = _get_H_log_post

    # Do the regression
    if solver == 'Newton-CG':
        # Newton-CG can take a Hessian-vector product instead of the full Hessian
        hessp_f = lambda W1D, q, Wprior, H, y, X: hess_f(W1D, Wprior, H, y, X) @ q
        results = minimize(_get_f_log_posterior, Wprior.reshape(-1), args=(Wprior, H, y, X), jac=jac_f,
                           hessp=hessp_f, method='Newton-CG', options={'maxiter': maxiter, 'disp': disp})
    elif solver == 'BFGS':
        results = minimize(_get_f_log_posterior, Wprior.reshape(-1), args=(Wprior, H, y, X),
                           jac=jac_f, method='BFGS', options={'maxiter': maxiter, 'disp': disp})
    elif solver == 'L-BFGS-B':
        results = minimize(_get_f_log_posterior, Wprior.reshape(-1), args=(Wprior, H, y, X),
                           jac=jac_f, method='L-BFGS-B', bounds=bounds, options={'maxiter': maxiter, 'disp': disp})
    else:
        raise ValueError('Unknown solver specified: "{0}"'.format(solver))

    # MAP estimate and the Hessian of the negative log posterior at that estimate
    W_results1D = results.x
    H_results = hess_f(W_results1D, Wprior, H, y, X)
    W_results = W_results1D.reshape(Wprior.shape)

    return W_results, H_results
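
# End-to-end sketch for fit() on synthetic 3-class data (illustrative; with
# use_autograd=False it relies on the module's _get_grad_log_post and
# _get_H_log_post helpers referenced above).
import numpy as np

np.random.seed(0)
N_demo, C_demo, p_demo = 300, 3, 4
X_demo = np.random.randn(N_demo, p_demo)
W_true = np.random.randn(C_demo, p_demo)
# Gumbel-max trick: argmax of logits plus Gumbel noise samples from the softmax
logits = X_demo @ W_true.T
y_demo = np.argmax(logits + np.random.gumbel(size=logits.shape), axis=1)

Wprior_demo = np.zeros((C_demo, p_demo))
H_demo = np.eye(C_demo * p_demo)  # standard normal prior on the flattened weights

W_map, H_post = fit(y_demo, X_demo, Wprior_demo, H_demo, solver='BFGS', use_autograd=False)
print(W_map.shape, H_post.shape)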