Example #1
    def adjust(self, node):
        """Sample the node's angle and length from their posteriors given the data."""
        # Data positions relative to the parent node, as a 2 x N array
        D = np.vstack([d.pos for d in node.data])
        old = np.array([node.get_likelihood_angle(d) for d in node.data])
        old = np.log(old).mean()
        D -= node.parent.pos
        D = D.T

        # Posterior of the angle given the von Mises prior and the data
        x_ang = np.arctan2(D[1, :], D[0, :])
        R1 = self.angle_kappa * np.cos(node.parent.angle) + np.sum(np.cos(x_ang))
        R2 = self.angle_kappa * np.sin(node.parent.angle) + np.sum(np.sin(x_ang))
        mu = np.arctan2(R2, R1)
        Rn = R1 / np.cos(mu)
        node.angle = vonmises.rvs(Rn, loc=mu)

        X = D.copy()

        # Rotate the data into the frame of the newly drawn angle
        D = np.dot(rotmat(-node.angle), D)
        assert D.shape[1] == len(node.data)

        # Posterior of the length given the gamma prior and the data
        aN = self.length_a0 + D.shape[1] / 2.0
        bN = self.length_b0 + D.shape[1] / 2.0 * np.var(D[0, :])
        # Gamma(aN, rate=bN) posterior; scipy's gamma is parameterised by scale,
        # so draw with scale=1/bN (passing it as `loc` would only shift the
        # distribution).
        node.length = gamma.rvs(aN, scale=1.0 / bN)

        old_pos = node.pos.copy()
        node.update_position()

        def f():
            # Debugging helper: plot the data and the old/new node positions
            # relative to the parent.
            plt.plot(X[0, :], X[1, :], ".b")
            plt.plot(node.pos[0] - node.parent.pos[0],
                     node.pos[1] - node.parent.pos[1], "*r")
            plt.plot(old_pos[0] - node.parent.pos[0],
                     old_pos[1] - node.parent.pos[1], "*b")
            plt.show()

        new = np.array([node.get_likelihood_angle(d) for d in node.data])
        new = np.log(new).mean()
        print('old mean log-likelihood: %f     new mean log-likelihood: %f'
              % (old, new))
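
The angle and length draws above follow a common conjugate-style pattern: a von Mises prior over the direction is combined with the data angles to give posterior parameters (mu, Rn), and a gamma prior is updated with the squared deviations along the rotated axis to give (aN, bN). A minimal standalone sketch of that pattern, with made-up priors (kappa0, a0, b0) and synthetic data in place of the node/tree structures that adjust() assumes:

import numpy as np
from scipy.stats import vonmises, gamma

rng = np.random.default_rng(0)

# Synthetic 2 x N offsets playing the role of D in adjust()
D = rng.normal(loc=[2.0, 1.0], scale=0.3, size=(50, 2)).T

# von Mises posterior for the angle (hypothetical prior direction / concentration)
prior_angle, kappa0 = 0.0, 2.0
x_ang = np.arctan2(D[1, :], D[0, :])
R1 = kappa0 * np.cos(prior_angle) + np.cos(x_ang).sum()
R2 = kappa0 * np.sin(prior_angle) + np.sin(x_ang).sum()
mu = np.arctan2(R2, R1)
Rn = R1 / np.cos(mu)
angle = vonmises.rvs(Rn, loc=mu)

# Gamma posterior for the length from squared deviations along x (hypothetical prior a0, b0)
a0, b0 = 1.0, 1.0
aN = a0 + D.shape[1] / 2.0
bN = b0 + D.shape[1] / 2.0 * np.var(D[0, :])
length = gamma.rvs(aN, scale=1.0 / bN)

print(angle, length)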
Example #2
from scipy.stats import gamma


def infer_sample_rate(series):
  # Keep an untouched copy so None/zero entries can be masked out again below.
  seriesNone = list(series)
  series = list(series)  # we need a fresh copy!
  for i, s in enumerate(series):
    if s is None:
      series[i] = 0
    if series[i] == 0:
      series[i] = 0.001  # the gamma shape parameter must be strictly positive
  # One gamma draw per element, using the (patched) element value as the
  # shape parameter.
  rates = list(gamma.rvs(series, 1))
  for i, r in enumerate(rates):
    if seriesNone[i] is None or seriesNone[i] == 0:
      rates[i] = None
  return rates
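
A quick usage sketch, assuming the input is a list of per-interval counts in which None marks missing observations (the values are made up):

counts = [5, 0, None, 12, 3]
print(infer_sample_rate(counts))
# one random gamma rate per entry; the 0 and None positions come back as None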
Example #3
    def _initialise_posterior(self, data):

        D = self.basis.get_dim(data[0])

        # Initialise weights and covariances
        res = sgd(self._map,
                  self.__random.randn(D),
                  data,
                  maxiter=self.maxiter,
                  updater=self.updater,
                  batch_size=self.batch_size,
                  random_state=self.randstate)

        # Initialise each posterior component randomly around the MAP weights
        self.covariance = gamma.rvs(2, scale=0.5, size=(D, self.K))
        self.weights = res.x[:, np.newaxis] + \
            np.sqrt(self.covariance) * self.__random.rand(D, self.K)
        self.weights[:, 0] = res.x  # Make sure we include the MAP weights too
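
The same initialisation pattern in isolation, with a made-up MAP vector standing in for res.x (D, K and the random state here are illustrative):

import numpy as np
from scipy.stats import gamma

rng = np.random.RandomState(42)
D, K = 5, 4                        # hypothetical feature dimension and mixture size
map_weights = rng.randn(D)         # stands in for the SGD/MAP solution res.x

# Per-component posterior variances, then weights jittered around the MAP point
covariance = gamma.rvs(2, scale=0.5, size=(D, K), random_state=rng)
weights = map_weights[:, np.newaxis] + np.sqrt(covariance) * rng.rand(D, K)
weights[:, 0] = map_weights        # keep the MAP weights as one component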
Example #4
def learn(X, y, likelihood, lparams, basis, bparams, regulariser=1.,
          postcomp=10, use_sgd=True, maxit=1000, tol=1e-7, batchsize=100,
          rate=0.9, eta=1e-5, verbose=True):
    """
    Learn the parameters of a Bayesian generalised linear model (GLM).

    The learning algorithm uses nonparametric variational inference [1]_, and
    optionally stochastic gradients.

    Parameters
    ----------
        X: ndarray
            (N, d) array input dataset (N samples, d dimensions).
        y: ndarray
            (N,) array targets (N samples)
        likelihood: Object
            A likelihood object, see the likelihoods module.
        lparams: sequence
            a sequence of parameters for the likelihood object, e.g. the
            likelihoods.Gaussian object takes a variance parameter, so this
            should be :code:`[var]`.
        basis: Basis
            A basis object, see the basis_functions module.
        bparams: sequence
            A sequence of parameters of the basis object.
        regulariser: float, optional
            weight regulariser (variance) initial value.
        postcomp: int, optional
            Number of diagonal Gaussian components to use to approximate the
            posterior distribution.
        tol: float, optional
           Optimiser relative tolerance convergence criterion.
        use_sgd: bool, optional
            If :code:`True` then use SGD (Adadelta) optimisation instead of
            L-BFGS.
        maxit: int, optional
            Maximum number of iterations of the optimiser to run. If
            :code:`use_sgd` is :code:`True` then this is the number of complete
            passes through the data before optimization terminates (unless it
            converges first).
        batchsize: int, optional
            number of observations to use per SGD batch. Ignored if
            :code:`use_sgd=False`.
        rate: float, optional
            SGD decay rate, must be in [0, 1]. Ignored if :code:`use_sgd=False`.
        eta: float, optional
            Jitter term for adadelta SGD. Ignored if :code:`use_sgd=False`.
        verbose: bool, optional
            log the learning status.

    Returns
    -------
        m: ndarray
            (D, postcomp) array of posterior weight means (D is the dimension
            of the features).
        C: ndarray
            (D, postcomp) array of posterior weight variances.
        lparams: sequence
            learned sequence of likelihood object hyperparameters.
        bparams: sequence
            learned sequence of basis object hyperparameters.

    Notes
    -----
        This approximates the posterior distribution over the weights with
        a mixture of Gaussians:

        .. math ::

            \mathbf{w} \sim \\frac{1}{K} \sum^K_{k=1}
                \mathcal{N}(\mathbf{m_k}, \\boldsymbol{\Psi}_k)

        where,

        .. math ::

            \\boldsymbol{\Psi}_k = \\text{diag}([\Psi_{k,1}, \ldots,
                \Psi_{k,D}]).

        This is so arbitrary likelihoods can be used with this algorithm, while
        still maintaining flexible and tractable non-Gaussian posteriors.
        Additionally, this has the benefit of reducing the number of
        parameters to optimise (compared with full covariance Gaussians).

        The main differences between this implementation and the GLM in [1]_
        are:
            - We use diagonal mixtures, as opposed to isotropic.
            - We do not cycle between optimising eq. 10 and 11 (objectives L1
              and L2) in the paper. We use the full objective L2 for
              everything, including the posterior means, and we optimise all
              parameters together.

        Even though these changes make learning a little slower and require
        third derivatives of the likelihoods, we obtain better results and we
        can use SGD straightforwardly.
    """

    N, d = X.shape
    D = basis(np.atleast_2d(X[0, :]), *bparams).shape[1]
    K = postcomp

    # Pre-allocate here
    dm = np.zeros((D, K))
    dC = np.zeros((D, K))
    H = np.empty((D, K))

    # Objective function Eq. 10 from [1], and gradients of ALL params
    def L2(_m, _C, _reg, _lparams, *args):

        # Extract data, parameters, etc
        _bparams, y, X = args[:-1], args[-1][:, 0], args[-1][:, 1:]

        # Dimensions
        M, d = X.shape
        D, K = _m.shape
        B = N / M

        # Basis function stuff
        Phi = basis(X, *_bparams)  # M x D
        Phi2 = Phi**2
        Phi3 = Phi**3
        f = Phi.dot(_m)  # M x K
        df, d2f, d3f = np.zeros((M, K)), np.zeros((M, K)), np.zeros((M, K))

        # Posterior responsibility terms
        logqkk = _qmatrix(_m, _C)
        logqk = logsumexp(logqkk, axis=0)  # log term of Eq. 7 from [1]
        pz = np.exp(logqkk - logqk)

        # Loop through the posterior mixture components, accumulating terms
        ll = 0
        dlp = [np.zeros_like(p) for p in _lparams]

        for k in range(K):

            # Common likelihood calculations
            ll += B * likelihood.loglike(y, f[:, k], *_lparams).sum()
            df[:, k] = B * likelihood.df(y, f[:, k], *_lparams)
            d2f[:, k] = B * likelihood.d2f(y, f[:, k], *_lparams)
            d3f[:, k] = B * likelihood.d3f(y, f[:, k], *_lparams)
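            # Diagonal curvature term: likelihood second derivatives pushed
            # through the squared basis, minus the prior precision 1/_reg.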
            H[:, k] = d2f[:, k].dot(Phi2) - 1. / _reg

            # Posterior mean and covariance gradients
            mkmj = _m[:, k][:, np.newaxis] - _m
            iCkCj = 1 / (_C[:, k][:, np.newaxis] + _C)
            dC[:, k] = (-((mkmj * iCkCj)**2 - 2 * iCkCj).dot(pz[:, k])
                        + H[:, k]) / (2 * K)
            dm[:, k] = (df[:, k].dot(Phi)
                        + 0.5 * _C[:, k] * d3f[:, k].dot(Phi3)
                        + (iCkCj * mkmj).dot(pz[:, k])
                        - _m[:, k] / _reg) / K

            # Likelihood parameter gradients
            dp = likelihood.dp(y, f[:, k], *_lparams)
            dp2df = likelihood.dpd2f(y, f[:, k], *_lparams)
            for l in range(len(_lparams)):
                dpH = dp2df[l].dot(Phi2)
                dlp[l] -= B * (dp[l].sum() + 0.5 * (_C[:, k] * dpH).sum()) / K

        # Regulariser gradient
        dreg = (((_m**2).sum() + _C.sum()) / _reg**2 - D * K / _reg) / (2 * K)

        # Basis function parameter gradients
        def dtheta(dPhi):
            dt = 0
            dPhiPhi = dPhi * Phi
            for k in range(K):
                dPhimk = dPhi.dot(_m[:, k])
                dPhiH = d2f[:, k].dot(dPhiPhi) + \
                    0.5 * (d3f[:, k] * dPhimk).dot(Phi2)
                dt -= (df[:, k].dot(dPhimk) + (_C[:, k] * dPhiH).sum()) / K
            return dt

        dbp = apply_grad(dtheta, basis.grad(X, *_bparams))

        # Objective, Eq. 10 in [1]
        L2 = 1. / K * (ll
                       - 0.5 * D * K * np.log(2 * np.pi * _reg)
                       - 0.5 * (_m**2).sum() / _reg
                       + 0.5 * (_C * H).sum()
                       - logqk.sum() + np.log(K))

        if verbose:
            log.info("L2 = {}, reg = {}, lparams = {}, bparams = {}"
                     .format(L2, _reg, _lparams, _bparams))

        return -L2, append_or_extend([-dm, -dC, -dreg, dlp], dbp)

    # Initialise m and C
    m = np.random.randn(D, K) + np.arange(K) - K / 2
    C = gamma.rvs(2, scale=0.5, size=(D, K))

    bounds = [Bound(shape=m.shape),
              Positive(shape=C.shape),
              Positive(),
              likelihood.bounds]
    append_or_extend(bounds, basis.bounds)

    vparams = [m, C, regulariser, lparams] + bparams

    if use_sgd is False:
        nmin = structured_minimizer(logtrick_minimizer(minimize))
        res = nmin(L2, vparams, ftol=tol, maxiter=maxit,
                   method='L-BFGS-B', jac=True, bounds=bounds,
                   args=(np.hstack((y[:, np.newaxis], X)),))
    else:
        nsgd = structured_sgd(logtrick_sgd(sgd))
        res = nsgd(L2, vparams, np.hstack((y[:, np.newaxis], X)), rate=rate,
                   eta=eta, bounds=bounds, gtol=tol, passes=maxit,
                   batchsize=batchsize, eval_obj=True)

    (m, C, regulariser, lparams), bparams = res.x[:4], res.x[4:]

    if verbose:
        log.info("Finished! Objective = {}, reg = {}, lparams = {}, "
                 "bparams = {}, message: {}."
                 .format(-res.fun, regulariser, lparams, bparams, res.message))

    return m, C, lparams, bparams
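
A usage sketch based only on the signature and docstring above; the likelihood and basis constructors shown here are assumptions (see the likelihoods and basis_functions modules for the real ones), and the data are synthetic:

import numpy as np

# Synthetic regression data
N, d = 200, 3
X = np.random.randn(N, d)
y = X.dot(np.ones(d)) + 0.1 * np.random.randn(N)

likelihood = likelihoods.Gaussian()                 # assumed constructor
lparams = [1.0]                                     # initial noise variance, per the docstring
basis = basis_functions.LinearBasis(onescol=True)   # assumed constructor
bparams = []                                        # no basis hyperparameters in this sketch

m, C, lparams, bparams = learn(X, y, likelihood, lparams, basis, bparams,
                               postcomp=4, use_sgd=True, maxit=100)
# m, C are (D, postcomp) arrays of posterior weight means and variances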