Example #1
def test_expit():
    # Check numerical stability of expit (logistic function).

    # Simulate our previous Cython implementation, based on
    # http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression
    assert_almost_equal(expit(1000.), 1. / (1. + np.exp(-1000.)), decimal=16)
    assert_almost_equal(expit(-1000.), np.exp(-1000.) / (1. + np.exp(-1000.)),
                        decimal=16)

    x = np.arange(10)
    out = np.zeros_like(x, dtype=np.float32)
    assert_array_almost_equal(expit(x), expit(x, out=out))
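
The test above exercises expit at the extremes of its range. As a minimal, self-contained sketch (assuming only NumPy and SciPy; the wrapper name check_expit_stability is made up here), the same checks can be run outside a test suite:

# Minimal sketch of the stability check above; assumes NumPy and SciPy only.
import numpy as np
from scipy.special import expit
from numpy.testing import assert_almost_equal, assert_array_almost_equal

def check_expit_stability():
    # The naive form 1 / (1 + exp(-x)) overflows in exp() for large
    # negative x; expit evaluates the same function without that problem.
    assert_almost_equal(expit(1000.), 1.0, decimal=16)
    assert_almost_equal(expit(-1000.), 0.0, decimal=16)

    # expit is a ufunc, so out= writes the result in place (here float32).
    x = np.arange(10)
    out = np.zeros_like(x, dtype=np.float32)
    assert_array_almost_equal(expit(x), expit(x, out=out))

check_expit_stability()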
Example #3
    def _sample_visibles(self, h, temperature=1.0):
        """Sample from the distribution P(v|h).

        Parameters
        ----------
        h : array-like, shape (n_samples, n_components)
            Values of the hidden layer to sample from.

        Returns
        -------
        v : array-like, shape (n_samples, n_features)
            Values of the visible layer.
        """
        p = np.dot(h, self.components_ / temperature)
        p += self.intercept_visible_ / (min(1.0, temperature)
                                        if BIASED_PRIOR else temperature)
        expit(p, out=p)
        return (self.rng_.random_sample(size=p.shape) < p)
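
The pattern used in _sample_visibles above (sigmoid probabilities via expit, then a Bernoulli draw against uniform noise) recurs throughout these examples. A standalone sketch with made-up stand-ins for the class attributes (h, W, b_visible are illustrative, not the real parameters):

import numpy as np
from scipy.special import expit

rng = np.random.RandomState(0)
h = rng.random_sample((5, 3))               # hidden activations (n_samples, n_components)
W = rng.normal(size=(3, 8))                 # weights (n_components, n_features)
b_visible = np.zeros(8)                     # visible bias

p = expit(np.dot(h, W) + b_visible)         # P(v=1 | h), shape (n_samples, n_features)
v_sample = rng.random_sample(p.shape) < p   # boolean Bernoulli sample of the visibles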
Example #5
    def _fit(self, v_pos):
        """Inner fit for one mini-batch.
        Adjust the parameters to maximize the likelihood of v using
        Extended Mean Field theory (second order TAP equations).
        Parameters
        ----------
        v_pos : array-like, shape (n_samples, n_features)
            The data to use for training.
        """
        X_batch = v_pos
        lr = float(self.learning_rate) / X_batch.shape[0]
        decay = self.decay

        v_pos, h_pos, v_init, h_init = self.init_batch(X_batch)

        a = safe_sparse_dot(h_init, self.W, dense_output=True) + self.v_bias
        a = expit(a, out=a)

        # get_negative_samples
        v_neg, h_neg = self.equilibrate(v_init, h_init, iters=self.neq_steps)

        # basic gradient
        dW = self.weight_gradient(v_pos, h_pos, v_neg, h_neg)

        # regularization based on weight decay
        #   (applied to dW, similarly to the momentum term below)
        if self.weight_decay == "L1":
            dW -= decay * np.sign(self.W)
        elif self.weight_decay == "L2":
            dW -= decay * self.W

        # momentum term (could this use BLAS?)
        # note: a per-step learning-rate schedule is not handled yet
        dW += self.momentum * self.dW_prev
        # update
        self.W += lr * dW

        # store the update for the next iteration's momentum term
        # (note: this keeps an extra copy of dW in memory)
        self.dW_prev = dW

        # elementwise square of W, used by the second-order TAP updates
        # (note: this stores a second matrix the size of W)
        self.W2 = np.multiply(self.W, self.W)

        # update bias terms
        #   a sparse .sum(axis=0) returns a (1, n) matrix rather than a 1-D
        #   array, so np.asarray(...).squeeze() is used to flatten it
        #   (this could probably be optimized)
        self.v_bias += lr * (np.asarray(v_pos.sum(axis=0)).squeeze() -
                             np.asarray(v_neg.sum(axis=0)).squeeze())
        self.h_bias += lr * (np.asarray(h_pos.sum(axis=0)).squeeze() -
                             np.asarray(h_neg.sum(axis=0)).squeeze())

        return 0
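
The weight update in _fit above combines a gradient term, optional weight decay, and momentum. A bare sketch of just that update rule (learning rate, decay, and shapes are illustrative; the gradient is a random stand-in):

import numpy as np

rng = np.random.RandomState(0)
W = rng.normal(scale=0.01, size=(8, 3))   # weights
dW_prev = np.zeros_like(W)                # previous update, kept for momentum
lr, momentum, decay = 0.05, 0.9, 1e-4

dW = rng.normal(size=W.shape)             # stand-in for the batch gradient
dW -= decay * W                           # "L2" weight-decay branch
dW += momentum * dW_prev                  # momentum term
W += lr * dW                              # parameter update
dW_prev = dW                              # stored for the next mini-batch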
Example #6
    def _sample_visibles(self, h, rng):
        """Sample from the distribution P(v|h).

        Parameters
        ----------
        h : array-like, shape (n_samples, n_components)
            Values of the hidden layer to sample from.

        rng : RandomState
            Random number generator to use.

        Returns
        -------
        v : array-like, shape (n_samples, n_features)
            Values of the visible layer.
        """
        p = np.dot(h, self.components_)
        p += self.intercept_visible_
        expit(p, out=p)
        return (rng.random_sample(size=p.shape) < p)
Example #7
    def _mean_hiddens(self, v):
        """Computes the conditional probabilities P(h=1|v).

        Parameters
        ----------
        v : array-like, shape (n_samples, n_features)
            Values of the visible layer.

        Returns
        -------
        h : array-like, shape (n_samples, n_components)
            Corresponding mean field values for the hidden layer.
        """
        p = safe_sparse_dot(v, self.W.T) + self.h_bias
        return expit(p, out=p)
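
Example #7 (and several examples below) apply the sigmoid in place via expit(p, out=p), which avoids allocating a second array. A quick illustration of the idiom:

import numpy as np
from scipy.special import expit

p = np.linspace(-3.0, 3.0, 7)
buffer_id = id(p)
expit(p, out=p)            # p now holds the sigmoid of its former values
assert id(p) == buffer_id  # no new array was allocated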
Example #8
    def _mean_visible(self, h):
        """Computes the conditional probabilities P(v=1|h).

        Parameters
        ----------
        h : array-like, shape (n_samples, n_components)
            Corresponding mean field values for the hidden layer.

        Returns
        -------
        v : array-like, shape (n_samples, n_features)
            Values of the visible layer.
        """
        # p = np.dot(h, self.W) + self.v_bias
        p = safe_sparse_dot(h, self.W) + self.v_bias
        return expit(p, out=p)
Example #9
    def _mean_hiddens(self, v, temperature=1.0):
        """Computes the probabilities P(h=1|v).

        Parameters
        ----------
        v : array-like, shape (n_samples, n_features)
            Values of the visible layer.

        Returns
        -------
        h : array-like, shape (n_samples, n_components)
            Corresponding mean field values for the hidden layer.
        """
        p = safe_sparse_dot(v, self.components_.T / temperature)
        p += self.intercept_hidden_ / (min(1.0, temperature)
                                       if BIASED_PRIOR else temperature)
        return expit(p, out=p)
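
The temperature argument divides the activations before the sigmoid; temperatures above 1 flatten the resulting probabilities toward 0.5. A small standalone illustration (the values are arbitrary):

import numpy as np
from scipy.special import expit

a = np.array([-4.0, -1.0, 0.0, 1.0, 4.0])
for T in (1.0, 2.0, 10.0):
    print(T, np.round(expit(a / T), 3))   # higher T -> probabilities closer to 0.5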
Example #11
    def mh_update(self, v, h):
        """update TAP hidden magnetizations, to second order"""
        a = safe_sparse_dot(v, self.W.T) + self.h_bias

        v_fluc = (v - (np.multiply(v, v)))
        #a += (v-v*v).dot((self.W2).T)*(0.5-h)

        if issparse(h):
            h_half = (0.5 - h.todense())
        else:
            h_half = (0.5 - h)

        a += np.multiply(safe_sparse_dot(v_fluc, self.W2.T), h_half)

        return expit(a, out=a)
Example #12
    def _mean_hiddens(self, v):
        """Computes the probabilities P(h=1|v).

        Parameters
        ----------
        v : array-like, shape (n_samples, n_features)
            Values of the visible layer.

        Returns
        -------
        h : array-like, shape (n_samples, n_components)
            Corresponding mean field values for the hidden layer.
        """
        p = safe_sparse_dot(v, self.components_.T)
        p += self.intercept_hidden_
        return expit(p, out=p)
Example #13
    def fgrad(we, X, y, l1, l2):
        nsamples, nfactors = X.shape

        w0 = we[0]
        w = we[1:(nfactors + 1)] - we[(nfactors + 1):]
        yz = y * (safe_sparse_dot(X, w) + w0)
        f = -np.sum(log_logistic(yz)) + l1 * np.sum(
            we[1:]) + 0.5 * l2 * np.dot(w, w)

        e = (expit(yz) - 1) * y
        g = safe_sparse_dot(X.T, e) + l2 * w
        g0 = np.sum(e)

        grad = np.concatenate([g, -g]) + l1
        grad = np.insert(grad, 0, g0)

        return f, grad
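
fgrad above returns the objective and its analytic gradient for the split-weight (w = w_plus - w_minus) L1/L2-regularized logistic loss, so it can be compared against finite differences. A sketch of such a check, assuming fgrad is available as a standalone function and that log_logistic and safe_sparse_dot from sklearn.utils.extmath are in scope:

import numpy as np
from scipy.optimize import approx_fprime

rng = np.random.RandomState(0)
X = rng.normal(size=(20, 5))
y = np.sign(rng.normal(size=20))                       # labels in {-1, +1}
we = rng.uniform(0.01, 0.1, size=2 * X.shape[1] + 1)   # [w0, w_plus, w_minus]

f_only = lambda w: fgrad(w, X, y, 0.1, 0.1)[0]
analytic = fgrad(we, X, y, 0.1, 0.1)[1]
numeric = approx_fprime(we, f_only, 1e-6)
print(np.max(np.abs(analytic - numeric)))              # should be small (~1e-5 or less)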
Example #14
    def mv_update(self, v, h):
        """update TAP visbile magnetizations, to second order"""

        # a = np.dot(h, self.W) + self.v_bias
        a = safe_sparse_dot(h, self.W) + self.v_bias

        h_fluc = h - np.multiply(h, h)
        #a += h_fluc.dot(self.W2)*(0.5-v)

        # 0.5-v is elementwise => dense
        if issparse(v):
            v_half = (0.5 - v.todense())
        else:
            v_half = (0.5 - v)

        a += np.multiply(safe_sparse_dot(h_fluc, self.W2), v_half)
        return expit(a, out=a)
Example #15
def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None):
    """Computes the logistic loss and gradient.
    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    Returns
    -------
    out : float
        Logistic loss.
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    _, n_features = X.shape
    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y

    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum()
    return out, grad
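
_logistic_loss_and_grad depends on two scikit-learn private helpers that are not shown in the snippet. Roughly, they behave like the sketch below (a simplified stand-in for illustration, not the library's exact code; log_logistic(x) is the numerically stable log of the sigmoid, also available as scipy.special.log_expit):

import numpy as np
from sklearn.utils.extmath import safe_sparse_dot


def _intercept_dot(w, X, y):
    """Split off a trailing intercept, if any; return (w, c, y * (X @ w + c))."""
    c = 0.0
    if w.size == X.shape[1] + 1:
        c = w[-1]
        w = w[:-1]
    z = safe_sparse_dot(X, w) + c
    return w, c, y * z


def log_logistic(x):
    """Stable log(expit(x)) for an ndarray input."""
    out = np.empty_like(x, dtype=float)
    pos = x > 0
    out[pos] = -np.log1p(np.exp(-x[pos]))
    out[~pos] = x[~pos] - np.log1p(np.exp(x[~pos]))
    return out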
Example #17
    def dloss(self, y, pred):
        return -y * expit(-y * pred)
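
dloss is the derivative of the logistic loss log(1 + exp(-y * pred)) with respect to the prediction. A quick numerical check (the values are arbitrary):

import numpy as np
from scipy.special import expit
from scipy.optimize import approx_fprime

y, pred = 1.0, 0.3
loss = lambda p: np.log1p(np.exp(-y * p[0]))
analytic = -y * expit(-y * pred)
numeric = approx_fprime(np.array([pred]), loss, 1e-8)[0]
assert abs(analytic - numeric) < 1e-6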
Example #18
    def _fit(self, v_pos, rng, targets, epoch_num=1):
        """Inner fit for one mini-batch.

        Adjust the parameters to maximize the likelihood of v using
        Stochastic Maximum Likelihood (SML).

        Parameters
        ----------
        v_pos : array-like, shape (n_samples, n_features)
            The data to use for training.

        rng : RandomState
            Random number generator to use for sampling.

        targets : array-like, shape (n_samples, n_unique_labels)
            Labeled data.
        """
        momentum = 0.7

        ######## VISIBLE POSITIVE TO HIDDEN POSITIVE PHASE  ##########
        h_pos = self._mean_hiddens(v_pos, targets)

        ######## HIDDEN POSITIVE TO VISIBLE NEGATIVE PHASE  ##########
        batch_count = len(v_pos)
        self.target_bias_matrix = numpy.tile(self.target_bias_, (batch_count, 1))
        self.visible_bias_matrix = numpy.tile(self.intercept_visible_, (batch_count, 1))
        self.hidden_bias_matrix = numpy.tile(self.intercept_hidden_, (batch_count, 1))
        neg_lab_states = self.target_bias_matrix * 0.

        if targets.sum() != 0:
            temp_h_pos = h_pos
            for j in range(self.cd_iter):
                ## positive hidden label states from previous positive hidden probabilities
                pos_hid_states = temp_h_pos > numpy.random.random(size=h_pos.shape)

                neg_lab_prob = numpy.exp(numpy.dot(pos_hid_states, self.target_components_.T) + self.target_bias_matrix)
                neg_lab_prob = neg_lab_prob / neg_lab_prob.sum(axis=1).reshape(batch_count, 1)

                cum_probs = numpy.cumsum(neg_lab_prob, axis=1)
                sampling = cum_probs > rng.uniform(0, 1., (batch_count, 1))

                neg_lab_states = numpy.zeros(self.target_bias_matrix.shape, dtype=float)
                for row, s in enumerate(sampling):
                    try:
                        index = min(numpy.where(s)[0])
                        neg_lab_states[row, index] = 1

                    except ValueError:
                        sys.exit(1)

                v_neg = expit(numpy.dot(pos_hid_states, self.components_.T) + self.intercept_visible_)
                v_neg_states = v_neg > numpy.random.uniform(0., 1., v_neg.shape)   ## given _sample_visibles this line is not needed
                temp_h_pos = self._mean_hiddens(v_neg_states, neg_lab_states)

            h_neg = temp_h_pos
        else:
            h_pos_states = h_pos > numpy.random.random(h_pos.shape)
            if self.regularization_mu is not None:
                ## force sparsity by pressuring hidden units to turn on ##
                sparse_h_pos = self.regularization(h_pos, self.regularization_mu, axis=0)

                # this will force selectivity
                #sparse_h_pos = self.regularization(sparse_h_pos, self.regularization_mu, axis=1)

                h_pos = (1 - self.phi) * sparse_h_pos + self.phi * h_pos
                h_pos_states = h_pos

            ## visible negative must be a function of hidden positive states
            v_neg = expit(numpy.dot(h_pos_states, self.components_.T) + self.intercept_visible_)

            ######## VISIBLE NEGATIVE TO HIDDEN NEGATIVE PHASE #########
            h_neg = expit(numpy.dot(v_neg, self.components_) + self.intercept_hidden_)

        err = numpy.sum(numpy.square((v_neg - v_pos)))

        ## compute learning rates by dividing by batch size
        lr = float(self.learning_rate) / v_pos.shape[0]
        lr_bias = float(self.learning_rate_bias) / v_pos.shape[0]

        ######## Update Components and Bias Units ########
        update_comp = safe_sparse_dot(v_pos.T, h_pos, dense_output=True)
        update_comp -= safe_sparse_dot(v_neg.T, h_neg, dense_output=True)
        update_comp -= self.weight_cost * v_pos.shape[0] * self.components_  # weight decay
        self.vishidinc = lr * update_comp + self.vishidinc * momentum
        self.components_ += self.vishidinc

        update_comp_lab = safe_sparse_dot(targets.T, h_pos, dense_output=True)
        update_comp_lab -= safe_sparse_dot(neg_lab_states.T, h_neg, dense_output=True)
        update_comp_lab -= self.weight_cost * v_neg.shape[0] * self.target_components_
        self.target_components_ += lr * update_comp_lab

        self.hidbiasinc = momentum * self.hidbiasinc + lr_bias * (h_pos.sum(axis=0) - h_neg.sum(axis=0))
        self.intercept_hidden_ += self.hidbiasinc

        self.visbiasinc = momentum * self.visbiasinc + lr_bias * (v_pos.sum(axis=0) - v_neg.sum(axis=0))
        self.intercept_visible_ += self.visbiasinc

        self.target_bias_ += lr_bias * (targets.sum(axis=0) - neg_lab_states.sum(axis=0))
        return err
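
Stripped of the label units, sparsity regularization, and momentum bookkeeping, the update above reduces to a standard contrastive-divergence (CD-1) step. A bare-bones, self-contained sketch with illustrative shapes and values:

import numpy as np
from scipy.special import expit

rng = np.random.RandomState(0)
n_features, n_components, lr = 6, 3, 0.05
W = rng.normal(scale=0.01, size=(n_components, n_features))
b_v, b_h = np.zeros(n_features), np.zeros(n_components)

v_pos = rng.randint(0, 2, size=(10, n_features)).astype(float)

h_pos = expit(np.dot(v_pos, W.T) + b_h)                       # positive phase
h_states = (rng.random_sample(h_pos.shape) < h_pos).astype(float)
v_neg = expit(np.dot(h_states, W) + b_v)                      # reconstruction
h_neg = expit(np.dot(v_neg, W.T) + b_h)                       # negative phase

W += lr * (np.dot(h_pos.T, v_pos) - np.dot(h_neg.T, v_neg)) / len(v_pos)
b_v += lr * (v_pos - v_neg).mean(axis=0)
b_h += lr * (h_pos - h_neg).mean(axis=0)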
Example #19
def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
    """Computes the gradient and the Hessian, in the case of a logistic loss.
    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.
    y : ndarray, shape (n_samples,)
        Array of labels.
    alpha : float
        Regularization parameter. alpha is equal to 1 / C.
    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.
    Returns
    -------
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    Hs : callable
        Function that takes the gradient as a parameter and returns the
        matrix product of the Hessian and gradient.
    """
    n_samples, n_features = X.shape
    grad = np.empty_like(w)
    fit_intercept = grad.shape[0] > n_features

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y

    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if fit_intercept:
        grad[-1] = z0.sum()

    # The mat-vec product of the Hessian
    d = sample_weight * z * (1 - z)
    if sparse.issparse(X):
        dX = safe_sparse_dot(
            sparse.dia_matrix((d, 0), shape=(n_samples, n_samples)), X)
    else:
        # Precompute as much as possible
        dX = d[:, np.newaxis] * X

    if fit_intercept:
        # Calculate the double derivative with respect to intercept
        # In the case of sparse matrices this returns a matrix object.
        dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

    def Hs(s):
        ret = np.empty_like(s)
        ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
        ret[:n_features] += alpha * s[:n_features]

        # For the fit intercept case.
        if fit_intercept:
            ret[:n_features] += s[-1] * dd_intercept
            ret[-1] = dd_intercept.dot(s[:n_features])
            ret[-1] += d.sum() * s[-1]
        return ret

    return grad, Hs
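
The Hs closure returned here is a Hessian-vector product, which is exactly the interface iterative solvers expect. A toy illustration of how such a callable is consumed, using a fixed positive-definite matrix as a stand-in for the logistic Hessian:

import numpy as np
from scipy.sparse.linalg import LinearOperator, cg

rng = np.random.RandomState(0)
A = rng.normal(size=(5, 5))
H = np.dot(A, A.T) + 5 * np.eye(5)     # SPD stand-in for the Hessian
grad = rng.normal(size=5)

Hs = lambda s: np.dot(H, s)            # Hessian-vector product, like the closure above
op = LinearOperator((5, 5), matvec=Hs)
step, info = cg(op, -grad)             # Newton-CG style step: solve H @ step = -grad
assert info == 0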
Example #20
    def predict(self, X):
        # integer division: self.we holds [w0, w_plus, w_minus]
        n = (len(self.we) - 1) // 2
        w0 = self.we[0]
        w = self.we[1:n + 1] - self.we[n + 1:]
        return expit(w0 + safe_sparse_dot(X, w))
Example #22
    def sigma_means(self, x, b, W):
        """Helper for computing expit(x W.T + b)."""
        a = safe_sparse_dot(x, W.T) + b
        return expit(a, out=a)