def test_expit():
    # Check numerical stability of expit (logistic function).
    # Simulate our previous Cython implementation, based on
    # http://fa.bianp.net/blog/2013/numerical-optimizers-for-logistic-regression
    assert_almost_equal(expit(1000.), 1. / (1. + np.exp(-1000.)), decimal=16)
    assert_almost_equal(expit(-1000.), np.exp(-1000.) / (1. + np.exp(-1000.)),
                        decimal=16)

    x = np.arange(10)
    out = np.zeros_like(x, dtype=np.float32)
    assert_array_almost_equal(expit(x), expit(x, out=out))
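# Standalone illustration (not part of the test above): scipy.special.expit
# evaluates the logistic function stably over the whole real line, whereas the
# naive formula 1 / (1 + exp(-x)) overflows inside exp() for large negative x.
import numpy as np
from scipy.special import expit

def naive_sigmoid(x):
    return 1. / (1. + np.exp(-x))

x = np.array([-1000., -30., 0., 30., 1000.])
print(expit(x))          # stable: [0., ~9.4e-14, 0.5, ~1., 1.]
print(naive_sigmoid(x))  # emits an overflow RuntimeWarning at x = -1000.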
def _sample_visibles(self, h, temperature=1.0):
    """Sample from the distribution P(v|h).

    Parameters
    ----------
    h : array-like, shape (n_samples, n_components)
        Values of the hidden layer to sample from.

    Returns
    -------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.
    """
    p = np.dot(h, self.components_ / temperature)
    p += self.intercept_visible_ / (min(1.0, temperature)
                                    if BIASED_PRIOR else temperature)
    expit(p, out=p)
    return (self.rng_.random_sample(size=p.shape) < p)
def _fit(self, v_pos):
    """Inner fit for one mini-batch.

    Adjust the parameters to maximize the likelihood of v using
    Extended Mean Field theory (second order TAP equations).

    Parameters
    ----------
    v_pos : array-like, shape (n_samples, n_features)
        The data to use for training.
    """
    X_batch = v_pos
    lr = float(self.learning_rate) / X_batch.shape[0]
    decay = self.decay

    v_pos, h_pos, v_init, h_init = self.init_batch(X_batch)

    a = safe_sparse_dot(h_init, self.W, dense_output=True) + self.v_bias
    a = expit(a, out=a)

    # get_negative_samples
    v_neg, h_neg = self.equilibrate(v_init, h_init, iters=self.neq_steps)

    # basic gradient
    dW = self.weight_gradient(v_pos, h_pos, v_neg, h_neg)

    # regularization based on weight decay, applied similarly to momentum
    if self.weight_decay == "L1":
        dW -= decay * np.sign(self.W)
    elif self.weight_decay == "L2":
        dW -= decay * self.W  # can we use BLAS here?

    # momentum
    # note: what do we do if lr changes per step? not ready yet
    dW += self.momentum * self.dW_prev

    # update
    self.W += lr * dW

    # storage for next iteration
    # is this a memory killer?
    self.dW_prev = dW

    # is this wasteful... can we avoid storing 2X the W matrix?
    # elementwise multiply
    self.W2 = np.multiply(self.W, self.W)

    # update bias terms
    # csr matrix sum is screwy: it returns a [[1, self.n_components]] 2-d array,
    # so always use np.asarray(X.sum(axis=0)).squeeze()
    # although (I think) this could be optimized
    self.v_bias += lr * (np.asarray(v_pos.sum(axis=0)).squeeze() -
                         np.asarray(v_neg.sum(axis=0)).squeeze())
    self.h_bias += lr * (np.asarray(h_pos.sum(axis=0)).squeeze() -
                         np.asarray(h_neg.sum(axis=0)).squeeze())

    return 0
def _sample_visibles(self, h, rng):
    """Sample from the distribution P(v|h).

    Parameters
    ----------
    h : array-like, shape (n_samples, n_components)
        Values of the hidden layer to sample from.

    rng : RandomState
        Random number generator to use.

    Returns
    -------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.
    """
    p = np.dot(h, self.components_)
    p += self.intercept_visible_
    expit(p, out=p)
    return (rng.random_sample(size=p.shape) < p)
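# Illustrative sketch (assumed shapes and names, not the class above): one full
# Gibbs step for a Bernoulli RBM, combining the mean-hidden and sample-visible
# operations used by these methods.
import numpy as np
from scipy.special import expit

rng = np.random.RandomState(0)
n_samples, n_features, n_components = 10, 6, 4
W = 0.01 * rng.randn(n_components, n_features)   # plays the role of components_
b_h = np.zeros(n_components)                     # plays the role of intercept_hidden_
b_v = np.zeros(n_features)                       # plays the role of intercept_visible_

v = (rng.rand(n_samples, n_features) < 0.5).astype(float)
h_mean = expit(np.dot(v, W.T) + b_h)                # P(h=1 | v)
h = rng.random_sample(h_mean.shape) < h_mean        # sample hidden units
v_mean = expit(np.dot(h, W) + b_v)                  # P(v=1 | h)
v_new = rng.random_sample(v_mean.shape) < v_mean    # sample visible units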
def _mean_hiddens(self, v):
    """Computes the conditional probabilities P(h=1|v).

    Parameters
    ----------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.
    """
    p = safe_sparse_dot(v, self.W.T) + self.h_bias
    return expit(p, out=p)
def _mean_visible(self, h):
    """Computes the conditional probabilities P(v=1|h).

    Parameters
    ----------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.

    Returns
    -------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.
    """
    #p = np.dot(h, self.W) + self.v_bias
    p = safe_sparse_dot(h, self.W) + self.v_bias
    return expit(p, out=p)
def _mean_hiddens(self, v, temperature=1.0):
    """Computes the probabilities P(h=1|v).

    Parameters
    ----------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.
    """
    p = safe_sparse_dot(v, self.components_.T / temperature)
    p += self.intercept_hidden_ / (min(1.0, temperature)
                                   if BIASED_PRIOR else temperature)
    return expit(p, out=p)
def mh_update(self, v, h):
    """Update TAP hidden magnetizations, to second order."""
    a = safe_sparse_dot(v, self.W.T) + self.h_bias
    v_fluc = (v - (np.multiply(v, v)))
    #a += (v - v*v).dot((self.W2).T) * (0.5 - h)
    if issparse(h):
        h_half = (0.5 - h.todense())
    else:
        h_half = (0.5 - h)
    a += np.multiply(safe_sparse_dot(v_fluc, self.W2.T), h_half)
    return expit(a, out=a)
def _mean_hiddens(self, v):
    """Computes the probabilities P(h=1|v).

    Parameters
    ----------
    v : array-like, shape (n_samples, n_features)
        Values of the visible layer.

    Returns
    -------
    h : array-like, shape (n_samples, n_components)
        Corresponding mean field values for the hidden layer.
    """
    p = safe_sparse_dot(v, self.components_.T)
    p += self.intercept_hidden_
    return expit(p, out=p)
def fgrad(we, X, y, l1, l2):
    nsamples, nfactors = X.shape
    w0 = we[0]
    w = we[1:(nfactors + 1)] - we[(nfactors + 1):]
    yz = y * (safe_sparse_dot(X, w) + w0)
    f = -np.sum(log_logistic(yz)) + l1 * np.sum(we[1:]) + 0.5 * l2 * np.dot(w, w)
    e = (expit(yz) - 1) * y
    g = safe_sparse_dot(X.T, e) + l2 * w
    g0 = np.sum(e)
    grad = np.concatenate([g, -g]) + l1
    grad = np.insert(grad, 0, g0)
    return f, grad
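# Context sketch (a self-contained analogue, not code from the project above):
# fgrad encodes an L1-penalized logistic loss by splitting the weights into
# non-negative parts, w = w_plus - w_minus, so the L1 term becomes linear and a
# box-constrained solver such as L-BFGS-B can minimize it.
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit

def split_logistic_objective(we, X, y, l1, l2):
    # we = [w0, w_plus, w_minus]; the effective weights are w = w_plus - w_minus.
    n_features = X.shape[1]
    w0 = we[0]
    w = we[1:n_features + 1] - we[n_features + 1:]
    yz = y * (X.dot(w) + w0)
    # log_logistic(t) == -logaddexp(0, -t), a stable log of the sigmoid
    f = np.sum(np.logaddexp(0, -yz)) + l1 * np.sum(we[1:]) + 0.5 * l2 * np.dot(w, w)
    e = (expit(yz) - 1) * y
    g = X.T.dot(e) + l2 * w
    grad = np.concatenate([[np.sum(e)], g + l1, -g + l1])
    return f, grad

rng = np.random.RandomState(0)
X = rng.randn(50, 5)
y = np.sign(X[:, 0] + 0.1 * rng.randn(50))
we0 = np.zeros(2 * X.shape[1] + 1)
bounds = [(None, None)] + [(0, None)] * (2 * X.shape[1])  # only the split weights are constrained
res = minimize(split_logistic_objective, we0, args=(X, y, 0.1, 0.01),
               jac=True, method="L-BFGS-B", bounds=bounds)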
def mv_update(self, v, h):
    """Update TAP visible magnetizations, to second order."""
    # a = np.dot(h, self.W) + self.v_bias
    a = safe_sparse_dot(h, self.W) + self.v_bias
    h_fluc = h - np.multiply(h, h)
    #a += h_fluc.dot(self.W2) * (0.5 - v)
    # 0.5 - v is elementwise => dense
    if issparse(v):
        v_half = (0.5 - v.todense())
    else:
        v_half = (0.5 - v)
    a += np.multiply(safe_sparse_dot(h_fluc, self.W2), v_half)
    return expit(a, out=a)
def _logistic_loss_and_grad(w, X, y, alpha, sample_weight=None):
    """Computes the logistic loss and gradient.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    out : float
        Logistic loss.

    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.
    """
    _, n_features = X.shape
    grad = np.empty_like(w)

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    # Logistic loss is the negative of the log of the logistic function.
    out = -np.sum(sample_weight * log_logistic(yz)) + .5 * alpha * np.dot(w, w)

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y

    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if grad.shape[0] > n_features:
        grad[-1] = z0.sum()
    return out, grad
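# Quick numerical check (standalone sketch; the helpers here are illustrative,
# not sklearn's _intercept_dot / log_logistic internals): the gradient formula
# X.T @ ((expit(yz) - 1) * y) + alpha * w used above agrees with finite
# differences of the regularized logistic loss.
import numpy as np
from scipy.optimize import check_grad
from scipy.special import expit

def loss(w, X, y, alpha):
    yz = y * X.dot(w)
    return np.sum(np.logaddexp(0, -yz)) + 0.5 * alpha * np.dot(w, w)

def grad(w, X, y, alpha):
    yz = y * X.dot(w)
    return X.T.dot((expit(yz) - 1) * y) + alpha * w

rng = np.random.RandomState(0)
X, y = rng.randn(20, 3), np.sign(rng.randn(20))
print(check_grad(loss, grad, rng.randn(3), X, y, 1.0))  # should be small (~1e-6)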
def dloss(self, y, pred):
    return (-y * expit(-y * pred))
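# dloss above is the derivative of the log-logistic loss with respect to the
# raw prediction: d/dp log(1 + exp(-y * p)) = -y * expit(-y * p). A standalone
# finite-difference sanity check (names here are illustrative):
import numpy as np
from scipy.special import expit

y, p, eps = 1.0, 0.3, 1e-6
loss = lambda p: np.logaddexp(0, -y * p)
print((loss(p + eps) - loss(p)) / eps)  # numerical derivative
print(-y * expit(-y * p))               # closed form, approximately equal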
def _fit(self, v_pos, rng, targets, epoch_num=1):
    """Inner fit for one mini-batch.

    Adjust the parameters to maximize the likelihood of v using
    Stochastic Maximum Likelihood (SML).

    Parameters
    ----------
    v_pos : array-like, shape (n_samples, n_features)
        The data to use for training.

    rng : RandomState
        Random number generator to use for sampling.

    targets : array-like, shape (n_samples, unique_labels)
        Labeled data.
    """
    momentum = 0.7

    ######## VISIBLE POSITIVE TO HIDDEN POSITIVE PHASE ##########
    h_pos = self._mean_hiddens(v_pos, targets)

    ######## HIDDEN POSITIVE TO VISIBLE NEGATIVE PHASE ##########
    batch_count = len(v_pos)
    self.target_bias_matrix = numpy.tile(self.target_bias_, (batch_count, 1))
    self.visible_bias_matrix = numpy.tile(self.intercept_visible_, (batch_count, 1))
    self.hidden_bias_matrix = numpy.tile(self.intercept_hidden_, (batch_count, 1))

    neg_lab_states = self.target_bias_matrix * 0.
    if targets.sum() != 0:
        temp_h_pos = h_pos
        for j in range(self.cd_iter):
            ## positive hidden label states from previous positive hidden probabilities
            pos_hid_states = temp_h_pos > numpy.random.random(size=h_pos.shape)

            neg_lab_prob = numpy.exp(numpy.dot(pos_hid_states, self.target_components_.T) +
                                     self.target_bias_matrix)
            neg_lab_prob = neg_lab_prob / neg_lab_prob.sum(axis=1).reshape(batch_count, 1)

            cum_probs = numpy.cumsum(neg_lab_prob, axis=1)
            sampling = cum_probs > rng.uniform(0, 1., (batch_count, 1))
            neg_lab_states = numpy.zeros(self.target_bias_matrix.shape, dtype=float)
            for i, s in enumerate(sampling):
                try:
                    index = min(numpy.where(s)[0])
                    neg_lab_states[i, index] = 1
                except ValueError:
                    sys.exit(1)

            v_neg = expit(numpy.dot(pos_hid_states, self.components_.T) +
                          self.intercept_visible_)
            v_neg_states = v_neg > numpy.random.uniform(0., 1., v_neg.shape)
            ## given _sample_visibles this line is not needed
            temp_h_pos = self._mean_hiddens(v_neg_states, neg_lab_states)
        h_neg = temp_h_pos
    else:
        h_pos_states = h_pos > numpy.random.random(h_pos.shape)
        if self.regularization_mu is not None:
            ## force sparsity by pressuring hidden units to turn on
            sparse_h_pos = self.regularization(h_pos, self.regularization_mu, axis=0)
            # this will force selectivity
            #sparse_h_pos = self.regularization(sparse_h_pos, self.regularization_mu, axis=1)
            h_pos = (1 - self.phi) * sparse_h_pos + self.phi * h_pos
            h_pos_states = h_pos

        ## visible negative must be a function of hidden positive states
        v_neg = expit(numpy.dot(h_pos_states, self.components_.T) +
                      self.intercept_visible_)

        ######## VISIBLE NEGATIVE TO HIDDEN NEGATIVE PHASE #########
        h_neg = expit(numpy.dot(v_neg, self.components_) + self.intercept_hidden_)

    err = numpy.sum(numpy.square((v_neg - v_pos)))

    ## compute learning rates by dividing by batch size
    lr = float(self.learning_rate) / v_pos.shape[0]
    lr_bias = float(self.learning_rate_bias) / v_pos.shape[0]

    ######## Update Components and Bias Units ########
    update_comp = safe_sparse_dot(v_pos.T, h_pos, dense_output=True)
    update_comp -= safe_sparse_dot(v_neg.T, h_neg, dense_output=True)
    update_comp -= self.weight_cost * v_pos.shape[0] * self.components_  # weight decay

    self.vishidinc = lr * update_comp + self.vishidinc * momentum
    self.components_ += self.vishidinc

    update_comp_lab = safe_sparse_dot(targets.T, h_pos, dense_output=True)
    update_comp_lab -= safe_sparse_dot(neg_lab_states.T, h_neg, dense_output=True)
    update_comp_lab -= self.weight_cost * v_neg.shape[0] * self.target_components_
    self.target_components_ += lr * update_comp_lab

    self.hidbiasinc = momentum * self.hidbiasinc + lr_bias * (h_pos.sum(axis=0) -
                                                              h_neg.sum(axis=0))
    self.intercept_hidden_ += self.hidbiasinc

    self.visbiasinc = momentum * self.visbiasinc + lr_bias * (v_pos.sum(axis=0) -
                                                              v_neg.sum(axis=0))
    self.intercept_visible_ += self.visbiasinc

    self.target_bias_ += lr_bias * (targets.sum(axis=0) - neg_lab_states.sum(axis=0))

    return err
def _logistic_grad_hess(w, X, y, alpha, sample_weight=None):
    """Computes the gradient and the Hessian, in the case of a logistic loss.

    Parameters
    ----------
    w : ndarray, shape (n_features,) or (n_features + 1,)
        Coefficient vector.

    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    y : ndarray, shape (n_samples,)
        Array of labels.

    alpha : float
        Regularization parameter. alpha is equal to 1 / C.

    sample_weight : array-like, shape (n_samples,) optional
        Array of weights that are assigned to individual samples.
        If not provided, then each sample is given unit weight.

    Returns
    -------
    grad : ndarray, shape (n_features,) or (n_features + 1,)
        Logistic gradient.

    Hs : callable
        Function that takes the gradient as a parameter and returns the
        matrix product of the Hessian and gradient.
    """
    n_samples, n_features = X.shape
    grad = np.empty_like(w)
    fit_intercept = grad.shape[0] > n_features

    w, c, yz = _intercept_dot(w, X, y)

    if sample_weight is None:
        sample_weight = np.ones(y.shape[0])

    z = expit(yz)
    z0 = sample_weight * (z - 1) * y

    grad[:n_features] = safe_sparse_dot(X.T, z0) + alpha * w

    # Case where we fit the intercept.
    if fit_intercept:
        grad[-1] = z0.sum()

    # The mat-vec product of the Hessian
    d = sample_weight * z * (1 - z)
    if sparse.issparse(X):
        dX = safe_sparse_dot(sparse.dia_matrix((d, 0),
                             shape=(n_samples, n_samples)), X)
    else:
        # Precompute as much as possible
        dX = d[:, np.newaxis] * X

    if fit_intercept:
        # Calculate the double derivative with respect to intercept
        # In the case of sparse matrices this returns a matrix object.
        dd_intercept = np.squeeze(np.array(dX.sum(axis=0)))

    def Hs(s):
        ret = np.empty_like(s)
        ret[:n_features] = X.T.dot(dX.dot(s[:n_features]))
        ret[:n_features] += alpha * s[:n_features]

        # For the fit intercept case.
        if fit_intercept:
            ret[:n_features] += s[-1] * dd_intercept
            ret[-1] = dd_intercept.dot(s[:n_features])
            ret[-1] += d.sum() * s[-1]
        return ret

    return grad, Hs
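# Finite-difference check of the Hessian-vector product used above (standalone
# sketch for the dense, no-intercept case with re-derived helpers, not sklearn
# internals): hess_vec(w, ..., s) should match (grad(w + eps*s) - grad(w)) / eps.
import numpy as np
from scipy.special import expit

def grad(w, X, y, alpha):
    z0 = (expit(y * X.dot(w)) - 1) * y
    return X.T.dot(z0) + alpha * w

def hess_vec(w, X, y, alpha, s):
    z = expit(y * X.dot(w))
    d = z * (1 - z)                       # sigmoid'(yz); y ** 2 == 1 for +/-1 labels
    return X.T.dot(d * X.dot(s)) + alpha * s

rng = np.random.RandomState(0)
X, y = rng.randn(30, 4), np.sign(rng.randn(30))
w, s, eps = rng.randn(4), rng.randn(4), 1e-6
fd = (grad(w + eps * s, X, y, 1.0) - grad(w, X, y, 1.0)) / eps
print(np.allclose(fd, hess_vec(w, X, y, 1.0, s), atol=1e-4))  # True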
def predict(self, X):
    n = (len(self.we) - 1) // 2
    w0 = self.we[0]
    w = self.we[1:n + 1] - self.we[n + 1:]
    return expit(w0 + safe_sparse_dot(X, w))
def sigma_means(self, x, b, W):
    """Helper for computing the sigmoid of x W.T + b."""
    a = safe_sparse_dot(x, W.T) + b
    return expit(a, out=a)