def predict(self, X): """Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- y: array of shape = [n_samples] or [n_samples, n_outputs] The predicted values. """ # Check data if getattr(X, "dtype", None) != DTYPE or X.ndim != 2: X = array2d(X, dtype=DTYPE) # Assign chunk of trees to jobs n_jobs, n_trees, starts = _partition_estimators(self) # Parallel loop all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_predict_regression)( self.estimators_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)) # Reduce y_hat = sum(all_y_hat) / len(self.estimators_) return y_hat
def procrustes_rotation(X1, X2, copy=True):
    """Apply optimal rotation and scaling matrix between two matrices.

    Parameters
    ----------
    X1, X2 : array-likes with the same shape (n_samples, n_features)

    Returns
    -------
    X2_t : array-like, shape (n_samples, n_features)
    """
    X1 = as_float_array(array2d(X1), copy=copy)
    X2 = as_float_array(array2d(X2), copy=copy)

    X1_mean = X1.mean(0)
    X2_mean = X2.mean(0)
    X1 -= X1_mean
    X2 -= X2_mean

    X1_norm = linalg.norm(X1, 'fro')
    X2_norm = linalg.norm(X2, 'fro')  # was linalg.norm(X1, ...): X2 must be scaled by its own norm
    X1 /= X1_norm
    X2 /= X2_norm

    U, S, V = linalg.svd(np.dot(X1.T, X2), full_matrices=False)
    U, V = svd_flip(U, V)
    R = np.dot(V.T, U.T)
    X2_t = np.sum(S) * X1_norm * np.dot(X2, R) + X1_mean
    return X2_t
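# A minimal usage sketch for procrustes_rotation above, assuming numpy and
# the module-level helpers it relies on (array2d, as_float_array, linalg,
# svd_flip) are importable; the data is illustrative.
import numpy as np

rng = np.random.RandomState(0)
X1 = rng.randn(10, 3)
Q, _ = np.linalg.qr(rng.randn(3, 3))   # random orthogonal rotation
X2 = 2.0 * np.dot(X1, Q) + 5.0         # rotated, scaled, shifted copy of X1

X2_t = procrustes_rotation(X1, X2)
print(np.allclose(X2_t, X1))           # expected: True (X1 is recovered)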
def _joint_log_likelihood(self, X, mask=None):
    X = array2d(X)
    if mask is not None:
        mask = array2d(mask)
        X = X.copy()
        X[mask] = np.nan

    joint_log_likelihood = np.zeros((len(self.classes_), X.shape[0]))
    for i in range(np.size(self.classes_)):
        joint_log_likelihood[i, :] = self._jll(X, i)
    return joint_log_likelihood.T
def fit(self, X, y=None, **params):
    """Fit the model with X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.

    Notes
    -----
    Calling multiple times will update the components.
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    if self.iteration != 0 and n_features != self.components_.shape[1]:
        raise ValueError('The dimensionality of the new data and the '
                         'existing components_ does not match')

    # incrementally fit the model
    for i in range(0, X.shape[0]):
        self.partial_fit(X[i, :])

    return self
def fit(self, X, y=None): """Fit the model from data in X. Parameters ---------- X: array-like, shape (n_samples, n_features) Training vector, where n_samples in the number of samples and n_features is the number of features. Returns ------- self: object Returns the object itself """ random_state = check_random_state(self.random_state) X = array2d(X) if self.n_components is None: n_components = X.shape[1] else: n_components = self.n_components V, U, E, self.n_iter_ = dict_learning( X, n_components, self.alpha, tol=self.tol, max_iter=self.max_iter, method=self.fit_algorithm, n_jobs=self.n_jobs, code_init=self.code_init, dict_init=self.dict_init, verbose=self.verbose, random_state=random_state, return_n_iter=True ) self.components_ = U self.error_ = E return self
def med_cross_distances_test(X):
    """
    Computes the nonzero componentwise L1 cross-distances between the
    vectors in X.

    Parameters
    ----------
    X : array_like
        An array with shape (n_samples, n_features)

    Returns
    -------
    D : array with shape (n_samples * (n_samples - 1) / 2, n_features)
        The array of componentwise L1 cross-distances.

    ij : arrays with shape (n_samples * (n_samples - 1) / 2, 2)
        The indices i and j of the vectors in X associated to the cross-
        distances in D: D[k] = np.abs(X[ij[k, 0]] - X[ij[k, 1]]).
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    n_nonzero_cross_dist = n_samples * (n_samples - 1) // 2  # integer pair count
    ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int)
    D = np.zeros((n_nonzero_cross_dist, n_features))
    ll_1 = 0
    for k in range(n_samples - 1):
        ll_0 = ll_1
        ll_1 = ll_0 + n_samples - k - 1
        ij[ll_0:ll_1, 0] = k
        ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples)
        D[ll_0:ll_1] = np.abs(X[k] - X[(k + 1):n_samples])

    return D, ij.astype(np.int)
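# A short, illustrative check of the pairwise layout returned above for a
# tiny 3-sample input; the values are made up for the example.
import numpy as np

X = np.array([[0., 0.],
              [1., 2.],
              [3., 1.]])
D, ij = med_cross_distances_test(X)
# Three unordered pairs for n_samples=3: (0, 1), (0, 2), (1, 2)
print(ij)  # [[0 1] [0 2] [1 2]]
print(D)   # [[1. 2.] [3. 1.] [2. 1.]]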
def predict(self, X): """Predict regression target for X. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- y: array of shape = [n_samples] The predicted values. """ if getattr(X, "dtype", None) != DTYPE or X.ndim != 2: X = array2d(X, dtype=DTYPE) # TODO - validate n_features is correct? n_samples, n_features = X.shape if self._n_features != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is {} and " " input n_features is {}".format( self._n_features, n_features)) result = np.empty(n_samples, dtype=DTYPE) return self._evaluator.predict(X, result)
def l1_multiply(X):
    """
    Computes the nonzero componentwise products between the vectors in X.

    Parameters
    ----------
    X : array_like
        An array with shape (n_samples, n_features)

    Returns
    -------
    D : array with shape (n_samples * (n_samples - 1) / 2, n_features)
        The array of componentwise products.

    ij : arrays with shape (n_samples * (n_samples - 1) / 2, 2)
        The indices i and j of the vectors in X associated to the products
        in D: D[k] = np.abs(X[ij[k, 0]] * X[ij[k, 1]]).
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    n_nonzero_cross_dist = n_samples * (n_samples - 1) // 2  # integer pair count
    ij = np.zeros((n_nonzero_cross_dist, 2), dtype=np.int)
    D = np.zeros((n_nonzero_cross_dist, n_features))
    ll_1 = 0
    for k in range(n_samples - 1):
        ll_0 = ll_1
        ll_1 = ll_0 + n_samples - k - 1
        ij[ll_0:ll_1, 0] = k
        ij[ll_0:ll_1, 1] = np.arange(k + 1, n_samples)
        D[ll_0:ll_1] = np.abs(X[k] * X[(k + 1):n_samples])

    return D, ij.astype(np.int)
def predict(self, X): """ Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest. Parameters ---------- X : array, shape = (n_samples, n_features) The input samples. Internally, it will be converted to `dtype=np.float32`. Returns ------- y : array, shape = (n_samples, ) The predicted values. """ # A call to predict(...) preceding a call to fit(...). if not self.estimators_: return self.bias X = array2d(X, dtype=DTYPE, copy=False, force_all_finite=False) all_y_hat = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_helper)(tree, "predict", X) for tree in self.estimators_ ) return sum(all_y_hat) / len(self.estimators_)
def transform(self, X): """ Transform new points into embedding space. Parameters ---------- X : array-like, shape = [n_samples, n_features] Returns ------- X_new : array, shape = [n_samples, n_components] Notes ----- Because of scaling performed by this method, it is discouraged to use it together with methods that are not scale-invariant (like SVMs) """ X = array2d(X) ind = self.nbrs_.kneighbors(X, n_neighbors=self.n_neighbors, return_distance=False) weights = barycenter_weights(X, self.nbrs_._fit_X[ind], reg=self.reg) X_new = np.empty((X.shape[0], self.n_components)) for i in range(X.shape[0]): X_new[i] = np.dot(self.embedding_[ind[i]].T, weights[i]) return X_new
def predict(self, X): """ Predict regression value for X. Parameters ---------- X : array, shape = (n_samples, n_features) The input samples. Internally, it will be converted to `dtype=np.float32`. Returns ------- y : array, shape = (n_samples,) The predict values. """ X = array2d(X, dtype=DTYPE, copy=False, force_all_finite=False) n_samples, n_features = X.shape if self.grower is None: return self.bias if self.n_features != n_features: raise ValueError( "Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features) ) return self.tree_.predict(X)
def predict(self, X): """Predict regression target for X. The predicted regression target of an input sample is computed as the mean predicted regression targets of the trees in the forest. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- y: array of shape = [n_samples] or [n_samples, n_outputs] The predicted values. """ # Check data if getattr(X, "dtype", None) != DTYPE or X.ndim != 2: X = array2d(X, dtype=DTYPE) # Assign chunk of trees to jobs n_jobs, n_trees, starts = _partition_estimators(self) # Parallel loop all_y_hat = Parallel(n_jobs=n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_predict_regression) (self.estimators_[starts[i]:starts[i + 1]], X) for i in range(n_jobs)) # Reduce y_hat = sum(all_y_hat) / len(self.estimators_) return y_hat
def f_test(self, contrast, pval=False):
    from sklearn.utils import array2d
    #Ypred = self.predict(self.X)
    #betas = self.coef
    #ss_errors = np.sum((self.Y - self.y_hat) ** 2, axis=0)
    C1 = array2d(contrast).T
    n, p = self.X.shape
    #Xpinv = scipy.linalg.pinv(X)
    rank_x = np.linalg.matrix_rank(self.pinv)
    C0 = np.eye(p) - np.dot(C1, scipy.linalg.pinv2(C1))  # Ortho. cont. to C1
    X0 = np.dot(self.X, C0)  # Design matrix of the reduced model
    X0pinv = scipy.linalg.pinv2(X0)
    rank_x0 = np.linalg.matrix_rank(X0pinv)
    # Find the subspace (X1) of Xc1, which is orthogonal to X0
    # The projection matrix M due to X1 can be derived from the residual
    # forming matrix of the reduced model X0
    # R0 is the residual forming matrix of the reduced model
    R0 = np.eye(n) - np.dot(X0, X0pinv)
    # R is the residual forming matrix of the full model
    R = np.eye(n) - np.dot(self.X, self.pinv)
    # compute the projection matrix
    M = R0 - R
    #Ypred = np.dot(self.X, betas)
    y_hat = self.predict(self.X)
    SS = np.sum(y_hat * np.dot(M, y_hat), axis=0)
    df_c1 = rank_x - rank_x0
    df_res = n - rank_x
    ## Broadcast over self.err_ss of Y
    f_stats = (SS * df_res) / (self.err_ss * df_c1)
    if not pval:
        return (f_stats, None)
    else:
        p_vals = stats.f.sf(f_stats, df_c1, df_res)
        return f_stats, p_vals
def fit(self, X, y=None, headers=None, verbose=False):
    X = array2d(X)
    if (X.ndim != 2):
        raise ValueError('X must have dimension 2, ndim=' + str(X.ndim))
    # n_samples, self.n_features_ = X.shape
    if self.target is not None:
        if y is None:
            y = [None] * len(X)
        if (len(y) != len(X)):
            raise ValueError('y must be same shape as X, len(X)=' +
                             str(len(X)) + ', len(y)=' + str(len(y)))
    # moved after the None handling above, which the original ran too late
    # to ever trigger (np.atleast_1d(None) is not None)
    y = np.atleast_1d(y)
    # y = y.astype(DOUBLE)
    if headers is not None:
        if (len(headers) != len(X)):
            raise ValueError('headers must be same shape as X, len(X)=' +
                             str(len(X)) + ', len(headers)=' +
                             str(len(headers)))
    for x, t in zip(X, y):
        if verbose:
            print x, t
        event = array2json(x, headers)
        if self.target is not None:
            event[self.target] = t
        self.stream.train(event)
def predict(self, X):
    '''
    Predict regression target for X.

    The predicted regression target of an input sample is computed as the
    mean predicted regression targets of the trees in the forest.

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        The input samples. Internally, it will be converted to
        `dtype=np.float32`.

    Returns
    -------
    y : array, shape = (n_samples,)
        The predicted values.
    '''
    # A call to predict(...) preceding a call to fit(...).
    if not self.estimators_:
        return self.bias

    X = array2d(X, dtype=DTYPE, copy=False, force_all_finite=False)

    all_y_hat = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                         backend="threading")(
        delayed(_parallel_helper)(tree, 'predict', X)
        for tree in self.estimators_)

    return sum(all_y_hat) / len(self.estimators_)
def fit(self, X):
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    U, S, V = linalg.svd(X, full_matrices=False)
    explained_variance_ = (S ** 2) / n_samples
    explained_variance_ratio_ = (explained_variance_ /
                                 explained_variance_.sum())
    components_ = V

    n_components = self.n_components
    if n_components is None:
        n_components = n_features

    # store n_samples to revert whitening when getting covariance
    self.n_samples_ = n_samples

    self.components_ = components_[self.start_c:self.start_c + n_components]
    self.explained_variance_ = \
        explained_variance_[self.start_c:self.start_c + n_components]
    self.explained_variance_ratio_ = \
        explained_variance_ratio_[self.start_c:self.start_c + n_components]
    self.n_components_ = n_components

    return self
def corr_cut(t, d):
    return corr(array2d(np.hstack([optimal_theta[0][0:i],
                                   t[0],
                                   optimal_theta[0][(i + 1)::]])),
                d)
def predict(self, X):
    '''
    Predict regression value for X.

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        The input samples. Internally, it will be converted to
        `dtype=np.float32`.

    Returns
    -------
    y : array, shape = (n_samples,)
        The predicted values.
    '''
    X = array2d(X, dtype=DTYPE, copy=False, force_all_finite=False)
    n_samples, n_features = X.shape

    if self.grower is None:
        return self.bias

    # The original mixed `self.n_features` and `self.n_features_`;
    # the trailing-underscore name is used consistently here.
    if self.n_features_ != n_features:
        raise ValueError('Number of features of the model must '
                         'match the input. Model n_features is %s and '
                         'input n_features is %s'
                         % (self.n_features_, n_features))

    return self.tree_.predict(X)
def fit(self, X, y=None, **params):
    """Fit the model with X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.

    Notes
    -----
    Calling multiple times will update the components.
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    if self.iteration != 0 and n_features != self.components_.shape[1]:
        raise ValueError("The dimensionality of the new data and the "
                         "existing components_ does not match")

    # incrementally fit the model
    for i in range(0, X.shape[0]):
        self.partial_fit(X[i, :])

    return self
def transform(self, sequences):
    """Apply the dimensionality reduction on X.

    Parameters
    ----------
    sequences : list of array-like, each of shape (n_samples_i, n_features)
        Training data, where n_samples_i is the number of samples in
        sequence i and n_features is the number of features.

    Returns
    -------
    sequence_new : list of array-like, each of shape
        (n_samples_i, n_components)
    """
    check_iter_of_sequences(sequences, max_iter=3)  # we might be lazy-loading
    sequences_new = []

    for X in sequences:
        X = array2d(X)
        if self.means_ is not None:
            X = X - self.means_
        X_transformed = np.dot(X, self.components_.T)

        if self.weighted_transform:
            X_transformed *= self.timescales_

        sequences_new.append(X_transformed)

    return sequences_new
def predict_proba(self, X):
    """Predict class probabilities for X.

    The predicted class probabilities of an input sample are computed as
    the mean predicted class probabilities of the trees in the forest.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    p : array of shape = [n_samples, n_classes], or a list of n_outputs
        such arrays if n_outputs > 1.
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute `classes_`.
    """
    # Check data
    if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
        X = array2d(X, dtype=DTYPE)

    # Assign chunk of trees to jobs
    n_jobs, n_trees, starts = _partition_estimators(self)

    # Bugfix for _parallel_predict_proba which expects a list for
    # multi-label and an integer for single-label problems
    if not isinstance(self.n_classes_, int) and len(self.n_classes_) == 1:
        n_classes_ = self.n_classes_[0]
    else:
        n_classes_ = self.n_classes_

    # Parallel loop
    all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                         backend="threading")(
        delayed(_parallel_predict_proba)(
            self.estimators_[starts[i]:starts[i + 1]], X,
            n_classes_, self.n_outputs_)
        for i in range(n_jobs))

    # Reduce
    proba = all_proba[0]

    if self.n_outputs_ == 1:
        for j in xrange(1, len(all_proba)):
            proba += all_proba[j]
        proba /= len(self.estimators_)
    else:
        for j in xrange(1, len(all_proba)):
            for k in xrange(self.n_outputs_):
                proba[k] += all_proba[j][k]
        for k in xrange(self.n_outputs_):
            # normalize by the number of fitted trees, matching the
            # single-output branch (the original used self.n_estimators)
            proba[k] /= len(self.estimators_)

    return proba
def fit(self, X, y=None):
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)
    print X.shape
    # normalize the second-moment matrix by the number of samples,
    # X.shape[0] (the original divided by X.shape[1], the feature count)
    sigma = np.dot(X.T, X) / X.shape[0]
    U, S, V = linalg.svd(sigma)
    tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    self.components_ = np.dot(tmp, U.T)
    return self
def predict(self, X): """Predict class or regression value for X. For a classification model, the predicted class for each sample in X is returned. For a regression model, the predicted value based on X is returned. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- y : array of shape = [n_samples] or [n_samples, n_outputs] The predicted classes, or the predict values. """ if getattr(X, "dtype", None) != DTYPE or X.ndim != 2: X = array2d(X, dtype=DTYPE) n_samples, n_features = X.shape if self.tree_ is None: raise Exception("Tree not initialized. Perform a fit first") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = self.tree_.predict(X) # Classification if isinstance(self, ClassifierMixin): if self.n_outputs_ == 1: return self.classes_.take(np.argmax(proba, axis=1), axis=0) else: predictions = np.zeros((n_samples, self.n_outputs_)) for k in xrange(self.n_outputs_): predictions[:, k] = self.classes_[k].take(np.argmax(proba[:, k], axis=1), axis=0) return predictions # Regression else: if self.n_outputs_ == 1: return proba[:, 0] else: return proba[:, :, 0]
def predict_proba(self, X):
    """Predict class probabilities for X.

    The predicted class probabilities of an input sample are computed as
    the mean predicted class probabilities of the trees in the forest.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    p : array of shape = [n_samples, n_classes], or a list of n_outputs
        such arrays if n_outputs > 1.
        The class probabilities of the input samples. The order of the
        classes corresponds to that in the attribute `classes_`.
    """
    # Check data
    if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
        X = array2d(X, dtype=DTYPE)

    # Assign chunk of trees to jobs
    n_jobs, n_trees, starts = _partition_estimators(self)

    # Bugfix for _parallel_predict_proba which expects a list for
    # multi-label and an integer for single-label problems
    if not isinstance(self.n_classes_, int) and len(self.n_classes_) == 1:
        n_classes_ = self.n_classes_[0]
    else:
        n_classes_ = self.n_classes_

    # Parallel loop
    all_proba = Parallel(n_jobs=n_jobs, verbose=self.verbose,
                         backend="threading")(
        delayed(_parallel_predict_proba)(
            self.estimators_[starts[i]:starts[i + 1]], X,
            n_classes_, self.n_outputs_)
        for i in range(n_jobs))

    # Reduce
    proba = all_proba[0]

    if self.n_outputs_ == 1:
        for j in xrange(1, len(all_proba)):
            proba += all_proba[j]
        proba /= len(self.estimators_)
    else:
        for j in xrange(1, len(all_proba)):
            for k in xrange(self.n_outputs_):
                proba[k] += all_proba[j][k]
        for k in xrange(self.n_outputs_):
            # normalize by the number of fitted trees, matching the
            # single-output branch (the original used self.n_estimators)
            proba[k] /= len(self.estimators_)

    return proba
def fit(self, X, y=None, **params):
    """Fit the model with X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.

    Notes
    -----
    Calling multiple times will update the components.
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    # init
    if self.iteration == 0:
        self.mean_ = np.zeros([n_features], np.float)
        self.components_ = np.zeros([self.n_components, n_features],
                                    np.float)
    else:
        if n_features != self.components_.shape[1]:
            raise ValueError('The dimensionality of the new data and the '
                             'existing components_ does not match')

    # incrementally fit the model
    for i in range(0, X.shape[0]):
        self.partial_fit(X[i, :])

    # update explained_variance_ratio_
    self.explained_variance_ratio_ = np.sqrt(
        np.sum(self.components_ ** 2, axis=1))

    # sort by explained_variance_ratio_
    idx = np.argsort(-self.explained_variance_ratio_)
    self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
    self.components_ = self.components_[idx, :]

    # re-normalize
    self.explained_variance_ratio_ = (self.explained_variance_ratio_ /
                                      self.explained_variance_ratio_.sum())
    for r in range(0, self.components_.shape[0]):
        self.components_[r, :] /= np.sqrt(
            np.dot(self.components_[r, :], self.components_[r, :]))

    return self
def fit(self, X, y=None):
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    # covariance of the centered data; normalize by the number of samples,
    # X.shape[0] (the original divided by X.shape[1], the feature count)
    sigma = np.dot(X.T, X) / X.shape[0]
    U, S, V = linalg.svd(sigma)
    tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    self.components_ = np.dot(tmp, U.T)
    return self
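# A hedged check of the whitening fit above: after fitting, projecting the
# centered data onto components_ should give approximately identity
# covariance. `ZCA` is a stand-in name for whatever class holds this fit.
import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(500, 5) * np.array([1., 2., 3., 4., 5.])

zca = ZCA(regularization=1e-6, copy=True)   # hypothetical constructor
zca.fit(X)
X_white = np.dot(X - zca.mean_, zca.components_.T)
print(np.allclose(np.cov(X_white, rowvar=False), np.eye(5), atol=1e-1))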
def _joint_log_likelihood(self, X):
    X = array2d(X)
    joint_log_likelihood = []
    for i in xrange(np.size(self.classes_)):
        jointi = np.log(self.class_prior_[i])
        n_ij = -0.5 * np.sum(np.log(np.pi * self.sigma_[i, :]))
        n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
                             (self.sigma_[i, :]), 1)
        joint_log_likelihood.append(jointi + n_ij)
    joint_log_likelihood = np.array(joint_log_likelihood).T
    return joint_log_likelihood
def predict(self, X):
    """Predict class or regression value for X.

    For a classification model, the predicted class for each sample in X
    is returned. For a regression model, the predicted value based on X is
    returned.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted classes, or the predicted values.
    """
    if getattr(X, "dtype", None) != DTYPE or X.ndim != 2:
        X = array2d(X, dtype=DTYPE)

    n_samples, n_features = X.shape

    if self.tree_ is None:
        raise Exception("Tree not initialized. Perform a fit first")

    if self.n_features_ != n_features:
        raise ValueError("Number of features of the model must "
                         "match the input. Model n_features is %s and "
                         "input n_features is %s"
                         % (self.n_features_, n_features))

    proba = self.tree_.predict(X)

    # Classification
    if isinstance(self, ClassifierMixin):
        if self.n_outputs_ == 1:
            return self.classes_.take(np.argmax(proba, axis=1), axis=0)
        else:
            predictions = np.zeros((n_samples, self.n_outputs_))
            for k in xrange(self.n_outputs_):
                predictions[:, k] = self.classes_[k].take(
                    np.argmax(proba[:, k], axis=1), axis=0)
            return predictions

    # Regression
    else:
        if self.n_outputs_ == 1:
            return proba[:, 0]
        else:
            return proba[:, :, 0]
def _joint_log_likelihood(self, X):
    X = array2d(X)
    joint_log_likelihood = []
    for i in range(np.size(self.classes_)):
        jointi = np.log(self.class_prior_[i])
        n_ij = -0.5 * np.sum(np.log(np.pi * self.sigma_[i, :]))
        n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) /
                             (self.sigma_[i, :]), 1)
        joint_log_likelihood.append(jointi + n_ij)
    joint_log_likelihood = np.array(joint_log_likelihood).T
    return joint_log_likelihood
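# A hedged sketch of how the joint log-likelihood above is typically turned
# into class predictions; `model` stands for an already-fitted instance with
# the classes_ attribute used in the snippet.
import numpy as np

def predict_from_jll(model, X):
    # one column per class; pick the class with the largest joint
    # log-likelihood for each sample
    jll = model._joint_log_likelihood(X)
    return model.classes_[np.argmax(jll, axis=1)]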
def fit_transform(self, X, y=None):
    """
    Fit the model to the data X and transform it.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    """
    X = array2d(X)
    self.fit(X, y)
    return self.transform(X)
def fit(self, X, y=None):
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    eigs, eigv = eigh(np.dot(X.T, X) / n_samples +
                      self.bias * np.identity(n_features))
    components = np.dot(eigv * np.sqrt(1.0 / eigs), eigv.T)
    self.components_ = components
    # Order the explained variance from greatest to least
    self.explained_variance_ = eigs[::-1]
    return self
def fit(self, X, y=None): """ Fit the model to the data X. Parameters ---------- X: array-like, shape (n_samples, n_features) Training data, where n_samples in the number of samples and n_features is the number of features. Returns ------- self """ X = array2d(X) dtype = np.float32 if X.dtype.itemsize == 4 else np.float64 rng = check_random_state(self.random_state) self.components_ = np.asarray( rng.normal(0, 0.01, (self.n_components, X.shape[1])), dtype=dtype, order='fortran') self.intercept_hidden_ = np.zeros(self.n_components, dtype=dtype) self.intercept_visible_ = np.zeros(X.shape[1], dtype=dtype) self.h_samples_ = np.zeros((self.batch_size, self.n_components), dtype=dtype) inds = np.arange(X.shape[0]) rng.shuffle(inds) n_batches = int(np.ceil(len(inds) / float(self.batch_size))) verbose = self.verbose for iteration in xrange(self.n_iter): pl = 0. if verbose: begin = time.time() for minibatch in xrange(n_batches): pl_batch = self._fit(X[inds[minibatch::n_batches]], rng) if verbose: pl += pl_batch.sum() if verbose: pl /= X.shape[0] end = time.time() print("Iteration %d, pseudo-likelihood = %.2f, time = %.2fs" % (iteration, pl, end - begin)) return self
def fit(self, X, y=None, **params):
    """Fit the model with X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.

    Notes
    -----
    Calling multiple times will update the components.
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    # init
    if self.iteration == 0:
        self.mean_ = np.zeros([n_features], np.float)
        self.components_ = np.zeros([self.n_components, n_features],
                                    np.float)
    else:
        if n_features != self.components_.shape[1]:
            raise ValueError('The dimensionality of the new data and the '
                             'existing components_ does not match')

    # incrementally fit the model
    for i in range(0, X.shape[0]):
        self.partial_fit(X[i, :])

    # update explained_variance_ratio_
    self.explained_variance_ratio_ = np.sqrt(
        np.sum(self.components_ ** 2, axis=1))

    # sort by explained_variance_ratio_
    idx = np.argsort(-self.explained_variance_ratio_)
    self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
    self.components_ = self.components_[idx, :]

    # re-normalize
    self.explained_variance_ratio_ = (self.explained_variance_ratio_ /
                                      self.explained_variance_ratio_.sum())
    for r in range(0, self.components_.shape[0]):
        self.components_[r, :] /= np.sqrt(
            np.dot(self.components_[r, :], self.components_[r, :]))

    return self
def transform(self, X): """ Computes the probabilities ``P({\bf h}_j=1|{\bf v}={\bf X})``. Parameters ---------- X: array-like, shape (n_samples, n_features) Returns ------- h: array-like, shape (n_samples, n_components) """ X = array2d(X) return self._mean_hiddens(X)
def fit(self, X, y=None):
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    X = X.T
    examples = np.shape(X)[1]
    sigma = np.dot(X, X.T) / (examples - 1)
    U, S, V = linalg.svd(sigma)
    d = np.sqrt(1 / S[0:100])
    dd = np.append(d, np.zeros((np.shape(X)[0] - 100)))
    #tmp = np.dot(U, np.diag(1/np.sqrt(S +self.regularization)))
    tmp = np.dot(U, np.diag(dd))
    self.components_ = np.dot(tmp, U.T)
    return self
def detect(self, X):
    X = array2d(X)
    n_samples, n_features = X.shape

    N_obs = self.N_obs if self.N_obs is not None else n_features
    if N_obs > self.N_ref:
        raise ValueError("N_obs (%d) must not exceed N_ref (%d)"
                         % (N_obs, self.N_ref))

    i_pred = []
    for X_i in X:
        detection = detect_stream(X_i, N_obs, self.R_pos_, self.R_neg_,
                                  self.gamma, self.theta, self.D_req)
        i_pred.append(detection)
    return i_pred
def fit(self, X, y=None):
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    X = X.T
    examples = np.shape(X)[1]
    sigma = np.dot(X, X.T) / (examples - 1)
    U, S, V = linalg.svd(sigma)
    d = np.sqrt(1 / S[0:100])
    dd = np.append(d, np.zeros((np.shape(X)[0] - 100)))
    #tmp = np.dot(U, np.diag(1/np.sqrt(S +self.regularization)))
    tmp = np.dot(U, np.diag(dd))
    self.components_ = np.dot(tmp, U.T)
    return self
def _fit(self, X):
    X = array2d(X)
    self._initialize(X.shape[1])

    self.n_observations_ += X.shape[0]
    self.n_sequences_ += 1

    # accumulate (+=) every statistic across sequences, matching the
    # running-sum pattern above (the original assigned with = in places)
    self._outer_0_to_T_lagged += np.dot(X[:-self.offset].T,
                                        X[self.offset:])
    self._sum_0_to_TminusTau += X[:-self.offset].sum(axis=0)
    self._sum_tau_to_T += X[self.offset:].sum(axis=0)
    self._sum_0_to_T += X.sum(axis=0)
    self._outer_0_to_TminusTau += np.dot(X[:-self.offset].T,
                                         X[:-self.offset])
    self._outer_offset_to_T += np.dot(X[self.offset:].T,
                                      X[self.offset:])
    self._is_dirty = True  # was: self._is_ditry
def fit(self, X, y, sample_weight=None):
    """Fit Naive Bayes classifier according to X, y

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    sample_weight : array-like, shape = [n_samples], optional
        Weights applied to individual samples (1. for unweighted).

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_arrays(X, y, sparse_format='csr')
    X = X.astype(np.float)
    y = column_or_1d(y, warn=True)
    _, n_features = X.shape

    labelbin = LabelBinarizer()
    Y = labelbin.fit_transform(y)
    self.classes_ = labelbin.classes_
    if Y.shape[1] == 1:
        Y = np.concatenate((1 - Y, Y), axis=1)

    # convert to float to support sample weight consistently
    Y = Y.astype(np.float64)
    if sample_weight is not None:
        Y *= array2d(sample_weight).T

    class_prior = self.class_prior

    # Count raw events from data before updating the class log prior
    # and feature log probas
    n_effective_classes = Y.shape[1]
    self.class_count_ = np.zeros(n_effective_classes, dtype=np.float64)
    self.feature_count_ = np.zeros((n_effective_classes, n_features),
                                   dtype=np.float64)
    self._count(X, Y)
    self._update_feature_log_prob()
    self._update_class_log_prior(class_prior=class_prior)
    return self
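# Illustrative check of the sample-weight step above: multiplying the
# binarized label matrix Y row-wise by the weights. array2d(w).T has shape
# (n_samples, 1), so it broadcasts exactly like w[:, np.newaxis] here.
import numpy as np

Y = np.array([[1., 0.],
              [0., 1.],
              [1., 0.]])           # one-hot labels for 3 samples
w = np.array([2., 1., 0.5])
print(Y * w[:, np.newaxis])        # [[2. 0.] [0. 1.] [0.5 0.]]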
def calc_kernel_matrix(self, X):
    """
    Perform only the calculation of the covariance matrix given the GP
    and a dataset.

    Parameters
    ----------
    X : double array_like
        An array with shape (n_samples, n_features) with the input at
        which observations were made.

    Returns
    -------
    gp : adds properties self.D and self.K
    """
    # Force data to 2D numpy.array
    X = array2d(X)
    n_samples, n_features = X.shape

    # Normalise input data or not. Do if normalise is 1 (all normalise)
    # or 2 (input normalise)
    if self.normalise > 0:
        X_mean = sp.mean(X, axis=0)
        X_std = sp.std(X, axis=0)
        X_std[X_std == 0.] = 1.
        # center and scale X if necessary
        X = (X - X_mean) / X_std
    else:
        X_mean = 0.0
        X_std = 1.0

    # Calculate distance matrix in vector form. The matrix form of X is
    # obtained by scipy.spatial.distance.squareform(X)
    D = sp.spatial.distance.pdist(X, metric=self.metric)
    D = sp.spatial.distance.squareform(D)

    # Divide each distance ij by sqrt(N_i * N_j)
    if self.normalise == -1:
        natoms = (X != 0.).sum(1)
        D = D / sp.sqrt(sp.outer(natoms, natoms))

    # Covariance matrix K
    # sklearn correlation doesn't work. Probably correlation_models needs
    # some different inputs
    K = kernel(D, self.theta0, correlation=self.corr)

    self.X = X
    if not self.low_memory:
        self.D = D
        self.K = K
    self.X_mean, self.X_std = X_mean, X_std

    return K
def calc_kernel_matrix(self, X):
    """
    Perform only the calculation of the covariance matrix given the GP
    and a dataset.

    Parameters
    ----------
    X : double array_like
        An array with shape (n_samples, n_features) with the input at
        which observations were made.

    Returns
    -------
    gp : adds properties self.D and self.K
    """
    # Force data to 2D numpy.array
    X = array2d(X)
    n_samples, n_features = X.shape

    # Normalise input data or not. Do if normalise is 1 (all normalise)
    # or 2 (input normalise)
    if self.normalise > 0:
        X_mean = sp.mean(X, axis=0)
        X_std = sp.std(X, axis=0)
        X_std[X_std == 0.] = 1.
        # center and scale X if necessary
        X = (X - X_mean) / X_std
    else:
        X_mean = 0.0
        X_std = 1.0

    # Calculate distance matrix in vector form. The matrix form of X is
    # obtained by scipy.spatial.distance.squareform(X)
    D = sp.spatial.distance.pdist(X, metric=self.metric)
    D = sp.spatial.distance.squareform(D)

    # Divide each distance ij by sqrt(N_i * N_j)
    if self.normalise == -1:
        natoms = (X != 0.).sum(1)
        D = D / sp.sqrt(sp.outer(natoms, natoms))

    # Covariance matrix K
    # sklearn correlation doesn't work. Probably correlation_models needs
    # some different inputs
    K = kernel(D, self.theta0, correlation=self.corr)

    self.X = X
    if not self.low_memory:
        self.D = D
        self.K = K
    self.X_mean, self.X_std = X_mean, X_std

    return K
def apply(self, X):
    """Apply trees in the forest to X, return leaf indices.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Input data.

    Returns
    -------
    X_leaves : array_like, shape = [n_samples, n_estimators]
        For each datapoint x in X and for each tree in the forest,
        return the index of the leaf x ends up in.
    """
    X = array2d(X, dtype=DTYPE)
    return np.array([est.tree_.apply(X)
                     for est in self.estimators_]).T
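# A hedged sketch of one common use of apply(): encoding each sample by the
# leaves it reaches, as a sparse one-hot "forest embedding". `forest` stands
# for an already-fitted ensemble; OneHotEncoder is assumed importable.
from sklearn.preprocessing import OneHotEncoder

def forest_embedding(forest, X):
    leaves = forest.apply(X)              # shape (n_samples, n_estimators)
    return OneHotEncoder().fit_transform(leaves)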
def partial_fit(self, X, y=None, iter_offset=None):
    """Updates the model using the data in X as a mini-batch.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples
        and n_features is the number of features.

    iter_offset : integer, optional
        The number of iterations on data batches that have been
        performed before this call to partial_fit. This is optional:
        if no number is passed, the memory of the object is used.

    Returns
    -------
    self : object
        Returns the instance itself.
    """
    if not hasattr(self, 'random_state_'):
        self.random_state_ = check_random_state(self.random_state)
    X = array2d(X)
    if hasattr(self, 'components_'):
        dict_init = self.components_
    else:
        dict_init = self.dict_init
    inner_stats = getattr(self, 'inner_stats_', None)
    if iter_offset is None:
        iter_offset = getattr(self, 'iter_offset_', 0)
    U, (A, B) = dict_learning_online(
        X, self.n_components, self.alpha,
        n_iter=self.n_iter, method=self.fit_algorithm,
        n_jobs=self.n_jobs, dict_init=dict_init,
        batch_size=len(X), shuffle=False,
        verbose=self.verbose, return_code=False,
        iter_offset=iter_offset, random_state=self.random_state_,
        return_inner_stats=True, inner_stats=inner_stats)
    self.components_ = U

    # Keep track of the state of the algorithm to be able to do
    # some online fitting (partial_fit)
    self.inner_stats_ = (A, B)
    self.iter_offset_ = iter_offset + self.n_iter
    return self
def fit(self, features_train):
    X = array2d(features_train)
    n_samples, n_features = X.shape
    print 'given train features dimensions before PCA : ', features_train.shape

    X = as_float_array(X)

    # Data preprocessing by Mean Normalization
    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    # Compute covariance matrix
    cov_matrix = np.dot(np.transpose(X), X) / n_samples
    print 'cov_matrix dimensions : ', cov_matrix.shape

    # Compute SVD
    U, S, V = linalg.svd(cov_matrix, full_matrices=1, compute_uv=1)
    print 'x dimensions : ', X.shape
    print 'U dimensions : ', U.shape
    print 'S dimensions : ', S.shape

    # Validate k_components before using it below (the original ran the
    # search loop first, which fails when k_components is None)
    if self.k_components is None:
        self.k_components = n_features
    elif not 0 <= self.k_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d"
                         % (self.k_components, n_features))

    # Calculate optimal k - min number of principal components to
    # maintain 99% of variance
    variance_retained = np.sum(S[:self.k_components]) / np.sum(S)
    while variance_retained < self.variance_percent_retained:
        self.k_components += 1
        variance_retained = np.sum(S[:self.k_components]) / np.sum(S)
        #print 'k_components : ', self.k_components, ' variance : ', variance_retained

    self.components = U
    self.U_reduce = U[:, :self.k_components]
    print 'number of principal components : ', self.k_components

    self.U = U
    self.S = S
    self.V = V
    return (U, S, V)
def fit(self, X, y, check_input=True):
    '''
    Build a forest of trees from the available chunk of training set
    (X, y).

    Parameters
    ----------
    X : array, shape = (n_samples, n_features)
        The training input samples. Internally, it will be converted to
        `dtype=np.float32`.

    y : array, shape = (n_samples,)
        The target values.

    Returns
    -------
    self : object
        Returns self.
    '''
    if check_input:
        X = array2d(X, dtype=DTYPE, copy=False, force_all_finite=False)

    if y.ndim != 1:
        raise ValueError('y must be 1-d array')
    if len(y) != X.shape[0]:
        raise ValueError('Number of labels (%d) does not match '
                         'number of samples (%d).' % (len(y), X.shape[0]))
    if y.dtype != DOUBLE or not y.flags.c_contiguous:
        y = np.ascontiguousarray(y, dtype=DOUBLE)

    # First call to fit(...)?
    if not self.estimators_:
        self._initialize_estimators()

    # Parallel loop: we use the threading backend as the Cython code
    # for fitting the trees is internally releasing the Python GIL
    # making threading always more efficient than multiprocessing in
    # that case.
    Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
             backend="threading")(
        delayed(_parallel_build_trees)(tree, X, y, tree_idx,
                                       len(self.estimators_),
                                       self.prob, verbose=self.verbose)
        for tree_idx, tree in enumerate(self.estimators_))

    return self
def fit(self, X, y, check_input=True): """ Build a forest of trees from the available chunk of training set (X, y). Parameters ---------- X : array, shape = (n_samples, n_features) The training input samples. Internally, it will be converted to `dtype=np.float32`. y : array, shape = (n_samples,) The target values. Returns ------- self : object Returns self. """ if check_input: X = array2d(X, dtype=DTYPE, copy=False, force_all_finite=False) if y.ndim != 1: raise ValueError("y must be 1-d array") if len(y) != X.shape[0]: raise ValueError( "Number of labels (%d) does not match " "number of samples (%d)." % (len(y), X.shape[0]) ) if y.dtype != DOUBLE or not y.flags.c_contiguous: y = np.ascontiguousarray(y, dtype=DOUBLE) # First call to fit(...)? if not self.estimators_: self._initialize_estimators() # Parallel loop: we use the threading backend as the Cython code # for fitting the trees is internally releasing the Python GIL # making threading always more efficient than multiprocessing in # that case. Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(_parallel_build_trees)(tree, X, y, tree_idx, len(self.estimators_), self.prob, verbose=self.verbose) for tree_idx, tree in enumerate(self.estimators_) ) return self
def fit(self, features_train):
    X = array2d(features_train)
    n_samples, n_features = X.shape
    print 'given train features dimensions before PCA : ', features_train.shape

    X = as_float_array(X)

    # Data preprocessing by Mean Normalization
    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    # Compute covariance matrix
    cov_matrix = np.dot(np.transpose(X), X) / n_samples
    print 'cov_matrix dimensions : ', cov_matrix.shape

    # Compute SVD
    U, S, V = linalg.svd(cov_matrix, full_matrices=1, compute_uv=1)
    print 'x dimensions : ', X.shape
    print 'U dimensions : ', U.shape
    print 'S dimensions : ', S.shape

    # Validate k_components before using it below (the original ran the
    # search loop first, which fails when k_components is None)
    if self.k_components is None:
        self.k_components = n_features
    elif not 0 <= self.k_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d"
                         % (self.k_components, n_features))

    # Calculate optimal k - min number of principal components to
    # maintain 99% of variance
    variance_retained = np.sum(S[:self.k_components]) / np.sum(S)
    while variance_retained < self.variance_percent_retained:
        self.k_components += 1
        variance_retained = np.sum(S[:self.k_components]) / np.sum(S)
        #print 'k_components : ', self.k_components, ' variance : ', variance_retained

    self.components = U
    self.U_reduce = U[:, :self.k_components]
    print 'number of principal components : ', self.k_components

    self.U = U
    self.S = S
    self.V = V
    return (U, S, V)
def fit(self, X, y, mask=None):
    """Fit Gaussian Naive Bayes according to X, y

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like, shape = [n_samples]
        Target values.

    mask : array-like, shape = [n_samples, n_features]
        Binary, 1 at unobserved features.

    Returns
    -------
    self : object
        Returns self.
    """
    X, y = check_arrays(X, y, sparse_format='dense')
    n_samples, n_features = X.shape

    if n_samples != y.shape[0]:
        raise ValueError("X and y have incompatible shapes")

    if mask is not None:
        mask = array2d(mask)
        X = X.copy()
        X[mask] = np.nan

    self.classes_ = unique_y = np.unique(y)
    n_classes = unique_y.shape[0]

    self.theta_ = np.zeros((n_classes, n_features))
    self.sigma_ = np.zeros((n_classes, n_features))
    self.class_prior_ = np.zeros(n_classes)
    self._n_ij = []
    epsilon = 1e-9
    for i, y_i in enumerate(unique_y):
        self.theta_[i, :] = bn.nanmean(X[y == y_i, :], axis=0)
        self.sigma_[i, :] = bn.nanvar(X[y == y_i, :], axis=0) + epsilon
        self.class_prior_[i] = np.float(np.sum(y == y_i)) / n_samples
        self._n_ij.append(-0.5 * np.sum(np.log(np.pi *
                                               self.sigma_[i, :])))
    self._logprior = np.log(self.class_prior_)
    return self
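# Illustrative check of the masking step above: entries flagged in `mask`
# become NaN and are then skipped by the nan-aware statistics (np.nanmean is
# used here in place of bottleneck's bn.nanmean; both ignore NaNs).
import numpy as np

X = np.array([[1.0, 2.0],
              [1.5, 8.0],
              [3.0, 4.0]])
mask = np.array([[False, False],
                 [False, True],    # feature 1 unobserved for sample 1
                 [False, False]])
Xm = X.copy()
Xm[mask] = np.nan
print(np.nanmean(Xm, axis=0))      # [1.8333... 3.]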
def transform(self, X): """Apply the dimensionality reduction on X. Parameters ---------- X : array-like, shape (n_samples, n_features) New data, where n_samples in the number of samples and n_features is the number of features. Returns ------- X_new : array-like, shape (n_samples, n_components) """ X = array2d(X) X_transformed = X - self.mean_ X_transformed = np.dot(X_transformed, self.components_.T) return np.asarray(X_transformed)
def _fit(self, X):
    X = np.asarray(array2d(X), dtype=np.float64)
    self._initialize(X.shape[1])
    if not len(X) > self.lag_time:
        raise ValueError('First dimension must be longer than '
                         'lag_time=%d. X has shape (%d, %d)'
                         % ((self.lag_time,) + X.shape))

    self.n_observations_ += X.shape[0]
    self.n_sequences_ += 1

    self._outer_0_to_T_lagged += np.dot(X[:-self.lag_time].T,
                                        X[self.lag_time:])
    self._sum_0_to_TminusTau += X[:-self.lag_time].sum(axis=0)
    self._sum_tau_to_T += X[self.lag_time:].sum(axis=0)
    self._sum_0_to_T += X.sum(axis=0)
    self._outer_0_to_TminusTau += np.dot(X[:-self.lag_time].T,
                                         X[:-self.lag_time])
    self._outer_offset_to_T += np.dot(X[self.lag_time:].T,
                                      X[self.lag_time:])
    self._is_dirty = True
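# A hedged sketch of how the running sums above are usually combined into
# mean-free instantaneous and time-lagged covariances; the attribute names
# follow the snippet, and the pair-count and symmetrization conventions are
# assumptions that a given tICA implementation may handle differently.
import numpy as np

def lagged_covariances(self):
    # number of (x_t, x_{t+lag}) pairs accumulated over all sequences
    n = self.n_observations_ - self.n_sequences_ * self.lag_time
    mean_0 = self._sum_0_to_TminusTau / n
    mean_t = self._sum_tau_to_T / n
    c0 = self._outer_0_to_TminusTau / n - np.outer(mean_0, mean_0)
    ct = self._outer_0_to_T_lagged / n - np.outer(mean_0, mean_t)
    return c0, 0.5 * (ct + ct.T)   # symmetrized lagged covariance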
def predict_proba(self, X): """Predict class probabilities of the input samples X. Parameters ---------- X : array-like of shape = [n_samples, n_features] The input samples. Returns ------- p : array of shape = [n_samples, n_classes], or a list of n_outputs such arrays if n_outputs > 1. The class probabilities of the input samples. Classes are ordered by arithmetical order. """ if getattr(X, "dtype", None) != DTYPE or X.ndim != 2: X = array2d(X, dtype=DTYPE, order="F") n_samples, n_features = X.shape if self.tree_ is None: raise Exception("Tree not initialized. Perform a fit first.") if self.n_features_ != n_features: raise ValueError("Number of features of the model must " " match the input. Model n_features is %s and " " input n_features is %s " % (self.n_features_, n_features)) proba = [] P = self.tree_.predict(X) for k in xrange(self.n_outputs_): P_k = P[:, k, :self.n_classes_[k]] normalizer = P_k.sum(axis=1)[:, np.newaxis] normalizer[normalizer == 0.0] = 1.0 P_k /= normalizer proba.append(P_k) if self.n_outputs_ == 1: return proba[0] else: return proba
def _decision_function(self, X):
    X = array2d(X)
    norm2 = []
    for i in range(len(self.classes_)):
        R = self.rotations_[i]
        S = self.scalings_[i]
        Xm = X - self.means_[i]
        X2 = np.dot(Xm, R * (S ** (-0.5)))
        norm2.append(np.sum(X2 ** 2, 1))
    norm2 = np.array(norm2).T  # shape = [len(X), n_classes]
    """return (-0.5 * (norm2 + np.sum(np.log(self.scalings_), 1))
               + np.log(self.priors_))
    """
    sum_log_scalings = []
    for i in range(len(self.scalings_)):
        """ Is this correct? Or do we sum across the columns instead?"""
        sum_log_scalings.append(np.sum(np.log(self.scalings_[i])))
    sum_log_scalings = np.array(sum_log_scalings)

    # log(priors_) must be added outside the -0.5 factor, as in the
    # commented-out version above (the original folded it inside)
    return -0.5 * (norm2 + sum_log_scalings) + np.log(self.priors_)
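# A small numeric check bearing on the question in the comment above: with
# Sigma_k = R_k diag(S_k) R_k^T and orthogonal R_k, summing log(scalings_[i])
# over features equals log|Sigma_k|, so the per-class sum used above is
# consistent with the commented-out axis-1 sum.
import numpy as np

rng = np.random.RandomState(1)
R, _ = np.linalg.qr(rng.randn(3, 3))     # orthogonal rotation
S = np.array([0.5, 1.0, 2.0])            # per-feature scalings
Sigma = np.dot(R * S, R.T)
print(np.allclose(np.sum(np.log(S)), np.log(np.linalg.det(Sigma))))  # True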
def fit(self, X, y=None):
    # X should be 2-dimensional
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)

    # center each feature (column-wise means)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    # compute the covariance; normalize by the number of samples,
    # X.shape[0] (the original divided by X.shape[1], the feature count)
    sigma = np.dot(X.T, X) / X.shape[0]
    U, S, V = linalg.svd(sigma)

    # perform dimensionality reduction if needed
    # (note: this slices rows of U; the usual convention for keeping the
    # leading components is column slicing, U[:, :self.n_components])
    if self.n_components:
        U = U[:self.n_components]

    # S holds the eigenvalues of the covariance matrix, which are already
    # variances (the original squared them and divided by n again)
    self.explained_variance_ = S
    self.explained_variance_ratio_ = (self.explained_variance_ /
                                      self.explained_variance_.sum())

    # obtain transformation matrix
    tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    self.components_ = np.dot(tmp, U.T)
    # if self.n_components:
    #     self.components_ = self.components_[:self.n_components]
    return self