def test_np_matrix():
    # Confirm that input validation code does not return np.matrix
    X = np.arange(12).reshape(3, 4)

    assert_false(isinstance(as_float_array(X), np.matrix))
    assert_false(isinstance(as_float_array(np.matrix(X)), np.matrix))
    assert_false(isinstance(as_float_array(sp.csc_matrix(X)), np.matrix))

def test_np_matrix():
    """Confirm that input validation code does not return np.matrix"""
    X = np.arange(12).reshape(3, 4)

    assert_false(isinstance(as_float_array(X), np.matrix))
    assert_false(isinstance(as_float_array(np.matrix(X)), np.matrix))
    assert_false(isinstance(as_float_array(sp.csc_matrix(X)), np.matrix))

    assert_false(isinstance(atleast2d_or_csr(X), np.matrix))
    assert_false(isinstance(atleast2d_or_csr(np.matrix(X)), np.matrix))
    assert_false(isinstance(atleast2d_or_csr(sp.csc_matrix(X)), np.matrix))

    assert_false(isinstance(atleast2d_or_csc(X), np.matrix))
    assert_false(isinstance(atleast2d_or_csc(np.matrix(X)), np.matrix))
    assert_false(isinstance(atleast2d_or_csc(sp.csr_matrix(X)), np.matrix))

    assert_false(isinstance(safe_asarray(X), np.matrix))
    assert_false(isinstance(safe_asarray(np.matrix(X)), np.matrix))
    assert_false(isinstance(safe_asarray(sp.lil_matrix(X)), np.matrix))

    assert_true(atleast2d_or_csr(X, copy=False) is X)
    assert_false(atleast2d_or_csr(X, copy=True) is X)
    assert_true(atleast2d_or_csc(X, copy=False) is X)
    assert_false(atleast2d_or_csc(X, copy=True) is X)

def mean_absolute_error(y_true, y_pred):
    """
    Mean absolute error and its standard deviation.

    If you need only the mean absolute error, use
    :func:`sklearn.metrics.mean_absolute_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    mean : float
        mean of absolute errors
    stdev : float
        standard deviation of absolute errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = np.abs(y_true - y_pred)
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)

    return mean, stdev

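# Usage sketch for mean_absolute_error above: a toy call with hand-checkable
# numbers (assumes the imports the function relies on are in place).
import numpy as np

y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])

mean, stdev = mean_absolute_error(y_true, y_pred)
# errs = [0.5, 0.5, 0.0, 1.0] -> mean == 0.5, stdev == sqrt(0.125) ~= 0.354
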
def transform_with_scaler(Y, scaler=None, wrt_X=None):
    Y = as_float_array(Y)
    # derive a scaler from the reference data wrt_X when none is supplied
    if wrt_X is not None and len(wrt_X) and scaler is None:
        wrt_X = as_float_array(wrt_X)
        scaler = get_scaler(wrt_X)
    Z = scaler.transform(Y)
    return Z

def item_finder_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * AUC
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}}

    # AUC is defined only when both 0 and 1 appear in the true scores
    if is_binary_score(y_true, allow_uniform=False):
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    # display statistics
    if disp:
        print(json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False), file=sys.stderr)

    return stats

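# Usage sketch for item_finder_report above (toy binary data; assumes the
# helpers it relies on, e.g. is_binary_score, are importable).
import numpy as np

y_true = np.array([0, 0, 1, 1])
y_pred = np.array([0.1, 0.4, 0.35, 0.8])

stats = item_finder_report(y_true, y_pred, disp=False)
# stats['area under the curve'] == 0.75 for this toy input
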
def item_finder_statistics(y_true, y_pred):
    """
    Full statistics of prediction performance

    * n_samples
    * mean_absolute_error: mean, stdev
    * mean_squared_error: mean, rmse, stdev
    * predicted: mean, stdev
    * true: mean, stdev

    Parameters
    ----------
    y_true : array, shape=(n_samples,)
        Ground truth scores
    y_pred : array, shape=(n_samples,)
        Predicted scores

    Returns
    -------
    stats : dict
        Full statistics of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    if not is_binary_score(y_true):
        raise ValueError('True scores must be binary')
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {}

    # dataset size
    stats['n_samples'] = y_true.size

    # descriptive statistics of ground truth scores
    stats['true'] = {'mean': np.mean(y_true), 'stdev': np.std(y_true)}

    # descriptive statistics of predicted scores
    stats['predicted'] = {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}

    # AUC is defined only when both 0 and 1 appear in the true scores
    if is_binary_score(y_true, allow_uniform=False):
        # AUC (area under the curve)
        stats['area under the curve'] = skm.roc_auc_score(y_true, y_pred)

    return stats

def fit(self, X, y):
    n_samples, self.n_features = X.shape
    self.n_outputs = y.shape[1]

    self._init_fit(X)
    self.hidden_activations_ = self._get_hidden_activations(X)

    if self.regularized:
        self._solve_regularized(as_float_array(y, copy=True))
    else:
        self._solve(as_float_array(y, copy=True))

    return self

def score_predictor_report(y_true, y_pred, disp=True):
    """
    Report brief summary of prediction performance

    * mean absolute error
    * root mean squared error
    * number of data
    * mean and standard dev. of true scores
    * mean and standard dev. of predicted scores

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores
    disp : bool, optional, default=True
        if True, print report

    Returns
    -------
    stats : dict
        brief summary of prediction performance
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calc statistics
    stats = {
        'mean absolute error': skm.mean_absolute_error(y_true, y_pred),
        'root mean squared error':
            np.sqrt(np.maximum(skm.mean_squared_error(y_true, y_pred), 0.)),
        'n_samples': y_true.size,
        'true': {'mean': np.mean(y_true), 'stdev': np.std(y_true)},
        'predicted': {'mean': np.mean(y_pred), 'stdev': np.std(y_pred)}}

    # display statistics
    if disp:
        print(json.dumps(
            stats, sort_keys=True, indent=4, separators=(',', ': '),
            ensure_ascii=False), file=sys.stderr)

    return stats

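# Usage sketch for score_predictor_report above (toy ratings).
import numpy as np

y_true = np.array([3.0, 4.0, 5.0, 2.0])
y_pred = np.array([3.5, 4.0, 4.5, 2.5])

stats = score_predictor_report(y_true, y_pred, disp=False)
# stats['mean absolute error'] == 0.375 and stats['n_samples'] == 4
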
def fit(self, X, y):
    """Fit the model using X, y as training data.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like of shape [n_samples, n_outputs]
        Target values (class labels in classification, real numbers in
        regression)

    Returns
    -------
    self : object
        Returns an instance of self.
    """
    # fit random hidden layer and compute the hidden layer activations
    self.hidden_activations_ = self.hidden_layer.fit_transform(X)

    # solve the regression from hidden activations to outputs
    self._fit_regression(as_float_array(y, copy=True))

    return self

def _fit(self, X):
    """Fit the model to the data X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    ndz : ndarray, shape (n_samples, n_topics)
        Document-topic count matrix from the final sampling state.
    """
    random_state = self._random_state
    X = np.atleast_2d(as_float_array(X))
    self._initialize(X, random_state)

    for it in range(self.n_iter):
        if it % 10 == 0:
            self._print_status(it)
        else:
            logger.info("<{}>".format(it))
        self._sample_topics(random_state)

    self._print_status(self.n_iter)

    self.components_ = self.nzw + self.eta
    self.components_ /= np.sum(self.components_, axis=1, keepdims=True)

    return self.ndz

def fit(self, X, y=None, **params):
    """Fit the model with X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.

    Notes
    -----
    Calling multiple times will update the components
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    if self.iteration != 0 and n_features != self.components_.shape[1]:
        raise ValueError("The dimensionality of the new data and the"
                         " existing components_ does not match")

    # incrementally fit the model, one sample at a time
    for i in range(0, X.shape[0]):
        self.partial_fit(X[i, :])

    return self

def k_modes(X, n_clusters, n_init=1, max_iter=5, verbose=False,
            tol=1e-4, random_state=None, copy_x=True, n_jobs=1):
    """K-modes clustering algorithm."""
    if n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % n_init)

    X = as_float_array(X, copy=copy_x)
    matrix_all_irm = _compute_all_irm(X, n_clusters)
    best_labels, best_modes, best_mirm = None, None, -np.inf

    if n_jobs == 1:
        for j in range(2, n_clusters + 1):
            # For a single thread, less memory is needed if we just store
            # one set of the best results (as opposed to one set per run
            # per thread).
            for it in range(n_init):
                # run k-modes once
                labels, modes, mirm_sum = _kmodes_single(
                    X, j, matrix_all_irm, max_iter=max_iter,
                    verbose=verbose, tol=tol, random_state=random_state)
                # determine if these results are the best so far
                if mirm_sum >= best_mirm:
                    best_labels = labels.copy()
                    best_modes = modes.copy()
                    best_mirm = mirm_sum
    else:
        # TODO:
        pass

    return best_modes, best_labels, best_mirm

def transform(self, X):
    X = as_float_array(X, copy=self.copy)
    if self.mean_ is not None and self.std_ is not None:
        X -= self.mean_
        X /= self.std_
    X_whitened = np.dot(X, self.components_)
    return X_whitened

def fit(self, X):
    n_samples, self.n_features = X.shape
    self.n_outputs = X.shape[1]

    self._init_fit(X)
    self.hidden_activations_ = self._get_hidden_activations(X)
    self._regularized(as_float_array(X, copy=True))
    # self.coef_output_ = safe_sparse_dot(pinv2(self.hidden_activations_), X)

    return self

def fit(self, X, y=None):
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)
    print(X.shape)
    sigma = np.dot(X.T, X) / X.shape[1]
    U, S, V = linalg.svd(sigma)
    tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    self.components_ = np.dot(tmp, U.T)
    return self

def fit(self, X):
    X = as_float_array(X, copy=self.copy)
    self._mean = np.mean(X, axis=0)
    X -= self._mean
    sigma = np.dot(X.T, X) / X.shape[1]
    U, S, V = linalg.svd(sigma)
    tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    self._components = np.dot(tmp, U.T)
    return self

def test_as_float_array():
    """Test function for as_float_array"""
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    # Checks that the return type is ok
    X2 = as_float_array(X, copy=False)
    np.testing.assert_equal(X2.dtype, np.float32)
    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert_true(as_float_array(X, False) is not X)
    # Checking that the new type is ok
    np.testing.assert_equal(X2.dtype, np.float64)
    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert_true(as_float_array(X, copy=False) is X)

def import_lda(filename):
    lines = filename.read().splitlines()
    nr_lines = len(lines)
    labels = [None] * nr_lines
    features = [None] * nr_lines
    for i in range(nr_lines):
        labels[i], data = lines[i].split(',')
        features[i] = [float(v) for v in data.split()]
    X = as_float_array(features)
    return labels, X

def transform(self, X, y=None, copy=None):
    """Perform ZCA whitening

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data to whiten along the features axis.
    """
    check_is_fitted(self, 'mean_')
    X = as_float_array(X, copy=self.copy)
    return np.dot(X - self.mean_, self.whiten_.T)

def fit(self, X, y):
    """Fit the model using X, y as training data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data.

    y : array-like, shape = [n_samples]
        Target values. Will be cast to X's dtype if necessary

    Returns
    -------
    self : object
        Returns an instance of self.
    """
    X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
                     ensure_min_samples=2, estimator=self)
    X = as_float_array(X, copy=False)
    n_samples, n_features = X.shape

    X, y, X_offset, y_offset, X_scale = \
        self._preprocess_data(X, y, self.fit_intercept, self.normalize)

    estimator_func, params = self._make_estimator_and_params(X, y)
    memory = self.memory
    if memory is None:
        memory = Memory(cachedir=None, verbose=0)
    elif isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory, verbose=0)
    elif not isinstance(memory, Memory):
        raise ValueError("'memory' should either be a string or"
                         " a sklearn.utils.Memory instance, got"
                         " 'memory={!r}' instead.".format(type(memory)))

    scores_ = memory.cache(
        _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch']
    )(
        estimator_func, X, y,
        scaling=self.scaling, n_resampling=self.n_resampling,
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=self.pre_dispatch, random_state=self.random_state,
        sample_fraction=self.sample_fraction, **params)

    if scores_.ndim == 1:
        scores_ = scores_[:, np.newaxis]

    self.all_scores_ = scores_
    self.scores_ = np.max(self.all_scores_, axis=1)
    return self

def k_means(X, n_clusters, init='similar_cut', sparsity=None,
            max_iter=10, verbose=False, tol=1e-4, random_state=None,
            debug_directory=None, n_jobs=1, algorithm=None, **kargs):

    random_state = check_random_state(random_state)

    if max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % max_iter)

    X = as_float_array(X)
    tol = _tolerance(X, tol)

    labels, inertia, centers, debug_header = None, None, None, None

    if debug_directory:
        # Create debug header
        strf_now = datetime.datetime.now()
        debug_header = str(strf_now).replace(':', '-').replace(
            ' ', '_').split('.')[0]

        # Check debug_directory
        if not os.path.exists(debug_directory):
            os.makedirs(debug_directory)

    s = datetime.datetime.now()

    # For a single thread, run k-means once
    centers, labels, inertia, n_iter_ = kmeans_single(
        X, n_clusters, max_iter=max_iter, init=init, sparsity=sparsity,
        verbose=verbose, tol=tol, random_state=random_state,
        debug_directory=debug_directory, debug_header=debug_header,
        algorithm=algorithm, **kargs)

    # parallelisation of k-means runs
    # TODO

    return centers, labels, inertia

def fit_transform(self, X, y=None):
    """Fit LSI model to X and perform dimensionality reduction on X.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape (n_samples, n_features)
        Training data.

    Returns
    -------
    X_new : array, shape (n_samples, n_components)
        Reduced version of X. This will always be a dense array.
    """
    X = as_float_array(X, copy=False)
    random_state = check_random_state(self.random_state)

    # If sparse and not csr or csc, convert to csr
    if sp.issparse(X) and X.getformat() not in ["csr", "csc"]:
        X = X.tocsr()

    if self.algorithm == "arpack":
        U, Sigma, VT = svds(X, k=self.n_components, tol=self.tol)
        # svds doesn't abide by scipy.linalg.svd/randomized_svd
        # conventions, so reverse its outputs.
        Sigma = Sigma[::-1]
        U, VT = svd_flip(U[:, ::-1], VT[::-1])
    elif self.algorithm == "randomized":
        k = self.n_components
        n_features = X.shape[1]
        if k >= n_features:
            raise ValueError("n_components must be < n_features;"
                             " got %d >= %d" % (k, n_features))
        U, Sigma, VT = randomized_svd(X, self.n_components,
                                      n_iter=self.n_iter,
                                      random_state=random_state)
    else:
        raise ValueError("unknown algorithm %r" % self.algorithm)

    self.components_ = VT
    self.Sigma = Sigma[:self.n_components]

    # Calculate explained variance & explained variance ratio
    X_transformed = np.dot(U, np.diag(Sigma))
    self.explained_variance_ = exp_var = np.var(X_transformed, axis=0)
    if sp.issparse(X):
        _, full_var = mean_variance_axis(X, axis=0)
        full_var = full_var.sum()
    else:
        full_var = np.var(X, axis=0).sum()
    self.explained_variance_ratio_ = exp_var / full_var
    return X_transformed

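# Standalone sketch of the "arpack" branch above: scipy's svds returns the
# singular values in ascending order, so the outputs are reversed before
# sign-fixing with svd_flip. Shapes and density here are arbitrary.
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.utils.extmath import svd_flip

X = sp.random(100, 50, density=0.1, format='csr', random_state=0)
U, Sigma, VT = svds(X, k=5)
Sigma = Sigma[::-1]                     # ascending -> descending
U, VT = svd_flip(U[:, ::-1], VT[::-1])  # deterministic signs
X_reduced = U * Sigma                   # same as np.dot(U, np.diag(Sigma))
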
def test_as_float_array():
    """Test function for as_float_array"""
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    # Checks that the return type is ok
    X2 = as_float_array(X, copy=False)
    np.testing.assert_equal(X2.dtype, np.float32)
    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert_true(as_float_array(X, False) is not X)
    # Checking that the new type is ok
    np.testing.assert_equal(X2.dtype, np.float64)
    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert_true(as_float_array(X, copy=False) is X)
    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert_true(np.isfortran(as_float_array(X, copy=True)))

def inverse_transform(self, X, copy=None):
    """Undo the ZCA transform and rotate back to the original
    representation

    Parameters
    ----------
    X : array-like with shape [n_samples, n_features]
        The data to rotate back.
    """
    check_is_fitted(self, 'mean_')
    X = as_float_array(X, copy=self.copy)
    return np.dot(X, self.dewhiten_) + self.mean_

def get_distance_metric(X, metric='euclidean', normalize=True, nrows=10):
    X = as_float_array(X)
    dist = nn.DistanceMetric.get_metric(metric)
    d = dist.pairwise(X)
    print('-' * 80)
    print(introspection(dist))
    nrows = min(nrows, len(d))
    for row in d[:nrows]:
        print(["%.2f" % x for x in row[:nrows]])
    print('-' * 80)
    return dist

def fit(self, X, y=None):
    """Estimate the precision using an adaptive maximum likelihood
    estimator.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features)
        Data from which to compute the proportion matrix.
    """
    X = check_array(X, ensure_min_features=2, estimator=self)
    X = as_float_array(X, copy=False, force_all_finite=False)
    n_samples, n_features = X.shape

    # perform first estimate
    new_estimator = clone(self.estimator)
    new_estimator.fit(X)

    if self.method == 'binary':
        # generate weights
        self.lam_ = self._binary_weights(new_estimator)

        # perform second step adaptive estimate
        self.estimator_ = QuicGraphLasso(
            lam=self.lam_ * new_estimator.lam_, mode='default',
            init_method='cov', auto_scale=False)
        self.estimator_.fit(X)
    elif self.method == 'inverse_squared':
        self.lam_ = self._inverse_squared_weights(new_estimator)

        # perform second step adaptive estimate
        self.estimator_ = QuicGraphLassoCV(
            lam=self.lam_ * new_estimator.lam_, auto_scale=False)
        self.estimator_.fit(X)
    elif self.method == 'inverse':
        self.lam_ = self._inverse_weights(new_estimator)

        # perform second step adaptive estimate
        self.estimator_ = QuicGraphLassoCV(
            lam=self.lam_ * new_estimator.lam_, auto_scale=False)
        self.estimator_.fit(X)
    else:
        raise NotImplementedError(
            "Only method='binary', 'inverse_squared', or 'inverse'"
            " have been implemented.")

    self.is_fitted = True
    return self

def _check_X(self, X):
    if not hasattr(X, 'dtype'):
        # not yet an ndarray: coerce to float first
        _X = check_array(as_float_array(X))
    else:
        _X = check_array(X)
    if self.n_features:
        if _X.shape[1] != self.n_features:
            raise Exception(
                'X has {} columns while {} are expected'.format(
                    _X.shape[1], self.n_features))
    return _X

def fit(self, X, y=None, **fit_params):
    """Fits the inverse covariance model according to the given training
    data and parameters.

    Parameters
    ----------
    X : 2D ndarray, shape (n_features, n_features)
        Input data.

    Returns
    -------
    self
    """
    # quic-specific outputs
    self.opt_ = None
    self.cputime_ = None
    self.iters_ = None
    self.duality_gap_ = None

    # these must be updated upon self.fit()
    self.sample_covariance_ = None
    self.lam_scale_ = None
    self.is_fitted_ = False

    self.path_ = _validate_path(self.path)
    X = check_array(X, ensure_min_features=2, estimator=self)
    X = as_float_array(X, copy=False, force_all_finite=False)
    self.init_coefs(X)

    if self.method == "quic":
        (
            self.precision_,
            self.covariance_,
            self.opt_,
            self.cputime_,
            self.iters_,
            self.duality_gap_,
        ) = quic(
            self.sample_covariance_,
            self.lam * self.lam_scale_,
            mode=self.mode,
            tol=self.tol,
            max_iter=self.max_iter,
            Theta0=self.Theta0,
            Sigma0=self.Sigma0,
            path=self.path_,
            msg=self.verbose,
        )
    else:
        raise NotImplementedError("Only method='quic' has been implemented.")

    self.is_fitted_ = True
    return self

def fit(self, X, y=None):
    from scipy import linalg
    from sklearn.utils import as_float_array

    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    sigma = np.dot(X.T, X) / X.shape[1]
    U, S, V = linalg.svd(sigma)
    tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    self.components_ = np.dot(tmp, U.T)
    return self

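# Standalone sketch of the ZCA computation in the fit above, written as a
# plain function; note it normalizes the covariance by the number of samples
# (X.shape[0]), whereas the method above divides by X.shape[1].
# `regularization` plays the role of self.regularization.
import numpy as np
from scipy import linalg
from sklearn.utils import as_float_array

def zca_whitening_matrix(X, regularization=0.1):
    X = as_float_array(X, copy=True)
    X -= X.mean(axis=0)
    sigma = np.dot(X.T, X) / X.shape[0]
    U, S, _ = linalg.svd(sigma)
    # equivalent to U @ diag(1 / sqrt(S + eps)) @ U.T
    return np.dot(U / np.sqrt(S + regularization), U.T)

# usage: Z = (X - X.mean(axis=0)) @ zca_whitening_matrix(X)
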
def test_as_float_array():
    # Test function for as_float_array
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    # Checks that the return type is ok
    X2 = as_float_array(X, copy=False)
    np.testing.assert_equal(X2.dtype, np.float32)
    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert_true(as_float_array(X, False) is not X)
    # Checking that the new type is ok
    np.testing.assert_equal(X2.dtype, np.float64)
    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert_true(as_float_array(X, copy=False) is X)
    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert_true(np.isfortran(as_float_array(X, copy=True)))
    # Test the copy parameter with some matrices
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        N[0, 0] = np.nan
        assert_false(np.isnan(M).any())

def fit(self, X, y=None):
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    eigs, eigv = eigh(np.dot(X.T, X) / n_samples +
                      self.bias * np.identity(n_features))
    components = np.dot(eigv * np.sqrt(1.0 / eigs), eigv.T)
    self.components_ = components
    # Order the explained variance from greatest to least
    self.explained_variance_ = eigs[::-1]
    return self

def predict(self, X):
    """Compute the correlation coefficient with the irpas signature."""
    signature = self.get_signature()
    X = as_float_array(X)
    X_transformed = self.transform(X) - signature[1]
    corrcoef = np.array(
        [np.corrcoef(signature[0], e)[0][1] for e in X_transformed])
    corrcoef[np.isnan(corrcoef)] = np.finfo(np.float32).min
    return corrcoef

def mean_squared_error(y_true, y_pred):
    """
    Root mean squared error, mean squared error, and its standard
    deviation.

    If you need only the mean squared error, use
    :func:`sklearn.metrics.mean_squared_error`

    Parameters
    ----------
    y_true : array, shape(n_samples,)
        Ground truth scores
    y_pred : array, shape(n_samples,)
        Predicted scores

    Returns
    -------
    rmse : float
        root mean squared error
    mean : float
        mean of squared errors
    stdev : float
        standard deviation of squared errors
    """

    # check inputs
    assert_all_finite(y_true)
    y_true = as_float_array(y_true)
    assert_all_finite(y_pred)
    y_pred = as_float_array(y_pred)
    check_consistent_length(y_true, y_pred)

    # calculate errors
    errs = (y_true - y_pred) ** 2
    mean = np.nanmean(errs)
    stdev = np.nanstd(errs)
    rmse = np.sqrt(np.maximum(mean, 0.))

    return rmse, mean, stdev

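# Usage sketch for mean_squared_error above; values chosen so the result is
# easy to verify by hand.
import numpy as np

y_true = np.array([1.0, 2.0, 3.0, 4.0])
y_pred = np.array([1.0, 2.0, 3.0, 6.0])

rmse, mean, stdev = mean_squared_error(y_true, y_pred)
# errs = [0, 0, 0, 4] -> mean == 1.0, rmse == 1.0, stdev == sqrt(3) ~= 1.732
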
def _validate_X(self, X):
    if len(X.shape) == 1:
        raise ValueError('X should be a 2-dimensional array.')
        # if one feature:
        #     X = X.reshape(1, -1)
        # else:  # one sample
        #     X = X.reshape(-1, 1)

    if X.shape[0] == 0:
        raise ValueError('Empty samples.')

    if X.shape[1] == 0:
        raise ValueError(
            '0 feature(s) (shape=%s) while a minimum of %d is required.'
            % (str(X.shape), 1))

    return as_float_array(check_array(X))

def fit(self, X, y=None, **params):
    """Fit the model with X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training data, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    self : object
        Returns the instance itself.

    Notes
    -----
    Calling multiple times will update the components
    """
    X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)

    # init
    if self.iteration == 0:
        self.mean_ = np.zeros([n_features], float)
        self.components_ = np.zeros([self.n_components, n_features], float)
    else:
        if n_features != self.components_.shape[1]:
            raise ValueError('The dimensionality of the new data and the'
                             ' existing components_ does not match')

    # incrementally fit the model
    for i in range(0, X.shape[0]):
        self.partial_fit(X[i, :])

    # update explained_variance_ratio_
    self.explained_variance_ratio_ = np.sqrt(
        np.sum(self.components_ ** 2, axis=1))

    # sort by explained_variance_ratio_
    idx = np.argsort(-self.explained_variance_ratio_)
    self.explained_variance_ratio_ = self.explained_variance_ratio_[idx]
    self.components_ = self.components_[idx, :]

    # re-normalize
    self.explained_variance_ratio_ = (self.explained_variance_ratio_ /
                                      self.explained_variance_ratio_.sum())
    for r in range(0, self.components_.shape[0]):
        self.components_[r, :] /= np.sqrt(
            np.dot(self.components_[r, :], self.components_[r, :]))

    return self

def _center_data(self, X, y):
    ''' Centers data'''
    X = as_float_array(X, self.copy_X)
    # normalisation should be done in preprocessing!
    X_std = np.ones(X.shape[1], dtype=X.dtype)
    if self.fit_intercept:
        X_mean = np.average(X, axis=0)
        y_mean = np.average(y, axis=0)
        X -= X_mean
        y = y - y_mean
    else:
        X_mean = np.zeros(X.shape[1], dtype=X.dtype)
        y_mean = 0. if y.ndim == 1 else np.zeros(y.shape[1], dtype=X.dtype)
    return X, y, X_mean, y_mean, X_std

def test_memmap():
    # Confirm that input validation code doesn't copy memory mapped arrays
    asflt = lambda x: as_float_array(x, copy=False)

    with NamedTemporaryFile(prefix='sklearn-test') as tmp:
        M = np.memmap(tmp, shape=100, dtype=np.float32)
        M[:] = 0

        for f in (check_array, np.asarray, asflt):
            X = f(M)
            X[:] = 1
            assert_array_equal(X.ravel(), M)
            X[:] = 0

def _fit(self, X):
    """Fit the model to the data X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    X : ndarray, shape (n_samples, n_features)
        The input data, copied, centered and whitened when requested.
    """
    random_state = check_random_state(self.random_state)
    if hasattr(X, 'todense'):
        warnings.warn("Sparse matrix support is deprecated"
                      " and will be dropped in 0.16."
                      " Use TruncatedSVD instead.",
                      DeprecationWarning)
    else:
        # not a sparse matrix, ensure this is a 2D array
        X = np.atleast_2d(as_float_array(X, copy=self.copy))

    n_samples = X.shape[0]

    if not hasattr(X, 'todense'):
        # Center data
        self.mean_ = np.mean(X, axis=0)
        X -= self.mean_
    if self.n_components is None:
        n_components = X.shape[1]
    else:
        n_components = self.n_components

    U, S, V = randomized_svd(X, n_components,
                             n_iter=self.iterated_power,
                             random_state=random_state)

    self.explained_variance_ = exp_var = (S ** 2) / n_samples
    self.explained_variance_ratio_ = exp_var / exp_var.sum()

    if self.whiten:
        self.components_ = V / S[:, np.newaxis] * sqrt(n_samples)
    else:
        self.components_ = V

    return X

def fit(self, X, y=None):
    print("Fitting ... ", end="")
    X = as_float_array(X, copy=self.copy)
    self.mean_ = cp.mean(X, axis=0)
    X = X - self.mean_
    sigma = cp.dot(X.T, X) / (X.shape[0] - 1)
    U, S, V = linalg.svd(sigma)
    tmp = cp.dot(cp.array(U),
                 cp.diag(1 / cp.sqrt(cp.array(S) + self.regularization)))
    self.components_ = cp.dot(tmp, cp.array(U).T)
    print("done")
    return self

def calculate_purity(labels_true, labels_pred):
    labels_true = np.array(labels_true)
    labels_true = labels_true.reshape(labels_true.size)
    labels_pred = np.array(labels_pred)
    labels_pred = labels_pred.reshape(labels_pred.size)

    k = np.size(np.unique(labels_pred))
    purityVector = np.zeros(k)
    purity = 0

    matrix = as_float_array(contingency_matrix(labels_true, labels_pred))

    for i in range(k):
        # a cluster's purity is the share of its modal true label
        moda = float(np.max(matrix[:, i]))
        purityVector[i] = moda / np.sum(matrix[:, i])
        purity += purityVector[i] * np.sum(matrix[:, i]) / np.size(labels_pred)

    return purity, purityVector

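# Usage sketch for calculate_purity above with toy labels, easy to verify by
# hand (assumes numpy and sklearn's contingency_matrix are imported).
labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [1, 1, 0, 0, 0, 2]

purity, purity_vector = calculate_purity(labels_true, labels_pred)
# cluster 0 holds true labels {1, 1, 2} -> purity 2/3; clusters 1 and 2 are
# pure, so the size-weighted overall purity is 5/6 ~= 0.833
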
def fit(self, X, y=None):
    X = array2d(X)
    X = as_float_array(X, copy=self.copy)
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    X = X.T
    examples = np.shape(X)[1]
    sigma = np.dot(X, X.T) / (examples - 1)
    U, S, V = linalg.svd(sigma)
    d = np.sqrt(1 / S[0:100])
    dd = np.append(d, np.zeros((np.shape(X)[0] - 100)))
    # tmp = np.dot(U, np.diag(1 / np.sqrt(S + self.regularization)))
    tmp = np.dot(U, np.diag(dd))
    self.components_ = np.dot(tmp, U.T)
    return self

def fit(self, features_train):
    X = array2d(features_train)
    n_samples, n_features = X.shape
    print('given train features dimensions before PCA :',
          features_train.shape)

    X = as_float_array(X)

    # Data preprocessing by Mean Normalization
    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_

    # Compute covariance matrix
    cov_matrix = np.dot(np.transpose(X), X) / n_samples
    print('cov_matrix dimensions :', cov_matrix.shape)

    # Compute SVD
    U, S, V = linalg.svd(cov_matrix, full_matrices=1, compute_uv=1)
    print('x dimensions :', X.shape)
    print('U dimensions :', U.shape)
    print('S dimensions :', S.shape)

    # Calculate optimal k - min number of principal components to
    # maintain 99% of variance
    variance_retained = np.sum(S[:self.k_components]) / np.sum(S)
    while variance_retained < self.variance_percent_retained:
        self.k_components += 1
        variance_retained = np.sum(S[:self.k_components]) / np.sum(S)
        # print('k_components :', self.k_components,
        #       ' variance :', variance_retained)

    if self.k_components is None:
        self.k_components = n_features
    elif not 0 <= self.k_components <= n_features:
        raise ValueError("n_components=%r invalid for n_features=%d"
                         % (self.k_components, n_features))

    self.components = U
    self.U_reduce = U[:, :self.k_components]
    print('number of principal components :', self.k_components)

    self.U = U
    self.S = S
    self.V = V
    return (U, S, V)

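# Sketch: a vectorized alternative to the while-loop above for picking the
# smallest k that retains a target fraction of variance. The spectrum S is
# illustrative, with values chosen to be exact in binary floating point.
import numpy as np

S = np.array([4.0, 2.0, 1.5, 0.25, 0.25])    # example spectrum
target = 0.95
ratios = np.cumsum(S) / np.sum(S)             # [0.5, 0.75, 0.9375, 0.96875, 1.0]
k = int(np.searchsorted(ratios, target) + 1)  # k == 4 here
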
def get_kMeans(fileData, normalized_axis=1, norm='l1'):
    data = np.load(fileData)
    # print(data)
    features = data['data']
    n_clusters = np.size(np.unique(data['labels']))
    features = as_float_array(features)
    # print(features)
    if normalized_axis is not None:
        features = normalize(features, norm=norm, axis=normalized_axis)
        print(features)
    model = KMeans(n_clusters=n_clusters, tol=1e-2)
    print(model)
    model.fit(features)
    print(model.labels_)
    labels_pred = model.labels_
    labels_true = data['labels']
    return labels_true, labels_pred, features

def test_as_float_array():
    # Test function for as_float_array
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    X2 = as_float_array(X, copy=False)
    assert X2.dtype == np.float32
    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert as_float_array(X, False) is not X
    assert X2.dtype == np.float64
    # Test int dtypes <= 32bit
    tested_dtypes = [
        np.bool, np.int8, np.int16, np.int32,
        np.uint8, np.uint16, np.uint32
    ]
    for dtype in tested_dtypes:
        X = X.astype(dtype)
        X2 = as_float_array(X)
        assert X2.dtype == np.float32
    # Test object dtype
    X = X.astype(object)
    X2 = as_float_array(X, copy=True)
    assert X2.dtype == np.float64
    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert as_float_array(X, copy=False) is X
    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert np.isfortran(as_float_array(X, copy=True))
    # Test the copy parameter with some matrices
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        _sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        N[0, 0] = np.nan
        assert not np.isnan(M).any()

def test_as_float_array():
    # Test function for as_float_array
    X = np.ones((3, 10), dtype=np.int32)
    X = X + np.arange(10, dtype=np.int32)
    X2 = as_float_array(X, copy=False)
    assert_equal(X2.dtype, np.float32)
    # Another test
    X = X.astype(np.int64)
    X2 = as_float_array(X, copy=True)
    # Checking that the array wasn't overwritten
    assert as_float_array(X, False) is not X
    assert_equal(X2.dtype, np.float64)
    # Test int dtypes <= 32bit
    tested_dtypes = [np.bool, np.int8, np.int16, np.int32,
                     np.uint8, np.uint16, np.uint32]
    for dtype in tested_dtypes:
        X = X.astype(dtype)
        X2 = as_float_array(X)
        assert_equal(X2.dtype, np.float32)
    # Test object dtype
    X = X.astype(object)
    X2 = as_float_array(X, copy=True)
    assert_equal(X2.dtype, np.float64)
    # Here, X is of the right type, it shouldn't be modified
    X = np.ones((3, 2), dtype=np.float32)
    assert as_float_array(X, copy=False) is X
    # Test that if X is fortran ordered it stays
    X = np.asfortranarray(X)
    assert np.isfortran(as_float_array(X, copy=True))
    # Test the copy parameter with some matrices
    matrices = [
        np.matrix(np.arange(5)),
        sp.csc_matrix(np.arange(5)).toarray(),
        sparse_random_matrix(10, 10, density=0.10).toarray()
    ]
    for M in matrices:
        N = as_float_array(M, copy=True)
        N[0, 0] = np.nan
        assert not np.isnan(M).any()

def fit(self, X, y=None):
    # X = array2d(X)
    n_samples, n_features = X.shape
    X = as_float_array(X, copy=self.copy)
    # np.require(X, dtype=np.float32)

    self.mean_ = np.mean(X, axis=0)
    self.std_ = np.std(X, axis=0)
    X -= self.mean_
    X /= self.std_

    sigma = np.dot(X.T, X) / n_samples
    d, V = np.linalg.eigh(sigma)
    # u, s, v = np.linalg.svd(sigma)
    # eigs, eigv = eigh(np.dot(X.T, X) / n_samples +
    #                   self.bias * np.identity(n_features))
    D = np.diag(1. / np.sqrt(d + self.epsilon))
    components = np.dot(np.dot(V, D), V.T)
    self.components_ = components
    return self

def fit(self, X, y):
    """Fit the model using X, y as training data.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Training data.

    y : array-like, shape = [n_samples]
        Target values.

    Returns
    -------
    self : object
        Returns an instance of self.
    """
    X, y = check_X_y(X, y, ['csr', 'csc'], y_numeric=True,
                     ensure_min_samples=2, estimator=self)
    X = as_float_array(X, copy=False)
    n_samples, n_features = X.shape

    X, y, X_offset, y_offset, X_scale = \
        self._preprocess_data(X, y, self.fit_intercept, self.normalize)

    estimator_func, params = self._make_estimator_and_params(X, y)
    memory = self.memory
    if isinstance(memory, six.string_types):
        memory = Memory(cachedir=memory)

    scores_ = memory.cache(
        _resample_model, ignore=['verbose', 'n_jobs', 'pre_dispatch']
    )(
        estimator_func, X, y,
        scaling=self.scaling, n_resampling=self.n_resampling,
        n_jobs=self.n_jobs, verbose=self.verbose,
        pre_dispatch=self.pre_dispatch, random_state=self.random_state,
        sample_fraction=self.sample_fraction, **params)

    if scores_.ndim == 1:
        scores_ = scores_[:, np.newaxis]

    self.all_scores_ = scores_
    self.scores_ = np.max(self.all_scores_, axis=1)
    return self

def fit2(self, X, y):
    """Fit the model using X, y as training data, using the Woodbury
    formula.

    Parameters
    ----------
    X : {array-like, sparse matrix} of shape [n_samples, n_features]
        Training vectors, where n_samples is the number of samples
        and n_features is the number of features.

    y : array-like of shape [n_samples, n_outputs]
        Target values (class labels in classification, real numbers in
        regression)

    Returns
    -------
    self : object
        Returns an instance of self.
    """
    # fit random hidden layer and compute the hidden layer activations
    # self.H = self.hidden_layer.fit_transform(X)
    H = self._create_random_layer().fit_transform(X)
    y = as_float_array(y, copy=True)

    if self.beta is None:
        # Then, this is the first time the model is fitted
        if len(X) < self.n_hidden:
            raise ValueError(
                "The first time the model is fitted, X must have at"
                " least as many samples as the n_hidden value!")
        # TODO: handle cases of singular matrices (maybe with a try clause)
        self.P = pinv2(safe_sparse_dot(H.T, H))
        self.beta = multiple_safe_sparse_dot(self.P, H.T, y)
    else:
        # rank-k update of P = inv(H_all.T H_all) via the Woodbury
        # identity, followed by the matching correction of beta
        M = np.eye(len(H)) + multiple_safe_sparse_dot(H, self.P, H.T)
        self.P -= multiple_safe_sparse_dot(self.P, H.T, pinv2(M), H, self.P)
        e = y - safe_sparse_dot(H, self.beta)
        self.beta += multiple_safe_sparse_dot(self.P, H.T, e)

    return self

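# A minimal numerical check (sketch, separate from the class above) of the
# Woodbury update that fit2 relies on: updating P = inv(H_all.T @ H_all)
# with a new batch H_new must agree with recomputing the inverse directly.
import numpy as np
from numpy.linalg import pinv

rng = np.random.RandomState(0)
H_old = rng.randn(20, 5)   # initial hidden activations (>= n_hidden rows)
H_new = rng.randn(3, 5)    # a new mini-batch of activations

P = pinv(H_old.T @ H_old)
M = np.eye(len(H_new)) + H_new @ P @ H_new.T
P_updated = P - P @ H_new.T @ pinv(M) @ H_new @ P

H_all = np.vstack([H_old, H_new])
assert np.allclose(P_updated, pinv(H_all.T @ H_all))
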
def _fit(self, X):
    """Fit the model to the data X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    Returns
    -------
    X : ndarray, shape (n_samples, n_features)
        The input data, copied, centered and whitened when requested.
    """
    random_state = check_random_state(self.random_state)
    X = np.atleast_2d(as_float_array(X, copy=self.copy))

    n_samples = X.shape[0]

    # Center data
    self.mean_ = np.mean(X, axis=0)
    X -= self.mean_
    if self.n_components is None:
        n_components = X.shape[1]
    else:
        n_components = self.n_components

    U, S, V = randomized_svd(X, n_components,
                             n_iter=self.iterated_power,
                             random_state=random_state)

    self.explained_variance_ = exp_var = (S ** 2) / (n_samples - 1)
    full_var = np.var(X, ddof=1, axis=0).sum()
    self.explained_variance_ratio_ = exp_var / full_var
    self.singular_values_ = S  # Store the singular values.

    if self.whiten:
        self.components_ = V / S[:, np.newaxis] * math.sqrt(n_samples)
    else:
        self.components_ = V

    return X

def on_next(obj):
    nonlocal self
    X = obj[["p_log", "q_log"]]

    check_is_fitted(self, ["is_fitted"])
    utils.assert_all_finite(X)
    X = utils.as_float_array(X)

    self._update_clustering(X)

    obj_2 = {
        "i_min": np.min(obj[["i"]]),
        "i_max": np.max(obj[["i"]]),
        "cluster": self.clustering,
        "X": X
    }

    if "start_time" in obj.keys():
        obj_2["start_time"] = obj.iloc[-1]["start_time"]

    observer.on_next(obj_2)