def main():
    k_range = [8, 16]
    C_range = [[0.001, 5], [0.005, 10]]
    # pca = KernelPCA(n_components=50, kernel='linear')
    pca = IncrementalPCA(n_components=50, batch_size=1000)
    # lda = LinearDiscriminantAnalysis(n_components=40)
    for k in k_range:
        print("VLAD, k:%d" % (k))
        X = VLAD(k)
        print(X.shape)
        X = StandardScaler().fit_transform(X)
        col_name = ['feature' + str(i) for i in range(X.shape[1])]
        X = pd.DataFrame(data=X, columns=col_name)
        y = pd.read_csv(y_file_name, names=['label'])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)
        print("PCA")
        pca.fit(X_train)
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)
        for C in C_range:
            linear_score = SVMmodel.runSVM(X_train_pca, X_test_pca, y_train, y_test, C[0], 'linear')
            rbf_score = SVMmodel.runSVM(X_train_pca, X_test_pca, y_train, y_test, C[1], 'rbf')
            with open('res_VLAD_PCA.txt', "a") as f:
                f.write("VLAD with k=%d, Z-score, SVM with %s kernel, C=%f, score=%f\n"
                        % (k, 'linear', C[0], linear_score))
                f.write("VLAD with k=%d, Z-score, SVM with %s kernel, C=%f, score=%f\n"
                        % (k, 'rbf', C[1], rbf_score))
def get_pca(training_data):
    # Get the principal components
    print("Applying PCA!!")
    ipca = IncrementalPCA()
    ipca.fit(training_data)
    print("\t\tDone.")
    return ipca
def mask_encoding(masks, n_components=60, class_agnostic=True, whiten=True, sigmoid=True, batch_size=1024):
    components_c = []
    mean_c = []
    ratio_c = []
    explained_variance_c = []
    if class_agnostic:
        if sigmoid:
            value_random = VALUE_MAX * np.random.rand(masks.shape[0], masks.shape[1])
            value_random = np.maximum(value_random, VALUE_MIN)
            masks = np.where(masks > value_random, 1 - value_random, value_random)
            masks = inverse_sigmoid(masks)
        pca = IncrementalPCA(n_components=n_components, copy=False, whiten=whiten, batch_size=batch_size)
        pca.fit(masks)
        components_c.append(pca.components_[np.newaxis, :, :])
        mean_c.append(pca.mean_[np.newaxis, :])
        ratio_c.append(pca.explained_variance_ratio_[np.newaxis, :])
        explained_variance_c.append(pca.explained_variance_[np.newaxis, :])
        ratio = pca.explained_variance_ratio_.sum()
    else:
        # TODO: class-specific encoding has not been implemented yet.
        raise NotImplementedError
    return components_c, mean_c, ratio_c, explained_variance_c, ratio
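# A minimal, self-contained sketch of calling mask_encoding above. VALUE_MAX,
# VALUE_MIN and inverse_sigmoid live at module level in the original code; the
# definitions below are illustrative assumptions, as are the mask count and the
# flattened 28x28 mask size.
import numpy as np

VALUE_MAX = 0.05  # assumed jitter bounds, not taken from the original source
VALUE_MIN = 0.01

def inverse_sigmoid(x):
    # assumed logit transform, the inverse of the sigmoid encoding
    return np.log(x / (1.0 - x))

masks = (np.random.rand(2048, 28 * 28) > 0.5).astype(np.float64)
components, mean, ratio, explained_var, total_ratio = mask_encoding(
    masks, n_components=60, class_agnostic=True, whiten=True, sigmoid=True)
print('retained variance ratio: %.4f' % total_ratio)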
def hierarchical(chapter_list):
    'Run the hierarchical clustering algorithm on the given chapters'
    weight = tf_idf(chapter_list)
    result = AgglomerativeClustering(n_clusters=2).fit(weight)
    num_chapters = len(chapter_list)
    # Print the cluster label of each chapter
    for chapter_index in range(num_chapters):
        print('Chapter', chapter_index + 1, ':', result.labels_[chapter_index])
    # Count cluster membership in the first half and the second half
    first_half = Counter(result.labels_[:tools.FIRST_HALF])
    second_half = Counter(result.labels_[tools.FIRST_HALF:])
    # Print the result
    print('Chapter 1-80:')
    print('\tClass 0:', first_half[0], '/ 80', '=', first_half[0] / 80)
    print('\tClass 1:', first_half[1], '/ 80', '=', first_half[1] / 80)
    print('Chapter 81-120:')
    print('\tClass 0:', second_half[0], '/ 40', '=', second_half[0] / 40)
    print('\tClass 1:', second_half[1], '/ 40', '=', second_half[1] / 40)
    print('Plotting clusters in 2D graph:')
    ipca = IncrementalPCA(n_components=2)
    ipca.fit(weight)
    reduction = ipca.transform(weight)
    colors = ['c', 'orangered']
    for chapter_index in range(num_chapters):
        plt.scatter(reduction[chapter_index, 0], reduction[chapter_index, 1],
                    c=colors[int(result.labels_[chapter_index])], marker='x')
    plt.show()
def learn_PCA_matrix_for_resnetvecs_with_sklearn(resnetvecs, desired_dimension):
    print('resnetvecs in learn PCA ', resnetvecs.shape)
    pca = IncrementalPCA(n_components=desired_dimension, copy=False)
    pca.fit(resnetvecs)
    joblib.dump(pca, ('pca-%d.pkl' % desired_dimension))
    return pca
def ipca(mov, components=50, batch=1000):
    # vectorize the images
    num_frames, h, w = np.shape(mov)
    frame_size = h * w
    frame_samples = np.reshape(mov, (num_frames, frame_size)).T

    # run IPCA to approximate the SVD
    ipca_f = IncrementalPCA(n_components=components, batch_size=batch)
    ipca_f.fit(frame_samples)

    # construct the reduced version of the movie vectors using only the
    # principal component projection
    proj_frame_vectors = ipca_f.inverse_transform(ipca_f.transform(frame_samples))

    # get the temporal principal components (pixel time series) and
    # associated singular values
    eigenseries = ipca_f.components_.T

    # the rows of eigenseries are approximately orthogonal
    # so we can approximately obtain eigenframes by multiplying the
    # projected frame matrix by this transpose on the right
    eigenframes = np.dot(proj_frame_vectors, eigenseries)

    return eigenseries, eigenframes, proj_frame_vectors
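# A quick usage sketch for ipca above on a synthetic movie (frames x height x
# width); the shapes and component count are arbitrary assumptions.
import numpy as np

mov = np.random.rand(200, 32, 32).astype(np.float32)
eigenseries, eigenframes, proj = ipca(mov, components=10, batch=100)
print(eigenseries.shape)  # (200, 10): one time series per component
print(eigenframes.shape)  # (1024, 10): one spatial map per component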
def GS_IncrementalPCA(X):
    '''
    Search for the optimal number of PCA components.
    X: the data samples
    Returns the total number of fitted components and a dict mapping each
    variance threshold to the number of components needed to reach it.
    '''
    thresholds = {'95%': 0.95, '97%': 0.97, '98%': 0.98, '99%': 0.99}
    ret = {}
    pca = IncrementalPCA(n_components=None)
    pca.fit(X)
    sum_t = 0
    count = 0
    for ratio in pca.explained_variance_ratio_:
        sum_t = sum_t + ratio
        count = count + 1
        for key, threshold in thresholds.items():
            # record the first component count whose cumulative explained
            # variance reaches each threshold
            if key not in ret and sum_t >= threshold:
                ret[key] = count
    return pca.n_components_, ret
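# A small usage sketch for GS_IncrementalPCA on random data; real, correlated
# features would normally hit the variance thresholds at far fewer components.
import numpy as np

X = np.random.rand(500, 50)
n_fitted, thresholds = GS_IncrementalPCA(X)
print(n_fitted)    # total number of fitted components
print(thresholds)  # e.g. {'95%': ..., '97%': ..., '98%': ..., '99%': ...}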
def main():
    m = word2vec.Word2Vec.load(sys.argv[1])
    X = []
    words = []
    colors = []
    with open(sys.argv[2], 'r') as f:
        for line in f:
            t = line.strip('\n').split(',')
            w = t[0]
            c = t[1]
            try:
                X.append(m[w])
                words.append(w)
                colors.append(c)
            except KeyError:
                # skip words that are not in the vocabulary
                continue
    samples = np.array(X)
    ipca = IncrementalPCA(n_components=3)
    ipca.fit(samples)
    data = ipca.transform(samples)
    xs = [i[0] for i in data]
    ys = [i[1] for i in data]
    zs = [i[2] for i in data]
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(xs, ys, zs)
    chf = font_manager.FontProperties(fname='msjh.ttf', size='10')
    for i, txt in enumerate(words):
        ax.text(xs[i], ys[i], zs[i], txt, color=colors[i], fontproperties=chf)
    plt.show()
def batch_fit_pca(data_loader, n_components):
    '''
    Inputs:
        data_loader - pytorch data loader giving data in batches
        n_components - int - number of principal components to use
    Output:
        pca - sklearn.decomposition.PCA - pca function fitted to data from data_loader
    '''
    batch_size = data_loader.batch_size
    n_data_points = data_loader.dataset._len
    # Batch sizes above 300 usually do not fit in memory.
    if batch_size < n_data_points or batch_size > 300:
        print("Apply incremental PCA on data.")
        pca = IncrementalPCA(n_components=n_components, batch_size=min(batch_size, 300))
        for batch_idx, (X, Y) in enumerate(data_loader):
            # use the actual batch length so a smaller final batch also works
            X = X.reshape(X.shape[0], -1)
            pca.partial_fit(X)
    else:
        print("Apply PCA on full data.")
        pca = PCA(n_components=n_components)
        X, Y = next(iter(data_loader))
        X = X.reshape(X.shape[0], -1)
        pca.fit(X)
    return pca
def performPCA(data, n_components):
    ipca = IncrementalPCA(n_components=n_components, batch_size=n_components)
    # fit_transform both fits the model and projects the data,
    # so a separate fit call is redundant
    data = ipca.fit_transform(data)
    return data
class IPCAEstimator():
    def __init__(self, n_components):
        self.n_components = n_components
        self.whiten = False
        self.transformer = IncrementalPCA(n_components, whiten=self.whiten,
                                          batch_size=max(100, 2 * n_components))
        self.batch_support = True

    def get_param_str(self):
        return "ipca_c{}{}".format(self.n_components, '_w' if self.whiten else '')

    def fit(self, X):
        self.transformer.fit(X)

    def fit_partial(self, X):
        try:
            self.transformer.partial_fit(X)
            self.transformer.n_samples_seen_ = \
                self.transformer.n_samples_seen_.astype(np.int64)  # avoid overflow
            return True
        except ValueError as e:
            print('\nIPCA error:', e)
            return False

    def get_components(self):
        stdev = np.sqrt(self.transformer.explained_variance_)  # already sorted
        var_ratio = self.transformer.explained_variance_ratio_
        return self.transformer.components_, stdev, var_ratio  # PCA outputs are normalized
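# A minimal sketch of driving IPCAEstimator with streamed batches, as its
# batch_support flag suggests; the data shape and batch count are assumptions.
import numpy as np

est = IPCAEstimator(n_components=8)
for _ in range(10):
    est.fit_partial(np.random.randn(256, 64))
components, stdev, var_ratio = est.get_components()
print(components.shape, stdev.shape)  # (8, 64) (8,)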
def calc_PCA():
    with open('./config/config_origin.json', 'r') as f:
        CONFIG = json.load(f)
    ROOT_PATH = CONFIG["ROOT_PATH"]
    MODEL_TYPE = 'ResNet%dv%d' % (56, 2)
    FEATURE_DIR = os.path.join(ROOT_PATH, "features")
    FEATURE_DIR = os.path.join(FEATURE_DIR, "models-%s/" % MODEL_TYPE)
    features_train_bad = np.load(os.path.join(FEATURE_DIR, "features_train_bad.npy"))
    features_train_good = np.load(os.path.join(FEATURE_DIR, "features_train_good.npy"))
    features_train = np.concatenate((features_train_bad, features_train_good))

    ipca = IncrementalPCA(n_components=2, batch_size=1000)
    ipca.fit(features_train)  # fit with ALL data
    # components_train = ipca.transform(features_train)
    components_train_bad = ipca.transform(features_train_bad)
    components_train_good = ipca.transform(features_train_good)
    # print(components_train.shape)  # (30000, 2)
    np.save(os.path.join(FEATURE_DIR, "components_train_bad.npy"), components_train_bad)
    np.save(os.path.join(FEATURE_DIR, "components_train_good.npy"), components_train_good)

    import matplotlib.pyplot as plt
    plt.scatter(components_train_bad[:, 0], components_train_bad[:, 1], color="r")
    plt.scatter(components_train_good[:, 0], components_train_good[:, 1], color="g")
    plt.show()
def compute_pca_model(crystal_samples, batch_size=20):
    transformer = IncrementalPCA(batch_size=batch_size)
    transformer.fit(crystal_samples)
    W = transformer.components_
    w0 = transformer.mean_
    z = np.matmul((crystal_samples - w0), W.T)
    return W, w0, z
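# A short sketch showing how the pieces returned by compute_pca_model relate:
# the scores z, components W and mean w0 reconstruct the input exactly when
# all components are kept. The sample shape is an assumption.
import numpy as np

samples = np.random.rand(100, 30)
W, w0, z = compute_pca_model(samples, batch_size=40)
recon = np.matmul(z, W) + w0
print(np.allclose(recon, samples))  # True: full-rank reconstruction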
def preprocess_features(features, n_components, iterative=True):
    """
    Applies PCA to the features matrix and selects the top n_components.

    :param features: (nd-array) Features matrix [n_samples, n_features]
    :param n_components: (int) Number of components to retain after PCA
    :param iterative: (bool) If True, will perform incremental PCA
    :return: (nd-array) transformed features matrix
    """
    n_components = min(n_components, features.shape[1])
    batch_size = n_components
    if iterative:
        pca = IncrementalPCA(n_components=n_components, whiten=False, batch_size=batch_size)
    else:
        pca = PCA(n_components=n_components, whiten=False)
    # np.float was removed from NumPy; use the builtin float instead
    output = np.zeros((features.shape[0], min(n_components, features.shape[1])),
                      dtype=float)
    features = scale(features)
    pca.fit(features)
    for c in range(0, features.shape[0], batch_size):
        output[c:c + batch_size] = pca.transform(features[c:c + batch_size])
    return output
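# A minimal call sketch for preprocess_features; random data stands in for a
# real feature matrix, and scale is assumed to be imported from
# sklearn.preprocessing as in the original module.
import numpy as np

features = np.random.rand(300, 128)
reduced = preprocess_features(features, n_components=32, iterative=True)
print(reduced.shape)  # (300, 32)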
def viz_cluster(self, X, title=''):
    """
    Visualize the clusters.

    X : pandas.DataFrame / numpy.ndarray / scipy.sparse.csr.csr_matrix
        The dataset
    title : str, default=''
        Title of the scatter plot
    """
    dist = 1 - pairwise.cosine_similarity(X)
    ipca = IncrementalPCA(n_components=2, batch_size=16)
    ipca.fit(dist)
    demo = ipca.transform(dist)
    xs, ys = demo[:, 0], demo[:, 1]
    labels = self.cluster_model.labels_
    labels_unique = list(set(labels))
    traces = []
    for label in labels_unique:
        indexes = np.where(labels == label)[0]
        trace = {'type': 'scatter',
                 'x': xs[indexes],
                 'y': ys[indexes],
                 'name': int(label),
                 'mode': 'markers',
                 'marker': {'size': 7},
                 'text': X.toarray()[indexes]}
        traces.append(trace)
    layout = go.Layout(title=title, showlegend=True)
    # go.Data was removed from plotly; pass the trace list directly
    fig = go.Figure(data=traces, layout=layout)
    fig.show()
def test_incremental_pca():
    # Incremental PCA on dense arrays.
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)

    X_transformed = ipca.fit_transform(X)

    assert X_transformed.shape == (X.shape[0], 2)
    np.testing.assert_allclose(
        ipca.explained_variance_ratio_.sum(),
        pca.explained_variance_ratio_.sum(),
        rtol=1e-3,
    )

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        np.testing.assert_allclose(
            np.dot(cov, precision), np.eye(X.shape[1]), atol=1e-13
        )
def test_incremental_pca_sparse(matrix_class):
    # Incremental PCA on sparse arrays.
    X = iris.data
    pca = PCA(n_components=2)
    pca.fit_transform(X)
    X_sparse = matrix_class(X)
    batch_size = X_sparse.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)

    X_transformed = ipca.fit_transform(X_sparse)

    assert X_transformed.shape == (X_sparse.shape[0], 2)
    np.testing.assert_allclose(ipca.explained_variance_ratio_.sum(),
                               pca.explained_variance_ratio_.sum(),
                               rtol=1e-3)

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X_sparse)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        np.testing.assert_allclose(np.dot(cov, precision),
                                   np.eye(X_sparse.shape[1]), atol=1e-13)

    with pytest.raises(
            TypeError,
            match="IncrementalPCA.partial_fit does not support "
                  "sparse input. Either convert data to dense "
                  "or use IncrementalPCA.fit to do so in batches."):
        ipca.partial_fit(X_sparse)
def low_mem_pca(data):
    """
    Run Singular Value Decomposition (SVD) on input data.

    Parameters
    ----------
    data : (S [*E] x T) array_like
        Optimally combined (S x T) or full multi-echo (S*E x T) data.

    Returns
    -------
    u : (S [*E] x C) array_like
        Component weight map for each component.
    s : (C,) array_like
        Variance explained for each component.
    v : (T x C) array_like
        Component timeseries (one column per component).
    """
    from sklearn.decomposition import IncrementalPCA
    ppca = IncrementalPCA(n_components=(data.shape[-1] - 1))
    ppca.fit(data)
    v = ppca.components_.T
    s = ppca.explained_variance_
    u = np.dot(np.dot(data, v), np.diag(1. / s))
    return u, s, v
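# A quick shape-check sketch for low_mem_pca on random data; real inputs would
# be the optimally combined or multi-echo data the docstring describes.
import numpy as np

data = np.random.rand(500, 20)  # S x T
u, s, v = low_mem_pca(data)
print(u.shape, s.shape, v.shape)  # (500, 19) (19,) (20, 19)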
def main(args):
    print('===> args:\n', args)
    image_list = args.image_list
    feature_dir = args.feature_dir
    save_type = args.save_format
    feature_len = args.feature_dims
    i = 0
    with open(image_list, 'r') as f:
        lines = f.readlines()
        print('###### read features nums: %d ######' % (len(lines)))
        X = np.zeros(shape=(len(lines), feature_len))
        for line in lines:
            feature_name = line.strip() + save_type
            feature_path = os.path.join(feature_dir, feature_name)
            x_vec = np.ravel(matio.load_mat(feature_path))
            X[i] = x_vec[:feature_len]
            i = i + 1
    print('###### success load feature nums: %d ######' % i)
    print(X.shape)

    # ipca
    ipca = IncrementalPCA(n_components=args.n_components)
    ipca.fit(X)
    print('###### PCA Done! ######')
    joblib.dump(ipca, args.ipca_save_path)
    print('components num: %d' % ipca.n_components)
    sum_variance_ratio = 0
    for i in range(ipca.n_components):
        sum_variance_ratio += ipca.explained_variance_ratio_[i]
    print('sum_variance_ratio: %f' % sum_variance_ratio)
def visualize_data(self):
    ipca = IncrementalPCA(n_components=2, batch_size=3)
    ipca.fit(self.trainingData)
    self.fig = plt.figure()
    # self.ax = self.fig.add_subplot(111, projection='3d')
    self.ax = self.fig.add_subplot(111)
    projData = ipca.transform(self.trainingData)
    print(np.shape(projData))
    X1 = []
    X2 = []
    Y1 = []
    Y2 = []
    for idx in range(len(projData)):
        if self.trainingLabels[idx]:
            X1.append(projData[idx][0])
            Y1.append(projData[idx][1])
        else:
            X2.append(projData[idx][0])
            Y2.append(projData[idx][1])
    # X = np.array([data[0] for data in projData])
    # Y = np.array([data[1] for data in projData])
    X1 = np.array(X1)
    X2 = np.array(X2)
    Y1 = np.array(Y1)
    Y2 = np.array(Y2)
    # rospy.loginfo(np.shape(X1))
    # rospy.loginfo(np.shape(Y1))
    # rospy.loginfo(np.shape(X2))
    # rospy.loginfo(np.shape(Y2))
    rospy.loginfo("PLOTTING GRAPH")
    self.ax.plot(X1, Y1, 'r.', X2, Y2, 'g.')
    plt.show()
class PCA(Model):
    """Given a set of input vectors, find their principal components"""

    def __init__(self, fn=None, n_comp=None, batch_size=None):
        self.model = IncrementalPCA()
        self.fn = fn
        self.params = {"n_components": n_comp, "batch_size": batch_size}
        self.set_params()

    def load(self, fn):
        """Set parameters after loading from filename"""
        super().load(fn)
        self.params = self.model.get_params()
        return

    def fit(self, reps):
        """Fit a list of representations"""
        X = [r.to_vector() for r in reps]
        self.model.fit(X)

    def err(self, to_transform, to_check_against):
        """Mesh error between reconstructed to_transform representation
        and mesh conversion of to_check_against
        """
        vec = to_transform.to_vector()
        # sklearn transformers expect a 2-D array, so wrap the single vector
        vec_trans = self.model.transform([vec])
        vec_recon = self.model.inverse_transform(vec_trans)[0]
        transformed = to_transform.from_vector(vec_recon)
        mesh1 = transformed.mesh()
        mesh2 = to_check_against.mesh()
        error = representation.mesh_error(mesh1, mesh2)
        return error
class IncrementalPCA_Prim(primitive):
    def __init__(self, random_state=0):
        super(IncrementalPCA_Prim, self).__init__(name='IncrementalPCA')
        self.id = 50
        self.PCA_LAPACK_Prim = []
        self.type = 'feature engineering'
        self.description = "Incremental principal components analysis (IPCA). Linear dimensionality reduction using Singular Value Decomposition of centered data, keeping only the most significant singular vectors to project the data to a lower dimensional space. Depending on the size of the input data, this algorithm can be much more memory efficient than a PCA. This algorithm has constant memory complexity."
        self.hyperparams_run = {'default': True}
        self.pca = IncrementalPCA()
        self.accept_type = 'c_t'

    def can_accept(self, data):
        return self.can_accept_c(data)

    def is_needed(self, data):
        # data = handle_data(data)
        return True

    def fit(self, data):
        data = handle_data(data)
        self.pca.fit(data['X'])

    def produce(self, data):
        output = handle_data(data)
        cols = list(output['X'].columns)
        cols = ["{}_pcaincrmnt".format(x) for x in cols]
        result = self.pca.transform(output['X'])
        output['X'] = pd.DataFrame(result, columns=cols[:result.shape[1]])
        final_output = {0: output}
        return final_output
class IPCA(object):
    def __init__(self, n_components=None, whiten=False, copy=True, batch_size=None):
        """
        :param n_components: int or None, default None. Number of components to
                             keep; when None, it is set to min(n_samples, n_features).
        :param whiten: bool, optional, default False. When True, the components_
                       vectors are divided by n_samples * components_ to give
                       uncorrelated outputs with unit component-wise variances.
        :param copy: default True. If False, X is overwritten in place, which
                     saves memory but is unsafe.
        :param batch_size: default None. Number of samples per batch, used only
                           in fit. When None, it is automatically set to
                           5 * n_features to balance accuracy against memory overhead.
        """
        self.model = IncrementalPCA(n_components=n_components, whiten=whiten,
                                    copy=copy, batch_size=batch_size)

    def fit(self, x, y=None):
        self.model.fit(X=x, y=y)

    def transform(self, x):
        return self.model.transform(X=x)

    def fit_transform(self, x, y=None):
        return self.model.fit_transform(X=x, y=y)

    def get_params(self, deep=True):
        # get the estimator's parameters
        return self.model.get_params(deep=deep)

    def set_params(self, **params):
        # set the estimator's parameters
        self.model.set_params(**params)

    def inverse_transform(self, x):
        # the exact inverse of fit_transform
        return self.model.inverse_transform(X=x)

    def get_precision(self):
        # compute the precision matrix from the generative model
        return self.model.get_precision()

    def get_covariance(self):
        # compute the covariance from the generative model
        return self.model.get_covariance()

    def partial_fit(self, x, y=None, check_input=True):
        # incremental training
        self.model.partial_fit(X=x, y=y, check_input=check_input)

    def get_attributes(self):
        component = self.model.components_
        explained_variance = self.model.explained_variance_
        explained_variance_ratio = self.model.explained_variance_ratio_
        singular_values = self.model.singular_values_
        means = self.model.mean_  # per-feature mean
        var = self.model.var_  # per-feature variance
        noise_variance = self.model.noise_variance_  # estimated noise covariance
        n_component = self.model.n_components_
        n_samples_seen = self.model.n_samples_seen_
        return component, explained_variance, explained_variance_ratio, singular_values, \
            means, var, noise_variance, n_component, n_samples_seen
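# A brief usage sketch for the IPCA wrapper; the dimensions are arbitrary
# assumptions.
import numpy as np

ipca = IPCA(n_components=5, batch_size=50)
x = np.random.rand(200, 20)
x_reduced = ipca.fit_transform(x)
print(x_reduced.shape)  # (200, 5)
x_back = ipca.inverse_transform(x_reduced)
print(x_back.shape)  # (200, 20)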
def test_incremental_pca_num_features_change():
    """Test that changing the number of features will raise an error."""
    rng = np.random.RandomState(1999)
    n_samples = 100
    X = rng.randn(n_samples, 20)
    X2 = rng.randn(n_samples, 50)
    ipca = IncrementalPCA(n_components=None)
    ipca.fit(X)
    assert_raises(ValueError, ipca.partial_fit, X2)
def get_encoder(metas, train_data, target_output_dim):
    tmpdir = metas['workspace']
    model_path = os.path.join(tmpdir, 'incremental_pca.model')
    model = IncrementalPCA(n_components=target_output_dim, whiten=True)
    model.fit(train_data)
    pickle.dump(model, open(model_path, 'wb'))
    return IncrementalPCAEncoder(model_path=model_path)
def test_n_samples_equal_n_components():
    # Ensures no warning is raised when n_samples == n_components
    # Non-regression test for gh-19050
    ipca = IncrementalPCA(n_components=5)
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        ipca.partial_fit(np.random.randn(5, 7))
    with warnings.catch_warnings():
        warnings.simplefilter("error", RuntimeWarning)
        ipca.fit(np.random.randn(5, 7))
def test_singular_values():
    # Check that the IncrementalPCA output has the correct singular values
    rng = np.random.RandomState(0)
    n_samples = 1000
    n_features = 100

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=10, random_state=rng)

    pca = PCA(n_components=10, svd_solver='full', random_state=rng).fit(X)
    ipca = IncrementalPCA(n_components=10, batch_size=100).fit(X)
    assert_array_almost_equal(pca.singular_values_, ipca.singular_values_, 2)

    # Compare to the Frobenius norm
    X_pca = pca.transform(X)
    X_ipca = ipca.transform(X)
    assert_array_almost_equal(np.sum(pca.singular_values_**2.0),
                              np.linalg.norm(X_pca, "fro")**2.0, 12)
    assert_array_almost_equal(np.sum(ipca.singular_values_**2.0),
                              np.linalg.norm(X_ipca, "fro")**2.0, 2)

    # Compare to the 2-norms of the score vectors
    assert_array_almost_equal(pca.singular_values_,
                              np.sqrt(np.sum(X_pca**2.0, axis=0)), 12)
    assert_array_almost_equal(ipca.singular_values_,
                              np.sqrt(np.sum(X_ipca**2.0, axis=0)), 2)

    # Set the singular values and see what we get back
    rng = np.random.RandomState(0)
    n_samples = 100
    n_features = 110

    X = datasets.make_low_rank_matrix(n_samples, n_features, tail_strength=0.0,
                                      effective_rank=3, random_state=rng)

    pca = PCA(n_components=3, svd_solver='full', random_state=rng)
    ipca = IncrementalPCA(n_components=3, batch_size=100)

    X_pca = pca.fit_transform(X)
    X_pca /= np.sqrt(np.sum(X_pca**2.0, axis=0))
    X_pca[:, 0] *= 3.142
    X_pca[:, 1] *= 2.718

    X_hat = np.dot(X_pca, pca.components_)
    pca.fit(X_hat)
    ipca.fit(X_hat)
    assert_array_almost_equal(pca.singular_values_, [3.142, 2.718, 1.0], 14)
    assert_array_almost_equal(ipca.singular_values_, [3.142, 2.718, 1.0], 14)
def PCA_Train(data, result_fold, n_components=128):
    print_info("PCA training (n_components=%d)..." % n_components)
    pca = IncrementalPCA(n_components=n_components)
    pca.fit(data)
    joblib.dump(pca, result_fold + "pca_model.m")
    print_info("PCA done.")
    return pca
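# A follow-up sketch for PCA_Train, assuming it has already been run with a
# result folder of 'result/' and 512-dim features (both assumptions): reload
# the dumped model and project new vectors.
import numpy as np
import joblib

pca = joblib.load('result/' + 'pca_model.m')
reduced = pca.transform(np.random.rand(10, 512))
print(reduced.shape)  # (10, 128)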
def incrementalpca_filtered_model(model, X_train, n_components=None, incrementalpca=None):
    element_shape = X_train.shape[1:]
    pxs_per_element = np.prod(element_shape)
    if incrementalpca is None:
        incrementalpca = IncrementalPCA(n_components=n_components)
        flatX_train = X_train.reshape(-1, pxs_per_element)
        incrementalpca.fit(flatX_train)
    return filtered_model(model, X_train, sklearn_transformer=incrementalpca)
def test_incremental_pca_fit_overflow_error():
    # Test for overflow error on Windows OS
    # (non-regression test for issue #17693)
    rng = np.random.RandomState(0)
    A = rng.rand(500000, 2)

    ipca = IncrementalPCA(n_components=2, batch_size=10000)
    ipca.fit(A)

    pca = PCA(n_components=2)
    pca.fit(A)

    np.testing.assert_allclose(ipca.singular_values_, pca.singular_values_)
def generate_pca_compression(X, n_components=16, batch_size=100):
    """
    Compresses the data using the sklearn PCA implementation.

    :param X: Data (n_samples, n_features)
    :param n_components: Number of dimensions for PCA to keep
    :param batch_size: Batch size for incremental PCA
    :return: X_prime (the compressed representation), pca
    """
    pca = IncrementalPCA(n_components=n_components, batch_size=batch_size)
    pca.fit(X)
    return pca.transform(X), pca
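# A short usage sketch for generate_pca_compression on random data; a real
# call would pass image or feature vectors instead.
import numpy as np

X = np.random.rand(1000, 256)
X_prime, pca = generate_pca_compression(X, n_components=16, batch_size=100)
print(X_prime.shape)  # (1000, 16)
print(pca.explained_variance_ratio_.sum())  # fraction of variance retained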
def test_incremental_pca_set_params():
    """Test that set_params raises an error when n_components is changed
    after fitting."""
    rng = np.random.RandomState(1999)
    n_samples = 100
    n_features = 20
    X = rng.randn(n_samples, n_features)
    X2 = rng.randn(n_samples, n_features)
    X3 = rng.randn(n_samples, n_features)
    ipca = IncrementalPCA(n_components=20)
    ipca.fit(X)
    # Decreasing number of components
    ipca.set_params(n_components=10)
    assert_raises(ValueError, ipca.partial_fit, X2)
    # Increasing number of components
    ipca.set_params(n_components=15)
    assert_raises(ValueError, ipca.partial_fit, X3)
    # Returning to original setting
    ipca.set_params(n_components=20)
    ipca.partial_fit(X)
def test_incremental_pca():
    """Incremental PCA on dense arrays."""
    X = iris.data
    batch_size = X.shape[0] // 3
    ipca = IncrementalPCA(n_components=2, batch_size=batch_size)
    pca = PCA(n_components=2)
    pca.fit_transform(X)

    X_transformed = ipca.fit_transform(X)

    np.testing.assert_equal(X_transformed.shape, (X.shape[0], 2))
    assert_almost_equal(ipca.explained_variance_ratio_.sum(),
                        pca.explained_variance_ratio_.sum(), 1)

    for n_components in [1, 2, X.shape[1]]:
        ipca = IncrementalPCA(n_components, batch_size=batch_size)
        ipca.fit(X)
        cov = ipca.get_covariance()
        precision = ipca.get_precision()
        assert_array_almost_equal(np.dot(cov, precision), np.eye(X.shape[1]))
def IPCA(self, components=50, batch=1000):
    '''
    Iterative Principal Component Analysis; see sklearn.decomposition.IncrementalPCA.

    Parameters:
    ------------
    components (default 50) = number of independent components to return
    batch (default 1000) = number of pixels to load into memory simultaneously
        in IPCA. More requires more memory but leads to a better fit.

    Returns
    -------
    eigenseries: principal components (pixel time series) and associated
        singular values
    eigenframes: eigenframes obtained by multiplying the projected frame
        matrix by the projected movie (whitened frames?)
    proj_frame_vectors: the reduced version of the movie vectors using only
        the principal component projection
    '''
    # vectorize the images
    num_frames, h, w = np.shape(self)
    frame_size = h * w
    frame_samples = np.reshape(self, (num_frames, frame_size)).T

    # run IPCA to approximate the SVD
    ipca_f = IncrementalPCA(n_components=components, batch_size=batch)
    ipca_f.fit(frame_samples)

    # construct the reduced version of the movie vectors using only the
    # principal component projection
    proj_frame_vectors = ipca_f.inverse_transform(ipca_f.transform(frame_samples))

    # get the temporal principal components (pixel time series) and
    # associated singular values
    eigenseries = ipca_f.components_.T

    # the rows of eigenseries are approximately orthogonal
    # so we can approximately obtain eigenframes by multiplying the
    # projected frame matrix by this transpose on the right
    eigenframes = np.dot(proj_frame_vectors, eigenseries)

    return eigenseries, eigenframes, proj_frame_vectors
class PCALDA(AbstractFeature):
    def __init__(self, options):
        for key in options:
            setattr(self, key, options[key])

    def compute(self, X, y):
        if X.ndim == 3:
            X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
        if not hasattr(self, "pca_dim"):
            self.pca_dim = len(X) - len(np.unique(y))
        # PCA
        self.ipca = IncrementalPCA(n_components=self.pca_dim, batch_size=None)
        self.ipca.fit(X)
        X_pca = self.ipca.transform(X)
        print("PCA train shape")
        print(X_pca.shape)
        # LDA (sklearn.lda.LDA was removed from sklearn; use its replacement)
        from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
        self.lda = LinearDiscriminantAnalysis()
        self.lda.fit(X_pca, y)
        X_lda = self.lda.transform(X_pca)
        return X_lda

    def extract(self, x):
        X = np.array([x])
        if X.ndim == 3:
            X = X.reshape((X.shape[0], X.shape[1] * X.shape[2]))
        X_pca = self.ipca.transform(X)
        X_lda = self.lda.transform(X_pca)
        return list(X_lda[0])

    def __repr__(self):
        return "PCALDA"
def calc_ipca(r, key, xyz, N, title=None):
    n_dim = np.prod(xyz.shape[1:])
    ipca = IncrementalPCA()
    ipca.fit(xyz.reshape(len(xyz), n_dim))
    return ipca
Test_matrix = np.array(rowstst)

print('\nTest data loaded!\n')
print('#================================================================#')
print('#================================================================#')
print('\nshape of Training Matrix = ', Train_matrix.shape)
print('shape of Test Matrix = ', Test_matrix.shape, '\n')
print('#================================================================#')

#========================= Principal Component Analysis ==========================#

print('\nRunning Incremental PCA with 200 components and 5000 batch size')
pca = IncrementalPCA(n_components=200, batch_size=5000)
pca.fit(Train_matrix)
Train_matrix = pca.transform(Train_matrix)
Test_matrix = pca.transform(Test_matrix)
parameters = pca.get_params()
variance = pca.explained_variance_ratio_
cumvariance = pca.explained_variance_ratio_.cumsum()
#np.savetxt("pca_result_variance_200.csv", variance, delimiter=",")
#np.savetxt("pca_result_cum_variance_200.csv", cumvariance, delimiter=",")

print('\nPCA complete!\n')
print('#================================================================#')
print('\nWriting transformed Train and Test matrices to CSV\n')
print('#================================================================#')

with open(csv_pca_train_out_path, 'w', newline='') as csvtrainoutfile: