def pca_visualize(self, save_dir):
    assert(save_dir.endswith('/'))
    util.make_sure_path_exists(self.DATA_VIZ_DIR + save_dir)
    save_name = self.name.replace(' ', '_').lower()
    errors = {'train': [], 'val': [], 'test': []}
    xdata = [self.get_train_x(), self.get_val_x(), self.get_test_x()]
    cum_variances = []
    indiv_variances = []
    x = range(1, min(self.get_num_features(), 101))  # only do first 100 feats
    for i in x:
        model = pca.PCA(n_components=i)
        model.fit(self.get_train_x())
        for key, xset in zip(errors.keys(), xdata):
            recon = model.inverse_transform(model.transform(xset))
            errors[key].append(util.mse(xset, recon))
        indiv_variances.append(model.explained_variance_ratio_[-1])
        cum_variances.append(np.sum(model.explained_variance_ratio_))
    f, axarr = plt.subplots(2, sharex=True, figsize=(16, 9.6))
    axarr[0].plot(x, errors['train'], 'b:', label=u'Train Reconstruction MSE')
    axarr[0].plot(x, errors['val'], 'g:', label=u'Val Reconstruction MSE')
    axarr[0].plot(x, errors['test'], 'r:', label=u'Test Reconstruction MSE')
    axarr[1].plot(x, indiv_variances, 'b--', label=u'Variance Explained by Each Component')
    axarr[1].plot(x, 1 - np.array(cum_variances), 'b:', label=u'1 Minus Cumulative Explained Variance')
    axarr[1].set_xlabel(u'Dimensionality of PCA Representation')
    axarr[0].set_title(u'Analysis of PCA over Error and Variance')
    axarr[0].set_ylabel(u'MSE')
    axarr[1].set_ylabel(u'Proportion of Explained Variance')
    axarr[0].legend()
    axarr[1].legend()
    f.savefig(self.DATA_VIZ_DIR + save_dir + save_name + '_pca.png', bbox_inches='tight')
def plot_mixture_pca(self, matrix_dir: str):
    """PCA of mixture samples and the tissues used to generate them"""
    f, ax = plt.subplots(5, 2, figsize=(8, 4 * 5))
    ax = ax.flatten()
    for i, matrix in enumerate(os.listdir(matrix_dir)):
        # The tissue pair is encoded in the filename as "<t1>-<t2>".
        t1, t2 = os.path.splitext(matrix)[0].split("-")
        mix_df = pd.read_hdf(os.path.join(matrix_dir, matrix))
        mix_df["tissue"] = "Mixture"
        sub = self.df[self.df.tissue.isin([t1, t2])]
        pca_df = pd.concat([sub, mix_df]).dropna(axis=1)
        embedding = pca.PCA(n_components=2).fit_transform(pca_df[self.genes])
        embedding = pd.DataFrame(embedding)
        embedding.columns = ["PCA1", "PCA2"]
        embedding["tissue"] = list(pca_df["tissue"])
        sns.scatterplot(
            data=embedding,
            x="PCA1",
            y="PCA2",
            hue="tissue",
            style="tissue",
            ax=ax[i],
        )
        ax[i].set_title(f"{t1}-{t2}")
    plt.tight_layout()
    return ax
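# A minimal usage sketch, not part of the original source: plot_mixture_pca expects
# matrix_dir to hold one HDF matrix per tissue pair, named "<tissue1>-<tissue2>.<ext>",
# since the pair is parsed back out of the filename, and the 5x2 grid assumes at most
# ten such files. The object name `viz` and the file names are illustrative assumptions.
axes = viz.plot_mixture_pca(matrix_dir="mixtures/")  # e.g. files like "Liver-Lung.hd5"
plt.savefig("mixture_pca_panels.png", bbox_inches="tight")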
def plot_pca(dat, colour_subgroups, p=None, components=(0, 1), marker_subgroups=None,
             ax=None, colour_map=None, marker_map=None, **kwargs):
    if p is None:
        p = pca.PCA()
        pca_data = p.fit_transform(dat.transpose())
    else:
        pca_data = p.transform(dat.transpose())
    variance_explained = p.explained_variance_ratio_ * 100.
    ax = scatter.scatter_with_colour_and_markers(
        pca_data[:, components],
        colour_subgroups=colour_subgroups,
        colour_map=colour_map,
        marker_subgroups=marker_subgroups,
        marker_map=marker_map,
        ax=ax,
        **kwargs
    )
    ax.set_xlabel("PCA component %s (%.1f%%)" % (components[0] + 1, variance_explained[components[0]]))
    ax.set_ylabel("PCA component %s (%.1f%%)" % (components[1] + 1, variance_explained[components[1]]))
    return p, ax
def PCA_2d_example():
    '''Load the data and plot it'''
    data = spio.loadmat('data/data.mat')
    X = data['X']
    plt = plot_data_2d(X, 'bo')
    plt.axis('square')
    plt.title('original data')
    plt.show()

    '''Normalize the data and plot it'''
    scaler = StandardScaler()
    scaler.fit(X)
    x_train = scaler.transform(X)
    plot_data_2d(x_train, 'bo')
    plt.axis('square')
    plt.title('scaler data')
    plt.show()

    '''Fit the model'''
    K = 1  # target dimensionality
    model = pca.PCA(n_components=K).fit(x_train)  # fit the data; n_components sets the reduced dimensionality
    Z = model.transform(x_train)  # transform performs the dimensionality reduction

    '''Recover the data and plot it'''
    Ureduce = model.components_  # the Ureduce matrix used for the reduction
    x_rec = np.dot(Z, Ureduce)  # recover the data
    plot_data_2d(x_rec, 'bo')
    plt.axis('square')
    plt.title('recover data')
    plt.show()
def main():
    iris = load_iris()
    data = scale(iris.data)
    n_samples, n_features = data.shape
    reduced_data = pca.PCA(n_components=2).fit_transform(data)
    c, centroids = mykmeans(reduced_data, 3, 50)
    plotResult(c, centroids, reduced_data, nNumCluster=3)
def svm_proba(train_xx6, train_y, test_xx6):
    # Keep a window of samples around each positive example.
    selects = []
    for j, y in enumerate(train_y):
        if y == 1:
            selects.extend(j - e for e in [-2, -1, 0, 1, 2])
    train_xx6 = train_xx6[selects]
    train_xx6 = train_xx6[:, :, 40:80]
    test_xx6 = test_xx6[:, :, 40:80]
    train_y = train_y[selects]
    train_y[train_y != 1] = 0

    # s = train_xx6.shape
    # train_xx6 = train_xx6.reshape((s[0], s[1] * s[2]))
    # s = test_xx6.shape
    # test_xx6 = test_xx6.reshape((s[0], s[1] * s[2]))
    # return test_xx6

    clf = make_pipeline(
        Vectorizer(),
        StandardScaler(),
        pca.PCA(n_components=.95),
        svm.SVC(gamma='scale', kernel='rbf',
                class_weight={0: 1, 1: 2},
                probability=True))
    clf.fit(train_xx6, train_y)
    return clf.predict_proba(test_xx6)
def PCA_Data():
    train_data = pd.read_csv('data/zhengqi_train.txt', sep='\t')
    train_data = np.array(train_data)
    value = train_data[:, 0:-1]
    pcas = pca.PCA(n_components=0.95)  # keep 95% of the variance
    pcas.fit(value)
    pca_value = pcas.transform(value)
    return pcas, len(pca_value[0])
def do_pca(k=20):
    vec = []
    for word in words.index:
        vec.append(find_word_vec(word))
    pca_model = pca.PCA(n_components=k).fit(vec)
    Z = pca_model.transform(vec)
    Z = pd.DataFrame(Z, index=words.index)
    return Z
def loadDataSet():
    '''loading data......'''
    data = dataSet.load_iris()
    dataset = data.data
    target = data.target
    PCA = pca.PCA(n_components=2)
    dataset = PCA.fit_transform(dataset)
    return np.mat(dataset), np.mat(target)
def read_data():
    train_data = pd.read_csv('data/zhengqi_train.txt', sep='\t')
    train_data = np.array(train_data)
    value = train_data[:, 0:-1]
    target = train_data[:, -1:]
    pcas = pca.PCA(n_components=0.95)  # keep 95% of the variance
    pcas.fit(value)
    X_pca = pcas.transform(value)
    return X_pca, target
def visulizeCentroid(centroid):
    PCA = pca.PCA(n_components=2).fit(centroid)
    embedding = PCA.transform(centroid)
    for cls in range(num_classes):
        plt.scatter(embedding[cls][0], embedding[cls][1], marker='x',
                    color=colors[cls], s=12)
    plt.savefig('icnn_centroid.png')
def decom_pca(data, dim):
    # scaler = StandardScaler()
    # scaler.fit(data)
    # data = scaler.transform(data)
    model = pca.PCA(n_components=dim).fit(data)
    data_trans = model.transform(data)
    print(data_trans.shape)
    return data_trans
def __init__(self, provider, num_principle_components):
    """Wrap a data provider and perform dimensionality reduction using PCA."""
    self._provider = provider
    self._pca_transform = pca.PCA(n_components=num_principle_components, svd_solver="full")
    self._pca_transform.fit(provider.get_training_data())
    self._X = self._pca_transform.transform(provider.get_training_data())
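# A hedged usage sketch of the wrapper above, assuming only that `provider` exposes
# get_training_data() as the constructor requires. The class names PCAProvider and
# MyDataProvider are illustrative assumptions; they do not appear in the original code.
provider = MyDataProvider()                                  # hypothetical data provider
reduced = PCAProvider(provider, num_principle_components=10)
print(reduced._X.shape)                                      # (n_samples, 10) after projection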
def fit_PCA(C, n_components=5, **kwargs):
    mdl = skpca.PCA(n_components=n_components, **kwargs)
    M = mdl.fit_transform(C)
    resDict = {
        'model': mdl,
        'train_data': C,
        'trans_data': M,
    }
    return pyutil.util_obj(**resDict)
def feature_extraction(self):
    mat = list()
    for t in self.articles:
        mat.append(t.tfidf)
    # Feature extraction using PCA
    if self.PCA_feature_extraction:
        pca1 = pca.PCA(n_components=len(mat))
        pca1.fit(mat)
        mat = pca1.components_
        mat = [x[0:10] for x in mat]
    return mat
def nReadAll():
    from sklearn.decomposition import pca
    train_df = pd.read_csv('../Data/zhengqi_train.txt', sep='\t')
    train_x, train_y = train_df.iloc[:, 0:38], train_df.iloc[:, [38]]
    test_x = pd.read_csv('../Data/zhengqi_test.txt', sep='\t')
    x = train_x.append(test_x)
    # PCA preprocessing
    pca_model = pca.PCA(n_components=0.95)  # keep 95% of the variance
    pca_model.fit(x)
    x_pca = pca_model.transform(x)
    ntrain_x, ntest_x, ntrain_y = x_pca[:2888, :], x_pca[2888:, :], train_y
    return ntrain_x, ntest_x, ntrain_y
def get_hog_features():
    data = sio.loadmat(
        '/home/hardik/Desktop/MTech_Project/Data/HOG_Feature_Data/natural_movies_hog_features.mat'
    )
    features = data['hog_features']
    labels = data['hog_labels'].flatten()
    labels = sess.run(tf.one_hot(labels, depth=4))
    print(labels.shape)

    # Normalizing the data
    from sklearn.preprocessing import Normalizer
    normalized_features = Normalizer().fit(features).transform(features)

    # Dimension reduction
    pcaModel = pca.PCA(n_components=50)
    pca_features = pcaModel.fit_transform(normalized_features)
    return {'features': pca_features, 'labels': labels}
def pre_xdata(xdata, k=1):
    scaler = StandardScaler()
    print('xdata', xdata.shape)
    xdata = np.squeeze(xdata)
    print('xdata', xdata.shape)
    scaler.fit(xdata)
    x_train = scaler.transform(xdata)
    model = pca.PCA(n_components=k).fit(x_train)  # fit the data; n_components sets the reduced dimensionality
    # print('model', model)
    Z = model.transform(x_train)  # transform performs the dimensionality reduction
    print('Z', Z.shape)
    # Ureduce = model.components_  # the Ureduce matrix used for the reduction
    # print('Ureduce', Ureduce.shape)
    # x_rec = np.dot(Z, Ureduce)  # recover the data
    # print('x_rec', x_rec.shape)
    Z = Z[:, np.newaxis, :]
    print('Z', Z.shape)
    return Z
def PCA_face_example(self):
    '''Load the data and display it'''
    image_data = spio.loadmat('data_faces.mat')
    X = image_data['X']
    self.display_imageData(X[0:100, :])  # show the first 100 original images

    '''Normalize the data'''
    scaler = StandardScaler()
    scaler.fit(X)
    x_train = scaler.transform(X)

    '''Fit the model'''
    K = 100
    model = pca.PCA(n_components=K).fit(x_train)
    Z = model.transform(x_train)
    Ureduce = model.components_
    self.display_imageData(Ureduce[0:36, :])  # visualize part of the U matrix
    x_rec = np.dot(Z, Ureduce)
    self.display_imageData(x_rec[0:100, :])  # show the recovered data
def plot_pca_nearby_tissues(self, matrix_path: str, tissues):
    st_df = pd.read_hdf(matrix_path)
    label = os.path.splitext(os.path.basename(matrix_path))[0]
    st_df["tissue"] = "Mixture"
    sub = self.df[self.df.tissue.isin(tissues)]
    pca_df = pd.concat([st_df, sub]).dropna(axis=1)
    embedding = pca.PCA(n_components=2).fit_transform(pca_df[self.genes])
    embedding = pd.DataFrame(embedding)
    embedding.columns = ["PCA1", "PCA2"]
    embedding["tissue"] = list(pca_df["tissue"])
    f, ax = plt.subplots(figsize=(8, 8))
    sns.scatterplot(data=embedding, x="PCA1", y="PCA2", hue="tissue", style="tissue")
    plt.title(f"PCA of {label} Mixtures and Nearby Tissues")
    return ax
def visualize_data(self, save_dir):
    assert(save_dir.endswith('/'))
    util.make_sure_path_exists(self.DATA_VIZ_DIR + save_dir)
    compressor = pca.PCA(n_components=2)
    compressor.fit(self.get_train_x())
    xtr = compressor.transform(self.get_train_x())
    minx1 = min(xtr[:, 0])
    maxx1 = max(xtr[:, 0])
    minx2 = min(xtr[:, 1])
    maxx2 = max(xtr[:, 1])
    # give 10% leeway for display
    prop = 0.10
    axes = [minx1 - (prop * abs(minx1)), maxx1 + (prop * maxx1),
            minx2 - (prop * abs(minx2)), maxx2 + (prop * maxx2)]
    for x, y, set_name in self.iter_all_data():
        self._visualize(compressor.transform(x), y, set_name, axes, save_dir)
def whitting(self, X, model=0):
    """
    Almost the same as in the original code: whiten the data with PCA.
    Uses sklearn's PCA to reduce the dimensionality when it is larger than
    the number of input channels.
    :param X: the received audio matrix, shape = [f, t, n_channels]
    :param model: option selecting PCA whitening (0) or true whitening (1)
    :return: array reduced to the number of sources given in the JSON data,
             shape = [f, t, num_source]
    """
    if not model:
        local_pca = pca.PCA(n_components=self.n_source, svd_solver="full", whiten=True)
        res = []
        angle = np.angle(X)
        X = np.abs(X)
        for i in range(X.shape[1]):
            res.append(local_pca.fit_transform(X[:, i, :]))
        res = np.asarray(res).astype(complex)
        res = np.transpose(res, [1, 0, 2])
        res *= np.exp(1j * angle)
        return res
    elif model == 1:
        dnum = self.n_source
        [I, J, M] = X.shape
        Y = np.zeros([I, J, self.n_source], dtype=complex)
        for i in range(I):
            Xi = np.squeeze(X[i, :, :]).T
            V = np.matmul(Xi, Xi.T.conj()) / J
            [D, P] = np.linalg.eig(V)
            idx = np.argsort(D)
            D = D[idx]
            P = P[:, idx]
            D = np.diag(D)
            D2 = D[M - dnum:M, M - dnum:M]
            P2 = P[:, M - dnum:M]
            D2_ = np.linalg.pinv(np.sqrt(D2))
            Y[i, :, :] = np.matmul(np.matmul(D2_, P2.conj().T), Xi).T
        return Y
    else:
        raise ValueError("model should be 0 or 1")
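# A minimal calling sketch, assuming a complex spectrogram shaped [f, t, n_channels]
# and an object that carries the n_source attribute the method reads. The names
# `separator` and the array dimensions below are assumptions for illustration only.
X = np.random.randn(513, 200, 4) + 1j * np.random.randn(513, 200, 4)  # fake [f, t, channels] input
Y0 = separator.whitting(X, model=0)  # PCA branch (model=0)
Y1 = separator.whitting(X, model=1)  # eigendecomposition whitening branch (model=1)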
def train_and_save_svm_model(self, paths, _labels, model_filename):
    svm_clf = SVC(C=1e3)
    vectors = []
    labels = []
    for i, (path, label) in enumerate(zip(paths, _labels)):
        if i % 10 == 1:
            print('SVM Train Parsing:', i)
        _, faces = self.read_image(path)
        if len(faces) == 0:
            print('Face Align Failed:', path)
            continue
        _, landmarks = faces[0]
        vectors.append(self.get_distance_vector(landmarks))
        labels.append(label)
    pca_model = None
    if USE_PCA:
        pca_model = pca.PCA(n_components=PCA_DIMENSIONS)
        pca_model.fit(vectors)
        vectors = pca_model.transform(vectors)
    svm_clf.fit(vectors, labels)
    with open(model_filename, 'wb') as model:
        pickle.dump({"svm_clf": svm_clf, "pca_model": pca_model}, model)
def plot_pca_nearby_tissues(self, background_path: str, tissues, tumor_tissue):
    df = pd.read_hdf(background_path)
    tumor = self.tumor
    tumor = tumor[tumor.tissue == tumor_tissue]
    tumor["tissue"] = f"{tumor_tissue}-Tumor"
    sub = df[df.tissue.isin(tissues)]
    pca_df = pd.concat([tumor, sub]).dropna(axis=1)
    embedding = pca.PCA(n_components=2).fit_transform(pca_df[self.genes])
    embedding = pd.DataFrame(embedding)
    embedding.columns = ["PCA1", "PCA2"]
    embedding["tissue"] = list(pca_df["tissue"])
    f, ax = plt.subplots(figsize=(8, 8))
    sns.scatterplot(data=embedding, x="PCA1", y="PCA2", hue="tissue", style="tissue")
    plt.title(f"PCA of {tumor_tissue} and Nearby GTEx Tissues")
    return ax
# cell_line[cell_line.index.str.contains('p62')] = 'ICb1299'
# meta.insert(0, 'cell_line', cell_line)
anno = loader.load_illumina_methylationepic_annotation()
me_data = obj.data.dropna()
me_data = process.m_from_beta(me_data)

# reduce anno and data down to common probes
common_probes = anno.index.intersection(me_data.index)
anno = anno.loc[common_probes]

# plot PCA
p = pca.PCA()
pca_dat = p.fit_transform(me_data.transpose())
fig = plt.figure()
ax = fig.add_subplot(111)
marker_groups = meta.cell_line
marker_dict = dict([
    ('3021', 'o'),
    ('ICb1299', 's'),
])
colour_groups = meta.batch.copy()
colour_dict = dict([
    ('2017-09-19', '#7fc97f'),
    ('2018-01-12', '#beaed4'),
if __name__ == '__main__':
    def generate_data(num_samples=100):
        # The desired mean values of the sample.
        mu = np.array([0.0, 0.0, 0.0])
        # The desired covariance matrix.
        r = np.array([[1.50, 1.25, 1.25],
                      [1.25, 1.50, 1.25],
                      [1.25, 1.25, 1.50]])
        # Generate the random samples.
        y = np.random.multivariate_normal(mu, r, size=num_samples)
        x = y[:, 0:2].T
        y = y[:, 2].T
        return x, y

    x, y = generate_data()
    pca = PrincipalComponentAnalysis(2)
    print(x.shape)
    z = pca.fit_transform(x)
    pca.plot_2d(x)
    plt.scatter(z[0], z[1])

    from sklearn.decomposition import pca
    z = pca.PCA(2).fit_transform(x.T)
    plt.scatter(z[:, 0], z[:, 1])
    plt.show()
train_data_missing = (df.isnull().sum() / len(df)) * 100
print(train_data_missing)

# Analyse the correlation between the features and the target with a heatmap
corrmat = df.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True)
plt.show()

# Process the training data: separate the features and the label
X = df.values[:, 0:-1]
Y = df.values[:, -1]
X1_test = df_test.values

# PCA preprocessing
pca = pca.PCA(n_components=0.95)  # keep 95% of the variance
pca.fit(X)
X_pca = pca.transform(X)
X1_pca = pca.transform(X1_test)

'''Model training'''
# Split into training and test sets and fit a gradient boosting regressor
X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=40)
myGBR = GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                                  learning_rate=0.03, loss='huber',
                            g_ang)
        path = path1
        if not os.path.exists(path):
            continue
        if g_ang == '090':
            img = cv2.imread(path, 0)
        else:
            img = cv2.imread(path1, 0)
        img = cv2.resize(img, (64, 64))
        img = img.flatten().astype(np.float32)
        X.append(img)
        y.append(p - 63)
        # y.append(p)

X = np.asarray(X)
y = np.asarray(y).astype(np.int32)
pca_model = pca.PCA(n_components=int(min(X.shape) * 0.2), whiten=False)
# print(int(min(X.shape)*0.20))
pca_model.fit(X)
X = pca_model.transform(X)
lda_model = LinearDiscriminantAnalysis(n_components=45)
lda_model.fit(X, y)
X = lda_model.transform(X)
nbrs = KNeighborsClassifier(n_neighbors=1, p=2, weights='distance', metric='euclidean')
nbrs.fit(X, y)
testX = []
testy = []
w1 = 1 / (c1 / (c0 + c1 + c2))
w2 = 1 / (c2 / (c0 + c1 + c2))
clf = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=10,
    n_jobs=-1,
    class_weight={0: w0, 1: w1, 2: w2},
    # oob_score=True,
)
reducedFeatureVector = pca.PCA(n_components=64)
reducedFeatureVector.fit(X.values[learningStartIdx:learningEndIdx])
clf = clf.fit(
    reducedFeatureVector.transform(X.values[learningStartIdx:learningEndIdx]),
    Y.values[learningStartIdx:learningEndIdx])
Y_Pred = clf.predict(
    reducedFeatureVector.transform(X.values[learningStartIdx:learningEndIdx]))
print("Training Performance")
print(classification_report(Y.values[learningStartIdx:learningEndIdx], Y_Pred,
                            labels=[0, 1, 2],
                            target_names=['down', 'flat', 'up']))
        X.append(X1_Data_Norm[i, j, :])
        Y.append(Y1[[i], [j]])

X = np.array(X)
Y = np.array(Y)
X = np.nan_to_num(X)
# np.save("new_result2/Gabor_norm.npy", X)
print('PCA')
scaler = StandardScaler()
scaler.fit(X)
print(np.shape(X))
x_train = scaler.transform(X)
K = 250
model = pca.PCA(n_components=K).fit(x_train)
Z = model.transform(x_train)
print(np.shape(Z))
np.save('new_result2/Gabor_norm_pca.npy', Z)
print(np.sum(Y))
kf = KFold(n_splits=10)
li = []
for i in range(8700):
    li.append(i)
random.shuffle(li)
final_result = []
for train_n, test_n in kf.split(Z):
    X_train1 = []
    X_test1 = []