def plotWords(path="C:/Users/ghfiy/PycharmProjects/TwitterProcess/trained.word2vec"):
    """Load word vectors, project them to 2-D with PCA and scatter-plot
    the first 100 words annotated with their labels.

    Parameters
    ----------
    path : str
        Location of the word2vec-format model file. Defaults to the
        previously hard-coded path, so existing callers are unaffected.
    """
    # BUG FIX: load_word2vec_format returns a single model object, not a
    # (w2v, d2v) pair -- unpacking it into two names raised ValueError.
    w2v = gensim.models.Doc2Vec.load_word2vec_format(path)

    words_np = []     # one vector per vocabulary word
    words_label = []  # matching word labels, same order as words_np
    for word in w2v.vocab.keys():
        words_np.append(w2v[word])
        words_label.append(word)
    print('Added %s words. Shape %s' % (len(words_np), np.shape(words_np)))

    # Reduce the high-dimensional embeddings to 2 components for plotting.
    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)
    # plt.plot(pca.explained_variance_ratio_)

    for index, vec in enumerate(reduced):
        # Only the first 100 words are drawn to keep the figure readable.
        if index < 100:
            x, y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x, y))
    # Removed the dead trailing plt.plot() that followed plt.show().
    plt.show()
def main():
    """Train a PCA (eigenface) + KNN classifier and print a test report.

    Loads the train/test image sets, projects both onto the leading 70
    eigenfaces, fits a k-nearest-neighbours model (round-tripped through
    pickle as in the original pipeline), and prints sklearn's
    classification report for the test set.
    """
    print("add dataset into numpy array")
    train_dataset = append_feature(TRAIN_PATH)
    print("train set created successfully")
    test_dataset = append_feature(TEST_PATH)
    # BUG FIX: this message previously said "train set" after creating
    # the TEST set (copy-paste error).
    print("test set created successfully")

    # Image height/width are needed to reshape the PCA components back
    # into displayable eigenface images below.
    n_samples, h, w = train_dataset.images.shape

    X_train = train_dataset.data
    y_train = train_dataset.target
    X_test = test_dataset.data
    y_test = test_dataset.target

    n_components = 70
    pca = PCA(n_components=n_components).fit(X_train)
    eigenfaces = pca.components_.reshape((n_components, h, w))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    eigenface_titles = [
        "eigenface %d" % i for i in range(eigenfaces.shape[0])
    ]
    plot_gallery(eigenfaces, eigenface_titles, h, w)
    plt.show()

    k = 2
    knn_model = KNeighborsClassifier(n_neighbors=k)
    model_save = knn_model.fit(X_train_pca, y_train)
    # Round-trip through pickle to mirror the original save/load flow.
    saved_model = pickle.dumps(model_save)
    knn_from_pickle = pickle.loads(saved_model)

    y_predict = knn_from_pickle.predict(X_test_pca)
    print(classification_report(y_test, y_predict))
def pca(self):
    """GUI handler: run the eigenface PCA + KNN pipeline and store the
    resulting classification report in ``self.RESULT_CLASSIFICATION``.

    Does nothing unless the test-data input field contains text.
    """
    if (self.inputDataUji.toPlainText() != ''):
        print("add dataset into numpy array")
        train_dataset = append_feature(TRAIN_PATH)
        print("train set created successfully")
        test_dataset = append_feature(TEST_PATH)
        # BUG FIX: this message previously said "train set" after
        # creating the TEST set (copy-paste error).
        print("test set created successfully")

        # Image height/width are needed to reshape PCA components back
        # into displayable eigenface images.
        n_samples, h, w = train_dataset.images.shape

        X_train = train_dataset.data
        y_train = train_dataset.target
        X_test = test_dataset.data
        y_test = test_dataset.target

        n_components = 70
        pca = PCA(n_components=n_components).fit(X_train)
        eigenfaces = pca.components_.reshape((n_components, h, w))

        print(
            "Projecting the input data on the eigenfaces orthonormal basis"
        )
        X_train_pca = pca.transform(X_train)
        X_test_pca = pca.transform(X_test)

        eigenface_titles = [
            "eigenface %d" % i for i in range(eigenfaces.shape[0])
        ]
        plot_gallery(eigenfaces, eigenface_titles, h, w)
        plt.show()

        k = 2
        knn_model = KNeighborsClassifier(n_neighbors=k)
        model_save = knn_model.fit(X_train_pca, y_train)
        # Round-trip through pickle to mirror the original save/load flow.
        saved_model = pickle.dumps(model_save)
        knn_from_pickle = pickle.loads(saved_model)

        y_predict = knn_from_pickle.predict(X_test_pca)
        self.RESULT_CLASSIFICATION = classification_report(
            y_test, y_predict)
def draw(self):
    """Project the learned embeddings to 2-D with PCA and scatter-plot
    the first 1000 points, each annotated with its mapped label."""
    vecs = self.embedding
    rev_dict = self.doc_mapper.reversed_dictionary

    # Gather vectors and their labels in lockstep order.
    vectors = [vecs[idx] for idx in range(len(vecs))]
    labels = [rev_dict[idx][0] for idx in range(len(vecs))]

    reducer = PCA(n_components=2)
    reducer.fit(vectors)
    projected = reducer.transform(vectors)

    plt.rcParams["figure.figsize"] = (20, 20)
    for idx, point in enumerate(projected):
        # Only the first 1000 points are drawn, as in the original.
        if idx >= 1000:
            continue
        px, py = point[0], point[1]
        plt.scatter(px, py)
        plt.annotate(labels[idx], xy=(px, py))
    plt.show()
    'total_size', 'coupon'
]
# Select the factor columns and replace missing values with zero.
data = monthly_cb_value[factor_list]
data = data.fillna(0)
# (Translation of the note below) PCA approach 1: use sklearn directly.
# Pro: SVD-based dimensionality reduction, more standard.
# Con: without the actual correlation matrix it is unclear how to pick
# components or what projection each component corresponds to.
'''
PCA 方法1: 直接用sklearn的包
优势:用SVD降维,更加标准
劣势:由于不知道实际的correlation matrix,不知道该怎么选择component,以及每个component对应的projection
'''
from sklearn.decomposition import PCA
# NOTE(review): X is built but never used in this chunk -- it looks like
# a leftover from the sklearn PCA docs example; verify before removing.
X = np.array([[-1, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
pca = PCA(n_components=5)
pca.fit(data)
# NOTE(review): the transformed result is discarded here -- presumably
# only the explained-variance ratios below were of interest; confirm.
pca.transform(data)
# (Translation of the note below) Result: only the first component is
# meaningful -- it explains ~94.6% of the variance.
'''
得到的结果:只有第一个component有意义
pca.explained_variance_ratio_
Out[21]: array([0.94569423, 0.04154984, 0.00570173, 0.00359057, 0.00259915])
'''
# (Translation of the note below) PCA approach 2: hand-rolled PCA from
# the definition. Pro: every step is explicit. Con: no SVD -- computed
# from the correlation matrix only.
'''
PCA 方法2:根据定义自己来进行的PCA
优势:每一步都清楚是怎么做的
劣势:没有采用SVD,只是用correlation matrix来计算
'''
# Scale the data (standardize each factor to zero mean / unit variance).
from sklearn.preprocessing import StandardScaler
factor_std = StandardScaler().fit_transform(data)
# Calculating the covariance matrix (continues past this chunk).
# Convert the per-band pixel value lists into 2-D numpy matrices so they
# can be stacked column-wise below (nirMatrix is built before this chunk).
redMatrix = numpy.matrix(totalred)
grnMatrix = numpy.matrix(totalgrn)

# In[ ]:

# One column per band: NIR, red, green.
matrix = numpy.hstack((nirMatrix, redMatrix, grnMatrix))

# In[ ]:

matrix  # notebook cell: display the stacked matrix

# In[ ]:

# Fit PCA on the band matrix and project every pixel onto the components.
pca = PCA()
pca.fit(matrix)
transform = pca.transform(matrix)

# In[ ]:

transform  # notebook cell: display the projected data

# In[ ]:

# Nir
# Threshold the first principal component into two classes in place.
# NOTE(review): values in the open interval (-.14, -.13) are left
# unchanged -- confirm this gap between the thresholds is intentional.
pca1 = transform[:, 0]
zeroNir = pca1 < -.14
oneNir = pca1 > -.13
pca1[zeroNir] = -2000
pca1[oneNir] = 0

# Red (Green Parks)
    # NOTE(review): this chunk starts inside a loop whose header is not
    # visible here; `i` appears to be one column/sample of data_1 whose
    # mean is being accumulated into avg_1 -- confirm against the caller.
    t = sum(i) / float(len(i))
    avg_1.append(t)

dif = 0.0
# Euclidean distance between the two mean vectors avg_1 and avg_2.
for i in range(len(data_1[0])):
    dif += (avg_1[i] - avg_2[i])**2
dif = dif**0.5
print("Difference : ", dif)
#print(len(data_1[0])==len(data_2[0]))
# print(len(data_2[0]))

# Python 2 built-in; pauses so the user can trigger the plot.
raw_input("Press enter to generate graph")

# Reduce both datasets to 3 components for a 3-D scatter plot.
# NOTE(review): the PCA is re-fitted on data_2 before transforming it,
# so x_1 and x_2 live in *different* projection spaces -- confirm this
# is intended before visually comparing them in one plot.
ipca = PCA(n_components=3)
ipca.fit(data_1)
x_1 = ipca.transform(data_1)
ipca.fit(data_2)
x_2 = ipca.transform(data_2)

Xs = []
Ys = []
Zs = []
# Plot the first 50 projected points of data_1 in red circles.
for i in x_1[0:50]:
    Xs.append(i[0])
    Ys.append(i[1])
    Zs.append(i[2])
ax.scatter(Xs, Ys, Zs, c='r', marker='o')
Xs = []