def plotWords(model_path="C:/Users/ghfiy/PycharmProjects/TwitterProcess/trained.word2vec",
              max_points=100):
    """Project word vectors to 2-D with PCA and scatter-plot the first few.

    Parameters
    ----------
    model_path : str
        Path to a word2vec-format model file.  The default keeps the
        original hard-coded location for backward compatibility.
    max_points : int
        Number of words to draw and annotate on the plot.
    """
    # BUG FIX: load_word2vec_format returns a single model object, not a
    # (w2v, d2v) pair -- the original `w2v, d2v = ...` unpacking raised
    # at runtime.  Only the word-vector model is used here anyway.
    w2v = gensim.models.Doc2Vec.load_word2vec_format(model_path)

    words_np = []     # one vector per vocabulary word
    words_label = []  # the matching word labels
    for word in w2v.vocab.keys():
        words_np.append(w2v[word])
        words_label.append(word)
    print('Added %s words. Shape %s' % (len(words_np), np.shape(words_np)))

    # Reduce the word vectors to two dimensions for plotting.
    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)

    for index, vec in enumerate(reduced):
        if index < max_points:
            x, y = vec[0], vec[1]
            plt.scatter(x, y)
            plt.annotate(words_label[index], xy=(x, y))
    plt.show()
    # NOTE(review): the original called plt.plot() after plt.show(), which
    # draws nothing on the already-closed figure; removed as dead code.
def draw(self):
    """Scatter-plot a 2-D PCA projection of this object's embedding matrix.

    Up to the first 1000 projected points are drawn and annotated with the
    label found in the doc mapper's reversed dictionary.
    """
    embeddings = self.embedding
    reversed_dictionary = self.doc_mapper.reversed_dictionary

    # Collect every embedding row alongside its label.
    words_np = [embeddings[i] for i in range(len(embeddings))]
    words_label = [reversed_dictionary[i][0] for i in range(len(embeddings))]

    # Project to two principal components for plotting.
    pca = PCA(n_components=2)
    pca.fit(words_np)
    reduced = pca.transform(words_np)

    plt.rcParams["figure.figsize"] = (20, 20)
    for index, point in enumerate(reduced):
        if index >= 1000:
            continue  # plot only the first 1000 points, as before
        x, y = point[0], point[1]
        plt.scatter(x, y)
        plt.annotate(words_label[index], xy=(x, y))
    plt.show()
from sklearn.decomposition import PCA
from sklearn import decomposition
from sklearn import datasets
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import Imputer

# Load the feature and response matrices (unpack=True transposes so each
# file column becomes a row of the loaded array).
features = np.loadtxt("features.dat", unpack=True)
response = np.loadtxt("response.dat", unpack=True)
X = np.array(features)
Y = np.array(response)

# Reduce each matrix to two components.  Use separate PCA instances: the
# original re-fit a single object for Y and then X, silently discarding
# the first fit.
# NOTE(review): PCA.fit expects a 2-D (n_samples, n_features) array; if
# response.dat yields a 1-D vector this will raise -- confirm its shape.
pca_y = PCA(n_components=2)
Y_r = pca_y.fit(response).transform(response)
pca_x = PCA(n_components=2)
X_r = pca_x.fit(features).transform(features)

# BUG FIX: `print Y_r` was a Python 2 print statement (a SyntaxError on
# Python 3); use the print function like the sibling scripts do.
print(Y_r)

plt.figure()
plt.scatter(X_r[:, 0], X_r[:, 1])
plt.title('PCA of dataset')
plt.show()
'default_spread', 'log_trade_value', 'res_day', 'vol', 'extreme_2', 'skew', 'total_size', 'coupon' ] data = monthly_cb_value[factor_list] data = data.fillna(0) ''' PCA 方法1: 直接用sklearn的包 优势:用SVD降维,更加标准 劣势:由于不知道实际的correlation matrix,不知道该怎么选择component,以及每个component对应的projection ''' from sklearn.decomposition import PCA X = np.array([[-1, 1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]]) pca = PCA(n_components=5) pca.fit(data) pca.transform(data) ''' 得到的结果:只有第一个component有意义 pca.explained_variance_ratio_ Out[21]: array([0.94569423, 0.04154984, 0.00570173, 0.00359057, 0.00259915]) ''' ''' PCA 方法2:根据定义自己来进行的PCA 优势:每一步都清楚是怎么做的 劣势:没有采用SVD,只是用correlation matrix来计算 ''' #scale the data(standardize from 0-1) from sklearn.preprocessing import StandardScaler factor_std = StandardScaler().fit_transform(data)
# Wrap the NIR, red, and green band samples as numpy matrices.
nirMatrix = numpy.matrix(totalnir)
redMatrix = numpy.matrix(totalred)
grnMatrix = numpy.matrix(totalgrn)

# In[ ]:

# Stack the three bands side by side into one feature matrix.
matrix = numpy.hstack((nirMatrix, redMatrix, grnMatrix))

# In[ ]:

matrix  # notebook cell display; no effect when run as a script

# In[ ]:

# Full PCA (all components) fit on the stacked band matrix.
pca = PCA()
pca.fit(matrix)
transform = pca.transform(matrix)

# In[ ]:

transform  # notebook cell display; no effect when run as a script

# In[ ]:

# Nir: threshold the first principal component in place.
# Values strictly below -0.14 become -2000; values strictly above -0.13
# become 0.
# NOTE(review): values in the closed interval [-0.14, -0.13] are left
# untouched -- confirm the gap between the two thresholds is intentional.
pca1 = transform[:, 0]
zeroNir = pca1 < -.14
oneNir = pca1 > -.13
pca1[zeroNir] = -2000
pca1[oneNir] = 0
# Mean of each row of trans_1.
for i in trans_1:
    t = sum(i) / float(len(i))
    avg_1.append(t)

# Euclidean distance between the two average vectors.
# NOTE(review): iterates over len(data_1[0]) entries -- assumes avg_1 and
# avg_2 both have at least that many elements; confirm upstream.
dif = 0.0
for i in range(len(data_1[0])):
    dif += (avg_1[i] - avg_2[i]) ** 2
dif = dif ** 0.5
print("Difference : ", dif)

# BUG FIX: raw_input() is Python 2 only and raises NameError on Python 3;
# the py3-style print(...) calls above show this script targets Python 3.
input("Press enter to generate graph")

# Project both datasets down to 3 components.
# NOTE(review): the PCA is re-fit on data_2, so x_1 and x_2 live in
# different component spaces; fit once on the combined data if the two
# projections must be directly comparable.
ipca = PCA(n_components=3)
ipca.fit(data_1)
x_1 = ipca.transform(data_1)
ipca.fit(data_2)
x_2 = ipca.transform(data_2)

# Scatter the first 50 projected points of dataset 1 in red.
Xs = [p[0] for p in x_1[0:50]]
Ys = [p[1] for p in x_1[0:50]]
Zs = [p[2] for p in x_1[0:50]]
ax.scatter(Xs, Ys, Zs, c='r', marker='o')
print(ddd.shape)
# Standardize the feature matrix before PCA.
ddd_scaled = scale(ddd)
print(ddd_scaled.shape)

# Fit PCA and transform in a single call.
# BUG FIX: the original called pca.fit(...) and then pca.fit_transform(...)
# on the same data, fitting the model twice for no reason (the unused
# `results` binding from the first fit is dropped).
pca = PCA(n_components=300)
a = pca.fit_transform(ddd_scaled)
print(pca.explained_variance_ratio_)

# Attach the first five principal components to the frame as new columns
# PCA1..PCA5 (the .values assignment bypasses index alignment, exactly as
# the original's repeated pd.Series(...).values pattern did).
for k in range(5):
    data["PCA%d" % (k + 1)] = pd.Series(a[:, k].tolist()).values