def get_data(pca_ON=False, print_shapes=False):
    """Load the MNIST training CSV and split it into train/test sets.

    Parameters
    ----------
    pca_ON : bool
        If True, fit a 30-component PCA on the training split and project
        both splits into that space.
    print_shapes : bool
        If True, print the array shapes (and the PCA eigenvector shape
        when pca_ON is set).

    Returns
    -------
    dict
        {'train': (Xtrain, Ytrain), 'test': (Xtest, Ytest)}
    """
    # BUG FIX: DataFrame.as_matrix() was removed in pandas 1.0 --
    # .to_numpy() is the supported replacement.
    data = pd.read_csv('mnist_train.csv').to_numpy()

    # Column 0 holds the digit label; the remaining columns are pixels.
    # The last 10000 rows are held out as the test split.
    Xtrain = data[:-10000, 1:]
    Ytrain = data[:-10000, 0]
    Xtest = data[-10000:, 1:]
    Ytest = data[-10000:, 0]

    dataset = {}
    if pca_ON:
        pca = PCA(n_components=30)
        pca.fit(Xtrain)
        if print_shapes:
            # NOTE(review): `evecs` is an attribute of the project's own
            # PCA class (sklearn's PCA has no such attribute) -- confirm
            # against its definition.
            print('\nEigenvectors size:', pca.evecs.shape)
        Xtrain = pca.transform(Xtrain)
        Xtest = pca.transform(Xtest)

    if print_shapes:
        print('\nXtrain: {}, Ytrain: {}'.format(Xtrain.shape, Ytrain.shape))
        print('Xtest: {}, Ytest: {}'.format(Xtest.shape, Ytest.shape))

    dataset['train'] = (Xtrain, Ytrain)
    dataset['test'] = (Xtest, Ytest)
    return dataset
def main(args):
    """Cluster 16000 MNIST digits with PCA + KMeans, plot the 2-D PCA
    scatter per true label (top row) and per cluster (bottom row), and
    print the label-vs-cluster coincidence matrix."""
    # Read data file into numpy matrices.
    # IDX image format: 16-byte big-endian header (magic, count, rows,
    # cols), then one unsigned byte per pixel.
    with gzip.open(args.mnist_train_data, 'rb') as in_gzip:
        magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        # NOTE(review): only the first 16000 images are read regardless of
        # `num` -- confirm this subsample is intentional.
        all_data = np.array([np.array(struct.unpack('>{}B'.format(rows * columns), in_gzip.read(rows * columns))) for _ in range(16000)])
    # Label file: 8-byte header, then one byte per label.
    with gzip.open(args.mnist_train_labels, 'rb') as in_gzip:
        magic, num = struct.unpack('>II', in_gzip.read(8))
        all_labels = struct.unpack('>16000B', in_gzip.read(16000))
    # Group the raw images by their true digit label.
    each_label = np.empty(10, dtype = object)
    for i in range(10):
        each_label[i] = all_data[np.array(all_labels) == i]
    # Reduce to 15 principal components before clustering.
    # NOTE(review): PCA here is the project's own class -- confirm its
    # fit/transform contract matches sklearn's.
    pca = PCA(15)
    pca.fit(all_data)
    all_data_transform = pca.transform(all_data)
    kmeans_labels = KMeans(n_clusters=10, random_state=0).fit_predict(all_data_transform)
    # Group the first two PCA coordinates by predicted cluster.
    each_cluster = np.empty(10, dtype = object)
    for i in range(10):
        each_cluster[i] = all_data_transform[:,:2][np.array(kmeans_labels) == i]
    # Top row: scatter per true label; bottom row: scatter per cluster.
    f, axarr = plt.subplots(2, 10, figsize=(18, 4), sharey=True)
    for i in range(10):
        a = pca.transform(each_label[i])
        axarr[0][i].scatter(a.T[0], a.T[1], s = 1)
    for i in range(10):
        axarr[1][i].scatter(each_cluster[i].T[0], each_cluster[i].T[1], s = 1)
    #plt.show()
    # Count how often each (true label, cluster id) pair co-occurs.
    coincidence_matrix = np.zeros((10,10)).astype(int)
    for i in range(16000):
        coincidence_matrix[all_labels[i], kmeans_labels[i]]+=1
    print(coincidence_matrix)
    plt.savefig("labels_vs_kmeans_clusters.jpg")
def main(args):
    """Fit a 5-component PCA on 60000 MNIST digits and save the components
    rendered as 28x28 images to 'comps-Hrach.png'."""
    # Read data file into numpy matrices.
    # IDX image format: 16-byte big-endian header, then one byte per pixel.
    with gzip.open(args.mnist_train_data, 'rb') as in_gzip:
        magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        all_data = [np.array(struct.unpack('>{}B'.format(rows * columns), in_gzip.read(rows * columns))) for _ in range(60000)]
    # Read labels file into labels
    with gzip.open(args.mnist_train_labels, 'rb') as in_gzip:
        magic, num = struct.unpack('>II', in_gzip.read(8))
        all_labels = struct.unpack('>60000B', in_gzip.read(60000))
    # Fit the PCA and reshape each returned component into an image.
    pca = PCA(5)
    pca.fit(all_data)
    components = pca.return_components()
    components = np.reshape(components, (5, 28, 28))
    # BUG FIX: removed the dead block
    #   one = PCA(5); one.fit(); one_comp = pca.return_components()
    # `one.fit()` was called without any data (a TypeError at runtime) and
    # `one_comp` was taken from `pca`, not `one`, and never used.
    f, axarr = plt.subplots(1, 5, figsize=(18, 4), sharey=True)
    for i in range(5):
        axarr[i].imshow(components[i])
        axarr[i].set_aspect('equal')
        axarr[i].set_title('Component {}'.format(i + 1))
    plt.tight_layout()
    name = 'Hrach'
    plt.savefig('comps-{}.png'.format(name), dpi=320)
def main(args):
    """Fit a 5-component PCA on 16000 MNIST digits and save the components
    rendered as 28x28 images to 'comps-<name>.png'."""
    # Read data file into numpy matrices.
    # IDX image format: 16-byte big-endian header, then one byte per pixel.
    with gzip.open(args.mnist_train_data, 'rb') as in_gzip:
        magic, num, rows, columns = struct.unpack('>IIII', in_gzip.read(16))
        all_data = np.array([
            np.array(
                struct.unpack('>{}B'.format(rows * columns),
                              in_gzip.read(rows * columns)))
            for _ in range(16000)
        ])
    # Label file: 8-byte header, then one byte per label.
    with gzip.open(args.mnist_train_labels, 'rb') as in_gzip:
        magic, num = struct.unpack('>II', in_gzip.read(8))
        all_labels = struct.unpack('>16000B', in_gzip.read(16000))
    # (Removed the unused `zeros` selection of label-0 digits and the
    # commented-out debug imshow/show calls -- neither affected output.)
    pca = PCA(5)
    pca.fit(all_data)
    print(pca.return_components().shape)
    components = pca.return_components().reshape(5, 28, 28)
    f, axarr = plt.subplots(1, 5, figsize=(18, 4), sharey=True)
    for i in range(5):
        axarr[i].imshow(components[i])
        print(i, components[i].shape)
        axarr[i].set_aspect('equal')
        axarr[i].set_title('Component {}'.format(i + 1))
    plt.tight_layout()
    name = 'TODO'  # TODO: Replace name with your name
    plt.savefig('comps-{}.png'.format(name), dpi=320)
class MSNM:
    """Multivariate anomaly detector: a weighted PCA whose per-feature
    weights are tuned by an R2R (run-to-run) loop driven by PLS."""

    # M is the dimensionality of the input variables.
    def __init__(self, M):
        self.pca = PCA()
        # Per-feature scaling weights passed into the PCA (starts at all ones).
        self.pca_w = np.ones(M)
        self.M = M

    # Train with R2R optimization: n_component is the number of PCA
    # components, K is the number of R2R evaluations per epoch, rc is the
    # perturbation rate applied to the weights w.
    def train(self, label_train, train_labels, unlabel_train, n_component=50, K=10, rc=0.1, epoch=20):
        """Tune self.pca_w so the PCA anomaly score maximizes AUC.

        label_train   -- labeled data scored each iteration
        train_labels  -- ground-truth labels for computing roc_auc_score
        unlabel_train -- unlabeled data the PCA is fitted on
        """
        pls = PLS()
        # S collects the K sampled weight vectors, Y their AUC scores.
        S = np.empty((K, self.M))
        Y = np.empty(K)
        for e in range(epoch):
            for i in range(K):
                # Randomly perturb the current weights.
                w = self.pca_w + rc * np.random.normal(0, 1, self.M)
                self.pca.fit(unlabel_train, n_component, w)
                # Dst/Qst: the PCA's distance statistics used for scoring.
                Dst, Qst = self.pca.Dst_Qst(label_train, w)
                score = self.pca.anomaly_score(Dst, Qst)
                auc = roc_auc_score(train_labels, score)
                S[i, :] = w
                Y[i] = auc
            # Regress AUC on the weights; step along the PLS coefficients.
            # NOTE(review): the step size 3 appears hand-tuned -- confirm.
            pls.fit(S, Y, 10)
            self.pca_w = self.pca_w + (3 * pls.B).reshape(self.M)
            print("epoch ", e + 1, "/", epoch)

    # Return anomaly scores.
    def test(self, X):
        """Score samples X with the trained, weighted PCA."""
        # Compute the anomaly scores.
        Dst, Qst = self.pca.Dst_Qst(X, self.pca_w)
        scores = self.pca.anomaly_score(Dst, Qst)
        return scores
def main():
    """Run 2-component PCA + 3-way KMeans on the wine dataset, evaluate
    the clustering, visualize it, and dump the assignments to CSV."""
    n_components = 2
    n_clusters = 3
    features, labels, raw_data = data_loader('../input/wine.csv')
    # Project the features onto the leading principal components.
    projected = PCA(first_k=n_components, use_threshold=False, threshold=0.5).fit(features)
    # Cluster the projected points.
    centers, assignments = KMeans(K=n_clusters).fit(projected)
    score = evaluate(projected, labels, assignments, k=n_clusters)
    visualization(centers, projected, assignments, n_components)
    save_to_csv(raw_data, assignments)
    print(score)
from sklearn.neighbors import KNeighborsClassifier

# Eigenface-style recognition: project face images with PCA, classify the
# test projections with a 1-nearest-neighbour rule.
SHAPE = (46, 56)  # image dimensions -- presumably (cols, rows); confirm with the loader
M = 121  # number of principal components to keep
standard = False  # forwarded to the PCA's `standard` flag

data = split_data()
X_train, y_train = data['train']
# NOTE(review): the transposes below suggest samples are stored
# column-wise (D features x N samples) -- confirm against split_data().
D, N = X_train.shape
pca = PCA(n_comps=M, standard=standard)
W_train = pca.fit(X_train)  # this project's PCA returns projections from fit()
X_test, y_test = data['test']
I, K = X_test.shape
W_test = pca.transform(X_test)
# 1-NN classifier operating in the PCA subspace.
nn = KNeighborsClassifier(n_neighbors=1)
nn.fit(W_train.T, y_train.T.ravel())
y_hat = nn.predict(W_test.T)
# Flags for the example plots drawn below (one success, one failure).
done = {'success': False, 'failure': False}
fig, axes = plt.subplots(nrows=2)
# Accuracy lists for the PCA variants, one inner list per classifier.
acertoPCAS = [[], [], [], []]
# Numbers of extracted components to evaluate (1..18).
extraido = list(range(1, 19))
for j in extraido:
    erros = [0] * 4
    errosScore = [0] * 4
    # NOTE(review): `hold` (holdout repetitions) is defined outside this
    # chunk, and the loop body continues past it.
    for i in range(hold):
        pcaR = PCAR()
        pcaRS = PCARS()
        # Fresh 50/50 split per repetition, seeded by the repetition index.
        train_atr, test_atr, train_classes, test_classes = train_test_split(
            baseClimate.atributos, baseClimate.classes, test_size=0.5,
            random_state=i)
        b = Base(train_classes, train_atr)  # build the training base
        pcaR.fit(b)  # fit the PCA
        pcaRS.fit(b)  # fit the PCA with score
        baseTreino = pcaR.run(Base(train_classes, train_atr), j)  # training base projected by the PCA
        baseTeste = pcaR.run(Base(test_classes, test_atr), j)  # test base projected by the PCA
        baseTreinoS = pcaRS.run(
            Base(train_classes, train_atr), j)  # training base projected by the PCA with score
        baseTesteS = pcaRS.run(
            Base(test_classes, test_atr), j)  # test base projected by the PCA with score
        qt1 = len(test_classes)
        # Errors of the classifiers - PCA
def tsne(data, label):
    """Embed `data` into 2-D with t-SNE (PCA initialization) and plot the
    embedding via the project's visualize() helper."""
    from sklearn.manifold import TSNE
    tsne = TSNE(n_components=2, init='pca')
    result = tsne.fit_transform(data)
    visualize(result, label)


if __name__ == "__main__":
    winefile = "../input/wine.csv"
    print("---------Preprocessing------------")
    dataset = DataSet()
    dataset.preprocessing(winefile)
    print("---------PCA------------")
    pca = PCA()
    # Fit the project's PCA; `dimmedset` holds the reduced data but only
    # the un-reduced path is exercised below.
    dimmedset = pca.fit(dataset.dataSet)
    # print(dimmedset)
    print("---------K-means------------")
    kmeans = Kmeans()
    # Banner text translates to "without dimensionality reduction".
    print("---------不采用降维数据------------")
    predictlabel = kmeans.cluster(dataset.dataSet)
    # print(predictlabel)
    evaluate(dataset.label, predictlabel, dataset.dataSet)
    # "Visualize the un-reduced clustering result with TSNE:"
    print("使用TSNE对不降维的聚类结果可视化:")
    tsne(dataset.dataSet, predictlabel)
    # print("---------Output------------")
    # filename = "../output/undimmedCluster.csv"
    # with open(filename, 'w', newline='') as outfile:
    #     outlist = (np.array(predictlabel)[:, np.newaxis]).tolist()
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from PCA import PCA

# Project the iris dataset onto its first two principal components and
# scatter-plot them coloured by class.
pre_data = datasets.load_iris()
X = pre_data.data
Y = pre_data.target

pca = PCA(2)  # the project-local PCA, keeping 2 components
pca.fit(X)
X_transformed = pca.transform(X)

print("Dimensions of X is " + str(X.shape))
print("Dimensions of X_transformed is" + str(X_transformed.shape))
print(Y)

x1 = X_transformed[:, 0]
x2 = X_transformed[:, 1]
plt.xlabel("Component 1")
plt.ylabel("Component 2")
# NOTE(review): plt.cm.get_cmap was deprecated in matplotlib 3.7 and
# removed in 3.9 -- consider matplotlib.colormaps['Dark2'].resampled(3)
# when upgrading.
plt.scatter(x1, x2, alpha=1, cmap=plt.cm.get_cmap('Dark2', 3), c=Y, edgecolor="red")
plt.colorbar()
""" import numpy as np from PCA import PCA import matplotlib.pyplot as plt import json import os if __name__ == "__main__": images = np.load('./data/sampled_image.npy') labels = np.load('./data/sampled_label.npy') print(images.shape, labels.shape) images_reshape = images.reshape(images.shape[0], images.shape[1]*images.shape[2]) print(images_reshape.shape) pca = PCA() pca.fit(images_reshape) data = pca.transform(2) dirs = './pic/' if not os.path.exists(dirs): os.makedirs(dirs) data_set = [] data_x = [] data_y = [] for i in range(data.shape[0]): path = "./pic/{}.jpg".format(i) sample = [data[i][0].real, data[i][1].real, labels[i], path] data_set.append(sample) data_x.append(data[i][0].real) data_y.append(data[i][1].real)
        # NOTE(review): this chunk begins mid-function (inside imageShow's
        # plotting loop) -- `i`, `pictN`, `dataFace` and the enclosing def
        # are above this view; indentation here is reconstructed.
        # NOTE(review): np.sqrt returns a float; newer matplotlib requires
        # integer subplot counts -- confirm the installed version accepts this.
        plt.subplot(np.sqrt(pictN),np.sqrt(pictN),i+1)
        # Each row is a flattened 32x32 face image, transposed for display.
        data=dataFace[i,:].reshape((32,32)).T
        # img = scipy.misc.toimage(data)
        plt.imshow(data,cmap='gray')
    plt.show()


# Show the original faces once before reconstruction.
imageShow(dataFace)

# myP=PCA(x,number=1)
# myP.train()
# Z=myP.fit()
# Xnex=myP.reconstruct(Z)
# plt.plot(Xnex[:,0],Xnex[:,1],'ro')
# plt.show()

# Reconstruct the faces from increasing numbers of principal components
# and display each reconstruction.
for i in [5,10,50,100,500]:
    myP=PCA(dataFace,number=i)
    myP.train()
    Z=myP.fit()
    Xnex=myP.reconstruct(Z)
    imageShow(Xnex)
            # NOTE(review): this chunk begins mid-function -- the counters
            # a, b, c, d and the enclosing loop/if belong to a pair-counting
            # evaluation function whose start is above this view; the
            # indentation here is reconstructed.
            c += 1
        else:
            d += 1
    # Rand-index style agreement score: agreeing pairs over all pairs.
    r = (a + d) / (a + b + c + d)
    return r


def visualize(pca_x, cluster):
    """Scatter-plot the 2-D PCA projection with one marker style per
    cluster (assumes exactly 3 clusters, ids 0/1/2)."""
    import pandas as pd
    y = np.array(cluster).reshape(-1, 1)
    # Column 2 holds the cluster id appended to the two PCA coordinates.
    x = pd.DataFrame(np.concatenate((pca_x, y), axis=-1))
    data = pd.DataFrame(x, index=x.index)
    d1 = data[x[2] == 0]
    d2 = data[x[2] == 1]
    d3 = data[x[2] == 2]
    plt.plot(d1[0], d1[1], 'r.', d2[0], d2[1], 'gx', d3[0], d3[1], 'b*')
    plt.show()


if __name__ == '__main__':
    dataset_dir = '../input/wine.data'
    x, y = load_data(dataset_dir)
    # Threshold-driven PCA: keeps as many components as the explained
    # variance threshold requires; fit() returns the projected data.
    model = PCA(threshold=0.5)
    pca = model.fit(x)
    print('dim: ', pca.shape[1])
    kmodel = KMeans()
    cluster, s = kmodel.fit(pca)
    print('r: ', evaluate(cluster, y))
    visualize(pca, cluster)
    np.savetxt('../output/output.csv', cluster, delimiter=',', fmt='%d')