def IrisMatchingBootstrap(train_features, train_classes, test_features,
                          test_classes, times, thresholds):
    """Repeat iris matching on resampled test sets and summarise the results.

    The features are first reduced with a LocallyLinearEmbedding
    (201 neighbours, 200 components) fitted on ``train_features``.  Then,
    ``times`` times, a test sample is drawn with ``selectTestSample`` and
    matched against the reduced training set with ``IrisMatching``; the two
    distance collections it returns are turned into FMR/FNMR values with
    ``calcROC`` for the given ``thresholds``.

    Returns:
        tuple: ``(fmrs, fnmrs, crr_mean, crr_upper, crr_lower)`` where
        ``fmrs``/``fnmrs`` are arrays with one row per round, and the bounds
        are ``crr_mean +/- 1.96 * std``.  Only the upper bound is clipped
        (to 1); the lower bound is returned unclipped.
    """
    reducer = LocallyLinearEmbedding(n_neighbors=201, n_components=200)
    reducer.fit(train_features)
    # transform() is used on purpose (instead of fit_transform) so that the
    # training and test features are mapped by the same out-of-sample rule.
    train_redfeatures = reducer.transform(train_features)
    test_redfeatures = reducer.transform(test_features)

    fmr_rows = []
    fnmr_rows = []
    crr_per_round = np.zeros(times)
    for round_idx in range(times):
        sample_features, sample_classes = selectTestSample(
            test_redfeatures, test_classes)
        # NOTE(review): the meaning of the trailing ``3`` is defined by
        # IrisMatching, which is not visible here.
        crr, dist_m, dist_n = IrisMatching(
            train_redfeatures, train_classes,
            sample_features, sample_classes, 3)
        fmrs, fnmrs = calcROC(dist_m, dist_n, thresholds)
        fmr_rows.append(fmrs)
        fnmr_rows.append(fnmrs)
        crr_per_round[round_idx] = crr

    crr_mean = np.mean(crr_per_round)
    half_width = np.std(crr_per_round) * 1.96
    crr_u = min(crr_mean + half_width, 1)
    crr_l = crr_mean - half_width
    return np.array(fmr_rows), np.array(fnmr_rows), crr_mean, crr_u, crr_l
def IrisMatchingRed(train_features, train_classes, test_features,
                    test_classes, n):
    """Score 1-nearest-neighbour matching after reducing features to ``n`` dims.

    Reduction rule:
      * ``n < 108``          -> LinearDiscriminantAnalysis with ``n`` components
      * ``108 <= n < 323``   -> LocallyLinearEmbedding with ``n`` components
                                and ``n + 1`` neighbours
      * otherwise            -> no reduction (copies of the inputs are used)

    Returns:
        tuple: ``(l1crr, l2crr, coscrr)`` - the fraction of test samples
        classified correctly by a 1-NN classifier using the ``'l1'``,
        ``'l2'`` and ``'cosine'`` metrics respectively.
    """
    train_redfeatures = train_features.copy()
    test_redfeatures = test_features.copy()
    total = float(len(test_classes))

    if n < 108:
        reducer = LinearDiscriminantAnalysis(n_components=n)
        reducer.fit(train_features, train_classes)
        train_redfeatures = reducer.transform(train_features)
        test_redfeatures = reducer.transform(test_features)
    elif n < 323:
        reducer = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        reducer.fit(train_features)
        train_redfeatures = reducer.transform(train_features)
        test_redfeatures = reducer.transform(test_features)

    rates = []
    for metric_name in ('l1', 'l2', 'cosine'):
        knn = KNeighborsClassifier(n_neighbors=1, metric=metric_name)
        knn.fit(train_redfeatures, train_classes)
        predicted = knn.predict(test_redfeatures)
        rates.append(float(np.sum(predicted == test_classes)) / total)

    l1crr, l2crr, coscrr = rates
    return l1crr, l2crr, coscrr
def wrap_lle(x, required_d, neighbors):
    """Embed ``x`` into ``required_d`` dimensions with LLE and cache the result.

    The embedding of the fitted data (``embedding_``) is written with
    ``np.save`` under ``LLE/np_x_LLE_<required_d><neighbors>`` so that it can
    be reloaded later, and is also returned.
    """
    model = LocallyLinearEmbedding(n_components=required_d,
                                   n_neighbors=neighbors)
    embedded = model.fit(x).embedding_
    # The two numbers are concatenated without a separator in the file name.
    out_path = 'LLE/np_x_LLE_' + str(required_d) + str(neighbors)
    np.save(out_path, embedded)
    return embedded
def embed_lle(train, test, nn=10, method='standard'):
    """Embed train and test rows jointly into 2-D with LLE, scaled to [0, 1].

    ``train`` and ``test`` are concatenated, a 2-component
    LocallyLinearEmbedding (``nn`` neighbours, the given ``method``) is
    fitted on the combined data, and the transformed result is passed
    through ``MinMaxScaler`` before being split back.

    Returns:
        tuple: ``(train_2d, test_2d)`` - the first ``train.shape[0]`` rows
        and the remaining rows of the scaled embedding.
    """
    stacked = np.concatenate((train, test))
    from sklearn.manifold import LocallyLinearEmbedding

    model = LocallyLinearEmbedding(n_neighbors=nn, n_components=2,
                                   method=method)
    model.fit(stacked)
    embedded = model.transform(stacked)
    scaled = MinMaxScaler().fit_transform(embedded)

    n_train = train.shape[0]
    return scaled[:n_train], scaled[n_train:]
class _LocallyLinearEmbeddingImpl:
    """Thin adapter that forwards ``fit``/``transform`` to a wrapped ``Op``.

    The hyper-parameters given at construction are stored and passed
    unchanged to ``Op(**hyperparams)``.
    """

    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is forwarded only when it is given."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's transform of ``X``."""
        return self._wrapped_model.transform(X)
def main(args=None):
    """Embed the dataset with LLE into ``K`` dimensions and plot it per class.

    Labels returned by ``load_data`` are single characters that are mapped
    to integers with ``ord(label) - 65`` (so ``'A'`` becomes 0).  The figure
    is saved to ``./plot/lle_<K>D.png`` and then shown.

    Args:
        args: Unused; kept for call compatibility.

    Raises:
        NotImplementedError: If ``K`` is neither 2 nor 3.
    """
    phase = "LLE"
    random.seed(SEED)
    np.random.seed(SEED)

    x, y = load_data(DATAPATH)
    y = np.asarray([ord(l) - 65 for l in y])

    # The split is computed but not used below: the embedding is fitted on
    # (and the plot shows) the full dataset ``x``.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=SEED)

    PLOTPATH = "./plot/lle_" + str(K) + "D.png"

    lle = LocallyLinearEmbedding(n_components=K)
    lle.fit(x)
    x_transformed = lle.transform(x)

    point_colors = np.asarray(COLORS)[y]   # one colour per data point
    point_sizes = np.full(N_SAMPLE, 2)     # one marker size per data point

    if K == 2:
        for i in range(N_CLASS):
            indices = np.flatnonzero(y == i)
            # Colours are selected with the same index set as the sizes;
            # the previous ``c[i]`` picked the colour of *sample* i, not of
            # class i.
            plt.scatter(x_transformed[indices, 0], x_transformed[indices, 1],
                        label=(chr(i + 65)), s=point_sizes[indices],
                        c=point_colors[indices])
    elif K == 3:
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        for i in range(N_CLASS):
            indices = np.flatnonzero(y == i)
            ax.scatter(x_transformed[indices, 0], x_transformed[indices, 1],
                       x_transformed[indices, 2], label=(chr(i + 65)),
                       s=point_sizes[indices], c=point_colors[indices],
                       marker='.')
    else:
        raise NotImplementedError

    plt.legend(title="Classes", scatterpoints=1, loc='best', ncol=4,
               fontsize=8, markerscale=3)
    plt.title(phase)
    plt.savefig(PLOTPATH)
    plt.show()
def classify_concat_lle_data(self, vis_data, sem_data, labels):
    """Cross-validate a linear SVM on LLE-reduced visual+semantic features.

    For each stratified fold (``self.n_folds`` folds, shuffled) the visual
    and semantic rows are L2-normalised, the *test* semantic rows are
    degraded with ``SemanticDegradation.kill_semantic_attributes`` at
    ``self.degradation_rate`` and re-normalised, and both parts are
    concatenated.  An LLE with ``sem_data.shape[1]`` components and 20
    neighbours is fitted on the training rows, and a
    ``StandardScaler`` + linear ``SVC`` pipeline is trained on the reduced
    data.

    Returns:
        list: The balanced accuracy score of every fold, in fold order.
    """
    def _l2_rows(matrix):
        # Row-wise L2 normalisation on a copy of the input.
        return normalize(matrix, norm='l2', axis=1, copy=True)

    accuracies = []
    lle = LocallyLinearEmbedding(n_components=sem_data.shape[1],
                                 n_neighbors=20)
    skf = StratifiedKFold(n_splits=self.n_folds, random_state=None,
                          shuffle=True)

    for fold, (train_index, test_index) in enumerate(
            skf.split(vis_data, labels)):
        logging.info('Running LLE classification for fold %d' % fold)

        tr_vis = _l2_rows(vis_data[train_index])
        te_vis = _l2_rows(vis_data[test_index])
        tr_sem = _l2_rows(sem_data[train_index])
        te_sem = _l2_rows(sem_data[test_index])

        # Only the test-side semantic attributes are degraded.
        te_sem = SemanticDegradation.kill_semantic_attributes(
            te_sem, self.degradation_rate)
        te_sem = _l2_rows(te_sem)

        tr_data = np.hstack((tr_vis, tr_sem))
        te_data = np.hstack((te_vis, te_sem))
        tr_labels = labels[train_index][:, 0]
        te_labels = labels[test_index][:, 0]

        clf = make_pipeline(StandardScaler(),
                            SVC(gamma='auto', C=1.0, kernel='linear'))
        lle.fit(tr_data)
        clf.fit(lle.transform(tr_data), tr_labels)
        prediction = clf.predict(lle.transform(te_data))

        accuracies.append(balanced_accuracy_score(te_labels, prediction))

    return accuracies
def LLE(train_img, train_label, img, n_components, n_neighbors=201):
    """
    It transforms the feature vector to one in a low-dimensional feature space.

    A LocallyLinearEmbedding is fitted on the training features and then
    applied to ``img``.

    :param train_img: feature vector of training images
    :param train_label: labels of training images; forwarded to ``fit`` as
        ``y`` (scikit-learn documents this argument as ignored for
        LocallyLinearEmbedding)
    :param img: feature vector of images to be transformed
    :param n_components: dimension of the new transformed feature vector
    :param n_neighbors: number of neighbours used to build the embedding;
        defaults to 201, the value that used to be hard-coded, so existing
        callers are unaffected
    :return: transformed feature vector
    """
    embedding = LocallyLinearEmbedding(n_neighbors=n_neighbors,
                                       n_components=n_components)
    embedding.fit(train_img, train_label)
    return embedding.transform(img)
def IrisMatchingRed1(train_features, train_classes, test_features,
                     test_classes, n):
    """Score an RBF-kernel SVC after reducing the features to ``n`` dims.

    Reduction rule:
      * ``n < 108``          -> LinearDiscriminantAnalysis with ``n`` components
      * ``108 <= n < 323``   -> LocallyLinearEmbedding with ``n`` components
                                and ``n + 1`` neighbours
      * otherwise            -> no reduction (copies of the inputs are used)

    Returns:
        float: Fraction of test samples whose predicted class equals the
        corresponding entry of ``test_classes``.
    """
    train_redfeatures = train_features.copy()
    test_redfeatures = test_features.copy()
    total = float(len(test_classes))

    if n < 108:
        reducer = LinearDiscriminantAnalysis(n_components=n)
        reducer.fit(train_features, train_classes)
        train_redfeatures = reducer.transform(train_features)
        test_redfeatures = reducer.transform(test_features)
    elif n < 323:
        reducer = LocallyLinearEmbedding(n_neighbors=n + 1, n_components=n)
        reducer.fit(train_features)
        train_redfeatures = reducer.transform(train_features)
        test_redfeatures = reducer.transform(test_features)

    svc = SVC(kernel='rbf')
    svc.fit(train_redfeatures, train_classes)
    predicted = svc.predict(test_redfeatures)
    n_correct = float(np.sum(predicted == test_classes))
    return n_correct / total
def LLE10FoldClf(X, y, nclf):
    """Evaluate ``nclf`` on 2-D LLE features with shuffled 10-fold CV.

    In every fold an LLE (30 neighbours, 2 components, ``'standard'``
    method) is fitted on the training rows only; both the training and the
    held-out rows are transformed with it, ``nclf`` is fitted on the former
    and scored on the latter.

    Returns:
        tuple: ``(mean_accuracy, std_accuracy)`` over the folds.
    """
    # NOTE: this is the legacy KFold call form (sample count + ``n_folds``),
    # iterated directly; it is kept as-is to match the file's import.
    folds = KFold(X.shape[0], n_folds=10, shuffle=True)
    fold_scores = []

    for train_index, test_index in folds:
        X_fit, X_eval = X[train_index], X[test_index]
        y_eval = y[test_index]
        y_fit = y[train_index]

        embedder = LocallyLinearEmbedding(n_neighbors=30, n_components=2,
                                          method='standard')
        embedder.fit(X_fit)
        rep_fit = embedder.transform(X_fit)
        rep_eval = embedder.transform(X_eval)

        nclf.fit(rep_fit, y_fit)
        predicted = nclf.predict(rep_eval)
        fold_scores.append(np.sum(predicted == y_eval) * 1.0 / y_eval.shape[0])

    return np.mean(fold_scores), np.std(fold_scores)
def runLLE(X_train, X_test, y_train, y_test, comp_range, n_neigh):
    """Sweep LLE output dimensions and score RBF and linear SVMs on each.

    For every ``n_comp`` in ``comp_range`` an LLE with ``n_neigh``
    neighbours is fitted on ``X_train``; both splits are projected and
    scored with ``SVMmodel.runSVM`` using the parameters from
    ``SVMmodel.getBestParam`` for the ``'rbf'`` and ``'linear'`` kernels.
    The 2-component projections are additionally saved with ``np.save``.
    One report per kernel is written to ``res_LLE_<kernel>_<n_neigh>.txt``.

    Returns:
        tuple: ``(rbf_scores, linear_scores)`` - lists holding the mean
        score for each entry of ``comp_range``.
    """
    kernels = ('rbf', 'linear')
    scores_by_kernel = {name: [] for name in kernels}

    for n_comp in comp_range:
        print("\nn_comp=%d\n" % (n_comp))
        transformer = LocallyLinearEmbedding(n_neighbors=n_neigh,
                                             n_components=n_comp, n_jobs=8)
        transformer.fit(X_train)
        X_train_proj = transformer.transform(X_train)
        X_test_proj = transformer.transform(X_test)

        if n_comp == 2:
            np.save('X_train_proj_2d_LLE_' + str(n_neigh), X_train_proj)
            np.save('X_test_proj_2d_LLE_' + str(n_neigh), X_test_proj)

        for name in kernels:
            score = SVMmodel.runSVM(X_train_proj, X_test_proj, y_train,
                                    y_test, SVMmodel.getBestParam(name), name)
            scores_by_kernel[name].append(score.mean())

    for kernel in kernels:
        scores = scores_by_kernel[kernel]
        best_idx = np.argmax(scores)
        best_n_comp = comp_range[best_idx]
        best_acc = scores[best_idx]

        report_path = 'res_LLE_' + kernel + '_' + str(n_neigh) + '.txt'
        with open(report_path, 'w') as f:
            for n_comp, acc in zip(comp_range, scores):
                f.write(kernel + ": n_comp = %f, acc = %f\n" % (n_comp, acc))
            f.write(kernel + ": Best n_comp = %f\n" % (best_n_comp))
            f.write(kernel + ": acc = %f\n" % (best_acc))

    return scores_by_kernel['rbf'], scores_by_kernel['linear']
def ul_LLE(X, y, random_seed, filename, verbose=False):
    """Plot the LLE reconstruction error against the number of components.

    An LLE with 10 neighbours is fitted once for every component count from
    1 up to the number of columns of ``X`` and its
    ``reconstruction_error_`` is recorded and plotted.

    Args:
        X: Feature table exposing ``.columns`` (its length sets the sweep).
        y: Forwarded to ``fit`` together with ``X``.
        random_seed: Passed to the estimator as ``random_state``.
        filename: Text prepended to the plot title.
        verbose: When true, print each reconstruction error as it is computed.
    """
    n_cols = len(X.columns)
    # Start at 1: the previous ``range(n_cols)`` requested 0 components on
    # the first pass and never reached ``n_cols``, so the fitted models did
    # not match the 1..n_cols x-axis used for the plot.
    component_counts = list(range(1, n_cols + 1))

    re_list = []
    for n_components in component_counts:
        lle = LocallyLinearEmbedding(n_neighbors=10,
                                     n_components=n_components,
                                     random_state=random_seed, n_jobs=-1)
        lle.fit(X, y)
        re_list.append(lle.reconstruction_error_)
        if verbose:
            print(lle.reconstruction_error_)

    fig, ax1 = plt.subplots()
    ax1.plot(component_counts, re_list, 'b-')
    ax1.set_xlabel('# of Components', fontsize=16)
    # Make the y-axis label, ticks and tick labels match the line color.
    ax1.set_ylabel('Mean Reconstruction Error', color='b', fontsize=16)
    ax1.tick_params('y', colors='b', labelsize=16)
    ax1.tick_params('x', labelsize=16)
    plt.grid(False)
    plt.title(filename + " LLE Mean Reconstruction Error", fontsize=16)
    fig.tight_layout()
    plt.show()
def main():
    """Compare four feature-extraction pipelines, each followed by an SVM.

    The ORL face images are loaded with ``loadORLImages`` and the following
    pipelines are run in order, printing the score returned by
    ``SVM_GridSearch`` for each:

      1. wavelet transform + PCA (50 components)
      2. PCA (50 components) on the raw images
      3. LDA (50 components) on the raw images
      4. LLE (30 neighbours, 50 components) on the raw images
    """
    def _pca_project(train_raw, test_raw):
        # Fit a 50-component PCA on the training rows (as float32), report
        # the summed explained-variance ratio, and project both splits.
        train_mat = np.float32(np.mat(train_raw))
        reducer = PCA(n_components=50)
        reducer.fit(train_mat)
        train_proj = reducer.transform(train_mat)
        print('PCA解释率%s' % sum(reducer.explained_variance_ratio_))
        test_proj = reducer.transform(np.float32(np.mat(test_raw)))
        return train_proj, test_proj

    # Load ORL (loadYaleImages() is the alternative dataset loader).
    xTrain_, yTrain, xTest_, yTest = loadORLImages(u'./att_faces', 5)

    # --- WT + PCA + SVM
    # For the Yale dataset the wavelet transform was called with (.., 100, 100).
    wt_train = np.array(wavelet_transform(xTrain_))
    wt_test = np.array(wavelet_transform(xTest_))
    xTrain, xTest = _pca_project(wt_train, wt_test)
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('WT+PCA+SVM精度为%s' % score)

    # --- PCA + SVM
    xTrain, xTest = _pca_project(xTrain_, xTest_)
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('PCA+SVM精度为%s' % score)

    # --- LDA + SVM (LDA used as a feature extractor)
    lda = LDA(n_components=50)
    lda.fit(xTrain_, yTrain)
    xTrain = lda.transform(xTrain_)   # training data after reduction
    xTest = lda.transform(xTest_)
    # ``score`` here is LDA's own classification accuracy on the test set.
    print('LDA做分类时的正确率:', lda.score(xTest_, yTest))
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('LDA+SVM精度为%s' % score)

    # --- LLE + SVM
    from sklearn.manifold import LocallyLinearEmbedding as LLE
    embedder = LLE(n_neighbors=30, n_components=50, method='standard')
    embedder.fit(xTrain_)
    xTrain = embedder.transform(xTrain_)
    xTest = embedder.transform(xTest_)
    score = SVM_GridSearch(xTrain, yTrain, xTest, yTest)
    print('LLE+SVM精度为%s' % score)
class Cluster:
    """Reduce data to 2-D with Locally Linear Embedding and cluster it.

    Two spectral-clustering implementations are run on the embedded data:
    a hand-written one (similarity matrix -> Laplacian -> eigenvectors ->
    k-means) and scikit-learn's ``SpectralClustering`` for comparison.
    Scores are collected in ``self.results`` (a pandas DataFrame) and plots
    are written under ``logs/plots/``.
    """

    # Column order of the results table.
    _RESULT_COLUMNS = ['Algorithm', 'Affinity', 'Laplacian', 'N_Vectors',
                       'Precision', 'Recall', 'F1']

    def __init__(self):
        """Initialise the state shared by the training and plotting steps."""
        self.lle = None
        self.n_clusters = None
        self.size = None
        self.iterations = None
        self.results = None
        self.n_vectors = 5
        self.affinities = ['rbf', 'nearest_neighbors']
        self.laplacians = ['custom', 'csgraph']
        self.eigvectors = [5, 15]
        self.clusters = [3, 5, 7, 8]
        # Suffix used in plot titles/file names; train() overwrites it.
        # Initialised here so SpectralClustering() also works before train().
        self.filenale_ = 'multiclass'

    def train(self, x_train, y_train, multiple=False, binary=False):
        """Embed ``x_train`` with LLE and run both spectral clusterings.

        Args:
            x_train: Training samples; reduced to 2-D before clustering.
            y_train: Ground-truth labels.  Labels equal to -1 are treated as
                0 for scoring; the caller's object is not modified.
            multiple: When true, sweep every combination of affinity,
                Laplacian, eigenvector count (and cluster count unless
                ``binary``); otherwise run a single configuration.
            binary: When true, keep 2 clusters and tag outputs ``'binary'``.
        """
        self.n_clusters = 2
        self.size = len(x_train)
        # 1..size, one entry per training item.
        self.iterations = np.arange(1, self.size + 1, dtype=np.float64)

        x_train = self.LLE(x_train)

        self.filenale_ = 'binary' if binary is True else 'multiclass'
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_train,
                         title='Training data ' + self.filenale_,
                         filename='logs/plots/training_data_' + self.filenale_)

        # Relabel -1 as 0 on a copy so the caller's labels stay untouched.
        y_train = np.array(y_train, copy=True)
        y_train[y_train == -1] = 0

        # --- custom spectral clustering
        if multiple is True:
            for affinity in self.affinities:
                for laplacian in self.laplacians:
                    for vector in self.eigvectors:
                        self.n_vectors = vector
                        if binary is True:
                            self.SpectralClustering(x_train, y_train,
                                                    affinity=affinity,
                                                    laplacian=laplacian)
                        else:
                            for n in self.clusters:
                                self.n_clusters = n
                                self.SpectralClustering(x_train, y_train,
                                                        affinity=affinity,
                                                        laplacian=laplacian)
        else:
            if binary is not True:
                self.n_clusters = 8
                self.n_vectors = 8
            self.SpectralClustering(x_train, y_train)

        # --- scikit-learn spectral clustering, for comparison
        if multiple is True:
            for affinity in self.affinities:
                sklearn_predicted = self.SklearnSP(x_train, affinity=affinity)
                title = ('SKLearn SpectralClustering Results for '
                         + self.filenale_ + ", " + 'affinity=' + affinity)
                filename = ('logs/plots/' + affinity + '_sklearn_'
                            + self.filenale_)
                self.visualize2D(x_train[:, 0], x_train[:, 1],
                                 c=sklearn_predicted, title=title,
                                 filename=filename)
        else:
            sklearn_predicted = self.SklearnSP(x_train)
            # SklearnSP() was called with its default affinity ('rbf').  The
            # loop variables ``affinity``/``laplacian`` do not exist on this
            # path, so referencing them here raised NameError.
            self.logResults(y_train, sklearn_predicted, sklearn=True,
                            affinity='rbf')
            title = ('SKLearn SpectralClustering Results for '
                     + self.filenale_ + ", " + 'affinity=rbf')
            filename = 'logs/plots/rbf_sklearn_' + self.filenale_
            self.visualize2D(x_train[:, 0], x_train[:, 1],
                             c=sklearn_predicted, title=title,
                             filename=filename)

    def SpectralClustering(self, x_train, y_train,
                           affinity='nearest_neighbors', laplacian='custom'):
        """Run the hand-written spectral clustering, then log and plot it.

        Args:
            x_train: 2-D embedded samples.
            y_train: Ground-truth labels used for scoring.
            affinity: ``'nearest_neighbors'`` uses ``NNGraph``; any other
                value uses the RBF ``SimilarityMatrix``.
            laplacian: ``'csgraph'`` uses ``csgraph.laplacian`` (unnormed);
                any other value uses ``LaplacianMatrix``.
        """
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        if laplacian == 'csgraph':
            laplacian_matrix = csgraph.laplacian(similarity_matrix,
                                                 normed=False)
        else:
            laplacian_matrix = self.LaplacianMatrix(
                similarity_matrix=similarity_matrix)

        transormed_data = self.transformDataToLaplacian(laplacian_matrix)

        # ``precompute_distances`` is no longer accepted by current
        # scikit-learn; 'auto' was its default, so omitting it is equivalent.
        model = cluster.KMeans(n_clusters=self.n_clusters, random_state=0)
        predicted = model.fit(transormed_data).labels_

        self.logResults(y_train, predicted, affinity=affinity,
                        laplacian=laplacian)
        title = ('Custom SpectralClustering Results ' + self.filenale_ + ", "
                 + 'affinity=' + affinity + ", laplacian=" + laplacian
                 + ", vectors=" + str(self.n_vectors))
        filename = ('logs/plots/' + affinity + '_' + laplacian + "_"
                    + str(self.n_vectors) + "_" + str(self.n_clusters)
                    + '_custom_' + self.filenale_)
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=predicted,
                         title=title, filename=filename)

    def transformDataToLaplacian(self, laplacian_matrix):
        """Return the spectral embedding derived from ``laplacian_matrix``.

        The eigenvectors belonging to the ``self.n_vectors`` smallest
        eigenvalues are selected and the first of them (eigenvalue close or
        equal to 0) is dropped, giving one row per sample and up to
        ``self.n_vectors - 1`` columns.
        """
        eigval, eigvec = np.linalg.eig(laplacian_matrix)
        order = np.argsort(eigval)[:self.n_vectors]
        # Column selection replaces the element-wise copy that relied on
        # ``np.asscalar`` (removed from NumPy).  ``np.real`` is a no-op for
        # real eigenvectors and keeps the output float64 otherwise.
        selected = np.real(eigvec[:, order[1:]])
        return np.ascontiguousarray(selected, dtype=np.float64)

    def LLE(self, data):
        """Return ``data`` reduced to 2-D with LocallyLinearEmbedding.

        The estimator is created and fitted on the first call only; later
        calls reuse it to transform their data.
        """
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)
        return self.lle.transform(data)

    def NNGraph(self, data, limit=0.4):
        """Return a dense radius-neighbours graph of ``data``.

        Entries hold the Euclidean distance between two points when it is
        at most ``limit`` and 0 otherwise (self-connections excluded).
        """
        graph = radius_neighbors_graph(data, limit, mode='distance',
                                       metric='minkowski', p=2,
                                       metric_params=None, include_self=False)
        return graph.toarray()

    def SimilarityMatrix(self, data, limit=0.4):
        """Return the pairwise RBF similarity matrix (sigma = 0.5).

        The diagonal is left at 0.  ``limit`` is accepted for interface
        compatibility but is not used.
        """
        size = len(data)
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        # rbf(a, b) == rbf(b, a), so each pair is evaluated once and mirrored.
        for i in range(size):
            for j in range(i + 1, size):
                value = self.rbf(data[i], data[j], 0.5)
                similarity_matrix[i][j] = value
                similarity_matrix[j][i] = value
        return similarity_matrix

    def LaplacianMatrix(self, similarity_matrix):
        """Return the symmetric normalised Laplacian I - D^-1/2 W D^-1/2.

        ``D`` is the diagonal matrix of column sums of ``similarity_matrix``.
        Previously only ``D^-1/2 W D^-1/2`` was returned, whose smallest
        eigenvalue is not 0, although ``transformDataToLaplacian`` selects
        the smallest eigenvalues and skips the first as the zero one.
        Nodes with zero degree get a zero scaling factor instead of a
        division by zero.
        """
        weights = np.asarray(similarity_matrix, dtype=np.float64)
        degree = np.sum(weights, axis=0)
        inv_sqrt = np.zeros_like(degree)
        connected = degree > 0
        inv_sqrt[connected] = degree[connected] ** (-0.5)
        normalised = weights * inv_sqrt[:, np.newaxis] * inv_sqrt[np.newaxis, :]
        return np.eye(weights.shape[0]) - normalised

    def SklearnSP(self, x_train, affinity='rbf'):
        """Return the labels from scikit-learn's ``SpectralClustering``."""
        model = cluster.SpectralClustering(n_clusters=self.n_clusters,
                                           affinity=affinity)
        # fit_predict() already fits the model; a separate fit() beforehand
        # only repeated the whole computation.
        return model.fit_predict(x_train)

    def rbf(self, a, b, sigma):
        """Return exp(-||a - b||^2 / sigma^2)."""
        distance = self.VectorLength(self.VectorSub(a, b))
        return math.exp(-math.pow(distance, 2) / math.pow(sigma, 2))

    def VectorLength(self, v):
        """Return the Euclidean length of vector ``v``."""
        return math.sqrt(sum(item * item for item in v))

    def VectorSub(self, a, b):
        """Return ``a - b`` as a float64 array, or None if lengths differ."""
        if len(a) != len(b):
            return None
        v = np.zeros(len(a), dtype=np.float64)
        for i, (left, right) in enumerate(zip(a, b)):
            v[i] = left - right
        return v

    def visualize2D(self, x, y, c=None, title='', filename=None):
        """Scatter-plot 2-D points, optionally coloured by ``c``.

        The figure is saved to ``filename + '.png'`` when ``filename`` is
        given and shown interactively otherwise.
        """
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=16)
        cmap = 'viridis'
        dot_size = 50

        if c is not None:
            # One scatter call with the full colour vector.  The previous
            # loop over cluster indices sliced ``c`` by ``self.size``, which
            # yields all of ``c`` on the first pass and empty slices after.
            ax.scatter(x, y, c=c, s=dot_size, cmap=cmap)
        else:
            ax.scatter(x, y, s=dot_size)

        if filename is not None:
            plt.savefig(filename + '.png')
            plt.clf()
            plt.close()
        else:
            plt.show()

    def logResults(self, y_test, prediction, sklearn=False, affinity='rbf',
                   laplacian='custom'):
        """Append macro precision/recall/F1 of ``prediction`` to the results.

        The values are rounded to three decimals.  The results table is
        created on demand, so calling ``setupResults`` first is optional.
        """
        if sklearn is True:
            algorithm = 'SKLearn Spectral Clustering'
        else:
            algorithm = 'Custom Spectral Clustering'

        result = metrics.precision_recall_fscore_support(y_test, prediction,
                                                         average='macro')
        self._appendResult({
            'Algorithm': algorithm,
            'Affinity': affinity,
            'N_Vectors': str(self.n_vectors),
            'Laplacian': laplacian,
            'Precision': float("%0.3f" % result[0]),
            'Recall': float("%0.3f" % result[1]),
            'F1': float("%0.3f" % result[2])})

    def _appendResult(self, row):
        """Add one ``row`` (a dict keyed by column name) to ``self.results``."""
        # DataFrame.append() no longer exists in current pandas; concat is
        # the supported replacement.
        row_frame = pd.DataFrame([row], columns=self._RESULT_COLUMNS)
        if self.results is None or len(self.results) == 0:
            self.results = row_frame
        else:
            self.results = pd.concat([self.results, row_frame],
                                     ignore_index=True)

    def setupResults(self):
        """Reset ``self.results`` to an empty table with the result columns."""
        self.results = pd.DataFrame(columns=self._RESULT_COLUMNS)
# Plot the first three dimensions of the original data, coloured by class,
# together with the cluster centres.
fig = plt.figure(figsize=(6, 4))
# Create the 3-D axes through the figure so that they are attached to it;
# a bare ``Axes3D(fig)`` is not added to the figure by current matplotlib.
axes3D = fig.add_subplot(111, projection='3d')
axes3D.scatter3D(gm_X[:, 0], gm_X[:, 1], gm_X[:, 2], marker='o',
                 c=gm_colors[gm_y])
# Draw the centres on the 3-D axes as well.  With ``plt.scatter`` the third
# positional argument is the marker size, so the z coordinate was being
# interpreted as a size instead of a position.
axes3D.scatter3D(gm_centers[:, 0], gm_centers[:, 1], gm_centers[:, 2],
                 marker='x', c='r')
plt.title("Orignal Axis Dist with Class Label.(First 3 dims)")
plt.show()

############# perform algorithm #############
gm_lle = LocallyLinearEmbedding(n_neighbors=30, n_components=2,
                                method='standard', n_jobs=2, random_state=9)
gm_lle.fit(gm_X)
gm_S = gm_lle.transform(gm_X)
gm_Scenters = gm_lle.transform(gm_centers)

plt.scatter(gm_S[:, 0], gm_S[:, 1], marker='o', c=gm_colors[gm_y])
plt.scatter(gm_Scenters[:, 0], gm_Scenters[:, 1], marker='x', c='r')
# The embedding shown here is produced by LLE, not LDA.
plt.title("LLE Axis Dist.( 2 dims)")
plt.show()
# Data preparation: 1000 points on a sine-shaped sheet (x, random y, sin(x)).
xs = np.linspace(0, 10, 1000)
zs = np.sin(xs)
ys = np.random.random(1000)

# Create the figure first so that the 3-D axes live inside it; creating the
# axes first put them in a default-sized figure and left this one empty.
plt.figure(figsize=(20, 10))
ax = plt.axes(projection='3d')
ax.scatter(xs=xs[:300], ys=ys[:300], zs=zs[:300])
ax.scatter(xs=xs[300:600], ys=ys[300:600], zs=zs[300:600])
ax.scatter(xs=xs[600:], ys=ys[600:], zs=zs[600:])
plt.show()

x = np.vstack((xs, ys, zs)).T

# scikit-learn usage
n = 50  # number of neighbours
lle = LocallyLinearEmbedding(n_neighbors=n, n_components=2, method='standard')
lle.fit(x)
tranx = lle.transform(x)

# Plot the embedding, using the same three index ranges as above
print(n)
plt.scatter(tranx[:300, 0], tranx[:300, 1])
plt.scatter(tranx[300:600, 0], tranx[300:600, 1])
plt.scatter(tranx[600:, 0], tranx[600:, 1])
plt.show()

# Hand-written implementation
m, n = np.shape(x)  # NOTE: ``n`` is rebound here to the number of columns
# 1. Compute W
k = 50  # number of neighbours
W = np.zeros((m, m))
for i in range(m):
    n_distance = np.zeros((m))
# Locally Linear Embedding (LLE) from sklearn.manifold import LocallyLinearEmbedding n_neighbors = 10 n_components = 2 method = 'modified' n_jobs = 4 random_state = 2018 lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=n_components, method=method, random_state=random_state, n_jobs=n_jobs) lle.fit(X_train.loc[0:5000, :]) X_train_lle = lle.transform(X_train) X_train_lle = pd.DataFrame(data=X_train_lle, index=train_index) X_validation_lle = lle.transform(X_validation) X_validation_lle = pd.DataFrame(data=X_validation_lle, index=validation_index) scatterPlot(X_train_lle, y_train, "Locally Linear Embedding") # In[ ]: # t-SNE from sklearn.manifold import TSNE n_components = 2 learning_rate = 300
def _load_pickle(path):
    """Return the object pickled at ``path``, closing the file even on error."""
    # NOTE: pickle.load can execute arbitrary code; these files are caches
    # written by this script and must not come from an untrusted source.
    with open(path, 'rb') as file:
        return pickle.load(file)


def _stack_mnist_samples(samples, dimension_of_data):
    """Turn ``(label, pixels)`` samples into a feature matrix and label vector.

    Returns a float array with one flattened image per row and a float
    vector of labels.  The arrays are allocated once instead of being
    re-created with ``np.vstack`` for every sample.
    """
    number_of_samples = len(samples)
    X = np.empty((number_of_samples, dimension_of_data))
    y = np.empty(number_of_samples)
    for sample_index, (label, pixels) in enumerate(samples):
        print('sample ' + str(sample_index) + ' from '
              + str(number_of_samples) + ' samples...')
        X[sample_index, :] = np.reshape(pixels, dimension_of_data)
        y[sample_index] = label
    return X, y


def main():
    """Embed a dataset, classify the embedding and save accuracy/error.

    The run is configured by the constants in the "settings" section.
    Datasets and cross-validation splits are either rebuilt and cached as
    pickles or loaded from those caches, depending on the ``*_again``
    flags.  Embedding, classification and result saving are performed for
    the ``'MNIST'`` dataset only.

    Raises:
        ValueError: If ``embedding_method`` is not one of the supported names.
    """
    # ----- settings:
    dataset = 'MNIST'  # --> 'Facial' or 'MNIST' or 'Breast_cancer'
    embedding_method = 'Isomap'
    n_components = 5
    split_in_cross_validation_again = False
    load_dataset_again = False
    subset_of_MNIST = True
    pick_subset_of_MNIST_again = False
    MNIST_subset_cardinality_training = 10000  # picking from first samples of 60,000 samples
    MNIST_subset_cardinality_testing = 5000  # picking from first samples of 10,000 samples

    # ----- paths:
    if dataset == 'Facial':
        path_dataset = './input/att_database/'
        path_dataset_save = './input/pickle_dataset/Facial/'
    elif dataset == 'MNIST':
        path_dataset = './input/mnist/'
        path_dataset_save = './input/pickle_dataset/MNIST/'
    elif dataset == 'Breast_cancer':
        path_dataset = './input/Breast_cancer_dataset/wdbc_data.txt'
        # NOTE(review): this points at the MNIST cache directory; it is not
        # used for this dataset below - confirm whether that is intended.
        path_dataset_save = './input/pickle_dataset/MNIST/'

    # ----- Loading dataset:
    print('Reading dataset...')
    if dataset == 'MNIST':
        if load_dataset_again:
            training_data = list(
                read_MNIST_dataset(dataset="training", path=path_dataset))
            testing_data = list(
                read_MNIST_dataset(dataset="testing", path=path_dataset))
            dimension_of_data = 28 * 28
            X_train, y_train = _stack_mnist_samples(training_data,
                                                    dimension_of_data)
            X_test, y_test = _stack_mnist_samples(testing_data,
                                                  dimension_of_data)
            save_variable(X_train, 'X_train', path_to_save=path_dataset_save)
            save_variable(y_train, 'y_train', path_to_save=path_dataset_save)
            save_variable(X_test, 'X_test', path_to_save=path_dataset_save)
            save_variable(y_test, 'y_test', path_to_save=path_dataset_save)
        else:
            X_train = _load_pickle(path_dataset_save + 'X_train.pckl')
            y_train = _load_pickle(path_dataset_save + 'y_train.pckl')
            X_test = _load_pickle(path_dataset_save + 'X_test.pckl')
            y_test = _load_pickle(path_dataset_save + 'y_test.pckl')

        if subset_of_MNIST:
            if pick_subset_of_MNIST_again:
                X_train_picked = X_train[0:MNIST_subset_cardinality_training, :]
                X_test_picked = X_test[0:MNIST_subset_cardinality_testing, :]
                y_train_picked = y_train[0:MNIST_subset_cardinality_training]
                y_test_picked = y_test[0:MNIST_subset_cardinality_testing]
                save_variable(X_train_picked, 'X_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(X_test_picked, 'X_test_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_train_picked, 'y_train_picked',
                              path_to_save=path_dataset_save)
                save_variable(y_test_picked, 'y_test_picked',
                              path_to_save=path_dataset_save)
            else:
                X_train_picked = _load_pickle(
                    path_dataset_save + 'X_train_picked.pckl')
                X_test_picked = _load_pickle(
                    path_dataset_save + 'X_test_picked.pckl')
                y_train_picked = _load_pickle(
                    path_dataset_save + 'y_train_picked.pckl')
                y_test_picked = _load_pickle(
                    path_dataset_save + 'y_test_picked.pckl')
            X_train = X_train_picked
            X_test = X_test_picked
            y_train = y_train_picked
            y_test = y_test_picked
        image_shape = (28, 28)
    elif dataset == 'Facial':
        if load_dataset_again:
            X, y, image_shape = read_image_dataset(dataset_path=path_dataset,
                                                   imagesType='.jpg')
            save_variable(variable=X, name_of_variable='X',
                          path_to_save=path_dataset_save)
            save_variable(variable=y, name_of_variable='y',
                          path_to_save=path_dataset_save)
            save_variable(variable=image_shape,
                          name_of_variable='image_shape',
                          path_to_save=path_dataset_save)
        else:
            X = _load_pickle(path_dataset_save + 'X.pckl')
            y = _load_pickle(path_dataset_save + 'y.pckl')
            image_shape = _load_pickle(path_dataset_save + 'image_shape.pckl')
    elif dataset == 'Breast_cancer':
        # read text file using pandas dataFrame:
        # https://stackoverflow.com/questions/21546739/load-data-from-txt-with-pandas
        data = pd.read_csv(path_dataset, sep=",", header=None)
        labels_of_classes = ['M', 'B']
        X, y = read_BreastCancer_dataset(data=data,
                                         labels_of_classes=labels_of_classes)
        # ---> otherwise MDS has error -->
        # https://stackoverflow.com/questions/16990996/multidimensional-scaling-fitting-in-numpy-pandas-and-sklearn-valueerror
        X = X.astype(np.float64)

    # --- cross validation:
    path_to_save = './input/split_data/'
    portion_of_test_in_dataset = 0.3
    number_of_folds = 10
    if split_in_cross_validation_again:
        train_indices_in_folds, test_indices_in_folds, \
            X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds = \
            cross_validation(X=X, y=y, n_splits=number_of_folds,
                             test_size=portion_of_test_in_dataset)
        save_variable(train_indices_in_folds, 'train_indices_in_folds',
                      path_to_save=path_to_save)
        save_variable(test_indices_in_folds, 'test_indices_in_folds',
                      path_to_save=path_to_save)
        save_variable(X_train_in_folds, 'X_train_in_folds',
                      path_to_save=path_to_save)
        save_variable(X_test_in_folds, 'X_test_in_folds',
                      path_to_save=path_to_save)
        save_variable(y_train_in_folds, 'y_train_in_folds',
                      path_to_save=path_to_save)
        save_variable(y_test_in_folds, 'y_test_in_folds',
                      path_to_save=path_to_save)
        for fold_index in range(number_of_folds):
            save_np_array_to_txt(
                np.asarray(train_indices_in_folds[fold_index]),
                'train_indices_in_fold' + str(fold_index),
                path_to_save=path_to_save)
            save_np_array_to_txt(
                np.asarray(test_indices_in_folds[fold_index]),
                'test_indices_in_folds' + str(fold_index),
                path_to_save=path_to_save)
    else:
        train_indices_in_folds = _load_pickle(
            path_to_save + 'train_indices_in_folds.pckl')
        test_indices_in_folds = _load_pickle(
            path_to_save + 'test_indices_in_folds.pckl')
        X_train_in_folds = _load_pickle(path_to_save + 'X_train_in_folds.pckl')
        X_test_in_folds = _load_pickle(path_to_save + 'X_test_in_folds.pckl')
        y_train_in_folds = _load_pickle(path_to_save + 'y_train_in_folds.pckl')
        y_test_in_folds = _load_pickle(path_to_save + 'y_test_in_folds.pckl')

    print(X_train.shape)
    print(X_test.shape)

    # ----- embedding:
    print('Embedding...')
    if dataset == 'MNIST':
        # Estimators are built lazily so that only the selected one is
        # constructed.  They are grouped by how they are applied.

        # Fitted on the training data alone, then applied to both splits.
        unsupervised = {
            'LLE': lambda: LLE(n_neighbors=5, n_components=n_components,
                               method='standard'),
            'Isomap': lambda: Isomap(n_neighbors=5,
                                     n_components=n_components),
            'PCA': lambda: PCA(n_components=n_components),
            'KernelPCA': lambda: KernelPCA(n_components=n_components,
                                           kernel='rbf'),
        }
        # Same, but the training labels are passed to fit().
        supervised = {
            'LDA': lambda: LDA(n_components=n_components),
            'SPCA': lambda: SPCA(n_components=n_components),
            'ML': lambda: ML(n_components=n_components),
            'Kernel_FLDA': lambda: Kernel_FLDA(n_components=n_components,
                                               kernel='linear'),
        }
        # Applied with fit_transform() to train and test stacked together,
        # after which the result is split back by row count.
        joint = {
            'MDS': lambda: MDS(n_components=n_components),
            'LaplacianEigenmap': lambda: LaplacianEigenmap(
                n_neighbors=5, n_components=n_components),
        }

        number_of_training_rows = X_train.shape[0]
        if embedding_method in unsupervised:
            clf = unsupervised[embedding_method]()
            clf.fit(X=X_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method in supervised:
            clf = supervised[embedding_method]()
            clf.fit(X=X_train, y=y_train)
            X_train_projected = clf.transform(X=X_train)
            X_test_projected = clf.transform(X=X_test)
        elif embedding_method in joint:
            clf = joint[embedding_method]()
            X_projected = clf.fit_transform(X=np.vstack([X_train, X_test]))
            X_train_projected = X_projected[:number_of_training_rows, :]
            X_test_projected = X_projected[number_of_training_rows:, :]
        elif embedding_method == 'TSNE':
            clf = TSNE(n_components=min(3, n_components))
            X_projected = clf.fit_transform(
                X=np.vstack([X_train, X_test]),
                y=np.asarray(list(y_train) + list(y_test)))
            X_train_projected = X_projected[:number_of_training_rows, :]
            X_test_projected = X_projected[number_of_training_rows:, :]
        elif embedding_method == 'No_embedding':
            X_train_projected = X_train
            X_test_projected = X_test
        else:
            # Fail here with a clear message instead of a NameError later on.
            raise ValueError('Unknown embedding_method: '
                             + str(embedding_method))

        # --- classification:
        print('Classification...')
        clf = NB()
        clf.fit(X=X_train_projected, y=y_train)
        y_pred = clf.predict(X=X_test_projected)
        accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
        error = 1 - accuracy

        # --- saving results:
        save_variable(accuracy, 'accuracy', path_to_save='./output/MNIST/')
        save_np_array_to_txt(np.asarray(accuracy), 'accuracy',
                             path_to_save='./output/MNIST/')
        save_variable(error, 'error', path_to_save='./output/MNIST/')
        save_np_array_to_txt(np.asarray(error), 'error',
                             path_to_save='./output/MNIST/')

        # --- report results:
        print(' ')
        print('Accuracy: ', accuracy * 100)
        print(' ')
        print('Error: ', error * 100)
te_vis = normalize(vis_data[test_index], norm='l2', axis=1, copy=True) tr_sem = normalize(sem_data[train_index], norm='l2', axis=1, copy=True) te_sem = normalize(sem_data[test_index], norm='l2', axis=1, copy=True) tr_data, te_data = np.hstack((tr_vis, tr_sem)), np.hstack((te_vis, te_sem)) tr_labels, te_labels = labels[train_index][:, 0], labels[test_index][:, 0] clf = make_pipeline(StandardScaler(), SVC(gamma='auto', C=1.0, kernel='linear')) pca.fit(tr_data) clf.fit(pca.transform(tr_data), tr_labels) prediction = clf.predict(pca.transform(te_data)) print('PCA: %f' % balanced_accuracy_score(te_labels, prediction)) lle.fit(tr_data) clf.fit(lle.transform(tr_data), tr_labels) prediction = clf.predict(lle.transform(te_data)) print('LLE: %f' % balanced_accuracy_score(te_labels, prediction)) iso.fit(tr_data) clf.fit(iso.transform(tr_data), tr_labels) prediction = clf.predict(iso.transform(te_data)) print('ISO: %f' % balanced_accuracy_score(te_labels, prediction)) break elapsed = time.time() - init_time hours, rem = divmod(elapsed, 3600) minutes, seconds = divmod(rem, 60) time_elapsed = '{:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds)
def main():
    """Command-line driver: reduce a galaxy catalog to 3 dimensions and plot it.

    Python 2 code (print statements). Parses ``--alg`` (default ``'MLLE'``)
    and a positional ``catalog`` path, reads the catalog with ``Table.read``,
    and then, for each neighbourhood size in ``n_neighbors``, runs one of
    three branches selected by ``--alg``:

    * ``MLLE`` / ``LLE`` / ``LTSA`` / ``HLLE``: a locally linear embedding
      variant, saved with ``save_dimreduce`` and plotted in 3D.
    * ``ISO``: Isomap followed by 2D and 3D plots.
    * ``LDA``: linear discriminant analysis followed by three classifiers,
      with ``pdb.set_trace()`` breakpoints between the steps.

    NOTE(review): several names read below (``sample``, ``train``,
    ``traincols``, ``targets``, ``Y2``) are only ever assigned in
    commented-out lines of this function; presumably they are expected to be
    module-level globals -- confirm, otherwise those branches raise NameError.
    """
    parser = argparse.ArgumentParser(description=
                                     'Perform Dimensionality Reduction')
    parser.add_argument('--alg', type=str, default='MLLE',
                        help='Algorithm to reduce dimensionality.')
    parser.add_argument('catalog', type=str,
                        help='Specify the catalog on which to perform DimReduce.')
    args = parser.parse_args()

    #dat = Table.read('catalogs/ZEST_catalog_colors.fits')
    #training_sample = dat[0:10000]
    #testing_sample = dat[10001:20000]
    #zkeys = ['cc', 'aa', 'm20', 'gg']

    # Catalog file name without directory or extension; used to label outputs.
    base = os.path.basename(args.catalog)
    filename = os.path.splitext(base)[0]

    dat = Table.read(args.catalog)
    # Catalog columns handed to prep_catalog.whiten_data as the feature set.
    mkeys = ['elipt', 'C', 'A_1a', 'G', 'M20']#

    #dat.remove_column('color')
    # Add a 'color' column when missing and write the catalog back in place.
    # NOTE(review): `sample` is not assigned anywhere in this function.
    if 'color' not in dat.colnames:
        if 'kaggle' in sample:
            dat = prep_catalog.color_data2(dat, 'gz2class')
        if 'direct' in sample:
            dat = prep_catalog.color_data(dat, 'zclass')
        dat.write(args.catalog, overwrite=True)

    #dat = prep_catalog.adjust_asym(dat, mkeys[2])
    #train, traincols, targets = prep_catalog.whiten_data(dat, mkeys)

    n_neighbors = [10,12,15,20]
    #n_neighbors = [7]
    n_components = 3

    for i, n_neigh in enumerate(n_neighbors):

        if args.alg in ['MLLE', 'LLE', 'LTSA', 'HLLE']:
            # Translate the CLI name into the `method` keyword passed to LLE.
            if args.alg == 'MLLE':
                method = 'modified'
            elif args.alg == 'LLE':
                method = 'standard'
            elif args.alg == 'LTSA':
                method = 'ltsa'
            elif args.alg == 'HLLE':
                method = 'hessian'

            #replace_panoptes(dat)
            #pdb.set_trace()
            #sample = 'directbig_panoptes'

            # NOTE(review): two values are unpacked here but three in the LDA
            # branch (`X, Xc, y`); confirm what whiten_data returns.
            X, y = prep_catalog.whiten_data(dat, mkeys)

            (dat1, dat2),(thing1,thing2) = split_samples(dat, dat,[0.75, 0.35], random_state=0)
            (X_train, X_test), (y_train, y_test) = split_samples(X, y, [0.75, 0.35], random_state=0)

            y_train = simplify_classlabels(y_train)
            y_test = simplify_classlabels(y_test)

            #filename = 'modified_7_directbig_new'

            # The split above is overridden: the embedding is fitted on the
            # full sample X with the full set of (simplified) labels.
            X_train = X
            y_train = simplify_classlabels(y)

            #'''
            #sample ='direct_zcut'
            #Y_train, Y_test = open_previous_LLE(filename)
            #cut = np.where(X1['REDSHIFT'] <= 0.05)
            #X1_cut = X1[cut]
            #QC_plots(X1_cut)
            #Y_train = np.array(Y_train)[cut]
            #col_train = np.array(col_train)[cut]
            #X = Table(X)
            #cut_out_mixedup_region(X, np.array(Y_train))
            #'''

            print "performing "+method+" LLE with",n_neigh,\
                "nearest neighbors"
            print "on training sample of",len(X_train),"objects"

            t0 = time()
            A = LLE(n_neigh, n_components, eigen_solver='auto', method=method)
            # The model is fitted twice (fit, then fit_transform); Y_test is
            # the transform of X_train, not of X_test.
            error = A.fit(X_train).reconstruction_error_
            Y_train = A.fit_transform(X_train)
            Y_test = A.transform(X_train)
            t1 = time()
            #'''

            metadata = {'method':method, 'N':n_neigh, 'd':n_components,
                        'error':error, 'time':t1-t0, 'sample':filename+'_total'}
            save_dimreduce(dat, Y_train, y_train, metadata, filename+'_total')

            #metadata = {'method':method, 'N':n_neigh, 'd':n_components,
            #            'error':error, 'time':t1-t0, 'sample':filename+'_test'}
            #save_dimreduce(X2, Y_test, y_test, metadata, filename+'_test')

            # plot in 3D
            plot_dimreduce_3D(Y_train, y_train[:,1], Y_test, y_test[:,1],
                              method, n_neigh, error, t1-t0, filename, two=False)

        #====================================================================#
        elif args.alg == 'ISO':
            method='IsoMap'

            print "performing IsoMap with",n_neigh,"nearest neighbors"
            print "on training sample of",len(dat),"objects"

            t0 = time()
            A = Isomap(n_neigh, n_components, eigen_solver='dense')
            # NOTE(review): `train` is not assigned in this function.
            error = A.fit(train).reconstruction_error()
            Y = A.fit_transform(train)
            #Y2 = A.transform(test)
            t1 = time()
            print "%s: %.2g sec" %(args.alg, t1-t0)
            print "reconstruction error: ", error

            print "begin plotting"
            # NOTE(review): `traincols` and `sample` are not assigned here.
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=0)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=1)
            plot_dimreduce(Y, traincols, method, n_neigh, sample, axis=2)
            # NOTE(review): (t1-t0) and error are passed in the opposite
            # order to the plot_dimreduce_3D call in the LLE branch -- confirm
            # which order the function expects.
            plot_dimreduce_3D(Y, traincols, Y, traincols, method, n_neigh,
                              (t1-t0), error, sample)

        elif args.alg == 'LDA':

            print "performing LDA"

            X, Xc, y = prep_catalog.whiten_data(dat, mkeys)

            (X_train, X_test), (y_train, y_test) = split_samples(X, y,
                                                    [0.75, 0.25], random_state=0)

            DRclf = LDA(3, priors=None)
            #DRclf.fit(X_train, y_train)
            # The reducer is fitted twice on the same training data, once per
            # transform call.
            DRtrain = DRclf.fit(X_train, y_train).transform(X_train)
            DRtest = DRclf.fit(X_train, y_train).transform(X_test)

            classes = np.unique(y_train)
            colors = np.array(['darkred', 'red', 'lightsalmon',
                               'darkgreen', 'lightgreen', 'lightseagreen',
                               'indigo', 'darkviolet', 'plum'])
            plot_LDA_3D(DRtrain, y_train, classes, colors, sample)

            # Interactive breakpoint; execution stops here until continued.
            pdb.set_trace()

            #classifiers = []
            #predictions = []
            #Nparams = np.arange(1, X.shape[1]+1)
            #for nc in Nparams:
            # Classifier 1: LDA on the reduced features; prints the number of
            # test objects predicted correctly.
            clf = LDA()
            clf.fit(DRtrain, y_train)
            y_pred = clf.predict(DRtest)

            matchesLDA = (y_pred == y_test)
            print np.sum(matchesLDA)

            pdb.set_trace()

            #------------------------------------------
            # Classifier 2: 5-nearest-neighbours on the reduced features.
            from sklearn.neighbors import KNeighborsClassifier
            knc = KNeighborsClassifier(5)
            knc.fit(DRtrain, y_train)
            y_pred = knc.predict(DRtest)

            matchesKNN = (y_pred == y_test)
            print np.sum(matchesKNN)

            pdb.set_trace()
            #------------------------------------------
            # Classifier 3: astroML GMMBayes constructed with argument 9.
            from astroML.classification import GMMBayes
            gmmb = GMMBayes(9)
            gmmb.fit(DRtrain, y_train)
            y_pred = gmmb.predict(DRtest)

            matchesGMMB = (y_pred == y_test)
            print np.sum(matchesGMMB)

            pdb.set_trace()

            #------------------------------------------
            # plot the results
            fig = plt.figure(figsize=(5, 2.5))
            fig.subplots_adjust(bottom=0.15, top=0.95, hspace=0.0,
                                left=0.1, right=0.95, wspace=0.2)

            # left plot: data and decision boundary
            ax = fig.add_subplot(121)
            pdb.set_trace()
            # Columns 3 and 4 of X correspond to mkeys[3] ('G') and mkeys[4]
            # ('M20') if whiten_data preserves column order -- TODO confirm.
            im = ax.scatter(X[:, 3], X[:, 4], color=Xc, cmap=plt.cm.Spectral,
                            s=4, lw=0) #cmap=plt.cm.binary,, zorder=2
            im.set_clim(-0.5, 1)

            #im = ax.imshow(Z, origin='lower', aspect='auto',
            #               cmap=plt.cm.binary, zorder=1,
            #               extent=xlim + ylim)
            #im.set_clim(0, 1.5)

            #ax.contour(xx, yy, Z, [0.5], colors='k')

            #ax.set_xlim(xlim)
            #ax.set_ylim(ylim)

            ax.set_xlabel('$G$')
            ax.set_ylabel('$M20$')

            #pred, true = classification_loss(predictions, y_test)
            #completeness, contamination = completeness_contamination(pred, true)

            pdb.set_trace()

            #'''
            #t0 = time()
            #A = LDA(n_components, priors=None)
            #Y = A.fit_transform(train, targets)
            #Y2 = A.fit(train, targets).transform(train)
            #t1 = time()
            #print "%s: %.2g sec" %(args.alg, t1-t0)

            # NOTE(review): `A`, `train`, `Y2` and `targets` are only assigned
            # in the commented-out lines above, so the statements below look
            # like they raise NameError unless those names are globals.
            predict = A.predict(train)
            #print "Predicted classes:", predict
            #pdb.set_trace()

            #pdb.set_trace()
            #'''

            plot_LDA_3D(Y2, targets, classes, colors, sample)
            plot_LDA(Y2, targets, classes, colors, sample, axis=0)
            plot_LDA(Y2, targets, classes, colors, sample, axis=1)
            plot_LDA(Y2, targets, classes, colors, sample, axis=2)

            pdb.set_trace()
return features_train_transformed, lables, vectorizer, selector, le, features # nFeatures = np.arange(50, 1000, 50) nLocally_Linear = np.arange(20, 200, 20) data = {} for k in nLocally_Linear: features, labels, vectorizer, selector, le, features_data = preprocess("pkl/article_2_people.pkl", "pkl/lable_2_people.pkl") features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(features, labels, test_size=0.1, random_state=42) t0 = time() ll = LocallyLinearEmbedding(n_neighbors=15, n_components=k, eigen_solver='auto') ll.fit(features_train) print ("Dimension Reduction time:", round(time()-t0, 3), "s") features_train = ll.transform(features_train) features_test = ll.transform(features_test) for name, clf in [ ('AdaBoostClassifier', AdaBoostClassifier(algorithm='SAMME.R')), ('BernoulliNB', BernoulliNB(alpha=1)), ('GaussianNB', GaussianNB()), ('DecisionTreeClassifier', DecisionTreeClassifier(min_samples_split=100)), ('KNeighborsClassifier', KNeighborsClassifier(n_neighbors=50, algorithm='ball_tree')), ('RandomForestClassifier', RandomForestClassifier(min_samples_split=100)), ('SVC', SVC(kernel='linear', C=1)) ]:
# # Isomap # isomap = Isomap(n_neighbors=4, n_components=2) # isomap.fit(one_hot_data) # isomap_trans = isomap.transform(one_hot_data) # # # 可視化 # fig = plt.figure(figsize=(8,6)) # plt.scatter(isomap_trans[:, 0], isomap_trans[:, 1]) # plt.savefig("img/Isomap_Image/isomap_trans_" + str(data_num) + ".png") # # plt.show() # LocallyLinearEmbedding locally_linear_embedding = LocallyLinearEmbedding(n_neighbors=5, n_components=2) locally_linear_embedding.fit(one_hot_data) locally_linear_embedding_trans = locally_linear_embedding.transform( one_hot_data) # 可視化 fig = plt.figure(figsize=(8, 6)) plt.scatter(locally_linear_embedding_trans[:, 0], locally_linear_embedding_trans[:, 1]) plt.savefig( "img/LocallyLinearEmbedding_Image/locally_linear_embedding_trans_" + str(data_num) + ".png") # plt.show() # tSNE tSNE = TSNE(n_components=2, perplexity=30.0) tSNE_trans = tSNE.fit_transform(one_hot_data)
def localLinearEmbedding(X, y):
    """Project X onto a single Locally Linear Embedding component.

    A LocallyLinearEmbedding model with one component and the dense eigen
    solver is fitted on X, and X is then mapped through the fitted model's
    ``transform``.

    Args:
        X: samples to embed; passed straight to the estimator.
        y: accepted for signature compatibility; not used.

    Returns:
        The result of ``transform(X)`` on the fitted model.
    """
    embedder = LocallyLinearEmbedding(n_components=1, eigen_solver="dense")
    # fit() returns the estimator itself, so the two steps can be chained.
    return embedder.fit(X).transform(X)
def func_lle():
    """Run a 6-component LLE on the photometric data, then plot and export it.

    Uses names defined outside this function: ``dataset``,
    ``labeleddataset``, ``data``, ``labeleddata``, ``subclass``, ``root``,
    ``root_file`` and ``MultiAxes``.

    Steps:
      1. Fit LocallyLinearEmbedding (6 components, 50 neighbours, arpack) on
         ``dataset`` and project both ``dataset`` and ``labeleddataset``.
      2. Save a multi-panel scatter plot of the full projection as PNG and EPS
         under ``<root>/LLE/``.
      3. Write the projections to tab-separated text files and convert them
         to FITS with STILTS.
      4. Interactively produce close-up plots with STILTS for as long as the
         user answers ``0``.

    External commands are run from argument lists rather than through a
    shell, so paths or component expressions containing spaces or shell
    metacharacters reach STILTS verbatim. Expressions typed at the prompt
    must therefore not be wrapped in shell quotes.
    """
    print('\nDIMENSIONALITY REDUCTION: LLE\n')
    k = 50  # Number of neighbours used to perform LLE, chosen empirically

    # Creating the model using the photometric data
    print('Fitting the model...')
    embedding = LocallyLinearEmbedding(n_components=6, n_neighbors=k,
                                       eigen_solver='arpack')
    embedding.fit(dataset)
    print('LLE model created successfully')

    # Adjusting the data to the model
    print('Adjusting the data to the model created...')
    proj0 = embedding.transform(dataset)
    proj1 = embedding.transform(labeleddataset)

    # Full data plot
    fig = plt.figure(figsize=(8, 8))
    labels = ['LL1', 'LL2', 'LL3', 'LL4', 'LL5', 'LL6']
    ax = MultiAxes(6, fig=fig, hspace=0, wspace=0)
    ax.scatter(proj0, s=1, color=[0.75, 0, 0], marker='o', alpha=0.05)
    ax.set_labels(labels)
    plt.title('LLE\nFull data', fontsize=10)
    plotfile = root + '/LLE/' + root_file + '_LLE'
    fig.savefig(plotfile + '.png')
    fig.savefig(plotfile + '.eps')
    plt.close(fig)
    print('Triangular representation finished, check your LLE folder')

    # Saving data in ASCII format
    print('Saving obtained data from LLE in ASCII format...')
    dataheading = 'id_2MASS\tid_AllWISE\tLL1\tLL2\tLL3\tLL4\tLL5\tLL6'
    np.savetxt(plotfile + '.txt',
               np.c_[data['id_2MASS'], data['id_AllWISE'], proj0],
               header=dataheading, delimiter='\t', fmt='%s')
    # dataheading[20:] drops the two id columns ('id_2MASS\tid_AllWISE\t' is
    # 20 characters long), leaving only the LL1..LL6 names.
    np.savetxt(plotfile + '_labeled.txt',
               np.c_[proj1, labeleddata['z'], labeleddata['class'], subclass],
               header=dataheading[20:] + '\tz\tclass\tsubClass',
               delimiter='\t', fmt='%s')
    print('Data file saved successfully, check your LLE folder')

    # Saving data in FITS format
    print('Saving obtained data from LLE in FITS format...')
    stilts = ['sh', 'stilts']
    subprocess.run(stilts + ['tcopy', 'ifmt=ascii', 'ofmt=fits',
                             'in=' + plotfile + '.txt',
                             'out=' + plotfile + '.fits'])
    subprocess.run(stilts + ['tcopy', 'ifmt=ascii', 'ofmt=fits',
                             'in=' + plotfile + '_labeled.txt',
                             'out=' + plotfile + '_labeled.fits'])
    print('Data file saved successfully, check your LLE folder')

    # Individual plots
    print('If you want to make a close-up plot, write 0. If this is not the '
          'case, write anything else')
    ind = input()
    while ind == '0':
        print('Write the components you want to plot')
        mag1 = input()
        mag2 = input()
        # These plots will be done using STILTS
        plot_args = stilts + [
            'plot2plane', 'xpix=600', 'ypix=450',
            'xlabel=' + mag1, 'ylabel=' + mag2,
            'texttype=latex', 'fontsize=32', 'legend=false', 'layer=mark',
            'in=' + plotfile + '.fits', 'x=' + mag1, 'y=' + mag2,
            'shading=auto', 'size=0', 'omode=out', 'minor=false',
        ]
        out_base = 'out=' + plotfile + '_' + mag1 + mag2
        subprocess.run(plot_args + [out_base + '.png', 'ofmt=png'])
        subprocess.run(plot_args + [out_base + '.eps', 'ofmt=eps'])
        print('Close-up finished, check your LLE folder')
        print('If you want to make another close-up plot, write 0. If this is '
              'not the case, write anything else')
        ind = input()
    print('\nLLE TECHNIQUE APPLIED\n')
doc_train, doc_test, = utils.document_test_train_split(documents, 0.4) print("Doc train: ", len(doc_train)) print("Doc test: ", len(doc_test)) X_train, y_train = utils.convert_docs_to_lines(doc_train) X_test, y_test = utils.convert_docs_to_lines(doc_test) order = np.arange(len(X_train)) np.random.shuffle(order) n = 10000 X_train, y_train = (X_train[order][:n], y_train[order][:n]) ''' vect = CountVectorizer() X_train_count = vect.fit_transform(X_train) tfidf = TfidfTransformer() X_train_tfidf = tfidf.fit_transform(X_train_count) pca = TruncatedSVD(n_components=20) X_train_pca = pca.fit_transform(X_train_tfidf) isomap = LocallyLinearEmbedding(n_neighbors=5, n_components=2) LocallyLinearEmbedding.fit(X_train_pca) X_train_isomap = LocallyLinearEmbedding.transform(X_train_pca) X_test_count = vect.transform(X_test) X_test_tfidf = tfidf.transform(X_test_count) X_test_pca = pca.fit(X_test_tfidf)
class Cluster:
    """Locally Linear Embedding followed by a hand-rolled spectral clustering.

    ``train`` reduces the data to 2D with LLE, plots it, and then runs
    ``SpectralClustering``, which builds a similarity graph, takes the
    normalised graph Laplacian, embeds the points with its low-order
    eigenvectors and clusters that embedding with k-means.
    """

    def __init__(self):
        """Initialise the state that ``train`` fills in."""
        self.lle = None          # fitted LocallyLinearEmbedding, built lazily
        self.n_clusters = None   # number of k-means clusters
        self.size = None         # number of training samples
        self.iterations = None   # 1..size, x axis of the eigenvalue plot
        self.affinity = ['rbf', 'nearest_neighbors']

    def train(self, x_train, y_train, x_test, y_test, n_clusters=2):
        """Embed the data with LLE and run spectral clustering on the train set.

        Args:
            x_train: training samples.
            y_train: training labels; used to colour the plot and to score
                the clustering.
            x_test: test samples; embedded but not used further here.
            y_test: test labels; not used.
            n_clusters: number of clusters for k-means (default 2).
        """
        self.n_clusters = n_clusters
        self.size = len(x_train)
        # Sample numbers 1..size, stored as float64.
        self.iterations = np.arange(1, self.size + 1, dtype=np.float64)

        # Apply Locally Linear Embedding on training and testing data
        x_train = self.LLE(x_train)
        x_test = self.LLE(x_test)

        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_train,
                         title='Training data')
        self.SpectralClustering(x_train, y_train)

    def SpectralClustering(self, x_train, y_train,
                           affinity='nearest_neighbors'):
        """Cluster x_train spectrally and print precision/recall/F-score.

        Args:
            x_train: 2D array of samples (columns 0 and 1 are plotted).
            y_train: reference labels; a label of -1 is scored as 0. The
                caller's object is not modified.
            affinity: ``'nearest_neighbors'`` (default) builds the graph with
                ``NNGraph``; any other value uses the rbf ``SimilarityMatrix``.
        """
        if affinity == 'nearest_neighbors':
            similarity_matrix = self.NNGraph(x_train)
        else:
            similarity_matrix = self.SimilarityMatrix(x_train)

        laplacian_matrix = csgraph.laplacian(similarity_matrix, normed=True)
        y_spec = self.transformDataToLaplacian(laplacian_matrix)

        # `precompute_distances` was dropped from KMeans in current
        # scikit-learn; its old value here ('auto') was the default anyway.
        model = cluster.KMeans(n_clusters=self.n_clusters, random_state=0)
        predicted = model.fit(y_spec).labels_
        print(predicted)

        self.visualize2D(x_train[:, 0], x_train[:, 1], c=predicted,
                         title='Custom SpectralClustering')

        # Map the -1 label onto 0 on a copy so the caller's labels survive.
        reference = np.where(np.asarray(y_train) == -1, 0, y_train)
        print(
            metrics.precision_recall_fscore_support(reference, predicted,
                                                    average='macro'))

        # Run with sklearns Spectral Clustering
        #self.SklearnSP(x_train)

    @staticmethod
    def _smallest_eigenvectors(laplacian_matrix, n_eigen):
        """Return (sorted eigenvalues, eigenvectors 2..n_eigen as columns).

        The eigenvector of the smallest eigenvalue is skipped because that
        eigenvalue is close or equal to 0.
        """
        eigval, eigvec = np.linalg.eig(laplacian_matrix)
        order = np.argsort(eigval)[:n_eigen]
        # np.linalg.eig may return a complex dtype; keep the real part to
        # obtain the float64 embedding.
        embedding = np.real(eigvec[:, order[1:]]).astype(np.float64)
        return np.sort(eigval), embedding

    def transformDataToLaplacian(self, laplacian_matrix, n_eigen=5):
        """Build the spectral embedding from the Laplacian's eigenvectors.

        Plots the sorted eigenvalues against ``self.iterations`` and returns
        an array with one row per sample and ``n_eigen - 1`` columns: the
        eigenvectors of the ``n_eigen`` smallest eigenvalues, first excluded.

        Args:
            laplacian_matrix: square Laplacian matrix.
            n_eigen: how many of the smallest eigenvalues to consider
                (default 5, the value previously hard-coded).
        """
        sorted_eigval, transformed_data = self._smallest_eigenvectors(
            laplacian_matrix, n_eigen)
        self.visualize2D(self.iterations, sorted_eigval)
        return transformed_data

    def LLE(self, data):
        """Transform and return data in 2D using LocallyLinearEmbedding.

        The model is created and fitted on the first call only; later calls
        reuse that fitted model to transform their data.
        """
        if self.lle is None:
            self.lle = LocallyLinearEmbedding(n_components=2)
            self.lle.fit(data)
        return self.lle.transform(data)

    def NNGraph(self, data, limit=0.4):
        """Return the dense radius-neighbours distance graph of data.

        Only pairs at most ``limit`` apart (Minkowski, p=2) are connected and
        carry their distance; every other entry is zero.
        """
        graph = radius_neighbors_graph(data, limit, mode='distance',
                                       metric='minkowski', p=2,
                                       metric_params=None,
                                       include_self=False)
        # A = kneighbors_graph(X_mn, 2, mode='connectivity', metric='minkowski', p=2, metric_params=None, include_self=False)
        return graph.toarray()

    def SimilarityMatrix(self, data, limit=0.4):
        """Return the rbf similarity matrix (sigma=0.5) with a zero diagonal.

        ``limit`` is accepted for signature compatibility and is not used.
        """
        size = len(data)
        similarity_matrix = np.zeros((size, size), dtype=np.float64)
        # rbf(a, b) == rbf(b, a), so each pair is evaluated once.
        for i in range(size):
            for j in range(i + 1, size):
                value = self.rbf(data[i], data[j], 0.5)
                similarity_matrix[i][j] = value
                similarity_matrix[j][i] = value
        return similarity_matrix

    def DegreeMatrix(self, similarity_matrix):
        """Return the diagonal matrix holding each row sum of the input."""
        row_sums = np.sum(np.asarray(similarity_matrix, dtype=np.float64),
                          axis=1)
        return np.diag(row_sums)

    def LaplacianMatrix(self, similarity_matrix, degree_matrix):
        """Return D^(-1/2) * W * D^(-1/2) for similarity matrix W.

        D is diagonal with the column sums of W, so the result is the
        symmetrically normalised similarity matrix. ``degree_matrix`` is
        accepted for signature compatibility and is not used.
        """
        #return degree_matrix - similarity_matrix
        D = np.zeros(similarity_matrix.shape)
        w = np.sum(similarity_matrix, axis=0)
        # Step of len(w) + 1 through the flat view walks the main diagonal.
        D.flat[::len(w) + 1] = w**(-0.5)
        return D.dot(similarity_matrix).dot(D)

    def SklearnSP(self, x_train):
        """Run sklearn's SpectralClustering (rbf affinity) for comparison."""
        model = cluster.SpectralClustering(n_clusters=self.n_clusters,
                                           affinity='rbf')
        y_predict = model.fit_predict(x_train)
        self.visualize2D(x_train[:, 0], x_train[:, 1], c=y_predict,
                         title='SKLearn SpectralClustering')

    def rbf(self, a, b, sigma):
        """Return exp(-||a - b||^2 / sigma^2)."""
        distance = self.VectorLength(self.VectorSub(a, b))
        return math.exp(-math.pow(distance, 2) / math.pow(sigma, 2))

    def VectorLength(self, v):
        """Return the Euclidean length of vector v."""
        return math.sqrt(sum(item * item for item in v))

    def VectorSub(self, a, b):
        """Return a - b as a float64 array, or None if the lengths differ."""
        if len(a) != len(b):
            return None
        return (np.asarray(a, dtype=np.float64) -
                np.asarray(b, dtype=np.float64))

    def visualize2D(self, x, y, c=None, title='', filename=None):
        """Scatter-plot y against x, optionally coloured by c.

        The figure is saved to ``filename + '.png'`` when a filename is
        given; otherwise it is shown on screen.
        """
        fig, ax = plt.subplots(figsize=(13, 6))
        ax.set_title(title, fontsize=18)
        dot_size = 50

        if c is not None:
            # One scatter call colours every point, whatever the number of
            # clusters is.
            ax.scatter(x, y, c=c, s=dot_size, cmap='viridis')
        else:
            ax.scatter(x, y, s=dot_size)

        if filename is not None:
            fig.savefig(filename + '.png')
            plt.close(fig)
        else:
            plt.show()
from sklearn.manifold import LocallyLinearEmbedding
from astroML.datasets import fetch_sdss_specgals
from astroML.datasets import fetch_sdss_spectrum

# Fetch the SDSS galaxy table and list its available columns.
data = fetch_sdss_specgals()
print(data.dtype.names)

ngals = 326    # number of galaxies taken from the start of the table
nwavel = 3855  # number of samples stored per spectrum

# Identifiers needed to fetch each spectrum, plus two per-galaxy properties.
plates = data['plate'][:ngals]
mjds = data['mjd'][:ngals]
fiberIDs = data['fiberID'][:ngals]
h_alpha = data['h_alpha_flux'][:ngals]
bptclass = data['bptclass'][:ngals]

# One row per galaxy: its spectrum divided by its own mean.
specdata = np.zeros((ngals, nwavel))
for i, (plate, mjd, fiberID) in enumerate(zip(plates, mjds, fiberIDs)):
    tempdata = fetch_sdss_spectrum(plate, mjd, fiberID)
    specdata[i, :] = tempdata.spectrum/tempdata.spectrum.mean()

# Apply LLE
k = 7
for fignum, n in enumerate([2, 3]):
    # Keyword arguments: current scikit-learn rejects positional ones here.
    lle = LocallyLinearEmbedding(n_neighbors=k, n_components=n)
    lle.fit(specdata)
    proj = lle.transform(specdata)

    # First two embedded components, coloured by BPT class.
    pl.subplot(2, 1, fignum+1)
    pl.scatter(proj[:,0], proj[:,1], c=bptclass, s=50)
    pl.colorbar()

pl.show()
def eval_dimension_reduction_method(method, n_components, data, label, params, kfold=0):
    """Reduce ``data`` with one method and score an RBF SVM on the result.

    The data is min-max normalised, reduced to ``n_components`` dimensions,
    standardised, and an SVC is fitted on it. The reported score is the SVC's
    accuracy on the same data it was fitted on.

    Args:
        method: one of ``'pca'``, ``'iso'``, ``'lle'``, ``'mds'``, ``'le'``,
            ``'tsne'``.
        n_components: target dimensionality.
        data: feature array; indexed with integer index arrays when
            ``kfold != 0``.
        label: label array, indexed the same way as ``data``.
        params: method parameters. ``'n_neighbors'`` is read for ``'iso'``
            and ``'lle'``, ``'method'`` for ``'lle'``. Both mapping access
            (``params['x']``) and attribute access (``params.x``) are
            supported.
        kfold: number of stratified folds; 0 (default) uses all the data.

    Returns:
        With ``kfold == 0``: the tuple ``(normalizer, reducer, scaler, svc,
        reduced_data, label)``. With ``kfold != 0``: ``None``; the mean
        reduction time and mean score over the folds are printed instead.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    known_methods = ('pca', 'iso', 'lle', 'mds', 'le', 'tsne')
    # Checked before the heavy imports so a typo fails immediately.
    if method not in known_methods:
        raise ValueError('unknown method {!r}; expected one of {}'.format(
            method, ', '.join(known_methods)))

    import time
    from sklearn.model_selection import StratifiedKFold
    from sklearn.decomposition import PCA
    from sklearn.manifold import Isomap, LocallyLinearEmbedding, MDS, SpectralEmbedding, TSNE
    from sklearn.svm import SVC
    from sklearn.preprocessing import StandardScaler, MinMaxScaler
    import numpy as np

    def param(name):
        """Read a parameter from a mapping or from an attribute holder."""
        try:
            return params[name]
        except TypeError:
            return getattr(params, name)

    def make_reducer():
        """Build a fresh, unfitted reducer for ``method``."""
        if method == 'pca':
            return PCA(n_components=n_components, whiten=False,
                       svd_solver='auto', random_state=0)
        if method == 'iso':
            return Isomap(n_neighbors=param('n_neighbors'),
                          n_components=n_components, n_jobs=-1)
        if method == 'lle':
            return LocallyLinearEmbedding(n_neighbors=param('n_neighbors'),
                                          n_components=n_components,
                                          method=param('method'),
                                          n_jobs=-1, random_state=0)
        if method == 'mds':
            return MDS(n_components=n_components, n_init=1, random_state=0)
        if method == 'le':
            return SpectralEmbedding(n_components=n_components,
                                     random_state=0, n_jobs=-1)
        return TSNE(n_components=n_components, random_state=0)

    def make_svc():
        """Build the classifier used to score a reduced representation."""
        return SVC(kernel='rbf', gamma='scale', random_state=0,
                   decision_function_shape='ovo')

    # Normalise the data to the [0, 1] range.
    normalizer = MinMaxScaler()
    data = normalizer.fit_transform(data)

    if kfold != 0:
        # No random_state: without shuffle=True it has no effect and current
        # scikit-learn rejects the combination.
        kf = StratifiedKFold(n_splits=kfold)
        final_score = []
        final_time = []
        for train_index, test_index in kf.split(data, label):
            # Only the training part of each fold is used.
            train_data = data[train_index]
            train_label = label[train_index]

            start = time.time()
            reduced_train_data = make_reducer().fit_transform(train_data)
            end = time.time()

            # Standardise the reduced data.
            scaler = StandardScaler()
            reduced_train_data = scaler.fit_transform(reduced_train_data)
            svc = make_svc()
            svc.fit(reduced_train_data, train_label)
            score = svc.score(reduced_train_data, train_label)

            final_score.append(score)
            final_time.append(end - start)
            print('-', end='')
        final_score = np.mean(final_score)
        final_time = np.mean(final_time)
        print('{}+svm cost {:.3f} s score {}'.format(method, final_time,
                                                     final_score))
        return None

    reducer = make_reducer()
    if method in ('pca', 'iso', 'lle'):
        # These reducers are fitted and applied in two separately timed steps.
        learn_start = time.time()
        reducer.fit(data)
        learn_end = time.time()
        inference_start = time.time()
        reduced_data = reducer.transform(data)
        inference_end = time.time()
        learn_time = learn_end - learn_start
    else:
        # mds / le / tsne are run through fit_transform in a single timed step.
        inference_start = time.time()
        reduced_data = reducer.fit_transform(data)
        inference_end = time.time()
        learn_time = None

    scaler = StandardScaler()
    reduced_data = scaler.fit_transform(reduced_data)
    svc = make_svc()
    svc.fit(reduced_data, label)
    score = svc.score(reduced_data, label)

    if learn_time is not None:
        print('learn time:{:.3f} inference time:{:.3f} score:{}'.format(
            learn_time, (inference_end - inference_start), score))
    else:
        print('inference time:{:.3f} score:{}'.format(
            (inference_end - inference_start), score))
    return normalizer, reducer, scaler, svc, reduced_data, label
# Baseline: fit and evaluate the classifier on the original feature space.
# origin_time_start is set before this span, so the elapsed time printed
# below covers whatever runs between that point and the end of predict().
clf.fit(X_train, Y_train)
prediction = clf.predict(X_test)
origin_time_end = time.time()
acc_origin_space = metrics.accuracy_score(Y_test, prediction)
time_elapse = (origin_time_end - origin_time_start) * 1000  # seconds -> ms
print('原始空间的准确率:%.4f, 原始空间数据维度:%d, 耗时:%d ms。' %
      (acc_origin_space, n_features, time_elapse))

# TODO: reduce the dimensionality of the data. (The original note says LDA,
# but the model constructed here is LocallyLinearEmbedding.)
subspace_dim = 56
lle_model = LocallyLinearEmbedding(n_components=subspace_dim, n_neighbors=5,
                                   random_state=4399)
lle_model.fit(X_train)
X_train_new = lle_model.transform(X_train)
X_test_new = lle_model.transform(X_test)

# TODO: classification performance in the subspace.
# The timer starts after the embedding has been fitted and applied, so the
# reported time covers only the KNN fit and predict.
subspace_time_start = time.time()
clf_new = KNeighborsClassifier(n_neighbors=5, weights='distance')
clf_new.fit(X_train_new, Y_train)
prediction_subspace = clf_new.predict(X_test_new)
subspace_time_end = time.time()
acc_subspace_score = metrics.accuracy_score(Y_test, prediction_subspace)
time_elapse = (subspace_time_end - subspace_time_start) * 1000  # seconds -> ms
print('子空间的准确率:%.4f, 子空间数据维度:%d, 耗时:%d ms。' %
      (acc_subspace_score, subspace_dim, time_elapse))
# rescale the inputs to take on values between 0 and 1, keeping column names
x_columns = X.columns
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
X = pd.DataFrame(X, columns=x_columns)

# separate the data into training and testing: a fifth of the rows, drawn
# without replacement under a fixed seed, form the test set
np.random.seed(1)
test_idx = np.random.choice(a=X.index.values, size=int(X.shape[0] / 5), replace=False)
train_idx = np.array(list(set(X.index.values) - set(test_idx)))

# train a LocallyLinearEmbedding model on the training rows only
n_comp = 1  # number of components
component = LocallyLinearEmbedding(n_components=n_comp, n_neighbors=5,
                                   n_jobs=1, random_state=42)
component.fit(X.iloc[train_idx, :])

# compute components for all the data and add train/test labels
components = pd.DataFrame(component.transform(X),
                          columns=["LC" + str(i + 1) for i in range(n_comp)])
components["Data"] = "Train"
# X was rebuilt with a default index, so its index labels are also valid row
# labels of `components`; tag every test row in one assignment.
components.loc[test_idx, "Data"] = "Test"
# components.to_csv("lle.csv", index=False)

# combine the data and components
data = pd.concat([X, components], axis=1)

# plot correlations
corr_plot(data.drop(columns="Data"))
from sklearn import svm clf = svm.SVC() clf.fit(Xtrain, Ytrain.ravel()) pre = clf.predict(X) elif sys.argv[3] == 'ranfor': from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(max_depth=50, random_state=0) clf.fit(Xtrain, Ytrain.ravel()) pre = clf.predict(X) elif sys.argv[3] == 'lle': from sklearn.manifold import LocallyLinearEmbedding lle = LocallyLinearEmbedding(n_neighbors=int(round(TRAINING_SAMPLE / 5)), n_components=50) lle.fit(Xtrain, Ytrain) Xtrain = lle.transform(Xtrain) X = lle.transform(X) from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(max_depth=50, random_state=0) clf.fit(Xtrain, Ytrain.ravel()) pre = clf.predict(X) correct = 0 wrong = 0 for x in range(len(pre)): if pre[x] == Y[x]: correct = correct + 1 else: wrong = wrong + 1
def train_NN_LLE(filename, X_train, X_test, y_train, y_test, debug=False, numFolds=10, njobs=-1, scalar=1, make_graphs=False, pNN=None, nolegend=False, random_seed=1, num_dim=4):
    """Embed the data with LLE, grid-search an MLP on it and score an MLP.

    The LLE model (10 neighbours, ``num_dim`` components) is fitted on
    ``X_train`` and applied to both splits. A grid search over
    ``MLPClassifier`` hyper-parameters is run and its ``cv_results_`` are
    written through ``util.save_gridsearch_to_csv``. The train and test
    scores are then computed with ``nn_classifier``, the classifier object
    created with default parameters and fitted directly on the embedded
    training data.

    Args:
        filename: data file name; ``filename[:-4]`` is used to label outputs.
        X_train, X_test: feature matrices.
        y_train, y_test: labels.
        debug: forwarded as ``verbose`` to the grid search and as ``debug``
            to the learning-curve plot.
        numFolds: number of cross-validation folds for the grid search.
        njobs: parallel jobs for the grid search and the learning curve.
        scalar: forwarded to ``util.save_gridsearch_to_csv``.
        make_graphs: when true, plot a learning curve for ``nn_classifier``.
        pNN: optional dict of ``MLPClassifier`` parameters. It is applied to
            a separate classifier instance, which validates the names but is
            not otherwise used. ``None`` (default) means no parameters.
        nolegend: not used.
        random_seed: seeds ``np.random`` and the LLE model.
        num_dim: number of LLE components.

    Returns:
        ``(elapsed, train_score, test_score)``: ``elapsed`` is measured from
        the last timer reset (just before the test-set prediction) to the
        return; both scores are weighted one-vs-rest ROC AUC rounded to 4
        decimals.
    """
    if pNN is None:
        pNN = {}

    np.random.seed(random_seed)
    algo = 'LLE' + str(num_dim)

    start = time.time()
    lle = LocallyLinearEmbedding(n_neighbors=10, n_components=num_dim,
                                 random_state=random_seed, n_jobs=-1)
    lle.fit(X_train)
    X_train = lle.transform(X_train)
    X_test = lle.transform(X_test)

    param_grid = [{
        'hidden_layer_sizes': [(512, 512, 512, 512)],
        'activation': ['relu'],  # 'identity',
        'solver': ['adam'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
        'batch_size': ['auto'],
        'learning_rate_init': [0.001, 0.01],
        'max_iter': [10000],
        'warm_start': [True],
        'early_stopping': [True],
        'random_state': [1]
    }]

    nn_classifier = MLPClassifier()
    grid_search = GridSearchCV(nn_classifier, param_grid, cv=numFolds,
                               scoring='roc_auc_ovr_weighted',
                               return_train_score=True, n_jobs=njobs,
                               verbose=debug)
    grid_search.fit(X_train, y_train)
    cvres = grid_search.cv_results_

    util.save_gridsearch_to_csv(cvres, algo,
                                filename[:-4] + '-' + str(num_dim), scalar, '')

    # The timer is reset before each timed step below.
    start = time.time()
    nn_classifier.fit(X_train, y_train)
    print('NN Fit Time: ', time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_train)
    train_score = roc_auc_score(y_train, y_prob, multi_class="ovr",
                                average="weighted")
    print('NN Train Score Time: ', train_score, time.time() - start)

    start = time.time()
    y_prob = nn_classifier.predict_proba(X_test)
    test_score = roc_auc_score(y_test, y_prob, multi_class="ovr",
                               average="weighted")
    print('NN Test Score Time: ', test_score, time.time() - start)

    # NOTE(review): this classifier receives pNN but is never fitted or
    # scored; presumably it was meant to replace nn_classifier -- confirm.
    test_class = MLPClassifier()
    test_class.set_params(**pNN)

    if make_graphs:
        # computer Model Complexity/Validation curves
        util.plot_learning_curve(nn_classifier, algo, filename[:-4], X_train,
                                 y_train, ylim=(0.0, 1.05), cv=10,
                                 n_jobs=njobs, debug=debug)

    return time.time() - start, round(train_score, 4), round(test_score, 4)