def plotTrajectory(dfile):
    """Plot an optimisation trajectory as a 2-D Isomap embedding.

    The first line of ``dfile`` is read as the target (objective) vector.
    Every following line is one optimisation step; lines that do not split
    into exactly 26 whitespace-separated fields are skipped.  Target and
    steps are embedded together with a 2-component Isomap, the target is
    drawn in red and the steps are coloured by their Euclidean distance to
    the target ("winter" colormap).  The number of steps is printed.

    :param dfile: path of the whitespace-separated trajectory file.
    """
    # Read everything up front so the file handle is released before the
    # (potentially long) embedding and the blocking plt.show().
    with open(dfile) as fin:
        # Build real lists: on Python 3 ``map`` is a one-shot iterator and
        # the rows are consumed more than once below.
        Vsteps = [[float(tok) for tok in fin.readline().split()]]
        for line in fin:
            fields = line.split()
            if len(fields) != 26:
                continue
            Vsteps.append([float(tok) for tok in fields])

    distances = [euclidean(step, Vsteps[0]) for step in Vsteps[1:]]
    print(len(distances))

    # NOTE(review): the colormap is applied to the raw distances; values
    # outside [0, 1] presumably all map to the end colours -- confirm
    # whether a normalisation was intended.
    distcolors = plt.get_cmap("winter")(distances)

    embedded = Isomap(n_components=2).fit_transform(Vsteps)

    # Objective vector (row 0).
    plt.scatter(embedded[0, 0], embedded[0, 1], color='red', s=30, marker=(5, 1))
    # Optimisation steps (remaining rows).
    plt.scatter(embedded[1:, 0], embedded[1:, 1], color=distcolors, alpha=0.5)
    plt.show()
def plot_3d(dataset):
    """Embed ``dataset`` in 3-D with Isomap and draw one scatter per class.

    ``dataset.data`` is densified with ``toarray()`` before fitting, so it
    is expected to be a sparse matrix.  Each class ``i`` (rows where
    ``dataset.target == i``) gets its own colour from the rainbow colormap
    and its own legend entry taken from ``dataset.target_names``.  The
    figure is saved as 'isomap3d' (500 dpi) and then shown.

    :param dataset: object exposing ``data``, ``target`` and ``target_names``.
    :returns: always ``True``.
    """
    # Axes3D is not referenced below; presumably imported so that the '3d'
    # projection is registered on older matplotlib -- confirm before removing.
    from mpl_toolkits.mplot3d import Axes3D

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    embedding = Isomap(n_components=3).fit_transform(dataset.data.toarray())
    n_samples, n_features = embedding.shape[0], embedding.shape[1]
    print('projected: sample: %s, feature: %s' % (n_samples, n_features))

    n_classes = len(dataset.target_names)
    palette = cm.rainbow(np.linspace(0, 1, n_classes), alpha=0.5)

    handles = []
    for class_id in range(n_classes):
        members = embedding[dataset.target == class_id, :]
        handles.append(
            ax.scatter(members[:, 0], members[:, 1], members[:, 2],
                       color=palette[class_id], edgecolor='k', lw=0.1,
                       vmin=0, vmax=n_classes))

    ax.legend(handles, dataset.target_names,
              loc='lower left', scatterpoints=1)
    plt.savefig('isomap3d', dpi=500)
    plt.show()
    return True
def isomap(similarity, euclid=False):
    """Return the Isomap embedding (15 neighbours) of ``similarity``, transposed.

    :param similarity: input passed unchanged to ``Isomap.fit_transform``.
    :param euclid: when falsy, the marker string 'podvod' is printed before
        fitting; the embedding itself does not depend on this flag.
    :returns: the embedding with components along the first axis
        (``fit_transform(...).T``).
    """
    if not euclid:
        print('podvod')
    embedding = Isomap(n_neighbors=15).fit_transform(similarity)
    return embedding.T
def iso_map(data, target, target_names):
    """Project ``data`` to 2-D with Isomap and scatter it coloured by ``target``.

    A new 8x8 figure is created.  Points are coloured with a 'rainbow'
    colormap discretised into ``len(target_names)`` levels, and the colorbar
    shows one tick per distinct target value, labelled through
    ``target_names``.

    :param data: samples passed to ``Isomap.fit_transform``.
    :param target: per-sample values used for colouring; each value is
        converted with ``int()`` to index ``target_names``.
    :param target_names: sequence of class labels.
    :returns: ``(fitted Isomap estimator, projected data)``.
    """
    def _tick_label(tick, *_unused):
        # Colorbar ticks arrive as numbers; map them back to class names.
        return target_names[int(tick)]

    iso = Isomap(n_components=2)
    data_projected = iso.fit_transform(data)

    plt.figure(figsize=(8, 8))
    discrete_cmap = plt.cm.get_cmap('rainbow', len(target_names))
    plt.scatter(data_projected[:, 0], data_projected[:, 1],
                c=target, edgecolor='none', alpha=0.5, cmap=discrete_cmap)
    plt.colorbar(ticks=sorted(set(target)),
                 format=plt.FuncFormatter(_tick_label))
    #plt.clim(-200, 0)
    return iso, data_projected
def embedDistanceMatrix(dmatDf, method='kpca', n_components=2, **kwargs):
    """Embed a pairwise distance matrix into ``n_components`` dimensions.

    Supported ``method`` values: 'tsne', 'isomap', 'mds', 'pca', 'kpca',
    'lle', 'sklearn-tsne' and 'umap'.

    :param dmatDf: square distance matrix, either a ``pd.DataFrame`` or an
        array exposing ``shape``.
    :param method: name of the embedding algorithm (see above).
    :param n_components: number of output coordinates per sample.
    :param kwargs: 'tsne' and 'sklearn-tsne' require ``perplexity``; for
        'umap' every keyword argument is forwarded to ``umap.UMAP``.
    :returns: DataFrame with one row per sample and ``n_components``
        columns, indexed like ``dmatDf`` (default integer index when the
        input is not a DataFrame).  For 'kpca' the frame additionally
        carries an ``explained_variance_`` attribute.  ``None`` is returned
        (after printing a message) when ``method`` is unknown.
    :raises KeyError: if ``perplexity`` is missing for the t-SNE methods.
    """
    if isinstance(dmatDf, pd.DataFrame):
        dmat = dmatDf.values
        index = dmatDf.index
    else:
        dmat = dmatDf
        # Plain arrays have no index; let pandas build the default one.
        index = None

    if method == 'tsne':
        xy = tsne.run_tsne(dmat, no_dims=n_components,
                           perplexity=kwargs['perplexity'])
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=n_components)
        xy = isoObj.fit_transform(dmat)
    elif method == 'mds':
        mds = MDS(n_components=n_components, max_iter=3000, eps=1e-9,
                  random_state=15, dissimilarity="precomputed", n_jobs=1)
        xy = mds.fit(dmat).embedding_
        # Rotate the MDS solution onto its principal axes.
        rot = PCA(n_components=n_components)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=None)
        xy = pcaObj.fit_transform(dmat)[:, :n_components]
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=dmat.shape[0], kernel='precomputed',
                           eigen_solver='dense')
        try:
            gram = dist2kernel(dmat)
        except Exception:
            # Best-effort fallback, but do not swallow KeyboardInterrupt /
            # SystemExit the way a bare ``except:`` would.
            print('Could not convert dmat to kernel for KernelPCA; using 1 - dmat/dmat.max() instead')
            gram = 1 - dmat / dmat.max()
        xy = pcaObj.fit_transform(gram)[:, :n_components]
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30,
                                              n_components=n_components,
                                              method='standard')
        # Fit on the matrix prepared above (``dmat``), like every other branch.
        xy = lle.fit_transform(dmat)
    elif method == 'sklearn-tsne':
        tsneObj = TSNE(n_components=n_components, metric='precomputed',
                       random_state=0, perplexity=kwargs['perplexity'])
        xy = tsneObj.fit_transform(dmat)
    elif method == 'umap':
        umapObj = umap.UMAP(n_components=n_components, metric='precomputed',
                            **kwargs)
        xy = umapObj.fit_transform(dmat)
    else:
        print('Method unknown: %s' % method)
        return

    # Internal sanity check: one embedded row per input row.
    assert xy.shape[0] == dmat.shape[0]
    xyDf = pd.DataFrame(xy[:, :n_components], index=index,
                        columns=np.arange(n_components))
    if method == 'kpca':
        # Not sure how negative eigenvalues should be handled here, but they
        # are usually small so it shouldn't make a big difference.
        setattr(xyDf, 'explained_variance_',
                pcaObj.lambdas_[:n_components] / pcaObj.lambdas_[pcaObj.lambdas_ > 0].sum())
    return xyDf
def isomap(file_name, dimension, num_neighbors, label):
    """Embed the rows of a text table with Isomap and print one line per row.

    The table is loaded with ``np.loadtxt``.  Its first ``dimension``
    columns are converted by ``convert_angles_to_cos_sin`` and embedded into
    two Isomap components.  For every row a space-separated line is printed
    containing: the first ``dimension`` input columns, the two embedding
    coordinates, and one label value chosen by ``label``:

    * 'cluster'   -- column ``dimension`` of the input;
    * 'eq'        -- ``-0.0019872041 * 300 * log(|column dimension+1|)``;
    * 'committor' -- column ``dimension+2`` divided by ``|column dimension+1|``;
    * anything else leaves the label value at 0.

    :param file_name: path of the input table.
    :param dimension: number of leading columns used as coordinates.
    :param num_neighbors: ``n_neighbors`` passed to Isomap.
    :param label: one of the label modes listed above.
    """
    balls = np.loadtxt(file_name)
    features = convert_angles_to_cos_sin(balls[:, 0:dimension])

    imap = Isomap(n_neighbors=num_neighbors, n_components=2,
                  eigen_solver='auto', tol=0, max_iter=None,
                  path_method='auto', neighbors_algorithm='auto')
    embedded = imap.fit_transform(features)

    label_col = dimension + 2
    ball_coords = np.zeros((balls.shape[0], dimension + 3))
    for row in range(balls.shape[0]):
        source = balls[row]
        ball_coords[row, 0:dimension] = source[0:dimension]
        ball_coords[row, dimension:label_col] = embedded[row]

        if label == 'cluster':
            ball_coords[row, label_col] = source[dimension]
        elif label == 'eq':
            ball_coords[row, label_col] = -0.0019872041*300*np.log(abs(source[dimension+1]))
        elif label == 'committor':
            ball_coords[row, label_col] = source[dimension+2]/abs(source[dimension+1])

        print(' '.join(str(value) for value in ball_coords[row, :]))
def isomap(self, data):
    """Reduce ``data`` with Isomap, then cluster the embedding with batch k-means.

    ``self.parameters`` supplies the Isomap keyword arguments plus one extra
    key, ``'k_means_n_clusters'``, which is removed before the Isomap is
    built and forwarded to ``self.batch_kmeans`` instead.  The Isomap always
    uses ``neighbors_algorithm='kd_tree'``.

    :param data: samples passed to ``Isomap.fit_transform``.
    :returns: whatever ``self.batch_kmeans`` returns for the embedded data.
    :raises KeyError: if ``self.parameters`` lacks ``'n_neighbors'`` or
        ``'k_means_n_clusters'``.
    """
    print('Isomap neighbours : %s' % (self.parameters["n_neighbors"],))
    print('Isomap components, ie final number of coordinates : %s' % (self.k,))

    # Work on a copy so self.parameters keeps its k-means entry.
    isomap_params = dict(self.parameters)
    k_means_n_clusters = isomap_params.pop('k_means_n_clusters')

    m = Isomap(neighbors_algorithm='kd_tree', **isomap_params)
    new_data = m.fit_transform(data)
    # The reconstruction error and the geodesic/Euclidean correlation used
    # to be computed here but were never used; they required a full n x n
    # pairwise distance matrix, so they are no longer evaluated.

    print(self.parameters)
    # ``params`` is expected at module level; presumably it maps
    # "mini-batchk-means" to the ordered parameter names -- confirm.
    kmeans_values = [k_means_n_clusters, 1000, 500, 1000, 'k-means++', 5]
    return self.batch_kmeans(
        new_data,
        parameters=dict(zip(params["mini-batchk-means"], kmeans_values)))
def _bin_kernel_weights(tree, points, center, k, binSize, sigma):
    """Yield the ``dist`` kernel weight of every tree point inside the square bin at ``center``."""
    found_d, found_i = tree.query(center, k,
                                  distance_upper_bound=binSize/sqrt(2))
    # With k == 1 the query returns scalars; normalise to 1-D arrays.
    found_d = np.atleast_1d(found_d)
    found_i = np.atleast_1d(found_i)
    # Missing neighbours come back with an infinite distance, so this is
    # false when nothing lies within the search radius.
    if not min(found_d) < 100:
        return
    half = binSize / 2.0
    for idx in found_i:
        # idx == number of points marks a missing neighbour; the radius
        # search covers the bin's circumscribed circle, so re-check the
        # square itself.
        if idx < points.shape[0] and np.all(np.abs(points[idx] - center) < (half, half)):
            yield dist(center, points[idx], sigma)


def outputBin(data, ctrlSize, nbPheno, lPheno, binSize, sigma, nbDim=2, nbNeighbours=20,
              outputFolder='/media/lalil0u/New/workspace2/Tracking/images'):
    """Compute, per film, a binned kernel-weighted density relative to the controls.

    ``data`` is embedded with Isomap.  The first ``ctrlSize`` embedded rows
    are the controls; the following rows belong to the films, ``lPheno[i]``
    consecutive rows for film ``i``.  The bounding box of the first two
    embedding coordinates is cut into square bins of side ``binSize``.  For
    every bin the kernel weights (``dist(center, point, sigma)``) of the
    points falling in the bin are summed, and each film's sum is divided by
    (control sum + film sum) when that total is positive.

    Only the first two embedding coordinates are binned, and the KD-tree
    queries use 2-D centres, so ``nbDim`` other than 2 is presumably
    unsupported -- confirm.

    :param data: samples, controls first, then the films in order.
    :param ctrlSize: number of control rows.
    :param nbPheno: number of films.
    :param lPheno: number of rows of each film.
    :param binSize: side length of a bin in embedding units.
    :param sigma: third argument passed to ``dist``.
    :param nbDim: number of Isomap components.
    :param nbNeighbours: ``n_neighbors`` of the Isomap.
    :param outputFolder: first argument passed to ``plotMovies``.
    :returns: array of shape (nbPheno, nbPointsX, nbPointsY).
    """
    m = Isomap(n_neighbors=nbNeighbours, n_components=nbDim,
               eigen_solver='auto', tol=0, max_iter=None,
               path_method='auto', neighbors_algorithm='kd_tree')
    D = m.fit_transform(data)

    ctrl = D[:ctrlSize]
    ctrlTree = KDTree(ctrl, leafsize=10)

    mini = np.amin(D, 0)
    maxi = np.amax(D, 0)
    nbPointsX = int((maxi[0]-mini[0])/float(binSize))+1
    nbPointsY = int((maxi[1]-mini[1])/float(binSize))+1

    result = np.zeros(shape=(nbPheno, nbPointsX, nbPointsY))
    denomCtrl = np.zeros(shape=(nbPointsX, nbPointsY))
    cells = list(product(range(nbPointsX), range(nbPointsY)))

    def bin_center(pointX, pointY):
        return (mini[0]+(pointX+0.5)*binSize, mini[1]+(pointY+0.5)*binSize)

    # Control contribution of every bin, shared by all films.
    for pointX, pointY in cells:
        for weight in _bin_kernel_weights(ctrlTree, ctrl, bin_center(pointX, pointY),
                                          ctrlSize, binSize, sigma):
            denomCtrl[pointX, pointY] += weight

    length = ctrlSize
    for ifilm in range(nbPheno):
        print('film  %s' % (ifilm,))
        pheno = D[length:length+lPheno[ifilm]]
        phenoTree = KDTree(pheno, leafsize=10)

        for pointX, pointY in cells:
            denom = denomCtrl[pointX, pointY]
            for weight in _bin_kernel_weights(phenoTree, pheno, bin_center(pointX, pointY),
                                              data.shape[0]-ctrlSize, binSize, sigma):
                result[ifilm, pointX, pointY] += weight
                denom += weight
            # Normalise every bin, not just the last one visited.
            if denom > 0:
                result[ifilm, pointX, pointY] /= denom

        # Advance the row offset exactly once per film.
        length += lPheno[ifilm]

    plotMovies(outputFolder, result, 'pattern_b{}_s{}'.format(binSize, sigma))
    return result
def plot_2d(dataset):
    """Embed ``dataset`` in 2-D with Isomap and draw one scatter per class.

    ``dataset.data`` is densified with ``toarray()`` before fitting, so it
    is expected to be a sparse matrix.  Each class ``i`` (rows where
    ``dataset.target == i``) is drawn in its own rainbow colour with a
    legend entry from ``dataset.target_names``.  The figure is saved as
    'isomap2d' at 500 dpi; it is not shown and nothing is returned.

    :param dataset: object exposing ``data``, ``target`` and ``target_names``.
    """
    embedding = Isomap(n_components=2).fit_transform(dataset.data.toarray())
    n_samples, n_features = embedding.shape[0], embedding.shape[1]
    print('projected: sample: %s, feature: %s' % (n_samples, n_features))

    n_classes = len(dataset.target_names)
    palette = cm.rainbow(np.linspace(0, 1, n_classes), alpha=0.5)

    handles = []
    for class_id in range(n_classes):
        members = embedding[dataset.target == class_id, :]
        handles.append(
            plt.scatter(members[:, 0], members[:, 1],
                        color=palette[class_id], edgecolor='k', lw=0.6,
                        vmin=0, vmax=n_classes))

    plt.legend(handles, dataset.target_names,
               loc='lower left', scatterpoints=1)
    plt.clim(-0.5, 9.5)
    plt.savefig('isomap2d', dpi=500)
def embedDistanceMatrix(dist, method='tsne'):
    """Two-dimensional embedding of the sequence distances in ``dist``.

    :param dist: square distance matrix.  The 'pca' and 'kpca' branches
        embed ``1 - dist``; 'mds' treats ``dist`` as precomputed
        dissimilarities and rotates the result with PCA.
    :param method: one of 'tsne', 'isomap', 'mds', 'pca', 'kpca', 'lle'.
    :returns: the x, y coordinates, one row per item.
    :raises ValueError: if ``method`` is not one of the names above.
    """
    if method == 'tsne':
        xy = tsne.run_tsne(dist, no_dims=2)
        #xy=pytsne.run_tsne(adist,no_dims=2)
    elif method == 'isomap':
        isoObj = Isomap(n_neighbors=10, n_components=2)
        xy = isoObj.fit_transform(dist)
    elif method == 'mds':
        mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9,
                           random_state=15, dissimilarity="precomputed",
                           n_jobs=1)
        xy = mds.fit(dist).embedding_
        rot = PCA(n_components=2)
        xy = rot.fit_transform(xy)
    elif method == 'pca':
        pcaObj = PCA(n_components=2)
        xy = pcaObj.fit_transform(1-dist)
    elif method == 'kpca':
        pcaObj = KernelPCA(n_components=2, kernel='precomputed')
        xy = pcaObj.fit_transform(1-dist)
    elif method == 'lle':
        lle = manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=2,
                                              method='standard')
        xy = lle.fit_transform(dist)
    else:
        # Without this branch ``xy`` would be unbound at the return below.
        raise ValueError('Method unknown: %s' % method)
    return xy
num_samples_to_plot = 5000 X_train, y_train = shuffle(X_train, y_train) X_train, y_train = X_train[: num_samples_to_plot], y_train[: num_samples_to_plot] # lets subsample a bit for a first impression for digit in mytargets: instances = [i for i in y_train if i == digit] print "Digit", digit, "appears ", len(instances), "times" transformer = Isomap(n_neighbors=10, n_components=2) fig, plot = plt.subplots() fig.set_size_inches(50, 50) plt.prism() X_transformed = transformer.fit_transform(X_train) plot.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y_train) plot.set_xticks(()) plot.set_yticks(()) count = 0 plt.tight_layout() plt.suptitle("Isomap for MNIST digits ") for label, x, y in zip(y_train, X_transformed[:, 0], X_transformed[:, 1]): #Lets annotate every 1 out of 200 samples, otherwise graph will be cluttered with anotations if count % 200 == 0: plt.annotate(str(int(label)), xy=(x, y), color='black', weight='normal', size=10,
# Resample every spectrum onto the shared wavelength grid ``wavs`` (zero
# outside its own range), divide it by its maximum and store it as row i.
for i, s in enumerate(spectra):
    spec = s.spectrum
    wavsi = s.wavelength()
    intpol = spi.interp1d(wavsi, spec, bounds_error=False, fill_value=0.)
    spec = intpol(wavs)
    spec /= spec.max()
    data[i] = spec
#print data

iso = Isomap(k, n)
#iso.fit(data)
print("projecting and fitting: ")
proj = iso.fit_transform(data)
print("proj.shape")
print(proj.shape)

fig, axes = plt.subplots(2, 3)

# Debug output: first component, second component, full embedding.
print(proj[:, 0])
print(proj[:, 1])
print(proj)

# One panel per property, colouring the same embedding by that property.
for prop, nprop, ax in zip(properties, nproperites, axes.flatten()):
    ax.set_title(nprop)
    ax.scatter(proj[:, 0], proj[:, 1], c=prop)
def _first_visible_file(directory):
    """Return the first entry of ``directory`` whose name does not start with a dot."""
    for file_name in os.listdir(directory):
        if not file_name.startswith("."):
            return file_name
    raise IOError("no audio file found in " + directory)


def _stack_frames(frames, group=10):
    """Concatenate every ``group`` consecutive vectors; a trailing partial group is dropped."""
    stacked = []
    pending = []
    for vec in frames:
        pending.append(vec)
        if len(pending) == group:
            stacked.append(np.concatenate(tuple(pending), axis=0))
            pending = []
    return np.array(stacked)


def _random_coords(count):
    """Return ``count`` random integer 2-D points in [-50, 50] (placeholder projection)."""
    return np.array([np.array([random.randint(-50, 50), random.randint(-50, 50)])
                     for i in range(count)])


def _write_dict_rows(path, rows):
    """Write a non-empty list of dicts to ``path`` as CSV with a header row."""
    keys = rows[0].keys()
    with open(path, 'w') as output_file:
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(rows)


def main(session_key, config_file, segment_size, step_size):
    """Extract features from the uploaded audio and write 2-D projections to CSV.

    Steps: locate the upload in ``static/uploads/<session_key>/`` (mp3 is
    converted to wav), split recordings longer than one hour into one-hour
    chunks with ``sox``, extract per-frame features (scipy spectrogram +
    truncated SVD when ``config_file == "spectrogram"``, openSMILE
    otherwise), concatenate ten consecutive frames per row, project the rows
    with t-SNE, PCA, SOM, UMAP and Isomap, cluster them with k-means for
    several cluster counts, and write ``data.csv`` and ``metadata.csv`` to
    ``static/data/<session_key>/``.

    Frames are extracted at ``segment_size/10000`` and ten frames are
    stacked per row; presumably ``segment_size`` and ``step_size`` are in
    milliseconds -- confirm against the caller.

    :param session_key: name of the per-session upload/output sub-directory.
    :param config_file: "spectrogram", or an openSMILE config file name
        relative to ``config_dir``.
    :param segment_size: analysis segment length (see note above).
    :param step_size: step between rows; ``start`` of row i is ``int(i*step_size)``.
    :raises IOError: if the upload directory holds no visible file.
    """
    # Get audio file name and full path.
    audio_dir = "static/uploads/" + session_key + "/"
    audio_name = _first_visible_file(audio_dir)
    audio_path = audio_dir + audio_name

    # If mp3, convert to wav.
    if audio_path.endswith("mp3"):
        wav_audio = AudioSegment.from_mp3(audio_path)
        audio_path = audio_path[:-3] + "wav"  # set new audio_path
        wav_audio.export(audio_path, format="wav")

    # Get metadata.
    loaded_sound = AudioSegment.from_wav(audio_path)
    audio_duration = len(loaded_sound)
    frame_rate = loaded_sound.frame_rate

    # If duration is longer than 1 hour, segment into chunks.  The loop
    # counter is in seconds and is compared against the duration times 1000.
    if audio_duration > 3600000:
        chunks = []
        chunk_start_time = 0
        while chunk_start_time * 1000 < audio_duration:
            chunk_path = audio_dir + str(int((chunk_start_time / 3600)+1)) + ".wav"
            subprocess.call(["sox", audio_path, chunk_path,
                             "trim", str(chunk_start_time), "3600"])
            chunks.append(chunk_path)
            chunk_start_time += 3600
    else:
        chunks = [audio_path]

    # Create dir for output and set filenames.  Done in-process so missing
    # parents are created and an existing directory is not an error.
    output_dir = "static/data/" + session_key + "/"
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    output_path = output_dir + audio_name.split(".")[0] + ".mfcc.htk"

    if config_file == "spectrogram":
        waveform = wavfile.read(audio_path)[1]
        print(frame_rate)
        print(segment_size)
        print(int(frame_rate*segment_size))
        f, t, Sxx = signal.spectrogram(waveform, fs=frame_rate,
                                       nperseg=int(frame_rate*(segment_size/10000)),
                                       noverlap=0)
        Sxx_transpose = Sxx.transpose()
        print("scipy shape: ", Sxx_transpose.shape)
        # Reduce dimensionality to 39 with svd.
        svd = TruncatedSVD(n_components=39)
        result = svd.fit_transform(Sxx_transpose)
        print("scipy shape2: ", result.shape)
    else:
        # Prepend path to config file.
        config_file = config_dir + config_file
        # Update config file with segment and step length (one tenth of a
        # segment per frame; ten frames are stacked below).
        update_config(config_file, str(segment_size/10000), str(step_size/10000))
        # Run opensmile to output features in output dir.
        subprocess.call([smilextract, "-C", config_file, "-I", audio_path, "-O", output_path])
        # Read file, and return formatted data.
        htk_reader = HTKFile()
        htk_reader.load(output_path)
        result = np.array(htk_reader.data)

    # Concatenate ten vectors at a time, resulting in 39*10 dimensionality
    # per snippet.
    result = _stack_frames(result, 10)

    # Run data through t-SNE.
    tsne = TSNE(n_components=2, perplexity=25)#, random_state=None)
    Y1 = convert_range(tsne.fit_transform(result))
    print("t-SNE done")

    # Run data through PCA.
    pca = PCA(n_components=2)
    Y2 = convert_range(pca.fit_transform(result))
    print("PCA done")

    # Run data through SOM (square grid, side capped at 100).
    run_som = True
    if run_som:
        print("SOM-grid-size: ", int(len(result)**0.5))
        mapsize = [int(len(result)**0.5), int(len(result)**0.5)]
        if mapsize[0] > 100:
            mapsize = [100, 100]
        som = sompy.SOMFactory.build(result, mapsize, mask=None, mapshape='planar',
                                     lattice='rect', normalization='var',
                                     initialization='pca', neighborhood='gaussian',
                                     training='batch', name='sompy')
        som.train(n_job=1, verbose='info')  # verbose='debug' prints more, None nothing
        # Convert each best-matching-unit index into grid (row, col).
        som_output = np.array([np.array(np.unravel_index(int(bmu), (mapsize[0], mapsize[0])))
                               for bmu in som._bmu[0]])
        Y3 = convert_range(som_output)
        print("SOM done")
    else:
        Y3 = convert_range(_random_coords(len(Y2)))

    # Run data through UMAP.
    run_umap = True
    if run_umap:
        Y4 = convert_range(umap.UMAP().fit_transform(result))
        print("UMAP done")
    else:
        Y4 = convert_range(_random_coords(len(Y2)))

    # Run data through isomap.  An autoencoder projection was tried here
    # before and gave bad results; the "aeX"/"aeY" output columns now carry
    # these Isomap coordinates.
    IM = Isomap(n_components=2)
    Y5 = convert_range(IM.fit_transform(result))
    print("Isomap done")

    # K-means on raw features, once per cluster count.
    cluster_counts = (2, 3, 4, 5, 6, 7, 8, 20)
    kmeans_labels = []
    for n_clusters in cluster_counts:
        kmeans_labels.append(KMeans(n_clusters=n_clusters, random_state=0).fit(result).labels_)
        print("kmeans%d done" % n_clusters)

    # Format output to the dictionary layout expected downstream.  Key
    # insertion order defines the CSV column order.
    data = []
    for i, (coord1, coord2, coord3, coord4, coord5, labels) in enumerate(
            zip(Y1, Y2, Y3, Y4, Y5, zip(*kmeans_labels))):
        row = {"id": i,
               "tsneX": float(coord1[0]), "tsneY": float(coord1[1]),
               "pcaX": float(coord2[0]), "pcaY": float(coord2[1]),
               "somX": float(coord3[0]), "somY": float(coord3[1]),
               "umapX": float(coord4[0]), "umapY": float(coord4[1]),
               "aeX": float(coord5[0]), "aeY": float(coord5[1]),
               "start": int(i*step_size), "active": 1, "color": "black"}
        for n_clusters, cluster_index in zip(cluster_counts, labels):
            row["kcolor%d" % n_clusters] = color_dict[str(cluster_index)]
        data.append(row)

    # Save data as csv to be able to load later.
    _write_dict_rows(output_dir + "data.csv", data)

    # Save metadata as csv to be able to load later.
    metadata = [{"audio_duration": audio_duration, "audio_path": audio_path,
                 "segment_size": segment_size, "step_size": step_size,
                 "chunks": ",".join(chunks)}]
    _write_dict_rows(output_dir + "metadata.csv", metadata)
# copy from https://blog.csdn.net/qq_42797457/article/details/100675654
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.manifold import Isomap

# Iris features and class labels.
iris = datasets.load_iris()
X, y = iris.data, iris.target

# One panel per neighbourhood size, to compare the resulting embeddings.
neighbor_counts = [2, 20, 100]
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
for idx, neighbor in enumerate(neighbor_counts):
    panel = ax[idx]
    isomap = Isomap(n_components=2, n_neighbors=neighbor)
    new_X_isomap = isomap.fit_transform(X)
    panel.scatter(new_X_isomap[:, 0], new_X_isomap[:, 1], c=y)
    panel.set_title("Isomap (n_neighbors=%d)" % neighbor)
plt.show()

# use `isomap.transform(X)` to calc new samples
class CardiotocographyMainFrame(Tk.Frame):
    """Tk frame showing cardiotocography training data and an interactive classifier.

    From top to bottom the frame holds: an "optimise" button with a status
    label, a scatter plot of the training set embedded in 2-D by Isomap, a
    row of three probability read-outs, and one slider per feature.  Moving
    a slider re-runs the evaluator on the slider values, updates the three
    probabilities and marks the corresponding point on the scatter plot.
    """

    def __init__(self, master, x_train, y_train, x_test, y_test, evaluator, console):
        """Train ``evaluator`` on the given data and build all widgets.

        :param master: parent Tk widget.
        :param x_train: training features; indexed as a 2-D array.
        :param y_train: training labels.
        :param x_test: test features (stored and handed to the evaluator).
        :param y_test: test labels (stored and handed to the evaluator).
        :param evaluator: object providing ``load_data``, ``train``,
            ``reduce``, ``predict`` and a ``pipeline`` attribute.
        :param console: object whose ``output(tag, text)`` receives log text.
        """
        Tk.Frame.__init__(self, master)
        self.evaluator = evaluator
        self.x_train, self.y_train = x_train, y_train
        self.x_test, self.y_test = x_test, y_test
        self.new_estimator = None
        self.console = console

        evaluator.load_data(x_train, y_train, x_test, y_test)
        evaluator.train()
        self.x_train_r = evaluator.reduce(x_train)  # dimensionality-reduced features

        # Widget construction order determines the pack order on screen.
        self._build_optimize_controls()
        self._build_scatter_plot(x_train, y_train)
        self._report_initial_model(x_train, y_train)
        self._build_probability_row()
        self._build_sliders(x_train)

    def _build_optimize_controls(self):
        """0. Optimise button and the status label underneath it."""
        self.button_opt = Tk.Button(self, text="优化", command=self.optimize_parameter)
        self.button_opt.pack(side=Tk.TOP, anchor=Tk.E)
        self.label_tips = Tk.Label(self)
        self.label_tips.pack(side=Tk.TOP, anchor=Tk.E)

    def _build_scatter_plot(self, x_train, y_train):
        """1. Scatter plot of the Isomap-embedded training set."""
        holder = Tk.Frame(self)
        holder.pack(fill=Tk.BOTH, expand=1, padx=15, pady=15)

        self.figure_train = Figure(figsize=(5, 4), dpi=100)
        self.subplot_train = self.figure_train.add_subplot(111)
        self.subplot_train.set_title('Cardiotocography High-Dimension Data Visualization (21-dim)')
        # Must be called after add_subplot, otherwise it crashes.
        self.figure_train.tight_layout()
        self.last_line = None

        # The attribute is called ``tsne`` but holds an Isomap estimator.
        self.tsne = Isomap(n_components=2, n_neighbors=10)
        np.set_printoptions(suppress=True)
        embedded = self.tsne.fit_transform(x_train)
        self.subplot_train.scatter(embedded[:, 0], embedded[:, 1], c=y_train,
                                   cmap=plt.cm.get_cmap("Paired"))
        self.attach_figure(self.figure_train, holder)

    def _report_initial_model(self, x_train, y_train):
        """Log the initial classifier and its accuracy on the training set."""
        pipeline = self.evaluator.pipeline
        accuracy = accuracy_score(y_true=y_train, y_pred=pipeline.predict(x_train))
        self.console.output("[CTG] INIT MODEL: ", str(pipeline.named_steps['clf']) + "\n")
        self.console.output("[CTG] INIT ACCURACY: ", str(accuracy) + "\n")

    def _build_probability_row(self):
        """2. Three labelled entry boxes showing the class probabilities."""
        row = Tk.Frame(self)
        row.pack(fill=Tk.BOTH, expand=1, padx=5, pady=5)
        Tk.Label(row, text="prob").pack(side=Tk.LEFT)
        for caption, attribute in (("1.", "strvar_prob1"),
                                   ("2.", "strvar_prob2"),
                                   ("3.", "strvar_prob3")):
            variable = Tk.StringVar()
            setattr(self, attribute, variable)
            Tk.Label(row, text=caption).pack(side=Tk.LEFT)
            Tk.Entry(row, textvariable=variable, bd=5).pack(side=Tk.LEFT, padx=5, pady=5)

    def _build_sliders(self, x_train):
        """3. Scrollable canvas with one slider per feature column."""
        holder = Tk.Frame(self)
        holder.pack(fill=Tk.BOTH, expand=1, padx=5, pady=5)

        canv = Tk.Canvas(holder, relief=Tk.SUNKEN)
        vbar = Tk.Scrollbar(holder, command=canv.yview)
        canv.config(scrollregion=(0, 0, 300, 1500))
        canv.config(yscrollcommand=vbar.set)
        vbar.pack(side=Tk.RIGHT, fill=Tk.Y)
        canv.pack(side=Tk.LEFT, expand=Tk.YES, fill=Tk.BOTH)

        feature_num = x_train.shape[1]
        self.slides = [None] * feature_num  # one slider per feature
        for i in range(feature_num):
            row_y = (i + 1) * 40
            canv.create_window(60, row_y, window=Tk.Label(canv, text=str(i + 1) + ". "))
            # Slider range spans the observed values of the feature, in
            # steps of one hundredth of that range.
            column = x_train[:, i]
            min_x = np.min(column)
            max_x = np.max(column)
            slider = Tk.Scale(canv, from_=min_x, to=max_x,
                              resolution=(max_x - min_x) / 100.0,
                              orient=Tk.HORIZONTAL, command=self.predict)
            self.slides[i] = slider
            canv.create_window(200, row_y, window=slider)

    def predict(self, trivial):
        """Compute the class probabilities for the current slider values.

        :param trivial: value supplied by the Tk ``Scale`` callback; unused.
        """
        feature_num = self.x_train.shape[1]
        x = np.array([[float(self.slides[i].get()) for i in range(feature_num)]],
                     dtype='f').reshape((1, feature_num))
        result = self.evaluator.predict(x)
        # Columns, per the original notes: 0 = no disease, 1 = suspect,
        # 2 = confirmed.
        for column, variable in enumerate((self.strvar_prob1,
                                           self.strvar_prob2,
                                           self.strvar_prob3)):
            variable.set("%.2f%%" % (result[0, column] * 100))
        self.plot_point(self.subplot_train, self.tsne.transform(x))
        self.figure_train.canvas.draw()

    def plot_point(self, subplot, x):
        """Redraw the marker of the current case, replacing the previous one."""
        if self.last_line is not None:
            self.last_line.remove()
            del self.last_line
        self.last_line = subplot.plot(x[:, 0], x[:, 1], "ro", label="case")[0]
        subplot.legend(loc='lower right')

    @staticmethod
    def attach_figure(figure, frame):
        """Embed ``figure`` and its navigation toolbar into ``frame``."""
        canvas = FigureCanvasTkAgg(figure, master=frame)  # embed the plot in the UI
        canvas.show()
        canvas.get_tk_widget().pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)
        toolbar = NavigationToolbar2TkAgg(canvas, frame)  # embed the plot toolbar
        toolbar.update()
        canvas.tkcanvas.pack(side=Tk.TOP, fill=Tk.BOTH, expand=1)

    def optimize_parameter(self):
        """Search the candidate models for one that beats the current pipeline.

        The current pipeline and every candidate from
        ``RandomParameterSettings.possible_models`` are scored by 10-fold
        cross-validated accuracy on the training set.  The best candidate is
        kept in ``self.new_estimator``; if it beats the current score the
        button is re-labelled and re-wired to ``apply_new_estimator``.
        """
        self.console.output("[CTG] OPTIMIZATION START...", "\n")

        # Cross-validation accuracy of the old (initial) model.
        old_score = np.mean(cross_validation.cross_val_score(
            estimator=self.evaluator.pipeline, X=self.x_train, y=self.y_train,
            scoring='accuracy', cv=10, n_jobs=-1))

        # Best cross-validation accuracy among the new candidate models.
        new_score = -1.0
        self.new_estimator = None
        for clf, param_grid in RandomParameterSettings.possible_models:
            self.console.output("[CTG] SEARCH MODEL:", str(clf) + "\n")
            candidate = Pipeline([('scl', StandardScaler()), ('pca', PCA()), ('clf', clf)])
            search = RandomizedSearchCV(estimator=candidate,
                                        param_distributions=param_grid,
                                        scoring='accuracy', cv=10, n_jobs=-1)
            search = search.fit(self.x_train, self.y_train)
            if new_score < search.best_score_:
                new_score = search.best_score_
                self.new_estimator = search.best_estimator_

        improvement = 100.0 * (new_score - old_score) / old_score
        if new_score > old_score:
            self.label_tips.config(
                text='Found a new model with improvement: %.2f%%' % improvement)
            self.button_opt.config(text='应用', command=self.apply_new_estimator)
        else:
            self.label_tips.config(text="No better model founded.")

        self.console.output("[CTG] OPTIMIZATION COMPLETE !", "\n")
        self.console.output(
            "[CTG] RESULT: ",
            "old_model_accuracy=%f, new_model_accuracy=%f, improvement=%.2f%%\n" % (
                old_score, new_score, improvement) + "\n")

    def apply_new_estimator(self):
        """Replace the evaluator's pipeline with the estimator found by the search."""
        self.console.output("[CTG] APPLY NEW MODEL:",
                            "old_model=%s \n new_model=%s\n" % (self.evaluator.pipeline,
                                                                self.new_estimator))
        self.evaluator.pipeline = self.new_estimator
        self.label_tips.config(text="New model has been applied.")
cells = opts.high / opts.step isomap_gmm_results = np.zeros((cells,opts.iters)) D = scale(X) n_samples, n_features = D.shape # chosen by hyperparam search in a separate test. n_neighbors = 10 # For the specified number of principal components, do the clustering dimension_list = range(opts.low, opts.high + 1, opts.step) data_files = [] for i in dimension_list: index = (i / opts.step) - 1 isomap = Isomap(n_neighbors, n_components=i) X_iso = isomap.fit_transform(D) for j in range(0,opts.iters,1): gaussmix = GMM(n_components=true_k, covariance_type='tied', n_init=10, n_iter=1000) gaussmix.fit(X_iso) gaussmix_labels = gaussmix.predict(X_iso) homog = metrics.homogeneity_score(labels[:,0], gaussmix_labels) print "Homogeneity: %0.3f" % homog test_result = {"Model": 'Isomap', "Dimension": i, "Homogeneity": homog, "Trial": j} index = pd.Index([0], name='rows') data_files.append(pd.DataFrame(data=test_result,index=index)) print "...Done" print "...rbinding DataFrames" master_df = data_files[0] for i in xrange(1,len(data_files)):
def _scatter_3d(points, labels):
    """Show the first three columns of ``points`` as a 3-D scatter coloured by ``labels``."""
    figure = plt.figure()
    axes = figure.add_subplot(111, projection="3d")
    plt.scatter(points[:, 0], points[:, 1], zs=points[:, 2], c=labels, s=100)
    plt.show()
    return figure, axes


plt.show()

# Embedding computed earlier in the script.
fig, ax = _scatter_3d(fdata, digits["target"])

# ISOMAP
from sklearn.manifold import Isomap

iso = Isomap(n_components=3, n_neighbors=15)
fdata = iso.fit_transform(digits["data"])
fig, ax = _scatter_3d(fdata, digits["target"])

# LLE
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(n_neighbors=15, n_components=3, method="modified")
fig = plt.figure()
fdata = lle.fit_transform(digits["data"])
from __future__ import division import sys from sklearn.manifold import Isomap from sklearn.decomposition import PCA from sklearn import preprocessing import numpy as np import matplotlib.pyplot as plt import matplotlib.cm as cm from mpl_toolkits.mplot3d import Axes3D import random from colorsys import hsv_to_rgb data = np.genfromtxt('data012.txt', delimiter=',') isomap = Isomap() data_xformed = isomap.fit_transform(data) # pca = PCA(n_components=2) # data_xformed = pca.fit_transform(data) print data.shape print data_xformed.shape c = [(1,0,0)]*1000+[(0,1,0)]*1000+[(1,1,0)]*1000 plt.figure() plt.scatter(data_xformed[:,0], data_xformed[:,1], c=c) plt.show() quit() train_data = np.genfromtxt('training.txt', delimiter=',') isomap = Isomap(n_components=4) train_xformed = isomap.fit_transform(train_data) test_data = np.genfromtxt('testing.txt', delimiter=',') test_xformed = isomap.transform(test_data)
#03-03.py
from sklearn.manifold import Isomap

# Take 1000 unshuffled, un-normalised samples and embed them in 3-D.
X, y = preprocess(data, shuffle=False, n_samples=1000, normalization=None)
iso = Isomap(n_neighbors=15, n_components=3)
X_proj = iso.fit_transform(X)

# Plot the three embedding components against the labels.
first, second, third = X_proj[:, 0], X_proj[:, 1], X_proj[:, 2]
three_component_plot(first, second, third, y, labels, trim_outliers=True)
X_pca = pca.fit_transform(T) ''' # No, the accuracy levels off at the same value as before from 7 components onwards. # If you are not, then forget about PCA entirely, unless you want to visualize your data. However if you are able to get a higher score, # then be *sure* keep that figure in mind, and comment out all the PCA code. # In the same spot, run Isomap on the data, before sending it to the train / test split. Manually experiment with every inclusive # combination of n_neighbors between 2 and 5, and n_components between 4 and 6. Are you able to get a better accuracy? from sklearn.manifold import Isomap # You're going to have to write nested for loops that wrap around everything from here on down! best_score = 0 for k in range(2, 6): for l in range(4, 7): iso = Isomap(n_neighbors = k, n_components = l) X_iso = iso.fit_transform(T) # Perform a train/test split. 30% test group size, with a random_state equal to 7. from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X_iso, y, test_size = 0.3, random_state = 7) # Create a SVC classifier. Don't specify any parameters, just leave everything as default. # Fit it against your training data and then score your testing data. from sklearn.svm import SVC # Lines below are for the first lab question: ''' model = SVC() model.fit(X_train, y_train) score = model.score(X_test, y_test) print score '''
from sklearn.manifold import Isomap
from matplotlib import pyplot as pl

X = np.load("NormalizedHRVdata.npy")
print(X)

k = 3
print("%s %s %s" % ("-" * 10, "k=%d" % (k), "-" * 10))

# K-means on the raw data.
km = KMeans(k)
km.fit(X)
print("Labels:")

# K-means on a 2-component PCA projection.
reduced_data = PCA(n_components=2).fit_transform(X)
kmRed = KMeans(k)
kmRed.fit(reduced_data)

# K-means on a default Isomap embedding.
imap = Isomap()
isomap_data = imap.fit_transform(X)
kmIso = KMeans(k)
kmIso.fit(isomap_data)

print(km.labels_)
print(kmRed.labels_)
print(kmIso.labels_)

# Every labelling is scored against the original feature space X.
print("Silhouette Score")
print(metrics.silhouette_score(X, km.labels_, metric="euclidean"))
print(metrics.silhouette_score(X, kmRed.labels_, metric="euclidean"))
print(metrics.silhouette_score(X, kmIso.labels_, metric="euclidean"))

pl.subplot(1, 2, 1)
# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02
# point in the mesh [x_min, m_max]x[y_min, y_max].
import pylab
import numpy

# [0: 'CLASSICAL', 1: 'METAL', 2: 'HIPHOP', 3: 'DANCE', 4: 'JAZZ']
# [5:'FOLK', 6: 'SOUL', 7: 'ROCK', 8: 'POP', 9: 'BLUES']
# Column names: 'genre', 'year', then 'col1' ... 'col72'.
col_input = ['genre', 'year'] + ['col%d' % number for number in range(1, 73)]
df_input = pandas.read_csv('pandas_output_missing_data_fixed.csv',
                           header=None, delimiter=",", names=col_input)

# Positions 2..73 are the feature columns and position 0 is the genre.
# The columns carry string names, so they are selected by position with
# ``iloc``; selecting them with a list of integers is a label lookup and
# raises KeyError on current pandas.
df_input_data = df_input.iloc[:, 2:74]
df_input_target = df_input.iloc[:, 0:1]
colors = numpy.random.rand(len(df_input_target))

# Manifold embedding
from sklearn.manifold import Isomap

# n_neighbors = num of classes = 10 genres , components optimal n=6
iso = Isomap(n_neighbors=10, n_components=2)
proj1 = iso.fit_transform(df_input_data)

# Fitted internals: the distance matrix and the kernel PCA object.
print(iso.dist_matrix_)
print(iso.kernel_pca_)

# Plotting
mpyplot.figure(1)
p1 = mpyplot.scatter(proj1[:, 0], proj1[:, 1], c=colors)
mpyplot.colorbar(p1)
mpyplot.show(p1)
def apply_ISOMap(proj_data, proj_weights=None):
    """Return the 2-D Isomap embedding (4 neighbours) of the normalised, transposed data.

    :param proj_data: input handed to ``normalize_columns``; the transpose
        of the normalised result is what gets embedded.
    :param proj_weights: accepted but not used by this function.
    :returns: the array returned by ``Isomap.fit_transform``.
    """
    normalized = normalize_columns(proj_data)
    embedder = Isomap(n_neighbors=4, n_components=2)
    return embedder.fit_transform(normalized.T)
from sklearn.datasets import make_s_curve
X, y = make_s_curve(n_samples=1000)

from mpl_toolkits.mplot3d import Axes3D
ax = plt.axes(projection='3d')
ax.scatter3D(X[:, 0], X[:, 1], X[:, 2], c=y)
ax.view_init(10, -60)

# this is a 2D dataset embedded in 3D, but it is embedded in such a way that
# PCA can't discover the underlying data orientation.
from sklearn import decomposition
X_pca = decomposition.PCA(n_components=2).fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)

# Manifold learning algorithms, however, available in the sklearn.manifold
# submodule, are able to recover the underlying 2-dimensional manifold:
from sklearn.manifold import Isomap
iso = Isomap(n_neighbors=15, n_components=2)
X_iso = iso.fit_transform(X)
plt.scatter(X_iso[:, 0], X_iso[:, 1], c=y)

"""
Exercise: Compare the results of Isomap and PCA on a 5-class subset of the
digits dataset (load_digits(5))
Bonus: Also compare to TSNE, another popular manifold learning technique.
"""
from sklearn.datasets import load_digits
# The class count is passed by keyword: current scikit-learn releases do
# not accept it positionally.
digits = load_digits(n_class=5)
X = digits.data
isomap = Isomap(n_neighbors=15, n_components=2)
X_trans = isomap.fit_transform(X)
print(X_trans.shape)
plt.scatter(X_trans[:, 0], X_trans[:, 1], c=digits.target)

# Another method
from sklearn.manifold import TSNE
#Traintestsplit---------------------------
# sklearn.cross_validation no longer exists in current scikit-learn;
# model_selection provides the same train_test_split. Fall back to the
# old module so very old installations keep working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix

X_train, X_test, Y_train, Y_test = train_test_split(X, y)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
#print(confusion_matrix(Y_test,Y_pred))

#Training on the digits-------------------------------------------------
from sklearn.manifold import Isomap
from sklearn.datasets import load_digits

digits = load_digits()
iso = Isomap(n_components=2)
data_projected = iso.fit_transform(digits.data)
#print(data_projected.shape)

#plot the data transformed from 64 dim to 2 dim.
'''plt.scatter(data_projected[:,0],data_projected[:,1],c=digits.target,
edgecolors="k", lw=.1, alpha=.5,s=10, cmap=plt.cm.get_cmap("nipy_spectral",10))
plt.colorbar(label="digit label", ticks=range(10))
plt.clim(-.5,9.5)'''
#plt.show()

#Classification of the digits------------------------------
Xtrain, Xtest, Ytrain, Ytest = train_test_split(digits.data, digits.target,
                                                random_state=2)
#print(Xtrain.shape, Xtest.shape)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap

resourceFolder = '../res/'
dataframe = pd.read_csv(resourceFolder + 'EnergyMix.csv')
# Keep only the five energy-source columns for the embedding.
df = dataframe.loc[:, ['Oil', 'Gas', 'Coal', 'Nuclear', 'Hydro']]
# print(x) with one argument is valid on both Python 2 and Python 3.
print(df)

imap = Isomap()
df_reduced = imap.fit_transform(df)
print(df_reduced)

# Scatter the first two embedding coordinates and label each point
# with the matching entry of the "Country" column.
plt.plot(df_reduced[:, 0], df_reduced[:, 1], '.')
for index, country in enumerate(dataframe["Country"]):
    plt.text(df_reduced[index, 0], df_reduced[index, 1], country)
plt.savefig('../doc/EnergyMix_Reduced.png')
plt.show()
if __name__ == '__main__':
    # Load the samples and labels, then run every reducer in turn and hand
    # each 2-D result to visual() together with its display name.
    data, label = load_data('./data.txt')
    # print(data,label)

    # (plot title, factory building the reducer, whether fit needs labels)
    reducers = (
        ("PCA", lambda: PCA(n_components=2), False),
        ("LDA", lambda: LinearDiscriminantAnalysis(), True),
        ("KPCA", lambda: KernelPCA(n_components=2, kernel='rbf'), False),
        ("Isomap", lambda: Isomap(n_components=2), False),
        ("LLE", lambda: LocallyLinearEmbedding(n_components=2), False),
        ("Laplacian Eigenmaps", lambda: SpectralEmbedding(n_components=2), False),
    )
    for title, build, supervised in reducers:
        reducer = build()
        if supervised:
            embedded = reducer.fit_transform(data, label)
        else:
            embedded = reducer.fit_transform(data)
        visual(embedded, label, title)
# Label the six PCA components.
transformed_pca = pd.DataFrame(transformed_pca)
transformed_pca.columns = tuple('PCA %d' % k for k in range(1, 7))

## TSNE feature extraction
# NOTE(review): this rebinds the name TSNE from the class to an instance,
# so the class is no longer reachable under that name afterwards.
TSNE = TSNE(method='exact', n_components=6)
transformed_TSNE = pd.DataFrame(TSNE.fit_transform(data))
transformed_TSNE.columns = tuple('TSNE %d' % k for k in range(1, 7))

## ISOMAP feature extraction
isomap = Isomap(n_neighbors=6, n_components=6)
transformed_isomap = pd.DataFrame(isomap.fit_transform(data))
transformed_isomap.columns = tuple('ISOMAP %d' % k for k in range(1, 7))

## Combining all features into one data frame (column-wise)
features = pd.concat(
    [transformed_pca, transformed_TSNE, transformed_isomap],
    axis=1,
)

### STANDARDIZING THE FEATURES
st = StandardScaler()
features = st.fit(features).transform(features)
# NOTE(review): this first statement is the tail of a loop whose header lies
# outside this excerpt; re-indent it under that loop when merging.
colors.append("r")

#
# TODO: Convert the list to a dataframe
#
# .. your code here ..
df = pd.DataFrame(samples)

#
# TODO: Implement Isomap here. Reduce the dataframe df down
# to three components, using K=6 for your neighborhood size
#
# .. your code here ..
transformedIsomap = Isomap(n_neighbors=6, n_components=3)
transformedIsomap = transformedIsomap.fit_transform(df)

#
# TODO: Create a 2D Scatter plot to graph your manifold. You
# can use either 'o' or '.' as your marker. Graph the first two
# isomap components
#
# .. your code here ..
figure = plt.figure()
# Use the integer position spec: the string form "211" is not accepted by
# current Matplotlib, while the integer works on every version.
figure.add_subplot(211).scatter(transformedIsomap[:, 0],
                                transformedIsomap[:, 1], c=colors)

#
# TODO: Create a 3D Scatter plot to graph your manifold. You
def get_iso_net(net, neighbours, comps):
    """Fit an Isomap on `net` and return the embedded coordinates.

    `neighbours` is forwarded as `n_neighbors` and `comps` as `n_components`.
    """
    return Isomap(n_neighbors=neighbours, n_components=comps).fit_transform(net)
def plot2d(X, y, scale=True, normalize=False, embedding='pca', title=''):
    """
    Plot data transformed into two dimensions by the chosen embedding.

    The embedding is unsupervised: `y` is only used to colour the points.

    :param X: data to embed (optionally standardised and/or L2-normalised first)
    :param y: values used as the scatter colour
    :param scale: apply StandardScaler before embedding
    :param normalize: apply an L2 Normalizer before embedding
    :param embedding: one of 'pca', 'isomap', 'lle', 'tsne', 'spectral',
        'mds', or 'gallery' (a 2x3 grid showing all six methods)
    :param title: text placed in the plot title
    :returns: the pyplot module, with the figure drawn on it
    :raises ValueError: if `embedding` is not one of the names above
    """
    # Factories are lazy so only the requested estimator is ever built.
    reducers = {
        'pca': lambda: PCA(n_components=2),
        'isomap': lambda: Isomap(n_components=2, n_neighbors=20),
        'lle': lambda: LocallyLinearEmbedding(n_components=2, n_neighbors=5),
        'tsne': lambda: TSNE(n_components=2),
        'spectral': lambda: SpectralEmbedding(n_components=2),
        'mds': lambda: MDS(n_components=2),
    }
    # Compare by value (==/in), never by identity: `is` on strings only
    # works by accident of interning.
    if embedding != 'gallery' and embedding not in reducers:
        raise ValueError("Choose between pca, isomap, lle, tsne, spectral, "
                         "mds and gallery")

    if scale:
        X = StandardScaler().fit_transform(X)
    if normalize:
        X = Normalizer(norm='l2').fit_transform(X)

    if embedding == 'gallery':
        # The gallery uses its own estimator settings (e.g. LLE with 20
        # neighbours) and returns without a colour bar.
        gallery = (
            ('pca', lambda: PCA(n_components=2)),
            ('isomap', lambda: Isomap(n_neighbors=20)),
            ('lle', lambda: LocallyLinearEmbedding(n_neighbors=20)),
            ('tsne', lambda: TSNE()),
            ('spectral', lambda: SpectralEmbedding()),
            ('mds', lambda: MDS()),
        )
        plt.figure(1)
        for position, (name, build) in enumerate(gallery, start=231):
            plt.subplot(position)
            plt.title(name)
            X_t = build().fit_transform(X)
            plt.scatter(X_t[:, 0], X_t[:, 1], c=y)
        plt.suptitle('Gallery transforms ' + title)
        return plt

    X_transformed = reducers[embedding]().fit_transform(X)
    plt.title(title + ' ' + embedding + ' plot')
    sc = plt.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y)
    plt.colorbar(sc)
    return plt
def plot2D_classification(self, query=None, colors=None, markers=['*', 'v', 'o', '+', '-', '.', ',']):
    """Scatter-plot the stored data set in two dimensions, one style per class.

    With exactly two features the data is plotted directly, together with
    rectangles built from the per-class mean bounds and one ellipse per class.
    With more than two features the data is first projected to two
    components with Isomap. A single feature is not supported.

    :param query: optional instance; when given it is evaluated and drawn too
    :param colors: optional mapping class -> colour; built from the "hsv"
        colormap when None
    :param markers: sequence of marker codes, assigned to classes by index
    :raises Exception: when the data has a single feature
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source — confirm against the original file.
    # NOTE(review): markers[idx] is indexed up to self._nb_clazz - 1, so more
    # than 7 classes would overrun the default list; '-' also does not look
    # like a Matplotlib marker code — verify.
    X, y = self.__check_data_available()
    n_row, n_col = X.shape
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    c_map = plt.cm.get_cmap("hsv", self._nb_clazz + 1)
    colors = dict((self._clazz[idx], c_map(idx)) for idx in range(0, self._nb_clazz)) \
        if colors is None else colors
    markers = dict((self._clazz[idx], markers[idx]) for idx in range(0, self._nb_clazz))

    def plot_constraints(lower, upper, _linestyle="solid"):
        """Draw the closed rectangle spanned by the corners `lower` and `upper`."""
        plt.plot([lower[0], lower[0], upper[0], upper[0], lower[0]],
                 [lower[1], upper[1], upper[1], lower[1], lower[1]],
                 linestyle=_linestyle)
        plt.grid()

    def plot2D_scatter(X, y):
        """Plot every row of X with the marker and colour of its label in y."""
        for row in range(0, len(y)):
            plt.scatter(X[row, 0], X[row, 1], marker=markers[y[row]], c=colors[y[row]])

    def plot_ellipse(splot, mean, cov, color):
        """Add an unfilled ellipse at `mean`, sized and rotated from eigh(cov)."""
        from scipy import linalg
        v, w = linalg.eigh(cov)
        u = w[0] / linalg.norm(w[0])
        angle = np.arctan(u[1] / u[0])
        angle = 180 * angle / np.pi  # radians -> degrees
        ell = mpl.patches.Ellipse(mean, 2 * v[0] ** 0.5, 2 * v[1] ** 0.5,
                                  180 + angle, facecolor="none",
                                  edgecolor=color, linewidth=2, zorder=2)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(0.9)
        splot.add_artist(ell)

    if n_col == 2:
        for clazz in self._clazz:
            # Solid rectangle: stored lower/upper mean bounds of the class.
            post_mean_lower = self._mean_lower[clazz]
            post_mean_upper = self._mean_upper[clazz]
            plot_constraints(post_mean_lower, post_mean_upper)
            # Dashed rectangle: class mean shifted by -/+ self.ell.
            mean = self.get_mean_by_clazz(clazz)
            prior_mean_lower = mean - self.ell
            prior_mean_upper = mean + self.ell
            plot_constraints(prior_mean_lower, prior_mean_upper, _linestyle="dashed")

        if query is not None:
            ml_mean, ml_cov, ml_prob = self.fit_max_likelihood(query)
            plt.plot([query[0]], [query[1]], marker='h', markersize=5, color="black")
            _, _bounds = self.evaluate(query)
            for clazz in self._clazz:
                plt.plot([ml_mean[clazz][0]], [ml_mean[clazz][1]], marker='o', markersize=5, color=colors[clazz])
                # Mark the 'inf' and 'sup' points returned by evaluate().
                _, est_mean_lower = _bounds[clazz]['inf']
                _, est_mean_upper = _bounds[clazz]['sup']
                plt.plot([est_mean_lower[0]], [est_mean_lower[1]], marker='x', markersize=4, color="black")
                plt.plot([est_mean_upper[0]], [est_mean_upper[1]], marker='x', markersize=4, color="black")

        # The same covariance is used for every class ellipse.
        cov, inv, det = self.__cov_group_sample()
        s_plot = plt.subplot()
        for clazz in self._clazz:
            mean = self.get_mean_by_clazz(clazz)
            plot_ellipse(s_plot, mean, cov, colors[clazz])
    elif n_col > 2:
        if query is not None:
            # Append the query as an extra row so it is projected with the data.
            inference, _ = self.evaluate(query)
            X = np.vstack([X, query])
            y = np.append(y, inference[0])
        from sklearn.manifold import Isomap
        iso = Isomap(n_components=2)
        projection = iso.fit_transform(X)
        X = np.c_[projection[:, 0], projection[:, 1]]
        if query is not None:
            # Row n_row is the appended query; fill it with the predicted
            # class colour only when the inference holds a single class.
            color_instance = colors[inference[0]] if len(inference) == 1 else 'black'
            plt.plot([X[n_row, 0]], [X[n_row, 1]], color='red', marker='o', mfc=color_instance)
    else:
        raise Exception("Not implemented for one feature yet.")
    plot2D_scatter(X, y)
    plt.show()
# Build the feature table: every column except the target 'y', with the
# listed categorical columns passed to one_hot_dataframe.
labels.remove('y')
X_raw = df[list(labels)]
X_train, _, _ = one_hot_dataframe(X_raw, [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact',
    'month', 'poutcome'
], replace=True)
# Binary target: 1 for 'yes', 0 otherwise.
y_train = [1 if i == 'yes' else 0 for i in df.y]

# Collect one 2-component projection per reduction method.
reductions = []
pca = PCA(n_components=2)
reductions.append(pca.fit_transform(X_train, y_train))
# NOTE(review): y_train has two classes; confirm that LDA can deliver
# n_components=2 for a two-class target.
lda = LDA(n_components=2)
reductions.append(lda.fit_transform(X_train, y_train))
isomap = Isomap(n_components=2)
reductions.append(isomap.fit_transform(X_train, y_train))
lle = LocallyLinearEmbedding(n_components=2, method='standard')
reductions.append(lle.fit_transform(X_train, y_train))

# One figure per projection; points are sorted into per-colour lists.
for reduced_X in reductions:
    plt.figure()
    red_x = []
    red_y = []
    blue_x = []
    blue_y = []
    green_x = []
    green_y = []
    for i in range(len(reduced_X)):
        # Target 0 goes to the red series (remaining cases are not visible
        # in this excerpt).
        if y_train[i] == 0:
            red_x.append(reduced_X[i][0])
from sklearn.manifold import Isomap

# Embed the digit images into two Isomap components.
iso = Isomap(n_components=2)
digits_isomap = iso.fit_transform(digits.data)
x_coords = digits_isomap[:, 0]
y_coords = digits_isomap[:, 1]

plt.figure(figsize=(10, 10))
plt.xlim(x_coords.min(), x_coords.max() + 1)
plt.ylim(y_coords.min(), y_coords.max() + 1)

# actually plot the digits as text instead of using scatter
for px, py, target in zip(x_coords, y_coords, digits.target):
    plt.text(px, py, str(target),
             color=colors[target],
             fontdict={'weight': 'bold', 'size': 9})
def fit_transform(self, X):
    """
    Compute the dimensionality-reduction result.

    The method chosen by ``self.affinity`` is tried first; if it names one of
    the classical methods its result is returned directly. Otherwise an
    affinity matrix is built and embedded with the frame named by
    ``self.frame``.

    :param X: high-dimensional data matrix; each row is one high-dimensional data point
    :return: the embedded data, or None when ``self.frame`` is not recognised
    """
    # NOTE(review): block nesting below was reconstructed from a
    # whitespace-collapsed source — confirm against the original file.
    (n, m) = X.shape  # unpacking requires X to be two-dimensional
    print(self.parameters)

    # Use a classical dimensionality-reduction method
    if self.affinity == 'PCA':  # return the PCA result directly
        print('Classical method: PCA...')
        pca = PCA(n_components=self.n_components)
        return pca.fit_transform(X)
    elif self.affinity == 'MDS':  # return the MDS result directly
        print('Classical method: MDS...')
        mds = MDS(n_components=self.n_components)
        return mds.fit_transform(X)
    elif self.affinity == 'Isomap':  # return the Isomap result directly
        print('Classical method: Isomap...')
        iso = Isomap(n_components=self.n_components, n_neighbors=self.parameters['n_neighbors'])
        return iso.fit_transform(X)
    elif self.affinity == 't-SNE':  # return the t-SNE result directly
        print('Classical method: t-SNE...')
        tsne = TSNE(n_components=self.n_components, perplexity=self.parameters['perplexity'])
        return tsne.fit_transform(X)
    elif self.affinity == 'cTSNE':  # reduce with the non-accelerated t-SNE version
        print('Classical method: classical t-SNE...')
        from ArtDR import tsne
        return tsne.tsne(X, perplexity=self.parameters['perplexity'], path=self.path, config_str='t-SNE ')
    elif self.affinity == 'LLE':  # return the LLE result directly
        print('Classical method: LLE...')
        lle = LocallyLinearEmbedding(
            n_components=self.n_components,
            n_neighbors=self.parameters['n_neighbors'])
        return lle.fit_transform(X)
    elif self.affinity == 'geo-t-SNE':  # use the geodesic-distance-based t-SNE method
        print('Geodesic t-SNE...')
        gtsne = geoTsne(n_neighbors=self.parameters['n_neighbors'], perplexity=self.parameters['perplexity'])
        return gtsne.fit_transform(X, n_components=self.n_components)

    if self.parameters['use_skeleton']:  # use the skeleton-point method
        return self.skeleton_fit_transform(X)

    # Use our own dimensionality-reduction method
    if self.parameters['neighborhood_type'] == 'iter':  # iterative variant
        W = self.iter_affinity_matrix(X)
    else:
        W = self.affinity_matrix(X)

    # Use our ordinary method: embed W with the selected frame
    if self.frame == 'MDS':
        print('Using MDS frame...')
        mds = MDS(n_components=self.n_components, dissimilarity='precomputed')
        Y = mds.fit_transform(W)
        return Y
    elif self.frame == 't-SNE':
        print('Using t-SNE frame...')
        Y = tsneFrame.tsne_plus(W, self.parameters['perplexity'], path=self.path, config_str=self.config_str)
        return Y
    elif self.frame == 't-SNE+':
        print('Using t-SNE framework in sklearn...')
        tsne = tsneFramePlus.tsnePlus(
            n_components=self.n_components,
            perplexity=self.parameters['perplexity'])
        Y = tsne.fit_transform(W)
        return Y
    else:
        # Unknown frame: report it and return None.
        print("Wrong frame name!")
        return
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import Isomap

resourceFolder = '../res/'
dataframe = pd.read_csv(resourceFolder + 'EnergyMix.csv')
# Keep only the five energy-source columns for the embedding.
df = dataframe.loc[:, ['Oil', 'Gas', 'Coal', 'Nuclear', 'Hydro']]
# print(x) with one argument is valid on both Python 2 and Python 3.
print(df)

imap = Isomap()
df_reduced = imap.fit_transform(df)
print(df_reduced)

# Scatter the first two embedding coordinates and label each point
# with the matching entry of the "Country" column.
plt.plot(df_reduced[:, 0], df_reduced[:, 1], '.')
for index, country in enumerate(dataframe["Country"]):
    plt.text(df_reduced[index, 0], df_reduced[index, 1], country)
plt.savefig('../doc/EnergyMix_Reduced.png')
plt.show()
# Min-max rescale using the overall minimum and maximum of threes_data.
threes_data = (threes_data - threes_data.min()) / \
    (threes_data.max() - threes_data.min())

# Shared settings for both embeddings below.
n_neighbors = 5
n_components = 4

# 1. Apply LLE
lle = LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=n_components)
lle_data = lle.fit_transform(threes_data)
lle_df = pd.DataFrame(lle_data)
plot_three("LLE", lle_df, 0, 1, threes_df, 0.45)

# 2. Apply ISOMAP
iso = Isomap(n_neighbors=n_neighbors, n_components=n_components)
iso_data = iso.fit_transform(threes_data)
iso_df = pd.DataFrame(iso_data)
plot_three("Isomap", iso_df, 0, 1, threes_df, 0.45)

# 3. Use the Naive Bayes classifier to classify the dataset based on the projected 4-dimension representations of the LLE and ISOMAP.
# Drop the first and the last column of df.
df_data = df.values[:, 1: len(df.columns) - 1]
test_size = 0.3


def calc_mean_accuracy(data, threshold=0.00015, miniter=500):
    # NOTE(review): `thresh` is not a parameter here (the parameter is
    # `threshold`) — confirm whether a module-level `thresh` exists.
    print("Diff threshold {}".format(thresh))
    i = 0
    scores = []
    mean_accuracy = 0
    gnb = GaussianNB()
# Put the features into a frame and attach the target plus a string label.
df = pd.DataFrame(X,columns=feat_cols)
df['y'] = y
df['label'] = df['y'].apply(lambda i: str(i))

# Drop the old references, then re-read features and target from the frame.
X, y = None, None
df_subset = df
X = df_subset[feat_cols].values
y = df_subset['y']

#n_components = 50
from sklearn.manifold import Isomap
# NOTE: despite the "pca_50" name this is an Isomap with 15 components.
pca_50 = Isomap(n_neighbors=5,n_components=15)
#pca_50 = PCA(n_components=n_components)
pca_result_50 = pca_50.fit_transform(X)
#print('Cumulative explained variation for %d principal components: %3.f' \
# % (n_components, np.sum(pca_50.explained_variance_ratio_)))

# From here on X is the 15-component embedding.
X = pca_result_50
y = df['y']


def run_silhouhette_analysis():
    # Cluster counts that are tried in turn.
    range_n_clusters = [2, 3, 4]
    for n_clusters in range_n_clusters:
        if PLT_ALL:
            # Two side-by-side axes per cluster count.
            fig, (ax1, ax2) = plt.subplots(1, 2)
            fig.set_size_inches(18, 7)
            ax1.set_xlim([-0.1, 1])
#-------------------------------------------
# RUN CLASSIFIER WITH ISOMAP IMPLEMENTATION
#-------------------------------------------
# ISOMAP is so slow that the value of n_components is manually adjusted; the
# process in fact did not successfully run on the full dataset and various
# subsets of data were created to generate results demonstrating that ISOMAP
# is disadvantageous.
classifier_condition = "Random Forest, ISOMAP"
x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.20, random_state=5)

start = time.time()
embedding = Isomap(n_components=6)
x_train = embedding.fit_transform(x_train)
# Project the test split with the embedding learned on the training split.
# Re-fitting on the test data would give coordinates unrelated to the ones
# the classifier is trained on.
x_test = embedding.transform(x_test)

rfclassifier = RandomForestClassifier(n_estimators=500, random_state=5,
                                      criterion='gini')
classifier = OneVsRestClassifier(rfclassifier, n_jobs=-1)
classifier.fit(x_train, y_train)
prediction = classifier.predict(x_test)
end = time.time()

# The elapsed time covers embedding, training and prediction.
save_data[f"{classifier_condition}_n = 6"] = (
    model_evaluation("RF", "6", x_test, y_test, prediction, classifier,
                     end - start, n_classes))
save_data.to_csv("Random_Forest_ISO_6.csv")
def testMahalanobisMushroom():
    """Compare Isomap and the Mahalanobis diffusion map on the mushroom data.

    Samples 5000 random 2-D domain points, maps them with getMushroom, and
    embeds the result with both a diffusion map on the Mahalanobis distances
    and a 2-component Isomap. Saves a 2x4 grid to "Mushroom.png": one column
    per point set (domain, mushroom, Isomap, Mahalanobis), top row coloured
    by X[:, 1] (labelled "x"), bottom row by X[:, 0] (labelled "y").
    """
    np.random.seed(0)
    N = 5000
    X = np.random.rand(N, 2)
    Y = getMushroom(X)
    fn_ellipsoid = lambda idx, delta, n_points: getMushroomEllipsoid(X, idx, delta, n_points)
    res = getMahalanobisDists(Y, fn_ellipsoid, 0.001, 400, 2)
    gamma = res["gamma"]
    dMaxSqrCoeff = 0.5

    # Using https://github.com/jmbr/diffusion-maps
    cmap = plt.get_cmap('magma_r')

    def column_colors(values):
        """Map values to RGB rows, scaled by their own maximum to 0..255."""
        index = np.array(np.round(255.0*values/np.max(values)), dtype=np.int32)
        return cmap(index)[:, 0:3]

    C1 = column_colors(X[:, 1])
    # Normalise by the maximum of the same column that is being coloured
    # (previously this divided column 0 by the maximum of column 1).
    C2 = column_colors(X[:, 0])

    t = dMaxSqrCoeff*np.max(gamma)*0.001
    tic = time.time()
    YM = getDiffusionMap(gamma, t, distance_matrix=True, neigs=6, thresh=1e-10)
    print("Elapsed Time: %.3g"%(time.time()-tic))

    YIso = Isomap(n_components=2).fit_transform(Y)

    # (title prefix, horizontal coordinates, vertical coordinates)
    panels = (
        ("Domain", X[:, 1], X[:, 0]),
        ("Mushroom", Y[:, 0], Y[:, 1]),
        ("ISOMAP", YIso[:, 0], YIso[:, 1]),
        ("Mahalanobis", YM[:, 0], YM[:, 1]),
    )
    plt.figure(figsize=(16, 8))
    for col, (name, horiz, vert) in enumerate(panels):
        for row, (point_colors, axis_name) in enumerate(((C1, "x"), (C2, "y"))):
            plt.subplot(2, 4, 4*row + col + 1)
            plt.scatter(horiz, vert, c=point_colors)
            plt.axis('equal')
            plt.title("%s, Colored by %s" % (name, axis_name))
    plt.savefig("Mushroom.png", bbox_inches='tight')
# Reuse the cached embeddings when the archive can be opened; otherwise
# compute all three 6-component embeddings and write the archive.
try:
    data_res = np.load('feature_res.npz')
    pca_data = data_res['pca_data']
    tsne_data = data_res['tsne_data']
    iso_data = data_res['iso_data']
except IOError:
    data = images_as_matrix()
    pca = PCA(n_components=6)
    pca_data = pca.fit_transform(data)
    tsne = TSNE(n_components=6, method='exact')
    tsne_data = tsne.fit_transform(data)
    iso = Isomap(n_components=6)
    iso_data = iso.fit_transform(data)
    np.savez('feature_res.npz', pca_data=pca_data, tsne_data=tsne_data, iso_data=iso_data)

data_labels = np.loadtxt('labels.txt', delimiter=',')
# Column order of the stacked matrix: PCA, then t-SNE, then Isomap.
stacked_features = np.concatenate((pca_data, tsne_data, iso_data), axis=1)
# Score only the rows whose second label column is positive.
stacked_f, stacked_prob = f_classif(stacked_features[data_labels[:, 1] > 0, :], data_labels[data_labels[:, 1] > 0, 1])
plt.figure()
# One bar per stacked feature (3 x 6 = 18); the call continues past this
# excerpt.
plt.bar(range(18), stacked_f, width=.2,
X_pca = pca.fit_transform(T)
'''
# No, the accuracy levels off at the same value as before from 7 components onwards.
# If you are not, then forget about PCA entirely, unless you want to visualize your data. However if you are able to get a higher score,
# then be *sure* keep that figure in mind, and comment out all the PCA code.
# In the same spot, run Isomap on the data, before sending it to the train / test split. Manually experiment with every inclusive
# combination of n_neighbors between 2 and 5, and n_components between 4 and 6. Are you able to get a better accuracy?
from sklearn.manifold import Isomap

# You're going to have to write nested for loops that wrap around everything from here on down!
# NOTE(review): block nesting was reconstructed from a whitespace-collapsed
# source, and the first line above closes a string opened before this
# excerpt — confirm against the original file.
best_score = 0
# k: n_neighbors 2..5, l: n_components 4..6
for k in range(2, 6):
    for l in range(4, 7):
        iso = Isomap(n_neighbors=k, n_components=l)
        X_iso = iso.fit_transform(T)

        # Perform a train/test split. 30% test group size, with a random_state equal to 7.
        # NOTE(review): sklearn.cross_validation is absent from current
        # scikit-learn (model_selection replaces it) — confirm target version.
        from sklearn.cross_validation import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X_iso, y, test_size=0.3, random_state=7)

        # Create a SVC classifier. Don't specify any parameters, just leave everything as default.
        # Fit it against your training data and then score your testing data.
        from sklearn.svm import SVC

        # Lines below are for the first lab question:
        '''
        model = SVC()
        model.fit(X_train, y_train)
newlabels = np.array(newlabels)

# Move the first axis to the end, then view the data as
# (samples, 4, 32, 64).
features = np.transpose(features, (1, 2, 3, 0))
features = features.reshape((len(features), 4, 32, 64))
print(features.shape)
'''
feats=fs.mutual_info_classif(features,newlabels,n_neighbors=5,random_state=0)
max_indices=sorted(range(len(feats)), key=lambda i: feats[i])[-64:] #picking max 64 features
print(len(max_indices))
features=np.reshape(features,(len(features),-1))
newfeatures=[]
for f in features:
    newfeatures.append(f[max_indices])
features=np.array(newfeatures)
'''
# Flatten every sample to one row before embedding.
features = features.reshape((len(features), -1))
print(features.shape)

# NOTE: the estimator is an Isomap even though the variable is called `lle`.
lle = Isomap(n_components=10, max_iter=60000, n_jobs=-1)
X_embedded = lle.fit_transform(features)
print(X_embedded.shape)

# Persist the embedding.
with open('../Manifold_features/isomap', 'wb') as fp:
    pickle.dump(X_embedded, fp)
# NOTE(review): these two statements are the tail of a selection loop whose
# header lies outside this excerpt; re-indent them under it when merging.
X_train.append(XX_train[i])
y_train.append(yy_train[i])

num_samples_to_plot = 5000
X_train, y_train = shuffle(X_train, y_train)
# lets subsample a bit for a first impression
X_train, y_train = X_train[:num_samples_to_plot], y_train[:num_samples_to_plot]

for digit in mytargets:
    instances = [i for i in y_train if i == digit]
    # Single-argument print() runs on Python 2 and 3; the format string
    # reproduces the spacing of the old comma-separated print statement.
    print("Digit {} appears  {} times".format(digit, len(instances)))

transformer = Isomap(n_neighbors=10, n_components=2)
fig, plot = plt.subplots()
fig.set_size_inches(50, 50)
plt.prism()
X_transformed = transformer.fit_transform(X_train)
plot.scatter(X_transformed[:, 0], X_transformed[:, 1], c=y_train)
plot.set_xticks(())
plot.set_yticks(())
plt.tight_layout()
plt.suptitle("Isomap for MNIST digits ")

# Lets annotate every 1 out of 200 samples, otherwise graph will be cluttered
# with annotations. enumerate replaces the hand-maintained counter, and the
# loop names no longer overwrite a module-level `y`.
points = zip(y_train, X_transformed[:, 0], X_transformed[:, 1])
for index, (label, px, py) in enumerate(points):
    if index % 200 == 0:
        plt.annotate(str(int(label)), xy=(px, py), color='black',
                     weight='normal', size=10,
                     bbox=dict(boxstyle="round4,pad=.5", fc="0.8"))

#plt.savefig("mnist_pca.png")
plt.show()
def testMahalanobisCircle():
    """Compare Isomap with masked/unmasked Mahalanobis diffusion maps.

    Builds a pinched circle from 1000 parameter values in [0, 2*pi), computes
    its Mahalanobis distances, and writes two 2x2 figures:
    "PinchedCircle_Mask.png" (distance matrices and mask) and
    "PinchedCircle.png" (the curve and its three embeddings, coloured by the
    curve parameter).
    """
    dMaxSqrCoeff = 1.0
    np.random.seed(0)
    N = 1000
    angles = np.linspace(0, 1, N+1)[0:N]
    angles *= 2*np.pi
    Y = getPinchedCircleParam(angles)
    fn_ellipsoid = lambda idx, delta, n_points: getPinchedCircleEllipsoid(angles, idx, delta, n_points)
    np.random.seed(2)
    res = getMahalanobisDists(Y, fn_ellipsoid, delta=0.1, n_points=100,
                              rank=1, maxeigs=2, jacfac=10)
    gamma = res["gamma"]
    mask = res["mask"]

    ## Step 1: Show the effect of the mask
    # Masked copy: entries where the mask is zero become infinite.
    D = np.array(gamma)
    D[mask == 0] = np.inf
    matrix_panels = (
        (res["DSqr"], "Original"),
        (res["gamma"], "Full Mahalanobis"),
        (mask, "Mask"),
        (D, "Masked Mahalanobis"),
    )
    plt.figure(figsize=(8, 8))
    for position, (image, caption) in enumerate(matrix_panels, start=221):
        plt.subplot(position)
        plt.imshow(image)
        plt.title(caption)
    plt.savefig("PinchedCircle_Mask.png", bbox_inches='tight')

    # Colour every point by its curve parameter, scaled to 0..255.
    cmap = plt.get_cmap('magma_r')
    color_index = np.array(np.round(255.0*angles/np.max(angles)), dtype=np.int32)
    C1 = cmap(color_index)[:, 0:3]

    diffusion_time = dMaxSqrCoeff*np.max(gamma)*0.001
    tic = time.time()
    YMask = getDiffusionMap(gamma, diffusion_time, mask=mask, distance_matrix=True, neigs=6, thresh=1e-10)
    YNoMask = getDiffusionMap(gamma, diffusion_time, distance_matrix=True, neigs=6, thresh=1e-10)
    print("Elapsed Time Diffusion Maps: %.3g"%(time.time()-tic))

    YIso = Isomap(n_components=2).fit_transform(Y)

    embedding_panels = (
        (Y, "Warped, Colored by t"),
        (YIso, "ISOMAP, Colored by t"),
        (YNoMask, "Mahalanobis, Colored by t"),
        (YMask, "Masked Mahalanobis, Colored by t"),
    )
    plt.figure(figsize=(8, 8))
    for position, (points, caption) in enumerate(embedding_panels, start=221):
        plt.subplot(position)
        plt.scatter(points[:, 0], points[:, 1], c=C1)
        plt.axis('equal')
        plt.title(caption)
    plt.savefig("PinchedCircle.png", bbox_inches='tight')
from sklearn.manifold import Isomap
from sklearn.manifold import LocallyLinearEmbedding
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from mpl_toolkits.mplot3d import Axes3D
import random
from colorsys import hsv_to_rgb

# NOTE(review): PCA is not imported in this excerpt — confirm it is brought
# into scope earlier in the original file.
pca = PCA(n_components=2)
isomap = Isomap(n_components=2)
lle = LocallyLinearEmbedding(n_components=2)

data = np.genfromtxt('data01_small.txt', delimiter=',')
pca_xform = pca.fit_transform(data)
isomap_xform = isomap.fit_transform(data)
lle_xform = lle.fit_transform(data)

# First 100 rows are labelled 0, the next 100 are labelled 1.
label = [0]*100+[1]*100
rgbs = [(0.5,0,0), (0,0.5,0)]

plt.figure()
xs = pca_xform[:,0]
ys = pca_xform[:,1]
ax = plt.subplot(111)
# Draw each point as its label text. zip replaces the Python-2-only xrange
# index loop and runs on both Python 2 and 3.
for px, py, point_label in zip(xs, ys, label):
    ax.text(px, py, str(point_label), color=rgbs[point_label],
            fontdict={'weight': 'bold', 'size': 9})
# Pad the axes on every side by 10% of the x-range.
t = (max(xs)-min(xs))*0.1
ax.axis([min(xs)-t, max(xs)+t, min(ys)-t, max(ys)+t])
plt.xticks([]), plt.yticks([])
plt.title('PCA')
def dimensionality_reduction(df_sel_data, n_comp):
    """Reduce `df_sel_data` to `n_comp` Isomap components.

    Logs the length of the first entry of `df_sel_data` and the target
    component count, wraps the data in a DataFrame, and returns the fitted
    Isomap embedding.
    """
    n_features = len(df_sel_data[0])
    print("[INFO] feature reduction of " + str(n_features) + " for " + str(n_comp))
    frame = pd.DataFrame(df_sel_data)
    return Isomap(n_components=n_comp).fit_transform(frame)
class Project(Standardize.Standard):
    '''
    Project data into a new feature space using linear and non-linear
    transformation methods.

    The supported projections are PCA, UMAP, t-SNE, PHATE and Isomap. A
    single projection object is held at a time (in project_obj): fitting a
    new projection replaces the previous one, and the *_fitted flags record
    which kind of object is currently stored.
    '''

    # Names of the flags recording which kind of object project_obj holds.
    _PROJECTION_FLAGS = ('pca_fitted', 'umap_fitted', 'tsne_fitted',
                         'phate_fitted', 'isomap_fitted')

    def __init__(self, standard_method=None):
        '''
        The class takes as input the valid standardization methods from the
        Standard class. Valid standardization methods include:
            - 'standardize' = Mean center the data and scale by the standard deviation
            - 'center' = Mean center the data
            - 'min_max' = Scale the data to between 0 and 1
        If standard_method is None the data is projected without
        standardization.
        '''
        self.standard_method = standard_method
        # Every flag exists from construction, so calling a transform_*()
        # method before fitting fails on its assertion rather than with an
        # AttributeError.
        self._mark_fitted(None)
        self.standard_data = None
        self.standard_obj = None
        self.project_obj = None
        self.project_data = None
        super().__init__(standard_method)

    def _mark_fitted(self, fitted_flag):
        '''
        Set the flag named fitted_flag to True and every other projection
        flag to False. Pass None to clear all of them.
        '''
        for flag in self._PROJECTION_FLAGS:
            setattr(self, flag, flag == fitted_flag)

    def _prepare_fit_input(self, x):
        '''
        Return x ready for fitting: standardized with a newly fitted
        standardizer when a standardization method is set, unchanged otherwise.
        '''
        if self.standard_method is not None:
            x = self.fit_transform_standard(x, return_data=True)
        return x

    def _prepare_transform_input(self, x):
        '''
        Return the data a transform_*() call should project.

        With a standardization method set, x is standardized with the already
        fitted standardizer, or self.standard_data is used when x is None.
        Without one, x is passed through unchanged (so it must not be None).
        '''
        if self.standard_method is not None:
            if x is not None:
                return self.transform_standard(x, return_data=True)
            return self.standard_data
        return x

    def _apply_projection(self, x, return_data):
        '''
        Transform x with the fitted projection object, store the result in
        project_data and return it only when return_data is truthy.
        '''
        self.project_data = self.project_obj.transform(x)
        if return_data:
            return self.project_data

    def reset_params(self):
        '''
        Reset the standardization and projection parameters to their default
        values. Returns self.
        '''
        self.standard_fitted = False
        self._mark_fitted(None)
        self.standard_data = None
        self.standard_obj = None
        self.project_obj = None
        self.project_data = None
        return self

    def fit_pca(self, x, n_comps=None):
        '''
        Fit a PCA object to a dataset with the inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the PCA object. If a
                standardization method is specified then x is standardized
                prior to fitting the PCA object.
            - n_comps: int, default None, n_comps <= min( n_features, n_observations )
                Specifies the number of principal components to fit to the
                dataset. Defaults to min(x.shape).

        Returns the fitted PCA object.
        '''
        if n_comps is None:
            n_comps = min(x.shape)
        assert n_comps <= min(x.shape), 'n_comps must be less than or equal to the minimum element of x.shape'
        x = self._prepare_fit_input(x)
        self.project_obj = PCA(n_components=n_comps).fit(x)
        self._mark_fitted('pca_fitted')
        return self.project_obj

    def transform_pca(self, x=None, return_data=False):
        '''
        Project a dataset using the PCA object fitted in fit_pca():
            - x: Array or dataframe, default = None
                Contains the data to be transformed by the PCA object. If a
                standardization method is specified, x is standardized with
                the already fitted standardizer, and x=None projects the
                stored standardized data instead. Without a standardization
                method x is handed to the PCA object as given, so it must be
                provided.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        The projected data is always stored in self.project_data.
        '''
        assert self.pca_fitted, 'No PCA object has been fitted'
        x = self._prepare_transform_input(x)
        return self._apply_projection(x, return_data)

    def fit_transform_pca(self, x, n_comps=None, return_data=False):
        '''
        Fit a PCA object and project the dataset using the fitted PCA object
        with the inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the PCA object. If a
                standardization method is specified then x is standardized
                prior to fitting the PCA object.
            - n_comps: int, default None, n_comps <= min( n_features, n_observations )
                Specifies the number of principal components to fit to the
                dataset. Defaults to min(x.shape).
            - return_data: bool, default = False
                If True the function will return the projected dataset
        '''
        if n_comps is None:
            n_comps = min(x.shape)
        assert n_comps <= min(x.shape), 'n_comps must be less than or equal to the minimum element of x.shape'
        x = self._prepare_fit_input(x)
        self.project_obj = PCA(n_components=n_comps).fit(x)
        self._mark_fitted('pca_fitted')
        return self._apply_projection(x, return_data)

    def fit_umap(self, x, n_comps=None, n_neighbors=50):
        '''
        Fit a UMAP object to a dataset with the inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the UMAP object. If a
                standardization method is specified then x is standardized
                prior to fitting the UMAP object.
            - n_comps: int, default = None, n_comps <= n_neighbors
                Specifies the number of UMAP components to fit to the
                dataset. Defaults to n_neighbors.
            - n_neighbors: int, default = 50
                This parameter controls how UMAP balances local versus global
                structure in the data. It does this by constraining the size
                of the local neighborhood UMAP will look at when attempting
                to learn the manifold structure of the data.

        Returns the fitted UMAP object.

        Reference to the authors of UMAP:
            McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and
            Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018
        '''
        if n_comps is None:
            n_comps = n_neighbors
        assert n_comps <= n_neighbors, 'n_comps must be less than or equal to n_neighbors'
        x = self._prepare_fit_input(x)
        self.project_obj = umap.UMAP(n_neighbors=n_neighbors,
                                     min_dist=0.5,
                                     n_components=n_comps).fit(x)
        self._mark_fitted('umap_fitted')
        return self.project_obj

    def transform_umap(self, x=None, return_data=False):
        '''
        Project a dataset using the UMAP object fitted in fit_umap():
            - x: Array or dataframe, default = None
                Contains the data to be transformed by the fitted UMAP
                object. If a standardization method is specified, x is
                standardized with the already fitted standardizer, and x=None
                projects the stored standardized data instead. Without a
                standardization method x is handed to the UMAP object as
                given, so it must be provided.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        The projected data is always stored in self.project_data.
        '''
        assert self.umap_fitted, 'No UMAP object has been fitted'
        x = self._prepare_transform_input(x)
        return self._apply_projection(x, return_data)

    def fit_transform_umap(self, x, n_comps=None, n_neighbors=50, return_data=False):
        '''
        Fit a UMAP object to a dataset and project that dataset, with the
        inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the UMAP object. If a
                standardization method is specified then x is standardized
                prior to fitting the UMAP object.
            - n_comps: int, default = None, n_comps <= n_neighbors
                Specifies the number of UMAP components to fit to the
                dataset. Defaults to n_neighbors.
            - n_neighbors: int, default = 50
                This parameter controls how UMAP balances local versus global
                structure in the data. It does this by constraining the size
                of the local neighborhood UMAP will look at when attempting
                to learn the manifold structure of the data.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        Reference to the authors of UMAP:
            McInnes, L, Healy, J, UMAP: Uniform Manifold Approximation and
            Projection for Dimension Reduction, ArXiv e-prints 1802.03426, 2018
        '''
        if n_comps is None:
            n_comps = n_neighbors
        assert n_comps <= n_neighbors, 'n_comps must be less than or equal to n_neighbors'
        x = self._prepare_fit_input(x)
        self.project_obj = umap.UMAP(n_neighbors=n_neighbors,
                                     min_dist=0.5,
                                     n_components=n_comps).fit(x)
        self._mark_fitted('umap_fitted')
        return self._apply_projection(x, return_data)

    def fit_transform_tsne(self, x, n_comps=None, return_data=False):
        '''
        Fit a TSNE object to a dataset and compute its embedding, with the
        inputs:
            - x: Array or dataframe
                Contains the data to be embedded by the TSNE object. If a
                standardization method is specified then x is standardized
                prior to fitting the TSNE object.
            - n_comps: int, default = None, n_comps <= 3
                Specifies the number of t-SNE components. Defaults to 3.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        The embedding comes from TSNE.fit_transform and is stored in
        self.project_data; this class provides no separate transform_tsne().

        Reference to the authors of t-SNE:
            van der Maaten, L, Hinton, G, Visualizing Data using t-SNE,
            Journal of Machine Learning Research 9, 2008
        '''
        if n_comps is None:
            n_comps = 3
        assert n_comps <= 3, 'n_comps must be less than or equal to 3'
        x = self._prepare_fit_input(x)
        self.project_obj = TSNE(n_components=n_comps, n_iter=1000)
        self.project_data = self.project_obj.fit_transform(x)
        self._mark_fitted('tsne_fitted')
        if return_data:
            return self.project_data

    def fit_phate(self, x, n_comps=None, knn=5, decay=40, n_landmark=2000, t='auto', gamma=1,
                  n_pca=100, mds_solver='sgd', knn_dist='euclidean', mds_dist='euclidean', mds='metric',
                  n_jobs=1, random_state=123, verbose=1):
        '''
        Fit a PHATE object to a dataset with the inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the PHATE object. If a
                standardization method is specified then x is standardized
                prior to fitting the PHATE object.
            - n_comps: int, default = None
                Specifies the number of PHATE components to fit to the
                dataset. Defaults to min(x.shape) and is capped at n_landmark.
            - all remaining keyword arguments are forwarded unchanged to
              phate.PHATE.

        Returns the fitted PHATE object.

        Reference to the authors of PHATE:
            Moon KR, van Dijk D, Zheng W, et al. (2017), PHATE: A
            Dimensionality Reduction Method for Visualizing Trajectory
            Structures in High-Dimensional Biological Data, BioRxiv.
        Documentation:
            https://phate.readthedocs.io/en/stable/api.html#id2
        '''
        if n_comps is None:
            n_comps = min(x.shape)
        n_comps = min((n_comps, n_landmark))
        x = self._prepare_fit_input(x)
        self.project_obj = phate.PHATE(n_components=n_comps, knn=knn, decay=decay, n_landmark=n_landmark, t=t,
                                       gamma=gamma, n_pca=n_pca, mds_solver=mds_solver, knn_dist=knn_dist,
                                       mds_dist=mds_dist, mds=mds, n_jobs=n_jobs,
                                       random_state=random_state, verbose=verbose).fit(x)
        self._mark_fitted('phate_fitted')
        return self.project_obj

    def transform_phate(self, x=None, return_data=False):
        '''
        Project a dataset using the PHATE object fitted in fit_phate():
            - x: Array or dataframe, default = None
                Contains the data to be transformed by the fitted PHATE
                object. If a standardization method is specified, x is
                standardized with the already fitted standardizer, and x=None
                projects the stored standardized data instead. Without a
                standardization method x is handed to the PHATE object as
                given.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        The projected data is always stored in self.project_data.
        '''
        assert self.phate_fitted, 'No PHATE object has been fitted'
        x = self._prepare_transform_input(x)
        return self._apply_projection(x, return_data)

    def fit_transform_phate(self, x, n_comps=None, knn=5, decay=40, n_landmark=2000, t='auto', gamma=1,
                            n_pca=100, mds_solver='sgd', knn_dist='euclidean', mds_dist='euclidean', mds='metric',
                            n_jobs=1, random_state=123, verbose=1, return_data=False):
        '''
        Fit a PHATE object to a dataset and project that dataset, with the
        inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the PHATE object. If a
                standardization method is specified then x is standardized
                prior to fitting the PHATE object.
            - n_comps: int, default = None
                Specifies the number of PHATE components to fit to the
                dataset. Defaults to min(x.shape) and is capped at n_landmark.
            - return_data: bool, default = False
                If True the function will return the projected dataset
            - all remaining keyword arguments are forwarded unchanged to
              phate.PHATE.

        Reference to the authors of PHATE:
            Moon KR, van Dijk D, Zheng W, et al. (2017), PHATE: A
            Dimensionality Reduction Method for Visualizing Trajectory
            Structures in High-Dimensional Biological Data, BioRxiv.
        Documentation:
            https://phate.readthedocs.io/en/stable/api.html#id2
        '''
        if n_comps is None:
            n_comps = min(x.shape)
        n_comps = min((n_comps, n_landmark))
        x = self._prepare_fit_input(x)
        self.project_obj = phate.PHATE(n_components=n_comps, knn=knn, decay=decay, n_landmark=n_landmark, t=t,
                                       gamma=gamma, n_pca=n_pca, mds_solver=mds_solver, knn_dist=knn_dist,
                                       mds_dist=mds_dist, mds=mds, n_jobs=n_jobs,
                                       random_state=random_state, verbose=verbose).fit(x)
        self._mark_fitted('phate_fitted')
        return self._apply_projection(x, return_data)

    def fit_isomap(self, x, n_comps=None, n_neighbors=5):
        '''
        Fit an Isomap object to a dataset with the inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the Isomap object. If a
                standardization method is specified then x is standardized
                prior to fitting the Isomap object.
            - n_comps: int, default = None
                Specifies the number of Isomap components to fit to the
                dataset. Defaults to min(x.shape).
            - n_neighbors: int, default = 5
                Passed to Isomap as n_neighbors.

        Returns the fitted Isomap object.

        Reference to the authors of Isomap:
            Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global
            geometric framework for nonlinear dimensionality reduction.
            Science 290 (5500), 2000
        '''
        if n_comps is None:
            n_comps = min(x.shape)
        x = self._prepare_fit_input(x)
        self.project_obj = Isomap(n_components=n_comps, n_neighbors=n_neighbors).fit(x)
        self._mark_fitted('isomap_fitted')
        return self.project_obj

    def transform_isomap(self, x=None, return_data=False):
        '''
        Project a dataset using the Isomap object fitted in fit_isomap():
            - x: Array or dataframe, default = None
                Contains the data to be transformed by the fitted Isomap
                object. If a standardization method is specified, x is
                standardized with the already fitted standardizer, and x=None
                projects the stored standardized data instead. Without a
                standardization method x is handed to the Isomap object as
                given, so it must be provided.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        The projected data is always stored in self.project_data.
        '''
        assert self.isomap_fitted, 'No Isomap object has been fitted'
        x = self._prepare_transform_input(x)
        return self._apply_projection(x, return_data)

    def fit_transform_isomap(self, x, n_comps=None, n_neighbors=5, return_data=False):
        '''
        Fit an Isomap object to a dataset and project that dataset, with the
        inputs:
            - x: Array or dataframe
                Contains the data to be fitted by the Isomap object. If a
                standardization method is specified then x is standardized
                prior to fitting the Isomap object.
            - n_comps: int, default = None
                Specifies the number of Isomap components to fit to the
                dataset. Defaults to min(x.shape).
            - n_neighbors: int, default = 5
                Passed to Isomap as n_neighbors.
            - return_data: bool, default = False
                If True the function will return the projected dataset

        Reference to the authors of Isomap:
            Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global
            geometric framework for nonlinear dimensionality reduction.
            Science 290 (5500), 2000
        '''
        if n_comps is None:
            n_comps = min(x.shape)
        x = self._prepare_fit_input(x)
        self.project_obj = Isomap(n_components=n_comps, n_neighbors=n_neighbors).fit(x)
        self._mark_fitted('isomap_fitted')
        return self._apply_projection(x, return_data)
#
# Plot2D call format: Plot2D(T, title, x, y, num_to_plot=40)
#   T      transformed data (NDArray)
#   title  chart title
#   x      component shown on the x-axis, can be 0 or 1
#   y      component shown on the y-axis, can be 1 or 2
#
pca = PCA(n_components=3)
T = pca.fit_transform(df)
for x_comp, y_comp in ((0, 1), (1, 2)):
    Plot2D(T, 'PCA', x_comp, y_comp)


#%%
# Isomap: reduce the dataframe df down to THREE components, then plot the
# same two component pairs as for PCA.
#
iso = Isomap(n_components=3)
T_iso = iso.fit_transform(df)
for x_comp, y_comp in ((0, 1), (1, 2)):
    Plot2D(T_iso, 'Isomap', x_comp, y_comp)


#%%
# Draw the three Isomap components as a 3D scatter plot.
#
ax = plt.subplot(111, projection='3d')
iso_x, iso_y, iso_z = T_iso[:, 0], T_iso[:, 1], T_iso[:, 2]
ax.scatter(iso_x, iso_y, iso_z)
plt.show()
Method to clean the data :return: data and labels """ # load the data dataset = np.genfromtxt("wdbc.data", dtype=np.float, delimiter=',', usecols=(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31), encoding=None) labels = np.genfromtxt("wdbc.data", dtype=None, delimiter=',', usecols=(1), encoding=None) temp_labels = np.zeros(len(labels)) for i in range(len(labels)): if labels[i] == 'B': temp_labels[i] = 0 else: temp_labels[i] = 1 # normalize temp_data = normalize(dataset) return temp_data, temp_labels x, y = original_clean() model = Isomap(n_components=size, n_neighbors=30) out = model.fit_transform(x) out = out[:, 0:2] plt.scatter(out[:, 0], out[:, 1], c=y, marker='o') plt.show() model_2 = DBSCAN() predicted = model_2.fit_predict(out) score = v_measure_score(predicted, y) print(score)
def apply_ISOMap(proj_data, proj_weights=None):
    """
    Embed proj_data in two dimensions with Isomap (4 neighbors).

    The data is passed through normalize_columns() and transposed before
    fitting, so the columns of the normalized data become the Isomap samples.

    :param proj_data: data handed to normalize_columns()
    :param proj_weights: accepted but not used by this function
    :returns: the result of Isomap.fit_transform on the transposed data
    """
    samples = normalize_columns(proj_data).T
    embedder = Isomap(n_components=2, n_neighbors=4)
    return embedder.fit_transform(samples)
raw_df.to_csv("./Data/raw_mat_vectors.csv", index=False)

############################################################################
################################ ISOMAP SSMs ###############################
############################################################################
print("Generating ISOMAP Matrices")

# initialize embedding (one component per frame)
iso = Isomap(n_neighbors=3, n_components=1)

# generate SSMs for each gesture: one square matrix per input array, sized by
# that array's number of rows
max_sz = 0  # track size to determine largest
iso_ssm_lst = [np.zeros(shape=(a.shape[0], a.shape[0])) for a in arrays]
for n, a in enumerate(arrays):
    embed = iso.fit_transform(a)
    # Iterate over rows explicitly; embed.size only equals the row count
    # while n_components is 1.
    n_rows = embed.shape[0]
    for i in range(n_rows):
        for j in range(n_rows):
            iso_ssm_lst[n][i, j] = cumulated_ts_2(embed[i, :], embed[j, :])
    max_sz = max(max_sz, n_rows)

# smooth SSM images
for r, s in enumerate(iso_ssm_lst):
    iso_ssm_lst[r] = gaussian_filter(s, sigma=1)

# zero pad images up to the largest SSM size.
# np.pad reads a flat two-element pad_width as (before, after) for EVERY
# axis, so passing the per-axis deficit directly would pad both sides of both
# axes and produce images of different sizes. Give explicit per-axis
# (before, after) pairs instead; padding goes on the bottom/right edge.
shape = (max_sz, max_sz)
pad_img = [
    np.pad(a,
           [(0, shape[0] - a.shape[0]), (0, shape[1] - a.shape[1])],
           'constant', constant_values=0)
    for a in iso_ssm_lst
]
# where each dimension represents the brightness of 1 pixel.
# Visualizing such relationships (given the number of dimensions) is hard.
# One approach is to use 'dimensionality reduction', such as manifold learning.
# Dimensionality reduction is an example of unsupervised machine learning.
# There is a later chapter dedicated to machine learning, so without getting
# into the discussion of how it works, here's a simple dimensionality
# reduction in action to achieve our desired goal:
# ----------------------------------------

# project the digits into 2 dimensions using IsoMap
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
projection = iso.fit_transform(digits.data)

# use discrete colormap to view results, set "ticks" / "clim" for aesthetics
# (plt.get_cmap is used because plt.cm.get_cmap no longer exists in recent
# Matplotlib releases)
plt.scatter(projection[:, 0], projection[:, 1], lw=0.1,
            c=digits.target, cmap=plt.get_cmap('cubehelix', 6))
plt.colorbar(ticks=range(6), label='digit value')
plt.clim(-0.5, 5.5)
# ----------------------------------------

# The projection also gives insight into relationships within the dataset,
# i.e. note that the '5' and '3' clusters are very close together
# while the '0' and '1' clusters are extremely far apart
show_figure(fdata, labels, ulabs, 'PCA')

# --- Sparse PCA: 3 components ---
print('Sparse PCA')
from sklearn.decomposition import SparsePCA

spca = SparsePCA(n_components=3)
fdata = spca.fit_transform(authors)
show_figure(fdata, labels, ulabs, 'Sparse PCA')

# --- ISOMAP: 3 components, 7 neighbors ---
print('ISOMAP')
from sklearn.manifold import Isomap

iso = Isomap(n_neighbors=7, n_components=3)
fdata = iso.fit_transform(authors)
show_figure(fdata, labels, ulabs, 'ISOMAP')

# --- LLE: 3 components, 7 neighbors, standard method ---
print('LLE')
from sklearn.manifold import LocallyLinearEmbedding

lle = LocallyLinearEmbedding(
    n_components=3,
    n_neighbors=7,
    method='standard',
)
fdata = lle.fit_transform(authors)
# Report how well the embedding reconstructs the data before plotting it
print(lle.reconstruction_error_)
show_figure(fdata, labels, ulabs, 'LLE')

# MDS
# Sepal measurements (features 0 and 1), colored by class
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')

# Petal measurements (features 2 and 3) in the right-hand panel
plt.subplot(122)
plt.scatter(X[:, 2], X[:, 3], c=y, cmap=plt.cm.Set1, edgecolor='k')
plt.xlabel('Petal Length')
plt.ylabel('Petal Width')
plt.show()

# +
from sklearn.manifold import Isomap

# Project all features to 2-D with Isomap and color the points by class
iso = Isomap(n_neighbors=5, n_components=2)
proj = iso.fit_transform(X)
plt.figure(figsize=(15, 9))
plt.scatter(proj[:, 0], proj[:, 1], c=y)
plt.colorbar()
plt.show()
# -

# ## blobs
#

# +
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

# Generate 3 blobs with 2 classes where the second blob contains
# Last column of df as a 2-D ndarray. DataFrame.as_matrix() was removed from
# pandas; .values returns the same array on old and new versions.
y = df.iloc[:, -1:].values
####################################################################

# # # ###### randomized principal component analysis for dimensionality reduction of alt set ########
# # # The purpose is to find a way to effectively label our data, since labeling based solely on
# # # perceptual criteria (meaning, just listening to the sounds and judging to which instrument they should
# # # be assigned) does not work well enough.
# # from sklearn.decomposition import RandomizedPCA as RandPCA
# # pca = RandPCA(n_components = 30)
# # X = pca.fit_transform(X)

# Isomap is used in place of the randomized PCA above: reduce X to 30 components.
from sklearn.manifold import Isomap
isomap = Isomap(n_components=30)
X = isomap.fit_transform(X)
####################################################################

############ cluster the alternative set into 17 clusters, using KMeans ##########
clstrer = KMeans(n_clusters=17)
clstr = clstrer.fit_predict(X)
####################################################################

########### names will be filled with the wav files' filenames ################
pardir = '../database/all_recorded_and_downloaded_alt_sounds_processed'
names = np.array([])
# NOTE(review): drops the first directory entry (presumably a hidden file such
# as .DS_Store); os.listdir() does not guarantee any order - confirm.
folders = os.listdir(pardir)[1:]
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import fetch_olivetti_faces
from sklearn.manifold import Isomap


# Set random seed for reproducibility
np.random.seed(1000)


if __name__ == '__main__':
    # Load the Olivetti faces dataset
    faces = fetch_olivetti_faces()

    # Fit a 2-component Isomap (5 neighbors) on the face data
    isomap = Isomap(n_components=2, n_neighbors=5)
    X_isomap = isomap.fit_transform(faces['data'])

    # Plot the first 100 embedded samples, each annotated with its target id
    fig, ax = plt.subplots(figsize=(18, 10))

    for idx in range(100):
        px = X_isomap[idx, 0]
        py = X_isomap[idx, 1]
        # One scatter call per point, as before, so every point is drawn as
        # its own series
        ax.scatter(px, py, s=100, marker='o')
        ax.annotate('%d' % faces['target'][idx], xy=(px + 0.5, py + 0.5))

    ax.set_xlabel(r'$x_0$')
    ax.set_ylabel(r'$x_1$')
    ax.grid()

    plt.show()
# NOTE(review): the nesting level of this exit(0) cannot be determined from
# the visible source; at top level everything below it is unreachable.
exit(0)

# Scale each hidden-representation matrix to [0, 1]. The same scaler object
# is re-fitted for each file, so the two sets are scaled independently.
min_max_scaler = MinMaxScaler()
benign_raw = np.load("./hidden_repre/ben_hid_emd_4_50_8_200_r0.npy")
x_benign = min_max_scaler.fit_transform(benign_raw)
vandal_raw = np.load("./hidden_repre/val_hid_emd_4_50_8_200_r0.npy")
x_vandal = min_max_scaler.fit_transform(vandal_raw)

x_benign = sample_shuffle_uspv(x_benign)
x_vandal = sample_shuffle_uspv(x_vandal)

# First 3000 rows of each class; label 0 = benign, 1 = vandal
X = np.array(x_benign[:3000].tolist() + x_vandal[:3000].tolist())
y = np.concatenate([np.zeros(3000), np.ones(3000)])

# 2-D Isomap embedding of the combined set
model_2D = Isomap(n_components=2)
X_2D = model_2D.fit_transform(X)
draw_2D(X_2D, y)
exit(0)
data_pca = pca.fit_transform(data)
# plt.get_cmap is used because plt.cm.get_cmap no longer exists in recent
# Matplotlib releases.
plt.scatter(data_pca[:, 0], data_pca[:, 1],
            c=target, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('rainbow', 2))
plt.colorbar();

## PCA energy (cumulative explained variance)
sb.set()
pca_ = PCA().fit(data)
plt.plot(np.cumsum(pca_.explained_variance_ratio_))
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');
plt.xlim(0, 5)

## IsoMap dimensionality reduction
from sklearn.manifold import Isomap
iso = Isomap(n_components=2)
data_projected = iso.fit_transform(data)
plt.scatter(data_projected[:, 0], data_projected[:, 1],
            c=target, edgecolor='none', alpha=0.5,
            cmap=plt.get_cmap('rainbow', 2));
plt.colorbar(label='Cancer', ticks=range(2))
# NOTE(review): the colorbar ticks are 0 and 1, yet the color limits are set
# to [-200, 0]; this looks unintended - confirm the desired range.
plt.clim(-200, 0)

### KNN
from sklearn.neighbors import KNeighborsClassifier
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module has been removed from scikit-learn.
from sklearn.model_selection import GridSearchCV

clf = KNeighborsClassifier()
n_neighbors = [1, 2, 3, 5, 8, 10, 15, 20, 25, 30, 35, 40]
weights = ['uniform', 'distance']
param_grid = [{'n_neighbors': n_neighbors, 'weights': weights}]
grid_search = GridSearchCV(clf, param_grid=param_grid, cv=10)
grid_search.fit(data, target)
# cv_results_ replaces the removed grid_scores_ attribute
grid_search.cv_results_
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, KernelPCA
from sklearn.manifold import Isomap

from analogy import Analogy
from vstore import VStore

# Works on both Python 2 (raw_input) and Python 3 (input).
try:
    read_line = raw_input
except NameError:
    read_line = input

a = Analogy(VStore("vectors.lmdb", "big-glove"))

# Collect words until the user enters an empty line.
buf = ""
linebuf = read_line("Please enter some words to plot, or empty for a canned list: ")
while linebuf:
    buf += linebuf + " "
    linebuf = read_line("... ")

labels = buf.split() \
    or "doctor nurse politician senator lawyer barrister defend accuse heal treat cure elect vote".split()

# Look every word up once and drop words with no vector from BOTH lists, so
# each plotted point is annotated with its own word.
found = [(word, vec) for word, vec in ((w, a.w(w)) for w in labels)
         if vec is not None]
labels = [word for word, _ in found]
vs = [vec for _, vec in found]

# Two output dimensions. The bare positional 2 used previously is read as
# n_neighbors by older scikit-learn and rejected by newer releases.
flatplot = Isomap(n_components=2)
ps = flatplot.fit_transform(vs)

plt.title("Reduced vector space model")
plt.xlabel("First Principal Component")
plt.ylabel("Second Principal Component")
plt.scatter(ps[:, 0], ps[:, 1])
for (x, y), label in zip(ps, labels):
    print("plotting %f, %f, %s" % (x, y, label))
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
plt.show()