def __init__(self, n_clusters=50, pca_n_components=20, kmpca_n_components=3,
             kernel_n_components=30):
    """Create the (unfitted) transformers used for the title features.

    Args:
        n_clusters: number of clusters for the mini-batch k-means model.
        pca_n_components: components kept by ``self.pca``.
        kmpca_n_components: components kept by ``self.kmpca``.
        kernel_n_components: components produced by the RBF sampler.
    """
    # Binary bag of unigrams and bigrams over lower-cased text.
    vectorizer_options = dict(stop_words='english',
                              ngram_range=(1, 2),
                              min_df=30,
                              binary=True,
                              lowercase=True)
    self.counter = text.CountVectorizer(**vectorizer_options)

    kmeans_options = dict(n_clusters=n_clusters,
                          n_init=10,
                          batch_size=10000,
                          verbose=1)
    self.km = cluster.MiniBatchKMeans(**kmeans_options)

    # Two independent randomized PCA reducers with different output sizes.
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(
        n_components=kmpca_n_components)

    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)

    # One name per feature block, all sharing the "Title_" prefix.
    self.X_names = ['Title_' + suffix for suffix in (
        'CounterX', 'ClusterdX', 'KmX', 'PCAX', 'PCAClusterdX', 'RbfX',
        'TreeX')]
    self.linear_feature_selector = None
def __init__(self, X_train, y_train):
    """Fit a random-trees-embedding + logistic-regression pipeline.

    The data is split 50/50 and the pipeline, stored in ``self.model``,
    is fitted on the first half only; the second half is not used here.

    Args:
        X_train: training samples.
        y_train: training labels.
    """
    halves = skms.train_test_split(X_train, y_train, test_size=0.5)
    # train_test_split returns (X_a, X_b, y_a, y_b); keep only the "a" half.
    X_fit, _unused_X, y_fit, _unused_y = halves

    self.model = skp.make_pipeline(
        ske.RandomTreesEmbedding(n_estimators=50),
        skline.LogisticRegression(),
    )
    self.model.fit(X_fit, y_fit)
def __init__(self, n_clusters=50, pca_n_components=30, kmpca_n_components=3,
             kernel_n_components=30):
    """Create the (unfitted) transformers used for the description features.

    Args:
        n_clusters: number of clusters for the mini-batch k-means model.
        pca_n_components: components kept by ``self.pca``.
        kmpca_n_components: components kept by ``self.kmpca``.
        kernel_n_components: components produced by the RBF sampler.
    """
    # Vocabulary-size trade-off (author's note):
    #   (min_df = 30, max_df = 0.5)     -> many features, more choice for
    #                                      feature selection
    #   (min_df = 0.001, max_df = 0.05) -> fewer features, better clustering
    vectorizer_options = dict(stop_words='english',
                              charset='utf-8',
                              charset_error='ignore',
                              ngram_range=(1, 1),
                              min_df=0.001,
                              max_df=0.05,
                              binary=True,
                              lowercase=True)
    self.counter = text.CountVectorizer(**vectorizer_options)

    kmeans_options = dict(n_clusters=n_clusters,
                          n_init=10,
                          batch_size=10000,
                          verbose=1)
    self.km = cluster.MiniBatchKMeans(**kmeans_options)

    # Two independent randomized PCA reducers with different output sizes.
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(
        n_components=kmpca_n_components)

    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)

    # One name per feature block, all sharing the "Desc_" prefix.
    self.X_names = ['Desc_' + suffix for suffix in (
        'CounterX', 'ClusterdX', 'KmX', 'PCAX', 'PCAClusterdX', 'RbfX',
        'TreeX')]
    self.linear_feature_selector = None
def classifier_choice(method='tsne', neighbors=30, dimensions=2):
    """Return an unfitted dimensionality-reduction estimator for ``method``.

    Matching is by substring containment (``method in "tsne"``), tested in
    the order listed below, so any substring of a method name (including
    the empty string) selects the first name that contains it.

    Args:
        method: name (or substring of a name) of the technique.
        neighbors: ``n_neighbors`` for the neighbour-based methods.
        dimensions: number of output components.

    Returns:
        The estimator (a Pipeline for "trees"), or ``None`` after printing
        a message when nothing matches.
    """
    def lle(variant):
        return manifold.LocallyLinearEmbedding(n_neighbors=neighbors,
                                               n_components=dimensions,
                                               method=variant)

    def tree_pipeline():
        trees = ensemble.RandomTreesEmbedding(n_estimators=200, max_depth=5)
        pca = decomposition.TruncatedSVD(n_components=dimensions)
        return Pipeline([('Random Tree Embedder', trees), ('PCA', pca)])

    # Ordered (name, factory) pairs; factories are lazy so that only the
    # selected estimator is ever constructed.
    factories = (
        ("tsne", lambda: TSNE(n_components=dimensions, perplexity=30,
                              verbose=1)),
        ("pca", lambda: decomposition.TruncatedSVD(n_components=dimensions)),
        ("isomap", lambda: manifold.Isomap(n_neighbors=neighbors,
                                           n_components=dimensions)),
        ("lle", lambda: lle('standard')),
        ("mlle", lambda: lle('modified')),
        ("hlle", lambda: lle('hessian')),
        ("ltsa", lambda: lle('ltsa')),
        ("mds", lambda: manifold.MDS(n_components=dimensions, n_init=1,
                                     max_iter=100)),
        ("trees", tree_pipeline),
        ("spectral", lambda: manifold.SpectralEmbedding(
            n_components=dimensions, eigen_solver="arpack")),
    )
    for name, build in factories:
        if method in name:
            return build()
    print('Please use valid method')
def analyse_mode(properties, mode):
    """Project ``properties`` to two dimensions with the technique ``mode``.

    Args:
        properties: the samples to embed.
        mode: numeric selector (anything ``float()`` accepts):
            0 PCA, 1 t-SNE, 2 Isomap, 3 random trees + truncated SVD,
            any other value spectral embedding.

    Returns:
        The 2-component embedding produced by the chosen estimator.
    """
    selector = float(mode)

    if selector == 0:
        return PCA(n_components=2).fit_transform(properties)

    if selector == 1:
        tsne = TSNE(n_components=2, random_state=0)
        # Side effect kept from the original: disable scientific notation
        # in numpy's printed output.
        np.set_printoptions(suppress=True)
        return tsne.fit_transform(properties)

    if selector == 2:
        return manifold.Isomap(n_components=2).fit_transform(properties)

    if selector == 3:
        forest = ensemble.RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5)
        leaf_codes = forest.fit_transform(properties)
        svd = decomposition.TruncatedSVD(n_components=2)
        return svd.fit_transform(leaf_codes)

    spectral = manifold.SpectralEmbedding(n_components=2,
                                          random_state=0,
                                          eigen_solver="arpack")
    return spectral.fit_transform(properties)
def __init__(self, source):
    """Compute a 2-D random-forest embedding of ``source``.

    The data is min-max scaled, encoded with a totally-random-trees
    embedding and reduced to two components with truncated SVD. The result
    is stored in ``self.return_data``.

    Args:
        source: the samples to embed.
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    # fit() returns the scaler itself; fit_transform() is required to get
    # the scaled data that the tree embedder must be trained on.
    data_source = min_max_scaler.fit_transform(source)

    hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=None)
    data_transformed = hasher.fit_transform(data_source)

    rfe = decomposition.TruncatedSVD(n_components=2)
    self.return_data = rfe.fit_transform(data_transformed)
def random_tree():
    """Embed the data splits with totally random trees; return a TruncatedSVD.

    Returns:
        An unfitted ``decomposition.TruncatedSVD`` built with the outer
        name ``n_components``.
    """
    print("Totally Random Trees embedding is selected")
    embedder = ensemble.RandomTreesEmbedding(n_estimators=200,
                                             random_state=0,
                                             max_depth=5)
    # NOTE(review): X_s, X_t, X_s_val and X_test are assigned here without a
    # global/nonlocal declaration, which makes them local names; reading them
    # on the right-hand side then raises UnboundLocalError. Confirm how this
    # function is meant to reach (and hand back) the data.
    # NOTE(review): fit_transform refits the embedder on every split, so the
    # four outputs do not share one fitted forest - confirm this is intended
    # rather than fit once + transform.
    X_s = embedder.fit_transform(X_s)
    X_t = embedder.fit_transform(X_t)
    X_s_val = embedder.fit_transform(X_s_val)
    X_test = embedder.fit_transform(X_test)
    # The name is rebound: the object returned is the SVD, not the forest.
    embedder = decomposition.TruncatedSVD(n_components=n_components)
    return embedder
def RandomForestEmbedding(self, source):
    """Compute a 2-D random-forest embedding of ``source``.

    The data is min-max scaled, encoded with a totally-random-trees
    embedding and reduced to two components with truncated SVD.

    Args:
        source: the samples to embed.

    Returns:
        dict with ``'data'`` (the 2-component embedding) and ``'params'``
        (always ``0``).
    """
    min_max_scaler = preprocessing.MinMaxScaler()
    # fit() returns the scaler itself; fit_transform() is required to get
    # the scaled data that the tree embedder must be trained on.
    data_source = min_max_scaler.fit_transform(source)

    hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=None)
    data_transformed = hasher.fit_transform(data_source)

    rfe = decomposition.TruncatedSVD(n_components=2)
    result = {}
    result['data'] = rfe.fit_transform(data_transformed)
    result['params'] = 0
    return result
def random_trees_embedding():
    """Plot a 2-D totally-random-trees embedding of the outer-scope ``X``.

    ``X`` is encoded with a RandomTreesEmbedding, reduced to two components
    with truncated SVD and handed to ``plot_embedding`` together with a
    title that includes the elapsed time.
    """
    print("Computing Totally Random Trees embedding")
    forest = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=5)
    started = time()
    leaf_codes = forest.fit_transform(X)
    svd = decomposition.TruncatedSVD(n_components=2)
    projection = svd.fit_transform(leaf_codes)
    # Timing covers the forest fit and the SVD, not estimator construction.
    elapsed = time() - started
    title = "Random forest embedding of the digits (time %.2fs)" % elapsed
    plot_embedding(projection, title)
def rtree(X, dim=2, n_estimators=200, max_depth=5, **kargs):
    """Random Trees embedding of the dataset.

    Args:
        X: the samples to embed.
        dim: number of output components.
        n_estimators: number of trees in the embedding.
        max_depth: maximum depth of each tree.
        **kargs: accepted for call compatibility and ignored.

    Returns:
        ``(pipeline, X_reduced, description)`` on success. If fitting fails
        the traceback is printed and ``None`` is returned.
    """
    print("Computing Totally Random Trees embedding")
    from sklearn.pipeline import Pipeline
    # The tree embedding is sparse by default, and PCA rejects sparse input
    # on scikit-learn releases before 1.4, which made this pipeline fail.
    # A dense one-hot encoding keeps PCA usable on every version.
    hasher = ensemble.RandomTreesEmbedding(n_estimators=n_estimators,
                                           random_state=0,
                                           max_depth=max_depth,
                                           sparse_output=False)
    tr = Pipeline([('hasher', hasher),
                   ('pca', decomposition.PCA(n_components=dim))])
    try:
        X_reduced = tr.fit_transform(X)
    except Exception:
        # Best-effort by design: report the failure and signal it with None.
        traceback.print_exc()
        return None
    return tr, X_reduced, "Random forest embedding of the features"
def __init__(self, n_clusters=100, pca_n_components=10, kmpca_n_components=7,
             kernel_n_components=30):
    """Create the location transformers and load the location hierarchy.

    Args:
        n_clusters: number of clusters for the mini-batch k-means model.
        pca_n_components: components kept by ``self.pca``.
        kmpca_n_components: components kept by ``self.kmpca``.
        kernel_n_components: components produced by the RBF sampler.

    Side effects:
        Reads ``LOCATION_TREE_FILE`` and fills ``self.location_dict``, which
        maps each lower-cased location name to the reversed ``~``-separated
        path starting at that name (first occurrence wins).
    """
    self.counter = text.CountVectorizer(stop_words='english',
                                        ngram_range=(1, 1),
                                        min_df=2,
                                        max_df=0.8,
                                        binary=True,
                                        lowercase=True)
    self.km = cluster.MiniBatchKMeans(n_clusters=n_clusters,
                                      n_init=10,
                                      batch_size=10000,
                                      verbose=1)
    self.pca = decomposition.RandomizedPCA(n_components=pca_n_components)
    self.kmpca = decomposition.RandomizedPCA(
        n_components=kmpca_n_components)
    self.rbf = kernel_approximation.RBFSampler(
        n_components=kernel_n_components)
    self.tree_hasher = ensemble.RandomTreesEmbedding(n_estimators=30,
                                                     max_depth=5,
                                                     n_jobs=4)
    self.X_names = [
        'Loc_CounterX', 'Loc_ClusterdX', 'Loc_KmX', 'Loc_PCAX',
        'Loc_PCAClusterdX', 'Loc_RbfX', 'Loc_TreeX'
    ]
    self.linear_feature_selector = None

    # Build a dictionary from the location tree - faster for lookups.
    self.location_dict = {}
    # The context manager closes the file; it was previously left open.
    with open(LOCATION_TREE_FILE) as tree_file:
        for row in csv.reader(tree_file):
            if not row:
                # csv.reader yields [] for blank lines; nothing to index.
                continue
            locs = row[0].lower().split('~')[::-1]
            for i, loc in enumerate(locs):
                if loc not in self.location_dict:
                    self.location_dict[loc] = locs[i:]
def plot_scatter_2d(ds_merged, method='mds', fig_number=1):
    """Scatter-plot a 2-D embedding of ``ds_merged.samples`` by target.

    methods: 'mds', 'pca', 'iso', 'forest', 'embedding'

    Args:
        ds_merged: object exposing ``samples`` (the data to embed) and
            ``targets`` (one label per sample).
        method: embedding technique; any unrecognised value falls back to
            multidimensional scaling.
        fig_number: accepted for call compatibility; not used.

    Returns:
        None. A new matplotlib figure is created with, per label, one large
        marker at the label's mean position and every second sample.
    """
    from sklearn import decomposition, manifold, ensemble

    data = ds_merged.samples

    if method == 'pca':
        clf = decomposition.RandomizedPCA(n_components=2)
        stringa = 'Principal Component Analysis'
    elif method == 'iso':
        clf = manifold.Isomap(30, n_components=2)
        stringa = 'Iso surfaces '
    elif method == 'forest':
        # Encode with random trees first, then reduce the encoding with PCA.
        hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5)
        data = hasher.fit_transform(data)
        clf = decomposition.RandomizedPCA(n_components=2)
        stringa = 'Random Forests'
    elif method == 'embedding':
        clf = manifold.SpectralEmbedding(n_components=2,
                                         random_state=0,
                                         eigen_solver="arpack")
        stringa = 'Spectral Embedding'
    else:
        clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
        stringa = 'Multidimensional scaling'

    # Single-argument call form prints identically on Python 2 and 3.
    print(stringa + ' is performing...')

    pos = clf.fit_transform(data)

    colors = cycle('bgrymkybgrcmybgrcmybgrcmy')

    f = plt.figure()
    a = f.add_subplot(111)
    a.set_title(stringa)
    for label in np.unique(ds_merged.targets):
        m = ds_merged.targets == label
        data_m = pos[m]
        # next() works on Python 2.6+ and 3, unlike the .next() method.
        c = next(colors)
        a.scatter(data_m.T[0].mean(),
                  data_m.T[1].mean(),
                  label=label,
                  color=c,
                  s=120)
        a.scatter(data_m.T[0][::2], data_m.T[1][::2], color=c)
        # Disabled: covariance ellipse per label.
        # cov_ = np.cov(data_m.T)
        # v, w = np.linalg.eigh(cov_)
        # u = w[0] / np.linalg.norm(w[0])
        # angle = np.arctan2(u[1], u[0])
        # angle = 180 * angle / np.pi  # convert to degrees
        # v *= 0.5
        # ell = mpl.patches.Ellipse(np.mean(data_m, axis=0), v[0], v[1],
        #                           180 + angle, color=c)
        # ell.set_clip_box(a.bbox)
        # ell.set_alpha(0.2)
        # a.add_artist(ell)
    a.legend()
    return
if stru2vec[triple[1]] + cnn2vec[triple[1]] not in entity_vec: entity_vec.append(stru2vec[triple[1]]) entity_type.append(5) # x_train, y_train, x_test, y_test,x_test_triple = eTour_Experiment.orgnanizeDataFormat(dkrl_train,trainZ_N,dkrl_test,ClassiferTestTriples,cnn2vec,stru2vec,1) ###rp = random_projection.SparseRandomProjection(n_components=2, random_state=42) ###x_projected = rp.fit_transform(x_train) ###x_projected = decomposition.TruncatedSVD(n_components=2).fit_transform(x_train) ###x_projected = discriminant_analysis.LinearDiscriminantAnalysis().fit_transform(x_train, y_train) ###x_projected = manifold.Isomap(n_neighbors=5, n_components=2).fit_transform(x_train) # x_projected = manifold.LocallyLinearEmbedding(n_neighbors=30, n_components=2, # method='hessian').fit_transform(x_train) ##x_projected = manifold.TSNE(n_components=2, init='pca', random_state=0).fit_transform(x_train) hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, max_depth=12) t0 = time() X_transformed = hasher.fit_transform(entity_vec) pca = decomposition.TruncatedSVD(n_components=2) x_projected = pca.fit_transform(X_transformed) font = FontProperties(fname=r"C:\Windows\Fonts\simhei.ttf", size=14) print(x_projected) xValue_1 = [x[0] for x, y in zip(x_projected, entity_type) if y == 1] yValue_1 = [x[1] for x, y in zip(x_projected, entity_type) if y == 1] xValue_0 = [x[0] for x, y in zip(x_projected, entity_type) if y == 0] yValue_0 = [x[1] for x, y in zip(x_projected, entity_type) if y == 0] xValue_3 = [x[0] for x, y in zip(x_projected, entity_type) if y == 3]
n_components=level, method='modified') X_mlle = clf.fit_transform(X) plot_embedding(X_mlle, "LLE modifiée") clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=level, method='hessian') X_hlle = clf.fit_transform(X) plot_embedding(X_hlle, "LLE Hessian") clf = manifold.MDS(n_components=level, n_init=20, max_iter=100) X_mds = clf.fit_transform(X) plot_embedding(X_mds, "MDS") hasher = ensemble.RandomTreesEmbedding(n_estimators=100) X_transformed = hasher.fit_transform(X) pca = decomposition.TruncatedSVD(n_components=level) X_reduced = pca.fit_transform(X_transformed) plot_embedding(X_reduced, "Random forest") embedder = manifold.SpectralEmbedding(n_components=level) X_se = embedder.fit_transform(X) plot_embedding(X_se, "Spectral embedding") plotly_embedding(X_se, "Spectral embedding") tsne = manifold.TSNE(n_components=level, init='pca', random_state=0) X_tsne = tsne.fit_transform(X) plot_embedding(X_tsne, "t-SNE") print("Réduction dimensionnelle et clustering :", time.clock() - time4)
def try_all_dim_reduction(X, y, y_label):
    """Run a series of 2-component reductions on ``X`` and plot each one.

    In order: sparse random projection, PCA, Isomap, standard LLE, modified
    LLE, MDS, random trees + truncated SVD, spectral embedding and t-SNE.
    Every result is passed to ``plot_3D_labeled_datapoints`` with ``y`` and
    ``y_label``.

    Args:
        X: the samples to embed.
        y: per-sample values forwarded to the plotting function.
        y_label: label forwarded to the plotting function.
    """
    n_neighbors = 30

    def run(announcement, build, report=None):
        # Announce, build the estimator lazily, embed X, optionally print a
        # diagnostic taken from the fitted estimator, then plot.
        print(announcement)
        estimator = build()
        embedded = estimator.fit_transform(X)
        if report is not None:
            print(report(estimator))
        plot_3D_labeled_datapoints(embedded, y, y_label)

    def lle(variant):
        return lambda: manifold.LocallyLinearEmbedding(n_neighbors,
                                                       n_components=2,
                                                       method=variant)

    def reconstruction_error(fitted):
        return "Done. Reconstruction error: %g" % fitted.reconstruction_error_

    def stress(fitted):
        return "Done. Stress: %f" % fitted.stress_

    # Random 2D projection using a random unitary matrix
    run("Computing random projection",
        lambda: random_projection.SparseRandomProjection(n_components=2,
                                                         random_state=42))
    # Projection on to the first 2 principal components
    run("Computing PCA projection",
        lambda: decomposition.PCA(n_components=2))
    run("Computing Isomap embedding",
        lambda: manifold.Isomap(n_neighbors, n_components=2))
    run("Computing LLE embedding", lle('standard'), reconstruction_error)
    run("Computing modified LLE embedding", lle('modified'),
        reconstruction_error)
    run("Computing MDS embedding",
        lambda: manifold.MDS(n_components=2, n_init=1, max_iter=100),
        stress)

    # Random trees need two stages: encode, then reduce the encoding.
    print("Computing Totally Random Trees embedding")
    forest = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=5)
    leaf_codes = forest.fit_transform(X)
    svd = decomposition.TruncatedSVD(n_components=2)
    plot_3D_labeled_datapoints(svd.fit_transform(leaf_codes), y, y_label)

    run("Computing Spectral embedding",
        lambda: manifold.SpectralEmbedding(n_components=2,
                                           random_state=0,
                                           eigen_solver="arpack"))
    run("Computing t-SNE embedding",
        lambda: manifold.TSNE(n_components=2, init='pca', random_state=0))
def plot_other_manifold(X, y, n_neighbors, n_estimators=200, max_depth=5,
                        random_state=0):
    """Plot modified-LLE, Hessian-LLE, LTSA, random-trees and spectral embeddings.

    Each technique reduces ``X`` to two components and the result is passed
    to ``plot_embedding`` with ``y`` and a title containing the elapsed time.

    Args:
        X: the samples to embed.
        y: per-sample values forwarded to ``plot_embedding``.
        n_neighbors: neighbourhood size for the LLE variants.
        n_estimators: number of trees in the random-trees embedding. The
            previous default of ``00`` (zero) is rejected by
            RandomTreesEmbedding, so the default is now 200.
        max_depth: maximum depth of each tree.
        random_state: seed for the random-trees and spectral embeddings.
    """
    def plot_lle(variant, announcement, title):
        # Shared driver for the three LocallyLinearEmbedding variants.
        print(announcement)
        clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                              n_components=2,
                                              method=variant)
        t0 = time()
        embedded = clf.fit_transform(X)
        print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
        plot_embedding(embedded, y, title % (time() - t0))

    # Modified Locally linear embedding of the digits dataset
    plot_lle('modified', "Computing modified LLE embedding",
             "Modified Locally Linear Embedding of the digits (time %.2fs)")
    # HLLE embedding of the digits dataset
    plot_lle('hessian', "Computing Hessian LLE embedding",
             "Hessian Locally Linear Embedding of the digits (time %.2fs)")
    # LTSA embedding of the digits dataset
    plot_lle('ltsa', "Computing LTSA embedding",
             "Local Tangent Space Alignment of the digits (time %.2fs)")

    # Random Trees embedding of the digits dataset
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=n_estimators,
                                           random_state=random_state,
                                           max_depth=max_depth)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    plot_embedding(
        X_reduced, y,
        "Random forest embedding of the digits (time %.2fs)" % (time() - t0))

    # Spectral embedding of the digits dataset
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=random_state,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)
    plot_embedding(
        X_se, y,
        "Spectral embedding of the digits (time %.2fs)" % (time() - t0))
def get_embedding(X, y, type_embeding):
    """Return a 2-component embedding of ``X`` for the named technique.

    Args:
        X: the samples to embed.
        y: labels; only used by the "LDA" option.
        type_embeding: one of "Random", "PCA", "LDA", "Isomap", "LLE",
            "mLLE", "hLLE", "ltsa", "MDS", "RF", "Spectral", "T-SNE".

    Returns:
        The embedding, or ``None`` (after printing the valid options) when
        ``type_embeding`` is not recognised.
    """
    n_neighbors = 30

    def sparse_random():
        projector = random_projection.SparseRandomProjection(n_components=2,
                                                             random_state=42)
        return projector.fit_transform(X)

    def truncated_svd():
        return decomposition.TruncatedSVD(n_components=2).fit_transform(X)

    def linear_discriminant():
        shifted = X.copy()
        shifted.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
        model = discriminant_analysis.LinearDiscriminantAnalysis(
            n_components=2)
        return model.fit_transform(shifted, y)

    def isomap():
        return manifold.Isomap(n_neighbors, n_components=2).fit_transform(X)

    def lle(variant):
        def project():
            model = manifold.LocallyLinearEmbedding(n_neighbors,
                                                    n_components=2,
                                                    method=variant)
            return model.fit_transform(X)
        return project

    def mds():
        model = manifold.MDS(n_components=2, n_init=1, max_iter=100)
        return model.fit_transform(X)

    def random_forest():
        forest = ensemble.RandomTreesEmbedding(n_estimators=200,
                                               random_state=0,
                                               max_depth=5)
        leaf_codes = forest.fit_transform(X)
        svd = decomposition.TruncatedSVD(n_components=2)
        return svd.fit_transform(leaf_codes)

    def spectral():
        model = manifold.SpectralEmbedding(n_components=2,
                                           random_state=0,
                                           eigen_solver="arpack")
        return model.fit_transform(X)

    def tsne():
        model = manifold.TSNE(n_components=2, init='pca', random_state=0)
        return model.fit_transform(X)

    # Compared with == in this order; the first match wins.
    projectors = (
        ("Random", sparse_random),
        ("PCA", truncated_svd),
        ("LDA", linear_discriminant),
        ("Isomap", isomap),
        ("LLE", lle('standard')),
        ("mLLE", lle('modified')),
        ("hLLE", lle('hessian')),
        ("ltsa", lle('ltsa')),
        ("MDS", mds),
        ("RF", random_forest),
        ("Spectral", spectral),
        ("T-SNE", tsne),
    )
    for option, project in projectors:
        if type_embeding == option:
            return project()

    print("""Valid options are:
        Random => Random Projections
        PCA => Principal Component Analysis
        LDA => Linear Discriminant Analysis
        Isomap => Isomap
        LLE => Locally Linear Embedding
        mLLE => Modified Locally Linear Embedding
        hLLE => Hessian Locally Linear Embedding
        ltsa => Locally Linear Embedding (ltsa)
        MDS => Multidimensional Scaling
        RF => Random Forest Embeding
        Spectral => Spectral Embeding
        T-SNE => T-SNE
        """)
    return None
# ---------------------------------------------------------------------- # MDS embedding of the digits dataset print("Computing MDS embedding") clf = manifold.MDS(n_components=2, n_init=1, max_iter=100) t0 = time() X_mds = clf.fit_transform(X) print("Done. Stress: %f" % clf.stress_) plot_embedding(X_mds, "MDS embedding of the digits (time %.2fs)" % (time() - t0)) # ---------------------------------------------------------------------- # Random Trees embedding of the digits dataset print("Computing Totally Random Trees embedding") RTE = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, max_depth=5).fit_transform(X) X_reduced = decomposition.TruncatedSVD(n_components=2).fit_transform(RTE) plot_embedding(X_reduced, "Random forest embedding of the digits") # ---------------------------------------------------------------------- # Spectral embedding of the digits dataset print("Computing Spectral embedding") embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, eigen_solver="arpack") t0 = time() X_se = embedder.fit_transform(X) plot_embedding(X_se,
def dimension_reduce(X_train,y_train):
    """Run several 2-D manifold embeddings on ``X_train`` and plot each one.

    Active steps, in order: Isomap, standard LLE, modified LLE, LTSA, MDS,
    random trees + truncated SVD, spectral embedding and t-SNE. Each result
    is passed to ``plot_embedding`` with ``y_train`` and a title containing
    the elapsed time; ``plt.show()`` is called once at the end.

    Args:
        X_train: the samples to embed.
        y_train: per-sample values forwarded to ``plot_embedding``.
    """
    # Disabled steps kept for reference (random projection, PCA, LDA):
    # rp = random_projection.SparseRandomProjection(n_components=2, random_state=42)
    # X_projected = rp.fit_transform(X_train)
    # plot_embedding(X_projected, y_train, "Random Projection of the digits")
    # # Projection on to the first 2 principal components
    # print("Computing PCA projection")
    # t0 = time()
    # X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X_train)
    # plot_embedding(X_pca, y_train "Principal Components projection of the digits (time %.2fs)" % (time() - t0))
    #   (note: the line above is missing a comma after y_train)
    # print("Computing Linear Discriminant Analysis projection")
    # X2 = X_train.copy()
    # X2.flat[::X_train.shape[1] + 1] += 0.01  # Make X invertible
    # t0 = time()
    # X_lda = discriminant_analysis.LinearDiscriminantAnalysis(n_components=2).fit_transform(X2, y_train)
    # plot_embedding(X_lda, y_train, "Linear Discriminant projection of the digits (time %.2fs)" % (time() - t0))

    # Neighbourhood size shared by Isomap and the LLE variants.
    n_neighbors = 30

    # Isomap projection of the digits dataset
    print("Computing Isomap embedding")
    t0 = time()
    X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X_train)
    print("Done.")
    plot_embedding(X_iso,y_train,
                   "Isomap projection of the digits (time %.2fs)" %(time() - t0))

    # Locally linear embedding of the digits dataset
    print("Computing LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, method='standard')
    t0 = time()
    X_lle = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_lle,y_train,
                   "Locally Linear Embedding of the digits (time %.2fs)" % (time() - t0))

    # Modified Locally linear embedding of the digits dataset
    print("Computing modified LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, method='modified')
    t0 = time()
    X_mlle = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_mlle,y_train,
                   "Modified Locally Linear Embedding of the digits (time %.2fs)" % (time() - t0))

    # The Hessian LLE step is disabled by wrapping it in a string literal.
    '''
    # HLLE embedding of the digits dataset
    print("Computing Hessian LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, method='hessian')
    t0 = time()
    X_hlle = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_hlle,y_train, "Hessian Locally Linear Embedding of the digits (time %.2fs)" % (time() - t0))
    '''

    # LTSA embedding of the digits dataset
    print("Computing LTSA embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors, n_components=2, method='ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X_train)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(X_ltsa,y_train,
                   "Local Tangent Space Alignment of the digits (time %.2fs)" % (time() - t0))

    # MDS embedding of the digits dataset
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
    t0 = time()
    X_mds = clf.fit_transform(X_train)
    print("Done. Stress: %f" % clf.stress_)
    plot_embedding(X_mds,y_train,
                   "MDS embedding of the digits (time %.2fs)" % (time() - t0))

    # Random Trees embedding of the digits dataset: encode the samples with
    # the forest, then reduce the encoding to two components.
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=200, random_state=0, max_depth=5)
    t0 = time()
    X_transformed = hasher.fit_transform(X_train)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    plot_embedding(X_reduced,y_train,
                   "Random forest embedding of the digits (time %.2fs)" % (time() - t0))

    # Spectral embedding of the digits dataset
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2, random_state=0, eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X_train)
    plot_embedding(X_se,y_train,
                   "Spectral embedding of the digits (time %.2fs)" % (time() - t0))

    # t-SNE embedding of the digits dataset
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    t0 = time()
    X_tsne = tsne.fit_transform(X_train)
    plot_embedding(X_tsne,y_train,
                   "t-SNE embedding of the digits (time %.2fs)" % (time() - t0))

    plt.show()
def analyze(X=None, y=None, plot_fun=scatter_plot, data_name="data"):
    """Run a battery of 2-D embeddings on ``X`` and plot each result.

    Steps, in order: sparse random projection, truncated SVD ("PCA"), LDA,
    Isomap, standard / modified / Hessian LLE, LTSA, MDS, random trees +
    truncated SVD, spectral embedding and t-SNE. ``plt.show()`` is called
    once at the end.

    Args:
        X: the samples to embed; when ``None`` the six-class digits dataset
            is loaded and its targets replace ``y``.
        y: per-sample labels, used by the LDA step and passed to ``plot_fun``.
        plot_fun: callable invoked as ``plot_fun(X_scaled, y)`` per embedding.
        data_name: dataset name interpolated into the plot titles.
    """
    if X is None:
        digits = datasets.load_digits(n_class=6)
        X = digits.data
        y = digits.target
    # NOTE(review): y stays None when X is supplied without labels, but the
    # LDA step below passes y to fit_transform - confirm callers always
    # provide y together with X.
    # The unpacked values are not used below.
    n_samples, n_features = X.shape
    n_neighbors = 30

    def plot_embedding(X, title=None):
        # Rescale every column to [0, 1] before plotting.
        x_min, x_max = np.min(X, 0), np.max(X, 0)
        X = (X - x_min) / (x_max - x_min)
        plot_fun(X, y)
        if title is not None:
            plt.title(title)

    # #----------------------------------------------------------------------
    # # Scale and visualize the embedding vectors
    # def plot_embedding(X, title=None):
    #     x_min, x_max = np.min(X, 0), np.max(X, 0)
    #     X = (X - x_min) / (x_max - x_min)
    #
    #     plt.figure()
    #     ax = plt.subplot(111)
    #     for i in range(X.shape[0]):
    #         plt.text(X[i, 0], X[i, 1], str(digits.target[i]),
    #                  color=plt.cm.Set1(y[i] / 10.),
    #                  fontdict={'weight': 'bold', 'size': 9})
    #
    #     if hasattr(offsetbox, 'AnnotationBbox'):
    #         # only print thumbnails with matplotlib > 1.0
    #         shown_images = np.array([[1., 1.]])  # just something big
    #         for i in range(digits.data.shape[0]):
    #             dist = np.sum((X[i] - shown_images) ** 2, 1)
    #             if np.min(dist) < 4e-3:
    #                 # don't show points that are too close
    #                 continue
    #             shown_images = np.r_[shown_images, [X[i]]]
    #             imagebox = offsetbox.AnnotationBbox(
    #                 offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),
    #                 X[i])
    #             ax.add_artist(imagebox)
    #     plt.xticks([]), plt.yticks([])
    #     if title is not None:
    #         plt.title(title)
    #
    #
    # #----------------------------------------------------------------------
    # # Plot images of the digits
    # n_img_per_row = 20
    # img = np.zeros((10 * n_img_per_row, 10 * n_img_per_row))
    # for i in range(n_img_per_row):
    #     ix = 10 * i + 1
    #     for j in range(n_img_per_row):
    #         iy = 10 * j + 1
    #         img[ix:ix + 8, iy:iy + 8] = X[i * n_img_per_row + j].reshape((8, 8))
    # plt.imshow(img, cmap=plt.cm.binary)
    # plt.xticks([])
    # plt.yticks([])
    # plt.title('A selection from the 64-dimensional digits dataset')

    #----------------------------------------------------------------------
    # Random 2D projection using a random unitary matrix
    print("Computing random projection")
    rp = random_projection.SparseRandomProjection(n_components=2,
                                                  random_state=42)
    X_projected = rp.fit_transform(X)
    plot_embedding(X_projected,
                   "Random Projection of the {}".format(data_name))

    #----------------------------------------------------------------------
    # Projection on to the first 2 principal components
    print("Computing PCA projection")
    t0 = time()
    X_pca = decomposition.TruncatedSVD(n_components=2).fit_transform(X)
    plot_embedding(
        X_pca,
        "Principal Components projection of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # Projection on to the first 2 linear discriminant components
    print("Computing LDA projection")
    X2 = X.copy()
    X2.flat[::X.shape[1] + 1] += 0.01  # Make X invertible
    t0 = time()
    X_lda = lda.LDA(n_components=2).fit_transform(X2, y)
    plot_embedding(
        X_lda,
        "Linear Discriminant projection of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # Isomap projection of the dataset
    print("Computing Isomap embedding")
    t0 = time()
    X_iso = manifold.Isomap(n_neighbors, n_components=2).fit_transform(X)
    print("Done.")
    plot_embedding(
        X_iso, "Isomap projection of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # Locally linear embedding of the dataset
    print("Computing LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                          n_components=2,
                                          method='standard')
    t0 = time()
    X_lle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_lle, "Locally Linear Embedding of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # Modified Locally linear embedding of the dataset
    print("Computing modified LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                          n_components=2,
                                          method='modified')
    t0 = time()
    X_mlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_mlle,
        "Modified Locally Linear Embedding of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # HLLE embedding of the dataset
    print("Computing Hessian LLE embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                          n_components=2,
                                          method='hessian')
    t0 = time()
    X_hlle = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_hlle,
        "Hessian Locally Linear Embedding of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # LTSA embedding of the dataset
    print("Computing LTSA embedding")
    clf = manifold.LocallyLinearEmbedding(n_neighbors,
                                          n_components=2,
                                          method='ltsa')
    t0 = time()
    X_ltsa = clf.fit_transform(X)
    print("Done. Reconstruction error: %g" % clf.reconstruction_error_)
    plot_embedding(
        X_ltsa,
        "Local Tangent Space Alignment of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # MDS embedding of the dataset
    print("Computing MDS embedding")
    clf = manifold.MDS(n_components=2, n_init=1, max_iter=100)
    t0 = time()
    X_mds = clf.fit_transform(X)
    print("Done. Stress: %f" % clf.stress_)
    plot_embedding(
        X_mds,
        "MDS embedding of the {} (time {:.2f})".format(data_name,
                                                       time() - t0))

    #----------------------------------------------------------------------
    # Random Trees embedding of the dataset: encode the samples with the
    # forest, then reduce the encoding to two components.
    print("Computing Totally Random Trees embedding")
    hasher = ensemble.RandomTreesEmbedding(n_estimators=200,
                                           random_state=0,
                                           max_depth=5)
    t0 = time()
    X_transformed = hasher.fit_transform(X)
    pca = decomposition.TruncatedSVD(n_components=2)
    X_reduced = pca.fit_transform(X_transformed)
    plot_embedding(
        X_reduced, "Random forest embedding of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # Spectral embedding of the dataset
    print("Computing Spectral embedding")
    embedder = manifold.SpectralEmbedding(n_components=2,
                                          random_state=0,
                                          eigen_solver="arpack")
    t0 = time()
    X_se = embedder.fit_transform(X)
    plot_embedding(
        X_se, "Spectral embedding of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    #----------------------------------------------------------------------
    # t-SNE embedding of the dataset
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    t0 = time()
    X_tsne = tsne.fit_transform(X)
    plot_embedding(
        X_tsne, "t-SNE embedding of the {} (time {:.2f})".format(
            data_name,
            time() - t0))

    plt.show()