test_loader = torch.utils.data.DataLoader(Subset(test_set, test_limit),
                                          batch_size=args.batch_size, shuffle=False)

# Main body: train the Deep Clustering Network
model = DCN(args)
rec_loss_list, nmi_list, ari_list = solver(args, model, train_loader, test_loader)

# Embed the training data with the trained autoencoder's latent layer
out = model.autoencoder(torch.FloatTensor(np.array(X_train)).to(args.device),
                        latent=True)

# Project both the latent codes and the raw features to 2-D with PaCMAP
reducer = pacmap.PaCMAP()
X2 = reducer.fit_transform(out.cpu().detach().numpy())
X4 = reducer.fit_transform(X_train)
c_train = [color[int(y_train.iloc[i])] for i in range(len(y_train))]

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.suptitle('Normal vs CAC Embeddings')
ax1.scatter(X4[:, 0], X4[:, 1], color=c_train)
ax2.scatter(X2[:, 0], X2[:, 1], color=c_train)
fig.savefig("normal_vs_cac.png")

# Testing: embed the test data the same way
out = model.autoencoder(torch.FloatTensor(np.array(X_test)).to(args.device),
                        latent=True)
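# A minimal sketch of how the test-set embedding could be visualized, mirroring
# the training plot above; the original script is truncated here, so the figure
# title, file name, and the assumption that y_test is a pandas Series (like
# y_train) are illustrative, not part of the original code.
X2_test = reducer.fit_transform(out.cpu().detach().numpy())
c_test = [color[int(y_test.iloc[i])] for i in range(len(y_test))]

fig, ax = plt.subplots()
fig.suptitle('CAC Embeddings (Test)')
ax.scatter(X2_test[:, 0], X2_test[:, 1], color=c_test)
fig.savefig("normal_vs_cac_test.png")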
import numpy as np
import matplotlib.pyplot as plt
import pacmap
from sklearn.model_selection import StratifiedKFold

print("Loading data")
mnist = np.load("../data/mnist_images.npy", allow_pickle=True)
mnist = mnist.reshape(mnist.shape[0], -1)
labels = np.load("../data/mnist_labels.npy", allow_pickle=True)

# Keep only the first fold of each stratified split; the last value of n (10)
# determines the train/test split that is actually used below.
n_splits = [2, 5, 10]
for n in n_splits:
    skf = StratifiedKFold(n_splits=n)
    for train_index, test_index in skf.split(mnist, labels):
        X_train, X_test = mnist[train_index], mnist[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        break

# Initialize the instance
reducer = pacmap.PaCMAP(n_components=2, n_neighbors=10, MN_ratio=0.5,
                        FP_ratio=2.0, random_state=20, save_tree=False)

# Fit the training set
embedding = reducer.fit_transform(X_train)

# Transform the test set into the same embedding space
embedding_test = reducer.transform(X_test, basis=X_train)

# Prepare the results for plotting
embedding_combined = np.concatenate((embedding, embedding_test))
y = np.concatenate((y_train, y_test))
embeddings = [embedding, embedding_test, embedding_combined]
labelset = [y_train, y_test, y]
titles = ['Training', 'Test', 'Combined']
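# A minimal sketch of rendering the three embeddings prepared above; the 1x3
# panel layout, point size, colormap, and file name are assumptions, not part
# of the original snippet.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, emb, lab, title in zip(axes, embeddings, labelset, titles):
    ax.scatter(emb[:, 0], emb[:, 1], c=lab.astype(int), cmap="Spectral", s=0.6)
    ax.set_title(title)
fig.savefig("mnist_train_test_pacmap.png")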
import pacmap
import numpy as np
import matplotlib.pyplot as plt

# loading the preprocessed coil_20 dataset
# you can replace it with any dataset in ndarray format with the shape (N, D),
# where N is the number of samples and D is the dimension of each sample
X = np.load("../data/coil_20.npy", allow_pickle=True)
X = X.reshape(X.shape[0], -1)
y = np.load("../data/coil_20_labels.npy", allow_pickle=True)

# Initialize the pacmap instance
# Setting n_neighbors to "None" leads to the automatic parameter selection
# described in the "Parameters" section of the README file.
# Notice that from v0.6.0 on, the n_dims parameter is renamed to n_components.
embedding = pacmap.PaCMAP(n_components=2, n_neighbors=None, MN_ratio=0.5, FP_ratio=2.0)

# fit the data (the index of the transformed data corresponds to the index of the original data)
X_transformed = embedding.fit_transform(X, init="pca")

# visualize the embedding
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
ax.scatter(X_transformed[:, 0], X_transformed[:, 1], cmap="Spectral", c=y, s=0.6)
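# For reproducible embeddings, the constructor also accepts a random_state
# seed (as used in the MNIST example above); a minimal sketch, with the seed
# value chosen arbitrarily:
embedding = pacmap.PaCMAP(n_components=2, n_neighbors=None,
                          MN_ratio=0.5, FP_ratio=2.0, random_state=42)
X_transformed = embedding.fit_transform(X, init="pca")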
def PaCMAP(data=None, init=None, n_dims=2, n_neighbors=10, MN_ratio=0.5,
           FP_ratio=2.0, pair_neighbors=None, pair_MN=None, pair_FP=None,
           distance="angular", lr=1.0, num_iters=450, verbose=False,
           intermediate=False, random_state=None):
    """Dimensionality Reduction Using Pairwise-Controlled Manifold Approximation and Projection.

    Inputs
    ------
    data : np.array with the data to be reduced.
    init : the initialization of the lower-dimensional embedding. One of "pca"
        or "random", or a user-provided numpy ndarray with the shape (N, 2).
        Defaults to "random".
    n_dims : the number of dimensions of the output. Defaults to 2.
    n_neighbors : the number of neighbors considered in the k-Nearest Neighbor
        graph. Defaults to 10 for datasets whose sample size is smaller than
        10000. For large datasets whose sample size (n) is larger than 10000,
        the default value is 10 + 15 * (log10(n) - 4).
    MN_ratio : the ratio of the number of mid-near pairs to the number of
        neighbors, n_MN = floor(n_neighbors * MN_ratio). Defaults to 0.5.
    FP_ratio : the ratio of the number of further pairs to the number of
        neighbors, n_FP = floor(n_neighbors * FP_ratio). Defaults to 2.
    pair_neighbors, pair_MN, pair_FP : pre-specified neighbor pairs, mid-near
        pairs, and further pairs, allowing users to pass their own graphs.
        Default to None.
    distance : distance measure, one of 'euclidean', 'manhattan', 'angular',
        'hamming'. The pacmap library defaults to 'euclidean'; this wrapper
        defaults to 'angular'.
    lr : learning rate of the optimizer. Defaults to 1.0.
    num_iters : number of iterations. Defaults to 450, which is enough for
        most datasets to converge.
    verbose : controls verbosity (default False).
    intermediate : whether pacmap should also output the intermediate stages
        of the optimization process of the lower-dimensional embedding. If
        True, then the output will be a numpy array of the size
        (n, n_dims, 13), where each slice is a "screenshot" of the output
        embedding at a particular number of steps, from
        [0, 10, 30, 60, 100, 120, 140, 170, 200, 250, 300, 350, 450].
    random_state : random seed or RandomState object (default None).
    """
    try:
        import pacmap
    except ImportError:
        print('pacmap is needed for this embedding. Install it with `pip install pacmap`')
        return None

    # pacmap renamed n_dims to n_components in v0.6.0; this forwards to the
    # current API. For pacmap < 0.6.0, pass n_dims=n_dims instead.
    pacmap_emb = pacmap.PaCMAP(
        n_components=n_dims, n_neighbors=n_neighbors, MN_ratio=MN_ratio,
        FP_ratio=FP_ratio, pair_neighbors=pair_neighbors, pair_MN=pair_MN,
        pair_FP=pair_FP, distance=distance, lr=lr, num_iters=num_iters,
        verbose=verbose, intermediate=intermediate, random_state=random_state,
    ).fit_transform(X=data, init=init)
    return pacmap_emb
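# Example call of the wrapper above on synthetic data; the Gaussian data and
# its shape are illustrative assumptions, not from the original code.
if __name__ == "__main__":
    import numpy as np
    rng = np.random.default_rng(0)
    X_demo = rng.normal(size=(1000, 50))
    emb = PaCMAP(data=X_demo, init="random", n_dims=2, n_neighbors=10)
    if emb is not None:
        print(emb.shape)  # expected: (1000, 2)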