def _fit_transform(self, X):
    """Fit the neighbors model on X and compute the embedding.

    Results are stored on the instance (``training_data_``,
    ``kernel_pca_``, ``dist_matrix_`` and ``embedding_``); nothing is
    returned.

    Parameters
    ----------
    X : array-like
        Training data; validated with ``check_array`` before use.
    """
    X = check_array(X)
    self.nbrs_.fit(X)
    self.training_data_ = self.nbrs_._fit_X

    # The kernel is handed over already computed, hence "precomputed".
    kpca_options = dict(n_components=self.n_components,
                        kernel="precomputed",
                        eigen_solver=self.eigen_solver,
                        tol=self.tol,
                        max_iter=self.max_iter)
    self.kernel_pca_ = KernelPCA(**kpca_options)

    # Shortest paths through the (undirected) k-neighbors distance graph.
    neighbors_graph = kneighbors_graph(self.nbrs_, self.n_neighbors,
                                       mode='distance')
    self.dist_matrix_ = graph_shortest_path(neighbors_graph,
                                            method=self.path_method,
                                            directed=False)

    # Kernel fed to KernelPCA: -0.5 * squared shortest-path distances.
    kernel = -0.5 * self.dist_matrix_ ** 2
    self.embedding_ = self.kernel_pca_.fit_transform(kernel)
def __init__(self, gamma_o=5, gamma_c=4, gamma_b=2, gamma_p=3,
             grid_o_dim=25, grid_c_dims=(5, 5, 5), grid_p_dims=(5, 5),
             epsilon_g=0.8, epsilon_s=0.2):
    """Fit the feature projections and the joint kernel-PCA models.

    Parameters
    ----------
    gamma_o, gamma_c, gamma_b, gamma_p : number
        Each kernel (orientation, color, binary pattern, position) is
        constructed with the argument ``1 / sqrt(2 * gamma)``.
    grid_o_dim : int
        Number of basis angles, evenly spaced over [-pi, pi).
    grid_c_dims : sequence of 3 ints
        Basis points per color channel, evenly spaced over [0, 1].
    grid_p_dims : sequence of 2 ints
        Basis points per position axis, evenly spaced over [0, 1].
    epsilon_g, epsilon_s : float
        Stored on the instance as ``epsilon_g`` / ``epsilon_s``.
    """
    def unit_grid(dims):
        # A complex step asks mgrid for exactly that many points with an
        # inclusive stop.  A float step (0:1 + step:step) can produce an
        # extra point through rounding and divides by zero when n == 1.
        slices = tuple(slice(0, 1, complex(0, n)) for n in dims)
        return numpy.mgrid[slices].reshape(len(dims), -1).T

    def kron_all_pairs(A, B):
        # One row per (a, b) pair, with the rows of B varying fastest.
        return numpy.array([numpy.kron(a, b) for a in A for b in B])

    print("basis for orientation")
    k_o = GaussianKernelForAngle(1 / numpy.sqrt(2 * gamma_o))
    self.projector_o = FeatureVectorProjection(k_o)
    # Drop the last sample: +pi is the same angle as -pi.
    X = numpy.linspace(-numpy.pi, numpy.pi, grid_o_dim + 1)[:-1]
    self.projector_o.fit(X[:, numpy.newaxis])

    print("basis for color")
    k_c = GaussianKernel(1 / numpy.sqrt(2 * gamma_c))
    self.projector_c = FeatureVectorProjection(k_c)
    self.projector_c.fit(unit_grid(grid_c_dims[:3]))

    print("basis for binary patterns")
    k_b = GaussianKernel(1 / numpy.sqrt(2 * gamma_b))
    self.projector_b = FeatureVectorProjection(k_b)
    # Every 8-component vector with entries in {0, 1}.
    X = numpy.mgrid[(slice(0, 2, 1),) * 8].reshape(8, -1).T
    self.projector_b.fit(X)

    print("basis for positions")
    k_p = GaussianKernel(1 / numpy.sqrt(2 * gamma_p))
    self.projector_p = FeatureVectorProjection(k_p)
    self.projector_p.fit(unit_grid(grid_p_dims[:2]))

    self.epsilon_g = epsilon_g
    self.epsilon_s = epsilon_s

    kpca_kernel = GaussianKernel(0.4)
    X_p = self.projector_p.predict(self.projector_p.basis)

    # Orientation x position.  The joint basis must contain every pair of
    # basis vectors; pairing the two lists element-wise with zip() filled
    # only the first rows and left the rest of the matrix at zero, unlike
    # the color x position model below.
    X_o = self.projector_o.predict(self.projector_o.basis)
    self.kpca_op = KernelPCA(kpca_kernel)
    self.kpca_op.fit(kron_all_pairs(X_o, X_p))

    # Color x position.
    X_c = self.projector_c.predict(self.projector_c.basis)
    self.kpca_cp = KernelPCA(kpca_kernel)
    self.kpca_cp.fit(kron_all_pairs(X_c, X_p))
class KernelDescriptorsExtractor:
    """Patch-based image features built from gradient and color match kernels.

    The constructor fits four ``FeatureVectorProjection`` objects
    (orientation, color, binary pattern, position) and two ``KernelPCA``
    models on the joint orientation x position and color x position bases.
    ``predict`` then turns a batch of images into one flat feature vector
    per image.
    """

    def __init__(self, gamma_o=5, gamma_c=4, gamma_b=2, gamma_p=3,
                 grid_o_dim=25, grid_c_dims=(5, 5, 5), grid_p_dims=(5, 5),
                 epsilon_g=0.8, epsilon_s=0.2):
        """Fit the feature projections and the joint kernel-PCA models.

        Parameters
        ----------
        gamma_o, gamma_c, gamma_b, gamma_p : number
            Each kernel (orientation, color, binary pattern, position) is
            constructed with the argument ``1 / sqrt(2 * gamma)``.
        grid_o_dim : int
            Number of basis angles, evenly spaced over [-pi, pi).
        grid_c_dims : sequence of 3 ints
            Basis points per color channel, evenly spaced over [0, 1].
        grid_p_dims : sequence of 2 ints
            Basis points per position axis, evenly spaced over [0, 1].
        epsilon_g : float
            Added under the square root when normalising a gradient patch.
        epsilon_s : float
            Stored as ``self.epsilon_s``; not read by any method of this
            class.
        """
        print("basis for orientation")
        k_o = GaussianKernelForAngle(1 / numpy.sqrt(2 * gamma_o))
        self.projector_o = FeatureVectorProjection(k_o)
        # Drop the last sample: +pi is the same angle as -pi.
        X = numpy.linspace(-numpy.pi, numpy.pi, grid_o_dim + 1)[:-1]
        self.projector_o.fit(X[:, numpy.newaxis])

        print("basis for color")
        k_c = GaussianKernel(1 / numpy.sqrt(2 * gamma_c))
        self.projector_c = FeatureVectorProjection(k_c)
        self.projector_c.fit(self._unit_grid(grid_c_dims[:3]))

        print("basis for binary patterns")
        k_b = GaussianKernel(1 / numpy.sqrt(2 * gamma_b))
        self.projector_b = FeatureVectorProjection(k_b)
        # Every 8-component vector with entries in {0, 1}.
        X = numpy.mgrid[(slice(0, 2, 1),) * 8].reshape(8, -1).T
        self.projector_b.fit(X)

        print("basis for positions")
        k_p = GaussianKernel(1 / numpy.sqrt(2 * gamma_p))
        self.projector_p = FeatureVectorProjection(k_p)
        self.projector_p.fit(self._unit_grid(grid_p_dims[:2]))

        self.epsilon_g = epsilon_g
        self.epsilon_s = epsilon_s

        kpca_kernel = GaussianKernel(0.4)
        X_p = self.projector_p.predict(self.projector_p.basis)

        # Orientation x position.  The joint basis must contain every pair
        # of basis vectors; pairing the lists element-wise with zip() filled
        # only the first rows and left the rest of the matrix at zero,
        # unlike the color x position model below.
        X_o = self.projector_o.predict(self.projector_o.basis)
        self.kpca_op = KernelPCA(kpca_kernel)
        self.kpca_op.fit(self._kron_all_pairs(X_o, X_p))

        # Color x position.
        X_c = self.projector_c.predict(self.projector_c.basis)
        self.kpca_cp = KernelPCA(kpca_kernel)
        self.kpca_cp.fit(self._kron_all_pairs(X_c, X_p))

    @staticmethod
    def _unit_grid(dims):
        """Return all points of a regular grid over [0, 1]^len(dims).

        ``dims[k]`` is the number of points along axis k (end points
        included).  The result has one row per grid point, last axis
        varying fastest.
        """
        # A complex step asks mgrid for exactly that many points with an
        # inclusive stop.  A float step (0:1 + step:step) can produce an
        # extra point through rounding and divides by zero when n == 1.
        slices = tuple(slice(0, 1, complex(0, n)) for n in dims)
        return numpy.mgrid[slices].reshape(len(dims), -1).T

    @staticmethod
    def _kron_all_pairs(A, B):
        """Stack ``kron(a, b)`` for every row a of A and every row b of B.

        Rows of B vary fastest.
        """
        return numpy.array([numpy.kron(a, b) for a in A for b in B])

    @staticmethod
    def _patch_origins(image_shape, patch_size, subsample):
        """List the top-left corners (sx, sy) of all fully contained patches."""
        return [(sx, sy)
                for sx in range(0, image_shape[0] - patch_size[0] + 1,
                                subsample[0])
                for sy in range(0, image_shape[1] - patch_size[1] + 1,
                                subsample[1])]

    @staticmethod
    def _gradient_magnitude_and_angle(I):
        """Per-pixel gradient of the channel with the strongest response.

        Parameters
        ----------
        I : array, shape (nX, nY, nchannels)

        Returns
        -------
        magnitude, angle : arrays, shape (nX, nY)
            For each pixel the channel with the largest ``dx**2 + dy**2``
            is selected (first one on ties); ``magnitude`` is the square
            root of that value and ``angle`` is ``arctan2(dx, dy)``.
        """
        # Work in float so unsigned integer images cannot wrap around.
        I = numpy.asarray(I, dtype=float)
        dx = numpy.zeros_like(I)
        dy = numpy.zeros_like(I)
        # Neighbour differences; a neighbour outside the image counts as 0.
        dx[:-1, :, :] += I[1:, :, :]
        dx[1:, :, :] -= I[:-1, :, :]
        dy[:, :-1, :] += I[:, 1:, :]
        dy[:, 1:, :] -= I[:, :-1, :]

        squared = dx ** 2 + dy ** 2
        best = numpy.argmax(squared, axis=2)
        rows, cols = numpy.indices(best.shape)
        magnitude = numpy.sqrt(squared[rows, cols, best])
        # Argument order (dx, dy) is kept from the original implementation.
        angle = numpy.arctan2(dx[rows, cols, best], dy[rows, cols, best])
        return magnitude, angle

    def _calc_gradient_match_kernel_for_image(self, I, patch_size, subsample):
        """Gradient match-kernel features of one image.

        Parameters
        ----------
        I : array, shape (nX, nY, nchannels)
        patch_size : (int, int)
            Patch height and width in pixels.
        subsample : (int, int)
            Step between consecutive patch origins along each axis.

        Returns
        -------
        1-D array: the flattened ``kpca_op`` projection (200 components
        requested) of one descriptor per patch.  The number of patches
        follows from the image size; it is 9 for a 32x32 image with the
        default patch and step sizes.
        """
        nX, nY, nchannels = I.shape
        Ig_magnitude, Ig_angle = self._gradient_magnitude_and_angle(I)

        X_p = self.projector_p.predict(self._unit_grid(patch_size[:2]))
        kdes_dims = self.projector_o.ndim * self.projector_p.ndim

        origins = self._patch_origins((nX, nY), patch_size, subsample)
        ret = numpy.zeros((len(origins), kdes_dims))
        for pos, (sx, sy) in enumerate(origins):
            window = (slice(sx, sx + patch_size[0]),
                      slice(sy, sy + patch_size[1]))
            # Magnitudes and angles are both read from the current window so
            # they refer to the same pixels (row-major order, like X_p).
            magnitudes = Ig_magnitude[window].reshape(-1)
            norm = numpy.sqrt(self.epsilon_g + numpy.sum(magnitudes ** 2))
            angles = Ig_angle[window].reshape(-1)
            X_o = self.projector_o.predict(angles[:, numpy.newaxis])

            aux = numpy.zeros(kdes_dims)
            for weight, x_o, x_p in zip(magnitudes, X_o, X_p):
                aux += weight * numpy.kron(x_o, x_p)
            ret[pos, :] = aux / norm
        return self.kpca_op.predict(ret, components=200).flatten()

    def _calc_color_match_kernel_for_image(self, I, patch_size, subsample):
        """Color match-kernel features of one image.

        Parameters
        ----------
        I : array, shape (nX, nY, nchannels)
            Pixel values are copied into a 3-column buffer, so 3 channels
            are expected.
        patch_size : (int, int)
            Patch height and width in pixels.
        subsample : (int, int)
            Step between consecutive patch origins along each axis.

        Returns
        -------
        1-D array: the flattened ``kpca_cp`` projection (200 components
        requested) of one descriptor per patch.
        """
        nX, nY, nchannels = I.shape

        X_p = self.projector_p.predict(self._unit_grid(patch_size[:2]))
        kdes_dims = self.projector_c.ndim * self.projector_p.ndim

        origins = self._patch_origins((nX, nY), patch_size, subsample)
        ret = numpy.zeros((len(origins), kdes_dims))
        X_c = numpy.zeros((patch_size[0] * patch_size[1], 3))
        for pos, (sx, sy) in enumerate(origins):
            # Colors of the current window (previously always the top-left
            # patch because the offset was ignored).
            patch = I[sx:sx + patch_size[0], sy:sy + patch_size[1], :]
            X_c[:, :] = patch.reshape(-1, nchannels)
            X_c_proj = self.projector_c.predict(X_c)

            aux = numpy.zeros(kdes_dims)
            for x_c, x_p in zip(X_c_proj, X_p):
                aux += numpy.kron(x_c, x_p)
            ret[pos, :] = aux
        return self.kpca_cp.predict(ret, components=200).flatten()

    def predict(self, X, patch_size=(16, 16), subsample=(8, 8),
                match_kernel='gradient'):
        """Extract match-kernel features for a batch of images.

        Parameters
        ----------
        X : array, 4-D
            Images stacked along the first axis.
        patch_size, subsample : (int, int)
            Forwarded to the per-image extractor.
        match_kernel : {'gradient', 'color'}

        Returns
        -------
        numpy array with one feature row per image (both kernels).

        Raises
        ------
        ValueError
            If ``match_kernel`` is not one of the supported names.
        """
        assert X.ndim == 4
        n = X.shape[0]

        print("Match kernel: %s" % match_kernel)
        if match_kernel == 'gradient':
            extract = self._calc_gradient_match_kernel_for_image
        elif match_kernel == 'color':
            extract = self._calc_color_match_kernel_for_image
        else:
            # ValueError is an Exception subclass, so existing handlers
            # written for the previous bare Exception still catch it.
            raise ValueError("Unknown match kernel")

        return numpy.array([extract(X[i], patch_size, subsample)
                            for i in tqdm(range(n))])
from kmeans import Kmeans

# Three newsgroup categories, embedded as dense TF-IDF vectors.
cats = ['sci.med', 'misc.forsale', 'soc.religion.christian']
newsgroups_all = fetch_20newsgroups(subset='all', categories=cats)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_all.data)
X = vectors.toarray()
y = newsgroups_all.target

# only take 800 for training and 200 for testing
X_train, X_test = X[:800, :], X[800:1000, :]
y_train, y_test = y[:800], y[800:1000]

# Kernel PCA fitted on the training split, then applied to both splits.
kernel = GaussianKernel(sigma=1)  # to change
kpca = KernelPCA(kernel)
kpca.fit(X_train)

n_components = 2  # to change
X_train_proj = kpca.predict(X_train, components=n_components)
X_test_proj = kpca.predict(X_test, components=n_components)

# Every relabelling of the three cluster ids.
permuts = numpy.array([
    [0, 1, 2],
    [0, 2, 1],
    [1, 0, 2],
    [1, 2, 0],
    [2, 0, 1],
    [2, 1, 0],
])


def find_permut_for_prediction(y_pred, y, permuts):
    """Relabel ``y_pred`` with each row of ``permuts`` in turn.

    Only the beginning of the search is present in this file: the loop
    builds the relabelled predictions for every permutation, but nothing
    is compared against ``y`` and no value is returned.
    """
    y_pred_best = y_pred
    accuracy_best = 0.
    permut_best = permuts[0, :]
    for permut in permuts:
        # Map each predicted label through the current permutation.
        y_pred_current = [permut[label] for label in y_pred]
class Isomap(BaseEstimator, TransformerMixin):
    """Isomap Embedding

    Non-linear dimensionality reduction through Isometric Mapping

    Parameters
    ----------
    n_neighbors : integer
        number of neighbors to consider for each point.

    n_components : integer
        number of coordinates for the manifold

    eigen_solver : ['auto'|'arpack'|'dense']
        'auto' : Attempt to choose the most efficient solver
            for the given problem.
        'arpack' : Use Arnoldi decomposition to find the eigenvalues
            and eigenvectors.
        'dense' : Use a direct solver (i.e. LAPACK)
            for the eigenvalue decomposition.

    tol : float
        Convergence tolerance passed to arpack or lobpcg.
        not used if eigen_solver == 'dense'.

    max_iter : integer
        Maximum number of iterations for the arpack solver.
        not used if eigen_solver == 'dense'.

    path_method : string ['auto'|'FW'|'D']
        Method to use in finding shortest path.
        'auto' : attempt to choose the best algorithm automatically.
        'FW' : Floyd-Warshall algorithm.
        'D' : Dijkstra's algorithm.

    neighbors_algorithm : string ['auto'|'brute'|'kd_tree'|'ball_tree']
        Algorithm to use for nearest neighbors search,
        passed to neighbors.NearestNeighbors instance.

    Attributes
    ----------
    embedding_ : array-like, shape (n_samples, n_components)
        Stores the embedding vectors.

    kernel_pca_ : object
        `KernelPCA` object used to implement the embedding.

    training_data_ : array-like, shape (n_samples, n_features)
        Stores the training data.

    nbrs_ : sklearn.neighbors.NearestNeighbors instance
        Stores nearest neighbors instance, including BallTree or KDtree
        if applicable.  Rebuilt from the current parameters on every fit.

    dist_matrix_ : array-like, shape (n_samples, n_samples)
        Stores the geodesic distance matrix of training data.

    References
    ----------

    .. [1] Tenenbaum, J.B.; De Silva, V.; & Langford, J.C. A global geometric
           framework for nonlinear dimensionality reduction. Science 290 (5500)
    """

    def __init__(self, n_neighbors=5, n_components=2, eigen_solver='auto',
                 tol=0, max_iter=None, path_method='auto',
                 neighbors_algorithm='auto'):
        self.n_neighbors = n_neighbors
        self.n_components = n_components
        self.eigen_solver = eigen_solver
        self.tol = tol
        self.max_iter = max_iter
        self.path_method = path_method
        self.neighbors_algorithm = neighbors_algorithm
        # Kept so that ``nbrs_`` exists right after construction, as before;
        # fitting replaces it with a model built from the current parameters.
        self.nbrs_ = NearestNeighbors(n_neighbors=n_neighbors,
                                      algorithm=neighbors_algorithm)

    def _fit_transform(self, X):
        """Fit the neighbors model on X and compute the embedding.

        Sets ``nbrs_``, ``training_data_``, ``kernel_pca_``,
        ``dist_matrix_`` and ``embedding_``; returns nothing.
        """
        X = check_array(X)
        # Rebuild the neighbors model here: the instance created in __init__
        # captured the constructor arguments, so parameters changed later
        # (e.g. through set_params) would otherwise be ignored.
        self.nbrs_ = NearestNeighbors(n_neighbors=self.n_neighbors,
                                      algorithm=self.neighbors_algorithm)
        self.nbrs_.fit(X)
        self.training_data_ = self.nbrs_._fit_X
        self.kernel_pca_ = KernelPCA(n_components=self.n_components,
                                     kernel="precomputed",
                                     eigen_solver=self.eigen_solver,
                                     tol=self.tol, max_iter=self.max_iter)

        kng = kneighbors_graph(self.nbrs_, self.n_neighbors,
                               mode='distance')

        self.dist_matrix_ = graph_shortest_path(kng,
                                                method=self.path_method,
                                                directed=False)
        # Isomap kernel before centering: -0.5 * squared geodesic distances.
        G = self.dist_matrix_ ** 2
        G *= -0.5

        self.embedding_ = self.kernel_pca_.fit_transform(G)

    def reconstruction_error(self):
        """Compute the reconstruction error for the embedding.

        Returns
        -------
        reconstruction_error : float

        Notes
        -------
        The cost function of an isomap embedding is

        ``E = frobenius_norm[K(D) - K(D_fit)] / n_samples``

        Where D is the matrix of distances for the input data X,
        D_fit is the matrix of distances for the output embedding X_fit,
        and K is the isomap kernel:

        ``K(D) = -0.5 * (I - 1/n_samples) * D^2 * (I - 1/n_samples)``
        """
        G = -0.5 * self.dist_matrix_ ** 2
        G_center = KernelCenterer().fit_transform(G)
        evals = self.kernel_pca_.lambdas_
        return np.sqrt(np.sum(G_center ** 2) - np.sum(evals ** 2)) / G.shape[0]

    def fit(self, X, y=None):
        """Compute the embedding vectors for data X

        Parameters
        ----------
        X : {array-like, sparse matrix, BallTree, KDTree, NearestNeighbors}
            Sample data, shape = (n_samples, n_features), in the form of a
            numpy array, precomputed tree, or NearestNeighbors object.
            NOTE(review): X is passed through ``check_array`` first;
            confirm that tree / NearestNeighbors inputs are still accepted.

        Returns
        -------
        self : returns an instance of self.
        """
        self._fit_transform(X)
        return self

    def fit_transform(self, X, y=None):
        """Fit the model from data in X and transform X.

        Parameters
        ----------
        X: {array-like, sparse matrix, BallTree, KDTree}
            Training vector, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        X_new: array-like, shape (n_samples, n_components)
        """
        self._fit_transform(X)
        return self.embedding_

    def transform(self, X):
        """Transform X.

        This is implemented by linking the points X into the graph of geodesic
        distances of the training data. First the `n_neighbors` nearest
        neighbors of X are found in the training data, and from these the
        shortest geodesic distances from each point in X to each point in
        the training data are computed in order to construct the kernel.
        The embedding of X is the projection of this kernel onto the
        embedding vectors of the training set.

        Parameters
        ----------
        X: array-like, shape (n_samples, n_features)

        Returns
        -------
        X_new: array-like, shape (n_samples, n_components)
        """
        X = check_array(X)
        distances, indices = self.nbrs_.kneighbors(X, return_distance=True)

        # Create the graph of shortest distances from X to
        # self.training_data_ via the nearest neighbors of X.
        # This can be done as a single array operation, but it potentially
        # takes a lot of memory.  To avoid that, use a loop:
        G_X = np.zeros((X.shape[0], self.training_data_.shape[0]))
        for i in range(X.shape[0]):
            # Shortest route to every training point through any of the
            # neighbors of sample i.
            G_X[i] = np.min((self.dist_matrix_[indices[i]]
                             + distances[i][:, None]), 0)

        G_X **= 2
        G_X *= -0.5

        return self.kernel_pca_.transform(G_X)
def load_features(feature_extractor_name, overwrite_features=True, overwrite_kpca=True, do_kpca=False,
                  kpca_kernel=None, cut_percentage=90, folder_name='data/'):
    """Load the data set and turn it into (optionally KPCA-reduced) features.

    Parameters
    ----------
    feature_extractor_name : str
        Name handed to ``get_feature_extractor``; also part of every cache
        file name.
    overwrite_features : bool
        When false, feature files already present in ``folder_name`` are
        loaded instead of being recomputed.
    overwrite_kpca : bool
        When false (together with ``overwrite_features`` false and
        ``do_kpca`` true), existing KPCA output files are returned directly.
    do_kpca : bool
        Apply ``KernelPCA(kpca_kernel)`` to the features and save the result.
    kpca_kernel : object or None
        Kernel for the KPCA step; its ``name`` attribute is used in the
        cache file names.
    cut_percentage : number
        Forwarded to ``KernelPCA.fit``.
    folder_name : str
        Prefix prepended verbatim to every file name.

    Returns
    -------
    Xtrain, Ytrain, Xtest
    """
    def feature_cache(prefix):
        # (name given to numpy.save, name of the .npy file to look for)
        stem = folder_name + prefix + feature_extractor_name
        return stem, stem + '.npy'

    def kpca_cache():
        kernel_name = kpca_kernel.name
        file_suffix = '_' + feature_extractor_name + '_' + kernel_name + '.npy'
        return (folder_name + 'Xtrain' + file_suffix,
                folder_name + 'Xtest' + file_suffix)

    Xtrain, Ytrain, Xtest = load_data(folder_name)
    reuse_features = not overwrite_features

    # Fast path: a finished KPCA result is already on disk.
    if reuse_features and not overwrite_kpca and do_kpca:
        assert kpca_kernel is not None
        kpca_train_file, kpca_test_file = kpca_cache()
        if os.path.isfile(kpca_train_file) and os.path.isfile(kpca_test_file):
            return numpy.load(kpca_train_file), Ytrain, numpy.load(kpca_test_file)

    feature_extractor = get_feature_extractor(feature_extractor_name)

    if feature_extractor_name in ('hog_fisher', 'sift_fisher'):
        # These extractors are trained on the training set and need the
        # resulting V_truncate / gmm to encode the test set.
        train_stem, train_file = feature_cache('Xtrain_')
        test_stem, test_file = feature_cache('Xtest_')
        if reuse_features and os.path.isfile(train_file) and os.path.isfile(test_file):
            Xtrain = numpy.load(train_file)
            Xtest = numpy.load(test_file)
        else:
            Xtrain, V_truncate, gmm = feature_extractor.train(Xtrain)
            Xtest = feature_extractor.predict(Xtest, V_truncate, gmm)
            numpy.save(train_stem, Xtrain)
            numpy.save(test_stem, Xtest)
    elif feature_extractor_name == 'bag_of_words_hog':
        train_stem, train_file = feature_cache('Xtrain_')
        test_stem, test_file = feature_cache('Xtest_')
        if reuse_features and os.path.isfile(train_file) and os.path.isfile(test_file):
            Xtrain = numpy.load(train_file)
            Xtest = numpy.load(test_file)
        else:
            # Extract on both sets, fit on the training set only, then
            # encode both sets.
            Xtrain = feature_extractor.extract(Xtrain)
            Xtest = feature_extractor.extract(Xtest)
            feature_extractor.fit(Xtrain)
            Xtrain = feature_extractor.predict(Xtrain)
            Xtest = feature_extractor.predict(Xtest)
            numpy.save(train_stem, Xtrain)
            numpy.save(test_stem, Xtest)
    elif feature_extractor is not None:
        def load_or_predict(prefix, data):
            # Train and test caches are checked independently of each other.
            stem, cache_file = feature_cache(prefix)
            if reuse_features and os.path.isfile(cache_file):
                return numpy.load(cache_file)
            features = feature_extractor.predict(data)
            numpy.save(stem, features)
            return features

        Xtrain = load_or_predict('Xtrain_', Xtrain)
        Xtest = load_or_predict('Xtest_', Xtest)

    if do_kpca:
        kpca = KernelPCA(kpca_kernel)
        kpca.fit(Xtrain, cut_percentage=cut_percentage)
        Xtrain = kpca.predict(Xtrain)
        Xtest = kpca.predict(Xtest)
        kpca_train_file, kpca_test_file = kpca_cache()
        numpy.save(kpca_train_file, Xtrain)
        numpy.save(kpca_test_file, Xtest)

    return Xtrain, Ytrain, Xtest