def __init__(self, x, y, z, num_bootstrap=99, kernel_type='gauss', bandwidth=None, index=1, seed=1, numba=True):
    """Prepare the kernel matrix, distance matrices and statistics helper.

    Args:
        x: First sample; converted with ``utils.as_matrix`` before use.
        y: Second sample; converted with ``utils.as_matrix`` before use.
        z: Conditioning sample handed to ``KernelDensityEstimation``.
        num_bootstrap: Number of entries allocated for the permuted
            statistics (stored as ``self.B``).
        kernel_type: ``'gauss'`` or ``'rectangle'``; any other value is
            silently replaced by ``'gauss'``.
        bandwidth: Forwarded unchanged to ``KernelDensityEstimation``.
        index: Forwarded unchanged to ``utils.compute_distance_matrix``.
        seed: Stored on the instance as ``self.seed``.
        numba: Forwarded unchanged to ``CDCStats``.

    Raises:
        AssertionError: If the two distance matrices and the kernel matrix
            do not all share the same shape.
    """
    supported_kernels = {'gauss', 'rectangle'}
    # Unknown kernel names fall back to the default instead of raising.
    # NOTE(review): kernel_type is only stored here; it is not passed to
    # KernelDensityEstimation in this method -- confirm that is intended.
    self.kernel_type = kernel_type if kernel_type in supported_kernels else 'gauss'

    density_estimator = KernelDensityEstimation(utils.as_matrix(z), bandwidth)
    self.kernel = density_estimator.compute_kernel_density_estimate()

    self.dist_x = utils.compute_distance_matrix(utils.as_matrix(x), index)
    self.dist_y = utils.compute_distance_matrix(utils.as_matrix(y), index)

    # All three matrices are consumed together by CDCStats, so they must agree in shape.
    assert self.dist_x.shape == self.dist_y.shape == self.kernel.shape

    self.stats = CDCStats(self.dist_x, self.dist_y, self.kernel, numba=numba)

    # Result slots; they hold placeholders until the statistics are computed elsewhere.
    self.cdcov_stats = 0.
    self.B = num_bootstrap
    self.permuted_cdcov_stats = np.zeros(self.B)
    self.seed = seed
    self.p_value = 0.
def cross_validation_with_and_without_manifold(X, y, n_neighbors, n_components, k):
    """Cross-validate an SVM on a shortest-path graph kernel, with and without Isomap.

    For every fold two classifiers are trained and scored on the held-out part:

    1. an SVC fitted directly on the precomputed kernel matrix;
    2. a linear SVC fitted on an Isomap embedding of the distance matrices
       obtained from the kernel matrices via ``compute_distance_matrix``.

    Args:
        X: Collection of graphs; must support indexing with the index arrays
            produced by ``KFold.split`` (e.g. a NumPy object array).
        y: Labels, indexable the same way as ``X``.
        n_neighbors: Number of neighbours used by Isomap.
        n_components: Dimensionality of the Isomap embedding.
        k: Number of folds.

    Returns:
        tuple[list, list]: Per-fold accuracies in percent, first for the plain
        kernel SVM, second for the Isomap + linear SVM pipeline.
    """
    kf = KFold(n_splits=k)

    scores = []
    scores2 = []

    for train_index, test_index in kf.split(X):
        kernel = GraphKernel(kernel={"name": "shortest_path", "with_labels": False}, normalize=True)

        # Train/test partition of the current fold.
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Kernel matrices: fitted on the training graphs, then applied to the test graphs.
        K_train = kernel.fit_transform(X_train)
        K_test = kernel.transform(X_test)

        # Baseline: SVM working directly on the precomputed kernel.
        clf = svm.SVC(kernel='precomputed', C=4)
        clf.fit(K_train, y_train)
        y_pred = clf.predict(K_test)
        scores.append(accuracy_score(y_test, y_pred))

        # Turn the kernel matrices into distance matrices for Isomap.
        D_train = compute_distance_matrix(K_train)
        D_test = compute_distance_matrix(K_test)

        # Isomap's constructor arguments are keyword-only in current
        # scikit-learn releases, so they must be passed by name.
        embedding = manifold.Isomap(
            n_neighbors=n_neighbors,
            n_components=n_components,
            metric="precomputed",
        )
        E_train = embedding.fit_transform(D_train)
        E_test = embedding.transform(D_test)

        # A separate classifier is required: the first one was fitted on a
        # precomputed kernel matrix, whereas the embedding yields coordinates.
        clf2 = svm.SVC(kernel='linear', C=4)
        clf2.fit(E_train, y_train)
        y_pred = clf2.predict(E_test)
        scores2.append(accuracy_score(y_test, y_pred))

    # Report accuracies as percentages.
    scores = [score * 100 for score in scores]
    scores2 = [score * 100 for score in scores2]
    return scores, scores2
# Evaluate an evenly spaced thinning of the post-burn-in chain and save the
# resulting diagnostics (star discrepancy, KSD, ED) to ``output_path``.
M = arguments.N_KEEP
burnin = arguments.burnin
output_path = arguments.output_path

# Everything after the burn-in period; its covariance and mean are passed
# to utils.discrepency below.
post_burnin = chain[burnin:]
sigma_chain = np.cov(post_burnin.T)
mean_chain = np.mean(post_burnin, axis=0)

# M evenly spaced positions between the end of burn-in and the last sample.
indexes = np.linspace(burnin, len(chain) - 1, M, dtype=int)

chol_sigma = np.linalg.cholesky(sigma_chain)
star_discrepency = utils.discrepency(chain[indexes], post_burnin, chol_sigma, mean_chain)

# Kernel built from the full chain and its gradients (make_imq is defined elsewhere).
vfk0 = make_imq(chain, gradient, pre='med')
print("Star discrepency done")

thin_mat = kmat(chain[indexes], gradient[indexes], vfk0)
KSD_thin = np.sqrt(np.mean(thin_mat))
print("KSD done")

# Twice the mean thinned-vs-post-burn-in distance minus the mean distance
# within the thinned sample (presumably an energy-distance-style score).
cross_term = np.mean(utils.compute_distance_matrix(chain[indexes], post_burnin))
within_term = np.mean(utils.compute_distance_matrix(chain[indexes], chain[indexes]))
ED_thin = 2 * cross_term - within_term
print("ED done")

d = {
    "thinning": indexes,
    "ED": ED_thin,
    "KSD": KSD_thin,
    "star_discrepency": star_discrepency,
    "burnin": burnin,
}
# The payload is a dict, so NumPy has to pickle it.
np.save(output_path, d, allow_pickle=True)
##OLD CODE shortestPathKernel = GraphKernel(kernel={ "name": "shortest_path", "with_labels": False }, normalize=True) # Calculate the kernel matrix. K = shortestPathKernel.fit_transform(X) nan_elements = np.any(np.isnan(K)) # Compute the distance matrix D D = compute_distance_matrix(K) embedding = manifold.Isomap(n_neighbors=5, n_components=10, metric="precomputed") X_transformed = embedding.fit_transform(D) # xs = feature_vectors[:, 0] # ys = feature_vectors[:, 1] # # plt.scatter(xs, ys, c=y) # plt.show() # print(np.all(np.isfinite(K_train))) X_train, X_test, y_train, y_test = train_test_split(X_transformed,
def spk_isomap(X, y, k, KNNstart, KNNend, Dstart, Dend, svmC):
    """Grid-search Isomap settings for a shortest-path-kernel + linear-SVM pipeline.

    Every combination of neighbour count in ``[KNNstart, KNNend]`` and
    embedding dimension in ``[Dstart, Dend]`` is scored with k-fold
    cross-validation. The mean accuracy of each combination is appended to
    ``accuracy.txt`` (one text row per neighbour count) and printed together
    with a progress indicator.

    Args:
        X: Collection of graphs; must support indexing with the index arrays
            produced by ``KFold.split``.
        y: Labels, indexable the same way as ``X``.
        k: Number of cross-validation folds.
        KNNstart: Smallest number of Isomap neighbours (inclusive).
        KNNend: Largest number of Isomap neighbours (inclusive).
        Dstart: Smallest embedding dimension (inclusive).
        Dend: Largest embedding dimension (inclusive).
        svmC: Regularisation parameter ``C`` of the linear SVM.

    Returns:
        numpy.ndarray: Array of shape ``(number of dimensions, number of
        neighbour counts)`` whose entry ``[d, knn]`` is the mean fold accuracy
        for that combination.
    """
    filename = "accuracy.txt"

    KNN = list(range(KNNstart, KNNend + 1))
    D = list(range(Dstart, Dend + 1))
    KNNrange = len(KNN)
    Drange = len(D)

    Z = np.empty((Drange, KNNrange))

    # The context manager guarantees the log file is closed even if a fit fails.
    with open(filename, 'a') as myfile:
        myfile.write('SP Isomap accuracy: K = %d-%d, D = %d-%d, C = %d, K-fold = %d\n'
                     % (KNNstart, KNNend, Dstart, Dend, svmC, k))

        # Kernel and distance matrices depend only on the fold, not on the
        # Isomap hyper-parameters, so compute them once instead of once per
        # grid cell.
        folds = []
        for train_index, test_index in KFold(n_splits=k).split(X):
            kernel = GraphKernel(kernel={"name": "shortest_path", "with_labels": False}, normalize=True)
            K_train = kernel.fit_transform(X[train_index])
            K_test = kernel.transform(X[test_index])
            folds.append((
                compute_distance_matrix(K_train),
                compute_distance_matrix(K_test),
                y[train_index],
                y[test_index],
            ))

        for knn, n_neighbors in enumerate(KNN):
            for d, n_components in enumerate(D):
                # Start from an empty list for every grid cell; reusing one
                # list across cells would turn each value into a running mean
                # over all earlier combinations.
                scores = []
                for D_train, D_test, y_train, y_test in folds:
                    embedding = manifold.Isomap(n_neighbors=n_neighbors,
                                                n_components=n_components,
                                                metric="precomputed")
                    E_train = embedding.fit_transform(D_train)
                    E_test = embedding.transform(D_test)

                    clf2 = svm.SVC(kernel='linear', C=svmC)
                    clf2.fit(E_train, y_train)
                    y_pred = clf2.predict(E_test)
                    scores.append(accuracy_score(y_test, y_pred))

                val = np.mean(scores)
                Z[d][knn] = val
                myfile.write("%f " % (val))
                print("knn = ", n_neighbors, "d = ", n_components, " accuracy = ", Z[d][knn])
                print("{0:.2%} done".format((Drange * knn + d + 1.0) / (Drange * KNNrange)))
            # One text row per neighbour count.
            myfile.write("\n")

    return Z
#!/usr/bin/env python
"""Compute the distance matrix of a BIOM OTU table and pickle it to disk."""
import pickle

import utils
import bmu

# set the paths to the mapping and biom files
biom_fp = "../data/study_550_closed_reference_otu_table.biom"
map_fp = "../data/study_550_mapping_file.txt"

data, sample_ids, otus_names = bmu.load_biom(biom_fp)
dist_mat = utils.compute_distance_matrix(data)

# NOTE(review): the inputs are named "study_550" but the output says
# "study_500" -- possibly a typo; the path is left unchanged so existing
# consumers of the pickle keep working.
# The context manager makes sure the file is flushed and closed.
with open("../data/distances_study_500.pkl", "wb") as out_file:
    pickle.dump(dist_mat, out_file)