import joblib

# Repository modules, aliased as used below:
#   rbk    - hash graph kernel (hash_graph_kernel)
#   sp_exp - explicit shortest-path kernel
#   wl     - Weisfeiler-Lehman subtree kernel
#   aux    - auxiliary methods (gram-matrix normalization)
#   dp     - dataset parsing / LIBSVM output


def main():
    # Load ENZYMES data set
    # graph_db, classes = dp.read_txt("ENZYMES")
    obj = joblib.load("../sample.graph.jbl")
    graph_db = obj["graph"]
    classes = obj["label"]
    print(graph_db[0])
    print(classes)
    print(len(classes))
    print("*****1")

    # Parameters used:
    # Compute gram matrix: False
    # Normalize gram matrix: False
    # Use discrete labels: 1 (node labels)
    kernel_parameters_sp = [False, False, 1]

    # Parameters used:
    # Number of iterations for WL: 3
    # Compute gram matrix: False
    # Normalize gram matrix: False
    # Use discrete labels: 1 (node labels)
    kernel_parameters_wl = [3, False, False, 1]

    # Compute gram matrix for HGK-SP
    # 20 is the number of iterations
    gram_matrix = rbk.hash_graph_kernel(graph_db,
                                        sp_exp.shortest_path_kernel,
                                        kernel_parameters_sp,
                                        20,
                                        scale_attributes=True,
                                        lsh_bin_width=1.0,
                                        sigma=1.0)
    # Normalize gram matrix
    gram_matrix = aux.normalize_gram_matrix(gram_matrix)
    print("****1")

    # Compute gram matrix for HGK-WL (overwrites the HGK-SP matrix above)
    # 20 is the number of iterations
    gram_matrix = rbk.hash_graph_kernel(graph_db,
                                        wl.weisfeiler_lehman_subtree_kernel,
                                        kernel_parameters_wl,
                                        20,
                                        scale_attributes=True,
                                        lsh_bin_width=1.0,
                                        sigma=1.0)
    # Normalize gram matrix
    gram_matrix = aux.normalize_gram_matrix(gram_matrix)

    # Write out LIBSVM matrix
    dp.write_lib_svm(gram_matrix, classes, "gram_matrix")
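
# For reference, aux.normalize_gram_matrix is used throughout this code base.
# Below is a minimal standalone sketch, assuming it performs the usual cosine
# normalization of a gram matrix (the function name with the _sketch suffix is
# ours, not the repository's).
import numpy as np


def normalize_gram_matrix_sketch(gram_matrix):
    """Cosine-normalize a gram matrix: K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j])."""
    diag = np.sqrt(np.diag(gram_matrix))
    # Guard against zero diagonal entries (empty feature vectors).
    diag[diag == 0.0] = 1.0
    return gram_matrix / np.outer(diag, diag)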
# aux: auxiliary methods (normalization); ke: kernel evaluation (SVM routines).


def eval_kernel(kernel, classes, mode, n_reps=10, all_std=True):
    """Evaluates a list of kernels; each one is normalized before evaluation.

    Args:
        kernel (list): feature vectors (mode 'LINEAR') or gram matrices (mode 'KERNEL')
        classes (list): dataset class labels
        mode (string): either 'LINEAR' or 'KERNEL'
        n_reps (int, optional): number of repetitions of 10-CV. Defaults to 10.
        all_std (bool, optional): whether to report all standard deviations. Defaults to True.

    Returns:
        tuple: evaluation results
    """
    normalized = []
    print(f'Starting normalization of {len(kernel)} elements...')
    for array in kernel:
        if mode == 'LINEAR':
            normalized.append(aux.normalize_feature_vector(array))
        else:
            normalized.append(aux.normalize_gram_matrix(array))

    print(f'Normalization finished, starting {mode} SVM...')
    if mode == 'LINEAR':
        return ke.linear_svm_evaluation(normalized,
                                        classes,
                                        num_repetitions=n_reps,
                                        all_std=all_std)
    return ke.kernel_svm_evaluation(normalized,
                                    classes,
                                    num_repetitions=n_reps,
                                    all_std=all_std)
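
# A hedged usage sketch for eval_kernel. The file names follow the pattern
# used by the plotting helper below, load_csv is the same repository helper,
# and the joblib sample file is the one loaded in main() above; treat the
# concrete paths as hypothetical.
gram_wl3 = load_csv("kernels/node_labels/ENZYMES_gram_matrix_wl3.csv")
gram_sp = load_csv("kernels/node_labels/ENZYMES_gram_matrix_shortestpath.csv")
classes = joblib.load("../sample.graph.jbl")["label"]
print(eval_kernel([gram_wl3, gram_sp], classes, mode='KERNEL'))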
import os

import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import KernelPCA
from sklearn.metrics import normalized_mutual_info_score as nmi


def plot_kpca_nmi_and_clustering(classes):
    ds_name = 'ENZYMES'
    base_path = os.path.join("kernels", "node_labels")
    fig, axs = plt.subplots(3, 3, figsize=(15, 15))
    representations = ["wl3", "graphlet", "shortestpath"]
    for i, representation in enumerate(representations):
        gram = load_csv(
            os.path.join(base_path,
                         f"{ds_name}_gram_matrix_{representation}.csv"))
        gram = aux.normalize_gram_matrix(gram)
        kpca = KernelPCA(n_components=100, kernel="precomputed")
        reduced_kpca = kpca.fit_transform(gram)

        # Row 0: KPCA embedding colored by ground-truth classes.
        axs[0][i].scatter(reduced_kpca[:, 0], reduced_kpca[:, 1], c=classes, s=1)
        axs[0][i].set_title(f'{representation} KPCA ground truth')

        # Row 1: KMeans clustering of the embedding.
        kmeans = KMeans(n_clusters=6).fit(reduced_kpca)
        axs[1][i].scatter(reduced_kpca[:, 0], reduced_kpca[:, 1],
                          c=kmeans.labels_, s=1)
        axs[1][i].set_title(f'{representation} KPCA KMeans')
        print(f"NMI KMeans {representation}: {nmi(classes, kmeans.labels_)}")

        # Row 2: DBSCAN clustering of the embedding.
        db = DBSCAN().fit(reduced_kpca)
        axs[2][i].scatter(reduced_kpca[:, 0], reduced_kpca[:, 1],
                          c=db.labels_, s=1)
        axs[2][i].set_title(f'{representation} KPCA DBSCAN')
        print(f"NMI DBSCAN {representation}: {nmi(classes, db.labels_)}\n")
    plt.show()
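
# One caveat worth noting: DBSCAN marks noise points with the label -1, which
# the NMI above counts as one extra cluster. A possible refinement (ours, not
# part of the repository) is to score non-noise points only:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import normalized_mutual_info_score as nmi


def nmi_without_noise(classes, embedding):
    """NMI of a DBSCAN clustering, scored on non-noise points only."""
    labels = DBSCAN().fit(embedding).labels_
    mask = labels != -1  # DBSCAN labels noise points with -1
    if not mask.any():
        return float('nan')  # every point was labeled noise
    return nmi(np.asarray(classes)[mask], labels[mask])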
import math as m

import numpy as np
import scipy.sparse as sparse
import sklearn.preprocessing as pre

# aux: repository module (locally_sensitive_hashing, normalize_gram_matrix);
# an LSH sketch follows after this function.


def hash_graph_kernel(graph_db,
                      base_kernel,
                      kernel_parameters,
                      iterations=20,
                      lsh_bin_width=1.0,
                      sigma=1.0,
                      normalize_gram_matrix=True,
                      use_gram_matrices=False,
                      scale_attributes=True):
    num_vertices = 0
    for g in graph_db:
        num_vertices += g.num_vertices()
    n = len(graph_db)

    # Attribute dimension, read off the first vertex of the first graph.
    g = graph_db[0]
    v = list(graph_db[0].vertices())[0]
    dim_attributes = len(g.vp.na[v])

    colors_0 = np.zeros([num_vertices, dim_attributes])
    offset = 0
    gram_matrix = np.zeros([n, n])

    # Get attributes from all graph instances
    graph_indices = []
    for g in graph_db:
        for i, v in enumerate(g.vertices()):
            colors_0[i + offset] = g.vp.na[v]
        graph_indices.append((offset, offset + g.num_vertices() - 1))
        offset += g.num_vertices()

    # Normalize attributes: center to the mean and component-wise scale to unit variance
    if scale_attributes:
        colors_0 = pre.scale(colors_0, axis=0)

    for it in range(0, iterations):
        # Discretize the continuous attributes via LSH.
        colors_hashed = aux.locally_sensitive_hashing(colors_0,
                                                      dim_attributes,
                                                      lsh_bin_width,
                                                      sigma=sigma)

        tmp = base_kernel(graph_db, colors_hashed, *kernel_parameters)

        if it == 0 and not use_gram_matrices:
            feature_vectors = tmp
        else:
            if use_gram_matrices:
                # Accumulate the averaged gram matrix directly.
                feature_vectors = tmp
                feature_vectors = feature_vectors.tocsr()
                feature_vectors = m.sqrt(1.0 / iterations) * (feature_vectors)
                gram_matrix += feature_vectors.dot(feature_vectors.T).toarray()
            else:
                # Stack this iteration's feature vectors onto the previous ones.
                feature_vectors = sparse.hstack((feature_vectors, tmp))

    feature_vectors = feature_vectors.tocsr()

    if not use_gram_matrices:
        # Normalize feature vectors
        feature_vectors = m.sqrt(1.0 / iterations) * (feature_vectors)
        # Compute Gram matrix
        gram_matrix = feature_vectors.dot(feature_vectors.T)
        gram_matrix = gram_matrix.toarray()

    if normalize_gram_matrix:
        gram_matrix = aux.normalize_gram_matrix(gram_matrix)

    return gram_matrix
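
# The hashing step relies on the repository's aux.locally_sensitive_hashing.
# Below is a minimal sketch of the standard random-projection (2-stable) LSH
# it presumably implements; the function name and details here are assumptions,
# not the repository code.
import numpy as np


def lsh_sketch(attributes, dim, bin_width, sigma=1.0):
    """Map continuous vertex attributes to discrete hash values via a
    Gaussian random projection followed by binning."""
    rng = np.random.default_rng()
    w = rng.normal(0.0, sigma, dim)   # random projection direction
    b = rng.uniform(0.0, bin_width)   # random offset
    # Vertices whose projections land in the same bin receive the same color.
    return np.floor((attributes.dot(w) + b) / bin_width).astype(np.int64)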
def eval_wl(data, classes):
    """Evaluates the gram matrices of WL kernels.

    Args:
        data (dict): contains the WL gram matrices under data["gram_matrix"]["wl"]
        classes (list): dataset class labels
    """
    for array in data["gram_matrix"]["wl"]:
        normalized = [aux.normalize_gram_matrix(array)]
        print(
            ke.kernel_svm_evaluation(normalized,
                                     classes,
                                     num_repetitions=10,
                                     all_std=True))
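
# A hedged usage sketch for eval_wl. The nested-dict layout is inferred from
# the lookups above; the file name is hypothetical.
import joblib

data = joblib.load("results.jbl")
# Expected layout: data = {"gram_matrix": {"wl": [K_1, K_2, ...]}},
# one gram matrix per WL iteration count.
eval_wl(data, classes)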
import numpy as np
import scipy.sparse as sparse

import graph_tool.all as gt  # graph_tool.all re-exports graph_tool.spectral.adjacency

# Repository modules:
#   log_pl - list of precalculated logarithms of prime numbers
#   aux    - auxiliary methods (normalize_gram_matrix)
#   compute_coloring - one WL refinement step (a sketch follows this function)


def weisfeiler_lehman_subtree_kernel(graph_db, hashed_attributes, *params):
    # Positional parameters (cf. kernel_parameters_wl in the driver above).
    iterations = params[0]
    compute_gram_matrix = params[1]
    normalize_gram_matrix = params[2]
    use_labels = params[3]

    # Create one empty feature vector for each graph
    feature_vectors = []
    for _ in graph_db:
        feature_vectors.append(np.zeros(0, dtype=np.float64))

    # Construct block diagonal matrix of all adjacency matrices
    adjacency_matrices = []
    for g in graph_db:
        adjacency_matrices.append(gt.adjacency(g))
    M = sparse.block_diag(tuple(adjacency_matrices),
                          dtype=np.float64,
                          format="csr")
    num_vertices = M.shape[0]

    # Load list of precalculated logarithms of prime numbers
    log_primes = log_pl.log_primes[0:num_vertices]

    # Color vector representing labels
    colors_0 = np.zeros(num_vertices, dtype=np.float64)
    # Color vector representing hashed attributes
    colors_1 = hashed_attributes

    # Get labels (colors) from all graph instances
    offset = 0
    graph_indices = []
    for g in graph_db:
        if use_labels == 1:
            # Use the node labels of the graph.
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            # Use vertex degrees as labels.
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()

        graph_indices.append((offset, offset + g.num_vertices() - 1))
        offset += g.num_vertices()

    # Map labels to [0, number_of_colors)
    if use_labels:
        _, colors_0 = np.unique(colors_0, return_inverse=True)

    for it in range(0, iterations + 1):
        if use_labels:
            # Map labels and hashed attributes into a single color vector
            if len(colors_1) > 0:
                colors_all = np.array([colors_0, colors_1])
            else:
                colors_all = np.array([colors_0])
            colors_all = [hash(tuple(row)) for row in colors_all.T]
            _, colors_all = np.unique(colors_all, return_inverse=True)
            max_all = int(np.amax(colors_all) + 1)

            # Append this iteration's color histogram to each graph's vector.
            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_all[index[0]:index[1] + 1],
                                            minlength=max_all)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_0 = compute_coloring(M, colors_0,
                                            log_primes[0:len(colors_0)])
                if len(colors_1) > 0:
                    colors_1 = compute_coloring(M, colors_1,
                                                log_primes[0:len(colors_1)])
        else:
            max_1 = int(np.amax(colors_1) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_1[index[0]:index[1] + 1],
                                            minlength=max_1)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_1 = compute_coloring(M, colors_1,
                                            log_primes[0:len(colors_1)])

    if not compute_gram_matrix:
        # Return sparse feature vectors so hash_graph_kernel can stack them.
        return sparse.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = sparse.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)
        gram_matrix = gram_matrix.toarray()

        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
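
# compute_coloring is the repository's WL refinement step. Below is a minimal
# sketch of the log-prime trick it presumably uses: each color is assigned a
# log-prime, and a vertex's new color is determined by its own color plus the
# sum of its neighbors' log-primes. Since sums of distinct log-primes are
# (numerically) distinct, equal sums imply equal neighbor multisets. The
# details here are an assumption, not the repository implementation.
import numpy as np


def compute_coloring_sketch(M, colors, log_primes):
    """One WL refinement step over the block-diagonal adjacency matrix M."""
    colors = np.asarray(colors, dtype=np.int64)
    aggregated = colors + M.dot(np.asarray(log_primes)[colors])
    # Re-map the resulting reals to consecutive integer colors.
    _, new_colors = np.unique(np.round(aggregated, decimals=10),
                              return_inverse=True)
    return new_colors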
import itertools as it

import numpy as np
import scipy.sparse as sparse
import scipy.sparse.csgraph as csg

import graph_tool.all as gt  # graph_tool.all re-exports graph_tool.spectral.adjacency


def shortest_path_kernel(graph_db, hashed_attributes, *params):
    # Positional parameters (cf. kernel_parameters_sp in the driver above).
    compute_gram_matrix = params[0]
    normalize_gram_matrix = params[1]
    use_labels = params[2]

    num_vertices = 0
    for g in graph_db:
        num_vertices += g.num_vertices()

    colors_0 = np.zeros(num_vertices, dtype=np.int64)

    # Get labels (colors) from all graph instances
    offset = 0
    graph_indices = []
    for g in graph_db:
        graph_indices.append((offset, offset + g.num_vertices() - 1))

        if use_labels == 1:
            # Use the node labels of the graph.
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            # Use vertex degrees as labels.
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()

        offset += g.num_vertices()
    _, colors_0 = np.unique(colors_0, return_inverse=True)

    colors_1 = hashed_attributes

    triple_indices = []
    triple_offset = 0
    triples = []

    # Solve the APSP problem for every graph in the graph database
    for i, g in enumerate(graph_db):
        a = gt.adjacency(g)
        M = csg.shortest_path(a, method='J', directed=False, unweighted=True)

        index = graph_indices[i]

        if use_labels:
            l = colors_0[index[0]:index[1] + 1]
        h = colors_1[index[0]:index[1] + 1]

        d = M.shape[0]

        # For each pair of distinct, connected vertices collect labels,
        # hashed attributes, and the shortest-path distance.
        pairs = list(it.product(range(d), repeat=2))
        if use_labels:
            t = [
                hash((l[k], h[k], l[j], h[j], M[k][j])) for (k, j) in pairs
                if k != j and not np.isinf(M[k][j])
            ]
        else:
            t = [
                hash((h[k], h[j], M[k][j])) for (k, j) in pairs
                if k != j and not np.isinf(M[k][j])
            ]

        triples.extend(t)
        triple_indices.append((triple_offset, triple_offset + len(t) - 1))
        triple_offset += len(t)

    # Map the hashed triples to consecutive feature indices.
    _, colors = np.unique(triples, return_inverse=True)
    num_colors = np.amax(colors) + 1

    # Compute feature vectors
    feature_vectors = []
    for i, index in enumerate(triple_indices):
        feature_vectors.append(
            np.bincount(colors[index[0]:index[1] + 1], minlength=num_colors))

    if not compute_gram_matrix:
        return sparse.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = sparse.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)
        gram_matrix = gram_matrix.toarray()

        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
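
# To make the feature map concrete: a standalone toy check (with made-up
# colors) of the triple -> feature-index mapping used above. Two graphs share
# a feature exactly when they contain a vertex pair with the same endpoint
# colors and the same shortest-path distance.
import numpy as np

# Each entry is hash((color_u, color_v, distance)).
triples = [hash((0, 1, 2)), hash((1, 0, 2)),  # graph 1: two pairs
           hash((0, 1, 2))]                   # graph 2: one matching pair
_, colors = np.unique(triples, return_inverse=True)
# colors now holds consecutive feature indices; identical triples share one.
print(colors)  # e.g. [0 1 0] (the order depends on the hash values)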
import os.path as pth

import pandas as pd
import scipy.sparse as sp

# Repository helpers assumed in scope: read_lib_svm, read_classes,
# normalize_gram_matrix, kernel_svm_evaluation, linear_svm_evaluation.


def main():
    path = "./GM/EXP/"
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["PROTEINS", True], ["PTC_FM", True],
               ["NCI1", True]]

    algorithms = ["LWLC2"]

    for a in algorithms:
        for d, use_labels in dataset:
            classes = read_classes(d)

            gram_matrices = []
            for i in range(0, 10):
                if not pth.exists(path + d + "__" + a + "_" + str(i) +
                                  ".gram"):
                    continue
                else:
                    gram_matrix, _ = read_lib_svm(path + d + "__" + a + "_" +
                                                  str(i) + ".gram")
                    gram_matrix = normalize_gram_matrix(gram_matrix)
                    gram_matrices.append(gram_matrix)

            if gram_matrices != []:
                acc, acc_train, s_1 = kernel_svm_evaluation(
                    gram_matrices, classes, num_repetitions=10)
                print(a, d, acc, acc_train, s_1)

    # Stop here: everything below is skipped while this early exit is in place.
    exit()

    path = "./GM/EXP/"
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["NCI1", True], ["NCI109", True],
               ["PROTEINS", True], ["PTC_FM", True], ["REDDIT-BINARY", False]]

    algorithms = [
        "WL1", "GR", "SP", "WLOA", "LWL2", "LWLP2", "WL2", "DWL2", "LWL3",
        "LWLP3", "WL3", "DWL3"
    ]

    for a in algorithms:
        for d, use_labels in dataset:
            classes = read_classes(d)

            gram_matrices = []
            for i in range(0, 10):
                if not pth.exists(path + d + "__" + a + "_" + str(i) +
                                  ".gram"):
                    continue
                else:
                    gram_matrix, _ = read_lib_svm(path + d + "__" + a + "_" +
                                                  str(i) + ".gram")
                    gram_matrix = normalize_gram_matrix(gram_matrix)
                    gram_matrices.append(gram_matrix)

            if gram_matrices != []:
                acc, acc_train, s_1 = kernel_svm_evaluation(
                    gram_matrices, classes, num_repetitions=10)
                print(a, d, acc, acc_train, s_1)

    path = "./GM/EXPSPARSE/"
    for name in [
            "Yeast", "YeastH", "UACC257", "UACC257H", "OVCAR-8", "OVCAR-8H"
    ]:
        for algorithm in ["LWL2", "LWLP2", "WL"]:
            # Collect feature matrices over all iterations
            all_feature_matrices = []
            classes = read_classes(name)
            # Only iteration 2 is evaluated here.
            for i in range(2, 3):
                # Load feature matrices stored as sparse triplets
                # (row, column, value).
                feature_vector = pd.read_csv(path + name + "__" + algorithm +
                                             "_" + str(i),
                                             header=1,
                                             delimiter=" ").to_numpy()

                feature_vector = feature_vector.astype(int)
                # Convert 1-based indices to 0-based; shift values by one.
                feature_vector[:, 0] = feature_vector[:, 0] - 1
                feature_vector[:, 1] = feature_vector[:, 1] - 1
                feature_vector[:, 2] = feature_vector[:, 2] + 1

                xmax = int(feature_vector[:, 0].max())
                ymax = int(feature_vector[:, 1].max())

                feature_vector = sp.coo_matrix(
                    (feature_vector[:, 2],
                     (feature_vector[:, 0], feature_vector[:, 1])),
                    shape=(xmax + 1, ymax + 1))
                feature_vector = feature_vector.tocsr()

                all_feature_matrices.append(feature_vector)

            acc, s_1 = linear_svm_evaluation(all_feature_matrices,
                                             classes,
                                             num_repetitions=3,
                                             all_std=False)
            print(name, algorithm, acc, s_1)
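
# For orientation, read_lib_svm/write_lib_svm deal with LIBSVM's precomputed-
# kernel format. Below is a minimal sketch of a writer, assuming the standard
# layout ('<label> 0:<1-based row index> 1:K(i,1) ... n:K(i,n)'); the function
# name with the _sketch suffix is ours, not the repository's.
def write_lib_svm_sketch(gram_matrix, classes, filename):
    """Write a gram matrix in LIBSVM's precomputed-kernel format."""
    n = gram_matrix.shape[0]
    with open(filename, "w") as f:
        for i in range(n):
            entries = " ".join(f"{j + 1}:{gram_matrix[i, j]}"
                               for j in range(n))
            f.write(f"{classes[i]} 0:{i + 1} {entries}\n")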
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import kernel_svm_evaluation

# Download dataset.
classes = dp.get_dataset("ENZYMES")
use_labels, use_edge_labels = True, False

all_matrices = []
# Compute 1-WL kernel for 1 to 5 iterations.
for i in range(1, 6):
    # Use node labels and no edge labels.
    gm = kb.compute_wl_1_dense("ENZYMES", i, use_labels, use_edge_labels)
    # Apply cosine normalization.
    gm = aux.normalize_gram_matrix(gm)
    all_matrices.append(gm)

# Perform 10 repetitions of 10-CV using LIBSVM.
print(
    kernel_svm_evaluation(all_matrices,
                          classes,
                          num_repetitions=10,
                          all_std=True))
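
# For context, a minimal sketch of what an evaluation like
# kernel_svm_evaluation typically does with a list of gram matrices: 10-fold
# CV where, per fold, the gram matrix and SVM parameter C are selected on a
# validation split. This is an illustration under assumed conventions, not
# the repository implementation (which also reports standard deviations).
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.svm import SVC


def kernel_svm_cv_sketch(matrices, classes, c_grid=(1e-3, 1e-1, 1e1, 1e3)):
    """Simplified 10-fold CV over precomputed kernels."""
    classes = np.asarray(classes)
    accuracies = []
    for train, test in StratifiedKFold(n_splits=10, shuffle=True).split(
            np.zeros(len(classes)), classes):
        # Hold out 10% of the training fold for model selection.
        tr, va = train_test_split(train, test_size=0.1,
                                  stratify=classes[train])
        best = (-1.0, None, None)
        for K in matrices:
            for c in c_grid:
                clf = SVC(C=c, kernel="precomputed").fit(
                    K[np.ix_(tr, tr)], classes[tr])
                score = clf.score(K[np.ix_(va, tr)], classes[va])
                if score > best[0]:
                    best = (score, K, c)
        # Retrain the selected (matrix, C) pair on the full training fold.
        _, K, c = best
        clf = SVC(C=c, kernel="precomputed").fit(K[np.ix_(train, train)],
                                                 classes[train])
        accuracies.append(clf.score(K[np.ix_(test, train)], classes[test]))
    return np.mean(accuracies), np.std(accuracies)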
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import (kernel_svm_evaluation,
                                                linear_svm_evaluation)


def main():
    ### Smaller datasets using LIBSVM.
    datasets = [["ENZYMES", True], ["IMDB-BINARY", False],
                ["IMDB-MULTI", False], ["NCI1", True], ["PROTEINS", True],
                ["REDDIT-BINARY", False]]

    # Number of repetitions of 10-CV.
    num_reps = 10

    results = []
    for dataset, use_labels in datasets:
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WL1 " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WL1 " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # WLOA kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wloa_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WLOA " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WLOA " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_dense(dataset, use_labels, False)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "GR " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "GR " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_dense(dataset, use_labels)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "SP " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "SP " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    # Number of repetitions of 10-CV.
    num_reps = 3

    ### Larger datasets using LIBLINEAR with edge labels.
    datasets = [["MOLT-4", True, True], ["Yeast", True, True],
                ["MCF-7", True, True], ["github_stargazers", False, False],
                ["reddit_threads", False, False]]

    for d, use_labels, use_edge_labels in datasets:
        dataset = d
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_sparse(dataset, i, use_labels,
                                        use_edge_labels)
            gm_n = aux.normalize_feature_vector(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(d + " " + "WL1SP " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "WL1SP " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_sparse(dataset, use_labels, use_edge_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(d + " " + "GRSP " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GRSP " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_sparse(dataset, use_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(d + " " + "SPSP " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "SPSP " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

    for r in results:
        print(r)
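
# The larger datasets above go through linear_svm_evaluation, i.e. a linear
# SVM (LIBLINEAR-style) on explicit sparse feature vectors rather than
# precomputed kernels. A tiny self-contained illustration of that path, with
# a random stand-in for one normalized feature matrix from kb.compute_*_sparse:
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

X = csr_matrix(np.random.rand(100, 50))   # toy sparse feature matrix
y = np.random.randint(0, 2, 100)          # toy binary class labels

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1, stratify=y)
clf = LinearSVC(C=1.0).fit(X_tr, y_tr)    # C would be grid-searched in practice
print(clf.score(X_te, y_te))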