def test_multiscale_laplacian_pd():
    """Random input test for the Multiscale Laplacian kernel [n_jobs=-1/generic-wrapper]."""
    # Initialise kernel
    train, test = generate_dataset(n_graphs=30,
                                   r_vertices=(5, 10),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=10,
                                   random_state=rs,
                                   features=('na', 5))
    gk = GraphKernel(kernel="ML", verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception
def test_neighborhood_subgraph_pairwise_distance():
    """Random input test for the Neighborhood Subgraph Pairwise Distance kernel [+ generic-wrapper]."""
    train, test = generate_dataset(n_graphs=100,
                                   r_vertices=(5, 10),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=40,
                                   random_state=rs,
                                   features=('nl', 5, 'el', 4))
    nspd_kernel = NeighborhoodSubgraphPairwiseDistance(verbose=verbose, normalize=normalize)
    gk = GraphKernel(kernel="NSPD", verbose=verbose, normalize=normalize)
    try:
        nspd_kernel.fit_transform(train)
        nspd_kernel.transform(test)
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception
adjacency_matrices = np.array([[cell['am']] for cell in dataset["G"][0]])
labels = np.array([label[0] for label in dataset["labels"]])
X = adjacency_matrices
y = labels
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=train_size,
                                                    test_size=test_size,
                                                    shuffle=True,
                                                    random_state=42)

randomWalkKernel = GraphKernel(kernel={"name": "random_walk", "with_labels": False},
                               normalize=True)
graphletKernel = GraphKernel(kernel={"name": "graphlet_sampling"}, normalize=True)
shortestPathKernel = GraphKernel(kernel={"name": "shortest_path"}, normalize=True)

# Calculate the kernel matrix for the random walk kernel.
K_train = randomWalkKernel.fit_transform(X_train)
K_test = randomWalkKernel.transform(X_test)

# Debugging helpers (disabled):
# nanel = 0
# print(K_train[0][79 - 5])
# print(len(K_train))
# print(len(K_train[0]))
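# The graphlet and shortest-path kernels above are constructed but never used
# in this excerpt; a minimal sketch of the analogous calls (assumed, not part
# of the original):
K_train_graphlet = graphletKernel.fit_transform(X_train)
K_test_graphlet = graphletKernel.transform(X_test)
K_train_sp = shortestPathKernel.fit_transform(X_train)
K_test_sp = shortestPathKernel.transform(X_test)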
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='WL subtree kernel')
    parser.add_argument('--dataset', type=str, default="MUTAG",
                        help='name of dataset (default: MUTAG)')
    parser.add_argument('--seed', type=int, default=0,
                        help='random seed for splitting the dataset into 10 folds (default: 0)')
    parser.add_argument('--fold_idx', type=int, default=0,
                        help='the index of the fold in 10-fold validation. Should be less than 10.')
    parser.add_argument('--iter', type=int, default=5,
                        help='number of iterations for the WL kernel')
    parser.add_argument('--normalize', action="store_true",
                        help='whether to normalize the features')
    parser.add_argument('--filename', type=str, default="", help='output file')
    args = parser.parse_args()

    np.random.seed(0)

    graphs, num_classes = load_data(args.dataset, False)

    # 10-fold cross validation; consider the particular fold.
    train_graphs, test_graphs = separate_data(graphs, args.seed, args.fold_idx)

    # SVM hyper-parameter to tune
    C_list = [0.01, 0.1, 1, 10, 100]

    X_train, y_train = convert(train_graphs)
    X_test, y_test = convert(test_graphs)

    wl_kernel = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": args.iter},
                                    {"name": "subtree_wl"}],
                            normalize=args.normalize)
    K_train = wl_kernel.fit_transform(X_train)
    K_test = wl_kernel.transform(X_test)

    train_acc = []
    test_acc = []
    for C in C_list:
        clf = SVC(kernel='precomputed', C=C)
        clf.fit(K_train, y_train)
        y_pred_test = clf.predict(K_test)
        y_pred_train = clf.predict(K_train)
        train_acc.append(accuracy_score(y_train, y_pred_train) * 100)
        test_acc.append(accuracy_score(y_test, y_pred_test) * 100)

    print(train_acc)
    print(test_acc)

    if args.filename != "":
        np.savetxt(args.filename, np.array([train_acc, test_acc]).transpose())
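# `convert` is defined elsewhere; a plausible sketch, assuming each element of
# train_graphs/test_graphs is an object carrying a networkx graph `g`, integer
# node tags `node_tags`, and a class `label` (all hypothetical field names):
def convert(graph_list):
    X, y = [], []
    for gr in graph_list:
        adjacency = {u: {v: 1.0 for v in gr.g.neighbors(u)} for u in gr.g.nodes()}
        node_labels = {u: tag for u, tag in zip(gr.g.nodes(), gr.node_tags)}
        X.append([adjacency, node_labels])
        y.append(gr.label)
    return X, y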
import numpy as np
from time import time
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

from grakel import datasets
from grakel import GraphKernel

# Load the MUTAG dataset from:
# https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets
# the largest collection of benchmark datasets for graph kernels.
mutag = datasets.fetch_dataset("MUTAG", verbose=False)
G, y = mutag.data, mutag.target

C_grid = (10. ** np.arange(1, 10, 1) / len(G)).tolist()
n_folds = 10

# GraphKernel implements fit/transform, so it composes with GridSearchCV in a
# pipeline: each cross-validation fold refits the kernel and tunes C.
estimator = make_pipeline(
    GraphKernel(kernel=dict(name="shortest_path"), normalize=True),
    GridSearchCV(svm.SVC(kernel='precomputed'), dict(C=C_grid),
                 scoring='accuracy'))
acc = accuracy_score(y, cross_val_predict(estimator, G, y, cv=n_folds))
print("Accuracy:", str(round(acc * 100, 2)) + "%")
def evaluate_kernel(graphs, graph_labels, kernel_def, label_requests,
                    n_folds=10, seed=None):
    """Evaluate a graph kernel with k-fold cross-validation for each requested
    vertex/edge labeling, returning true and predicted target values."""
    # Print progress
    print("Kernel: {}".format(kernel_def))

    # Just a few sanity checks
    assert len(graphs) == len(graph_labels)

    # Initialize graph kernel
    gk = GraphKernel(kernel=kernel_def, normalize=True)

    # Train kernel on each set of relabeled graphs
    results = []
    for lr in label_requests:
        # Print progress
        print("Vertex/Edge Labeling: {}".format(lr))

        # Convert the base graphs into GraKeL-compatible representations with
        # the requested vertex and edge labels, if any.
        relabeled_graphs = convert_graphs(graphs, lr)
        print("# relabeled graphs: {}".format(len(relabeled_graphs)))

        # Define lists to track non-determinism fraction prediction results
        # over multiple folds
        true_nd_vals = []
        pred_nd_vals = []
        mse_vals = []

        # Define training and testing sets
        graph_indices = list(range(len(graph_labels)))
        kf = KFold(n_splits=n_folds, random_state=seed, shuffle=True)
        for split_idx, (train_indices, test_indices) in enumerate(kf.split(graph_indices)):
            # Print progress
            print("Running split {}/{}".format(split_idx, n_folds))

            # Get training and testing graphs
            g_train = [relabeled_graphs[i] for i in train_indices]
            g_test = [relabeled_graphs[i] for i in test_indices]
            print("# training graphs: {}".format(len(g_train)))
            print("# test graphs: {}".format(len(g_test)))

            # Get the non-determinism fraction values for the training and
            # testing graphs
            y_train = [graph_labels[i] for i in train_indices]
            y_test = [graph_labels[i] for i in test_indices]

            # Compute the graph kernel matrix
            k_train, k_test = compute_kernel_matrix(g_train, g_test, gk)
            print("K-train shape: {}".format(k_train.shape))
            print("K-test shape: {}".format(k_test.shape))

            # Train an SVM regressor using the precomputed kernel matrix
            y_pred = train_model(k_train, k_test, y_train)

            # Print progress
            print("Done with split {}/{}".format(split_idx, n_folds))
            print()

            # Aggregate results for this fold
            true_nd_vals += list(y_test)
            pred_nd_vals += list(y_pred)

        # Aggregate results for this labeling
        results.append({"true": true_nd_vals, "pred": pred_nd_vals})
    return results
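# `compute_kernel_matrix` and `train_model` are defined elsewhere; minimal
# sketches of plausible implementations (assumed, not from the original):
from sklearn.svm import SVR

def compute_kernel_matrix(g_train, g_test, gk):
    # Fit the kernel on the training graphs, then evaluate the test graphs
    # against them; rows of k_test index test graphs, columns train graphs.
    k_train = gk.fit_transform(g_train)
    k_test = gk.transform(g_test)
    return k_train, k_test

def train_model(k_train, k_test, y_train):
    # Support-vector regression on the precomputed kernel matrices.
    reg = SVR(kernel='precomputed')
    reg.fit(k_train, y_train)
    return reg.predict(k_test)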
def untangle(graph, k_hop, with_data: bool = True, with_call: bool = True,
             with_name: bool = True):
    seeds, list_of_graphs = deltaPDG_to_list_of_Graphs(graph, khop_k=k_hop)
    wl_subtree = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": 10},
                                     {"name": "subtree_wl"}],
                             normalize=True)
    if len(list_of_graphs) > 0:
        similarities = defaultdict(lambda: 0.0)
        for g1, g2 in itertools.combinations(list_of_graphs, 2):
            # Each graph has to be converted to {Graph, Node_Labels, Edge_Labels}
            wl_subtree.fit([graph_to_grakel(g1, with_data, with_call, with_name)])
            similarity = wl_subtree.transform(
                [graph_to_grakel(g2, with_data, with_call, with_name)])[0][0]
            similarities[(list_of_graphs.index(g1),
                          list_of_graphs.index(g2))] = similarity

        n = len(list_of_graphs)
        affinity = np.zeros(shape=(scipy.special.comb(n, 2, exact=True),))
        args = list(enumerate(itertools.combinations(range(n), 2)))
        # Use all available cores but one (at least one).
        with ThreadPool(processes=max(os.cpu_count() - 1, 1)) as wp:
            for k, value in wp.imap_unordered(
                    lambda i: (i[0], similarities[(i[-1][0], i[-1][1])]), args):
                affinity[k] += 1 - value  # affinity is a distance, so (1 - sim)

        cluster = AgglomerativeClustering(n_clusters=None,
                                          distance_threshold=0.5,
                                          affinity='precomputed',
                                          linkage='complete')
        if len(affinity) < 2:
            if len(affinity) == 1:
                labels = np.asarray([0, 0]) if affinity[0] <= 0.5 else np.asarray([0, 1])
            else:
                labels = np.asarray([0])
        else:
            labels = cluster.fit_predict(scipy.spatial.distance.squareform(affinity))
    else:
        labels = None

    label = list()
    for node, data in graph.nodes(data=True):
        if 'color' in data.keys():
            i = seeds.index(node) if node in seeds else -1
            if labels is not None and i != -1:
                data['label'] = '%d: ' % labels[i] + data['label']
                label.append(labels[i])
                graph.add_node(node, **data)
            else:
                data['label'] = '-1: ' + data['label']
                label.append(-1)
                graph.add_node(node, **data)
    return graph
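# `graph_to_grakel` is defined elsewhere; a minimal hypothetical sketch,
# assuming nodes carry a 'label' attribute and edges a 'style' attribute that
# names their kind ('data', 'call', 'name'):
from grakel import Graph

def graph_to_grakel(g, with_data=True, with_call=True, with_name=True):
    kinds = {'data': with_data, 'call': with_call, 'name': with_name}
    adjacency = {u: {} for u in g.nodes()}
    for u, v, d in g.edges(data=True):
        if kinds.get(d.get('style'), True):  # keep only enabled edge kinds
            adjacency[u][v] = 1.0
    node_labels = {u: d.get('label', '') for u, d in g.nodes(data=True)}
    return Graph(adjacency, node_labels)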
import networkx as nx

from grakel import GraphKernel
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder

from rpcc.create_features import TextFeaturesExtractor
from rpcc.load_data import DataLoader

sp_kernel = GraphKernel(kernel={"name": "shortest_path", 'with_labels': False},
                        normalize=True)

dl_obj = DataLoader()
dl_obj.run_data_preparation()

# Create a label encoder instance in order to convert the classes to integer codes
lb = LabelEncoder()

# Extract the train targets
y_train = dl_obj.y_train
# Convert the train targets to integer codes
y_train_one_hot = lb.fit_transform(y_train)

# Extract the validation targets
y_val = dl_obj.y_val
# Convert the validation targets to integer codes
y_val_one_hot = lb.transform(y_val)
    bone_atoms_list={'Ph': ['1a'], 'C': ['2a'], 'O': ['3a']},
    side_atoms_list={'H': ['2a']},
    additional_or_special_bonds_list=[['2a', '3a', 'double']])
y20 = 98

#%%
from kernelSVR import kernelSVR

ks = kernelSVR()
gk = GraphKernel(kernel={"name": "multiscale_laplacian",
                         "which": "fast",
                         "L": 1,
                         "P": 10,
                         "N": 10})
#ks.add_kernel(gk)

ignoreH = False
expandPh = True
mx_use = toGraKelList(mx_train, ignoreH, expandPh)
#mx_use = toGraKelList(mx_full, ignoreH, expandPh)
ks.fit_kernel(mx_use)  #, my_train)
ks.fit_SVRs(my_train)

mx_use_test = toGraKelList(mx_test, ignoreH, expandPh)
#mx_use_test = mx_use
ks_pred_all = ks.predict(mx_use_test, 'all')
ks_pred = ks.predict(mx_use_test)
from datasets_utils import load_shock_dataset, load_ppi_dataset
from utils import compute_distance_matrix

X, y = load_shock_dataset()
# X, y = load_ppi_dataset()

## KFOLD
# Shuffle data
idx = np.random.permutation(len(X))
X, y = X[idx], y[idx]

# Initialize the chosen kernel
spk = GraphKernel(kernel={"name": "shortest_path", "with_labels": False},
                  normalize=True)

# Split indexes according to KFold with k = 10
k = 10
kf = KFold(n_splits=k)

# Initialize score lists
scores1 = []
scores2 = []

for train_index, test_index in kf.split(X):
    # Split train and test of the k-fold
    X_train, X_test = X[train_index], X[test_index]
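    # The snippet is truncated here; a plausible completion of the fold body,
    # mirroring the kernel-SVM evaluation used elsewhere in this project
    # (assumed, not from the original):
    y_train, y_test = y[train_index], y[test_index]
    K_train = spk.fit_transform(X_train)
    K_test = spk.transform(X_test)
    clf = svm.SVC(kernel='precomputed', C=1.0)
    clf.fit(K_train, y_train)
    scores1.append(accuracy_score(y_test, clf.predict(K_test)))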
kf = StratifiedKFold(n_splits=10, shuffle=True)
accs = []
for train_index, test_index in kf.split(G, y):
    start = time()
    G_train = [G[idx] for idx in train_index]
    y_train = [y[idx] for idx in train_index]
    G_test = [G[idx] for idx in test_index]
    y_test = [y[idx] for idx in test_index]

    # Initialise a Weisfeiler-Lehman kernel with a subtree (dirac) base kernel.
    gk = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": niter},
                             {"name": "subtree_wl"}],
                     normalize=True)

    # Calculate the kernel matrix.
    K_train = gk.fit_transform(G_train)
    K_test = gk.transform(G_test)

    # Initialise an SVM and fit.
    clf = svm.SVC(kernel='precomputed', C=1)
    params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    clf = GridSearchCV(svm.SVC(kernel='precomputed'), params,
                       cv=10, scoring='accuracy',
# Load the MUTAG dataset from:
# https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets
# the largest collection of benchmark datasets for graph kernels.
mutag = datasets.fetch_dataset("MUTAG", verbose=False)
G, y = mutag.data, mutag.target

# Train-test split of graph data
G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1,
                                                    random_state=42)

start = time()
# Initialise a Weisfeiler-Lehman kernel with a subtree base kernel.
gk = GraphKernel(kernel=[{"name": "WL", "n_iter": 5}, "ST-WL"], normalize=True)

# Calculate the kernel matrix.
K_train = gk.fit_transform(G_train)
K_test = gk.transform(G_test)
end = time()

# Initialise an SVM and fit.
clf = svm.SVC(kernel='precomputed', C=1)
clf.fit(K_train, y_train)

# Predict and test.
y_pred = clf.predict(K_test)

# Calculate the accuracy of classification.
acc = accuracy_score(y_test, y_pred)
def compute_kernel_matrix_grakel(event_graphs, kernel_params):
    """Compute the pairwise kernel matrix over `event_graphs` with GraKeL."""
    kernel = GraphKernel(kernel_params)
    kernel_mat = kernel.fit_transform(event_graphs)
    return kernel_mat
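# Example call (a sketch; the graphs and parameters below are made up):
# each graph is [adjacency_matrix, node_label_dict] in GraKeL's list format.
example_graphs = [
    [[[0, 1], [1, 0]], {0: 'a', 1: 'b'}],
    [[[0, 1, 1], [1, 0, 0], [1, 0, 0]], {0: 'a', 1: 'b', 2: 'b'}],
]
params = [{"name": "weisfeiler_lehman", "n_iter": 3}, {"name": "subtree_wl"}]
K = compute_kernel_matrix_grakel(example_graphs, params)  # 2 x 2 matrix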
G_train, G_test = list(), list()
y_train, y_test = list(), list()
for (i, (g, t)) in enumerate(zip(G, y)):
    if len(tri) and i == tri[0]:
        G_train.append(g)
        y_train.append(t)
        tri.pop(0)
    elif len(tei) and i == tei[0]:
        G_test.append(g)
        y_test.append(t)
        tei.pop(0)

start = time()
gk = GraphKernel(kernel={"name": "multiscale_laplacian",
                         "which": "fast",
                         "L": 1,
                         "P": 10,
                         "n_samples": 10})

# Calculate the kernel matrix.
K_train = gk.fit_transform(G_train)
K_test = gk.transform(G_test)
end = time()

# Cross validation on C, variable
acc = 0
for c in C_grid:
    # Initialise an SVM and fit.
    clf = svm.SVC(kernel='precomputed', C=c)
    # Fit on the train Kernel
from numpy import array
from grakel import Graph, GraphKernel

if __name__ == '__main__':
    H2O = Graph([[0, 1, 1], [1, 0, 0], [1, 0, 0]], {0: 'O', 1: 'H', 2: 'H'})
    H3O = Graph([[0, 1, 1, 1], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]],
                {0: 'O', 1: 'H', 2: 'H', 3: 'H'})

    H2Od = dict()
    H2Od[0] = Graph({'a': {'b': 1., 'c': 1.}, 'b': {'a': 1}, 'c': {'a': 1}})
    H2Od[1] = Graph({('a', 'b'): 1., ('a', 'c'): 1., ('c', 'a'): 1., ('b', 'a'): 1.})

    H2Ot = array([[0, 1, 1], [1, 0, 0], [1, 0, 0]])
    H2O_labels = {0: 'O', 1: 'H', 2: 'H'}
    H2O_edge_labels = {(0, 1): 'pcb', (1, 0): 'pcb', (0, 2): 'pcb', (2, 0): 'pcb'}
    adj_graph = Graph(H2Ot, H2O_labels, H2O_edge_labels, "all")

    #==========================================================================
    sp_kernel = GraphKernel(kernel={"name": "shortest_path"}, normalize=True)
    kernel_m = sp_kernel.fit_transform([adj_graph])
    sim = sp_kernel.transform([H3O])
    print("the kernel_m is: {m}\nthe sim is: {s}".format(m=kernel_m, s=sim))
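    # Note: with normalize=True, transform() returns the kernel value between
    # each new graph and each fitted graph, scaled so that k(x, x) = 1. A
    # sketch of the two-graph case (assumed, not part of the original):
    K_both = sp_kernel.fit_transform([adj_graph, H3O])  # 2 x 2 symmetric matrix
    print(K_both)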
G, y = mutag.data, mutag.target
C_grid = (10. ** np.arange(4, 10, 1) / len(G)).tolist()

niter = 10
kernel_names = ["lovasz_theta", "svm_theta"]
stats = {k: {"acc": list(), "time": list()} for k in kernel_names}

for i in range(niter):
    # Train-test split of graph data
    G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1)

    for kernel_name in kernel_names:
        start = time()
        # Initialise the current kernel.
        gk = GraphKernel(kernel={"name": kernel_name}, normalize=True)

        # Calculate the kernel matrix.
        K_train = gk.fit_transform(G_train)
        K_test = gk.transform(G_test)
        end = time()

        # Cross validation on C, variable
        acc = 0
        for c in C_grid:
            # Initialise an SVM and fit.
            clf = svm.SVC(kernel='precomputed', C=c)
            # Fit on the train Kernel
            clf.fit(K_train, y_train)
def test_subgraph_matching_pd():
    """Random input test for the Subgraph Matching kernel [n_jobs=-1/generic-wrapper]."""
    # node-label/edge-label
    train, test = generate_dataset(n_graphs=100,
                                   r_vertices=(10, 20),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=40,
                                   random_state=rs,
                                   features=('nl', 3, 'el', 4))
    gk = GraphKernel(kernel={"name": "SM"}, verbose=verbose,
                     normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception

    # node-label/edge-attribute
    train, test = generate_dataset(n_graphs=50,
                                   r_vertices=(5, 10),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=20,
                                   random_state=rs,
                                   features=('nl', 3, 'ea', 5))
    gk = GraphKernel(kernel={"name": "SM", "ke": np.dot},
                     verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception

    # node-attribute/edge-label
    train, test = generate_dataset(n_graphs=50,
                                   r_vertices=(5, 10),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=20,
                                   random_state=rs,
                                   features=('na', 4, 'el', 3))
    gk = GraphKernel(kernel={"name": "SM", "kv": np.dot},
                     verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception

    # node-attribute/edge-attribute
    train, test = generate_dataset(n_graphs=50,
                                   r_vertices=(5, 10),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=20,
                                   random_state=rs,
                                   features=('na', 4, 'ea', 6))
    gk = GraphKernel(kernel={"name": "SM", "kv": np.dot, "ke": np.dot},
                     verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception
from Utils import *
from numpy import array
from grakel import Graph, GraphKernel, graph_from_networkx

if __name__ == '__main__':
    low_version = r"F:\GraphSim\jsondata\V1.0"
    high_version = r"F:\GraphSim\jsondata\V1.1"
    base_file_list = []
    target_file_list = []
    pairfileList = []
    getfilePath(low_version, base_file_list)
    getfilePath(high_version, target_file_list)
    pairfileList = getpairFile(base_file_list, target_file_list)

    for pair in pairfileList:
        basefile = pair[0]
        targetfile = pair[1]
        g1 = ParseFile(basefile)
        g2 = ParseFile(targetfile)
        # _basefileGraph and _targetfileGraph are the graphs of the nodes to be compared
        _basefileGraph = g1.connectFile()
        _targetfileGraph = g2.connectFile()
        adj1, node_label1, edge_label1 = getadjlist(_basefileGraph)
        adj2, node_label2, edge_label2 = getadjlist(_targetfileGraph)

        sp_kernel = GraphKernel(kernel={"name": "shortest_path"}, normalize=True)
        g1 = Graph(adj1, node_label1, edge_label1)
        g2 = Graph(adj2, node_label2, edge_label2)
        tp = sp_kernel.fit_transform([g1])
        sim = sp_kernel.transform([g2])
        print("kernel_Done!")
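# `getadjlist` comes from Utils; a minimal sketch of one plausible
# implementation for a networkx graph whose nodes and edges carry a 'label'
# attribute (assumed field names):
def getadjlist(g):
    adj = {u: {v: 1.0 for v in g.neighbors(u)} for u in g.nodes()}
    node_labels = {u: d.get('label', str(u)) for u, d in g.nodes(data=True)}
    edge_labels = {(u, v): d.get('label', '') for u, v, d in g.edges(data=True)}
    return adj, node_labels, edge_labels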
def test_propagation_pd():
    """Random input test for the Propagation kernel [n_jobs=-1/generic-wrapper]."""
    train, test = generate_dataset(n_graphs=100,
                                   r_vertices=(10, 20),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(float("1e-5"), 10),
                                   n_graphs_test=40,
                                   random_state=rs,
                                   features=('nl', 4))
    gk = GraphKernel(kernel="PR", verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception

    train, test = generate_dataset(n_graphs=100,
                                   r_vertices=(10, 20),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(float("1e-5"), 10),
                                   n_graphs_test=40,
                                   random_state=rs,
                                   features=('na', 5))
    gk = GraphKernel(kernel={"name": "PR", "with_attributes": True},
                     verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception
def test_shortest_path_pd():
    """Random input test for the Shortest Path kernel [n_jobs=-1 (for attributed)/decorator]."""
    train, test = generate_dataset(n_graphs=100,
                                   r_vertices=(10, 20),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=40,
                                   random_state=rs,
                                   features=('nl', 3))
    gk = GraphKernel(kernel="SP", verbose=verbose, normalize=normalize)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception

    train, test = generate_dataset(n_graphs=50,
                                   r_vertices=(5, 10),
                                   r_connectivity=(0.4, 0.8),
                                   r_weight_edges=(1, 1),
                                   n_graphs_test=20,
                                   random_state=rs,
                                   features=('na', 5))
    gk = GraphKernel(kernel={"name": "SP", "as_attributes": True},
                     verbose=verbose, normalize=normalize, n_jobs=-1)
    try:
        gk.fit_transform(train)
        gk.transform(test)
        assert True
    except Exception as exception:
        assert False, exception
def worker(work):
    for graph_location in tqdm(work, leave=False):
        chain = os.path.basename(os.path.dirname(os.path.dirname(graph_location)))
        q = int(os.path.basename(os.path.dirname(graph_location)))
        graph = obj_dict_to_networkx(read_graph_from_dot(graph_location))
        graph = remove_all_except(graph, edges_kept)
        if len(graph.nodes) == 0:
            continue

        t0 = time.perf_counter()
        for i in range(times):
            seeds, list_of_graphs = deltaPDG_to_list_of_Graphs(graph, khop_k=k_hop)
            wl_subtree = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": 10},
                                             {"name": "subtree_wl"}],
                                     normalize=True)
            if len(list_of_graphs) > 0:
                similarities = defaultdict(lambda: 0.0)
                for g1, g2 in itertools.combinations(list_of_graphs, 2):
                    # Each graph has to be converted to {Graph, Node_Labels, Edge_Labels}
                    wl_subtree.fit([graph_to_grakel(g1, with_data, with_call, with_name)])
                    similarity = wl_subtree.transform(
                        [graph_to_grakel(g2, with_data, with_call, with_name)])[0][0]
                    similarities[(list_of_graphs.index(g1),
                                  list_of_graphs.index(g2))] = similarity

                n = len(list_of_graphs)
                affinity = np.zeros(shape=(scipy.special.comb(n, 2, exact=True),))
                args = list(enumerate(itertools.combinations(range(n), 2)))
                # Use all available cores but one (at least one).
                with ThreadPool(processes=max(os.cpu_count() - 1, 1)) as wp:
                    for k, value in wp.imap_unordered(
                            lambda i: (i[0], similarities[(i[-1][0], i[-1][1])]), args):
                        affinity[k] += 1 - value  # affinity is a distance, so (1 - sim)

                cluster = AgglomerativeClustering(n_clusters=None,
                                                  distance_threshold=0.5,
                                                  affinity='precomputed',
                                                  linkage='complete')
                if len(affinity) < 2:
                    if len(affinity) == 1:
                        labels = np.asarray([0, 0]) if affinity[0] <= 0.5 else np.asarray([0, 1])
                    else:
                        labels = np.asarray([0])
                else:
                    labels = cluster.fit_predict(scipy.spatial.distance.squareform(affinity))
            else:
                labels = None
        t1 = time.perf_counter()
        time_ = (t1 - t0) / times

        truth = list()
        label = list()
        for node, data in graph.nodes(data=True):
            if 'color' in data.keys():
                if 'community' in data.keys():
                    truth.append(int(data['community']))
                i = seeds.index(node) if node in seeds else -1
                if labels is not None and i != -1:
                    data['label'] = '%d: ' % labels[i] + data['label']
                    label.append(labels[i])
                    graph.add_node(node, **data)
                else:
                    data['label'] = '-1: ' + data['label']
                    label.append(-1)
                    graph.add_node(node, **data)

        nx.drawing.nx_pydot.write_dot(graph, graph_location[:-4] + '_output_wl_%d.dot' % k_hop)

        truth = np.asarray(truth)
        label = np.asarray(label)
        acc, overlap = evaluate(truth[label > -1], label[label > -1],
                                q=1 if len(label) == 0 else np.max(label) + 1)
        with open('./out/%s/wl_%s_%d_results_%s.csv'
                  % (repository_name, edges_kept, k_hop, suffix), 'a') as f:
            f.write(chain + ',' + str(q) + ',' + str(acc) + ','
                    + str(overlap) + ',' + str(time_) + '\n')
There are many kinds of graph kernels. They commonly fall into three
categories: tree-based, path-based, and subgraph-based. The WL kernel can be
built on trees, on paths, or on subgraphs.

Informally, a kernel is a function of two objects that quantifies their
similarity. Mathematically, it corresponds to an inner product in a
reproducing kernel Hilbert space.

Graph kernels are all based on R-convolution kernel theory:
Convolution Kernels on Discrete Structures, David Haussler, 1999
'''
from grakel import GraphKernel, datasets

wl_kernel = GraphKernel(kernel=[{"name": "weisfeiler_lehman"},
                                {"name": "subtree_wl"}])

H2O = [[[[0, 1, 1], [1, 0, 0], [1, 0, 0]], {0: 'O', 1: 'H', 2: 'H'}]]
H3O = [[[[0, 1, 1, 1], [1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0]],
        {0: 'O', 1: 'H', 2: 'H', 3: 'H'}]]
two = [H2O[0], H3O[0]]

# k1 = wl_kernel.fit_transform(H2O)
# print(k1)
# k2 = wl_kernel.transform(H3O)
# print(k2)
# k3 = wl_kernel.fit_transform(two)
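# A minimal usage sketch (mirrors the commented-out calls above): fit on H2O,
# then evaluate H3O against it; with a single fitted graph both results are 1x1.
k_fit = wl_kernel.fit_transform(H2O)   # kernel of H2O with itself
k_new = wl_kernel.transform(H3O)       # kernel between H3O and the fitted H2O
print(k_fit, k_new)
k_pair = wl_kernel.fit_transform(two)  # full 2 x 2 kernel matrix
print(k_pair)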
def spk_isomap(X, y, k, KNNstart, KNNend, Dstart, Dend, svmC):
    filename = "accuracy.txt"
    myfile = open(filename, 'a')

    # Add info to file
    myfile.write('SP Isomap accuracy: K = %d-%d, D = %d-%d, C = %d, K-fold = %d\n'
                 % (KNNstart, KNNend, Dstart, Dend, svmC, k))

    KNN = list(range(KNNstart, KNNend + 1))
    D = list(range(Dstart, Dend + 1))
    KNNrange = len(KNN)
    Drange = len(D)

    kf = KFold(n_splits=k)
    Z = np.zeros(shape=(len(D), len(KNN)))

    for knn in range(len(KNN)):
        for d in range(len(D)):
            # Reset the fold scores for this (knn, d) cell.
            scores = []
            for train_index, test_index in kf.split(X):
                kernel = GraphKernel(kernel={"name": "shortest_path",
                                             "with_labels": False},
                                     normalize=True)
                # Split train and test of the k-fold
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                # Calculate the kernel matrices.
                K_train = kernel.fit_transform(X_train)
                K_test = kernel.transform(X_test)

                # Compute the distance matrices
                D_train = compute_distance_matrix(K_train)
                D_test = compute_distance_matrix(K_test)

                # Initialize the Isomap embedding, embed train and test data
                embedding = manifold.Isomap(n_neighbors=KNN[knn],
                                            n_components=D[d],
                                            metric="precomputed")
                E_train = embedding.fit_transform(D_train)
                E_test = embedding.transform(D_test)

                # Initialize a linear SVM on the embedded data
                clf2 = svm.SVC(kernel='linear', C=svmC)
                clf2.fit(E_train, y_train)

                # Predict and test.
                y_pred = clf2.predict(E_test)

                # Append accuracy of classification.
                scores.append(accuracy_score(y_test, y_pred))

            val = np.mean(scores)
            Z[d][knn] = val
            myfile.write("%f " % val)
            print("knn = ", KNN[knn], "d = ", D[d], " accuracy = ", Z[d][knn])
            print("{0:.2%} done".format((Drange * knn + d + 1.0) / (Drange * KNNrange)))

        myfile.write("\n")

    # Close the file
    myfile.close()
    return Z
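# `compute_distance_matrix` (imported from the local `utils` module in an
# earlier snippet) is not shown; a minimal sketch under the assumption that
# the kernel is normalized, so every self-similarity k(x, x) equals 1 and
# d(x, y) = sqrt(2 - 2 * k(x, y)):
import numpy as np

def compute_distance_matrix(K):
    return np.sqrt(np.clip(2.0 - 2.0 * K, 0.0, None))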
G_rw, G_sm, y = read_data(3)
N = len(G_rw[0])
labels = {'1': 'NC', '2': 'MCI', '3': 'AD'}

rw_ac = []
sm_ac = []
for iter in range(3):
    print("Iter: ", iter)
    # Train-test split of graph data
    G_train_rw, G_test_rw, y_train_rw, y_test_rw = prepare_data(G_rw, y, random_state=iter)
    G_train_sm, G_test_sm, y_train_sm, y_test_sm = prepare_data(G_sm, y, random_state=iter)
    print("Data Set prepared")

    for (i, k) in enumerate(rows):
        print(k, end=" ")
        gk = GraphKernel(kernel=kernels[k], normalize=True)
        print("", end=".")

        # Calculate the kernel matrix for the raw data
        start = time.time()
        K_train_rw = gk.fit_transform(G_train_rw)
        K_test_rw = gk.transform(G_test_rw)
        end = time.time()
        print("", end=".")

        # Initialise an SVM and fit.
        clf = svm.SVC(kernel='precomputed')
        clf.fit(K_train_rw, y_train_rw)
        print("", end=". ")

        # Predict and test.
# the largest collection of benchmark datasets for graph kernels.
mutag = datasets.fetch_dataset("MUTAG", verbose=False)
G, y = mutag.data, mutag.target

# Train-test split of graph data
G_train, G_test, y_train, y_test = train_test_split(G, y, test_size=0.1,
                                                    random_state=42)

start = time()
# Initialise a Weisfeiler-Lehman kernel with a subtree base kernel.
gk = GraphKernel(kernel=[{"name": "weisfeiler_lehman", "n_iter": 5},
                         {"name": "subtree_wl"}],
                 normalize=True)

# Calculate the kernel matrix.
K_train = gk.fit_transform(G_train)
K_test = gk.transform(G_test)
end = time()

# Initialise an SVM and fit.
clf = svm.SVC(kernel='precomputed', C=1)
clf.fit(K_train, y_train)

# Predict and test.
y_pred = clf.predict(K_test)
dataset_d = datasets.fetch_dataset(d, verbose=False, data_home="../dataset",
                                   produce_labels_nodes=True)
G, y = np.asarray(dataset_d.data), np.asarray(dataset_d.target)

stats = {m: {"acc": list(), "time": list()} for m in Methods}
kfold = KFold(n_splits=10, random_state=50, shuffle=True)

for train_idx, test_idx in kfold.split(G, y):
    train_g, train_y = G[train_idx], y[train_idx]
    test_g, test_y = G[test_idx], y[test_idx]

    for i, k in enumerate(Methods):
        gk = GraphKernel(kernel=kernels[k], normalize=True)
        start = time.time()
        k_train = gk.fit_transform(train_g)
        k_test = gk.transform(test_g)
        end = time.time()

        clf = svm.SVC(kernel='precomputed')
        clf.fit(k_train, train_y)
        pred_y = clf.predict(k_test)

        stats[k]["acc"].append(accuracy_score(test_y, pred_y))
        stats[k]["time"].append(end - start)

for m in Methods:
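    # The snippet is truncated here; a plausible completion that reports the
    # per-method averages over the 10 folds (assumed, not from the original):
    print("%s: mean accuracy = %.4f, mean time = %.2fs"
          % (m, np.mean(stats[m]["acc"]), np.mean(stats[m]["time"])))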
def cross_validation_with_and_without_manifold(X, y, n_neighbors, n_components, k, C):
    # Split indexes according to KFold with k = 10
    kf = KFold(n_splits=k)

    # Initialize score lists
    scores = []
    scores2 = []

    for train_index, test_index in kf.split(X):
        kernel = GraphKernel(kernel={"name": "shortest_path",
                                     "with_labels": False},
                             normalize=True)

        # Split train and test of the k-fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Calculate the kernel matrices.
        K_train = kernel.fit_transform(X_train)
        K_test = kernel.transform(X_test)

        # Initialise an SVM and fit.
        clf = svm.SVC(kernel='precomputed', C=C)
        clf.fit(K_train, y_train)

        # Predict and test.
        y_pred = clf.predict(K_test)

        # Calculate accuracy of classification.
        acc = accuracy_score(y_test, y_pred)
        scores.append(acc)

        # Compute the distance matrices
        D_train = compute_distance_matrix(K_train)
        D_test = compute_distance_matrix(K_test)

        # Initialize the Isomap embedding, embed train and test data
        embedding = manifold.Isomap(n_neighbors=n_neighbors,
                                    n_components=n_components,
                                    metric="precomputed")
        E_train = embedding.fit_transform(D_train)
        E_test = embedding.transform(D_test)

        # Initialize a second, linear SVM on the embedded data
        clf2 = svm.SVC(kernel='linear', C=C)
        clf2.fit(E_train, y_train)

        # Predict and test.
        y_pred = clf2.predict(E_test)

        # Calculate accuracy of classification.
        acc = accuracy_score(y_test, y_pred)
        scores2.append(acc)

    # Convert the scores to percentages.
    scores = [s * 100 for s in scores]
    scores2 = [s * 100 for s in scores2]
    return scores, scores2
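# Example driver (hypothetical parameter values; X and y as loaded by the
# dataset utilities in the earlier snippets):
scores, scores2 = cross_validation_with_and_without_manifold(
    X, y, n_neighbors=5, n_components=2, k=10, C=1.0)
print("kernel SVM mean accuracy: %.2f%%" % np.mean(scores))
print("Isomap + linear SVM mean accuracy: %.2f%%" % np.mean(scores2))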