def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs, verbose):
    """Compute kernel matrix using the base kernel.

    ``Kmatrix`` is updated in place; nothing is returned.

    Parameters
    ----------
    Kmatrix : 2-D indexable (e.g. Numpy matrix)
        Kernel matrix to update. In the serial branch the current value of
        ``Kmatrix[i][j]`` is passed to ``compute_subtree_kernel`` and replaced
        by its result, which is then mirrored to ``Kmatrix[j][i]``.
    all_num_of_each_label : sequence
        Per-graph label statistics; element ``i`` is handed to the base kernel
        for graph ``i``.
    Gn : List of NetworkX graph
        Graphs the matrix is built for. Only forwarded to ``parallel_gm``.
    parallel : string/None
        'imap_unordered' to delegate the computation to ``parallel_gm``,
        None to run a plain double loop in the current process.
    n_jobs : int
        Number of jobs, forwarded to ``parallel_gm``.
    verbose : boolean
        Forwarded to ``parallel_gm``.

    Raises
    ------
    ValueError
        If ``parallel`` is neither 'imap_unordered' nor None. Previously such
        a value silently left ``Kmatrix`` unchanged.
    """
    if parallel == 'imap_unordered':
        # Each worker process stores the shared label statistics in a module
        # global (presumably read back by wrapper_compute_subtree_kernel --
        # TODO confirm against that wrapper).
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare

        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(all_num_of_each_label, ), n_jobs=n_jobs,
                    verbose=verbose)
    elif parallel is None:
        n_graphs = len(Kmatrix)
        # Only the upper triangle is computed; the kernel is symmetric.
        for i in range(n_graphs):
            for j in range(i, n_graphs):
                Kmatrix[i][j] = compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]
    else:
        raise ValueError('No proper parallelization method designated.')
def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     verbose=True):
    """Calculate common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    n : integer
        Longest length of walks. Only useful for a 'brute' method, which is
        not implemented in this function, so the value is currently unused.
    weight : integer
        Weight coefficient of different lengths of walks, which represents
        beta in 'exp' method and gamma in 'geo'.
    compute_method : string
        Method used to compute walk kernel (case-insensitive). The following
        choices are available:

        'exp' : exponential serial method applied on the direct product graph,
        as shown in reference [1]. The time complexity is O(n^6) for graphs
        with n vertices.

        'geo' : geometric serial method applied on the direct product graph,
        as shown in reference [1]. The time complexity is O(n^6) for graphs
        with n vertices.
    n_jobs : int
        Number of jobs for parallelization, forwarded to ``parallel_gm``.
    verbose : boolean
        Whether to print progress and timing information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between
        2 graphs.
    run_time : float
        Wall-clock seconds spent building the matrix.
    idx : list of int
        Positions, in the input list, of the graphs that were kept (graphs
        with exactly one node are removed before the computation).

    Raises
    ------
    ValueError
        If ``compute_method`` is not 'exp' or 'geo'.
    """
    # Fail early with a clear message: the default (None) used to crash on
    # ``.lower()`` and any other unsupported value on an unbound local.
    if not isinstance(compute_method, str) or \
            compute_method.lower() not in ('exp', 'geo'):
        raise ValueError(
            "compute_method must be 'exp' or 'geo', got %r." %
            (compute_method, ))
    compute_method = compute_method.lower()

    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they have only 1 node.\n' %
                  (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    if not ds_attrs['node_labeled'] or not ds_attrs['edge_labeled']:
        # Placeholder labels are about to be written; work on copies so the
        # caller's graphs are left untouched.
        Gn = [G.copy() for G in Gn]
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
        # Keep the label name in sync with the attribute just written, as
        # marginalizedkernel does.
        node_label = 'atom'
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
        edge_label = 'bond_type'
    if not ds_attrs['is_directed']:  # convert
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # Each worker process keeps the graph list in a module global
    # (presumably read by the wrapper_cw_* functions -- TODO confirm).
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    # direct product graph method - exponential / geometric
    kernel_wrapper = wrapper_cw_exp if compute_method == 'exp' \
        else wrapper_cw_geo
    do_partial = partial(kernel_wrapper, node_label, edge_label, weight)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    p_quit : float
        The termination probability in the random walks generating step.
    n_iteration : integer
        Time of iterations to calculate R_inf.
    remove_totters : boolean
        Whether to remove totters. The default value is False.
    n_jobs : int
        Number of jobs for parallelization. None lets the pool pick its own
        default number of workers.
    verbose : boolean
        Whether to show progress bars and timing information.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between 2 graphs.
    run_time : float
        Wall-clock seconds spent (tottering removal included).
    """
    import os  # local import: only needed to size chunks when n_jobs is None

    # pre-process
    n_iteration = int(n_iteration)
    # Copy every graph so that the placeholder labels written below never
    # leak into the caller's objects.
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # n_jobs may be None (the default); use the CPU count for the chunk
        # size arithmetic instead of multiplying by None.
        n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
        if len(Gn) < 100 * n_workers:
            chunksize = int(len(Gn) / n_workers) + 1
        else:
            chunksize = 100
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        # The context manager releases the workers even if a task raises.
        with Pool(n_jobs) as pool:
            results = pool.imap_unordered(untotter_partial,
                                          range(0, len(Gn)), chunksize)
            if verbose:
                results = tqdm(results, desc='removing tottering',
                               file=sys.stdout)
            for i, g in results:
                Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # Each worker process keeps the graph list in a module global
    # (presumably read by wrapper_marg_do -- TODO confirm).
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between unlabeled graphs using the
    spectral decomposition method. Labels will be ignored.

    Parameters
    ----------
    Gn : List of NetworkX graph
        Graphs between which the kernels are calculated.
    weight : float
        Weight forwarded to ``wrapper_sd_do``.
    p, q : None or array-like
        Starting / stopping distributions. Only the default case
        ``p is None and q is None`` (uniform distributions) is implemented.
        NOTE(review): for any other combination the all-zero matrix is
        returned unchanged -- confirm callers never rely on that.
    sub_kernel : string
        Sub-kernel selector forwarded to ``wrapper_sd_do``.
    eweight : string/None
        Edge attribute holding the edge weight; None makes every edge count
        as 1 in the adjacency matrices.
    n_jobs : int
        Number of jobs, forwarded to ``parallel_gm``.
    verbose : boolean
        Whether to show progress bars.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix of shape (len(Gn), len(Gn)).
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # Identity test, not ``==``: p/q may be arrays, for which ``== None`` is
    # element-wise and cannot be used as a condition.
    if q is None:
        # precompute the spectral decomposition of each graph.
        P_list = []
        D_list = []
        for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout)
                  if verbose else Gn):
            # don't normalize adjacency matrices if q is a uniform vector.
            # Note A actually is the transpose of the adjacency matrix.
            # ``weight`` must be passed by keyword: the second positional
            # parameter of nx.adjacency_matrix is ``nodelist``.
            A = nx.adjacency_matrix(G, weight=eweight).todense().transpose()
            ew, ev = np.linalg.eig(A)
            D_list.append(ew)
            P_list.append(ev)
        # @todo: also works for directed graphs?

        if p is None:  # p is uniform distribution as default.
            q_T_list = [
                np.full((1, nx.number_of_nodes(G)),
                        1 / nx.number_of_nodes(G)) for G in Gn
            ]

            # Each worker process keeps the precomputed lists in module
            # globals (presumably read by wrapper_sd_do -- TODO confirm).
            def init_worker(q_T_toshare, P_toshare, D_toshare):
                global G_q_T, G_P, G_D
                G_q_T = q_T_toshare
                G_P = P_toshare
                G_D = D_toshare

            do_partial = partial(wrapper_sd_do, weight, sub_kernel)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs,
                        verbose=verbose)

    return Kmatrix
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. It is only
        used if the first graph carries it with a float or integer value;
        otherwise all weights are set to 1.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : int
        Number of jobs for parallelization. None lets the pool pick its own
        default number of workers.
    verbose : boolean
        Whether to print messages and show progress bars.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    run_time : float
        Wall-clock seconds spent on shortest-path graphs and kernels.
    idx : list of int
        Positions, in the input list, of the graphs that were kept (graphs
        without edges are removed before the computation).
    """
    import os  # local import: only needed to size chunks when n_jobs is None

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # No graph at all, or no edge of the first graph has the attribute.
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    kept = [(i, G) for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [i for i, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn
    # n_jobs may be None (the default); use the CPU count for the chunk size
    # arithmetic instead of multiplying by None.
    n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
    if len(Gn) < 100 * n_workers:
        chunksize = int(len(Gn) / n_workers) + 1
    else:
        chunksize = 100
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    # The context manager releases the workers even if a task raises.
    with Pool(n_jobs) as pool:
        results = pool.imap_unordered(getsp_partial, itr, chunksize)
        if verbose:
            results = tqdm(results, desc='getting sp graphs',
                           file=sys.stdout)
        # Results arrive in arbitrary order; each carries its own index.
        for i, g in results:
            Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # Each worker process keeps the sp-graph list in a module global
    # (presumably read by wrapper_sp_do -- TODO confirm).
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx
def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
                        node_label, edge_label, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between graphs using the conjugate
    gradient method.

    Parameters
    ----------
    Gn : List of NetworkX graph
        Graphs between which the kernels are calculated.
    lmda : float
        Coefficient forwarded to ``wrapper_cg_labled_do``.
    p, q : None or array-like
        Starting / stopping distributions. Only the default case where both
        are None (uniform distributions) is implemented.
        NOTE(review): otherwise the all-zero matrix is returned unchanged --
        confirm callers never rely on that.
    ds_attrs : dict
        Dataset attributes, forwarded to the worker wrapper.
    node_kernels, edge_kernels : dict
        Kernel functions for nodes / edges, forwarded to the worker wrapper.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    eweight : string/None
        Edge weight attribute name. Unused by the labeled code path below.
    n_jobs : int
        Number of jobs, forwarded to ``parallel_gm``.
    verbose : boolean
        Whether to show progress bars.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix of shape (len(Gn), len(Gn)).
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # reindex nodes using consecutive integers for convenience of kernel
    # calculation. The original node names are kept in the 'label_orignal'
    # node attribute.
    Gn = [
        nx.convert_node_labels_to_integers(
            g, first_label=0, label_attribute='label_orignal')
        for g in (tqdm(Gn, desc='reindex vertices', file=sys.stdout)
                  if verbose else Gn)
    ]

    # Identity test, not ``==``: p/q may be arrays, for which ``== None`` is
    # element-wise and cannot be used as a condition.
    if p is None and q is None:  # p and q are uniform distributions as default.
        # Each worker process keeps the graph list in a module global
        # (presumably read by wrapper_cg_labled_do -- TODO confirm).
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    return Kmatrix
def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
                 node_label, edge_label, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between graphs using the Fixed-Point
    method.

    Parameters
    ----------
    Gn : List of NetworkX graph
        Graphs between which the kernels are calculated.
    lmda : float
        Coefficient forwarded to ``wrapper_fp_labled_do``.
    p, q : None or array-like
        Starting / stopping distributions. Only the default case where both
        are None (uniform distributions) is implemented.
        NOTE(review): otherwise the all-zero matrix is returned unchanged --
        confirm callers never rely on that.
    ds_attrs : dict
        Dataset attributes, forwarded to the worker wrapper.
    node_kernels, edge_kernels : dict
        Kernel functions for nodes / edges, forwarded to the worker wrapper.
    node_label : string
        Node attribute used as label.
    edge_label : string
        Edge attribute used as label.
    eweight : string/None
        Edge weight attribute name. Unused by the labeled code path below.
    n_jobs : int
        Number of jobs, forwarded to ``parallel_gm``.
    verbose : boolean
        Whether to show progress bars.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix of shape (len(Gn), len(Gn)).
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # reindex nodes using consecutive integers for convenience of kernel
    # calculation. The original node names are kept in the 'label_orignal'
    # node attribute.
    Gn = [
        nx.convert_node_labels_to_integers(
            g, first_label=0, label_attribute='label_orignal')
        for g in (tqdm(Gn, desc='reindex vertices', file=sys.stdout)
                  if verbose else Gn)
    ]

    # Identity test, not ``==``: p/q may be arrays, for which ``== None`` is
    # element-wise and cannot be used as a condition.
    if p is None and q is None:  # p and q are uniform distributions as default.
        # Each worker process keeps the graph list in a module global
        # (presumably read by wrapper_fp_labled_do -- TODO confirm).
        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn, ), n_jobs=n_jobs, verbose=verbose)

    return Kmatrix
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels between graphs using the Sylvester
    equation method.

    Parameters
    ----------
    Gn : List of NetworkX graph
        Graphs between which the kernels are calculated.
    lmda : float
        Coefficient forwarded to ``wrapper_se_do``.
    p, q : None or array-like
        Starting / stopping distributions. Only the default case
        ``p is None and q is None`` (uniform distributions) is implemented.
        NOTE(review): for any other combination the all-zero matrix is
        returned unchanged -- confirm callers never rely on that.
    eweight : string/None
        Edge attribute holding the edge weight; None makes every edge count
        as 1 in the adjacency matrices.
    n_jobs : int
        Number of jobs, forwarded to ``parallel_gm``.
    verbose : boolean
        Whether to show progress bars.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix of shape (len(Gn), len(Gn)).
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # Identity test, not ``==``: p/q may be arrays, for which ``== None`` is
    # element-wise and cannot be used as a condition.
    if q is None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # A_wave_list actually contains the transposes of the adjacency
        # matrices. ``weight`` must be passed by keyword: the second
        # positional parameter of nx.adjacency_matrix is ``nodelist``.
        graphs = (tqdm(Gn, desc='compute adjacency matrices',
                       file=sys.stdout) if verbose else Gn)
        A_wave_list = [
            nx.adjacency_matrix(G, weight=eweight).todense().transpose()
            for G in graphs
        ]

        if p is None:  # p is uniform distribution as default.
            # Each worker process keeps the matrices in a module global
            # (presumably read by wrapper_se_do -- TODO confirm).
            def init_worker(Awl_toshare):
                global G_Awl
                G_Awl = Awl_toshare

            do_partial = partial(wrapper_se_do, lmda)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(A_wave_list, ), n_jobs=n_jobs, verbose=verbose)

    return Kmatrix
def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  verbose=True):
    """Calculate treelet graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    sub_kernel : function
        The sub-kernel between 2 real number vectors. Each vector counts the
        numbers of isomorphic treelets in a graph.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    parallel : string/None
        Which parallelization method is applied to compute the kernel. The
        following choices are available:

        'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
        method.

        None: no parallelization is applied.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when one of the
        parallelization method is applied.
    verbose : boolean
        Whether to print messages and show progress bars.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the treelet kernel between 2
        graphs.
    run_time : float
        Wall-clock seconds spent on canonical keys and kernels.

    Raises
    ------
    Exception
        If ``parallel`` is neither 'imap_unordered' nor None.
    """
    # Reject an unsupported mode before doing any preprocessing work.
    if parallel != 'imap_unordered' and parallel is not None:
        raise Exception('No proper parallelization method designated.')

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    labeled = bool(ds_attrs['node_labeled'] or ds_attrs['edge_labeled'])
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
        # Keep the label name in sync with the placeholder attribute just
        # written, as marginalizedkernel does.
        node_label = 'atom'
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
        edge_label = 'bond_type'

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        import os  # local import: only needed when n_jobs is None

        # get all canonical keys of all graphs before calculating kernels to
        # save time, but this may cost a lot of memory for large dataset.
        # n_jobs may be None (the default); use the CPU count for the chunk
        # size arithmetic instead of multiplying by None.
        n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
        if len(Gn) < 100 * n_workers:
            chunksize = int(len(Gn) / n_workers) + 1
        else:
            chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        itr = zip(Gn, range(0, len(Gn)))
        # The context manager releases the workers even if a task raises.
        with Pool(n_jobs) as pool:
            results = pool.imap_unordered(get_partial, itr, chunksize)
            if verbose:
                results = tqdm(results, desc='getting canonkeys',
                               file=sys.stdout)
            # Results arrive in arbitrary order; each carries its own index.
            for i, ck in results:
                canonkeys[i] = ck

        # compute kernels.
        # Each worker process keeps the canonical keys in a module global
        # (presumably read by wrapper_treeletkernel_do -- TODO confirm).
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(canonkeys, ), n_jobs=n_jobs, verbose=verbose)

    # ---- do not use parallelization. ----
    else:
        from itertools import combinations_with_replacement

        # get all canonical keys of all graphs before calculating kernels to
        # save time, but this may cost a lot of memory for large dataset.
        canonkeys = [
            get_canonkeys(g, node_label, edge_label, labeled,
                          ds_attrs['is_directed'])
            for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                      if verbose else Gn)
        ]

        # compute kernels.
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='calculating kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][j]  # @todo: no directed graph considered?

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- treelet kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     n_jobs=None,
                     verbose=True,
                     edge_kernels=None):
    """Calculate path graph kernels up to depth/hight h between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    depth : integer
        Depth of search. Longest length of paths.
    k_func : function
        A kernel function applied using different notions of fingerprint
        similarity, defining the type of feature map and normalization method
        applied for the graph kernel. The following choices are available:

        'MinMax': use the MiniMax kernel and counting feature map.

        'tanimoto': use the Tanimoto kernel and binary feature map.

        None: no sub-kernel is used, the kernel is computed directly.
    compute_method : string
        Computation method to store paths and compute the graph kernel. The
        following choices are available:

        'trie': store paths as tries.

        'naive': store paths to lists. Any value other than 'trie' is
        treated this way.
    n_jobs : int
        Number of jobs for parallelization. None lets the pool pick its own
        default number of workers.
    verbose : boolean
        Whether to print messages and show progress bars.
    edge_kernels : dict/None
        Only used when ``k_func`` is None, where it is forwarded to
        ``wrapper_uhpath_do_kernelless``. The function body always referenced
        this name but never defined it; it is now an optional argument.
        NOTE(review): confirm the expected content against that wrapper.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h between
        2 graphs.
    run_time : float
        Wall-clock seconds spent on path extraction and kernels.
    """
    import os  # local import: only needed to size chunks when n_jobs is None

    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    use_sub_kernel = k_func is not None
    use_trie = use_sub_kernel and compute_method == 'trie'

    if use_sub_kernel:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # get all paths of all graphs before calculating kernels to save time,
    # but this may cost a lot of memory for large datasets.
    # n_jobs may be None (the default); use the CPU count for the chunk size
    # arithmetic instead of multiplying by None.
    n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
    if len(Gn) < 100 * n_workers:
        chunksize = int(len(Gn) / n_workers) + 1
    else:
        chunksize = 100
    all_paths = [[] for _ in range(len(Gn))]
    if use_trie:
        getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                ds_attrs, node_label, edge_label)
    else:
        # The trailing flag is True when a sub-kernel is used and False for
        # the kernel-less computation.
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label,
                                use_sub_kernel)
    itr = zip(Gn, range(0, len(Gn)))
    # The context manager releases the workers even if a task raises.
    with Pool(n_jobs) as pool:
        results = pool.imap_unordered(getps_partial, itr, chunksize)
        if verbose:
            results = tqdm(results, desc='getting paths', file=sys.stdout)
        # Results arrive in arbitrary order; each carries its own index.
        for i, ps in results:
            all_paths[i] = ps

    # Each worker process keeps the paths in a module global (G_trie for
    # tries, G_plist for lists; presumably read by the wrapper_uhpath_do_*
    # functions -- TODO confirm).
    if use_trie:
        def init_worker(trie_toshare):
            global G_trie
            G_trie = trie_toshare

        do_partial = partial(wrapper_uhpath_do_trie, k_func)
    else:
        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        if use_sub_kernel:
            do_partial = partial(wrapper_uhpath_do_naive, k_func)
        else:
            do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                                 edge_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(all_paths, ), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
            % (depth, len(Gn), run_time))

    return Kmatrix, run_time
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. When it is
        None, missing from the first graph, or not a float/int, all weights
        are set to 1.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each the two nodes. Each label is in form of
        2-D dimension array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels: dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each the two edges. Each label is in form of
        2-D dimension array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when edges are unlabeled.
    compute_method : string
        'trie' selects the trie-based wrappers (wrapper_getSP_trie and
        wrapper_ssp_do_trie); any other value selects the naive wrappers
        (wrapper_getSP_naive and wrapper_ssp_do).
    n_jobs : integer
        Number of worker processes. None means the number of CPUs reported
        by os.cpu_count().
    verbose : boolean
        Whether to print informational messages and show a progress bar
        while the shortest paths are collected. Also forwarded to
        parallel_gm.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    run_time : float
        Wall-clock time in seconds spent getting the shortest paths and
        building the kernel matrix.
    """
    # Local import: only needed to resolve the default number of workers.
    import os

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # Pool(None) would pick the CPU count by itself, but the chunk size
    # arithmetic below needs a real integer, so resolve the default here.
    if n_jobs is None:
        n_jobs = os.cpu_count() or 1

    # Decide whether the requested edge attribute can be used as a weight.
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # Probe a single edge of the first graph; an empty attribute
            # dict (or an empty Gn) makes the [0] lookup raise IndexError.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=[
            'node_labeled', 'node_attr_dim', 'edge_labeled',
            'edge_attr_dim', 'is_directed'
        ],
        node_label=node_label,
        edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    splist = [None] * len(Gn)
    if len(Gn) < 100 * n_jobs:
        chunksize = len(Gn) // n_jobs + 1
    else:
        chunksize = 100
    if compute_method == 'trie':
        getsp_wrapper = wrapper_getSP_trie
    else:
        getsp_wrapper = wrapper_getSP_naive
    getsp_partial = partial(getsp_wrapper, weight, ds_attrs['is_directed'])

    # The context manager guarantees the workers are released even when a
    # task raises while the results are being consumed.
    with Pool(n_jobs) as pool:
        iterator = pool.imap_unordered(getsp_partial,
                                       zip(Gn, range(len(Gn))), chunksize)
        if verbose:
            iterator = tqdm(iterator,
                            desc='getting shortest paths',
                            file=sys.stdout)
        # Results arrive out of order; each one carries its graph index.
        for i, sp in iterator:
            splist[i] = sp
        pool.close()
        pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(spl_toshare, gs_toshare):
        # Publish the shared data as module globals in each worker process.
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare

    if compute_method == 'trie':
        do_wrapper = wrapper_ssp_do_trie
    else:
        do_wrapper = wrapper_ssp_do
    do_partial = partial(do_wrapper, ds_attrs, node_label, edge_label,
                         node_kernels, edge_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. When it is
        None, missing from the first graph, or not a float/int, all weights
        are set to 1.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and
        a non-symbolic label for each the two nodes. Each label is in form of
        2-D dimension array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : integer
        Number of worker processes. None means the number of CPUs reported
        by os.cpu_count().

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2
        graphs. Only graphs that contain at least one edge are included.
    run_time : float
        Wall-clock time in seconds spent building the shortest path graphs
        and the kernel matrix.
    idx : list of integer
        Positions, in the input list, of the graphs that were kept (graphs
        without edges are removed before the computation).
    """
    # Local import: only needed to resolve the default number of workers.
    import os

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # Pool(None) would pick the CPU count by itself, but the chunk size
    # arithmetic below needs a real integer, so resolve the default here.
    if n_jobs is None:
        n_jobs = os.cpu_count() or 1

    # Decide whether the requested edge attribute can be used as a weight.
    weight = None
    if edge_weight is None:
        print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # Probe a single edge of the first graph; an empty attribute
            # dict (or an empty Gn) makes the [0] lookup raise IndexError.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)
    ds_attrs['node_attr_dim'] = 0

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    idx = [i for i, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    # Build a new list so the caller's list is not modified below.
    Gn = [Gn[i] for i in idx]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    if len(Gn) < 100 * n_jobs:
        chunksize = len(Gn) // n_jobs + 1
    else:
        chunksize = 100

    # The context manager guarantees the workers are released even when a
    # task raises while the results are being consumed.
    with Pool(n_jobs) as pool:
        results = pool.imap_unordered(getsp_partial,
                                      zip(Gn, range(len(Gn))), chunksize)
        # Results arrive out of order; each one carries its graph index and
        # replaces the original graph with its shortest path graph.
        for i, g in tqdm(results, desc='getting sp graphs', file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        # Publish the shared graphs as a module global in each worker process.
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn, ), n_jobs=n_jobs)

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx