def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  chunksize=None,
                  verbose=True):
    """Compute treelet graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    sub_kernel : function
        The sub-kernel between two real-valued vectors. Each vector counts
        the numbers of isomorphic treelets in a graph.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    parallel : string/None
        Which parallelization method is applied to compute the kernel. The
        following choices are available:

        'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
        method.

        None: no parallelization is applied.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when a
        parallelization method is applied.
    chunksize : int
        Chunk size used by multiprocessing.Pool.imap_unordered. The default
        is computed from the number of graphs and n_jobs.
    verbose : boolean
        Whether to print out progress and timing information. The default is
        True.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the treelet kernel between
        two graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before computing kernels to
        # save time, but this may cost a lot of memory for large datasets.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(get_partial, itr, chunksize),
                            desc='getting canonkeys', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
        for i, ck in iterator:
            canonkeys[i] = ck
        pool.close()
        pool.join()

        # compute kernels.
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare
        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(canonkeys,), n_jobs=n_jobs, chunksize=chunksize,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before computing kernels to
        # save time, but this may cost a lot of memory for large datasets.
        canonkeys = []
        for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                  if verbose else Gn):
            canonkeys.append(get_canonkeys(g, node_label, edge_label, labeled,
                                           ds_attrs['is_directed']))

        # compute kernels.
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='computing kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][j]  # @todo: no directed graph considered?
    else:
        raise Exception('No proper parallelization method designated.')

    run_time = time.time() - start_time
    if verbose:
        print("\n --- treelet kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time
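# Usage sketch (illustrative only, not part of the original module): compute
# the treelet kernel between two tiny labeled graphs. The Gaussian sub-kernel
# and the graph construction below are assumptions for the example; per the
# docstring, the sub-kernel receives two treelet-count vectors.
def _example_treeletkernel():
    import networkx as nx
    import numpy as np

    def gaussian_sub_kernel(x, y, gamma=1.0):
        # sub-kernel between two treelet-count vectors.
        x, y = np.asarray(x, dtype=float), np.asarray(y, dtype=float)
        return np.exp(-gamma * np.sum((x - y) ** 2))

    G1 = nx.Graph()
    G1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'})])
    G1.add_edge(0, 1, bond_type='1')
    G2 = nx.Graph()
    G2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'C'}),
                       (2, {'atom': 'O'})])
    G2.add_edges_from([(0, 1, {'bond_type': '1'}),
                       (1, 2, {'bond_type': '1'})])

    # single-core path for clarity; parallel='imap_unordered' also works.
    Kmatrix, run_time = treeletkernel([G1, G2],
                                      sub_kernel=gaussian_sub_kernel,
                                      parallel=None)
    return Kmatrix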
def spkernel(*args, node_label='atom', edge_weight=None, node_kernels=None,
             n_jobs=None, chunksize=1):
    """Calculate shortest-path kernels between graphs."""
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print('\n Edge weight with name %s is not float or integer. '
                      'Set all weights to 1.\n' % edge_weight)
        except Exception:
            print('\n Edge weight with name "%s" is not found in the edge '
                  'attributes. Set all weights to 1.\n' % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no shortest path can be found in their
    # structures, so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn.
    pool = Pool(n_jobs)
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                     desc='getting sp graphs', file=sys.stdout):
        Gn[i] = g
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    with Pool(processes=n_jobs, initializer=init_worker,
              initargs=(Gn,)) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr,
                                                     chunksize),
                                 desc='calculating kernels', file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    print("\n --- shortest path kernel matrix of size %d built in %s seconds ---"
          % (len(Gn), run_time))

    return Kmatrix, run_time, idx
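# The node_kernels argument above is a dict with 'symb', 'nsymb' and 'mix'
# entries, as detailed in the docstrings further down. A minimal sketch of
# such a dict, assuming a Kronecker delta for symbolic labels and an RBF for
# non-symbolic ones (the helper names are illustrative, not fixed API; the
# 'mix' argument order follows the docstring's description of "a symbolic and
# a non-symbolic label for each of the two nodes"):
def _example_node_kernels(gamma=1.0):
    import numpy as np

    def delta(x, y):
        # Kronecker delta on symbolic labels.
        return 1.0 if x == y else 0.0

    def rbf(x, y):
        # Gaussian kernel on non-symbolic (numeric) label vectors.
        d = np.asarray(x, dtype=float) - np.asarray(y, dtype=float)
        return np.exp(-gamma * np.dot(d, d))

    # the product combination for 'mix' below is one common choice.
    return {'symb': delta,
            'nsymb': rbf,
            'mix': lambda s1, s2, n1, n2: delta(s1, s2) * rbf(n1, n2)}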
     'extra_params': {
         'am_sp_al_nl_el': [1, 1, 2, 0, -1]}},
    {'name': 'NCI-HIV',
     'dataset': '../../datasets/NCI-HIV/AIDO99SD.sdf',
     'dataset_y': '../../datasets/NCI-HIV/aids_conc_may04.txt',},

    # # not working below
    # {'name': 'PTC_FM', 'dataset': '../../datasets/PTC/Train/FM.ds',},
    # {'name': 'PTC_FR', 'dataset': '../../datasets/PTC/Train/FR.ds',},
    # {'name': 'PTC_MM', 'dataset': '../../datasets/PTC/Train/MM.ds',},
    # {'name': 'PTC_MR', 'dataset': '../../datasets/PTC/Train/MR.ds',},
]

for ds in dslist:
    dataset, y = loadDataset(
        ds['dataset'],
        filename_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
        extra_params=(ds['extra_params'] if 'extra_params' in ds else None))
    attrs = get_dataset_attributes(dataset, target=y, node_label='atom',
                                   edge_label='bond_type')
    print()
    print(ds['name'] + ':')
    for atr in attrs:
        print(atr, ':', attrs[atr])
    print()
def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     parallel='imap_unordered',
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Compute path graph kernels up to depth/height h between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    depth : integer
        Depth of the search, i.e., the longest length of paths.
    k_func : function
        A kernel function applied using different notions of fingerprint
        similarity, defining the type of feature map and normalization method
        applied for the graph kernel. The following choices are available:

        'MinMax': use the MinMax kernel and counting feature map.

        'tanimoto': use the Tanimoto kernel and binary feature map.

        None: no sub-kernel is used, the kernel is computed directly.
    compute_method : string
        Computation method to store paths and compute the graph kernel. The
        following choices are available:

        'trie': store paths as tries.

        'naive': store paths in lists.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h
        between two graphs.
    """
    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if k_func is not None:
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if parallel == 'imap_unordered':
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # get all paths of all graphs before computing kernels to save time,
        # but this may cost a lot of memory for large datasets.
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        all_paths = [[] for _ in range(len(Gn))]
        if compute_method == 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_path_as_trie, depth,
                                    ds_attrs, node_label, edge_label)
        elif compute_method != 'trie' and k_func is not None:
            getps_partial = partial(wrapper_find_all_paths_until_length,
                                    depth, ds_attrs, node_label, edge_label,
                                    True)
        else:
            getps_partial = partial(wrapper_find_all_paths_until_length,
                                    depth, ds_attrs, node_label, edge_label,
                                    False)
        if verbose:
            iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
                            desc='getting paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        for i, ps in iterator:
            all_paths[i] = ps
        pool.close()
        pool.join()

        if compute_method == 'trie' and k_func is not None:
            def init_worker(trie_toshare):
                global G_trie
                G_trie = trie_toshare
            do_partial = partial(wrapper_uhpath_do_trie, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
        elif compute_method != 'trie' and k_func is not None:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare
            do_partial = partial(wrapper_uhpath_do_naive, k_func)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)
        else:
            def init_worker(plist_toshare):
                global G_plist
                G_plist = plist_toshare
            # @todo: 'edge_kernels' is not defined in this scope; the
            # kernel-less branch needs fixing before it can be used.
            do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                                 edge_kernels)
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(all_paths,), n_jobs=n_jobs, chunksize=chunksize,
                        verbose=verbose)

    elif parallel is None:
        # ---- direct running, normally use single CPU core. ----
        if compute_method == 'trie':
            all_paths = [
                find_all_path_as_trie(Gn[i], depth, ds_attrs,
                                      node_label=node_label,
                                      edge_label=edge_label)
                for i in tqdm(range(0, len(Gn)), desc='getting paths',
                              file=sys.stdout)
            ]
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels', file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_trie(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)
        else:
            all_paths = [
                find_all_paths_until_length(Gn[i], depth, ds_attrs,
                                            node_label=node_label,
                                            edge_label=edge_label)
                for i in tqdm(range(0, len(Gn)), desc='getting paths',
                              file=sys.stdout)
            ]
            pbar = tqdm(total=((len(Gn) + 1) * len(Gn) / 2),
                        desc='Computing kernels', file=sys.stdout)
            for i in range(0, len(Gn)):
                for j in range(i, len(Gn)):
                    Kmatrix[i][j] = _untilhpathkernel_do_naive(
                        all_paths[i], all_paths[j], k_func)
                    Kmatrix[j][i] = Kmatrix[i][j]
                    pbar.update(1)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of path kernel up to %d of size %d built "
              "in %s seconds ---" % (depth, len(Gn), run_time))

    return Kmatrix, run_time
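# Usage sketch (illustrative only): path kernel up to depth 3 with the MinMax
# fingerprint kernel on a list of labeled NetworkX graphs. `graph_list` is
# assumed to carry 'atom'/'bond_type' labels as in the defaults above.
def _example_untilhpathkernel(graph_list):
    # 'trie' trades memory for speed when many paths share prefixes;
    # 'naive' stores paths in plain lists.
    Kmatrix, run_time = untilhpathkernel(graph_list,
                                         depth=3,
                                         k_func='MinMax',
                                         compute_method='trie',
                                         parallel=None)
    return Kmatrix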
def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       chunksize=None,
                       verbose=True):
    """Compute marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.
    p_quit : float
        The termination probability in the random-walk generating step.
    n_iteration : integer
        Number of iterations used to compute R_inf.
    remove_totters : boolean
        Whether to remove tottering by the method introduced in [2]. The
        default value is False.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel
        between two graphs.
    """
    # pre-process
    n_iteration = int(n_iteration)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        pool = Pool(n_jobs)
        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        if chunksize is None:
            if len(Gn) < 100 * n_jobs:
                chunksize = int(len(Gn) / n_jobs) + 1
            else:
                chunksize = 100
        for i, g in tqdm(pool.imap_unordered(untotter_partial,
                                             range(0, len(Gn)), chunksize),
                         desc='removing tottering', file=sys.stdout):
            Gn[i] = g
        pool.close()
        pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(Gn,),
                n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s "
              "seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time
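# Usage sketch (illustrative only): marginalized kernel with a termination
# probability of 0.3 and 20 iterations for R_inf. This function always builds
# the matrix through parallel_gm, so n_jobs applies here; the value 4 is an
# arbitrary choice for the example.
def _example_marginalizedkernel(graph_list):
    Kmatrix, run_time = marginalizedkernel(graph_list,
                                           p_quit=0.3,
                                           n_iteration=20,
                                           remove_totters=False,
                                           n_jobs=4)
    return Kmatrix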
def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     chunksize=None,
                     verbose=True):
    """Compute common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    node_label : string
        Node attribute used as symbolic label. The default node label is
        'atom'.
    edge_label : string
        Edge attribute used as symbolic label. The default edge label is
        'bond_type'.
    weight : integer
        Weight coefficient of different lengths of walks, which represents
        beta in the 'exp' method and gamma in the 'geo' method.
    compute_method : string
        Method used to compute the walk kernel. The following choices are
        available:

        'exp': method based on exponential series applied on the direct
        product graph, as shown in reference [1]. The time complexity is
        O(n^6) for graphs with n vertices.

        'geo': method based on geometric series applied on the direct product
        graph, as shown in reference [1]. The time complexity is O(n^6) for
        graphs with n vertices.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between
        two graphs.
    """
#    n : integer
#        Longest length of walks. Only useful for the 'brute' method
#        ('brute': brute force, simply search for all walks and compare
#        them), which is currently disabled.
    compute_method = compute_method.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_nodes(G) != 1]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they have only 1 node.\n' %
                  (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert to directed graphs.
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    # direct product graph method - exponential
    if compute_method == 'exp':
        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    # direct product graph method - geometric
    elif compute_method == 'geo':
        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
    else:
        raise Exception('compute method name incorrect. Available methods: '
                        '"exp" and "geo".')
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(Gn,),
                n_jobs=n_jobs, chunksize=chunksize, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of common walk kernel of size %d built "
              "in %s seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time, idx
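# Usage sketch (illustrative only): common walk kernel via the exponential
# series on the direct product graph. `weight` plays the role of beta here;
# keeping it small damps the contribution of long walks. Since single-node
# graphs are dropped, the returned `idx` maps rows of Kmatrix back to
# positions in the input list.
def _example_commonwalkkernel(graph_list):
    Kmatrix, run_time, idx = commonwalkkernel(graph_list,
                                              compute_method='exp',
                                              weight=0.01,
                                              n_jobs=4)
    return Kmatrix, idx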
def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       parallel='imap_unordered',
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. Applied for the
        computation of the shortest paths.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including three items:
        'symb' for symbolic node labels, 'nsymb' for non-symbolic node
        labels, and 'mix' for both labels. The first two functions take two
        node labels as parameters, and the 'mix' function takes four
        parameters: a symbolic and a non-symbolic label for each of the two
        nodes. Each label is in the form of a 2-D array (n_samples,
        n_features). Each function returns a number as the kernel value.
        Ignored when nodes are unlabeled.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including three items:
        'symb' for symbolic edge labels, 'nsymb' for non-symbolic edge
        labels, and 'mix' for both labels. The first two functions take two
        edge labels as parameters, and the 'mix' function takes four
        parameters: a symbolic and a non-symbolic label for each of the two
        edges. Each label is in the form of a 2-D array (n_samples,
        n_features). Each function returns a number as the kernel value.
        Ignored when edges are unlabeled.
    compute_method : string
        Computation method to store the shortest paths and compute the graph
        kernel. The following choices are available:

        'trie': store paths as tries.

        'naive': store paths in lists.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between two graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or '
                          'integer. Set all weights to 1.\n' % edge_weight)
        except Exception:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the '
                      'edge attributes. Set all weights to 1.\n' % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn.
    if parallel == 'imap_unordered':
        splist = [None] * len(Gn)
        pool = Pool(n_jobs)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        # get shortest path graphs of Gn.
        if compute_method == 'trie':
            getsp_partial = partial(wrapper_getSP_trie, weight,
                                    ds_attrs['is_directed'])
        else:
            getsp_partial = partial(wrapper_getSP_naive, weight,
                                    ds_attrs['is_directed'])
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting shortest paths', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, sp in iterator:
            splist[i] = sp
        pool.close()
        pool.join()
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        splist = []
        if verbose:
            iterator = tqdm(Gn, desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = Gn
        if compute_method == 'trie':
            for g in iterator:
                splist.append(get_sps_as_trie(g, weight,
                                              ds_attrs['is_directed']))
        else:
            for g in iterator:
                splist.append(get_shortest_paths(g, weight,
                                                 ds_attrs['is_directed']))

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        def init_worker(spl_toshare, gs_toshare):
            global G_spl, G_gs
            G_spl = spl_toshare
            G_gs = gs_toshare
        if compute_method == 'trie':
            do_partial = partial(wrapper_ssp_do_trie, ds_attrs, node_label,
                                 edge_label, node_kernels, edge_kernels)
        else:
            do_partial = partial(wrapper_ssp_do, ds_attrs, node_label,
                                 edge_label, node_kernels, edge_kernels)
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(splist, Gn), n_jobs=n_jobs, verbose=verbose)
    # ---- direct running, normally use single CPU core. ----
    elif parallel is None:
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        if verbose:
            iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout)
        else:
            iterator = itr
        if compute_method == 'trie':
            for i, j in iterator:
                kernel = ssp_do_trie(Gn[i], Gn[j], splist[i], splist[j],
                                     ds_attrs, node_label, edge_label,
                                     node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel
        else:
            for i, j in iterator:
                kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i],
                                               splist[j], ds_attrs,
                                               node_label, edge_label,
                                               node_kernels, edge_kernels)
                Kmatrix[i][j] = kernel
                Kmatrix[j][i] = kernel

    run_time = time.time() - start_time
    if verbose:
        print("\n --- structural shortest path kernel matrix of size %d "
              "built in %s seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time
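# Usage sketch (illustrative only): structural shortest path kernel with
# delta kernels on symbolic node and edge labels. The kernel dicts follow the
# 'symb'/'nsymb'/'mix' structure documented above; the delta helper and the
# product form of 'mix' are assumptions for the example.
def _example_structuralspkernel(graph_list):
    def delta(x, y):
        # Kronecker delta on symbolic labels.
        return 1.0 if x == y else 0.0
    sub_kernels = {'symb': delta,
                   'nsymb': delta,
                   'mix': lambda s1, s2, n1, n2: delta(s1, s2) * delta(n1, n2)}
    Kmatrix, run_time = structuralspkernel(graph_list,
                                           node_kernels=sub_kernels,
                                           edge_kernels=sub_kernels,
                                           compute_method='naive',
                                           parallel=None)
    return Kmatrix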
def randomwalkkernel(*args,
                     # params for all methods.
                     compute_method=None,
                     weight=1,
                     p=None,
                     q=None,
                     edge_weight=None,
                     # params for conjugate and fp methods.
                     node_kernels=None,
                     edge_kernels=None,
                     node_label='atom',
                     edge_label='bond_type',
                     # params for spectral method.
                     sub_kernel=None,
                     n_jobs=None,
                     verbose=True):
    """Calculate random walk graph kernels.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    compute_method : string
        Method used to compute the kernel. The following choices are
        available:

        'sylvester' - Sylvester equation method.

        'conjugate' - conjugate gradient method.

        'fp' - fixed-point iterations.

        'spectral' - spectral decomposition.
    weight : float
        A constant weight set for random walks of length h.
    p : None
        Initial probability distribution on the unlabeled direct product
        graph of two graphs. It is set to be uniform over all vertices in the
        direct product graph.
    q : None
        Stopping probability distribution on the unlabeled direct product
        graph of two graphs. It is set to be uniform over all vertices in the
        direct product graph.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including three items:
        'symb' for symbolic node labels, 'nsymb' for non-symbolic node
        labels, and 'mix' for both labels. The first two functions take two
        node labels as parameters, and the 'mix' function takes four
        parameters: a symbolic and a non-symbolic label for each of the two
        nodes. Each label is in the form of a 2-D array (n_samples,
        n_features). Each function returns a number as the kernel value.
        Ignored when nodes are unlabeled. This argument is designated to the
        conjugate gradient method and fixed-point iterations.
    edge_kernels : dict
        A dictionary of kernel functions for edges, including three items:
        'symb' for symbolic edge labels, 'nsymb' for non-symbolic edge
        labels, and 'mix' for both labels. The first two functions take two
        edge labels as parameters, and the 'mix' function takes four
        parameters: a symbolic and a non-symbolic label for each of the two
        edges. Each label is in the form of a 2-D array (n_samples,
        n_features). Each function returns a number as the kernel value.
        Ignored when edges are unlabeled. This argument is designated to the
        conjugate gradient method and fixed-point iterations.
    node_label : string
        Node attribute used as label. The default node label is 'atom'. This
        argument is designated to the conjugate gradient method and
        fixed-point iterations.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
        This argument is designated to the conjugate gradient method and
        fixed-point iterations.
    sub_kernel : string
        Method used to compute the walk kernel. The following choices are
        available:

        'exp' : method based on exponential series.

        'geo' : method based on geometric series.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the random walk kernel
        between two graphs.
    """
    compute_method = compute_method.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]

    eweight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                eweight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or '
                          'integer. Set all weights to 1.\n' % edge_weight)
        except Exception:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the '
                      'edge attributes. Set all weights to 1.\n' % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    # remove graphs with no edges, as no walk can be found in their
    # structures, so the weight matrix between such a graph and itself might
    # be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    if compute_method == 'sylvester':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored.')
        Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs,
                                      verbose=verbose)
    elif compute_method == 'conjugate':
        Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs,
                                      node_kernels, edge_kernels, node_label,
                                      edge_label, eweight, n_jobs,
                                      verbose=verbose)
    elif compute_method == 'fp':
        Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
                               edge_kernels, node_label, edge_label, eweight,
                               n_jobs, verbose=verbose)
    elif compute_method == 'spectral':
        if verbose:
            import warnings
            warnings.warn('All labels are ignored. Only works for undirected '
                          'graphs.')
        Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel,
                                          eweight, n_jobs, verbose=verbose)
    elif compute_method == 'kron':
        # @todo: not implemented yet.
        # for i in range(0, len(Gn)):
        #     for j in range(i, len(Gn)):
        #         Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
        #                                                node_label, edge_label)
        #         Kmatrix[j][i] = Kmatrix[i][j]
        raise NotImplementedError('The "kron" method is not implemented yet.')
    else:
        raise Exception(
            'compute method name incorrect. Available methods: "sylvester", '
            '"conjugate", "fp", "spectral" and "kron".')

    run_time = time.time() - start_time
    if verbose:
        print("\n --- kernel matrix of random walk kernel of size %d built "
              "in %s seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time, idx
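# Usage sketch (illustrative only): random walk kernel via the Sylvester
# equation method, which ignores all labels. The decay weight must be small
# enough for the underlying matrix series to converge; 1e-3 is an arbitrary
# small choice for the example. `idx` maps back to the input list, since
# edgeless graphs are removed.
def _example_randomwalkkernel(graph_list):
    Kmatrix, run_time, idx = randomwalkkernel(graph_list,
                                              compute_method='sylvester',
                                              weight=1e-3,
                                              n_jobs=4)
    return Kmatrix, idx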
def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             parallel='imap_unordered',
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including three items:
        'symb' for symbolic node labels, 'nsymb' for non-symbolic node
        labels, and 'mix' for both labels. The first two functions take two
        node labels as parameters, and the 'mix' function takes four
        parameters: a symbolic and a non-symbolic label for each of the two
        nodes. Each label is in the form of a 2-D array (n_samples,
        n_features). Each function returns a number as the kernel value.
        Ignored when nodes are unlabeled.
    n_jobs : int
        Number of jobs for parallelization.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the shortest-path kernel
        between two graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n No edge weight is specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                if verbose:
                    print('\n Edge weight with name %s is not float or '
                          'integer. Set all weights to 1.\n' % edge_weight)
        except Exception:
            if verbose:
                print('\n Edge weight with name "%s" is not found in the '
                      'edge attributes. Set all weights to 1.\n' % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no shortest path can be found in their
    # structures, so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        if verbose:
            print('\n %d graphs are removed as they don\'t contain edges.\n' %
                  (len_gn - len(Gn)))

    start_time = time.time()

    if parallel == 'imap_unordered':
        pool = Pool(n_jobs)
        # get shortest path graphs of Gn.
        getsp_partial = partial(wrapper_getSPGraph, weight)
        itr = zip(Gn, range(0, len(Gn)))
        if len(Gn) < 100 * n_jobs:
            # use a default-like chunksize when the iterable is short.
            chunksize = int(len(Gn) / n_jobs) + 1
        else:
            chunksize = 100
        if verbose:
            iterator = tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                            desc='getting sp graphs', file=sys.stdout)
        else:
            iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        for i, g in iterator:
            Gn[i] = g
        pool.close()
        pool.join()
    elif parallel is None:
        # @todo: single-core shortest-path extraction is not implemented yet.
        pass

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, glbv=(Gn,),
                n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- shortest path kernel matrix of size %d built in %s "
              "seconds ---" % (len(Gn), run_time))

    return Kmatrix, run_time, idx
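# Note on the returned `idx` (applies to both spkernel variants): edgeless
# graphs are removed before the matrix is built, so `idx` holds the original
# positions of the surviving graphs. A sketch of re-aligning a target list y
# with the kernel matrix (`y` and `my_node_kernels` are assumptions here):
def _example_align_targets(graph_list, y, my_node_kernels):
    Kmatrix, run_time, idx = spkernel(graph_list,
                                      node_kernels=my_node_kernels,
                                      n_jobs=4)
    y_aligned = [y[i] for i in idx]  # targets matching Kmatrix rows/columns
    return Kmatrix, y_aligned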
def weisfeilerlehmankernel(*args,
                           node_label='atom',
                           edge_label='bond_type',
                           height=0,
                           base_kernel='subtree',
                           parallel=None,
                           n_jobs=None,
                           chunksize=None,
                           verbose=True):
    """Compute Weisfeiler-Lehman kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are computed.
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is computed.
    node_label : string
        Node attribute used as label. The default node label is 'atom'.
    edge_label : string
        Edge attribute used as label. The default edge label is 'bond_type'.
    height : int
        Subtree height.
    base_kernel : string
        Base kernel used in each iteration of the WL kernel. Only the default
        'subtree' kernel can be applied for now.
    parallel : None
        Which parallelization method is applied to compute the kernel. No
        parallelization can be applied for now.
    n_jobs : int
        Number of jobs for parallelization. The default is to use all
        computational cores. This argument is only valid when a
        parallelization method is applied and can be ignored for now.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the Weisfeiler-Lehman kernel
        between two graphs.

    Notes
    -----
    This function now supports the WL subtree kernel only.
    """
    # The default base kernel is the subtree kernel. For a user-defined
    # kernel, base_kernel is the name of the base kernel function used in
    # each iteration of the WL kernel. This function returns a Numpy matrix,
    # each element of which is the user-defined Weisfeiler-Lehman kernel
    # between two graphs.

    # pre-process
    base_kernel = base_kernel.lower()
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]  # arrange all graphs in a list
    Gn = [g.copy() for g in Gn]
    ds_attrs = get_dataset_attributes(Gn, attr_names=['node_labeled'],
                                      node_label=node_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')

    start_time = time.time()

    # for WL subtree kernel
    if base_kernel == 'subtree':
        Kmatrix = _wl_kernel_do(Gn, node_label, edge_label, height, parallel,
                                n_jobs, chunksize, verbose)
    # for WL shortest path kernel
    elif base_kernel == 'sp':
        Kmatrix = _wl_spkernel_do(Gn, node_label, edge_label, height)
    # for WL edge kernel
    elif base_kernel == 'edge':
        Kmatrix = _wl_edgekernel_do(Gn, node_label, edge_label, height)
    # for user-defined base kernel
    else:
        Kmatrix = _wl_userkernel_do(Gn, node_label, edge_label, height,
                                    base_kernel)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- Weisfeiler-Lehman %s kernel matrix of size %d built in "
              "%s seconds ---" % (base_kernel, len(Gn), run_time))

    return Kmatrix, run_time
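# Usage sketch (illustrative only): WL subtree kernel with two refinement
# iterations. Height 0 reduces to comparing initial node labels; larger
# heights compare increasingly deep neighborhood subtrees.
def _example_weisfeilerlehmankernel(graph_list):
    Kmatrix, run_time = weisfeilerlehmankernel(graph_list,
                                               height=2,
                                               base_kernel='subtree')
    return Kmatrix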