Python parallel_gm Examples

Programming Language: Python

Namespace/Package Name: pygraph.utils.parallel

Method/Function: parallel_gm

Examples at hotexamples.com: 12

Python parallel_gm - 12 examples found. These are the top rated real world Python examples of pygraph.utils.parallel.parallel_gm extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

def compute_kernel_matrix(Kmatrix, all_num_of_each_label, Gn, parallel, n_jobs,
                          verbose):
    """Fill ``Kmatrix`` in place using the base (subtree) kernel.

    Parameters
    ----------
    Kmatrix : 2-D indexable (e.g. NumPy array)
        Square matrix that is updated in place. In the serial branch the
        current value of ``Kmatrix[i][j]`` is handed to the base kernel and
        then overwritten with the result, which is mirrored to ``[j][i]``.
    all_num_of_each_label : sequence
        One entry per graph, indexed like the rows of ``Kmatrix``
        (presumably the label counts of each graph -- confirm with caller).
    Gn : list
        The graphs; only forwarded to ``parallel_gm``.
    parallel : string/None
        'imap_unordered' delegates the work to ``parallel_gm``; None runs a
        plain double loop. Any other value leaves ``Kmatrix`` untouched.
    n_jobs : int
        Forwarded to ``parallel_gm``.
    verbose : bool
        Forwarded to ``parallel_gm``.

    Return
    ------
    None. The result is written into ``Kmatrix``.
    """
    if parallel == 'imap_unordered':
        # Each worker process stores the shared label data in a module-level
        # global so that it does not have to be shipped with every task.
        def init_worker(alllabels_toshare):
            global G_alllabels
            G_alllabels = alllabels_toshare

        do_partial = partial(wrapper_compute_subtree_kernel, Kmatrix)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(all_num_of_each_label, ),
                    n_jobs=n_jobs,
                    verbose=verbose)
    elif parallel is None:
        size = len(Kmatrix)
        # Only the upper triangle is computed; the kernel is symmetric.
        for i in range(size):
            for j in range(i, size):
                Kmatrix[i][j] = compute_subtree_kernel(
                    all_num_of_each_label[i], all_num_of_each_label[j],
                    Kmatrix[i][j])
                Kmatrix[j][i] = Kmatrix[i][j]

Example #2

Show file

File: commonWalkKernel.py Project: bgauzere/py-graph

def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     verbose=True):
    """Calculate common walk graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    n : integer
        Longest length of walks. Only meaningful for a 'brute' method, which
        this function does not implement, so the value is currently unused.
    weight: integer
        Weight coefficient of different lengths of walks, which represents beta
        in 'exp' method and gamma in 'geo'.
    compute_method : string
        Method used to compute walk kernel (case-insensitive). It has no
        usable default and must be given. The following choices are
        available:
        'exp' : exponential serial method applied on the direct product graph,
        as shown in reference [1]. The time complexity is O(n^6) for graphs
        with n vertices.
        'geo' : geometric serial method applied on the direct product graph, as
        shown in reference [1]. The time complexity is O(n^6) for graphs with n
        vertices.
    n_jobs : int
        Forwarded to ``parallel_gm``.
    verbose : bool
        Whether to print progress messages; also forwarded to
        ``parallel_gm``.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is a common walk kernel between 2
        graphs. Graphs with exactly one node are left out.
    run_time : float
        Seconds spent building the matrix.
    idx : list of int
        Positions (in the input) of the graphs that were kept.

    Raises
    ------
    ValueError
        If ``compute_method`` is not 'exp' or 'geo'.
    """
    # Validate first: previously None failed on ``.lower()`` and any other
    # unsupported value failed later with an unbound ``do_partial``.
    if not isinstance(compute_method, str) or \
            compute_method.lower() not in ('exp', 'geo'):
        raise ValueError(
            "compute_method must be 'exp' or 'geo', got %r" %
            (compute_method, ))
    compute_method = compute_method.lower()

    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    # remove graphs with only 1 node, as they do not have adjacency matrices
    len_gn = len(Gn)
    kept = [(pos, G) for pos, G in enumerate(Gn)
            if nx.number_of_nodes(G) != 1]
    idx = [pos for pos, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn and verbose:
        print('\n %d graphs are removed as they have only 1 node.\n' %
              (len_gn - len(Gn)))

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    # NOTE(review): the default labels below are written onto the caller's
    # graph objects, always under 'atom' / 'bond_type' regardless of
    # node_label / edge_label -- confirm that this is intended.
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # Each worker keeps the graph list in a module-level global.
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    # direct product graph method - exponential or geometric
    if compute_method == 'exp':
        do_partial = partial(wrapper_cw_exp, node_label, edge_label, weight)
    else:
        do_partial = partial(wrapper_cw_geo, node_label, edge_label, weight)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx

Example #3

Show file

def marginalizedkernel(*args,
                       node_label='atom',
                       edge_label='bond_type',
                       p_quit=0.5,
                       n_iteration=20,
                       remove_totters=False,
                       n_jobs=None,
                       verbose=True):
    """Calculate marginalized graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    p_quit : float
        the termination probability in the random walks generating step
    n_iteration : integer
        time of iterations to calculate R_inf
    remove_totters : boolean
        whether to remove totters. The default value is False.
    n_jobs : int
        Number of worker processes. None lets ``multiprocessing.Pool`` pick
        its default.
    verbose : bool
        Whether to show progress bars and the final timing message; also
        forwarded to ``parallel_gm``.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the marginalized kernel between
        2 graphs.
    run_time : float
        Seconds spent building the matrix (including totter removal).
    """
    # pre-process
    n_iteration = int(n_iteration)
    # NOTE(review): for a list input only the list is copied, so the default
    # labels set below land on the caller's graph objects -- confirm intended.
    Gn = args[0][:] if len(args) == 1 else [args[0].copy(), args[1].copy()]

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label, edge_label=edge_label)
    if not ds_attrs['node_labeled'] or node_label is None:
        node_label = 'atom'
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled'] or edge_label is None:
        edge_label = 'bond_type'
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    if remove_totters:
        # ---- use pool.imap_unordered to parallel and track progress. ----
        # Pool(None) picks a worker count itself, but the chunk-size
        # arithmetic needs an actual number (None used to raise TypeError).
        if n_jobs is None:
            import multiprocessing
            n_workers = multiprocessing.cpu_count()
        else:
            n_workers = n_jobs
        if len(Gn) < 100 * n_workers:
            chunksize = int(len(Gn) / n_workers) + 1
        else:
            chunksize = 100

        untotter_partial = partial(wrapper_untotter, Gn, node_label,
                                   edge_label)
        pool = Pool(n_jobs)
        try:
            results = pool.imap_unordered(untotter_partial,
                                          range(0, len(Gn)), chunksize)
            if verbose:
                results = tqdm(results,
                               desc='removing tottering',
                               file=sys.stdout)
            # Results arrive out of order; each carries its own index.
            for i, g in results:
                Gn[i] = g
        finally:
            # Release the worker processes even if a task failed.
            pool.close()
            pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # Each worker keeps the graph list in a module-level global.
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_marg_do, node_label, edge_label,
                         p_quit, n_iteration)
    parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                glbv=(Gn,), n_jobs=n_jobs, verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print("\n --- marginalized kernel matrix of size %d built in %s seconds ---"
              % (len(Gn), run_time))

    return Kmatrix, run_time

Example #4

Show file

File: randomWalkKernel.py Project: bgauzere/py-graph

def _spectral_decomposition(Gn,
                            weight,
                            p,
                            q,
                            sub_kernel,
                            eweight,
                            n_jobs,
                            verbose=True):
    """Calculate walk graph kernels up to n between 2 unlabeled graphs using 
    spectral decomposition method. Labels will be ignored.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    if q == None:
        # precompute the spectral decomposition of each graph.
        P_list = []
        D_list = []
        for G in (tqdm(Gn, desc='spectral decompose', file=sys.stdout)
                  if verbose else Gn):
            # don't normalize adjacency matrices if q is a uniform vector. Note
            # A accually is the transpose of the adjacency matrix.
            A = nx.adjacency_matrix(G, eweight).todense().transpose()
            ew, ev = np.linalg.eig(A)
            D_list.append(ew)
            P_list.append(ev)
#        P_inv_list = [p.T for p in P_list] # @todo: also works for directed graphs?

        if p == None:  # p is uniform distribution as default.
            q_T_list = [
                np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G))
                for G in Gn
            ]

            #            q_T_list = [q.T for q in q_list]
            def init_worker(q_T_toshare, P_toshare, D_toshare):
                global G_q_T, G_P, G_D
                G_q_T = q_T_toshare
                G_P = P_toshare
                G_D = D_toshare

            do_partial = partial(wrapper_sd_do, weight, sub_kernel)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(q_T_list, P_list, D_list),
                        n_jobs=n_jobs,
                        verbose=verbose)


#            pbar = tqdm(
#                total=(1 + len(Gn)) * len(Gn) / 2,
#                desc='calculating kernels',
#                file=sys.stdout)
#            for i in range(0, len(Gn)):
#                for j in range(i, len(Gn)):
#                    result = _sd_do(q_T_list[i], q_T_list[j], P_list[i], P_list[j],
#                                    D_list[i], D_list[j], weight, sub_kernel)
#                    Kmatrix[i][j] = result
#                    Kmatrix[j][i] = Kmatrix[i][j]
#                    pbar.update(1)
    return Kmatrix

Example #5

Show file

File: spKernel.py Project: vishalbelsare/py-graph

def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             verbose=True):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns an
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : int
        Number of jobs for parallelization. None lets
        ``multiprocessing.Pool`` pick its default.
    verbose : bool
        Whether to print messages and show progress bars; also forwarded to
        ``parallel_gm``.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
        Graphs without edges are left out.
    run_time : float
        Seconds spent building the shortest-path graphs and the matrix.
    idx : list of int
        Positions (in the input) of the graphs that were kept.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        try:
            # Probe one edge of the first graph for the attribute's type.
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            # Either there is no graph at all or no edge of the first graph
            # carries the attribute.
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    kept = [(pos, G) for pos, G in enumerate(Gn)
            if nx.number_of_edges(G) != 0]
    idx = [pos for pos, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn and verbose:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    # Pool(None) picks a worker count itself, but the chunk-size arithmetic
    # needs an actual number (the default None used to raise TypeError).
    if n_jobs is None:
        import multiprocessing
        n_workers = multiprocessing.cpu_count()
    else:
        n_workers = n_jobs
    if len(Gn) < 100 * n_workers:
        chunksize = int(len(Gn) / n_workers) + 1
    else:
        chunksize = 100

    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    pool = Pool(n_jobs)
    try:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        if verbose:
            iterator = tqdm(iterator,
                            desc='getting sp graphs',
                            file=sys.stdout)
        # Results arrive out of order; each carries its own index.
        for i, g in iterator:
            Gn[i] = g
    finally:
        # Release the worker processes even if a task failed.
        pool.close()
        pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # Each worker keeps the graph list in a module-level global.
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time, idx

Example #6

Show file

File: randomWalkKernel.py Project: bgauzere/py-graph

def _conjugate_gradient(Gn,
                        lmda,
                        p,
                        q,
                        ds_attrs,
                        node_kernels,
                        edge_kernels,
                        node_label,
                        edge_label,
                        eweight,
                        n_jobs,
                        verbose=True):
    """Calculate walk graph kernels up to n between 2 graphs using conjugate method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    #    if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
    #        not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] < 1:
    #        # this is faster from unlabeled graphs. @todo: why?
    #        if q == None:
    #            # don't normalize adjacency matrices if q is a uniform vector. Note
    #            # A_wave_list accually contains the transposes of the adjacency matrices.
    #            A_wave_list = [
    #                nx.adjacency_matrix(G, eweight).todense().transpose() for G in
    #                    tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
    #            ]
    #            if p == None: # p is uniform distribution as default.
    #                def init_worker(Awl_toshare):
    #                    global G_Awl
    #                    G_Awl = Awl_toshare
    #                do_partial = partial(wrapper_cg_unlabled_do, lmda)
    #                parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
    #                            glbv=(A_wave_list,), n_jobs=n_jobs)
    #    else:
    # reindex nodes using consecutive integers for convenience of kernel calculation.
    Gn = [
        nx.convert_node_labels_to_integers(g,
                                           first_label=0,
                                           label_attribute='label_orignal')
        for g in
        (tqdm(Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)
    ]

    if p == None and q == None:  # p and q are uniform distributions as default.

        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(Gn, ),
                    n_jobs=n_jobs,
                    verbose=verbose)


#            pbar = tqdm(
#                total=(1 + len(Gn)) * len(Gn) / 2,
#                desc='calculating kernels',
#                file=sys.stdout)
#            for i in range(0, len(Gn)):
#                for j in range(i, len(Gn)):
#                    result = _cg_labled_do(Gn[i], Gn[j], ds_attrs, node_kernels,
#                                           node_label, edge_kernels, edge_label, lmda)
#                    Kmatrix[i][j] = result
#                    Kmatrix[j][i] = Kmatrix[i][j]
#                    pbar.update(1)
    return Kmatrix

Example #7

Show file

File: randomWalkKernel.py Project: bgauzere/py-graph

def _fixed_point(Gn,
                 lmda,
                 p,
                 q,
                 ds_attrs,
                 node_kernels,
                 edge_kernels,
                 node_label,
                 edge_label,
                 eweight,
                 n_jobs,
                 verbose=True):
    """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    #    if not ds_attrs['node_labeled'] and ds_attrs['node_attr_dim'] < 1 and \
    #        not ds_attrs['edge_labeled'] and ds_attrs['edge_attr_dim'] > 1:
    #        # this is faster from unlabeled graphs. @todo: why?
    #        if q == None:
    #            # don't normalize adjacency matrices if q is a uniform vector. Note
    #            # A_wave_list accually contains the transposes of the adjacency matrices.
    #            A_wave_list = [
    #                nx.adjacency_matrix(G, eweight).todense().transpose() for G in
    #                    tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout)
    #            ]
    #            if p == None: # p is uniform distribution as default.
    #                pbar = tqdm(
    #                    total=(1 + len(Gn)) * len(Gn) / 2,
    #                    desc='calculating kernels',
    #                    file=sys.stdout)
    #                for i in range(0, len(Gn)):
    #                    for j in range(i, len(Gn)):
    #                        # use uniform distribution if there is no prior knowledge.
    #                        nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
    #                        p_times_uni = 1 / nb_pd
    #                        w_times = kron(A_wave_list[i], A_wave_list[j]).todense()
    #                        p_times = np.full((nb_pd, 1), p_times_uni)
    #                        x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times))
    #                        # use uniform distribution if there is no prior knowledge.
    #                        q_times = np.full((1, nb_pd), p_times_uni)
    #                        Kmatrix[i][j] = np.dot(q_times, x)
    #                        Kmatrix[j][i] = Kmatrix[i][j]
    #                        pbar.update(1)
    #    else:
    # reindex nodes using consecutive integers for convenience of kernel calculation.
    Gn = [
        nx.convert_node_labels_to_integers(g,
                                           first_label=0,
                                           label_attribute='label_orignal')
        for g in
        (tqdm(Gn, desc='reindex vertices', file=sys.stdout) if verbose else Gn)
    ]

    if p == None and q == None:  # p and q are uniform distributions as default.

        def init_worker(gn_toshare):
            global G_gn
            G_gn = gn_toshare

        do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(Gn, ),
                    n_jobs=n_jobs,
                    verbose=verbose)
    return Kmatrix

Example #8

Show file

File: randomWalkKernel.py Project: bgauzere/py-graph

def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs, verbose=True):
    """Calculate walk graph kernels up to n between 2 graphs using Sylvester method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))

    if q == None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # A_wave_list accually contains the transposes of the adjacency matrices.
        A_wave_list = [
            nx.adjacency_matrix(G, eweight).todense().transpose() for G in (
                tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout
                     ) if verbose else Gn)
        ]
        #        # normalized adjacency matrices
        #        A_wave_list = []
        #        for G in tqdm(Gn, desc='compute adjacency matrices', file=sys.stdout):
        #            A_tilde = nx.adjacency_matrix(G, eweight).todense().transpose()
        #            norm = A_tilde.sum(axis=0)
        #            norm[norm == 0] = 1
        #            A_wave_list.append(A_tilde / norm)
        if p == None:  # p is uniform distribution as default.

            def init_worker(Awl_toshare):
                global G_Awl
                G_Awl = Awl_toshare

            do_partial = partial(wrapper_se_do, lmda)
            parallel_gm(do_partial,
                        Kmatrix,
                        Gn,
                        init_worker=init_worker,
                        glbv=(A_wave_list, ),
                        n_jobs=n_jobs,
                        verbose=verbose)


#            pbar = tqdm(
#                total=(1 + len(Gn)) * len(Gn) / 2,
#                desc='calculating kernels',
#                file=sys.stdout)
#            for i in range(0, len(Gn)):
#                for j in range(i, len(Gn)):
#                    S = lmda * A_wave_list[j]
#                    T_t = A_wave_list[i]
#                    # use uniform distribution if there is no prior knowledge.
#                    nb_pd = len(A_wave_list[i]) * len(A_wave_list[j])
#                    p_times_uni = 1 / nb_pd
#                    M0 = np.full((len(A_wave_list[j]), len(A_wave_list[i])), p_times_uni)
#                    X = dlyap(S, T_t, M0)
#                    X = np.reshape(X, (-1, 1), order='F')
#                    # use uniform distribution if there is no prior knowledge.
#                    q_times = np.full((1, nb_pd), p_times_uni)
#                    Kmatrix[i][j] = np.dot(q_times, X)
#                    Kmatrix[j][i] = Kmatrix[i][j]
#                    pbar.update(1)

    return Kmatrix

Example #9

Show file

def treeletkernel(*args,
                  sub_kernel,
                  node_label='atom',
                  edge_label='bond_type',
                  parallel='imap_unordered',
                  n_jobs=None,
                  verbose=True):
    """Calculate treelet graph kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    sub_kernel : function
        The sub-kernel between 2 real number vectors. Each vector counts the
        numbers of isomorphic treelets in a graph.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    parallel : string/None
        Which parallelization method is applied to compute the kernel. The
        following choices are available:
        'imap_unordered': use Python's multiprocessing.Pool.imap_unordered
        method.
        None: no parallelization is applied.
    n_jobs : int
        Number of jobs for parallelization. The default (None) uses all
        computational cores. This argument is only used when one of the
        parallelization methods is applied.
    verbose : boolean
        Whether to show progress bars and print the run time.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the treelet kernel between 2
        graphs.
    run_time : float
        Seconds spent computing canonical keys and kernels (pre-processing
        of the graphs is not included).

    Raises
    ------
    Exception
        If `parallel` is neither 'imap_unordered' nor None.
    """
    import os  # local import: only used to size chunks when n_jobs is None.

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]  # work on copies; labels may be added below.
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    labeled = False
    if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
        labeled = True
        # When only one kind of label exists, fill the other with a dummy
        # label.
        # NOTE(review): the dummy labels are written under the hard-coded
        # names 'atom' / 'bond_type' rather than node_label / edge_label;
        # confirm this is intended when non-default label names are used.
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    if parallel == 'imap_unordered':
        # get all canonical keys of all graphs before calculating kernels to save
        # time, but this may cost a lot of memory for large dataset.
        # Pool(None) already falls back to the CPU count; mirror that here so
        # the chunk size arithmetic does not fail on the default n_jobs=None.
        n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
        if len(Gn) < 100 * n_workers:
            chunksize = len(Gn) // n_workers + 1
        else:
            chunksize = 100
        itr = zip(Gn, range(0, len(Gn)))
        canonkeys = [[] for _ in range(len(Gn))]
        get_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
                              labeled, ds_attrs['is_directed'])
        # The context manager releases the workers even if a task raises.
        with Pool(n_jobs) as pool:
            iterator = pool.imap_unordered(get_partial, itr, chunksize)
            if verbose:
                iterator = tqdm(iterator,
                                desc='getting canonkeys',
                                file=sys.stdout)
            # results arrive in arbitrary order; each carries its own index.
            for i, ck in iterator:
                canonkeys[i] = ck

        # compute kernels.
        def init_worker(canonkeys_toshare):
            global G_canonkeys
            G_canonkeys = canonkeys_toshare

        do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
        parallel_gm(do_partial,
                    Kmatrix,
                    Gn,
                    init_worker=init_worker,
                    glbv=(canonkeys, ),
                    n_jobs=n_jobs,
                    verbose=verbose)

    # ---- do not use parallelization. ----
    elif parallel is None:
        # get all canonical keys of all graphs before calculating kernels to save
        # time, but this may cost a lot of memory for large dataset.
        canonkeys = [
            get_canonkeys(g, node_label, edge_label, labeled,
                          ds_attrs['is_directed'])
            for g in (tqdm(Gn, desc='getting canonkeys', file=sys.stdout)
                      if verbose else Gn)
        ]

        # compute kernels (upper triangle only, mirrored into the lower one).
        from itertools import combinations_with_replacement
        itr = combinations_with_replacement(range(0, len(Gn)), 2)
        for i, j in (tqdm(itr, desc='calculating kernels', file=sys.stdout)
                     if verbose else itr):
            Kmatrix[i][j] = _treeletkernel_do(canonkeys[i], canonkeys[j],
                                              sub_kernel)
            Kmatrix[j][i] = Kmatrix[i][
                j]  # @todo: no directed graph considered?

    else:
        raise Exception('No proper parallelization method designated.')

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- treelet kernel matrix of size %d built in %s seconds ---" %
            (len(Gn), run_time))

    return Kmatrix, run_time

Example #10

Show file

File: untilHPathKernel.py Project: vishalbelsare/py-graph

def untilhpathkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     depth=10,
                     k_func='MinMax',
                     compute_method='trie',
                     n_jobs=None,
                     verbose=True):
    """Calculate path graph kernels up to depth/height h between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        Two graphs between which the kernel is calculated.
    node_label : string
        Node attribute used as label. The default node label is atom.
    edge_label : string
        Edge attribute used as label. The default edge label is bond_type.
    depth : integer
        Depth of search. Longest length of paths.
    k_func : string/None
        A kernel function applied using different notions of fingerprint
        similarity, defining the type of feature map and normalization method
        applied for the graph kernel. The following choices are available:
        'MinMax': use the MiniMax kernel and counting feature map.
        'tanimoto': use the Tanimoto kernel and binary feature map.
        None: no sub-kernel is used, the kernel is computed directly.
    compute_method : string
        Computation method to store paths and compute the graph kernel. The
        following choices are available:
        'trie': store paths as tries.
        'naive': store paths to lists.
        Any value other than 'trie' is treated as 'naive'.
    n_jobs : int
        Number of jobs for parallelization. The default (None) uses all
        computational cores.
    verbose : boolean
        Whether to show progress bars and print the run time.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the path kernel up to h between
        2 graphs.
    run_time : float
        Seconds spent extracting paths and computing kernels (pre-processing
        of the graphs is not included).
    """
    import os  # local import: only used to size chunks when n_jobs is None.

    # pre-process
    depth = int(depth)
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Gn = [g.copy() for g in Gn]  # work on copies; labels may be added below.
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)
    use_sub_kernel = k_func is not None
    use_trie = compute_method == 'trie'
    if use_sub_kernel:
        # NOTE(review): the dummy labels are written under the hard-coded
        # names 'atom' / 'bond_type' rather than node_label / edge_label;
        # confirm this is intended when non-default label names are used.
        if not ds_attrs['node_labeled']:
            for G in Gn:
                nx.set_node_attributes(G, '0', 'atom')
        if not ds_attrs['edge_labeled']:
            for G in Gn:
                nx.set_edge_attributes(G, '0', 'bond_type')

    start_time = time.time()

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # get all paths of all graphs before calculating kernels to save time,
    # but this may cost a lot of memory for large datasets.
    # Pool(None) already falls back to the CPU count; mirror that here so the
    # chunk size arithmetic does not fail on the default n_jobs=None.
    n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
    if len(Gn) < 100 * n_workers:
        chunksize = len(Gn) // n_workers + 1
    else:
        chunksize = 100
    itr = zip(Gn, range(0, len(Gn)))
    all_paths = [[] for _ in range(len(Gn))]
    if use_trie and use_sub_kernel:
        getps_partial = partial(wrapper_find_all_path_as_trie, depth, ds_attrs,
                                node_label, edge_label)
    elif use_sub_kernel:
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label, True)
    else:
        getps_partial = partial(wrapper_find_all_paths_until_length, depth,
                                ds_attrs, node_label, edge_label, False)
    # The context manager releases the workers even if a task raises.
    with Pool(n_jobs) as pool:
        iterator = pool.imap_unordered(getps_partial, itr, chunksize)
        if verbose:
            iterator = tqdm(iterator, desc='getting paths', file=sys.stdout)
        # results arrive in arbitrary order; each carries its own index.
        for i, ps in iterator:
            all_paths[i] = ps

    # Pick the worker initializer and the pairwise kernel wrapper; the
    # initializer publishes the shared paths as a module-level global.
    if use_trie and use_sub_kernel:

        def init_worker(trie_toshare):
            global G_trie
            G_trie = trie_toshare

        do_partial = partial(wrapper_uhpath_do_trie, k_func)
    elif use_sub_kernel:

        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        do_partial = partial(wrapper_uhpath_do_naive, k_func)
    else:

        def init_worker(plist_toshare):
            global G_plist
            G_plist = plist_toshare

        # NOTE(review): `edge_kernels` is not defined in this function; unless
        # it exists at module scope this branch (k_func=None) raises
        # NameError. Left unchanged because the expected arguments of
        # wrapper_uhpath_do_kernelless are not visible here -- TODO confirm.
        do_partial = partial(wrapper_uhpath_do_kernelless, ds_attrs,
                             edge_kernels)

    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(all_paths, ),
                n_jobs=n_jobs,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- kernel matrix of path kernel up to %d of size %d built in %s seconds ---"
            % (depth, len(Gn), run_time))

    return Kmatrix, run_time

Example #11

Show file

def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       compute_method='naive',
                       n_jobs=None,
                       verbose=True):
    """Calculate mean average structural shortest path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. If it is None,
        missing on the first graph, or not a float/int there, no weight
        attribute is used.
    edge_label : string
        edge attribute used as label. The default edge label is bond_type.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when nodes are unlabeled.
    edge_kernels: dict
        A dictionary of kernel functions for edges, including 3 items: 'symb'
        for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
        for both labels. The first 2 functions take two edge labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two edges. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a number
        as the kernel value. Ignored when edges are unlabeled.
    compute_method : string
        'trie' selects the trie-based wrappers; any other value selects the
        naive ones.
    n_jobs : int
        Number of jobs for parallelization. The default (None) uses all
        computational cores.
    verbose : boolean
        Whether to show progress bars and print messages.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the mean average structural
        shortest path kernel between 2 graphs.
    run_time : float
        Seconds spent computing shortest paths and kernels (pre-processing
        is not included).
    """
    import os  # local import: only used to size chunks when n_jobs is None.

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        if verbose:
            print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        # Probe the first edge of the first graph to validate the weight
        # attribute. An empty attribute dict (or an empty Gn) surfaces as
        # IndexError; other exceptions are no longer swallowed.
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            if verbose:
                print(
                    '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                    % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            elif verbose:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(Gn,
                                      attr_names=[
                                          'node_labeled', 'node_attr_dim',
                                          'edge_labeled', 'edge_attr_dim',
                                          'is_directed'
                                      ],
                                      node_label=node_label,
                                      edge_label=edge_label)

    start_time = time.time()
    use_trie = compute_method == 'trie'

    # get shortest paths of each graph in Gn
    splist = [None] * len(Gn)
    # Pool(None) already falls back to the CPU count; mirror that here so the
    # chunk size arithmetic does not fail on the default n_jobs=None.
    n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
    if len(Gn) < 100 * n_workers:
        chunksize = len(Gn) // n_workers + 1
    else:
        chunksize = 100
    itr = zip(Gn, range(0, len(Gn)))
    if use_trie:
        getsp_partial = partial(wrapper_getSP_trie, weight,
                                ds_attrs['is_directed'])
    else:
        getsp_partial = partial(wrapper_getSP_naive, weight,
                                ds_attrs['is_directed'])
    # The context manager releases the workers even if a task raises.
    with Pool(n_jobs) as pool:
        iterator = pool.imap_unordered(getsp_partial, itr, chunksize)
        if verbose:
            iterator = tqdm(iterator,
                            desc='getting shortest paths',
                            file=sys.stdout)
        # results arrive in arbitrary order; each carries its own index.
        for i, sp in iterator:
            splist[i] = sp

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # The initializer publishes the shortest paths and the graphs as
    # module-level globals.
    def init_worker(spl_toshare, gs_toshare):
        global G_spl, G_gs
        G_spl = spl_toshare
        G_gs = gs_toshare

    # Both computation methods take the same arguments; only the wrapper
    # differs.
    ssp_wrapper = wrapper_ssp_do_trie if use_trie else wrapper_ssp_do
    do_partial = partial(ssp_wrapper, ds_attrs, node_label, edge_label,
                         node_kernels, edge_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(splist, Gn),
                n_jobs=n_jobs,
                verbose=verbose)

    run_time = time.time() - start_time
    if verbose:
        print(
            "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
            % (len(Gn), run_time))

    return Kmatrix, run_time

Example #12

Show file

def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label. The default node label is atom.
    edge_weight : string
        Edge attribute name corresponding to the edge weight. If it is None,
        missing on the first graph, or not a float/int there, no weight
        attribute is used.
    node_kernels: dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
        for both labels. The first 2 functions take two node labels as
        parameters, and the 'mix' function takes 4 parameters, a symbolic and a
        non-symbolic label for each the two nodes. Each label is in form of 2-D
        dimension array (n_samples, n_features). Each function returns a
        number as the kernel value. Ignored when nodes are unlabeled.
    n_jobs : int
        Number of jobs for parallelization. The default (None) uses all
        computational cores.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2
        graphs. Graphs without edges are removed first, so its size is the
        number of remaining graphs.
    run_time : float
        Seconds spent computing shortest-path graphs and kernels
        (pre-processing is not included).
    idx : list of int
        Positions, in the input list, of the graphs that were kept.
    """
    import os  # local import: only used to size chunks when n_jobs is None.

    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        print('\n None edge weight specified. Set all weight to 1.\n')
    else:
        # Probe the first edge of the first graph to validate the weight
        # attribute. An empty attribute dict (or an empty Gn) surfaces as
        # IndexError; other exceptions are no longer swallowed.
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
        except IndexError:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
                % edge_weight)
        else:
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
                    % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)
    # node_attr_dim is forced to 0 whatever the dataset reports.
    ds_attrs['node_attr_dim'] = 0

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    kept = [(pos, G) for pos, G in enumerate(Gn)
            if nx.number_of_edges(G) != 0]
    idx = [pos for pos, _ in kept]
    Gn = [G for _, G in kept]
    if len(Gn) != len_gn:
        print("\n %d graphs are removed as they don't contain edges.\n" %
              (len_gn - len(Gn)))

    start_time = time.time()

    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    # Pool(None) already falls back to the CPU count; mirror that here so the
    # chunk size arithmetic does not fail on the default n_jobs=None.
    n_workers = n_jobs if n_jobs is not None else (os.cpu_count() or 1)
    if len(Gn) < 100 * n_workers:
        chunksize = len(Gn) // n_workers + 1
    else:
        chunksize = 100
    # The context manager releases the workers even if a task raises.
    with Pool(n_jobs) as pool:
        # results arrive in arbitrary order; each carries its own index and
        # replaces the corresponding graph in the (local) list.
        for i, g in tqdm(pool.imap_unordered(getsp_partial, itr, chunksize),
                         desc='getting sp graphs',
                         file=sys.stdout):
            Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallel and track progress. ----
    # The initializer publishes the shortest-path graphs as a module-level
    # global.
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare

    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    parallel_gm(do_partial,
                Kmatrix,
                Gn,
                init_worker=init_worker,
                glbv=(Gn, ),
                n_jobs=n_jobs)

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx