def main():
    # Load ENZYMES data set
    #  graph_db, classes = dp.read_txt("ENZYMES")
    obj = joblib.load("../sample.graph.jbl")
    graph_db = obj["graph"]
    classes = obj["label"]
    print(graph_db[0])
    print(classes)
    print(len(classes))
    print("*****1")
    # Parameters for the shortest-path base kernel:
    # compute gram matrix: False, normalize gram matrix: False,
    # use discrete labels: 1 (use node labels)
    kernel_parameters_sp = [False, False, 1]

    # Parameters for the Weisfeiler-Lehman base kernel:
    # number of WL iterations: 3, compute gram matrix: False,
    # normalize gram matrix: False, use discrete labels: 1 (use node labels)
    kernel_parameters_wl = [3, False, False, 1]

    # Compute gram matrix for HGK-SP (hashed shortest-path base kernel)
    # 20 is the number of hashing iterations
    gram_matrix = rbk.hash_graph_kernel(graph_db,
                                        sp_exp.shortest_path_kernel,
                                        kernel_parameters_sp,
                                        20,
                                        scale_attributes=True,
                                        lsh_bin_width=1.0,
                                        sigma=1.0)
    # Normalize gram matrix
    gram_matrix = aux.normalize_gram_matrix(gram_matrix)
    print("****1")

    # Compute gram matrix for HGK-WL (hashed Weisfeiler-Lehman base kernel);
    # this overwrites the HGK-SP matrix computed above
    # 20 is the number of hashing iterations
    gram_matrix = rbk.hash_graph_kernel(graph_db,
                                        wl.weisfeiler_lehman_subtree_kernel,
                                        kernel_parameters_wl,
                                        20,
                                        scale_attributes=True,
                                        lsh_bin_width=1.0,
                                        sigma=1.0)

    # Normalize gram matrix
    gram_matrix = aux.normalize_gram_matrix(gram_matrix)

    # Write out LIBSVM matrix
    dp.write_lib_svm(gram_matrix, classes, "gram_matrix")
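
# As an alternative to writing out a LIBSVM file, the normalized gram matrix
# could be evaluated directly with a precomputed-kernel SVM. A minimal sketch
# using standard scikit-learn APIs; the helper name, the C value, and the
# 10-fold setup are illustrative assumptions, not part of the original script.
def evaluate_precomputed_sketch(gram_matrix, classes, folds=10):
    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC

    # With kernel="precomputed" the gram matrix is used directly;
    # cross_val_score slices it consistently for each fold.
    clf = SVC(kernel="precomputed", C=1.0)
    scores = cross_val_score(clf, gram_matrix, classes, cv=folds)
    return scores.mean(), scores.std()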
Example #2
def eval_kernel(kernel, classes, mode, n_reps=10, all_std=True):
    """Evaluates a specific kernel that will be normalized before evaluation.

    Args:
        kernel ([list]): kernel
        classes (list): dataset classes
        mode (string): either LINEAR or KERNEL
        n_reps (int, optional): Number of repetitions. Defaults to 10.
        all_std (bool, optional): Standard deviation?. Defaults to True.

    Returns:
        tuple: evaluation results
    """
    normalized = []
    print(f'Starting normalization of {len(kernel)} elements...')
    for array in kernel:
        if mode == 'LINEAR':
            normalized.append(aux.normalize_feature_vector(array))
        else:
            normalized.append(aux.normalize_gram_matrix(array))
    print(f'Normalization finished, starting {mode} SVM...')
    if mode == 'LINEAR':
        return ke.linear_svm_evaluation(normalized,
                                        classes,
                                        num_repetitions=n_reps,
                                        all_std=all_std)
    return ke.kernel_svm_evaluation(normalized,
                                    classes,
                                    num_repetitions=n_reps,
                                    all_std=all_std)
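
# Minimal usage sketch (hypothetical names): "gram_matrices" would be a list
# of precomputed gram matrices and "classes" the corresponding labels.
#
#   acc = eval_kernel(gram_matrices, classes, mode='KERNEL', n_reps=10)
#   print(acc)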
Example #3
def plot_kpca_nmi_and_clustering(classes):
  ds_name = 'ENZYMES'
  base_path = os.path.join("kernels","node_labels")

  fig, axs = plt.subplots(3,3, figsize=(15,15))
  representations = ["wl3", "graphlet", "shortestpath"]

  for (i, representation) in enumerate(representations): 
    gram = load_csv(os.path.join(base_path,f"{ds_name}_gram_matrix_{representation}.csv"))
    gram = aux.normalize_gram_matrix(gram)

    kpca = KernelPCA(n_components=100, kernel="precomputed")
    reduced_kpca = kpca.fit_transform(gram)
    # fig, ax = plt.subplots(figsize=(5,5))
    axs[0][i].scatter(reduced_kpca[:,0], reduced_kpca[:,1], c=classes, s=1)
    axs[0][i].set_title(f'{representation} KPCA ground truth')
    
    kmeans = KMeans(n_clusters=6).fit(reduced_kpca)
    axs[1][i].scatter(reduced_kpca[:,0], reduced_kpca[:,1], c=kmeans.labels_, s=1)
    axs[1][i].set_title(f'{representation} KPCA KMeans')
    print(f"NMI KMeans {representation}: {nmi(classes, kmeans.labels_)}")

    db = DBSCAN().fit(reduced_kpca)
    axs[2][i].scatter(reduced_kpca[:,0], reduced_kpca[:,1], c=db.labels_, s=1)
    axs[2][i].set_title(f'{representation} KPCA DBSCAN')
    print(f"NMI DBSCAN {representation}: {nmi(classes, db.labels_)}\n")

  plt.show()
Example #4
def hash_graph_kernel(graph_db, base_kernel, kernel_parameters, iterations=20, lsh_bin_width=1.0, sigma=1.0,
                      normalize_gram_matrix=True, use_gram_matrices=False, scale_attributes=True):
    num_vertices = 0
    for g in graph_db:
        num_vertices += g.num_vertices()
    n = len(graph_db)

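    # Determine the dimension of the continuous node attributes (the "na"
    # vertex property) from the first vertex of the first graph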
    g = graph_db[0]
    v = list(graph_db[0].vertices())[0]
    dim_attributes = len(g.vp.na[v])
    colors_0 = np.zeros([num_vertices, dim_attributes])
    offset = 0

    gram_matrix = np.zeros([n, n])

    # Get attributes from all graph instances
    graph_indices = []
    for g in graph_db:
        for i, v in enumerate(g.vertices()):
            colors_0[i + offset] = g.vp.na[v]

        graph_indices.append((offset, offset + g.num_vertices() - 1))
        offset += g.num_vertices()

    # Normalize attributes: center to the mean and component wise scale to unit variance
    if scale_attributes:
        colors_0 = pre.scale(colors_0, axis=0)

    for it in range(0, iterations):
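        # Discretize the continuous attributes into integer colors via locally
        # sensitive hashing; similar attribute vectors are likely to receive
        # the same color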
        colors_hashed = aux.locally_sensitive_hashing(colors_0, dim_attributes, lsh_bin_width, sigma=sigma)

        tmp = base_kernel(graph_db, colors_hashed, *kernel_parameters)

        if it == 0 and not use_gram_matrices:
            feature_vectors = tmp
        else:
            if use_gram_matrices:
                feature_vectors = tmp
                feature_vectors = feature_vectors.tocsr()
                feature_vectors = m.sqrt(1.0 / iterations) * (feature_vectors)
                gram_matrix += feature_vectors.dot(feature_vectors.T).toarray()

            else:
                feature_vectors = sparse.hstack((feature_vectors, tmp))

    feature_vectors = feature_vectors.tocsr()

    if not use_gram_matrices:
        # Normalize feature vectors
        feature_vectors = m.sqrt(1.0 / iterations) * (feature_vectors)
        # Compute Gram matrix
        gram_matrix = feature_vectors.dot(feature_vectors.T)
        gram_matrix = gram_matrix.toarray()

    if normalize_gram_matrix:
        gram_matrix = aux.normalize_gram_matrix(gram_matrix)

    return gram_matrix
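
# A minimal, illustrative sketch of the random-projection binning idea behind
# the locally sensitive hashing step used above. This is an assumption about
# the general technique, not the exact implementation of
# aux.locally_sensitive_hashing (which may differ, e.g. in how sigma and the
# projection are applied).
def lsh_bin_sketch(attributes, bin_width=1.0, sigma=1.0, seed=0):
    import numpy as np

    rng = np.random.default_rng(seed)
    _, dim = attributes.shape
    w = rng.normal(0.0, sigma, size=dim)  # random Gaussian projection direction
    b = rng.uniform(0.0, bin_width)       # random offset within one bin
    # Nearby attribute vectors tend to fall into the same bin and therefore
    # receive the same discrete color.
    return np.floor((attributes @ w + b) / bin_width).astype(np.int64)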
Example #5
def eval_wl(data, classes):
    """Evaluates the gram matrices of WL kernels.

    Args:
        data (list): data
        classes ([list]): classes
    """
    for array in data["gram_matrix"]["wl"]:
        normalized = [aux.normalize_gram_matrix(array)]
        print(
            ke.kernel_svm_evaluation(normalized,
                                     classes,
                                     num_repetitions=10,
                                     all_std=True))
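
# Minimal usage sketch (hypothetical names): the dict layout mirrors the keys
# accessed above, with "wl_matrices" a placeholder list of precomputed WL gram
# matrices.
#
#   eval_wl({"gram_matrix": {"wl": wl_matrices}}, classes)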
Example #6
def weisfeiler_lehman_subtree_kernel(graph_db, hashed_attributes, *kwargs):
    iterations = kwargs[0]
    compute_gram_matrix = kwargs[1]
    normalize_gram_matrix = kwargs[2]
    use_labels = kwargs[3]

    # Create one empty feature vector for each graph
    feature_vectors = []
    for _ in graph_db:
        feature_vectors.append(np.zeros(0, dtype=np.float64))

    # Construct block diagonal matrix of all adjacency matrices
    adjacency_matrices = []
    for g in graph_db:
        adjacency_matrices.append(gt.adjacency(g))
    M = sp.sparse.block_diag(tuple(adjacency_matrices),
                             dtype=np.float64,
                             format="csr")
    num_vertices = M.shape[0]

    # Load list of precalculated logarithms of prime numbers
    log_primes = log_pl.log_primes[0:num_vertices]

    # Color vector representing labels
    colors_0 = np.zeros(num_vertices, dtype=np.float64)
    # Color vector representing hashed attributes
    colors_1 = hashed_attributes

    # Get labels (colors) from all graph instances
    offset = 0
    graph_indices = []

    for g in graph_db:
        if use_labels == 1:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()

        graph_indices.append((offset, offset + g.num_vertices() - 1))
        offset += g.num_vertices()

    # Map labels to [0, number_of_colors)
    if use_labels:
        _, colors_0 = np.unique(colors_0, return_inverse=True)

    for it in range(0, iterations + 1):

        if use_labels:
            # Map colors into a single color vector
            if len(colors_1) > 0:
                colors_all = np.array([colors_0, colors_1])
            else:
                colors_all = np.array([colors_0])
            colors_all = [hash(tuple(row)) for row in colors_all.T]
            _, colors_all = np.unique(colors_all, return_inverse=True)
            max_all = int(np.amax(colors_all) + 1)
            # max_all = int(np.amax(colors_0) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_all[index[0]:index[1] + 1],
                                            minlength=max_all)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_0 = compute_coloring(M, colors_0,
                                            log_primes[0:len(colors_0)])
                if len(colors_1) > 0:
                    colors_1 = compute_coloring(M, colors_1,
                                                log_primes[0:len(colors_1)])
        else:
            max_1 = int(np.amax(colors_1) + 1)

            feature_vectors = [
                np.concatenate((feature_vectors[i],
                                np.bincount(colors_1[index[0]:index[1] + 1],
                                            minlength=max_1)))
                for i, index in enumerate(graph_indices)
            ]

            # Avoid coloring computation in last iteration
            if it < iterations:
                colors_1 = compute_coloring(M, colors_1,
                                            log_primes[0:len(colors_1)])

    if not compute_gram_matrix:
        # Return sparse feature vectors (one row per graph), consistent with
        # the shortest-path kernel below and the .tocsr() calls in
        # hash_graph_kernel
        return lil.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = csr.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)

        gram_matrix = gram_matrix.toarray()

        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
Example #7
def shortest_path_kernel(graph_db, hashed_attributes, *kwargs):
    compute_gram_matrix = kwargs[0]
    normalize_gram_matrix = kwargs[1]
    use_labels = kwargs[2]

    num_vertices = 0
    for g in graph_db:
        num_vertices += g.num_vertices()

    offset = 0
    graph_indices = []
    colors_0 = np.zeros(num_vertices, dtype=np.int64)

    # Get labels (colors) from all graph instances
    offset = 0
    for g in graph_db:
        graph_indices.append((offset, offset + g.num_vertices() - 1))

        if use_labels == 1:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = g.vp.nl[v]
        if use_labels == 2:
            for i, v in enumerate(g.vertices()):
                colors_0[i + offset] = v.out_degree()

        offset += g.num_vertices()
    _, colors_0 = np.unique(colors_0, return_inverse=True)

    colors_1 = hashed_attributes

    triple_indices = []
    triple_offset = 0
    triples = []

    # Solve APSP problem for every graphs in graph data base
    for i, g in enumerate(graph_db):
        a = gt.adjacency(g)
        M = csg.shortest_path(a, method='J', directed=False, unweighted=True)

        index = graph_indices[i]

        if use_labels:
            l = colors_0[index[0]:index[1] + 1]
            h = colors_1[index[0]:index[1] + 1]
        else:
            h = colors_1[index[0]:index[1] + 1]
        d = M.shape[0]

        # For each pair of vertices collect labels, hashed attributes, and shortest-path distance
        pairs = list(it.product(range(d), repeat=2))
        if use_labels:
            t = [
                hash((l[k], h[k], l[j], h[j], M[k][j])) for (k, j) in pairs
                if (k != j or ~np.isinf(M[k][j]))
            ]
        else:
            t = [
                hash((h[k], h[j], M[k][j])) for (k, j) in pairs
                if (k != j or ~np.isinf(M[k][j]))
            ]

        triples.extend(t)

        triple_indices.append((triple_offset, triple_offset + len(t) - 1))
        triple_offset += len(t)

    _, colors = np.unique(triples, return_inverse=True)
    m = np.amax(colors) + 1

    # Compute feature vectors
    feature_vectors = []
    for i, index in enumerate(triple_indices):
        feature_vectors.append(
            np.bincount(colors[index[0]:index[1] + 1], minlength=m))

    if not compute_gram_matrix:
        return lil.lil_matrix(feature_vectors, dtype=np.float64)
    else:
        # Make feature vectors sparse
        gram_matrix = csr.csr_matrix(feature_vectors, dtype=np.float64)
        # Compute gram matrix
        gram_matrix = gram_matrix.dot(gram_matrix.T)

        gram_matrix = gram_matrix.toarray()

        if normalize_gram_matrix:
            return aux.normalize_gram_matrix(gram_matrix)
        else:
            return gram_matrix
Example #8
def main():

    path = "./GM/EXP/"
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["PROTEINS", True], ["PTC_FM", True],
               ["NCI1", True]]
    algorithms = ["LWLC2"]

    for a in algorithms:
        for d, use_labels in dataset:
            gram_matrices = []
            for i in range(0, 10):
                if not pth.exists(path + d + "__" + a + "_" + str(i) +
                                  ".gram"):
                    continue
                else:
                    gram_matrix, _ = read_lib_svm(path + d + "__" + a + "_" +
                                                  str(i) + ".gram")
                    gram_matrix = normalize_gram_matrix(gram_matrix)
                    classes = read_classes(d)
                    gram_matrices.append(gram_matrix)

            if gram_matrices != []:
                acc, acc_train, s_1 = kernel_svm_evaluation(gram_matrices,
                                                            classes,
                                                            num_repetitions=10)
                print(a, d, acc, acc_train, s_1)

    exit()
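    # Note: the exit() above stops the script here; the experiments below run
    # only if it is removed.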

    path = "./GM/EXP/"
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["NCI1", True], ["NCI109", True],
               ["PROTEINS", True], ["PTC_FM", True], ["REDDIT-BINARY", False]]
    algorithms = [
        "WL1", "GR", "SP", "WLOA", "LWL2", "LWLP2", "WL2", "DWL2", "LWL3",
        "LWLP3", "WL3", "DWL3"
    ]

    for a in algorithms:
        for d, use_labels in dataset:
            gram_matrices = []
            for i in range(0, 10):
                if not pth.exists(path + d + "__" + a + "_" + str(i) +
                                  ".gram"):
                    continue
                else:
                    gram_matrix, _ = read_lib_svm(path + d + "__" + a + "_" +
                                                  str(i) + ".gram")
                    gram_matrix = normalize_gram_matrix(gram_matrix)
                    classes = read_classes(d)
                    gram_matrices.append(gram_matrix)

            if gram_matrices != []:
                acc, acc_train, s_1 = kernel_svm_evaluation(gram_matrices,
                                                            classes,
                                                            num_repetitions=10)
                print(a, d, acc, acc_train, s_1)

    path = "./GM/EXPSPARSE/"
    for name in [
            "Yeast", "YeastH", "UACC257", "UACC257H", "OVCAR-8", "OVCAR-8H"
    ]:
        for algorithm in ["LWL2", "LWLP2", "WL"]:

            # Collect feature matrices over all iterations
            all_feature_matrices = []
            classes = read_classes(name)
            for i in range(2, 3):
                # Load feature matrices.
                feature_vector = pd.read_csv(path + name + "__" + algorithm +
                                             "_" + str(i),
                                             header=1,
                                             delimiter=" ").to_numpy()

                feature_vector = feature_vector.astype(int)
                feature_vector[:, 0] = feature_vector[:, 0] - 1
                feature_vector[:, 1] = feature_vector[:, 1] - 1
                feature_vector[:, 2] = feature_vector[:, 2] + 1

                xmax = int(feature_vector[:, 0].max())
                ymax = int(feature_vector[:, 1].max())

                feature_vector = sp.coo_matrix(
                    (feature_vector[:, 2],
                     (feature_vector[:, 0], feature_vector[:, 1])),
                    shape=(xmax + 1, ymax + 1))
                feature_vector = feature_vector.tocsr()

                all_feature_matrices.append(feature_vector)

            acc, s_1 = linear_svm_evaluation(all_feature_matrices,
                                             classes,
                                             num_repetitions=3,
                                             all_std=False)
            print(name, algorithm, acc, s_1)
Example #9
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import kernel_svm_evaluation

# Download dataset.
classes = dp.get_dataset("ENZYMES")
use_labels, use_edge_labels = True, False

all_matrices = []
# Compute 1-WL kernel for 1 to 5 iterations.
for i in range(1, 6):
    # Use node labels and no edge labels.
    gm = kb.compute_wl_1_dense("ENZYMES", i, use_labels, use_edge_labels)
    # Apply cosine normalization.
    gm = aux.normalize_gram_matrix(gm)
    all_matrices.append(gm)

# Perform 10 repetitions of 10-CV using LIBSVM.
print(kernel_svm_evaluation(all_matrices, classes,
                            num_repetitions=10, all_std=True))

Example #10
def main():
    ### Smaller datasets using LIBSVM.
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["NCI1", True], ["PROTEINS", True],
               ["REDDIT-BINARY", False]]

    # Number of repetitions of 10-CV.
    num_reps = 10

    results = []
    for dataset, use_labels in datasets:
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WL1 " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WL1 " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # WLOA kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wloa_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WLOA " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WLOA " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_dense(dataset, use_labels, False)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "GR " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "GR " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_dense(dataset, use_labels)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "SP " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "SP " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    # Number of repetitions of 10-CV.
    num_reps = 3

    ### Larger datasets using LIBLINEAR with edge labels.
    dataset = [["MOLT-4", True, True], ["Yeast", True, True],
               ["MCF-7", True, True], ["github_stargazers", False, False],
               ["reddit_threads", False, False]]

    for d, use_labels, use_edge_labels in datasets:
        dataset = d
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_sparse(dataset, i, use_labels,
                                        use_edge_labels)
            gm_n = aux.normalize_feature_vector(gm)
            all_matrices.append(gm_n)

        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(d + " " + "WL1SP " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "WL1SP " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_sparse(dataset, use_labels, use_edge_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)

        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(d + " " + "GRSP " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GRSP " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_sparse(dataset, use_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)

        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(d + " " + "SPSP " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "SPSP " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

    for r in results:
        print(r)
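
# Note: these example scripts define main() but, as excerpted, do not call it;
# the full files would typically end with a standard entry-point guard, e.g.:
#
#   if __name__ == "__main__":
#       main()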