Example 1
def eval_kernel(kernel, classes, mode, n_reps=10, all_std=True):
    """Evaluates a specific kernel that will be normalized before evaluation.

    Args:
        kernel ([list]): kernel
        classes (list): dataset classes
        mode (string): either LINEAR or KERNEL
        n_reps (int, optional): Number of repetitions. Defaults to 10.
        all_std (bool, optional): Standard deviation?. Defaults to True.

    Returns:
        tuple: evaluation results
    """
    normalized = []
    print(f'Starting normalization of {len(kernel)} elements...')
    for array in kernel:
        if mode == 'LINEAR':
            normalized.append(aux.normalize_feature_vector(array))
        else:
            normalized.append(aux.normalize_gram_matrix(array))
    print(f'Normalization finished, starting {mode} SVM...')
    if mode == 'LINEAR':
        return ke.linear_svm_evaluation(normalized,
                                        classes,
                                        num_repetitions=n_reps,
                                        all_std=all_std)
    return ke.kernel_svm_evaluation(normalized,
                                    classes,
                                    num_repetitions=n_reps,
                                    all_std=all_std)
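
A hedged usage sketch for eval_kernel, assuming the auxiliarymethods and kernel_baselines modules used in the later examples; the call pattern and the three-value return with all_std=True follow Example 5:

import auxiliarymethods.datasets as dp
import kernel_baselines as kb

# Hypothetical usage: 1-WL Gram matrices for 1 to 5 iterations on ENZYMES.
classes = dp.get_dataset("ENZYMES")
kernel = [kb.compute_wl_1_dense("ENZYMES", i, True, False)
          for i in range(1, 6)]

# eval_kernel normalizes each matrix, then runs the kernel SVM evaluation.
acc, s_1, s_2 = eval_kernel(kernel, classes, mode='KERNEL', n_reps=10)
print(acc, s_1, s_2)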
Example 2
def run(with_install=True):
    if with_install:
        install_dependencies()
    base_path = os.path.join("kernels", "node_labels")
    ds_name = "ENZYMES"
    classes = dp.get_dataset(ds_name)
    G = tud_to_networkx(ds_name)
    print(f"Number of graphs in data set is {len(G)}")
    print(f"Number of classes {len(set(classes.tolist()))}")

    labels = get_labels(G)
    graph_dict = get_graph_dict(G, classes)

    print_graph_information(graph_dict)

    visualize(graph_dict[6][7])
    print(graph_dict[6][7].number_of_nodes())
    data = load_data()

    eval_wl(data, classes)

    max_nodes = max(g.number_of_nodes() for g in G)
    # Build per-node degree vectors; lil_matrix supports efficient
    # incremental assignment, then convert to CSR.
    histograms = lil_matrix((len(G), max_nodes))
    for i, g in enumerate(G):
        for n, d in g.degree():
            histograms[i, n] = d
    histograms = histograms.tocsr()

    histogram_gram = histograms @ histograms.T

    centrality = lil_matrix((len(G), max_nodes))
    for i, g in enumerate(G):
        for n, d in nx.degree_centrality(g).items():
            centrality[i, n] = d
    centrality = centrality.tocsr()

    centrality_gram = centrality @ centrality.T
    val = data["vectors"]["wl"][2].T.dot(histograms)
    print(val.shape)
    normalized = [aux.normalize_feature_vector(val)]
    print(normalized[0].shape)
    print(
        ke.linear_svm_evaluation(normalized,
                                 classes,
                                 num_repetitions=10,
                                 all_std=True))
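
For context, the degree-based Gram matrices above are just linear kernels on explicit feature vectors; a self-contained toy sketch of the same construction (the toy graphs are illustrative, not from the dataset):

import networkx as nx
from scipy.sparse import lil_matrix

# Three toy graphs stand in for the dataset.
graphs = [nx.path_graph(4), nx.cycle_graph(5), nx.star_graph(3)]
max_nodes = max(g.number_of_nodes() for g in graphs)

# Per-node degree vectors, indexed by node id as in the example above.
H = lil_matrix((len(graphs), max_nodes))
for i, g in enumerate(graphs):
    for n, d in g.degree():
        H[i, n] = d
H = H.tocsr()

# Explicit feature maps give the Gram matrix as a simple product:
# K[i, j] = <phi(G_i), phi(G_j)>.
K = H @ H.T
print(K.toarray())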
Example 3
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import linear_svm_evaluation

# Download dataset.
dataset = "MOLT-4"
classes = dp.get_dataset(dataset)
use_labels, use_edge_labels = True, True

all_matrices = []
# Compute 1-WL kernel for 1 to 5 iterations.
for i in range(1, 6):
    # Use node labels and edge labels.
    gm = kb.compute_wl_1_sparse(dataset, i, use_labels, use_edge_labels)
    # Apply l2 normalization.
    gm_n = aux.normalize_feature_vector(gm)
    all_matrices.append(gm_n)

# Perform 10 repetitions of 10-CV using LIBLINEAR.
print(
    linear_svm_evaluation(all_matrices,
                          classes,
                          num_repetitions=10,
                          all_std=True))
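
normalize_feature_vector is a repository helper; a plausible sketch of row-wise l2 normalization for sparse feature matrices (an assumption about its behavior, not its verified source):

from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

def normalize_feature_vector_sketch(X: csr_matrix) -> csr_matrix:
    # Scale each row (one graph's feature vector) to unit l2 norm.
    return normalize(X, norm="l2", axis=1)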
Example 4
def main():
    # Assumes the repository's usual imports: os.path as pth, pandas as pd,
    # scipy.sparse as sp, plus the helpers read_lib_svm, read_classes,
    # normalize_gram_matrix, kernel_svm_evaluation, and linear_svm_evaluation.
    path = "./GM/EXP/"
    datasets = [["ENZYMES", True], ["IMDB-BINARY", False],
                ["IMDB-MULTI", False], ["PROTEINS", True], ["PTC_FM", True],
                ["NCI1", True]]
    algorithms = ["LWLC2"]

    for a in algorithms:
        for d, use_labels in datasets:
            classes = read_classes(d)
            gram_matrices = []
            for i in range(0, 10):
                gram_file = path + d + "__" + a + "_" + str(i) + ".gram"
                if not pth.exists(gram_file):
                    continue
                gram_matrix, _ = read_lib_svm(gram_file)
                gram_matrix = normalize_gram_matrix(gram_matrix)
                gram_matrices.append(gram_matrix)

            if gram_matrices:
                acc, acc_train, s_1 = kernel_svm_evaluation(gram_matrices,
                                                            classes,
                                                            num_repetitions=10)
                print(a, d, acc, acc_train, s_1)

    # NOTE: early exit; everything below this line is currently skipped.
    exit()

    path = "./GM/EXP/"
    datasets = [["ENZYMES", True], ["IMDB-BINARY", False],
                ["IMDB-MULTI", False], ["NCI1", True], ["NCI109", True],
                ["PROTEINS", True], ["PTC_FM", True], ["REDDIT-BINARY", False]]
    algorithms = [
        "WL1", "GR", "SP", "WLOA", "LWL2", "LWLP2", "WL2", "DWL2", "LWL3",
        "LWLP3", "WL3", "DWL3"
    ]

    for a in algorithms:
        for d, use_labels in datasets:
            classes = read_classes(d)
            gram_matrices = []
            for i in range(0, 10):
                gram_file = path + d + "__" + a + "_" + str(i) + ".gram"
                if not pth.exists(gram_file):
                    continue
                gram_matrix, _ = read_lib_svm(gram_file)
                gram_matrix = normalize_gram_matrix(gram_matrix)
                gram_matrices.append(gram_matrix)

            if gram_matrices:
                acc, acc_train, s_1 = kernel_svm_evaluation(gram_matrices,
                                                            classes,
                                                            num_repetitions=10)
                print(a, d, acc, acc_train, s_1)

    path = "./GM/EXPSPARSE/"
    for name in [
            "Yeast", "YeastH", "UACC257", "UACC257H", "OVCAR-8", "OVCAR-8H"
    ]:
        for algorithm in ["LWL2", "LWLP2", "WL"]:

            # Collect feature matrices (here only for iteration i = 2).
            all_feature_matrices = []
            classes = read_classes(name)
            for i in range(2, 3):
                # Load the sparse feature matrix stored as
                # (row, column, value) triplets.
                feature_vector = pd.read_csv(path + name + "__" + algorithm +
                                             "_" + str(i),
                                             header=1,
                                             delimiter=" ").to_numpy()

                # Shift row and column indices to 0-based; offset values by one.
                feature_vector = feature_vector.astype(int)
                feature_vector[:, 0] = feature_vector[:, 0] - 1
                feature_vector[:, 1] = feature_vector[:, 1] - 1
                feature_vector[:, 2] = feature_vector[:, 2] + 1

                xmax = int(feature_vector[:, 0].max())
                ymax = int(feature_vector[:, 1].max())

                # Assemble a CSR matrix from the triplets.
                feature_vector = sp.coo_matrix(
                    (feature_vector[:, 2],
                     (feature_vector[:, 0], feature_vector[:, 1])),
                    shape=(xmax + 1, ymax + 1)).tocsr()

                all_feature_matrices.append(feature_vector)

            acc, s_1 = linear_svm_evaluation(all_feature_matrices,
                                             classes,
                                             num_repetitions=3,
                                             all_std=False)
            print(name, algorithm, acc, s_1)
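
normalize_gram_matrix is likewise a repository helper; cosine normalization is the standard choice for graph kernels, so a plausible sketch (again an assumption, not the verified source) is:

import numpy as np

def normalize_gram_matrix_sketch(K: np.ndarray) -> np.ndarray:
    # Cosine-normalize: K'[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]).
    d = np.sqrt(np.diag(K))
    # Guard against zero diagonal entries (e.g., empty graphs).
    d[d == 0] = 1.0
    return K / np.outer(d, d)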
Example 5
def main():
    # Assumes the imports from Example 3 plus kernel_svm_evaluation.
    ### Smaller datasets using LIBSVM.
    datasets = [["ENZYMES", True], ["IMDB-BINARY", False],
                ["IMDB-MULTI", False], ["NCI1", True], ["PROTEINS", True],
                ["REDDIT-BINARY", False]]

    # Number of repetitions of 10-CV.
    num_reps = 10

    results = []
    for dataset, use_labels in datasets:
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} WL1 {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

        # WLOA kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wloa_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} WLOA {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_dense(dataset, use_labels, False)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} GR {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_dense(dataset, use_labels)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} SP {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

    # Number of repetitions of 10-CV.
    num_reps = 3

    ### Larger datasets using LIBLINEAR with edge labels.
    datasets = [["MOLT-4", True, True], ["Yeast", True, True],
                ["MCF-7", True, True], ["github_stargazers", False, False],
                ["reddit_threads", False, False]]

    for dataset, use_labels, use_edge_labels in datasets:
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_sparse(dataset, i, use_labels,
                                        use_edge_labels)
            gm_n = aux.normalize_feature_vector(gm)
            all_matrices.append(gm_n)

        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} WL1SP {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_sparse(dataset, use_labels, use_edge_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)

        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} GRSP {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_sparse(dataset, use_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)

        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        res = f"{dataset} SPSP {acc} {s_1} {s_2}"
        print(res)
        results.append(res)

    for r in results:
        print(r)
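
For readers without the repository, kernel_svm_evaluation plausibly follows the standard protocol of a C-SVM on a precomputed Gram matrix under repeated stratified 10-fold cross-validation; helpers of this kind typically also select C and the best of the supplied matrices on an inner validation split, which is omitted here. A minimal sketch of one CV pass:

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

def kernel_svm_cv_sketch(K, y, n_splits=10, C=1.0, seed=42):
    # One pass of stratified k-fold CV with a precomputed Gram matrix K.
    y = np.asarray(y)
    accs = []
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    for train, test in skf.split(np.zeros((len(y), 1)), y):
        clf = SVC(kernel="precomputed", C=C)
        # Fit on the train-by-train block, score on the test-by-train block.
        clf.fit(K[np.ix_(train, train)], y[train])
        accs.append(clf.score(K[np.ix_(test, train)], y[test]))
    return np.mean(accs), np.std(accs)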