Beispiel #1
0
def eval_kernel(kernel, classes, mode, n_reps=10, all_std=True):
    """Normalize every element of a kernel and run an SVM evaluation on it.

    Args:
        kernel (list): elements to normalize; feature vectors when ``mode``
            is ``'LINEAR'``, gram matrices otherwise
        classes (list): dataset classes, forwarded to the SVM evaluation
        mode (string): ``'LINEAR'`` selects the linear SVM path; any other
            value selects the kernel SVM path
        n_reps (int, optional): forwarded as ``num_repetitions``.
            Defaults to 10.
        all_std (bool, optional): forwarded as ``all_std``. Defaults to True.

    Returns:
        tuple: whatever the selected ``ke`` evaluation function returns
    """
    is_linear = mode == 'LINEAR'

    print(f'Starting normalization of {len(kernel)} elements...')
    normalized = [
        aux.normalize_feature_vector(element)
        if is_linear else aux.normalize_gram_matrix(element)
        for element in kernel
    ]
    print(f'Normalization finished, starting {mode} SVM...')

    # Both evaluation functions take the same arguments, so pick one and
    # call it in a single place.
    if is_linear:
        evaluate = ke.linear_svm_evaluation
    else:
        evaluate = ke.kernel_svm_evaluation
    return evaluate(normalized,
                    classes,
                    num_repetitions=n_reps,
                    all_std=all_std)
Beispiel #2
0
def eval_wl(data, classes):
    """Evaluate each WL gram matrix on its own and print the result.

    Every matrix found under ``data["gram_matrix"]["wl"]`` is normalized,
    wrapped in a one-element list and passed to the kernel SVM evaluation
    (10 repetitions, ``all_std=True``). Nothing is returned.

    Args:
        data (dict): nested mapping holding the matrices under
            ``["gram_matrix"]["wl"]``
        classes (list): classes, forwarded to the SVM evaluation
    """
    wl_matrices = data["gram_matrix"]["wl"]
    for gram_matrix in wl_matrices:
        # The evaluation expects a list of matrices; pass exactly one so
        # that each matrix gets its own result line.
        single_matrix = [aux.normalize_gram_matrix(gram_matrix)]
        result = ke.kernel_svm_evaluation(single_matrix,
                                          classes,
                                          num_repetitions=10,
                                          all_std=True)
        print(result)
Beispiel #3
0
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import kernel_svm_evaluation

# Fetch the ENZYMES dataset; the returned classes are the SVM targets.
classes = dp.get_dataset("ENZYMES")
# Use node labels and no edge labels.
use_labels = True
use_edge_labels = False

# One cosine-normalized 1-WL gram matrix per iteration count, 1 through 5.
all_matrices = [
    aux.normalize_gram_matrix(
        kb.compute_wl_1_dense("ENZYMES", num_iterations, use_labels,
                              use_edge_labels))
    for num_iterations in range(1, 6)
]

# Perform 10 repetitions of 10-CV using LIBSVM and show the result.
print(
    kernel_svm_evaluation(all_matrices,
                          classes,
                          num_repetitions=10,
                          all_std=True))

Beispiel #4
0
def _load_gram_matrices(path, dataset, algorithm):
    """Load and normalize the stored gram matrices of one dataset/algorithm.

    Looks for files named ``<path><dataset>__<algorithm>_<i>.gram`` for
    ``i`` in 0..9; indices whose file does not exist are skipped.

    Args:
        path (str): directory prefix, including the trailing separator
        dataset (str): dataset name used in the file name
        algorithm (str): algorithm name used in the file name

    Returns:
        list: normalized gram matrices in increasing order of ``i``;
            empty when no file was found
    """
    gram_matrices = []
    for i in range(0, 10):
        file_name = path + dataset + "__" + algorithm + "_" + str(i) + ".gram"
        if not pth.exists(file_name):
            continue
        # The second value returned by read_lib_svm is not needed here.
        gram_matrix, _ = read_lib_svm(file_name)
        gram_matrices.append(normalize_gram_matrix(gram_matrix))
    return gram_matrices


def _evaluate_gram_kernels(path, algorithms, datasets):
    """Print the kernel SVM results for every algorithm/dataset pair.

    Pairs for which no gram file exists are skipped without output.

    Args:
        path (str): directory prefix of the ``.gram`` files
        algorithms (list): algorithm names
        datasets (list): ``[name, use_labels]`` pairs; only the name is used
    """
    for a in algorithms:
        for d, _use_labels in datasets:
            gram_matrices = _load_gram_matrices(path, d, a)
            if not gram_matrices:
                continue
            # Read the classes once per dataset, and only when there is
            # something to evaluate.
            classes = read_classes(d)
            acc, acc_train, s_1 = kernel_svm_evaluation(gram_matrices,
                                                        classes,
                                                        num_repetitions=10)
            print(a, d, acc, acc_train, s_1)


def _load_sparse_feature_matrix(file_name):
    """Read one space-delimited triple file into a CSR matrix.

    Columns 0 and 1 are used as row and column indices after subtracting 1
    (presumably the file is 1-based -- confirm), and column 2 is stored
    incremented by 1. The matrix shape is derived from the largest index
    in each of the two index columns.

    Args:
        file_name (str): path of the file to read

    Returns:
        scipy.sparse.csr_matrix: the assembled feature matrix
    """
    # NOTE(review): header=1 makes pandas use the second line as the header
    # and drop the first one -- confirm this matches the file layout.
    triples = pd.read_csv(file_name, header=1, delimiter=" ").to_numpy()
    triples = triples.astype(int)

    rows = triples[:, 0] - 1
    cols = triples[:, 1] - 1
    values = triples[:, 2] + 1

    shape = (int(rows.max()) + 1, int(cols.max()) + 1)
    return sp.coo_matrix((values, (rows, cols)), shape=shape).tocsr()


def main(run_all=False):
    """Evaluate precomputed kernels and print one result line per run.

    By default only the ``LWLC2`` gram matrices are evaluated and the
    interpreter is then terminated via ``exit()``, exactly as before. The
    remaining experiments used to be unreachable behind that ``exit()``;
    pass ``run_all=True`` to run them as well.

    Args:
        run_all (bool, optional): also run the full gram-matrix suite and
            the sparse feature-vector experiments. Defaults to False.
    """
    path = "./GM/EXP/"
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["PROTEINS", True], ["PTC_FM", True],
               ["NCI1", True]]
    algorithms = ["LWLC2"]
    _evaluate_gram_kernels(path, algorithms, dataset)

    if not run_all:
        # Kept from the original: this ends the whole interpreter, not
        # just this function.
        exit()

    path = "./GM/EXP/"
    dataset = [["ENZYMES", True], ["IMDB-BINARY", False],
               ["IMDB-MULTI", False], ["NCI1", True], ["NCI109", True],
               ["PROTEINS", True], ["PTC_FM", True], ["REDDIT-BINARY", False]]
    algorithms = [
        "WL1", "GR", "SP", "WLOA", "LWL2", "LWLP2", "WL2", "DWL2", "LWL3",
        "LWLP3", "WL3", "DWL3"
    ]
    _evaluate_gram_kernels(path, algorithms, dataset)

    path = "./GM/EXPSPARSE/"
    for name in [
            "Yeast", "YeastH", "UACC257", "UACC257H", "OVCAR-8", "OVCAR-8H"
    ]:
        for algorithm in ["LWL2", "LWLP2", "WL"]:
            classes = read_classes(name)

            # Collect feature matrices over all iterations (currently only
            # iteration 2).
            all_feature_matrices = [
                _load_sparse_feature_matrix(path + name + "__" + algorithm +
                                            "_" + str(i))
                for i in range(2, 3)
            ]

            acc, s_1 = linear_svm_evaluation(all_feature_matrices,
                                             classes,
                                             num_repetitions=3,
                                             all_std=False)
            print(name, algorithm, acc, s_1)
Beispiel #5
0
def _evaluate_and_record(dataset, label, matrices, classes, evaluate,
                         num_reps, results):
    """Run one SVM evaluation, print its result line and store it.

    The line has the form ``"<dataset> <label> <acc> <s_1> <s_2>"``, where
    the last three fields are the values returned by ``evaluate``.

    Args:
        dataset (str): dataset name, first field of the result line
        label (str): short kernel tag, second field of the result line
        matrices (list): normalized gram matrices or feature vectors
        classes: classes, forwarded to ``evaluate``
        evaluate (callable): ``kernel_svm_evaluation`` or
            ``linear_svm_evaluation``; called with ``all_std=True`` and
            expected to return three values
        num_reps (int): forwarded as ``num_repetitions``
        results (list): the result line is appended to this list
    """
    acc, s_1, s_2 = evaluate(matrices,
                             classes,
                             num_repetitions=num_reps,
                             all_std=True)
    line = " ".join([dataset, label, str(acc), str(s_1), str(s_2)])
    print(line)
    results.append(line)


def main():
    """Evaluate baseline kernels on several datasets and print the results.

    Dense gram matrices are evaluated with ``kernel_svm_evaluation`` on the
    first group of datasets, sparse feature vectors with
    ``linear_svm_evaluation`` on the second. Each result line is printed as
    soon as it is available, and all lines are printed again at the end.
    """
    results = []

    ### Smaller datasets using LIBSVM.
    small_datasets = [["ENZYMES", True], ["IMDB-BINARY", False],
                      ["IMDB-MULTI", False], ["NCI1", True],
                      ["PROTEINS", True], ["REDDIT-BINARY", False]]

    # Number of repetitions of 10-CV.
    num_reps = 10

    for name, use_labels in small_datasets:
        classes = dp.get_dataset(name)

        # 1-WL kernel, 1 to 5 iterations.
        all_matrices = [
            aux.normalize_gram_matrix(
                kb.compute_wl_1_dense(name, i, use_labels, False))
            for i in range(1, 6)
        ]
        _evaluate_and_record(name, "WL1", all_matrices, classes,
                             kernel_svm_evaluation, num_reps, results)

        # WLOA kernel, 1 to 5 iterations.
        all_matrices = [
            aux.normalize_gram_matrix(
                kb.compute_wloa_dense(name, i, use_labels, False))
            for i in range(1, 6)
        ]
        _evaluate_and_record(name, "WLOA", all_matrices, classes,
                             kernel_svm_evaluation, num_reps, results)

        # Graphlet kernel.
        gm = kb.compute_graphlet_dense(name, use_labels, False)
        all_matrices = [aux.normalize_gram_matrix(gm)]
        _evaluate_and_record(name, "GR", all_matrices, classes,
                             kernel_svm_evaluation, num_reps, results)

        # Shortest-path kernel.
        gm = kb.compute_shortestpath_dense(name, use_labels)
        all_matrices = [aux.normalize_gram_matrix(gm)]
        _evaluate_and_record(name, "SP", all_matrices, classes,
                             kernel_svm_evaluation, num_reps, results)

    ### Larger datasets using LIBLINEAR; edge labels follow the third flag.
    large_datasets = [["MOLT-4", True, True], ["Yeast", True, True],
                      ["MCF-7", True, True],
                      ["github_stargazers", False, False],
                      ["reddit_threads", False, False]]

    # Number of repetitions of 10-CV.
    num_reps = 3

    for name, use_labels, use_edge_labels in large_datasets:
        classes = dp.get_dataset(name)

        # 1-WL kernel, 1 to 5 iterations.
        all_matrices = [
            aux.normalize_feature_vector(
                kb.compute_wl_1_sparse(name, i, use_labels, use_edge_labels))
            for i in range(1, 6)
        ]
        _evaluate_and_record(name, "WL1SP", all_matrices, classes,
                             linear_svm_evaluation, num_reps, results)

        # Graphlet kernel.
        gm = kb.compute_graphlet_sparse(name, use_labels, use_edge_labels)
        all_matrices = [aux.normalize_feature_vector(gm)]
        _evaluate_and_record(name, "GRSP", all_matrices, classes,
                             linear_svm_evaluation, num_reps, results)

        # Shortest-path kernel.
        gm = kb.compute_shortestpath_sparse(name, use_labels)
        all_matrices = [aux.normalize_feature_vector(gm)]
        _evaluate_and_record(name, "SPSP", all_matrices, classes,
                             linear_svm_evaluation, num_reps, results)

    # Repeat every result line as a summary once all runs have finished.
    for r in results:
        print(r)