def process(self):
    data_list = []

    dp.get_dataset("ZINC_val", regression=True)
    node_labels = pre.get_all_node_labels("ZINC_full", True, True)
    targets = pre.read_targets("ZINC_val", list(range(0, 24445)))

    # The validation split covers graphs 225011..249455 of ZINC_full.
    node_labels_1 = node_labels[225011:249456]
    matrices = pre.get_all_matrices_wl("ZINC_val", list(range(0, 24445)))
    targets_1 = targets

    for i, m in enumerate(matrices):
        edge_index_1 = torch.tensor(m[0]).t().contiguous()
        edge_index_2 = torch.tensor(m[1]).t().contiguous()

        data = Data()
        data.edge_index_1 = edge_index_1
        data.edge_index_2 = edge_index_2
        data.x = torch.from_numpy(np.array(node_labels_1[i])).to(torch.float)
        data.y = torch.from_numpy(np.array([targets_1[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    dp.get_dataset("ZINC_test", regression=True)  # TODO Change this
    node_labels = pre.get_all_node_labels("ZINC_full", True, True)
    targets = pre.read_targets("ZINC_test", list(range(0, 5000)))

    # The test split covers graphs 220011..225010 of ZINC_full.
    node_labels_1 = node_labels[220011:225011]
    matrices = pre.get_all_matrices_wl("ZINC_test", list(range(0, 5000)))
    targets_1 = targets

    for i, m in enumerate(matrices):
        edge_index_1 = torch.tensor(m[0]).t().contiguous()
        edge_index_2 = torch.tensor(m[1]).t().contiguous()

        data = Data()
        data.edge_index_1 = edge_index_1
        data.edge_index_2 = edge_index_2
        # one_hot = np.eye(492)[node_labels[i]]
        data.x = torch.from_numpy(np.array(node_labels_1[i])).to(torch.float)
        data.y = torch.from_numpy(np.array([targets_1[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    # Read the train/val/test splits (one line of comma-separated indices each).
    with open("test_al_50.index", "r") as infile:
        indices_test = [int(i) for i in infile.read().split(",")]
    with open("val_al_50.index", "r") as infile:
        indices_val = [int(i) for i in infile.read().split(",")]
    with open("train_al_50.index", "r") as infile:
        indices_train = [int(i) for i in infile.read().split(",")]

    targets = dp.get_dataset("alchemy_full", multigregression=True)
    tmp_1 = targets[indices_train].tolist()
    tmp_2 = targets[indices_val].tolist()
    tmp_3 = targets[indices_test].tolist()
    targets = tmp_1 + tmp_2 + tmp_3

    node_labels = pre.get_all_node_labels_alchem_1(True, True, indices_train,
                                                   indices_val, indices_test)
    edge_labels = pre.get_all_edge_labels_alchem_1(True, True, indices_train,
                                                   indices_val, indices_test)

    matrices = pre.get_all_matrices_1("alchemy_full", indices_train)
    matrices.extend(pre.get_all_matrices_1("alchemy_full", indices_val))
    matrices.extend(pre.get_all_matrices_1("alchemy_full", indices_test))

    for i, m in enumerate(matrices):
        data = Data()
        data.edge_index = torch.tensor(m).t().contiguous()

        # One-hot encode node labels (6 classes) and edge labels (4 classes).
        one_hot = np.eye(6)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)
        one_hot = np.eye(4)[edge_labels[i]]
        data.edge_attr = torch.from_numpy(one_hot).to(torch.float)

        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    targets = dp.get_dataset("alchemy_full", multigregression=True).tolist()
    node_labels = pre.get_all_node_labels("alchemy_full", True, True)
    matrices = pre.get_all_matrices("alchemy_full", list(range(202579)))

    for i, m in enumerate(matrices):
        edge_index_1 = torch.tensor(m[0]).t().contiguous()
        edge_index_2 = torch.tensor(m[1]).t().contiguous()

        data = Data()
        data.edge_index_1 = edge_index_1
        data.edge_index_2 = edge_index_2

        # One-hot encode node labels (83 classes).
        one_hot = np.eye(83)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)
        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    targets = dp.get_dataset("ZINC_full", regression=True).tolist()
    node_labels = pre.get_all_node_labels_1("ZINC_full", True)
    edge_labels = pre.get_all_edge_labels_1("ZINC_full")
    matrices = pre.get_all_matrices_1("ZINC_full", list(range(0, 249456)))

    for i, m in enumerate(matrices):
        data = Data()
        data.edge_index = torch.tensor(m).t().contiguous()

        # One-hot encode node labels (28 classes) and edge labels (3 classes).
        one_hot = np.eye(28)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)
        one_hot = np.eye(3)[edge_labels[i]]
        data.edge_attr = torch.from_numpy(one_hot).to(torch.float)

        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    # Read the train/val/test splits (one line of comma-separated indices each).
    with open("train.index.txt", "r") as infile:
        indices_train = [int(i) for i in infile.read().split(",")]
    with open("val.index.txt", "r") as infile:
        indices_val = [int(i) for i in infile.read().split(",")]
    with open("test.index.txt", "r") as infile:
        indices_test = [int(i) for i in infile.read().split(",")]

    dp.get_dataset("ZINC_train", regression=True)
    dp.get_dataset("ZINC_test", regression=True)
    dp.get_dataset("ZINC_val", regression=True)

    node_labels = pre.get_all_node_labels_ZINC(True, True, indices_train,
                                               indices_val, indices_test)

    targets = pre.read_targets("ZINC_train", indices_train)
    targets.extend(pre.read_targets("ZINC_val", indices_val))
    targets.extend(pre.read_targets("ZINC_test", indices_test))

    matrices = pre.get_all_matrices_dwl("ZINC_train", indices_train)
    matrices.extend(pre.get_all_matrices_dwl("ZINC_val", indices_val))
    matrices.extend(pre.get_all_matrices_dwl("ZINC_test", indices_test))

    for i, m in enumerate(matrices):
        # Edge indices of the two tuple graphs (local and global variants).
        edge_index_1_l = torch.tensor(m[0]).t().contiguous()
        edge_index_1_g = torch.tensor(m[1]).t().contiguous()
        edge_index_2_l = torch.tensor(m[2]).t().contiguous()
        edge_index_2_g = torch.tensor(m[3]).t().contiguous()

        data = Data()
        data.edge_index_1_l = edge_index_1_l
        data.edge_index_1_g = edge_index_1_g
        data.edge_index_2_l = edge_index_2_l
        data.edge_index_2_g = edge_index_2_g

        # One-hot encode node labels (445 classes).
        one_hot = np.eye(445)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)
        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    # Read the train/val splits (one line of comma-separated indices each);
    # the test split is the first 5000 graphs.
    with open("train_50.index.txt", "r") as infile:
        indices_train = [int(i) for i in infile.read().split(",")]
    with open("val_50.index.txt", "r") as infile:
        indices_val = [int(i) for i in infile.read().split(",")]
    indices_test = list(range(0, 5000))

    dp.get_dataset("ZINC_train")
    dp.get_dataset("ZINC_test")
    dp.get_dataset("ZINC_val")

    node_labels = pre.get_all_node_labels_ZINC_connected(
        True, True, indices_train, indices_val, indices_test)

    targets = pre.read_targets("ZINC_train", indices_train)
    targets.extend(pre.read_targets("ZINC_val", indices_val))
    targets.extend(pre.read_targets("ZINC_test", indices_test))

    matrices = pre.get_all_matrices_connected("ZINC_train", indices_train)
    matrices.extend(pre.get_all_matrices_connected("ZINC_val", indices_val))
    matrices.extend(pre.get_all_matrices_connected("ZINC_test", indices_test))

    for i, m in enumerate(matrices):
        edge_index_1 = torch.tensor(m[0]).t().contiguous().long()
        edge_index_2 = torch.tensor(m[1]).t().contiguous().long()

        data = Data()
        data.edge_index_1 = edge_index_1
        data.edge_index_2 = edge_index_2

        # One-hot encode node labels (445 classes).
        one_hot = np.eye(445)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)
        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def run(with_install=True):
    if with_install:
        install_dependencies()

    base_path = os.path.join("kernels", "node_labels")
    ds_name = "ENZYMES"

    classes = dp.get_dataset(ds_name)
    G = tud_to_networkx(ds_name)
    print(f"Number of graphs in data set is {len(G)}")
    print(f"Number of classes {len(set(classes.tolist()))}")

    labels = get_labels(G)
    graph_dict = get_graph_dict(G, classes)
    print_graph_information(graph_dict)
    visualize(graph_dict[6][7])

    data = load_data()
    eval_wl(data, classes)

    # Per-graph degree histograms. Build in LIL format, which supports
    # efficient incremental assignment, then convert to CSR for fast products.
    max_nodes = max(map(lambda x: x.number_of_nodes(), G))
    histograms = lil_matrix((len(G), max_nodes))
    for i, g in enumerate(G):
        for n, d in g.degree():
            histograms[i, n] = d
    histograms = histograms.tocsr()
    histogram_gram = histograms @ histograms.T

    # Per-graph degree-centrality vectors.
    centrality = lil_matrix((len(G), max_nodes))
    for i, g in enumerate(G):
        for n, d in nx.degree_centrality(g).items():
            centrality[i, n] = d
    centrality = centrality.tocsr()
    centrality_gram = centrality @ centrality.T

    # Combine the 2-iteration WL feature vectors with the degree histograms.
    val = data["vectors"]["wl"][2].T.dot(histograms)
    print(val.shape)
    normalized = [aux.normalize_feature_vector(val)]
    print(normalized[0].shape)
    print(
        ke.linear_svm_evaluation(normalized,
                                 classes,
                                 num_repetitions=10,
                                 all_std=True))
def eval_all(data):
    """Evaluates all kernels on the given data.

    Args:
        data (dict): Maps a data type ('vectors' for feature vectors; any
            other key is treated as Gram matrices) to a dict mapping kernel
            names to their precomputed data.

    Returns:
        dict: Nested dict with the evaluation result per data type and kernel.
    """
    classes = dp.get_dataset('ENZYMES')
    result = {}
    for data_type in data.keys():
        # Feature vectors are evaluated with a linear SVM, Gram matrices
        # with a kernelized SVM.
        mode = 'LINEAR' if data_type == 'vectors' else 'KERNEL'
        result[data_type] = {}
        print('MODE:', mode)
        for kernel in data[data_type]:
            print(f'\nEvaluating {kernel} SVM...')
            result[data_type][kernel] = eval_kernel(data[data_type][kernel],
                                                    classes, mode)
            print(f'{data_type}-{kernel} : {result[data_type][kernel]}')
    return result
def process(self):
    data_list = []

    targets = dp.get_dataset("QM9", multigregression=True).tolist()
    attributes = pre.get_all_attributes("QM9")
    node_labels = pre.get_all_node_labels("QM9", False, False)
    matrices = pre.get_all_matrices("QM9", list(range(129433)))

    for i, m in enumerate(matrices):
        edge_index_1 = torch.tensor(m[0]).t().contiguous()
        edge_index_2 = torch.tensor(m[1]).t().contiguous()

        data = Data()
        data.edge_index_1 = edge_index_1
        data.edge_index_2 = edge_index_2

        one_hot = np.eye(3)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)

        # Continuous information: the first 13 columns are node features,
        # the remaining columns are 3D coordinates.
        data.first = torch.from_numpy(
            np.array(attributes[i][0])[:, 0:13]).to(torch.float)
        data.first_coord = torch.from_numpy(
            np.array(attributes[i][0])[:, 13:]).to(torch.float)
        data.second = torch.from_numpy(
            np.array(attributes[i][1])[:, 0:13]).to(torch.float)
        data.second_coord = torch.from_numpy(
            np.array(attributes[i][1])[:, 13:]).to(torch.float)
        data.dist = torch.norm(data.first_coord - data.second_coord,
                               p=2, dim=-1).view(-1, 1)
        data.edge_attr = torch.from_numpy(
            np.array(attributes[i][2])).to(torch.float)

        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def process(self):
    data_list = []

    indices_train = list(range(0, 220011))
    indices_val = list(range(0, 24445))
    indices_test = list(range(0, 5000))

    dp.get_dataset("ZINC_train", regression=True)
    dp.get_dataset("ZINC_test", regression=True)
    dp.get_dataset("ZINC_val", regression=True)

    node_labels = pre.get_all_node_labels_ZINC(True, True, indices_train,
                                               indices_val, indices_test)

    targets = pre.read_targets("ZINC_train", indices_train)
    targets.extend(pre.read_targets("ZINC_val", indices_val))
    targets.extend(pre.read_targets("ZINC_test", indices_test))

    # Restrict to the first 50000 training graphs.
    node_labels = node_labels[0:50000]
    matrices = pre.get_all_matrices_dwl("ZINC_train", list(range(0, 50000)))
    targets = targets[0:50000]

    for i, m in enumerate(matrices):
        # Edge indices of the two tuple graphs (local and global variants).
        edge_index_1_l = torch.tensor(m[0]).t().contiguous()
        edge_index_1_g = torch.tensor(m[1]).t().contiguous()
        edge_index_2_l = torch.tensor(m[2]).t().contiguous()
        edge_index_2_g = torch.tensor(m[3]).t().contiguous()

        data = Data()
        data.edge_index_1_l = edge_index_1_l
        data.edge_index_1_g = edge_index_1_g
        data.edge_index_2_l = edge_index_2_l
        data.edge_index_2_g = edge_index_2_g

        data.x = torch.from_numpy(np.array(node_labels[i])).to(torch.float)
        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import linear_svm_evaluation

# Download dataset.
dataset = "MOLT-4"
classes = dp.get_dataset(dataset)
use_labels, use_edge_labels = True, True

all_matrices = []
# Compute 1-WL kernel for 1 to 5 iterations.
for i in range(1, 6):
    # Use node labels and edge labels.
    gm = kb.compute_wl_1_sparse(dataset, i, use_labels, use_edge_labels)

    # Apply \ell_2 normalization.
    gm_n = aux.normalize_feature_vector(gm)
    all_matrices.append(gm_n)

# Perform 10 repetitions of 10-CV using LIBLINEAR.
print(
    linear_svm_evaluation(all_matrices,
                          classes,
                          num_repetitions=10,
                          all_std=True))
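# For reference, a minimal sketch of what the \ell_2 normalization above
# plausibly does. This is an assumption for illustration, not the actual
# aux.normalize_feature_vector implementation: each graph's sparse feature
# vector (one row of the matrix) is scaled to unit Euclidean length.
import numpy as np
from scipy.sparse import csr_matrix, diags


def l2_normalize_rows(features: csr_matrix) -> csr_matrix:
    # Row-wise \ell_2 norms; guard empty rows against division by zero.
    norms = np.asarray(
        np.sqrt(features.multiply(features).sum(axis=1))).ravel()
    norms[norms == 0.0] = 1.0
    # Scale row i by 1 / ||row_i||_2.
    return diags(1.0 / norms) @ features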
import auxiliarymethods.datasets as dp
from auxiliarymethods.gnn_evaluation import gnn_evaluation
# Assumed module path for the TUDataset-to-networkx converter.
from auxiliarymethods.reader import tud_to_networkx

dataset = "PROTEINS"

# Download dataset.
dp.get_dataset(dataset)

# Output dataset as a list of graphs.
graph_db = tud_to_networkx(dataset)
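# A small usage sketch (not from the original script): iterate over the
# converted networkx graphs and print basic statistics of the dataset.
num_nodes = [g.number_of_nodes() for g in graph_db]
num_edges = [g.number_of_edges() for g in graph_db]
print(f"Graphs: {len(graph_db)}")
print(f"Avg. nodes: {sum(num_nodes) / len(num_nodes):.2f}")
print(f"Avg. edges: {sum(num_edges) / len(num_edges):.2f}")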
def main():
    num_reps = 10

    # Smaller datasets.
    datasets = [["IMDB-BINARY", False], ["IMDB-MULTI", False], ["NCI1", True],
                ["NCI109", True], ["PROTEINS", True], ["PTC_FM", True],
                ["REDDIT-BINARY", False], ["ENZYMES", True]]

    results = []
    for d, use_labels in datasets:
        dp.get_dataset(d)

        acc, s_1, s_2 = gnn_evaluation(GIN0, d, [1, 2, 3, 4, 5], [32, 64, 128],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GIN0 " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GIN0 " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        acc, s_1, s_2 = gnn_evaluation(GIN, d, [1, 2, 3, 4, 5], [32, 64, 128],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GIN " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GIN " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

    num_reps = 3

    # Larger datasets with edge labels.
    datasets = [["YeastH", True], ["UACC257", True], ["UACC257H", True],
                ["OVCAR-8", True], ["OVCAR-8H", True]]
    for d, use_labels in datasets:
        dp.get_dataset(d)

        acc, s_1, s_2 = gnn_evaluation(GINE, d, [2], [64],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GINE " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GINE " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        acc, s_1, s_2 = gnn_evaluation(GINE0, d, [2], [64],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GINE0 " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GINE0 " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))
import auxiliarymethods.auxiliary_methods as aux
import auxiliarymethods.datasets as dp
import kernel_baselines as kb
from auxiliarymethods.kernel_evaluation import kernel_svm_evaluation

# Download dataset.
classes = dp.get_dataset("ENZYMES")
use_labels, use_edge_labels = True, False

all_matrices = []
# Compute 1-WL kernel for 1 to 5 iterations.
for i in range(1, 6):
    # Use node labels and no edge labels.
    gm = kb.compute_wl_1_dense("ENZYMES", i, use_labels, use_edge_labels)

    # Apply cosine normalization.
    gm = aux.normalize_gram_matrix(gm)
    all_matrices.append(gm)

# Perform 10 repetitions of 10-CV using LIBSVM.
print(
    kernel_svm_evaluation(all_matrices,
                          classes,
                          num_repetitions=10,
                          all_std=True))
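# A minimal sketch of the cosine normalization above; an assumption about
# what aux.normalize_gram_matrix does, not its actual implementation.
# Entry (i, j) is divided by sqrt(K[i, i] * K[j, j]), so every
# self-similarity on the diagonal becomes 1.
import numpy as np


def cosine_normalize(gram: np.ndarray) -> np.ndarray:
    diag = np.sqrt(np.diag(gram))
    diag[diag == 0.0] = 1.0  # Guard against zero self-similarity.
    return gram / np.outer(diag, diag)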
def main():
    ### Smaller datasets using LIBSVM.
    datasets = [["ENZYMES", True], ["IMDB-BINARY", False],
                ["IMDB-MULTI", False], ["NCI1", True], ["PROTEINS", True],
                ["REDDIT-BINARY", False]]

    # Number of repetitions of 10-CV.
    num_reps = 10

    results = []
    for dataset, use_labels in datasets:
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WL1 " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WL1 " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # WLOA kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wloa_dense(dataset, i, use_labels, False)
            gm_n = aux.normalize_gram_matrix(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WLOA " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WLOA " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_dense(dataset, use_labels, False)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "GR " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "GR " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_dense(dataset, use_labels)
        gm_n = aux.normalize_gram_matrix(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = kernel_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "SP " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "SP " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    # Number of repetitions of 10-CV.
    num_reps = 3

    ### Larger datasets using LIBLINEAR with edge labels.
    datasets = [["MOLT-4", True, True], ["Yeast", True, True],
                ["MCF-7", True, True], ["github_stargazers", False, False],
                ["reddit_threads", False, False]]
    for dataset, use_labels, use_edge_labels in datasets:
        classes = dp.get_dataset(dataset)

        # 1-WL kernel, number of iterations in [1:6].
        all_matrices = []
        for i in range(1, 6):
            gm = kb.compute_wl_1_sparse(dataset, i, use_labels,
                                        use_edge_labels)
            gm_n = aux.normalize_feature_vector(gm)
            all_matrices.append(gm_n)
        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "WL1SP " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "WL1SP " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Graphlet kernel.
        all_matrices = []
        gm = kb.compute_graphlet_sparse(dataset, use_labels, use_edge_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "GRSP " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "GRSP " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

        # Shortest-path kernel.
        all_matrices = []
        gm = kb.compute_shortestpath_sparse(dataset, use_labels)
        gm_n = aux.normalize_feature_vector(gm)
        all_matrices.append(gm_n)
        acc, s_1, s_2 = linear_svm_evaluation(all_matrices,
                                              classes,
                                              num_repetitions=num_reps,
                                              all_std=True)
        print(dataset + " " + "SPSP " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(dataset + " " + "SPSP " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    for r in results:
        print(r)
def process(self):
    data_list = []

    # Read the train/val/test splits (one line of comma-separated indices each).
    with open("test_al_10.index", "r") as infile:
        indices_test = [int(i) for i in infile.read().split(",")]
    with open("val_al_10.index", "r") as infile:
        indices_val = [int(i) for i in infile.read().split(",")]
    with open("train_al_10.index", "r") as infile:
        indices_train = [int(i) for i in infile.read().split(",")]

    targets = dp.get_dataset("alchemy_full", multigregression=True)
    tmp_1 = targets[indices_train].tolist()
    tmp_2 = targets[indices_val].tolist()
    tmp_3 = targets[indices_test].tolist()
    targets = tmp_1 + tmp_2 + tmp_3

    node_labels = pre.get_all_node_labels_allchem_3(
        True, True, indices_train, indices_val, indices_test)

    matrices = pre.get_all_matrices_3_connected("alchemy_full", indices_train)
    matrices.extend(
        pre.get_all_matrices_3_connected("alchemy_full", indices_val))
    matrices.extend(
        pre.get_all_matrices_3_connected("alchemy_full", indices_test))

    for i, m in enumerate(matrices):
        edge_index_1 = torch.tensor(m[0]).t().contiguous()
        edge_index_2 = torch.tensor(m[1]).t().contiguous()
        edge_index_3 = torch.tensor(m[2]).t().contiguous()

        data = Data()
        data.edge_index_1 = edge_index_1
        data.edge_index_2 = edge_index_2
        data.edge_index_3 = edge_index_3

        # One-hot encode node labels (163 classes).
        one_hot = np.eye(163)[node_labels[i]]
        data.x = torch.from_numpy(one_hot).to(torch.float)
        data.y = torch.from_numpy(np.array([targets[i]])).to(torch.float)
        data_list.append(data)

    data, slices = self.collate(data_list)
    torch.save((data, slices), self.processed_paths[0])
def main():
    num_reps = 10

    ### Smaller datasets.
    datasets = [["IMDB-BINARY", False], ["IMDB-MULTI", False], ["NCI1", True],
                ["PROTEINS", True], ["REDDIT-BINARY", False],
                ["ENZYMES", True]]

    results = []
    for d, use_labels in datasets:
        # Download dataset.
        dp.get_dataset(d)

        # GIN, dataset d, layers in [1:6], hidden dimension in {32,64,128}.
        acc, s_1, s_2 = gnn_evaluation(GIN, d, [1, 2, 3, 4, 5], [32, 64, 128],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GIN " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GIN " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # GIN with jumping knowledge, dataset d, layers in [1:6],
        # hidden dimension in {32,64,128}.
        acc, s_1, s_2 = gnn_evaluation(GINWithJK, d, [1, 2, 3, 4, 5],
                                       [32, 64, 128],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GINWithJK " + str(acc) + " " + str(s_1) + " " +
              str(s_2))
        results.append(d + " " + "GINWithJK " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    num_reps = 3

    ### Midscale datasets with edge labels.
    datasets = [["MOLT-4", True, True], ["Yeast", True, True],
                ["MCF-7", True, True]]
    for d, use_labels, _ in datasets:
        print(d)
        dp.get_dataset(d)

        # GINE (GIN with edge labels), dataset d, 3 layers, hidden dimension 64.
        acc, s_1, s_2 = gnn_evaluation(GINE, d, [3], [64],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GINE " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GINE " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # GINE with jumping knowledge, dataset d, 3 layers, hidden dimension 64.
        acc, s_1, s_2 = gnn_evaluation(GINEWithJK, d, [3], [64],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GINEJK " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GINEJK " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    ### Larger datasets without edge labels.
    datasets = [["reddit_threads", False, False],
                ["github_stargazers", False, False]]
    for d, use_labels, _ in datasets:
        print(d)
        dp.get_dataset(d)

        # GIN, dataset d, 3 layers, hidden dimension 64.
        acc, s_1, s_2 = gnn_evaluation(GIN, d, [3], [64],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GIN " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GIN " + str(acc) + " " + str(s_1) + " " +
                       str(s_2))

        # GIN with jumping knowledge, dataset d, 3 layers, hidden dimension 64.
        acc, s_1, s_2 = gnn_evaluation(GINWithJK, d, [3], [64],
                                       max_num_epochs=200,
                                       batch_size=64,
                                       start_lr=0.01,
                                       num_repetitions=num_reps,
                                       all_std=True)
        print(d + " " + "GINJK " + str(acc) + " " + str(s_1) + " " + str(s_2))
        results.append(d + " " + "GINJK " + str(acc) + " " + str(s_1) +
                       " " + str(s_2))

    for r in results:
        print(r)