Example #1
import os
import pickle

import numpy as np

# `cfg` is the project's local config module; its import path depends on
# the repository layout.


def load_gntk_matrices(source="TU_DO", dataset="MUTAG", min_scale_mat=False):
    config = cfg.Config()

    matrix_dir = f"{config.matrix_path}/GNTK_{source}/{dataset}"
    data_dir = f"{config.data_path}/{source}/{dataset}"
    kernel_matrices = {}

    matrix_list = [
        mat_name
        for mat_name in os.listdir(matrix_dir)
        if os.path.isdir(f"{matrix_dir}/{mat_name}")
    ]
    for mat_name in matrix_list:
        with open(f"{matrix_dir}/{mat_name}/gram.pkl", "rb") as f:
            mat = pickle.load(f)

        # If min_scale_mat is True, scale each matrix by its minimum,
        # as done by Du et al. Disabled by default because the minimum
        # is taken over the full matrix and can leak test data.
        if min_scale_mat:
            mat = mat / mat.min()

        kernel_matrices["_".join(mat_name.split("_")[2:])] = mat

    labels = np.loadtxt(f"{data_dir}/{dataset}_graph_labels.txt")

    return (kernel_matrices, labels)
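For context, a minimal usage sketch; it assumes the pickled Gram matrices for MUTAG already exist under the configured matrix path, laid out as the function expects:

kernel_matrices, labels = load_gntk_matrices(source="TU_DO", dataset="MUTAG")
for name, mat in kernel_matrices.items():
    # one square (n_graphs x n_graphs) Gram matrix per hyper-parameter setting
    print(name, mat.shape)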
Example #2
    def __init__(self):
        super(num_blocks_results_1, self).__init__()

        self.config = cfg.Config()
        
        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #3
    def __init__(self):
        super(bias_var_tradeoff, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #4
    def __init__(self):
        super(diagonal_dominance, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
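All of these constructors share the same shape: call the parent constructor, build a cfg.Config, then fill self.data and the plot dimensions. That pattern implies a common plotting base class roughly like the sketch below; the base class itself is not shown in these examples, so the name BasePlot and both hook methods are assumptions:

class BasePlot:
    """Hypothetical parent of the plot classes above."""

    def _get_data(self):
        # Each subclass fills self.data from its experiment's output files.
        raise NotImplementedError

    def _get_plot_dims(self):
        # Each subclass derives its figure dimensions from self.data.
        raise NotImplementedError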
Example #5
    def __init__(self):
        super(data_stats, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #6
    def __init__(self):
        super(profiling_results_1, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #7
    def __init__(self):
        super(jk_results_2, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #8
    def __init__(self):
        super(exp_a_evaluation_2, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #9
    def __init__(self):
        super(kernal_normalization_results_2, self).__init__()

        self.config = cfg.Config()
        
        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #10
    def __init__(self):
        super(time_profiling, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #11
    def __init__(self):
        super(activation_functions, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #12
    def __init__(self):
        super(gntk_expressivity_mds_2, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #13
    def __init__(self):
        super(time_complexity, self).__init__()

        self.config = cfg.Config()

        self.data = {}
        self._get_data()
        self._get_plot_dims()
Example #14
import networkx as nx
from tqdm import tqdm

# `cfg` (config module) and `logger` are project-local.


def load_graphs_duetal(dataset):
    logger.info(f"loading dataset {dataset} from Du et al. data.")

    if dataset in ["IMDBBINARY", "COLLAB", "IMDBMULTI"]:
        degree_as_label = True
    elif dataset in ["MUTAG", "PROTEINS", "PTC", "NCI1"]:
        degree_as_label = False
    else:
        raise ValueError(f"unknown dataset: {dataset}")

    config = cfg.Config()
    data_dir = f"{config.data_path_duetal}/{dataset}"

    g_list = []
    g_labels = []
    label_dict = {}
    feat_dict = {}

    with open(f"{data_dir}/{dataset}.txt", "r") as f:
        n_g = int(f.readline().strip())
        for i in tqdm(range(n_g)):
            row = f.readline().strip().split()
            n, l = [int(w) for w in row]
            if l not in label_dict:
                mapped = len(label_dict)
                label_dict[l] = mapped
            g = nx.Graph()
            n_edges = 0
            for j in range(n):
                row = f.readline().strip().split()
                tmp = int(row[1]) + 2
                if tmp == len(row):
                    # no node attributes
                    row = [int(w) for w in row]
                    # attr = None
                else:
                    row = [int(w) for w in row[:tmp]]
                    # attr = np.array([float(w) for w in row[tmp:]])
                if row[0] not in feat_dict:
                    mapped = len(feat_dict)
                    feat_dict[row[0]] = mapped
                g.add_node(j, lab=feat_dict[row[0]])

                n_edges += row[1]
                for k in range(2, len(row)):
                    g.add_edge(j, row[k])

            if degree_as_label:
                nx.set_node_attributes(g, dict(g.degree()), "lab")

            assert len(g) == n

            g_list.append(g)
            g_labels.append(label_dict[l])

    logger.info(f"# classes -- {len(label_dict)}")
    logger.info(f"# data -- {len(g_list)}")

    return g_list, g_labels
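A short usage sketch, assuming the Du et al. data files are available under config.data_path_duetal:

graphs, labels = load_graphs_duetal("MUTAG")
print(len(graphs), "graphs,", len(set(labels)), "classes")
print(graphs[0].nodes(data=True))  # node labels are stored under the "lab" key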
Example #15
import argparse
from timeit import default_timer as timer

from tqdm import tqdm

from src.data import graph_utils

# Project-local modules also used below: `cfg` (config), `utils`, and
# `data_loaders`.

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-n",
        "--n_samples",
        default=50,
        type=int,
        help="Number of samples to calculate the gram matrices for.",
    )

    args = parser.parse_args()
    config = cfg.Config()
    exp_config = cfg.TimingExpConfig()

    out_dir = f"{config.exp_path}/timing"
    utils.make_dirs_checked(out_dir)

    datasets = ["IMDBBINARY", "IMDBMULTI", "MUTAG", "NCI1", "PROTEINS", "PTC"]
    kernels = ["GNTK", "VH", "EH", "SP", "WL"]

    gram_time = {dataset: {} for dataset in datasets}
    for dataset in tqdm(datasets, desc="Datasets"):

        # load data
        graphs, labels = data_loaders.load_graphs_tudortmund(dataset)

        n_graphs = len(graphs)
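The snippet stops before the actual timing. A hedged sketch of how the loop body might continue is below; compute_gram is a placeholder name for illustration, not the repository's real API:

        for kernel in kernels:
            start = timer()
            # placeholder standing in for the kernel's Gram-matrix routine
            gram = compute_gram(kernel, graphs[: args.n_samples])
            gram_time[dataset][kernel] = timer() - start  # seconds per kernel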
Example #16
import os

import networkx as nx
import numpy as np
import torch
from tqdm import tqdm

# Project-local: `cfg` (config), `logger`, and the `S2VGraph` container.


def load_data_tudo(dataset):
    """
    dataset: name of the TU Dortmund dataset to load

    Returns (g_list, n_classes, g_labels, inverse_label_dict).
    """
    config = cfg.Config()

    data_dir = config.data_path_tudo

    if dataset in ["IMDBBINARY", "COLLAB", "IMDBMULTI"]:
        degree_as_tag = True
    elif dataset in ["MUTAG", "PROTEINS", "PTC", "NCI1"]:
        degree_as_tag = False
    else:
        raise ValueError(f"unknown dataset: {dataset}")

    logger.info("Loading data")
    g_list = []
    label_dict = {}
    feat_dict = {}

    files = [
        file.replace("{}_".format(dataset), "").replace(".txt", "")
        for file in os.listdir(os.path.join(data_dir, dataset))
        if file.split("_")[0] == dataset
    ]
    g_indicator = np.loadtxt(
        os.path.join(data_dir, dataset,
                     "{}_graph_indicator.txt".format(dataset)),
        delimiter=",",
    )
    g_labels = np.loadtxt(
        os.path.join(data_dir, dataset, "{}_graph_labels.txt".format(dataset)),
        delimiter=",",
    ).tolist()

    # create helpers
    n_g = np.max(g_indicator).astype(int)
    n_nodes = g_indicator.shape[0]
    n2g_dict = {
        i: int(g_ind) - 1
        for i, g_ind in enumerate(g_indicator.tolist())
    }

    edge_labels_bool = "edge_labels" in files
    node_labels_bool = "node_labels" in files
    if node_labels_bool:
        node_labels = open(
            os.path.join(data_dir, dataset,
                         "{}_node_labels.txt".format(dataset)), "r")
    if edge_labels_bool:
        edge_labels = open(
            os.path.join(data_dir, dataset,
                         "{}_edge_labels.txt".format(dataset)), "r")
    A = open(os.path.join(data_dir, dataset, "{}_A.txt".format(dataset)), "r")

    node_idx = 0
    for g_idx in tqdm(range(n_g)):
        if g_labels[g_idx] not in label_dict:
            mapped = len(label_dict)
            label_dict[g_labels[g_idx]] = mapped

        g = nx.Graph()
        g_node_idx = 0
        node_dict = {}
        node_tags = []
        while n2g_dict[node_idx] == g_idx:
            node_dict[node_idx] = g_node_idx
            if node_labels_bool:
                l = int(node_labels.readline().strip())
                if l not in feat_dict:
                    mapped = len(feat_dict)
                    feat_dict[l] = mapped
                g.add_node(g_node_idx)
                node_tags.append(feat_dict[l])
            node_idx += 1
            g_node_idx += 1
            if node_idx == n_nodes:
                break

        edge = A.readline().strip().replace(" ", "").split(",")
        # Check for EOF first: `edge[0]` is "" once the file is exhausted,
        # so it must be tested before int() is applied to it.
        while edge[0] != "" and n2g_dict[int(edge[0]) - 1] == g_idx:
            v1 = int(edge[0]) - 1
            v2 = int(edge[1]) - 1
            g.add_edge(node_dict[v1], node_dict[v2])
            edge = A.readline().strip().replace(" ", "").split(",")

        g_list.append(S2VGraph(g, label_dict[g_labels[g_idx]], node_tags))

    # Invert the label mapping once, after all graphs have been read.
    inverse_label_dict = {v: k for k, v in label_dict.items()}

    # add labels and edge_mat
    for g in g_list:
        g.neighbors = [[] for _ in range(len(g.g))]
        for i, j in g.g.edges():
            g.neighbors[i].append(j)
            g.neighbors[j].append(i)
        degree_list = [len(neigh) for neigh in g.neighbors]
        g.max_neighbor = max(degree_list)

        # g.label = label_dict[g.label]

        edges = [list(pair) for pair in g.g.edges()]
        edges.extend([[i, j] for j, i in edges])
        g.edge_mat = torch.LongTensor(edges).transpose(0, 1)

    if degree_as_tag:
        for g in g_list:
            g.node_tags = list(dict(g.g.degree).values())

    # Extracting unique tag labels
    tagset = set()
    for g in g_list:
        tagset = tagset.union(set(g.node_tags))

    tagset = list(tagset)
    tag2index = {tagset[i]: i for i in range(len(tagset))}

    for g in g_list:
        g.node_features = torch.zeros(len(g.node_tags), len(tagset))
        g.node_features[range(len(g.node_tags)),
                        [tag2index[tag] for tag in g.node_tags]] = 1

    logger.info("# classes: %d" % len(label_dict))
    logger.info("# maximum node tag: %d" % len(tagset))
    logger.info("# data: %d" % len(g_list))

    return g_list, len(label_dict), g_labels, inverse_label_dict
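A usage sketch, assuming the TU Dortmund files for MUTAG are present:

graphs, num_classes, g_labels, inv_label_dict = load_data_tudo("MUTAG")
print(num_classes, "classes,", len(graphs), "graphs")
print(graphs[0].node_features.shape)  # (n_nodes, n_unique_tags) one-hot features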
Example #17
import os

import networkx as nx
import numpy as np
from tqdm import tqdm

# Project-local: `cfg` (config) and `logger`.


def load_graphs_tudortmund(dataset):
    config = cfg.Config()
    data_dir = f"{config.data_path_tudo}/{dataset}"
    logger.info(f"loading dataset {dataset} from TU Dortmund data.")
    files = [
        file.replace(f"{dataset}_", "").replace(".txt", "")
        for file in os.listdir(data_dir)
        if file.split("_")[0] == dataset
    ]
    g_indicator = np.loadtxt(f"{data_dir}/{dataset}_graph_indicator.txt", delimiter=",")
    g_labels = np.loadtxt(
        f"{data_dir}/{dataset}_graph_labels.txt", delimiter=","
    ).tolist()

    # create helpers
    N = np.max(g_indicator).astype(int)
    n_nodes = g_indicator.shape[0]
    n2g_dict = {i: int(g_ind) - 1 for i, g_ind in enumerate(g_indicator.tolist())}

    edge_labels_bool = "edge_labels" in files
    node_labels_bool = "node_labels" in files
    if node_labels_bool:
        node_labels = open(f"{data_dir}/{dataset}_node_labels.txt", "r")
    if edge_labels_bool:
        edge_labels = open(f"{data_dir}/{dataset}_edge_labels.txt", "r")
    A = open(f"{data_dir}/{dataset}_A.txt", "r")

    node_idx = 0
    g_list = []
    for g_idx in tqdm(range(N)):
        g = nx.Graph()
        while n2g_dict[node_idx] == g_idx:
            if node_labels_bool:
                g.add_node(node_idx, lab=int(node_labels.readline().strip()))
            else:
                g.add_node(node_idx)
            node_idx += 1
            if node_idx == n_nodes:
                break

        edge = A.readline().strip().replace(" ", "").split(",")
        # Check for EOF first: `edge[0]` is "" once the file is exhausted,
        # so it must be tested before int() is applied to it.
        while edge[0] != "" and n2g_dict[int(edge[0]) - 1] == g_idx:
            if edge_labels_bool:
                g.add_edge(
                    int(edge[0]) - 1,
                    int(edge[1]) - 1,
                    lab=int(edge_labels.readline().strip()),
                )
            else:
                g.add_edge(int(edge[0]) - 1, int(edge[1]) - 1)
            edge = A.readline().strip().replace(" ", "").split(",")

        if not node_labels_bool:
            nx.set_node_attributes(g, dict(g.degree()), "lab")

        g_list.append(g)

    logger.info(f"# graphs -- {len(g_list)}")

    return g_list, g_labels
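A usage sketch mirroring the loader above (data files assumed present):

graphs, labels = load_graphs_tudortmund("MUTAG")
print(len(graphs), "graphs loaded")
# every node carries a "lab" attribute: the node label if the dataset
# provides one, otherwise the node degree
print(graphs[0].nodes(data=True))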
Example #18
import argparse

import numpy as np
import torch
import torch.optim as optim

# Project-local: `cfg`, `utils`, `GraphCNN`, `load_data_tudo`,
# `separate_data`, `train`, and `test`.


def main():
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        "PyTorch graph convolutional neural net for whole-graph classification"
    )
    parser.add_argument("--dataset",
                        type=str,
                        default="MUTAG",
                        help="name of dataset (default: MUTAG)")
    parser.add_argument(
        "--rep_idx",
        type=int,
        default=0,
        help="the index of the cv iteration. Should be less then 10.",
    )
    parser.add_argument(
        "--fold_idx",
        type=int,
        default=0,
        help="the index of fold in 10-fold validation. Should be less then 10.",
    )
    parser.add_argument(
        "--learn_eps",
        action="store_true",
        help="whether to learn the epsilon weighting for the center nodes "
        "(does not affect training accuracy, though)",
    )
    # `--device` is referenced as args.device below, so the parser must
    # define it; the default of 0 follows the original GIN code.
    parser.add_argument(
        "--device",
        type=int,
        default=0,
        help="which gpu to use if any (default: 0)",
    )
    args = parser.parse_args()

    config = cfg.Config()
    gin_config = cfg.GINConfig(args.dataset)

    seed = 42 + args.rep_idx  # controls the CV split; torch/numpy seeds are fixed below

    architecture = f"L{gin_config.num_layers}_R{gin_config.num_mlp_layers}_scale{gin_config.neighbor_pooling_type}"
    fold_name = f"rep{args.rep_idx}_fold{args.fold_idx}"

    out_dir = f"{config.exp_path}/GIN/{args.dataset}/{architecture}"
    utils.make_dirs_checked(out_dir)

    # set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = (torch.device("cuda:" + str(args.device))
              if torch.cuda.is_available() else torch.device("cpu"))

    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    graphs, num_classes, g_labels, inv_label_dict = load_data_tudo(
        args.dataset)

    # 10-fold cross-validation: run only the fold specified by args.fold_idx.
    train_graphs, test_graphs, train_idx, test_idx = separate_data(
        args.dataset, graphs, seed, args.fold_idx, g_labels)
    # np.savetxt(f'{out_dir}/{file}_train_indices.txt', train_idx, delimiter=",")
    np.savetxt(f"{out_dir}/{fold_name}_test_indices.txt",
               test_idx,
               delimiter=",")

    model = GraphCNN(
        gin_config.num_layers,
        gin_config.num_mlp_layers,
        train_graphs[0].node_features.shape[1],
        gin_config.hidden_dim,
        num_classes,
        gin_config.final_dropout,
        args.learn_eps,
        gin_config.graph_pooling_type,
        gin_config.neighbor_pooling_type,
        device,
    ).to(device)

    optimizer = optim.Adam(model.parameters(), lr=gin_config.lr)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    for epoch in range(1, gin_config.epochs + 1):
        avg_loss = train(gin_config, model, device, train_graphs, optimizer,
                         epoch)
        # Step the LR scheduler after the optimizer updates; stepping it
        # first is a pre-1.1 PyTorch idiom that newer versions warn about.
        scheduler.step()
        acc_train, acc_test, _ = test(model, device, train_graphs, test_graphs,
                                      epoch)

        with open(f"{out_dir}/{fold_name}.txt", "a") as f:
            f.write("%f %f %f" % (avg_loss, acc_train, acc_test))
            f.write("\n")

        if epoch == gin_config.epochs:
            _, _, predictions = test(model, device, train_graphs, test_graphs,
                                     epoch)
            predictions = predictions.data.cpu().numpy().flatten().tolist()
            predictions = [inv_label_dict[pred] for pred in predictions]
            np.savetxt(
                f"{out_dir}/{fold_name}_test_predictions.txt",
                predictions,
                delimiter=",",
            )

        print("")

        print(model.eps)
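main() relies on a separate_data helper that is not shown here. Below is a hedged sketch of what it might look like, assuming a stratified 10-fold split as in the original GIN code; only the call signature is taken from main() above, the body is an assumption:

from sklearn.model_selection import StratifiedKFold

def separate_data(dataset, graph_list, seed, fold_idx, g_labels):
    # Hypothetical implementation: stratified 10-fold CV, made
    # reproducible via `seed`; returns the graphs and indices of the
    # requested fold. `dataset` is unused in this sketch but kept to
    # match the call in main().
    assert 0 <= fold_idx < 10, "fold_idx must be in [0, 10)"
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
    folds = list(skf.split(np.zeros(len(g_labels)), g_labels))
    train_idx, test_idx = folds[fold_idx]
    train_graphs = [graph_list[i] for i in train_idx]
    test_graphs = [graph_list[i] for i in test_idx]
    return train_graphs, test_graphs, train_idx, test_idx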