Example #1
def visualize_graph_dataset(dataset_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU
    config = {
        'dataset_name': dataset_name,  # Cora or PPI
        'layer_type': LayerType.IMP3,  # don't care here, but the load_graph_data function needs it
        'should_visualize': True  # visualize the dataset
    }
    load_graph_data(config, device)
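A hypothetical invocation (DatasetType appears later in this listing; its .name attribute evaluates to the plain string 'CORA', which load_graph_data is assumed to accept as the dataset name):

visualize_graph_dataset(DatasetType.CORA.name)  # plots the 'CORA' dataset before any training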
Example #2
def visualize_graph_dataset(dataset_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU
    config = {
        'dataset_name': dataset_name,
        'layer_type': LayerType.IMP3,  # don't care
        'should_visualize': True  # visualize the dataset
    }
    load_graph_data(config, device)
Example #3
def train_gat(config):
    global BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU, I hope so!

    # Step 1: load the graph data
    node_features, node_labels, edge_index, train_indices, val_indices, test_indices = load_graph_data(config, device)

    # Step 2: prepare the model
    gat = GAT(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        layer_type=config['layer_type'],
        log_attention_weights=False  # no need to store attentions, used only in playground.py while visualizing
    ).to(device)

    # Step 3: Prepare other training-related utilities (loss fn, optimizer, and the decorator function)
    loss_fn = nn.CrossEntropyLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # The decorator function makes things cleaner since there is a lot of redundancy between the train and val loops
    main_loop = get_main_loop(
        config,
        gat,
        loss_fn,
        optimizer,
        node_features,
        node_labels,
        edge_index,
        train_indices,
        val_indices,
        test_indices,
        config['patience_period'],
        time.time())

    BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: Start the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN, epoch=epoch)

        # Validation loop
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL, epoch=epoch)
            except Exception as e:  # "patience has run out" exception :O
                print(str(e))
                break  # break out from the training loop

    # Step 5: Potentially test your model
    # Don't overfit to the test dataset - only when you've fine-tuned your model on the validation dataset should you
    # report your final loss and accuracy on the test dataset. Friends don't let friends overfit to the test data. <3
    if config['should_test']:
        test_acc = main_loop(phase=LoopPhase.TEST)
        config['test_acc'] = test_acc
        print(f'Test accuracy = {test_acc}')
    else:
        config['test_acc'] = -1

    # Save the latest GAT in the binaries directory
    torch.save(utils.get_training_state(config, gat), os.path.join(BINARIES_PATH, utils.get_available_binary_name()))
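The body of get_main_loop isn't part of this listing, so here is a minimal sketch of the closure it plausibly returns, inferred purely from the call sites above (the phase/epoch keyword arguments, the "patience has run out" exception caught in the validation loop, and the accuracy returned in the TEST phase). The phase_to_indices helper and the exact early-stopping rule are assumptions, not the repo's actual code:

def get_main_loop(config, gat, loss_fn, optimizer, node_features, node_labels,
                  edge_index, train_indices, val_indices, test_indices,
                  patience_period, time_start):
    graph_data = (node_features, edge_index)  # GAT's forward pass consumes this tuple
    phase_to_indices = {
        LoopPhase.TRAIN: train_indices,
        LoopPhase.VAL: val_indices,
        LoopPhase.TEST: test_indices,
    }

    def main_loop(phase, epoch=0):
        global BEST_VAL_ACC, BEST_VAL_LOSS, PATIENCE_CNT

        if phase == LoopPhase.TRAIN:
            gat.train()
        else:
            gat.eval()

        node_indices = phase_to_indices[phase]
        gt_labels = node_labels.index_select(0, node_indices)
        scores = gat(graph_data)[0].index_select(0, node_indices)  # shape = (N, C)
        loss = loss_fn(scores, gt_labels)
        accuracy = (scores.argmax(dim=-1) == gt_labels).float().mean().item()

        if phase == LoopPhase.TRAIN:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        elif phase == LoopPhase.VAL:
            if accuracy > BEST_VAL_ACC or loss.item() < BEST_VAL_LOSS:
                BEST_VAL_ACC, BEST_VAL_LOSS = max(accuracy, BEST_VAL_ACC), min(loss.item(), BEST_VAL_LOSS)
                PATIENCE_CNT = 0  # reset the counter whenever validation improves
            else:
                PATIENCE_CNT += 1
                if PATIENCE_CNT >= patience_period:
                    raise Exception('Patience has run out, stopping the training.')
        else:
            return accuracy  # TEST phase reports the final metric

    return main_loop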
Example #4
def visualize_gat_properties(model_name=r'gat_000000.pth',
                             dataset_name=DatasetType.CORA.name,
                             visualization_type=VisualizationType.ATTENTION):
    """
    Using t-SNE to visualize GAT embeddings in 2D space.
    Check out this one for more intuition on how to tune t-SNE: https://distill.pub/2016/misread-tsne/

    If you think it'd be useful for me to implement t-SNE as well and explain how every single detail works
    open up an issue or DM me on social media! <3

    Note: I also tried using UMAP but it doesn't provide any more insight than t-SNE.
    (con: it has a lot of dependencies if you want to use their plotting functionality)

    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU, I hope so!

    config = {
        'dataset_name': dataset_name,
        'layer_type': LayerType.IMP3,
        'should_visualize': False  # don't visualize the dataset
    }

    # Step 1: Prepare the data
    node_features, node_labels, topology, train_indices, val_indices, test_indices = load_graph_data(config, device)

    # Step 2: Prepare the model
    model_path = os.path.join(BINARIES_PATH, model_name)
    model_state = torch.load(model_path, map_location=device)  # map_location lets us load a GPU-trained model on a CPU

    gat = GAT(num_of_layers=model_state['num_of_layers'],
              num_heads_per_layer=model_state['num_heads_per_layer'],
              num_features_per_layer=model_state['num_features_per_layer'],
              add_skip_connection=model_state['add_skip_connection'],
              bias=model_state['bias'],
              dropout=model_state['dropout'],
              layer_type=name_to_layer_type(model_state['layer_type']),
              log_attention_weights=True).to(device)

    print_model_metadata(model_state)
    gat.load_state_dict(model_state["state_dict"], strict=True)
    # some layers like nn.Dropout behave differently in train vs eval mode so this part is important
    gat.eval()

    # Step 3: Calculate the things we'll need for different visualization types (attention, scores, edge_index)

    # This context manager is important (and you'll often see it), otherwise PyTorch will eat much more memory.
    # It would be saving activations for backprop, but we're only doing prediction here, no training.
    with torch.no_grad():
        # Step 3: Run predictions and collect the high dimensional data
        all_nodes_unnormalized_scores, _ = gat((node_features, topology))  # shape = (N, num of classes)
        all_nodes_unnormalized_scores = all_nodes_unnormalized_scores.cpu().numpy()

    # We'll need the edge index for multiple visualization types
    if config['layer_type'] == LayerType.IMP3:  # imp 3 works with edge index while others work with adjacency info
        edge_index = topology
    else:
        edge_index = convert_adj_to_edge_index(topology)

    # Step 4: Perform a specific visualization
    if visualization_type == VisualizationType.ATTENTION:
        # The number of nodes for which we want to visualize their attention over neighboring nodes
        # (2x this actually as we add nodes with highest degree + random nodes)
        num_nodes_of_interest = 4  # 4 is an arbitrary number, feel free to play with these numbers
        head_to_visualize = 0  # plot attention from this multi-head attention's head
        gat_layer_id = 1  # plot attention from this GAT layer

        # Build up the complete graph
        # node_features shape = (N, FIN), where N is the number of nodes and FIN number of input features
        total_num_of_nodes = len(node_features)
        complete_graph = ig.Graph()
        complete_graph.add_vertices(total_num_of_nodes)  # igraph creates nodes with ids [0, total_num_of_nodes - 1]
        edge_index_tuples = list(zip(edge_index[0, :], edge_index[1, :]))  # igraph requires this format
        complete_graph.add_edges(edge_index_tuples)

        # Pick the target nodes to plot (nodes with highest degree + random nodes)
        # Note: there could be an overlap between random nodes and nodes with highest degree - but highly unlikely
        nodes_of_interest_ids = np.argpartition(complete_graph.degree(), -num_nodes_of_interest)[-num_nodes_of_interest:]
        random_node_ids = np.random.randint(low=0, high=total_num_of_nodes, size=num_nodes_of_interest)
        nodes_of_interest_ids = np.append(nodes_of_interest_ids, random_node_ids)
        np.random.shuffle(nodes_of_interest_ids)

        target_node_ids = edge_index[1]
        source_nodes = edge_index[0]

        for target_node_id in nodes_of_interest_ids:
            # Step 1: Find the neighboring nodes to the target node
            # Note: self edges for CORA are included so the target node is its own neighbor (Alexandro, I am your mother)
            src_nodes_indices = torch.eq(target_node_ids, target_node_id)
            source_node_ids = source_nodes[src_nodes_indices].cpu().numpy()
            size_of_neighborhood = len(source_node_ids)

            # Step 2: Fetch their labels
            labels = node_labels[source_node_ids].cpu().numpy()

            # Step 3: Fetch the attention weights for edges (attention is logged during GAT's forward pass above)
            # attention shape = (N, NH, 1) -> (N, NH) - we just squeeze the last dim, it's superfluous
            all_attention_weights = gat.gat_net[gat_layer_id].attention_weights.squeeze(dim=-1)
            attention_weights = all_attention_weights[src_nodes_indices, head_to_visualize].cpu().numpy()
            # This part shows that for CORA what GAT learns is pretty much constant attention weights! Like in GCN!
            print(f'Max attention weight = {np.max(attention_weights)} and min = {np.min(attention_weights)}')
            attention_weights /= np.max(attention_weights)  # rescale the biggest weight to 1 for nicer plotting

            # Build up the neighborhood graph whose attention we want to visualize
            # igraph constraint - it works with contiguous range of ids so we map e.g. node 497 to 0, 12 to 1, etc.
            id_to_igraph_id = dict(zip(source_node_ids, range(len(source_node_ids))))
            ig_graph = ig.Graph()
            ig_graph.add_vertices(size_of_neighborhood)
            ig_graph.add_edges([(id_to_igraph_id[neighbor], id_to_igraph_id[target_node_id]) for neighbor in source_node_ids])

            # Prepare the visualization settings dictionary and plot
            visual_style = {
                "edge_width": attention_weights,  # make edges as thick as the corresponding attention weight
                "layout": ig_graph.layout_reingold_tilford_circular()  # layout for tree-like graphs
            }
            # This is the only part that's Cora specific as Cora has 7 labels
            if dataset_name.lower() == DatasetType.CORA.name.lower():
                visual_style["vertex_color"] = [
                    cora_label_to_color_map[label] for label in labels
                ]
            else:
                print(
                    'Add custom color scheme for your specific dataset. Using igraph default coloring.'
                )

            ig.plot(ig_graph, **visual_style)

    elif visualization_type == VisualizationType.EMBEDDINGS:  # visualize embeddings (using t-SNE)
        node_labels = node_labels.cpu().numpy()
        num_classes = len(set(node_labels))

        # Feel free to experiment with perplexity - it's arguably the most important parameter of t-SNE: it basically
        # controls the standard deviation of the Gaussians, i.e. the size of the neighborhoods in the high dim (original) space.
        # Simply put, the goal of t-SNE is to minimize the KL-divergence between the joint Gaussian distribution fit over the
        # high dim points and the Student-t distribution fit over the low dimensional points (the ones we're plotting).
        # Intuitively, by doing this, we preserve the similarities (relationships) between the high and low dim points.
        # This (probably) won't make much sense if you're not already familiar with t-SNE, God knows I've tried. :P
        t_sne_embeddings = TSNE(n_components=2, perplexity=30, method='barnes_hut').fit_transform(all_nodes_unnormalized_scores)

        for class_id in range(num_classes):
            # We extract the points whose true label equals class_id and we color them in the same way, hopefully
            # they'll be clustered together on the 2D chart - that would mean that GAT has learned good representations!
            plt.scatter(t_sne_embeddings[node_labels == class_id, 0],
                        t_sne_embeddings[node_labels == class_id, 1],
                        s=20,
                        color=cora_label_to_color_map[class_id],
                        edgecolors='black',
                        linewidths=0.2)
        plt.show()

    # We want our local probability distributions (attention weights over the neighborhoods) to be
    # non-uniform because that means that GAT is learning a useful pattern. Entropy histograms help us visualize
    # how different those neighborhood distributions are from the uniform distribution (constant attention).
    # If GAT is learning constant attention we might as well use GCN or an even simpler model.
    elif visualization_type == VisualizationType.ENTROPY:
        num_heads_per_layer = [layer.num_of_heads for layer in gat.gat_net]
        num_layers = len(num_heads_per_layer)

        num_of_nodes = len(node_features)
        target_node_ids = edge_index[1].cpu().numpy()

        # For every GAT layer and for every GAT attention head plot the entropy histogram
        for layer_id in range(num_layers):
            # Fetch the attention weights for edges (attention is logged during GAT's forward pass above)
            # attention shape = (N, NH, 1) -> (N, NH) - we just squeeze the last dim it's superfluous
            all_attention_weights = gat.gat_net[layer_id].attention_weights.squeeze(dim=-1).cpu().numpy()

            for head_id in range(num_heads_per_layer[layer_id]):
                uniform_dist_entropy_list = []  # save the ideal uniform histogram as the reference
                neighborhood_entropy_list = []

                for target_node_id in range(num_of_nodes):  # find the neighborhood for every node in the graph
                    # These attention weights sum up to 1 by GAT design so we can treat it as a probability distribution
                    neighborhood_attention = all_attention_weights[target_node_ids == target_node_id].flatten()
                    # Reference uniform distribution of the same length
                    ideal_uniform_attention = np.ones(len(neighborhood_attention)) / len(neighborhood_attention)

                    # Calculate the entropy, check out this video if you're not familiar with the concept:
                    # https://www.youtube.com/watch?v=ErfnhcEV1O8 (Aurélien Géron)
                    neighborhood_entropy_list.append(entropy(neighborhood_attention, base=2))
                    uniform_dist_entropy_list.append(entropy(ideal_uniform_attention, base=2))

                title = f'Cora entropy histogram layer={layer_id}, attention head={head_id}'
                draw_entropy_histogram(uniform_dist_entropy_list,
                                       title,
                                       color='orange',
                                       uniform_distribution=True)
                draw_entropy_histogram(neighborhood_entropy_list,
                                       title,
                                       color='dodgerblue')

                fig = plt.gcf()  # get current figure
                plt.show()
                fig.savefig(os.path.join(DATA_DIR_PATH, f'layer_{layer_id}_head_{head_id}.jpg'))
                plt.close()
    else:
        raise Exception(f'Visualization type {visualization_type} not supported.')
Example #5
def train_gat_cora(config):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # checking whether you have a GPU, I hope so!

    # Step 1: load the graph data
    node_features, node_labels, edge_index, train_indices, val_indices, test_indices = load_graph_data(config, device)

    # What is edge_index? It's a representation of the graph's edges: shape (2, E),
    # where row 0 holds the source node ids and row 1 the target node ids.
    graph_data = (node_features, edge_index)

    # Step 2: prepare the model
    gat = GAT(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        layer_type=config['layer_type'],
        log_attention_weights=False  # no need to store attentions, used only in playground.py for visualizations
    ).to(device)

    # Step 3: Prepare other training-related utilities (loss fn, optimizer, and the decorator function)
    loss_fn = nn.CrossEntropyLoss(reduction='mean')
    optimizer = Adam(gat.parameters(),
                     lr=config['lr'],
                     weight_decay=config['weight_decay'])

    # Note: the code below lives inside the main_loop function (the "decorator" mentioned above). It's shown inline
    # here, wrapped in a minimal closure so that the phase-dependent names (node_indices, gt_node_labels) are defined.
    node_dim = 0  # the dimension along which we index nodes
    phase_to_indices = {LoopPhase.TRAIN: train_indices, LoopPhase.VAL: val_indices, LoopPhase.TEST: test_indices}

    def main_loop(phase, epoch=0):
        node_indices = phase_to_indices[phase]
        gt_node_labels = node_labels.index_select(node_dim, node_indices)

        if phase == LoopPhase.TRAIN:
            gat.train()
        else:
            gat.eval()

        # Do a forward pass and extract only the relevant node scores (train/val or test ones)
        # Note: [0] just extracts the node_features part of the data (index 1 contains the edge_index)
        # shape = (N, C) where N is the number of nodes in the split (train/val/test) and C is the number of classes
        nodes_unnormalized_scores = gat(graph_data)[0].index_select(node_dim, node_indices)

        # Example: let's take an output for a single node on Cora - it's a vector of size 7 and it contains unnormalized
        # scores like: V = [-1.393,  3.0765, -2.4445,  9.6219,  2.1658, -5.5243, -4.6247]
        # What PyTorch's cross entropy loss does is, for every such vector, first apply a softmax, so V gets
        # transformed into: [1.6421e-05, 1.4338e-03, 5.7378e-06, 0.99797, 5.7673e-04, 2.6376e-07, 6.4848e-07]
        # Secondly, whatever the correct class is (say it's 3), it will then take the element at position 3,
        # 0.99797 in this case, and the loss will be -log(0.99797). It does this for every node and applies a mean.
        # You can see that as the probability of the correct class for most nodes approaches 1 we get to 0 loss! <3
        loss = loss_fn(nodes_unnormalized_scores, gt_node_labels)

        if phase == LoopPhase.TRAIN:
            optimizer.zero_grad()  # clean the trainable weights gradients in the computational graph (.grad fields)
            loss.backward()  # compute the gradients for every trainable weight in the computational graph
            optimizer.step()  # apply the gradients to weights

    # Step 4: a minimal training loop using the closure above (see train_gat in Example #3 for the full version)
    for epoch in range(config['num_of_epochs']):
        main_loop(phase=LoopPhase.TRAIN, epoch=epoch)
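The arithmetic in the comment above is easy to verify; a self-contained sanity check (PyTorch only, the numbers are the ones quoted in the comment):

import torch
import torch.nn.functional as F

# The example vector V from the comment above - unnormalized scores for one Cora node (7 classes)
v = torch.tensor([[-1.393, 3.0765, -2.4445, 9.6219, 2.1658, -5.5243, -4.6247]])
print(F.softmax(v, dim=1))  # class 3 gets ~0.99797, exactly as stated above

# cross entropy = softmax followed by the negative log-likelihood of the correct class
target = torch.tensor([3])
print(F.cross_entropy(v, target))  # -log(0.99797) ≈ 0.00203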
Example #6
def train_gat_ppi(config):
    """
    Very similar to Cora's training script. The main differences are:
    1. Using dataloaders since we're dealing with an inductive setting - multiple graphs per batch
    2. Doing multi-label classification (BCEWithLogitsLoss) and reporting micro-F1 instead of accuracy
    3. Model architecture and hyperparams are a bit different (as reported in the GAT paper)

    """
    global BEST_VAL_PERF, BEST_VAL_LOSS, PATIENCE_CNT

    # Checking whether you have a strong GPU. Since PPI training requires almost 8 GBs of VRAM
    # I've added the option to force the use of CPU even though you have a GPU on your system (but it's too weak).
    device = torch.device("cuda" if torch.cuda.is_available() and not config['force_cpu'] else "cpu")

    # Step 1: prepare the data loaders
    data_loader_train, data_loader_val, data_loader_test = load_graph_data(config, device)

    # Step 2: prepare the model
    gat = GAT(
        num_of_layers=config['num_of_layers'],
        num_heads_per_layer=config['num_heads_per_layer'],
        num_features_per_layer=config['num_features_per_layer'],
        add_skip_connection=config['add_skip_connection'],
        bias=config['bias'],
        dropout=config['dropout'],
        layer_type=config['layer_type'],
        log_attention_weights=False  # no need to store attentions, used only in playground.py for visualizations
    ).to(device)

    # Step 3: Prepare other training-related utilities (loss fn, optimizer, and the decorator function)
    loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = Adam(gat.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

    # The decorator function makes things cleaner since there is a lot of redundancy between the train and val loops
    main_loop = get_main_loop(
        config,
        gat,
        loss_fn,
        optimizer,
        config['patience_period'],
        time.time())

    BEST_VAL_PERF, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: Start the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN, data_loader=data_loader_train, epoch=epoch)

        # Validation loop
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL, data_loader=data_loader_val, epoch=epoch)
            except Exception as e:  # "patience has run out" exception :O
                print(str(e))
                break  # break out from the training loop

    # Step 5: Potentially test your model
    # Don't overfit to the test dataset - only when you've fine-tuned your model on the validation dataset should you
    # report your final loss and micro-F1 on the test dataset. Friends don't let friends overfit to the test data. <3
    if config['should_test']:
        micro_f1 = main_loop(phase=LoopPhase.TEST, data_loader=data_loader_test)
        config['test_perf'] = micro_f1

        print('*' * 50)
        print(f'Test micro-F1 = {micro_f1}')
    else:
        config['test_perf'] = -1

    # Save the latest GAT in the binaries directory
    torch.save(
        utils.get_training_state(config, gat),
        os.path.join(BINARIES_PATH, utils.get_available_binary_name(config['dataset_name']))
    )
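The micro-F1 computation itself isn't shown in this listing; here is a minimal sketch of how it's typically derived from BCEWithLogitsLoss-style outputs. The 0-logit threshold (equivalent to sigmoid > 0.5) and the use of sklearn are assumptions, not code from the repo:

import torch
from sklearn.metrics import f1_score

# PPI is multi-label: each of the 121 classes is an independent binary decision per node
logits = torch.randn(4, 121)                # stand-in for the model's unnormalized scores, shape (N, C)
gt = torch.randint(0, 2, (4, 121)).float()  # stand-in binary ground-truth labels

pred = (logits > 0).float()                 # logit > 0  <=>  sigmoid(logit) > 0.5
print(f1_score(gt.numpy(), pred.numpy(), average='micro'))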
Example #7
def train_gat_ppi(config):

    # Globals used for early stopping: best validation micro-F1, best validation loss, and the patience counter
    global BEST_VAL_MICRO_F1, BEST_VAL_LOSS, PATIENCE_CNT

    device = torch.device("cuda" if torch.cuda.is_available()
                          and not config['force_cpu'] else "cpu")

    # Step 1: load the data
    data_loader_train, data_loader_val, data_loader_test = load_graph_data(
        config, device)

    # Step 2: prepare the model
    gat = GAT_ppi(num_of_layers=config['num_of_layers'],
                  num_heads_per_layer=config['num_heads_per_layer'],
                  num_features_per_layer=config['num_features_per_layer'],
                  add_skip_connection=config['add_skip_connection'],
                  bias=config['bias'],
                  dropout=config['dropout'],
                  log_attention_weights=False).to(device)

    # Step 3: prepare the training utilities
    loss_fn = nn.BCEWithLogitsLoss(reduction='mean')
    optimizer = Adam(gat.parameters(),
                     lr=config['lr'],
                     weight_decay=config['weight_decay'])

    # get_main_loop returns the main loop function, which improves code reuse
    main_loop = get_main_loop(config=config,
                              gat=gat,
                              sigmoid_cross_entropy_loss=loss_fn,
                              optimizer=optimizer,
                              patience_period=config['patience_period'],
                              time_start=time.time())

    BEST_VAL_MICRO_F1, BEST_VAL_LOSS, PATIENCE_CNT = [0, 0, 0]  # reset vars used for early stopping

    # Step 4: start the training procedure
    for epoch in range(config['num_of_epochs']):
        # Training loop
        main_loop(phase=LoopPhase.TRAIN,
                  data_loader=data_loader_train,
                  epoch=epoch)

        # Validation loop
        with torch.no_grad():
            try:
                main_loop(phase=LoopPhase.VAL,
                          data_loader=data_loader_val,
                          epoch=epoch)
            except Exception as e:
                print(str(e))
                break

    # Step 5: potentially test the model
    if config['should_test']:
        micro_f1 = main_loop(phase=LoopPhase.TEST,
                             data_loader=data_loader_test)
        config['test_perf'] = micro_f1

        print('*' * 50)
        print(f'Test micro-F1 = {micro_f1}')

    else:
        config['test_perf'] = -1

    # Save the latest GAT model binary to the binaries directory
    torch.save(
        utils.get_training_state(config, gat),
        os.path.join(BINARIES_PATH,
                     utils.get_available_binary_name(config['dataset_name'])))
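For completeness, a hypothetical config for the PPI scripts above. The architecture numbers follow what the GAT paper reports for PPI (3 layers, [4, 4, 6] heads, 50 input features, 121 output labels), but the exact key set and default values are assumptions - in the real repo they come from the argument parser:

config = {
    'dataset_name': 'PPI',
    'num_of_layers': 3,
    'num_heads_per_layer': [4, 4, 6],
    'num_features_per_layer': [50, 256, 256, 121],  # 50 input features, 121 labels
    'add_skip_connection': True,   # the GAT paper uses skip connections on PPI
    'bias': True,
    'dropout': 0.0,                # the paper reports no dropout needed on PPI
    'layer_type': LayerType.IMP3,
    'lr': 0.005,
    'weight_decay': 0.0,
    'num_of_epochs': 200,
    'patience_period': 100,
    'should_test': True,
    'force_cpu': False,
}
train_gat_ppi(config)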