Example #1
def visualize_glayout_without_training(layout="tsne", **kwargs):
    _args = get_args(**kwargs)
    pprint_args(_args)
    train_d, val_d, test_d = get_dataset_or_loader(
        _args.dataset_class,
        _args.dataset_name,
        _args.data_root,
        batch_size=_args.batch_size,
        seed=_args.seed,
    )
    data = train_d[0]
    plot_graph_layout(data.x.numpy(),
                      data.y.numpy(),
                      data.edge_index.numpy(),
                      args=_args,
                      edge_to_attention=None,
                      key="raw",
                      layout=layout)
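
A minimal invocation sketch for the function above. The keyword arguments are simply forwarded to get_args(**kwargs); the model/dataset values shown here are illustrative assumptions, not the only supported ones.

visualize_glayout_without_training(
    layout="tsne",              # layout algorithm passed through to plot_graph_layout
    model_name="GAT",           # hypothetical get_args(**kwargs) arguments
    dataset_class="Planetoid",
    dataset_name="Cora",
    custom_key="NE",
)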
Example #2
def visualize_attention_metric_for_multiple_models(
        name_prefix_and_kwargs: List[Tuple[str, Dict]],
        unit_width_per_name=3,
        extension="png"):
    res = None
    total_args, num_layers, custom_key_list, name_prefix_list = None, None, [], []
    kld1_list, kld2_list, jsd_list, ent_list = [], [], [], []  # [L * M, N]
    for name_prefix, kwargs in name_prefix_and_kwargs:
        args = get_args(**kwargs)
        custom_key_list.append(args.custom_key)
        num_layers = args.num_layers

        train_d, val_d, test_d = get_dataset_or_loader(
            args.dataset_class,
            args.dataset_name,
            args.data_root,
            batch_size=args.batch_size,
            seed=args.seed,
        )
        if val_d is None and test_d is None:
            data_list = [train_d[0]]
        else:
            data_list = []
            for _data in chain(train_d, val_d, test_d):
                if _data.x.size(0) != len(_data.agreement_dist):
                    _data.agreement_dist = [
                        _ad for _ad in _data.agreement_dist[0]
                    ]
                    _data.uniform_att_dist = [
                        _uad for _uad in _data.uniform_att_dist[0]
                    ]
                data_list.append(_data)

        # Pick one random GPU that is not on the deny list.
        gpu_id = int(np.random.choice([
            g for g in range(args.num_gpus_total)
            if g not in args.gpu_deny_list
        ]))

        if args.verbose >= 1:
            pprint_args(args)
            cprint("Use GPU the ID of which is {}".format(gpu_id), "yellow")

        device = "cpu" if gpu_id is None \
            else torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')

        model, ret = run(args, gpu_id=gpu_id, return_model=True)

        kld1_layer, kld2_layer, jsd_layer, ent_layer, *res = \
            get_attention_metric_for_single_model_and_multiple_data(model, data_list, device)
        kld1_list += kld1_layer
        kld2_list += kld2_layer
        jsd_list += jsd_layer
        ent_list += ent_layer
        name_prefix_list.append(name_prefix)
        total_args = args

        torch.cuda.empty_cache()

    total_args.custom_key = "-".join(sorted(custom_key_list))
    plot_kld_jsd_ent(kld1_list,
                     kld2_list,
                     jsd_list,
                     ent_list,
                     *res,
                     num_layers=num_layers,
                     model_args=total_args,
                     epoch=-1,
                     name_prefix_list=name_prefix_list,
                     unit_width_per_name=unit_width_per_name,
                     extension=extension,
                     flierprops={
                         "marker": "x",
                         "markersize": 12
                     })
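
A hedged usage sketch for the function above: each (name_prefix, kwargs) pair trains one model via run(), and the attention metrics of all models are pooled into a single plot. The custom_key values below are placeholders, not keys guaranteed to exist in the repository's configuration.

name_prefix_and_kwargs = [
    ("GO", dict(model_name="GAT", dataset_class="Planetoid",
                dataset_name="Cora", custom_key="NEO8")),    # hypothetical keys
    ("DP", dict(model_name="GAT", dataset_class="Planetoid",
                dataset_name="Cora", custom_key="NEDPO8")),
]
visualize_attention_metric_for_multiple_models(
    name_prefix_and_kwargs,
    unit_width_per_name=3,
    extension="pdf",
)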
Example #3
def get_degree_and_homophily(dataset_class,
                             dataset_name,
                             data_root,
                             use_multiprocessing=False,
                             use_loader=False,
                             **kwargs) -> np.ndarray:
    """
    :param dataset_class: str
    :param dataset_name: str
    :param data_root: str
    :param use_multiprocessing:
    :param use_loader:
    :return: np.ndarray the shape of which is [N, 2] (degree, homophily) for Ns
    """
    print(f"{dataset_class} / {dataset_name} / {data_root}")

    _data_attr = get_dataset_or_loader(dataset_class,
                                       dataset_name,
                                       data_root,
                                       seed=42,
                                       **kwargs)
    val_d, test_d, train_loader, eval_loader = None, None, None, None
    if not use_loader:
        train_d, val_d, test_d = _data_attr
    else:
        train_d, train_loader, eval_loader = _data_attr

    if dataset_name in ["PPI", "WebKB4Univ", "CLUSTER"]:
        cum_sum = 0
        y_list, edge_index_list = [], []
        for _data in chain(train_d, val_d, test_d):
            y_list.append(_data.y)
            edge_index_list.append(_data.edge_index + cum_sum)
            cum_sum += _data.y.size(0)
        y = torch.cat(y_list, dim=0)
        edge_index = torch.cat(edge_index_list, dim=1)

    elif use_loader and dataset_name in ["Reddit"]:
        cum_sum = 0
        y_list, edge_index_list = [], []
        data = train_d[0]
        for _data in chain(
                train_loader(data.train_mask),
                eval_loader(data.val_mask),
                eval_loader(data.test_mask),
        ):
            y_list.append(data.y[_data.n_id])
            edge_index_list.append(_data.edge_index + cum_sum)
            cum_sum += _data.n_id.size(0)
        y = torch.cat(y_list, dim=0)
        edge_index = torch.cat(edge_index_list, dim=1)
        cprint(f"Edges: {edge_index.size()}, Y: {y.size()}", "yellow")

    else:
        data = train_d[0]
        y_list, edge_index_list = None, None
        y, edge_index = data.y, data.edge_index

    deg = degree(edge_index[0], num_nodes=y.size(0))
    if y_list is None:
        homophily = get_homophily(edge_index,
                                  y,
                                  use_multiprocessing=use_multiprocessing)
    else:
        homophily = get_homophily_from_list(edge_index_list, y_list,
                                            use_multiprocessing)

    degree_and_homophily = []
    for _deg, _hom in zip(deg, homophily):
        _deg, _hom = int(_deg), float(_hom)
        if _deg != 0:
            degree_and_homophily.append([_deg, _hom])
    return np.asarray(degree_and_homophily)
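
A short usage sketch, assuming matplotlib is installed; it scatters per-node homophily against degree, which is what the [N, 2] return value is shaped for. The data_root path is a placeholder.

import matplotlib.pyplot as plt

dh = get_degree_and_homophily("Planetoid", "Cora", data_root="~/graph-data")  # hypothetical root
degrees, homophilies = dh[:, 0], dh[:, 1]

plt.scatter(degrees, homophilies, s=4, alpha=0.5)
plt.xscale("log")                 # degree distributions are typically heavy-tailed
plt.xlabel("Degree")
plt.ylabel("Homophily")
plt.show()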
Example #4
def get_graph_property(graph_property_list,
                       dataset_class,
                       dataset_name,
                       data_root,
                       verbose=True,
                       **kwargs):
    _data_attr = get_dataset_or_loader(dataset_class,
                                       dataset_name,
                                       data_root,
                                       seed=42,
                                       **kwargs)
    train_d, val_d, test_d = _data_attr

    if dataset_name in ["PPI", "WebKB4Univ", "CLUSTER"]:
        cum_sum = 0
        y_list, edge_index_list = [], []
        for _data in chain(train_d, val_d, test_d):
            y_list.append(_data.y)
            edge_index_list.append(_data.edge_index + cum_sum)
            cum_sum += _data.y.size(0)
        y = torch.cat(y_list, dim=0)
        edge_index = torch.cat(edge_index_list, dim=1)

    else:
        data = train_d[0]
        y, edge_index = data.y, data.edge_index
        y_list, edge_index_list = [y], [edge_index]

    # Convert to networkx graphs, undirected whenever the edge_index is undirected.
    one_nxg = to_networkx(Data(edge_index=edge_index),
                          to_undirected=is_undirected(edge_index))
    nxg_list = [
        to_networkx(Data(edge_index=ei), to_undirected=is_undirected(ei))
        for ei in edge_index_list
    ]

    ni_nxg_list = [deepcopy(nxg) for nxg in nxg_list]
    for ni_nxg in ni_nxg_list:
        ni_nxg.remove_nodes_from(list(nx.isolates(ni_nxg)))

    gp_dict = {}
    if graph_property_list is None or "diameter" in graph_property_list:
        diameter_list = []
        for ni_nxg in ni_nxg_list:
            ni_nxg = ni_nxg.to_undirected()  # important for computing cc.
            for cc in nx.connected_components(ni_nxg):
                ni_nxg_cc = ni_nxg.subgraph(cc).copy()
                diameter_list.append(
                    nx.algorithms.distance_measures.diameter(ni_nxg_cc))
        gp_dict["diameter_mean"] = float(np.mean(diameter_list))
        gp_dict["diameter_std"] = float(np.std(diameter_list))
        gp_dict["diameter_max"] = float(np.max(diameter_list))
        gp_dict["diameter_min"] = float(np.min(diameter_list))
        gp_dict["diameter_n"] = len(diameter_list)

    if graph_property_list is None or "average_clustering_coefficient" in graph_property_list:
        gp_dict["average_clustering_coefficient"] = nx.average_clustering(
            one_nxg)

    if verbose:
        print(f"{dataset_class} / {dataset_name} / {data_root}")
        pprint(gp_dict)

    if graph_property_list is None or "centrality" in graph_property_list:
        dc = nx.degree_centrality(one_nxg)
        gp_dict["degree_centrality_mean"] = float(np.mean(list(dc.values())))
        gp_dict["degree_centrality_std"] = float(np.std(list(dc.values())))
        cc = nx.closeness_centrality(one_nxg)
        gp_dict["closeness_centrality_mean"] = float(np.mean(list(cc.values())))
        gp_dict["closeness_centrality_std"] = float(np.std(list(cc.values())))

    if graph_property_list is None or "assortativity" in graph_property_list:
        gp_dict["degree_assortativity_coefficient"] = \
            nx.degree_assortativity_coefficient(one_nxg)

    if verbose:
        print(f"{dataset_class} / {dataset_name} / {data_root}")
        pprint(gp_dict)

    return gp_dict
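
A minimal call sketch; the property names follow the keys handled in the function body, and passing None computes all of them. The data_root is a placeholder.

gp = get_graph_property(
    ["diameter", "average_clustering_coefficient"],  # or None for every property
    dataset_class="Planetoid",
    dataset_name="Cora",
    data_root="~/graph-data",                        # hypothetical root
)
print(gp["diameter_mean"], gp["average_clustering_coefficient"])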
Example #5
            edge_index=edge_index,
            edge_sampling_ratio=edge_sampling_ratio,
            criterion=criterion,
        )
        return loss


if __name__ == '__main__':
    from arguments import get_args

    main_args = get_args(
        model_name="GCN",
        dataset_class="PPI",
        dataset_name="PPI",
        custom_key="NE",
    )

    train_d, val_d, test_d = get_dataset_or_loader(
        main_args.dataset_class,
        main_args.dataset_name,
        main_args.data_root,
        batch_size=main_args.batch_size,
        seed=main_args.seed,
    )

    _m = LinkGNN(main_args, train_d)

    for b in train_d:
        ob = _m(b.x, b.edge_index)
        print(b.x.size(), b.edge_index.size(), ob.size())
Example #6
def run(args, gpu_id=None, return_model=False, return_time_series=False):
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    running_device = "cpu" if gpu_id is None \
        else torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')

    best_val_perf = 0.
    test_perf_at_best_val = 0.
    best_test_perf = 0.
    best_test_perf_at_best_val = 0.
    link_test_perf_at_best_val = 0.

    val_loss_deque = deque(maxlen=args.early_stop_queue_length)
    val_perf_deque = deque(maxlen=args.early_stop_queue_length)

    dataset_kwargs = {}
    if args.dataset_class == "ENSPlanetoid":
        dataset_kwargs["neg_sample_ratio"] = args.neg_sample_ratio
    if args.dataset_name == "ogbn-arxiv":
        dataset_kwargs["to_undirected"] = args.to_undirected
    if args.dataset_name == "ogbn-products":
        dataset_kwargs["size"] = args.data_sampling_size
        dataset_kwargs["num_hops"] = args.data_sampling_num_hops
        dataset_kwargs["shuffle"] = True

    _data_attr = get_dataset_or_loader(
        args.dataset_class,
        args.dataset_name,
        args.data_root,
        batch_size=args.batch_size,
        seed=args.seed,
        **dataset_kwargs,
    )
    if _data_attr[-1] is None:
        train_d, val_d, test_d = _data_attr
        loader = None
        dataset_or_loader = train_d
        eval_dataset_or_loader = None
    else:
        train_d, train_loader, eval_loader = _data_attr
        val_d, test_d = None, None
        dataset_or_loader = (train_d, train_loader)
        eval_dataset_or_loader = (train_d, eval_loader)

    net_cls = _get_model_cls(args.model_name)
    net = net_cls(args, train_d)
    net = net.to(running_device)

    loaded = load_model(net, args, target_epoch=None)
    if loaded is not None:
        net, other_state_dict = loaded
        best_val_perf = other_state_dict["perf"]
        args.start_epoch = other_state_dict["epoch"]

    loss_func = eval(str(args.loss)) or nn.CrossEntropyLoss()  # nn.BCEWithLogitsLoss(), nn.CrossEntropyLoss()
    adam_optim = optim.Adam(net.parameters(),
                            lr=args.lr,
                            weight_decay=args.l2_lambda)
    evaluator = Evaluator(name=args.dataset_name)

    ret = {}
    val_perf_list, test_perf_list, val_loss_list = [], [], []
    perf_task_for_val = getattr(args, "perf_task_for_val", "Node")
    for current_iter, epoch in enumerate(
            tqdm(range(args.start_epoch, args.start_epoch + args.epochs))):

        train_loss = train_model(running_device,
                                 net,
                                 dataset_or_loader,
                                 loss_func,
                                 adam_optim,
                                 epoch=epoch,
                                 _args=args)

        if args.verbose >= 2 and epoch % args.val_interval == 0:
            print("\n\t- Train loss: {}".format(train_loss))

        # Validation.
        if epoch % args.val_interval == 0:

            val_perf, val_loss, test_perf, test_loss = test_model(
                running_device,
                net,
                eval_dataset_or_loader or dataset_or_loader,
                loss_func,
                evaluator=evaluator,
                _args=args,
                val_or_test="val",
                verbose=args.verbose,
                run_link_prediction=(perf_task_for_val == "Link"),
            )
            garbage_collection_cuda()

            if args.save_plot:
                val_perf_list.append(val_perf)
                test_perf_list.append(test_perf)
                val_loss_list.append(val_loss.item())

            if test_perf > best_test_perf:
                best_test_perf = test_perf

            if val_perf >= best_val_perf:

                print_color = "yellow"
                best_val_perf = val_perf
                test_perf_at_best_val = test_perf

                if test_perf_at_best_val > best_test_perf_at_best_val:
                    best_test_perf_at_best_val = test_perf_at_best_val

                if args.task_type == "Link_Prediction":
                    link_test_perf, _ = test_model(running_device,
                                                   net,
                                                   test_d or train_d,
                                                   loss_func,
                                                   _args=args,
                                                   val_or_test="test",
                                                   verbose=0,
                                                   run_link_prediction=True)
                    link_test_perf_at_best_val = link_test_perf

                if args.save_model:
                    save_model(net, args, target_epoch=epoch, perf=val_perf)

            else:
                print_color = None

            ret = {
                "best_val_perf": best_val_perf,
                "test_perf_at_best_val": test_perf_at_best_val,
                "best_test_perf": best_test_perf,
                "best_test_perf_at_best_val": best_test_perf_at_best_val,
            }
            if args.verbose >= 1:
                cprint_multi_lines("\t- ", print_color, **ret)

            # Check early stop condition
            if args.use_early_stop and current_iter > args.early_stop_patience:
                recent_val_loss_mean = float(np.mean(val_loss_deque))
                val_loss_change = abs(recent_val_loss_mean -
                                      val_loss) / recent_val_loss_mean
                recent_val_perf_mean = float(np.mean(val_perf_deque))
                val_perf_change = abs(recent_val_perf_mean -
                                      val_perf) / recent_val_perf_mean

                if (val_loss_change < args.early_stop_threshold_loss) or \
                        (val_perf_change < args.early_stop_threshold_perf):
                    if args.verbose >= 1:
                        cprint("Early Stopped at epoch {}".format(epoch),
                               "red")
                        cprint(
                            "\t- val_loss_change is {} (thres: {}) | {} -> {}".
                            format(
                                round(val_loss_change, 6),
                                round(args.early_stop_threshold_loss, 6),
                                recent_val_loss_mean,
                                val_loss,
                            ), "red")
                        cprint(
                            "\t- val_perf_change is {} (thres: {}) | {} -> {}".
                            format(
                                round(val_perf_change, 6),
                                round(args.early_stop_threshold_perf, 6),
                                recent_val_perf_mean,
                                val_perf,
                            ), "red")
                    break
            val_loss_deque.append(val_loss)
            val_perf_deque.append(val_perf)

    if args.task_type == "Link_Prediction":
        ret = {"link_test_perf_at_best_val": link_test_perf_at_best_val, **ret}

    if args.save_plot:
        save_loss_and_perf_plot([val_loss_list, val_perf_list, test_perf_list],
                                ret,
                                args,
                                columns=["val_loss", "val_perf", "test_perf"])

    if return_model:
        return net, ret
    if return_time_series:
        return {
            "val_loss_list": val_loss_list,
            "val_perf_list": val_perf_list,
            "test_perf_list": test_perf_list,
            **ret
        }

    return ret
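
A usage sketch mirroring the __main__ block of Example #5. The argument values are placeholders and must name a configuration and dataset this repository (and its Evaluator) actually supports; the three calls show the return shapes selected by return_model and return_time_series.

from arguments import get_args

main_args = get_args(
    model_name="GAT",            # hypothetical configuration
    dataset_class="Planetoid",
    dataset_name="Cora",
    custom_key="NE",
)

ret = run(main_args, gpu_id=0)                              # metrics dict only
model, ret = run(main_args, gpu_id=0, return_model=True)    # trained net + metrics
series = run(main_args, gpu_id=0, return_time_series=True)  # metrics + val/test curves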