def fan(args, feat=None):
    """Train and evaluate a GCN graph classifier on a pickled dataset.

    The pickle at ``args.datadir/args.pkl_fname`` is expected to hold
    (train_graphs, train_labels, test_graphs, test_labels). When ``feat``
    is None, constant all-ones node features of dimension
    ``args.input_dim`` are generated for every graph.
    """
    pkl_path = os.path.join(args.datadir, args.pkl_fname)
    with open(pkl_path, "rb") as handle:
        loaded = pickle.load(handle)
    graphs, labels = loaded[0], loaded[1]
    test_graphs, test_labels = loaded[2], loaded[3]

    # Attach the graph-level class label to each graph object.
    for idx, G in enumerate(graphs):
        G.graph["label"] = labels[idx]
    for idx, G in enumerate(test_graphs):
        G.graph["label"] = test_labels[idx]

    if feat is None:
        # No features supplied: fall back to constant all-ones vectors.
        const_gen = featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float))
        for G in graphs:
            const_gen.gen_node_features(G)
        for G in test_graphs:
            const_gen.gen_node_features(G)

    # max_num_nodes is not needed for the plain (non-pooling) encoder.
    train_dataset, test_dataset, _ = prepare_data(
        graphs, args, test_graphs=test_graphs)
    model = models.GcnEncoderGraph(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        args.num_classes,
        args.num_gc_layers,
        bn=args.bn,
    ).cuda()
    train(train_dataset, model, args, test_dataset=test_dataset)
    evaluate(test_dataset, model, args, "Validation")
def benchmark_task_val(args, writer=None, feat="node-label"):
    """Run 10-fold cross-validation of a GCN on a benchmark dataset.

    Node features are taken from the graphs' own features, from node
    labels, or generated as constants, depending on ``feat`` and on what
    the dataset provides. Prints the per-epoch validation accuracies
    averaged over the 10 folds, plus the best value and its epoch index.
    """
    graphs = io_utils.read_graphfile(args.datadir,
                                     args.bmname,
                                     max_nodes=args.max_nodes)

    if feat == "node-feat" and "feat_dim" in graphs[0].graph:
        print("Using node features")
        input_dim = graphs[0].graph["feat_dim"]
    elif feat == "node-label" and "label" in graphs[0].nodes[0]:
        print("Using node labels")
        # Promote each node's label to its feature vector.
        for graph in graphs:
            for node_id in graph.nodes():
                graph.nodes[node_id]["feat"] = np.array(
                    graph.nodes[node_id]["label"])
    else:
        print("Using constant labels")
        const_gen = featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float))
        for graph in graphs:
            const_gen.gen_node_features(graph)

    # 10 splits
    fold_val_accs = []
    for fold in range(10):
        (train_dataset, val_dataset, max_num_nodes, input_dim,
         assign_input_dim) = cross_val.prepare_val_data(
             graphs, args, fold, max_nodes=args.max_nodes)
        print("Method: base")
        model = models.GcnEncoderGraph(
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            dropout=args.dropout,
            args=args,
        ).cuda()

        _, val_accs = train(
            train_dataset,
            model,
            args,
            val_dataset=val_dataset,
            test_dataset=None,
            writer=writer,
        )
        fold_val_accs.append(np.array(val_accs))

    # Average the per-epoch curves across folds and report the best epoch.
    mean_vals = np.mean(np.vstack(fold_val_accs), axis=0)
    print(mean_vals)
    print(np.max(mean_vals))
    print(np.argmax(mean_vals))
def FFMpeg(args, writer=None, feat="node-label"):
    """Train a graph classifier on a benchmark dataset and evaluate it.

    Builds node features according to ``feat``, splits the data into
    train/val/test, constructs either a soft-assign pooling encoder or a
    plain GCN encoder (per ``args.method``), then trains and evaluates.
    """
    graphs = io_utils.read_graphfile(args.datadir,
                                     args.bmname,
                                     max_nodes=args.max_nodes)
    # Show the largest graph label present in the dataset.
    print(max(G.graph["label"] for G in graphs))

    if feat == "node-feat" and "feat_dim" in graphs[0].graph:
        print("Using node features")
        input_dim = graphs[0].graph["feat_dim"]
    elif feat == "node-label" and "label" in graphs[0].nodes[0]:
        print("Using node labels")
        # Promote each node's label to its feature vector.
        for graph in graphs:
            for node_id in graph.nodes():
                graph.nodes[node_id]["feat"] = np.array(
                    graph.nodes[node_id]["label"])
    else:
        print("Using constant labels")
        const_gen = featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float))
        for graph in graphs:
            const_gen.gen_node_features(graph)

    (train_dataset, val_dataset, test_dataset, max_num_nodes, input_dim,
     assign_input_dim) = prepare_data(graphs, args, max_nodes=args.max_nodes)

    if args.method == "soft-assign":
        print("Method: soft-assign")
        model = models.SoftPoolingGcnEncoder(
            max_num_nodes,
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            args.hidden_dim,
            assign_ratio=args.assign_ratio,
            num_pooling=args.num_pool,
            bn=args.bn,
            dropout=args.dropout,
            linkpred=args.linkpred,
            args=args,
            assign_input_dim=assign_input_dim,
        ).cuda()
    else:
        print("Method: base")
        model = models.GcnEncoderGraph(
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            dropout=args.dropout,
            args=args,
        ).cuda()

    train(train_dataset,
          model,
          args,
          val_dataset=val_dataset,
          test_dataset=test_dataset,
          writer=writer)
    evaluate(test_dataset, model, args, "Validation")
# Ejemplo n.º 4
# 0
def main():
    """Load a trained GNN checkpoint and run the GNN explainer.

    The parsed arguments select one of several modes: explaining a single
    node, every graph in the checkpoint, graphs filtered by class, a
    hard-coded set of graphs, or a sampled range of nodes.
    """
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            # Require an explicit leading "y" before wiping the directory.
            if not input(
                    "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    # NOTE(review): assumes feat is (batch, nodes, feat_dim) and pred is
    # (batch, ..., num_classes) — confirm against the training script.
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine explainer mode: graph classification when explicitly
    # requested or when any graph-level index/class filter is supplied.
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)

    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0],
                                                 dtype=torch.float).cuda()
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Create explainer
    explainer = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    # TODO: API should definitely be cleaner
    # Let's define exactly which modes we support
    # We could even move each mode to a different method (even file)
    if prog_args.explain_node is not None:
        explainer.explain(prog_args.explain_node, unconstrained=False)
    elif graph_mode:
        if prog_args.explain_all:
            # Explain every graph recorded in the checkpoint.
            explain_path = Path('explanations/gnnexplainer/')
            explain_path.mkdir(exist_ok=True, parents=True)
            embeddings_path = Path('embeddings-%s/' % prog_args.bmname)
            embeddings_path.mkdir(exist_ok=True, parents=True)
            for i in range(len(cg_dict['all_idx'])):
                print('Explaining %s' % cg_dict['all_idx'][i])
                explainer.explain(node_idx=0,
                                  graph_idx=i,
                                  graph_mode=True,
                                  unconstrained=False,
                                  original_idx=cg_dict['all_idx'][i])
        elif prog_args.multigraph_class >= 0:
            print(cg_dict["label"])
            # only run for graphs with label specified by multigraph_class
            labels = cg_dict["label"].numpy()
            graph_indices = []
            # NOTE(review): the break is checked after appending, so up to
            # 31 graph indices are collected — confirm the cap is intended.
            for i, l in enumerate(labels):
                if l == prog_args.multigraph_class:
                    graph_indices.append(i)
                if len(graph_indices) > 30:
                    break
            print(
                "Graph indices for label ",
                prog_args.multigraph_class,
                " : ",
                graph_indices,
            )
            explainer.explain_graphs(graph_indices=graph_indices)

        elif prog_args.graph_idx == -1:
            # just run for a customized set of indices
            explainer.explain_graphs(graph_indices=[1, 2, 3, 4])
        else:
            explainer.explain(
                node_idx=0,
                graph_idx=prog_args.graph_idx,
                graph_mode=True,
                unconstrained=False,
                original_idx=cg_dict['all_idx'][prog_args.graph_idx])
            # io_utils.plot_cmap_tb(writer, "tab20", 20, "tab20_cmap")
    else:
        if prog_args.multinode_class >= 0:
            print(cg_dict["label"])
            # only run for nodes with label specified by multinode_class
            labels = cg_dict["label"][0]  # already numpy matrix

            node_indices = []
            # At most 5 node indices are kept (break checked before append).
            for i, l in enumerate(labels):
                if len(node_indices) > 4:
                    break
                if l == prog_args.multinode_class:
                    node_indices.append(i)
            print(
                "Node indices for label ",
                prog_args.multinode_class,
                " : ",
                node_indices,
            )
            explainer.explain_nodes(node_indices, prog_args)

        else:
            # explain a set of nodes
            masked_adj = explainer.explain_nodes_gnn_stats(
                range(400, 700, 5), prog_args)
# Ejemplo n.º 5
# 0
def main():
    """Load a checkpoint, rebuild the GNN, and run GNN-Explainer node stats.

    Also converts the checkpoint data into the format expected by the
    GraphSHAP evaluation pipeline; the GraphSHAP explainer itself is
    commented out in this variant.
    """
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            # Require an explicit leading "y" before wiping the directory.
            if not input(
                    "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load data and a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    # NOTE(review): assumes feat is (batch, nodes, feat_dim) and pred is
    # (batch, ..., num_classes) — confirm against the training script.
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine explainer mode (node classif)
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)

    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0],
                                                 dtype=torch.float).cuda()
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()

    # Load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Convertion data required to get correct model output for GraphSHAP
    adj = torch.tensor(cg_dict["adj"], dtype=torch.float)
    x = torch.tensor(cg_dict["feat"], requires_grad=True, dtype=torch.float)
    # Forward pass to obtain model predictions for the whole graph.
    if prog_args.gpu:
        y_pred, att_adj = model(x.cuda(), adj.cuda())
    else:
        y_pred, att_adj = model(x, adj)

    # Transform their data into our format
    data = transform_data(adj, x, cg_dict["label"][0].tolist())

    # Generate test nodes
    # Use only these specific nodes as they are the ones added manually, part of the defined shapes
    # node_indices = extract_test_nodes(data, num_samples=10, cg_dict['train_idx'])
    k = 4  # number of nodes for the shape introduced (house, cycle)
    K = 0
    # Hand-picked node ids lying on the synthetic motifs of each dataset.
    # NOTE(review): node_indices stays undefined for any other dataset and
    # would raise NameError below — confirm supported dataset names.
    if prog_args.dataset == 'syn1':
        node_indices = list(range(400, 410, 5))
    elif prog_args.dataset == 'syn2':
        node_indices = list(range(400, 405, 5)) + list(range(1100, 1105, 5))
    elif prog_args.dataset == 'syn4':
        node_indices = list(range(511, 523, 6))
        if prog_args.hops == 3:
            k = 5
        else:
            K = 5
    elif prog_args.dataset == 'syn5':
        node_indices = list(range(511, 529, 9))
        if prog_args.hops == 3:
            k = 7
            K = 8
        else:
            k = 5
            K = 8

    # GraphSHAP explainer
    # graphshap = GraphSHAP(data, model, adj, writer, prog_args.dataset, prog_args.gpu)

    # Run GNN Explainer and retrieve produced explanations

    gnne = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    ### GNNE
    # Explain a set of nodes - accuracy on edges this time
    t = time.time()
    gnne_edge_accuracy, gnne_auc, gnne_node_accuracy, important_nodes_gnne =\
        gnne.explain_nodes_gnn_stats(
            node_indices, prog_args
        )
    e = time.time()
    print('Time: ', e - t)
# Ejemplo n.º 6
# 0
def main():
    """Benchmark GraphSHAP against GNN-Explainer on synthetic datasets.

    Loads a trained checkpoint, runs GraphSHAP per test node to measure
    how many of the top-ranked neighbours fall inside the planted motif,
    then runs GNN-Explainer node stats and prints a comparison.
    """
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            # Require an explicit leading "y" before wiping the directory.
            if not input(
                    "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load data and a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    # NOTE(review): assumes feat is (batch, nodes, feat_dim) and pred is
    # (batch, ..., num_classes) — confirm against the training script.
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine explainer mode (node classif)
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)

    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0],
                                                 dtype=torch.float).cuda()
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()

    # Load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Convertion data required to get correct model output for GraphSHAP
    adj = torch.tensor(cg_dict["adj"], dtype=torch.float)
    x = torch.tensor(cg_dict["feat"], requires_grad=True, dtype=torch.float)
    # Forward pass to obtain per-node class predictions (used below).
    if prog_args.gpu:
        y_pred, att_adj = model(x.cuda(), adj.cuda())
    else:
        y_pred, att_adj = model(x, adj)

    # Transform their data into our format
    data = transform_data(adj, x, cg_dict["label"][0].tolist())

    # Generate test nodes
    # Use only these specific nodes as they are the ones added manually, part of the defined shapes
    # node_indices = extract_test_nodes(data, num_samples=10, cg_dict['train_idx'])
    k = 4  # number of nodes for the shape introduced (house, cycle)
    K = 0
    # Hand-picked node ids lying on the synthetic motifs of each dataset.
    # NOTE(review): node_indices stays undefined for any other dataset and
    # would raise NameError below — confirm supported dataset names.
    if prog_args.dataset == 'syn1':
        node_indices = list(range(400, 450, 5))
    elif prog_args.dataset == 'syn2':
        node_indices = list(range(400, 425, 5)) + list(range(1100, 1125, 5))
    elif prog_args.dataset == 'syn4':
        node_indices = list(range(511, 571, 6))
        if prog_args.hops == 3:
            k = 5
        else:
            K = 5
    elif prog_args.dataset == 'syn5':
        node_indices = list(range(511, 601, 9))
        if prog_args.hops == 3:
            k = 8
        else:
            k = 5
            K = 8

    # GraphSHAP explainer
    graphshap = GraphSHAP(data, model, adj, writer, prog_args.dataset,
                          prog_args.gpu)

    # Run GNN Explainer and retrieve produced explanations
    gnne = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    #if prog_args.explain_node is not None:
    # _, gnne_edge_accuracy, gnne_auc, gnne_node_accuracy = \
    #     gnne.explain_nodes_gnn_stats(
    #         node_indices, prog_args
    # )
    # elif graph_mode:
    #     # Graph explanation
    #     gnne_expl = gnne.explain_graphs([1])[0]

    # GraphSHAP - assess accuracy of explanations
    # Loop over test nodes
    accuracy = []
    feat_accuracy = []
    for node_idx in node_indices:
        start = time.time()
        # Single-node explanation; [0] unwraps the one-element result list.
        graphshap_explanations = graphshap.explain(
            [node_idx],
            prog_args.hops,
            prog_args.num_samples,
            prog_args.info,
            prog_args.multiclass,
            prog_args.fullempty,
            prog_args.S,
            prog_args.hv,
            prog_args.feat,
            prog_args.coal,
            prog_args.g,
            prog_args.regu,
        )[0]

        end = time.time()
        print('GS Time:', end - start)

        # Predicted class
        pred_val, predicted_class = y_pred[0, node_idx, :].max(dim=0)

        # Keep only node explanations
        # ,predicted_class]
        # NOTE(review): assumes the first graphshap.F entries are feature
        # importances and the rest are neighbour importances — confirm.
        graphshap_node_explanations = graphshap_explanations[graphshap.F:]

        # Derive ground truth from graph structure
        # NOTE(review): assumes the planted shape occupies the max(k, K)
        # node ids immediately following node_idx — confirm generator.
        ground_truth = list(range(node_idx + 1, node_idx + max(k, K) + 1))

        # Retrieve top k elements indices form graphshap_node_explanations
        if graphshap.neighbours.shape[0] > k:
            i = 0
            val, indices = torch.topk(
                torch.tensor(graphshap_node_explanations.T), k + 1)
            # could weight importance based on val
            for node in graphshap.neighbours[indices]:
                if node.item() in ground_truth:
                    i += 1
            # Sort of accruacy metric
            accuracy.append(i / k)

            print('There are {} from targeted shape among most imp. nodes'.
                  format(i))

        # Look at importance distribution among features
        # Identify most important features and check if it corresponds to truly imp ones
        if prog_args.dataset == 'syn2':
            # ,predicted_class]
            graphshap_feat_explanations = graphshap_explanations[:graphshap.F]
            print('Feature importance graphshap',
                  graphshap_feat_explanations.T)
            # Feature 0 is taken to be the truly important one for syn2.
            if np.argsort(graphshap_feat_explanations)[-1] == 0:
                feat_accuracy.append(1)
            else:
                feat_accuracy.append(0)

    # Metric for graphshap
    # NOTE(review): raises ZeroDivisionError if no node had more than k
    # neighbours (accuracy stays empty) — confirm this cannot happen.
    final_accuracy = sum(accuracy) / len(accuracy)

    ### GNNE
    # Explain a set of nodes - accuracy on edges this time
    _, gnne_edge_accuracy, gnne_auc, gnne_node_accuracy =\
        gnne.explain_nodes_gnn_stats(
            node_indices, prog_args
        )

    ### GRAD benchmark
    #  MetricS to assess quality of predictionsx
    """
    _, grad_edge_accuracy, grad_auc, grad_node_accuracy =\
            gnne.explain_nodes_gnn_stats(
                node_indices, prog_args, model="grad")
    """
    grad_edge_accuracy = 0
    grad_node_accuracy = 0

    ### GAT
    # Nothing for now - implem a GAT on the side and look at weights coef

    ### Results
    print(
        'Accuracy for GraphSHAP is {:.2f} vs {:.2f},{:.2f} for GNNE vs {:.2f},{:.2f} for GRAD'
        .format(final_accuracy, np.mean(gnne_edge_accuracy),
                np.mean(gnne_node_accuracy), np.mean(grad_edge_accuracy),
                np.mean(grad_node_accuracy)))
    if prog_args.dataset == 'syn2':
        print('Most important feature was found in {:.2f}% of the case'.format(
            100 * np.mean(feat_accuracy)))

    print('GNNE_auc is:', gnne_auc)