def fan(args, feat=None):
    """Train and evaluate a graph-level GCN on a pickled train/test split.

    The pickle found at ``args.datadir / args.pkl_fname`` is indexed as
    ``[graphs, labels, test_graphs, test_labels]``. Each label is stored on
    the matching graph under ``graph["label"]``; when ``feat`` is ``None``
    every graph also receives constant all-ones node features of size
    ``args.input_dim``.

    Args:
        args: Parsed program arguments (paths, model dimensions, ``bn``).
        feat: When ``None`` constant node features are generated; any other
            value leaves the graphs' existing features untouched.
    """
    pkl_path = os.path.join(args.datadir, args.pkl_fname)
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # use this with trusted dataset files.
    with open(pkl_path, "rb") as pkl_file:
        data = pickle.load(pkl_file)

    graphs = data[0]
    labels = data[1]
    test_graphs = data[2]
    test_labels = data[3]

    # Attach the graph-level label to each graph object.
    for idx, graph in enumerate(graphs):
        graph.graph["label"] = labels[idx]
    for idx, graph in enumerate(test_graphs):
        graph.graph["label"] = test_labels[idx]

    if feat is None:
        const_gen = featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float))
        for graph_set in (graphs, test_graphs):
            for graph in graph_set:
                const_gen.gen_node_features(graph)

    train_dataset, test_dataset, max_num_nodes = prepare_data(
        graphs, args, test_graphs=test_graphs)

    encoder = models.GcnEncoderGraph(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        args.num_classes,
        args.num_gc_layers,
        bn=args.bn,
    )
    model = encoder.cuda()

    train(train_dataset, model, args, test_dataset=test_dataset)
    # The held-out set is reported under the name "Validation".
    evaluate(test_dataset, model, args, "Validation")
def benchmark_task_val(args, writer=None, feat="node-label"):
    """Run a 10-split validation benchmark with the base GCN encoder.

    Graphs are read with ``io_utils.read_graphfile``. Node features are then
    chosen from, in order of preference: the features already on the graphs
    (``feat == "node-feat"``), the node labels (``feat == "node-label"``),
    or constant all-ones vectors of size ``args.input_dim``.

    For each of the 10 splits a fresh ``GcnEncoderGraph`` is trained; the
    validation accuracies returned by ``train`` are stacked and averaged
    across splits (axis 0). The averaged array, its maximum and the index of
    that maximum are printed.

    Args:
        args: Parsed program arguments.
        writer: Optional summary writer forwarded to ``train``.
        feat: Feature mode, ``"node-feat"`` or ``"node-label"``; anything
            else falls back to constant features.
    """
    graphs = io_utils.read_graphfile(args.datadir,
                                     args.bmname,
                                     max_nodes=args.max_nodes)
    first_graph = graphs[0]

    if feat == "node-feat" and "feat_dim" in first_graph.graph:
        print("Using node features")
        input_dim = first_graph.graph["feat_dim"]
    elif feat == "node-label" and "label" in first_graph.nodes[0]:
        print("Using node labels")
        for graph in graphs:
            for node in graph.nodes():
                graph.nodes[node]["feat"] = np.array(
                    graph.nodes[node]["label"])
    else:
        print("Using constant labels")
        const_gen = featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float))
        for graph in graphs:
            const_gen.gen_node_features(graph)

    # 10 splits
    per_split_accs = []
    for split in range(10):
        train_dataset, val_dataset, _, input_dim, _ = \
            cross_val.prepare_val_data(
                graphs, args, split, max_nodes=args.max_nodes)

        print("Method: base")
        encoder = models.GcnEncoderGraph(
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            dropout=args.dropout,
            args=args,
        )
        model = encoder.cuda()

        _, val_accs = train(
            train_dataset,
            model,
            args,
            val_dataset=val_dataset,
            test_dataset=None,
            writer=writer,
        )
        per_split_accs.append(np.array(val_accs))

    # Average the stacked accuracy arrays across the splits.
    mean_accs = np.mean(np.vstack(per_split_accs), axis=0)
    print(mean_accs)
    print(np.max(mean_accs))
    print(np.argmax(mean_accs))
def FFMpeg(args, writer=None, feat="node-label"):
    """Train and evaluate a graph classifier on a benchmark dataset.

    Graphs are read with ``io_utils.read_graphfile`` and the largest graph
    label is printed. Node features come from the graphs themselves
    (``feat == "node-feat"``), from the node labels
    (``feat == "node-label"``), or are constant all-ones vectors of size
    ``args.input_dim``. The model is a ``SoftPoolingGcnEncoder`` when
    ``args.method == "soft-assign"`` and a ``GcnEncoderGraph`` otherwise.

    Args:
        args: Parsed program arguments.
        writer: Optional summary writer forwarded to ``train``.
        feat: Feature mode, ``"node-feat"`` or ``"node-label"``; anything
            else falls back to constant features.
    """
    graphs = io_utils.read_graphfile(args.datadir,
                                     args.bmname,
                                     max_nodes=args.max_nodes)
    print(max(graph.graph["label"] for graph in graphs))
    first_graph = graphs[0]

    if feat == "node-feat" and "feat_dim" in first_graph.graph:
        print("Using node features")
        input_dim = first_graph.graph["feat_dim"]
    elif feat == "node-label" and "label" in first_graph.nodes[0]:
        print("Using node labels")
        for graph in graphs:
            for node in graph.nodes():
                # Labels are used as-is (0/1); no rescaling to -1/1.
                graph.nodes[node]["feat"] = np.array(
                    graph.nodes[node]["label"])
    else:
        print("Using constant labels")
        const_gen = featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float))
        for graph in graphs:
            const_gen.gen_node_features(graph)

    (train_dataset, val_dataset, test_dataset, max_num_nodes, input_dim,
     assign_input_dim) = prepare_data(graphs, args, max_nodes=args.max_nodes)

    if args.method != "soft-assign":
        print("Method: base")
        encoder = models.GcnEncoderGraph(
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            dropout=args.dropout,
            args=args,
        )
    else:
        print("Method: soft-assign")
        encoder = models.SoftPoolingGcnEncoder(
            max_num_nodes,
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            args.hidden_dim,
            assign_ratio=args.assign_ratio,
            num_pooling=args.num_pool,
            bn=args.bn,
            dropout=args.dropout,
            linkpred=args.linkpred,
            args=args,
            assign_input_dim=assign_input_dim,
        )
    model = encoder.cuda()

    train(
        train_dataset,
        model,
        args,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        writer=writer,
    )
    # The test set is reported under the name "Validation".
    evaluate(test_dataset, model, args, "Validation")
def main():
    """Load a trained GCN checkpoint and run the explainer in the chosen mode.

    Steps:
      1. Parse arguments and select the CUDA device when ``gpu`` is set.
      2. Optionally create a ``SummaryWriter``; an existing log directory is
         removed only after interactive confirmation (``clean_log``).
      3. Load the checkpoint, rebuild the graph- or node-level encoder and
         restore its weights.
      4. Build an ``explain.Explainer`` and dispatch on the CLI flags:
         a single node (``explain_node``), graph mode (``explain_all``,
         ``multigraph_class``, ``graph_idx``) or node mode
         (``multinode_class`` or a fixed range of nodes).

    The process exits with status 1 if the user declines the log-directory
    removal.
    """
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            answer = input(
                "Are you sure you want to remove this directory? (y/n): ")
            if not answer.lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine explainer mode
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)

    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes.
            # Only move it to the GPU when the model is moved too, so that
            # CPU runs neither crash nor end up with mismatched devices.
            loss_weight = torch.tensor([1.0, 5.0], dtype=torch.float)
            if prog_args.gpu:
                loss_weight = loss_weight.cuda()
            prog_args.loss_weight = loss_weight
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Create explainer
    explainer = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    # TODO: API should definitely be cleaner
    # Let's define exactly which modes we support
    # We could even move each mode to a different method (even file)
    if prog_args.explain_node is not None:
        explainer.explain(prog_args.explain_node, unconstrained=False)
    elif graph_mode:
        if prog_args.explain_all:
            # Make sure the output directories exist before explaining.
            explain_path = Path('explanations/gnnexplainer/')
            explain_path.mkdir(exist_ok=True, parents=True)
            embeddings_path = Path('embeddings-%s/' % prog_args.bmname)
            embeddings_path.mkdir(exist_ok=True, parents=True)
            for i, original_idx in enumerate(cg_dict['all_idx']):
                print('Explaining %s' % original_idx)
                explainer.explain(node_idx=0,
                                  graph_idx=i,
                                  graph_mode=True,
                                  unconstrained=False,
                                  original_idx=original_idx)
        elif prog_args.multigraph_class >= 0:
            print(cg_dict["label"])
            # only run for graphs with label specified by multigraph_class
            labels = cg_dict["label"].numpy()
            graph_indices = []
            for i, label in enumerate(labels):
                if label == prog_args.multigraph_class:
                    graph_indices.append(i)
                # Stop once more than 30 matching graphs were collected.
                if len(graph_indices) > 30:
                    break
            print(
                "Graph indices for label ",
                prog_args.multigraph_class,
                " : ",
                graph_indices,
            )
            explainer.explain_graphs(graph_indices=graph_indices)
        elif prog_args.graph_idx == -1:
            # just run for a customized set of indices
            explainer.explain_graphs(graph_indices=[1, 2, 3, 4])
        else:
            explainer.explain(
                node_idx=0,
                graph_idx=prog_args.graph_idx,
                graph_mode=True,
                unconstrained=False,
                original_idx=cg_dict['all_idx'][prog_args.graph_idx])
            # io_utils.plot_cmap_tb(writer, "tab20", 20, "tab20_cmap")
    else:
        if prog_args.multinode_class >= 0:
            print(cg_dict["label"])
            # only run for nodes with label specified by multinode_class
            labels = cg_dict["label"][0]  # already numpy matrix
            node_indices = []
            for i, label in enumerate(labels):
                # Stop once more than 4 matching nodes were collected.
                if len(node_indices) > 4:
                    break
                if label == prog_args.multinode_class:
                    node_indices.append(i)
            print(
                "Node indices for label ",
                prog_args.multinode_class,
                " : ",
                node_indices,
            )
            explainer.explain_nodes(node_indices, prog_args)
        else:
            # explain a set of nodes; the returned value is not used here
            explainer.explain_nodes_gnn_stats(range(400, 700, 5), prog_args)
def main():
    """Time GNNExplainer on a fixed set of test nodes of a synthetic dataset.

    Steps:
      1. Parse arguments and select the CUDA device when ``gpu`` is set.
      2. Optionally create a ``SummaryWriter``; an existing log directory is
         removed only after interactive confirmation (``clean_log``).
      3. Load the checkpoint, rebuild the encoder and restore its weights.
      4. Run one forward pass and convert the data with ``transform_data``.
      5. Pick the test nodes for the dataset, run
         ``explain_nodes_gnn_stats`` on them and print the elapsed time.

    Raises:
        ValueError: if ``prog_args.dataset`` is not one of ``syn1``,
            ``syn2``, ``syn4`` or ``syn5`` (no test nodes are defined).
    """
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            answer = input(
                "Are you sure you want to remove this directory? (y/n): ")
            if not answer.lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load data and a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine explainer mode (node classif)
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)

    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes.
            # Only move it to the GPU when the model is moved too, so that
            # CPU runs neither crash nor end up with mismatched devices.
            loss_weight = torch.tensor([1.0, 5.0], dtype=torch.float)
            if prog_args.gpu:
                loss_weight = loss_weight.cuda()
            prog_args.loss_weight = loss_weight
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # Load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Conversion of the data required to get correct model output for
    # GraphSHAP
    adj = torch.tensor(cg_dict["adj"], dtype=torch.float)
    x = torch.tensor(cg_dict["feat"], requires_grad=True, dtype=torch.float)
    # The outputs are not used below, but the forward pass is kept: with
    # batch norm enabled it may update the model's running statistics.
    if prog_args.gpu:
        model(x.cuda(), adj.cuda())
    else:
        model(x, adj)

    # Transform their data into our format. The result is unused here; the
    # call is kept in case transform_data has side effects.
    transform_data(adj, x, cg_dict["label"][0].tolist())

    # Generate test nodes
    # Use only these specific nodes as they are the ones added manually,
    # part of the defined shapes
    test_nodes_by_dataset = {
        'syn1': list(range(400, 410, 5)),
        'syn2': list(range(400, 405, 5)) + list(range(1100, 1105, 5)),
        'syn4': list(range(511, 523, 6)),
        'syn5': list(range(511, 529, 9)),
    }
    if prog_args.dataset not in test_nodes_by_dataset:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError(
            "No test nodes defined for dataset {!r}; expected one of {}".
            format(prog_args.dataset, sorted(test_nodes_by_dataset)))
    node_indices = test_nodes_by_dataset[prog_args.dataset]

    # Run GNN Explainer and retrieve produced explanations
    gnne = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    ### GNNE
    # Explain a set of nodes - accuracy on edges this time.
    # Only the wall-clock time is reported; the statistics are discarded.
    t = time.time()
    gnne_edge_accuracy, gnne_auc, gnne_node_accuracy, important_nodes_gnne = \
        gnne.explain_nodes_gnn_stats(node_indices, prog_args)
    e = time.time()
    print('Time: ', e - t)
def main():
    """Compare GraphSHAP and GNNExplainer on synthetic-dataset test nodes.

    Steps:
      1. Parse arguments and select the CUDA device when ``gpu`` is set.
      2. Optionally create a ``SummaryWriter``; an existing log directory is
         removed only after interactive confirmation (``clean_log``).
      3. Load the checkpoint, rebuild the encoder and restore its weights.
      4. Explain every test node with GraphSHAP and count how many of the
         top-ranked neighbours fall in the expected index range that
         follows the node.
      5. Run ``explain_nodes_gnn_stats`` on the same nodes and print both
         sets of scores.

    Raises:
        ValueError: if ``prog_args.dataset`` is not one of ``syn1``,
            ``syn2``, ``syn4`` or ``syn5`` (no test nodes are defined).
    """
    # Load a configuration
    prog_args = arg_parse()

    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")

    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            answer = input(
                "Are you sure you want to remove this directory? (y/n): ")
            if not answer.lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None

    # Load data and a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    # Determine explainer mode (node classif)
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)

    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes.
            # Only move it to the GPU when the model is moved too, so that
            # CPU runs neither crash nor end up with mismatched devices.
            loss_weight = torch.tensor([1.0, 5.0], dtype=torch.float)
            if prog_args.gpu:
                loss_weight = loss_weight.cuda()
            prog_args.loss_weight = loss_weight
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # Load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])

    # Conversion of the data required to get correct model output for
    # GraphSHAP
    adj = torch.tensor(cg_dict["adj"], dtype=torch.float)
    x = torch.tensor(cg_dict["feat"], requires_grad=True, dtype=torch.float)
    # The outputs are not used below, but the forward pass is kept: with
    # batch norm enabled it may update the model's running statistics.
    if prog_args.gpu:
        model(x.cuda(), adj.cuda())
    else:
        model(x, adj)

    # Transform their data into our format
    data = transform_data(adj, x, cg_dict["label"][0].tolist())

    # Generate test nodes
    # Use only these specific nodes as they are the ones added manually,
    # part of the defined shapes
    k = 4  # number of nodes for the shape introduced (house, cycle)
    K = 0
    if prog_args.dataset == 'syn1':
        node_indices = list(range(400, 450, 5))
    elif prog_args.dataset == 'syn2':
        node_indices = list(range(400, 425, 5)) + list(range(1100, 1125, 5))
    elif prog_args.dataset == 'syn4':
        node_indices = list(range(511, 571, 6))
        if prog_args.hops == 3:
            k = 5
        else:
            K = 5
    elif prog_args.dataset == 'syn5':
        node_indices = list(range(511, 601, 9))
        if prog_args.hops == 3:
            k = 8
        else:
            k = 5
            K = 8
    else:
        # Previously this surfaced later as an unbound-variable error.
        raise ValueError(
            "No test nodes defined for dataset {!r}; expected one of "
            "['syn1', 'syn2', 'syn4', 'syn5']".format(prog_args.dataset))

    # GraphSHAP explainer
    graphshap = GraphSHAP(data, model, adj, writer, prog_args.dataset,
                          prog_args.gpu)

    # Run GNN Explainer and retrieve produced explanations
    gnne = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )

    # GraphSHAP - assess accuracy of explanations
    # Loop over test nodes
    accuracy = []
    feat_accuracy = []
    for node_idx in node_indices:
        start = time.time()
        graphshap_explanations = graphshap.explain(
            [node_idx],
            prog_args.hops,
            prog_args.num_samples,
            prog_args.info,
            prog_args.multiclass,
            prog_args.fullempty,
            prog_args.S,
            prog_args.hv,
            prog_args.feat,
            prog_args.coal,
            prog_args.g,
            prog_args.regu,
        )[0]
        end = time.time()
        print('GS Time:', end - start)

        # Keep only node explanations: the first graphshap.F entries are
        # treated as feature explanations (see the 'syn2' branch below).
        graphshap_node_explanations = graphshap_explanations[graphshap.F:]

        # Derive ground truth from graph structure: the max(k, K) node
        # indices that directly follow the explained node.
        ground_truth = list(range(node_idx + 1, node_idx + max(k, K) + 1))

        # Retrieve top k + 1 element indices from
        # graphshap_node_explanations
        if graphshap.neighbours.shape[0] > k:
            hits = 0
            _, indices = torch.topk(
                torch.tensor(graphshap_node_explanations.T), k + 1)
            # could weight importance based on the top-k values
            for node in graphshap.neighbours[indices]:
                if node.item() in ground_truth:
                    hits += 1
            # Sort of accuracy metric
            accuracy.append(hits / k)
            print('There are {} from targeted shape among most imp. nodes'.
                  format(hits))

        # Look at importance distribution among features
        # Identify most important features and check if it corresponds to
        # truly imp ones
        if prog_args.dataset == 'syn2':
            graphshap_feat_explanations = graphshap_explanations[:graphshap.F]
            print('Feature importance graphshap',
                  graphshap_feat_explanations.T)
            # Score 1 when feature 0 ranks as the most important one.
            if np.argsort(graphshap_feat_explanations)[-1] == 0:
                feat_accuracy.append(1)
            else:
                feat_accuracy.append(0)

    # Metric for graphshap. No score is recorded when no test node has more
    # than k neighbours; report NaN then instead of dividing by zero.
    if accuracy:
        final_accuracy = sum(accuracy) / len(accuracy)
    else:
        final_accuracy = float('nan')

    ### GNNE
    # Explain a set of nodes - accuracy on edges this time
    # NOTE(review): the order in which the four returned values are unpacked
    # should be confirmed against explain_nodes_gnn_stats.
    _, gnne_edge_accuracy, gnne_auc, gnne_node_accuracy = \
        gnne.explain_nodes_gnn_stats(node_indices, prog_args)

    ### GRAD benchmark
    # Disabled: it would call explain_nodes_gnn_stats(..., model="grad").
    # Zeros are reported as placeholders.
    grad_edge_accuracy = 0
    grad_node_accuracy = 0

    ### GAT
    # Nothing for now - implem a GAT on the side and look at weights coef

    ### Results
    print(
        'Accuracy for GraphSHAP is {:.2f} vs {:.2f},{:.2f} for GNNE vs {:.2f},{:.2f} for GRAD'
        .format(final_accuracy, np.mean(gnne_edge_accuracy),
                np.mean(gnne_node_accuracy), np.mean(grad_edge_accuracy),
                np.mean(grad_node_accuracy)))
    if prog_args.dataset == 'syn2':
        print('Most important feature was found in {:.2f}% of the case'.format(
            100 * np.mean(feat_accuracy)))

    print('GNNE_auc is:', gnne_auc)