def syn_task1(args, writer=None):
    """Train a GCN node classifier on the syn1 (BA + house motifs) graph.

    Improvement: the original ``if args.method == "att"`` / ``else`` branches
    constructed byte-identical models; only the log line differed.  The
    duplicated construction is collapsed into one call.
    """
    # data: BA graph decorated with house motifs, constant all-ones features
    G, labels, name = gengraph.gen_syn1(
        feature_generator=featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float)))
    num_classes = max(labels) + 1

    if args.method == "att":
        print("Method: att")
    else:
        print("Method:", args.method)

    model = models.GcnEncoderNode(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        num_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    if args.gpu:
        model = model.cuda()
    train_node_classifier(G, labels, model, args, writer=writer)
def ppi_essential_task(args, writer=None):
    """Train a node classifier for the PPI essential-gene task.

    Bug fix: the class-weight tensor was unconditionally moved to CUDA,
    which raises on CPU-only machines even though the model itself is only
    moved when ``args.gpu`` is set; the weight now follows the same gate.
    """
    feat_file = "G-MtfPathways_gene-motifs.csv"
    # G = io_utils.read_biosnap('data/ppi_essential', 'PP-Pathways_ppi.csv', 'G-HumanEssential.tsv',
    #                           feat_file=feat_file)
    G = io_utils.read_biosnap(
        "data/ppi_essential",
        "hi-union-ppi.tsv",
        "G-HumanEssential.tsv",
        feat_file=feat_file,
    )
    labels = np.array([G.nodes[u]["label"] for u in G.nodes()])
    num_classes = max(labels) + 1
    # Feature width taken from an arbitrary node's feature vector.
    input_dim = G.nodes[next(iter(G.nodes()))]["feat"].shape[0]

    if args.method == "attn":
        print("Method: attn")
    else:
        print("Method:", args.method)

    # Class weights for the CE loss (sibling code documents this as
    # handling imbalanced label classes).
    args.loss_weight = torch.tensor([1, 5.0], dtype=torch.float)
    if args.gpu:
        args.loss_weight = args.loss_weight.cuda()

    model = models.GcnEncoderNode(
        input_dim,
        args.hidden_dim,
        args.output_dim,
        num_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    if args.gpu:
        model = model.cuda()
    train_node_classifier(G, labels, model, args, writer=writer)
def task_bitcoinalpha(args):
    """Train a GCN node classifier on the Bitcoin-Alpha dataset."""
    data_dir = "../Generate_XA_Data/XAL"
    adj, feats = utils.load_XA(args.dataset, datadir=data_dir)
    node_labels = utils.load_labels(args.dataset, datadir=data_dir)

    n_classes = max(node_labels) + 1
    print("NUMBER OF CLASS IS: " + str(n_classes))
    feat_dim = feats.shape[1]
    print("Input dimension is: ", feat_dim)

    gcn = models.GcnEncoderNode(
        feat_dim,
        args.hidden_dim,
        args.output_dim,
        n_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    train_node_classifier.train(gcn, adj, feats, node_labels, args,
                                normalize_adjacency=False)
def bitcoin(args):
    """Explain node predictions of a trained Bitcoin checkpoint and save them."""
    data_dir = "../Generate_XA_Data/XAL"
    adj, feats = utils.load_XA(args.dataset, datadir=data_dir)
    node_labels = utils.load_labels(args.dataset, datadir=data_dir)

    n_classes = max(node_labels) + 1
    feat_dim = feats.shape[1]
    n_nodes = feats.shape[0]
    ckpt = utils.load_ckpt(args)
    print("input dim: ", feat_dim, "; num classes: ", n_classes)

    gcn = models.GcnEncoderNode(
        input_dim=feat_dim,
        hidden_dim=args.hidden_dim,
        embedding_dim=args.output_dim,
        label_dim=n_classes,
        num_layers=args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    gcn.load_state_dict(ckpt["model_state"])
    predictions = ckpt["save_data"]["pred"]

    explainer = pe.Node_Explainer(gcn, adj, feats, predictions,
                                  args.num_gc_layers)
    # Explain only nodes whose adjacency column-sum exceeds 2.
    targets = [idx for [idx] in np.argwhere(np.sum(adj, axis=0) > 2)]
    results = explainer.explain_range(targets,
                                      num_samples=args.num_perturb_samples,
                                      top_node=args.top_node)
    print(results)
    out_path = utils.gen_filesave(args)
    np.save(out_path, results)
def syn_task5(args, writer=None):
    """Train a GCN node classifier on the syn5 benchmark graph.

    Bug fix: the GPU branch previously executed ``model = model`` (a no-op),
    so the model was never moved to CUDA even with ``args.gpu`` set; it now
    calls ``model.cuda()`` like the sibling tasks.
    """
    # data
    G, labels, name = gnnexplainer_gengraph.gen_syn5(
        feature_generator=featgen.ConstFeatureGen(
            np.ones(args.input_dim, dtype=float)))
    print(labels)
    print("Number of nodes: ", G.number_of_nodes())
    num_classes = max(labels) + 1

    if args.method == "attn":
        print("Method: attn")
    else:
        print("Method: base")

    model = models.GcnEncoderNode(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        num_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    if args.gpu:
        model = model.cuda()  # was `model = model`, which left it on the CPU
    train_node_classifier(G, labels, model, args, writer=writer)
def reveal(args, idx=None, writer=None):
    """Train a node classifier on the disjoint union of the Enron slices.

    Loads ten pickled Enron graph slices, assigns every node a constant
    all-ones feature vector, merges the slices into one graph and trains a
    GcnEncoderNode to predict each node's organisational role.
    """
    # Role name -> class index; "None" (unknown role) is its own class (5).
    labels_dict = {
        "None": 5,
        "Employee": 0,
        "Vice President": 1,
        "Manager": 2,
        "Trader": 3,
        "CEO+Managing Director+Director+President": 4,
    }
    max_enron_id = 183  # NOTE(review): unused in this variant; kept from the multigraph version
    if idx is None:
        G_list = []
        labels_list = []  # NOTE(review): never populated here (labels derived after the merge)
        # Load the 10 pickled time slices and attach constant features.
        for i in range(10):
            net = pickle.load(
                open("data/gnn-explainer-enron/enron_slice_{}.pkl".format(i),
                     "rb"))
            # net.add_nodes_from(range(max_enron_id))
            # labels=[n[1].get('role', 'None') for n in net.nodes(data=True)]
            # labels_num = [labels_dict[l] for l in labels]
            featgen_const = featgen.ConstFeatureGen(
                np.ones(args.input_dim, dtype=float))
            featgen_const.gen_node_features(net)
            G_list.append(net)
            print(net.number_of_nodes())
            # labels_list.append(labels_num)
        # Merge all slices into a single graph with disjoint node ids.
        G = nx.disjoint_union_all(G_list)
        model = models.GcnEncoderNode(
            args.input_dim,
            args.hidden_dim,
            args.output_dim,
            len(labels_dict),
            args.num_gc_layers,
            bn=args.bn,
            args=args,
        )
        # Per-node class ids from the merged graph's 'role' attribute.
        labels = [n[1].get("role", "None") for n in G.nodes(data=True)]
        labels_num = [labels_dict[l] for l in labels]
        # Class distribution for classes 0-4 (class 5, "None", not counted).
        for i in range(5):
            print("Label ", i, ": ", labels_num.count(i))
        print("Total num nodes: ", len(labels_num))
        print(labels_num)
        if args.gpu:
            model = model.cuda()
        train_node_classifier(G, labels_num, model, args, writer=writer)
    else:
        print("Running Enron full task")
def task_syn(args):
    """Explain trained GCN node predictions on the synthetic datasets.

    Loads the dataset and a checkpoint, rebuilds the model, then runs the
    perturbation-based ``Node_Explainer`` either over the dataset's motif
    node range or over the single node in ``args.explain_node``; the
    resulting explanations dict is printed and saved with ``np.save``.

    Improvements: ``args.explain_node == None`` replaced with the ``is None``
    identity test, and the repetitive per-dataset ``elif`` chain replaced
    with a lookup table of (node range, optional prediction threshold).
    """
    A, X = utils.load_XA(args.dataset, datadir="../Generate_XA_Data/XAL")
    L = utils.load_labels(args.dataset, datadir="../Generate_XA_Data/XAL")
    num_classes = max(L) + 1
    input_dim = X.shape[1]
    ckpt = utils.load_ckpt(args)
    print("input dim: ", input_dim, "; num classes: ", num_classes)

    model = models.GcnEncoderNode(
        input_dim=input_dim,
        hidden_dim=args.hidden_dim,
        embedding_dim=args.output_dim,
        label_dim=num_classes,
        num_layers=args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    model.load_state_dict(ckpt["model_state"])
    pred = ckpt["save_data"]["pred"]

    explainer = pe.Node_Explainer(model, A, X, pred, args.num_gc_layers)

    explanations = {}
    if args.explain_node is None:
        # Per-dataset motif node ranges and optional pred_threshold
        # (None keeps explain_range's default threshold).
        ranges = {
            "syn1": (list(range(300, 700)), None),
            "syn2": (list(range(300, 700)) + list(range(1000, 1400)), 0.1),
            "syn3": (list(range(300, 1020)), 0.05),
            "syn4": (list(range(511, 871)), 0.1),
            "syn5": (list(range(511, 1231)), 0.05),
            "syn6": (list(range(300, 700)), None),
        }
        if args.dataset in ranges:
            nodes, threshold = ranges[args.dataset]
            kwargs = {} if threshold is None else {"pred_threshold": threshold}
            explanations = explainer.explain_range(
                nodes,
                num_samples=args.num_perturb_samples,
                top_node=args.top_node,
                **kwargs)
    else:
        explanation = explainer.explain(args.explain_node,
                                        num_samples=args.num_perturb_samples,
                                        top_node=args.top_node)
        print(explanation)
        explanations[args.explain_node] = explanation

    print(explanations)
    savename = utils.gen_filesave(args)
    np.save(savename, explanations)
def enron_task_multigraph(args, idx=None, writer=None):
    """Train a node classifier over the Enron slices kept as separate graphs.

    Unlike the merged-graph variant, each of the 10 time slices is padded to
    a common node set and kept in a list, with one label list per slice, and
    training uses the multigraph trainer.
    """
    # Role name -> class index; "None" (unknown role) is its own class (5).
    labels_dict = {
        "None": 5,
        "Employee": 0,
        "Vice President": 1,
        "Manager": 2,
        "Trader": 3,
        "CEO+Managing Director+Director+President": 4,
    }
    max_enron_id = 183
    if idx is None:
        G_list = []
        labels_list = []
        for i in range(10):
            net = pickle.load(
                open("data/gnn-explainer-enron/enron_slice_{}.pkl".format(i),
                     "rb"))
            # Pad so every slice shares the same node id range.
            net.add_nodes_from(range(max_enron_id))
            labels = [n[1].get("role", "None") for n in net.nodes(data=True)]
            labels_num = [labels_dict[l] for l in labels]
            # Constant all-ones features of width args.input_dim.
            featgen_const = featgen.ConstFeatureGen(
                np.ones(args.input_dim, dtype=float))
            featgen_const.gen_node_features(net)
            G_list.append(net)
            labels_list.append(labels_num)
        # train_dataset, test_dataset, max_num_nodes = prepare_data(G_list, args)
        model = models.GcnEncoderNode(
            args.input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            args=args,
        )
        if args.gpu:
            model = model.cuda()
        print(labels_num)  # NOTE(review): prints labels of the last slice only
        train_node_classifier_multigraph(G_list, labels_list, model, args,
                                         writer=writer)
    else:
        print("Running Enron full task")
def task_syn(args):
    """Train a GCN node classifier on a synthetic dataset loaded from XAL.

    Bug fix: the model was built with ``args.input_dim`` even though the
    input dimension actually present in the data is ``X.shape[1]`` (computed
    into ``input_dim`` but then ignored).  The computed value is now passed
    so the first layer always matches the feature matrix.
    """
    A, X = utils.load_XA(args.dataset, datadir="../Generate_XA_Data/XAL")
    L = utils.load_labels(args.dataset, datadir="../Generate_XA_Data/XAL")
    num_classes = max(L) + 1
    input_dim = X.shape[1]
    model = models.GcnEncoderNode(
        input_dim,  # was args.input_dim, which can disagree with X
        args.hidden_dim,
        args.output_dim,
        num_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    train_node_classifier.train(model, A, X, L, args,
                                normalize_adjacency=False)
def syn_task2(args, writer=None):
    """Train a GCN node classifier on the synthetic syn2 graph."""
    # data: syn2 graph comes with its own node features.
    graph, node_labels, _name = gengraph.gen_syn2()
    feat_dim = len(graph.nodes[0]["feat"])
    n_classes = max(node_labels) + 1

    if args.method == "attn":
        print("Method: attn")
    else:
        print("Method:", args.method)

    net = models.GcnEncoderNode(
        feat_dim,
        args.hidden_dim,
        args.output_dim,
        n_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    if args.gpu:
        net = net.cuda()
    train_node_classifier(graph, node_labels, net, args, writer=writer)
def main():
    """Load a trained checkpoint and explain a single node prediction,
    printing verbose diagnostics of the loaded model and arguments."""
    # Parsing defaults for all program parameters unless provided by user
    prog_args = parse_explainer_args.arg_parse()
    # More params on top of train.py
    prog_args.writer = None
    # Check is for None and default is True
    path = os.path.join(prog_args.logdir, io_utils.gen_prefix(prog_args))
    print("Tensorboard writer path :\n", path)
    print("No. of epochs :", prog_args.num_epochs)
    # writer = SummaryWriter(path)
    if prog_args.gpu:
        # os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        # env = os.environ.get('CUDA_VISIBLE_DEVICES')
        # print("Environment is set :", env)
        print('\nCUDA_VISIBLE_DEVICES')
        print('------------------------------------------')
        print("CUDA", prog_args.cuda)
    else:
        print('\n------------------------------------------')
        print("Using CPU")
    # Loading previously saved computational graph data (model checkpoint)
    model_dict = io_utils.load_ckpt(prog_args)
    model_optimizer = model_dict['optimizer']
    print("Model optimizer :", model_optimizer)
    print("Model optimizer state dictionary :\n",
          model_optimizer.state_dict()['param_groups'])
    # model.load_state_dict(checkpoint['model_state_dict'])
    # optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    # epoch = checkpoint['epoch']
    # loss = checkpoint['loss']
    print(
        '------------------------------------------------------------------------------------'
    )
    print("Keys in loaded model dictionary :", list(model_dict))
    print("Keys in loaded model optimizer dictionary:",
          list(model_optimizer.state_dict()))
    print("All loaded labels :\n", model_dict['cg']['label'])
    print()
    print('mask_act:{}, mask_bias:{}, explainer_suffix:{}'.format(
        prog_args.mask_act, prog_args.mask_bias, prog_args.explainer_suffix))
    # Determine explainer mode: graph classification if any graph-level
    # option is set, otherwise node classification.
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)
    # Trained data stored in computational graph dictionary
    cg_dict = model_dict['cg']
    input_dim = cg_dict['feat'].shape[2]
    num_classes = cg_dict['pred'].shape[2]
    print("\nLoaded model from subdirectory \"{}\" ...".format(
        prog_args.ckptdir))
    print("input dim :", input_dim, "; num classes :", num_classes)
    print("Labels of retrieved data :\n", cg_dict['label'])
    print(
        '------------------------------------------------------------------------------------'
    )
    print("Multigraph class :", prog_args.multigraph_class)
    print("Graph Index :", prog_args.graph_idx)
    print("Explainer graph mode :", graph_mode)
    print("Input dimension :", input_dim)
    print("Hidden dimension :", prog_args.hidden_dim)
    print("Output dimension :", prog_args.output_dim)
    print("Number of classes :", num_classes)
    print("Number of GCN layers :", prog_args.num_gc_layers)
    print("Batch Normalization :", prog_args.bn)
    # Rebuild the model with dimensions recovered from the checkpoint.
    model = models.GcnEncoderNode(input_dim=input_dim,
                                  hidden_dim=prog_args.hidden_dim,
                                  embedding_dim=prog_args.output_dim,
                                  label_dim=num_classes,
                                  num_layers=prog_args.num_gc_layers,
                                  bn=prog_args.bn,
                                  args=prog_args)
    print("\nGcnEncoderNode model :\n", model)
    # load state_dict (obtained by model.state_dict() when saving checkpoint)
    # Loading Model for Inference
    print("Model checked result :",
          model.load_state_dict(model_dict['model_state']))
    print(
        '------------------------------------------------------------------------------------\n'
    )
    # Explaining single node prediction
    print('Explaining single default node :', prog_args.explain_node)
    # The number of epochs used for explanation training is much smaller than the 1K epochs used for node label
    # trainings and predictions in the GCN. The former is trained only based on the k-hop labels which depends
    # on the number GCN layers (at a smaller scale, so the number of epochs can be lower without reducing the
    # accuracy). Whereas, the latter will affect the node predictions and thus, it will affect the accuracy of
    # the node explanations.
    print('GNN Explainer is trained based on {} epochs.'.format(
        prog_args.num_epochs))
    print("Writer :", prog_args.writer)
    # Create explainer
    explainer = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=prog_args.writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )
    if prog_args.explain_node is not None:
        # Returned masked adjacency, edges and features of the subgraph
        masked_adj, masked_edges, masked_features = explainer.explain(
            prog_args.explain_node, unconstrained=False)
        print("Returned masked adjacency matrix :\n", masked_adj)
        print("Returned masked edges matrix :\n", masked_edges)
        print("Returned masked features matrix :\n", masked_features)
    else:
        print("Please provide node for explanation.")
def main():
    """Entry point: load a checkpoint, rebuild the trained model and run the
    GNN explainer in the mode selected by the CLI flags (single node, all
    graphs, per-class graphs/nodes, or a default node-range sweep)."""
    # Load a configuration
    prog_args = arg_parse()
    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")
    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            # Interactive confirmation before deleting the old log directory.
            if not input(
                    "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None
    # Load a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)
    # Determine explainer mode: graph classification if any graph-level
    # option is set, otherwise node classification.
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)
    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0],
                                                 dtype=torch.float).cuda()
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])
    # Create explainer
    explainer = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )
    # TODO: API should definitely be cleaner
    # Let's define exactly which modes we support
    # We could even move each mode to a different method (even file)
    if prog_args.explain_node is not None:
        # Single-node explanation.
        explainer.explain(prog_args.explain_node, unconstrained=False)
    elif graph_mode:
        if prog_args.explain_all:
            # Explain every graph in the checkpoint; outputs land in the
            # directories created below.
            explain_path = Path('explanations/gnnexplainer/')
            explain_path.mkdir(exist_ok=True, parents=True)
            embeddings_path = Path('embeddings-%s/' % prog_args.bmname)
            embeddings_path.mkdir(exist_ok=True, parents=True)
            for i in range(len(cg_dict['all_idx'])):
                print('Explaining %s' % cg_dict['all_idx'][i])
                explainer.explain(node_idx=0,
                                  graph_idx=i,
                                  graph_mode=True,
                                  unconstrained=False,
                                  original_idx=cg_dict['all_idx'][i])
        elif prog_args.multigraph_class >= 0:
            print(cg_dict["label"])
            # only run for graphs with label specified by multigraph_class
            labels = cg_dict["label"].numpy()
            graph_indices = []
            for i, l in enumerate(labels):
                if l == prog_args.multigraph_class:
                    graph_indices.append(i)
                if len(graph_indices) > 30:
                    break
            print(
                "Graph indices for label ",
                prog_args.multigraph_class,
                " : ",
                graph_indices,
            )
            explainer.explain_graphs(graph_indices=graph_indices)
        elif prog_args.graph_idx == -1:
            # just run for a customized set of indices
            explainer.explain_graphs(graph_indices=[1, 2, 3, 4])
        else:
            explainer.explain(
                node_idx=0,
                graph_idx=prog_args.graph_idx,
                graph_mode=True,
                unconstrained=False,
                original_idx=cg_dict['all_idx'][prog_args.graph_idx])
            # io_utils.plot_cmap_tb(writer, "tab20", 20, "tab20_cmap")
    else:
        if prog_args.multinode_class >= 0:
            print(cg_dict["label"])
            # only run for nodes with label specified by multinode_class
            labels = cg_dict["label"][0]  # already numpy matrix
            node_indices = []
            for i, l in enumerate(labels):
                if len(node_indices) > 4:
                    break
                if l == prog_args.multinode_class:
                    node_indices.append(i)
            print(
                "Node indices for label ",
                prog_args.multinode_class,
                " : ",
                node_indices,
            )
            explainer.explain_nodes(node_indices, prog_args)
        else:
            # explain a set of nodes
            masked_adj = explainer.explain_nodes_gnn_stats(
                range(400, 700, 5), prog_args)
def syn_task1(args, writer=None):
    """Train a GCN node classifier on the syn1 (BA + house motifs) graph
    and return the trained model for further inspection in a notebook.

    Improvement: the large commented-out blocks (feat_dict experiments and
    a duplicated model construction whose two branches were identical) were
    removed; runtime behavior is unchanged.
    """
    print("\nStart with these parsed program arguments :\n", args)

    # Every node gets the same all-ones feature vector of width input_dim.
    constant_feature = featureGen.ConstFeatureGen(
        np.ones(args.input_dim, dtype=float))
    print("Constant feature generator : ", constant_feature.val)

    # Create the BA graph with the "house" motifs
    G, labels, name = gengraph.gen_syn1(feature_generator=constant_feature)

    # No .of classes from [0-3] for BA graph with house motifs
    num_classes = max(labels) + 1
    # Update number of classes in argument for training (Out of bounds error)
    args.num_classes = num_classes

    # GcnEncoderNode model
    print("------------ GCNEncoderNode Model ------------")
    print("Input dimensions :", args.input_dim)
    print("Hidden dimensions :", args.hidden_dim)
    print("Output dimensions :", args.output_dim)
    print("Number of classes in args :", args.num_classes)
    print("Number of GCN layers :", args.num_gc_layers)
    print("Method : ", args.method)

    model = models.GcnEncoderNode(args.input_dim,
                                  args.hidden_dim,
                                  args.output_dim,
                                  args.num_classes,
                                  args.num_gc_layers,
                                  bn=args.bn,
                                  args=args)
    print("GcnEncoderNode model :\n", model)

    if args.gpu:
        model = model.cuda()
    train_node_classifier(G, labels, model, args, writer=writer)
    # Return model for manipulations in ipynb
    return model
def main():
    """Load a checkpoint, rebuild the model, and time GNN-Explainer's
    node-explanation statistics over a small set of motif test nodes."""
    # Load a configuration
    prog_args = arg_parse()
    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")
    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            # Interactive confirmation before deleting the old log directory.
            if not input(
                    "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None
    # Load data and a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)
    # Determine explainer mode (node classif)
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)
    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0],
                                                 dtype=torch.float).cuda()
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # Load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])
    # Convertion data required to get correct model output for GraphSHAP
    adj = torch.tensor(cg_dict["adj"], dtype=torch.float)
    x = torch.tensor(cg_dict["feat"], requires_grad=True, dtype=torch.float)
    if prog_args.gpu:
        y_pred, att_adj = model(x.cuda(), adj.cuda())
    else:
        y_pred, att_adj = model(x, adj)
    # Transform their data into our format
    data = transform_data(adj, x, cg_dict["label"][0].tolist())
    # Generate test nodes
    # Use only these specific nodes as they are the ones added manually, part of the defined shapes
    # node_indices = extract_test_nodes(data, num_samples=10, cg_dict['train_idx'])
    k = 4  # number of nodes for the shape introduced (house, cycle)
    K = 0
    if prog_args.dataset == 'syn1':
        node_indices = list(range(400, 410, 5))
    elif prog_args.dataset == 'syn2':
        node_indices = list(range(400, 405, 5)) + list(range(1100, 1105, 5))
    elif prog_args.dataset == 'syn4':
        node_indices = list(range(511, 523, 6))
        if prog_args.hops == 3:
            k = 5
        else:
            K = 5
    elif prog_args.dataset == 'syn5':
        node_indices = list(range(511, 529, 9))
        if prog_args.hops == 3:
            k = 7
            K = 8
        else:
            k = 5
            K = 8
    # GraphSHAP explainer
    # graphshap = GraphSHAP(data, model, adj, writer, prog_args.dataset, prog_args.gpu)
    # Run GNN Explainer and retrieve produced explanations
    gnne = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )
    ### GNNE
    # Explain a set of nodes - accuracy on edges this time
    t = time.time()
    gnne_edge_accuracy, gnne_auc, gnne_node_accuracy, important_nodes_gnne =\
        gnne.explain_nodes_gnn_stats(
            node_indices, prog_args
        )
    e = time.time()
    print('Time: ', e - t)
def medic(args):
    """Train a simple Graph ConvNet (https://arxiv.org/abs/1609.02907) on
    the pickled medical dataset and checkpoint the best-test-accuracy model.

    Returns:
        (model, optimizer, args, cg_data) for the best epoch, where cg_data
        holds the adjacency, features, labels, predictions and train split.

    Fixes over the original:
      * ``np.long`` / ``np.float`` (aliases removed in NumPy >= 1.24)
        replaced by ``np.int64`` / ``np.float64`` — identical dtypes.
      * ``nn.utils.clip_grad_norm`` (removed in modern PyTorch) replaced
        by the in-place ``nn.utils.clip_grad_norm_``.
    """
    # Loading DataSet from /Pickles
    global result_test, result_train
    with open('Pickles/feats.pickle', 'rb') as handle:
        feats = np.expand_dims(pickle.load(handle), axis=0)
    with open('Pickles/age_adj.pickle', 'rb') as handle:
        age_adj = pickle.load(handle)
    with open('Pickles/preds.pickle', 'rb') as handle:
        labels = np.expand_dims(pickle.load(handle), axis=0)

    # initializing model variables
    num_nodes = labels.shape[1]
    num_train = int(num_nodes * 0.9)
    num_classes = max(labels[0]) + 1

    # Random 90/10 train/test split over node indices.
    idx = list(range(num_nodes))
    np.random.shuffle(idx)
    train_idx = idx[:num_train]
    test_idx = idx[num_train:]

    labels = labels.astype(np.int64)
    age_adj = age_adj.astype(np.float64)
    feats = feats.astype(np.float64)

    # GCN renormalization: A <- D^-1/2 (A + I) D^-1/2.  The elementwise
    # power of the inverse diagonal degree matrix equals D^-1/2 because the
    # matrix is diagonal.
    age_adj = age_adj + np.eye(age_adj.shape[0])
    d_hat_inv = np.linalg.inv(np.diag(age_adj.sum(axis=1)))**(1 / 2)
    temp = np.matmul(d_hat_inv, age_adj)
    age_adj = np.matmul(temp, d_hat_inv)
    age_adj = np.expand_dims(age_adj, axis=0)

    labels_train = torch.tensor(labels[:, train_idx], dtype=torch.long)
    adj = torch.tensor(age_adj, dtype=torch.float)
    x = torch.tensor(feats, dtype=torch.float, requires_grad=True)

    # Creating a model which is used in https://github.com/RexYing/gnn-model-explainer
    model = models.GcnEncoderNode(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        num_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    if args.gpu:
        model = model.cuda()
    scheduler, optimizer = build_optimizer(args, model.parameters(),
                                           weight_decay=args.weight_decay)
    model.train()
    to_save = (0, None)  # (best test accuracy, (model, optimizer, args))

    # training the model
    for epoch in range(args.num_epochs):
        begin_time = time.time()
        model.zero_grad()
        if args.gpu:
            ypred, adj_att = model(x.cuda(), adj.cuda())
        else:
            ypred, adj_att = model(x, adj)
        ypred_train = ypred[:, train_idx, :]
        if args.gpu:
            loss = model.loss(ypred_train, labels_train.cuda())
        else:
            loss = model.loss(ypred_train, labels_train)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.clip)
        optimizer.step()
        # for param_group in optimizer.param_groups:
        #     print(param_group["lr"])
        elapsed = time.time() - begin_time
        result_train, result_test = evaluate_node(ypred.cpu(), labels,
                                                  train_idx, test_idx)
        if result_test["acc"] > to_save[0]:
            to_save = (result_test["acc"], (model, optimizer, args))
        if epoch % 10 == 0:
            print(
                "epoch: ", epoch,
                "; loss: ", loss.item(),
                "; train_acc: ", result_train["acc"],
                "; test_acc: ", result_test["acc"],
                "; train_prec: ", result_train["prec"],
                "; test_prec: ", result_test["prec"],
                "; epoch time: ", "{0:0.2f}".format(elapsed),
            )
        if epoch % 100 == 0:
            print(result_train["conf_mat"])
            print(result_test["conf_mat"])
        if scheduler is not None:
            scheduler.step()

    print(result_train["conf_mat"])
    print(result_test["conf_mat"])

    # Re-run the best saved model in eval mode to record its predictions.
    to_save[1][0].eval()
    if args.gpu:
        ypred, _ = to_save[1][0](x.cuda(), adj.cuda())
    else:
        ypred, _ = to_save[1][0](x, adj)
    cg_data = {
        "adj": age_adj,
        "feat": feats,
        "label": labels,
        "pred": ypred.cpu().detach().numpy(),
        "train_idx": train_idx,
    }
    # saving the model so that it can be restored for GNN explaining
    print(
        save_checkpoint(to_save[1][0],
                        to_save[1][1],
                        args,
                        num_epochs=-1,
                        cg_dict=cg_data))
    return to_save[1][0], to_save[1][1], args, cg_data
def main():
    """Benchmark GraphSHAP against GNN-Explainer (and a stubbed GRAD
    baseline) on the synthetic motif datasets, reporting how often each
    method's top-importance nodes fall inside the ground-truth motif."""
    # Load a configuration
    prog_args = arg_parse()
    if prog_args.gpu:
        os.environ["CUDA_VISIBLE_DEVICES"] = prog_args.cuda
        print("CUDA", prog_args.cuda)
    else:
        print("Using CPU")
    # Configure the logging directory
    if prog_args.writer:
        path = os.path.join(prog_args.logdir,
                            io_utils.gen_explainer_prefix(prog_args))
        if os.path.isdir(path) and prog_args.clean_log:
            print('Removing existing log dir: ', path)
            # Interactive confirmation before deleting the old log directory.
            if not input(
                    "Are you sure you want to remove this directory? (y/n): "
            ).lower().strip()[:1] == "y":
                sys.exit(1)
            shutil.rmtree(path)
        writer = SummaryWriter(path)
    else:
        writer = None
    # Load data and a model checkpoint
    ckpt = io_utils.load_ckpt(prog_args)
    cg_dict = ckpt["cg"]  # get computation graph
    input_dim = cg_dict["feat"].shape[2]
    num_classes = cg_dict["pred"].shape[2]
    print("Loaded model from {}".format(prog_args.ckptdir))
    print("input dim: ", input_dim, "; num classes: ", num_classes)
    # Determine explainer mode (node classif)
    graph_mode = (prog_args.graph_mode or prog_args.multigraph_class >= 0
                  or prog_args.graph_idx >= 0)
    # build model
    print("Method: ", prog_args.method)
    if graph_mode:
        # Explain Graph prediction
        model = models.GcnEncoderGraph(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    else:
        if prog_args.dataset == "ppi_essential":
            # class weight in CE loss for handling imbalanced label classes
            prog_args.loss_weight = torch.tensor([1.0, 5.0],
                                                 dtype=torch.float).cuda()
        # Explain Node prediction
        model = models.GcnEncoderNode(
            input_dim=input_dim,
            hidden_dim=prog_args.hidden_dim,
            embedding_dim=prog_args.output_dim,
            label_dim=num_classes,
            num_layers=prog_args.num_gc_layers,
            bn=prog_args.bn,
            args=prog_args,
        )
    if prog_args.gpu:
        model = model.cuda()
    # Load state_dict (obtained by model.state_dict() when saving checkpoint)
    model.load_state_dict(ckpt["model_state"])
    # Convertion data required to get correct model output for GraphSHAP
    adj = torch.tensor(cg_dict["adj"], dtype=torch.float)
    x = torch.tensor(cg_dict["feat"], requires_grad=True, dtype=torch.float)
    if prog_args.gpu:
        y_pred, att_adj = model(x.cuda(), adj.cuda())
    else:
        y_pred, att_adj = model(x, adj)
    # Transform their data into our format
    data = transform_data(adj, x, cg_dict["label"][0].tolist())
    # Generate test nodes
    # Use only these specific nodes as they are the ones added manually, part of the defined shapes
    # node_indices = extract_test_nodes(data, num_samples=10, cg_dict['train_idx'])
    k = 4  # number of nodes for the shape introduced (house, cycle)
    K = 0
    if prog_args.dataset == 'syn1':
        node_indices = list(range(400, 450, 5))
    elif prog_args.dataset == 'syn2':
        node_indices = list(range(400, 425, 5)) + list(range(1100, 1125, 5))
    elif prog_args.dataset == 'syn4':
        node_indices = list(range(511, 571, 6))
        if prog_args.hops == 3:
            k = 5
        else:
            K = 5
    elif prog_args.dataset == 'syn5':
        node_indices = list(range(511, 601, 9))
        if prog_args.hops == 3:
            k = 8
        else:
            k = 5
            K = 8
    # GraphSHAP explainer
    graphshap = GraphSHAP(data, model, adj, writer, prog_args.dataset,
                          prog_args.gpu)
    # Run GNN Explainer and retrieve produced explanations
    gnne = explain.Explainer(
        model=model,
        adj=cg_dict["adj"],
        feat=cg_dict["feat"],
        label=cg_dict["label"],
        pred=cg_dict["pred"],
        train_idx=cg_dict["train_idx"],
        args=prog_args,
        writer=writer,
        print_training=True,
        graph_mode=graph_mode,
        graph_idx=prog_args.graph_idx,
    )
    #if prog_args.explain_node is not None:
    #     _, gnne_edge_accuracy, gnne_auc, gnne_node_accuracy = \
    #         gnne.explain_nodes_gnn_stats(
    #             node_indices, prog_args
    #         )
    # elif graph_mode:
    #     # Graph explanation
    #     gnne_expl = gnne.explain_graphs([1])[0]
    # GraphSHAP - assess accuracy of explanations
    # Loop over test nodes
    accuracy = []
    feat_accuracy = []
    for node_idx in node_indices:
        start = time.time()
        graphshap_explanations = graphshap.explain(
            [node_idx],
            prog_args.hops,
            prog_args.num_samples,
            prog_args.info,
            prog_args.multiclass,
            prog_args.fullempty,
            prog_args.S,
            prog_args.hv,
            prog_args.feat,
            prog_args.coal,
            prog_args.g,
            prog_args.regu,
        )[0]
        end = time.time()
        print('GS Time:', end - start)

        # Predicted class
        pred_val, predicted_class = y_pred[0, node_idx, :].max(dim=0)

        # Keep only node explanations
        # ,predicted_class]
        graphshap_node_explanations = graphshap_explanations[graphshap.F:]

        # Derive ground truth from graph structure: the motif nodes are the
        # ones added right after node_idx when the shapes were generated.
        ground_truth = list(range(node_idx + 1, node_idx + max(k, K) + 1))

        # Retrieve top k element indices from graphshap_node_explanations
        if graphshap.neighbours.shape[0] > k:
            i = 0
            val, indices = torch.topk(
                torch.tensor(graphshap_node_explanations.T), k + 1)
            # could weight importance based on val
            for node in graphshap.neighbours[indices]:
                if node.item() in ground_truth:
                    i += 1
            # Sort of accuracy metric: fraction of top nodes inside the motif
            accuracy.append(i / k)
            print('There are {} from targeted shape among most imp. nodes'.
                  format(i))

        # Look at importance distribution among features
        # Identify most important features and check if it corresponds to truly imp ones
        if prog_args.dataset == 'syn2':
            # ,predicted_class]
            graphshap_feat_explanations = graphshap_explanations[:graphshap.F]
            print('Feature importance graphshap',
                  graphshap_feat_explanations.T)
            # Feature 0 is treated as the truly informative feature here.
            if np.argsort(graphshap_feat_explanations)[-1] == 0:
                feat_accuracy.append(1)
            else:
                feat_accuracy.append(0)

    # Metric for graphshap
    final_accuracy = sum(accuracy) / len(accuracy)

    ### GNNE
    # Explain a set of nodes - accuracy on edges this time
    _, gnne_edge_accuracy, gnne_auc, gnne_node_accuracy =\
        gnne.explain_nodes_gnn_stats(
            node_indices, prog_args
        )

    ### GRAD benchmark
    # MetricS to assess quality of predictionsx
    """
    _, grad_edge_accuracy, grad_auc, grad_node_accuracy =\
        gnne.explain_nodes_gnn_stats(
            node_indices, prog_args, model="grad")
    """
    grad_edge_accuracy = 0
    grad_node_accuracy = 0

    ### GAT
    # Nothing for now - implem a GAT on the side and look at weights coef

    ### Results
    print(
        'Accuracy for GraphSHAP is {:.2f} vs {:.2f},{:.2f} for GNNE vs {:.2f},{:.2f} for GRAD'
        .format(final_accuracy, np.mean(gnne_edge_accuracy),
                np.mean(gnne_node_accuracy), np.mean(grad_edge_accuracy),
                np.mean(grad_node_accuracy)))
    if prog_args.dataset == 'syn2':
        print('Most important feature was found in {:.2f}% of the case'.format(
            100 * np.mean(feat_accuracy)))
    print('GNNE_auc is:', gnne_auc)