def gen_syn3(nb_shapes=80, width_basis=300, feature_generator=None, m=5):
    """ Synthetic Graph #3:

    Start with a Barabasi-Albert graph and attach grid-shaped subgraphs.

    Args:
        nb_shapes         :  The number of shapes (here 'grid') that should be added to the base graph.
        width_basis       :  The width of the basis graph (here the 'Barabasi-Albert' random graph).
        feature_generator :  A `FeatureGenerator` for node features. If `None`, add constant features to nodes.
        m                 :  The number of edges to attach from a new node to existing nodes (for the BA graph).

    Returns:
        G                 :  A networkx graph
        role_id           :  Role ID for each node in the synthetic graph.
        name              :  A graph identifier
    """
    basis_type = "ba"
    list_shapes = [["grid", 3]] * nb_shapes

    plt.figure(figsize=(8, 6), dpi=300)

    # Build the BA basis graph, attach the grid shapes, then lightly perturb the edges.
    G, role_id, _ = synthetic_structsim.build_graph(
        width_basis, basis_type, list_shapes, start=0, m=m
    )
    G = perturb([G], 0.01)[0]

    if feature_generator is None:
        feature_generator = featgen.ConstFeatureGen(1)
    feature_generator.gen_node_features(G)

    name = basis_type + "_" + str(width_basis) + "_" + str(nb_shapes)
    return G, role_id, name
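# A minimal usage sketch for gen_syn3 (a hypothetical helper, not part of the
# original pipeline): build the BA-plus-grids graph with constant node features
# and derive the number of role classes the way the syn_task functions in this
# file do; `input_dim=10` is an arbitrary choice.
def _demo_gen_syn3(input_dim=10):
    G, role_id, name = gen_syn3(
        nb_shapes=80,
        width_basis=300,
        feature_generator=featgen.ConstFeatureGen(np.ones(input_dim, dtype=float)),
    )
    num_classes = max(role_id) + 1
    print(name, G.number_of_nodes(), num_classes)
    return G, role_id, num_classes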
def pkl_task(args, feat=None):
    # The pickle file stores (train graphs, train labels, test graphs, test labels).
    with open(os.path.join(args.datadir, args.pkl_fname), "rb") as pkl_file:
        data = pickle.load(pkl_file)
    graphs = data[0]
    labels = data[1]
    test_graphs = data[2]
    test_labels = data[3]

    for i in range(len(graphs)):
        graphs[i].graph["label"] = labels[i]
    for i in range(len(test_graphs)):
        test_graphs[i].graph["label"] = test_labels[i]

    # Fall back to constant node features when none are provided.
    if feat is None:
        featgen_const = featgen.ConstFeatureGen(np.ones(args.input_dim, dtype=float))
        for G in graphs:
            featgen_const.gen_node_features(G)
        for G in test_graphs:
            featgen_const.gen_node_features(G)

    train_dataset, test_dataset, max_num_nodes = prepare_data(
        graphs, args, test_graphs=test_graphs
    )
    model = models.GcnEncoderGraph(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        args.num_classes,
        args.num_gc_layers,
        bn=args.bn,
    )
    train(train_dataset, model, args, test_dataset=test_dataset)
    evaluate(test_dataset, model, args, "Validation")
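# pkl_task expects args.pkl_fname to unpickle into a 4-tuple:
# (train_graphs, train_labels, test_graphs, test_labels). A minimal sketch of
# writing a compatible file (a hypothetical helper; graphs and labels are
# whatever networkx graphs and label lists the caller has):
def _demo_write_pkl(path, graphs, labels, test_graphs, test_labels):
    with open(path, "wb") as f:
        pickle.dump((graphs, labels, test_graphs, test_labels), f)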
def syn_task5(args, writer=None):
    # data
    G, labels, name = gnnexplainer_gengraph.gen_syn5(
        feature_generator=featgen.ConstFeatureGen(np.ones(args.input_dim, dtype=float))
    )
    print(labels)
    print("Number of nodes: ", G.number_of_nodes())
    num_classes = max(labels) + 1

    if args.method == "attn":
        print("Method: attn")
    else:
        print("Method: base")

    model = models.GcnEncoderNode(
        args.input_dim,
        args.hidden_dim,
        args.output_dim,
        num_classes,
        args.num_gc_layers,
        bn=args.bn,
        args=args,
    )
    if args.gpu:
        model = model.cuda()  # move the model to GPU

    train_node_classifier(G, labels, model, args, writer=writer)
def benchmark_task_val(args, writer=None, feat="node-label"):
    all_vals = []
    graphs = io_utils.read_graphfile(args.datadir, args.bmname, max_nodes=args.max_nodes)

    if feat == "node-feat" and "feat_dim" in graphs[0].graph:
        print("Using node features")
        input_dim = graphs[0].graph["feat_dim"]
    elif feat == "node-label" and "label" in graphs[0].nodes[0]:
        print("Using node labels")
        for G in graphs:
            for u in G.nodes():
                G.nodes[u]["feat"] = np.array(G.nodes[u]["label"])
    else:
        print("Using constant labels")
        featgen_const = featgen.ConstFeatureGen(np.ones(args.input_dim, dtype=float))
        for G in graphs:
            featgen_const.gen_node_features(G)

    # 10-fold cross-validation: train on each split and average the validation curves.
    for i in range(10):
        train_dataset, val_dataset, max_num_nodes, input_dim, assign_input_dim = cross_val.prepare_val_data(
            graphs, args, i, max_nodes=args.max_nodes
        )
        print("Method: base")
        model = models.GcnEncoderGraph(
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            dropout=args.dropout,
            args=args,
        )

        _, val_accs = train(
            train_dataset,
            model,
            args,
            val_dataset=val_dataset,
            test_dataset=None,
            writer=writer,
        )
        all_vals.append(np.array(val_accs))

    all_vals = np.vstack(all_vals)
    all_vals = np.mean(all_vals, axis=0)
    print(all_vals)             # mean validation accuracy per epoch
    print(np.max(all_vals))     # best mean validation accuracy
    print(np.argmax(all_vals))  # epoch at which the best accuracy occurs
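# The aggregation at the end of benchmark_task_val averages per-epoch validation
# accuracies across the 10 splits, then reports the best epoch. A toy check with
# made-up accuracy curves (two splits, three epochs):
def _demo_val_aggregation():
    curves = [np.array([0.5, 0.7, 0.6]), np.array([0.6, 0.8, 0.7])]
    mean_curve = np.mean(np.vstack(curves), axis=0)   # [0.55, 0.75, 0.65]
    print(np.max(mean_curve), np.argmax(mean_curve))  # 0.75 at epoch 1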
def enron_task(args, idx=None, writer=None):
    labels_dict = {
        "None": 5,
        "Employee": 0,
        "Vice President": 1,
        "Manager": 2,
        "Trader": 3,
        "CEO+Managing Director+Director+President": 4,
    }
    max_enron_id = 183
    if idx is None:
        G_list = []
        labels_list = []
        # Load the 10 temporal Enron slices and give every node constant features.
        for i in range(10):
            net = pickle.load(
                open("data/gnn-explainer-enron/enron_slice_{}.pkl".format(i), "rb")
            )
            # net.add_nodes_from(range(max_enron_id))
            # labels = [n[1].get('role', 'None') for n in net.nodes(data=True)]
            # labels_num = [labels_dict[l] for l in labels]
            featgen_const = featgen.ConstFeatureGen(np.ones(args.input_dim, dtype=float))
            featgen_const.gen_node_features(net)
            G_list.append(net)
            print(net.number_of_nodes())
            # labels_list.append(labels_num)

        # Merge the slices into one graph and train a single node classifier on it.
        G = nx.disjoint_union_all(G_list)
        model = models.GcnEncoderNode(
            args.input_dim,
            args.hidden_dim,
            args.output_dim,
            len(labels_dict),
            args.num_gc_layers,
            bn=args.bn,
            args=args,
        )
        labels = [n[1].get("role", "None") for n in G.nodes(data=True)]
        labels_num = [labels_dict[l] for l in labels]
        for i in range(5):
            print("Label ", i, ": ", labels_num.count(i))
        print("Total num nodes: ", len(labels_num))
        print(labels_num)

        if args.gpu:
            model = model.cuda()  # move the model to GPU
        train_node_classifier(G, labels_num, model, args, writer=writer)
    else:
        print("Running Enron full task")
def enron_task_multigraph(args, idx=None, writer=None):
    labels_dict = {
        "None": 5,
        "Employee": 0,
        "Vice President": 1,
        "Manager": 2,
        "Trader": 3,
        "CEO+Managing Director+Director+President": 4,
    }
    max_enron_id = 183
    if idx is None:
        G_list = []
        labels_list = []
        # Load the 10 temporal Enron slices; pad each to the full node set so that
        # node ids are aligned across slices.
        for i in range(10):
            net = pickle.load(
                open("data/gnn-explainer-enron/enron_slice_{}.pkl".format(i), "rb")
            )
            net.add_nodes_from(range(max_enron_id))
            labels = [n[1].get("role", "None") for n in net.nodes(data=True)]
            labels_num = [labels_dict[l] for l in labels]
            featgen_const = featgen.ConstFeatureGen(np.ones(args.input_dim, dtype=float))
            featgen_const.gen_node_features(net)
            G_list.append(net)
            labels_list.append(labels_num)

        # train_dataset, test_dataset, max_num_nodes = prepare_data(G_list, args)
        model = models.GcnEncoderNode(
            args.input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            args=args,
        )
        if args.gpu:
            model = model.cuda()  # move the model to GPU
        print(labels_num)
        train_node_classifier_multigraph(G_list, labels_list, model, args, writer=writer)
    else:
        print("Running Enron full task")
def gen_syn4(nb_shapes=60, width_basis=8, feature_generator=None, m=4):
    """ Synthetic Graph #4:

    Start with a tree and attach cycle-shaped subgraphs.

    Args:
        nb_shapes         :  The number of shapes (here 'cycles') that should be added to the base graph.
        width_basis       :  The width of the basis graph (here a random 'Tree').
        feature_generator :  A `FeatureGenerator` for node features. If `None`, add constant features to nodes.
        m                 :  The tree depth.

    Returns:
        G                 :  A networkx graph
        role_id           :  Role ID for each node in the synthetic graph
        name              :  A graph identifier
    """
    basis_type = "tree"
    list_shapes = [["cycle", 6]] * nb_shapes

    fig = plt.figure(figsize=(8, 6), dpi=300)

    # Build the tree basis, attach the cycle shapes, then lightly perturb the edges.
    G, role_id, plugins = synthetic_structsim.build_graph(
        width_basis, basis_type, list_shapes, start=0
    )
    G = perturb([G], 0.01)[0]

    if feature_generator is None:
        feature_generator = featgen.ConstFeatureGen(1)
    feature_generator.gen_node_features(G)

    name = basis_type + "_" + str(width_basis) + "_" + str(nb_shapes)

    # Log the full graph to TensorBoard for inspection.
    path = os.path.join("log/syn4_base_h20_o20")
    writer = SummaryWriter(path)
    io_utils.log_graph(writer, G, "graph/full")

    return G, role_id, name
def benchmark_task(args, writer=None, feat="node-label"):
    graphs = io_utils.read_graphfile(args.datadir, args.bmname, max_nodes=args.max_nodes)
    print(max([G.graph["label"] for G in graphs]))

    if feat == "node-feat" and "feat_dim" in graphs[0].graph:
        print("Using node features")
        input_dim = graphs[0].graph["feat_dim"]
    elif feat == "node-label" and "label" in graphs[0].nodes[0]:
        print("Using node labels")
        for G in graphs:
            for u in G.nodes():
                G.nodes[u]["feat"] = np.array(G.nodes[u]["label"])
                # make it -1/1 instead of 0/1
                # feat = np.array(G.nodes[u]['label'])
                # G.nodes[u]['feat'] = feat * 2 - 1
    else:
        print("Using constant labels")
        featgen_const = featgen.ConstFeatureGen(np.ones(args.input_dim, dtype=float))
        for G in graphs:
            featgen_const.gen_node_features(G)

    train_dataset, val_dataset, test_dataset, max_num_nodes, input_dim, assign_input_dim = prepare_data(
        graphs, args, max_nodes=args.max_nodes
    )
    if args.method == "soft-assign":
        print("Method: soft-assign")
        model = models.SoftPoolingGcnEncoder(
            max_num_nodes,
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            args.hidden_dim,
            assign_ratio=args.assign_ratio,
            num_pooling=args.num_pool,
            bn=args.bn,
            dropout=args.dropout,
            linkpred=args.linkpred,
            args=args,
            assign_input_dim=assign_input_dim,
        )
    else:
        print("Method: base")
        model = models.GcnEncoderGraph(
            input_dim,
            args.hidden_dim,
            args.output_dim,
            args.num_classes,
            args.num_gc_layers,
            bn=args.bn,
            dropout=args.dropout,
            args=args,
        )

    train(
        train_dataset,
        model,
        args,
        val_dataset=val_dataset,
        test_dataset=test_dataset,
        writer=writer,
    )
    evaluate(test_dataset, model, args, "Validation")
def read_biosnap(datadir, edgelist_file, label_file, feat_file=None, concat=True):
    """ Read data from BioSnap

    Returns:
        A networkx graph with node labels (and node features).
    """
    G = nx.Graph()
    delimiter = "\t" if "tsv" in edgelist_file else ","
    print(delimiter)
    df = pd.read_csv(os.path.join(datadir, edgelist_file), delimiter=delimiter, header=None)
    data = list(map(tuple, df.values.tolist()))
    G.add_edges_from(data)
    print("Total nodes: ", G.number_of_nodes())

    # Keep only the largest connected component.
    # (nx.connected_component_subgraphs was removed in NetworkX 2.4.)
    G = G.subgraph(max(nx.connected_components(G), key=len)).copy()
    print("Total nodes in largest connected component: ", G.number_of_nodes())

    df = pd.read_csv(os.path.join(datadir, label_file), delimiter="\t", usecols=[0, 1])
    data = list(map(tuple, df.values.tolist()))

    # Attach binary essentiality labels; count label rows whose node is absent.
    missing_node = 0
    for line in data:
        if int(line[0]) not in G:
            missing_node += 1
        else:
            G.nodes[int(line[0])]["label"] = int(line[1] == "Essential")
    print("missing node: ", missing_node)

    # Drop nodes that never received a label.
    missing_label = 0
    remove_nodes = []
    for u in G.nodes():
        if "label" not in G.nodes[u]:
            missing_label += 1
            remove_nodes.append(u)
    G.remove_nodes_from(remove_nodes)
    print("missing_label: ", missing_label)

    if feat_file is None:
        feature_generator = featgen.ConstFeatureGen(np.ones(10, dtype=float))
        feature_generator.gen_node_features(G)
    else:
        df = pd.read_csv(os.path.join(datadir, feat_file), delimiter=",")
        data = np.array(df.values)
        print("Feat shape: ", data.shape)
        for row in data:
            if int(row[0]) in G:
                if concat:
                    node = int(row[0])
                    # One-hot encode the node degree (capped at 10) and append it
                    # to the log-transformed features plus a constant 1.0.
                    onehot = np.zeros(10)
                    onehot[min(G.degree[node], 10) - 1] = 1.0
                    G.nodes[node]["feat"] = np.hstack(
                        (np.log(row[1:] + 0.1), [1.0], onehot)
                    )
                else:
                    G.nodes[int(row[0])]["feat"] = np.log(row[1:] + 0.1)

        # Drop nodes that never received features.
        missing_feat = 0
        remove_nodes = []
        for u in G.nodes():
            if "feat" not in G.nodes[u]:
                missing_feat += 1
                remove_nodes.append(u)
        G.remove_nodes_from(remove_nodes)
        print("missing feat: ", missing_feat)

    return G
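# A hedged usage sketch for read_biosnap (a hypothetical helper; the directory
# and file names below are placeholders, not actual BioSNAP file names): load a
# tab-separated edge list plus an essentiality label file, then inspect the
# resulting labeled graph.
def _demo_read_biosnap():
    G = read_biosnap(
        "data/biosnap",         # placeholder data directory
        "ppi_edgelist.tsv",     # placeholder edge list ("tsv" selects the tab delimiter)
        "essential_genes.tsv",  # placeholder label file: node id, "Essential"/other
    )
    labels = [G.nodes[u]["label"] for u in G.nodes()]
    print("essential fraction:", sum(labels) / len(labels))
    return G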