def __init__(self, dataset):
    self.dataset = dataset
    cfg = self.CONFIG[dataset]
    rating = pd.read_csv(
        cfg['rating_path'], sep=cfg['rating_sep'],
        names=['user_id', 'item_id', 'rating'], usecols=[0, 1, 2], skiprows=1
    )
    kg = pd.read_csv(cfg['kg_path'], sep='\t', names=['head', 'relation', 'tail'])
    item2entity = pd.read_csv(cfg['item2id_path'], sep='\t', names=['item_id', 'entity_id'])

    # Keep only ratings whose items can be mapped to knowledge-graph entities.
    rating = rating[rating['item_id'].isin(item2entity['item_id'])]
    rating.reset_index(drop=True, inplace=True)
    rating['user_id'] = LabelEncoder().fit_transform(rating['user_id'])
    item2entity = dict(zip(item2entity['item_id'], item2entity['entity_id']))
    rating['item_id'] = rating['item_id'].apply(item2entity.__getitem__)

    # Convert explicit ratings to implicit feedback: ratings at or above the
    # threshold count as positive interactions, everything else is dropped.
    rating['label'] = rating['rating'].apply(lambda r: int(r >= cfg['threshold']))
    rating = rating[rating['label'] == 1]

    user_item_graph = dgl.heterograph({
        ('user', 'rate', 'item'): (rating['user_id'].to_numpy(), rating['item_id'].to_numpy())
    })

    # Negative sampling: draw one negative edge per positive edge.
    neg_sampler = Uniform(1)
    nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
    u, v = user_item_graph.edges()
    self.user_item_graph = dgl.heterograph({
        ('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))
    })
    self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])

    kg['relation'] = LabelEncoder().fit_transform(kg['relation'])
    # The knowledge graph is a multigraph: two entities may be connected by
    # several edges whose relation types differ.
    knowledge_graph = dgl.graph((kg['head'], kg['tail']))
    knowledge_graph.edata['relation'] = torch.tensor(kg['relation'].tolist())
    self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
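# A minimal, self-contained sketch of the negative-sampling step used in the
# constructor above, on a made-up toy user-item graph (the node ids and edges
# here are illustrative only, not part of the dataset class).
import dgl
import torch
from dgl.dataloading.negative_sampler import Uniform

toy = dgl.heterograph({
    ('user', 'rate', 'item'): (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 2]))
})
neg_sampler = Uniform(1)  # draw one negative edge per positive edge
nu, nv = neg_sampler(toy, torch.arange(toy.num_edges()))
u, v = toy.edges()
merged = dgl.heterograph({('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))})
merged.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])
print(merged.edata['label'])  # first half are positives (1.), second half negatives (0.)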
def to_bidirected_with_reverse_mapping(g): """Makes a graph bidirectional, and returns a mapping array ``mapping`` where ``mapping[i]`` is the reverse edge of edge ID ``i``. Does not work with graphs that have self-loops. """ g_simple, mapping = dgl.to_simple(dgl.add_reverse_edges(g), return_counts='count', writeback_mapping=True) c = g_simple.edata['count'] num_edges = g.num_edges() mapping_offset = torch.zeros(g_simple.num_edges() + 1, dtype=g_simple.idtype) mapping_offset[1:] = c.cumsum(0) idx = mapping.argsort() idx_uniq = idx[mapping_offset[:-1]] reverse_idx = torch.where(idx_uniq >= num_edges, idx_uniq - num_edges, idx_uniq + num_edges) reverse_mapping = mapping[reverse_idx] # Correctness check src1, dst1 = g_simple.edges() src2, dst2 = g_simple.find_edges(reverse_mapping) assert torch.equal(src1, dst2) assert torch.equal(src2, dst1) return g_simple, reverse_mapping
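# A small usage sketch of the helper above, on a toy directed triangle with no
# self-loops: every edge of the returned graph can be paired with its reverse
# through ``reverse_mapping``.
import dgl
import torch

g = dgl.graph((torch.tensor([0, 1, 2]), torch.tensor([1, 2, 0])))
bg, reverse_mapping = to_bidirected_with_reverse_mapping(g)
src, dst = bg.edges()
rsrc, rdst = bg.find_edges(reverse_mapping)
# Edge i is (src[i], dst[i]); its reverse is (dst[i], src[i]).
assert torch.equal(src, rdst) and torch.equal(dst, rsrc)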
def to_dgl(self: GraphFeaturiser, mol: Mol) -> dgl.DGLGraph: """Generates a DGL graph from a molecule. Args: mol: The molecule to featurise. Returns: A DGL graph of the featurised molecule. """ num_atoms = mol.GetNumAtoms() bonds = mol.GetBonds() bond_from = [bond.GetBeginAtomIdx() for bond in bonds] bond_to = [bond.GetEndAtomIdx() for bond in bonds] g = dgl.graph((torch.tensor(bond_from), torch.tensor(bond_to)), num_nodes=num_atoms) for key, atom_featuriser in self.atom_featurisers.items(): atom_features = atom_featuriser.process_molecule(mol) g.ndata[key] = torch.tensor(atom_features, dtype=torch.float) for key, bond_featuriser in self.bond_featurisers.items(): bond_features = [ bond_featuriser.process_bond(bond) for bond in bonds ] g.edata[key] = torch.tensor(bond_features, dtype=torch.float) g = dgl.add_reverse_edges(g, copy_edata=True) if self.add_self_loops: g = dgl.add_self_loop(g) return g
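# A stripped-down sketch of the same construction without the featuriser classes,
# assuming RDKit (rdkit.Chem) is available; the molecule and the absence of
# node/edge features are purely illustrative.
import dgl
import torch
from rdkit import Chem

mol = Chem.MolFromSmiles('CCO')  # ethanol: 3 heavy atoms, 2 bonds
bonds = mol.GetBonds()
src = torch.tensor([b.GetBeginAtomIdx() for b in bonds])
dst = torch.tensor([b.GetEndAtomIdx() for b in bonds])
g = dgl.graph((src, dst), num_nodes=mol.GetNumAtoms())
g = dgl.add_reverse_edges(g)  # each chemical bond becomes two directed edges
print(g.num_nodes(), g.num_edges())  # 3 4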
def __init__(self): g = OAGCoreDataset()[0] author_rank = load_author_rank() rating = pd.DataFrame( [[i, a] for i, (f, r) in enumerate(author_rank.items()) for a in r], columns=['user_id', 'item_id'] ) user_item_graph = dgl.heterograph( {('user', 'rate', 'item'): (rating['user_id'], rating['item_id'])}, num_nodes_dict={'user': len(author_rank), 'item': g.num_nodes('author')} ) # 负采样 neg_sampler = Uniform(1) nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges())) u, v = user_item_graph.edges() self.user_item_graph = dgl.heterograph( {('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))}, num_nodes_dict={ntype: user_item_graph.num_nodes(ntype) for ntype in user_item_graph.ntypes} ) self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])]) knowledge_graph = dgl.to_homogeneous(dgl.node_type_subgraph(g, ['author', 'institution', 'paper'])) knowledge_graph.edata['relation'] = knowledge_graph.edata[dgl.NTYPE] self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
def convert_mag_to_homograph(g, device):
    """
    Featurize node types that don't have input features (i.e. author,
    institution, field_of_study) by averaging their neighbor features.
    Then convert the graph to an undirected homogeneous graph.
    """
    src_writes, dst_writes = g.all_edges(etype="writes")
    src_topic, dst_topic = g.all_edges(etype="has_topic")
    src_aff, dst_aff = g.all_edges(etype="affiliated_with")
    new_g = dgl.heterograph({
        ("paper", "written", "author"): (dst_writes, src_writes),
        ("paper", "has_topic", "field"): (src_topic, dst_topic),
        ("author", "aff", "inst"): (src_aff, dst_aff)
    })
    new_g = new_g.to(device)
    new_g.nodes["paper"].data["feat"] = g.nodes["paper"].data["feat"]
    # Propagate paper features one hop and average them on the destination nodes.
    new_g["written"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    new_g["has_topic"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    new_g["aff"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    g.nodes["author"].data["feat"] = new_g.nodes["author"].data["feat"]
    g.nodes["institution"].data["feat"] = new_g.nodes["inst"].data["feat"]
    g.nodes["field_of_study"].data["feat"] = new_g.nodes["field"].data["feat"]
    # Convert to homogeneous graph.
    # Get DGL type id for paper type.
    target_type_id = g.get_ntype_id("paper")
    g = dgl.to_homogeneous(g, ndata=["feat"])
    g = dgl.add_reverse_edges(g, copy_ndata=True)
    # Mask for paper nodes.
    g.ndata["target_mask"] = g.ndata[dgl.NTYPE] == target_type_id
    return g
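# A toy illustration (graph and features made up) of the neighbor-averaging trick
# used above: an author's feature becomes the mean of the features of the papers
# that author wrote.
import dgl
import dgl.function as fn
import torch

toy = dgl.heterograph({
    ("paper", "written", "author"): (torch.tensor([0, 1, 1]), torch.tensor([0, 0, 1]))
})
toy.nodes["paper"].data["feat"] = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
toy["written"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
print(toy.nodes["author"].data["feat"])
# author 0 averages papers 0 and 1 -> [0.5, 0.5]; author 1 only wrote paper 1 -> [0.0, 1.0]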
def same_direction(scenario, graph_id):
    # Edges for a process reading and writing files are treated as pointing in the same direction.
    edge_types = ['execve', 'access', 'mmap2', 'open', 'fstat', 'close', 'read', 'stat', 'write',
                  'unlink', 'clone', 'waitpid', 'bind', 'listen', 'chmod', 'connect', 'writev',
                  'recv', 'ftruncate', 'sendmsg', 'send', 'recvmsg', 'accept', 'sendto', 'recvfrom', 'truncate']
    node_types = ['process', 'file', 'MAP_ANONYMOUS', 'stdin', 'stdout', 'stderr', 'NA', 'thread']
    data_path = 'dataset/split_data/' + scenario + '/' + str(graph_id) + '.csv'
    # Data entry: source-id, source-type, destination-id, destination-type, edge-type, timestamp, graph-id.
    # The indexes in this list are node ids in the graph; the values are the original ids in the raw data.
    node_original_id = []
    # One-hot encodings for node types and edge types.
    node_feats, edge_feats = [], []
    # Source and destination nodes of the homogeneous graph.
    u, v = [], []
    with open(data_path, 'r') as file:
        reader = csv.reader(file)
        for line in reader:
            src_id = int(line[0])
            src_type = line[1]
            dst_id = int(line[2])
            dst_type = line[3]
            edge_type = line[4]
            timestamp = int(line[5])
            if src_id not in node_original_id:
                node_original_id.append(src_id)
            u.append(node_original_id.index(src_id))
            if dst_id not in node_original_id:
                node_original_id.append(dst_id)
            v.append(node_original_id.index(dst_id))
            # One-hot encoding for node and edge features.
            src_node_feat = [0] * len(node_types)
            src_node_feat[node_types.index(src_type)] = 1
            if node_original_id.index(src_id) + 1 > len(node_feats):
                node_feats[len(node_feats):node_original_id.index(src_id) + 1] = [[0] * len(node_types)]
            node_feats[node_original_id.index(src_id)] = src_node_feat
            dst_node_feat = [0] * len(node_types)
            dst_node_feat[node_types.index(dst_type)] = 1
            if node_original_id.index(dst_id) + 1 > len(node_feats):
                node_feats[len(node_feats):node_original_id.index(dst_id) + 1] = [[0] * len(node_types)]
            node_feats[node_original_id.index(dst_id)] = dst_node_feat
            edge_feat = [0] * len(edge_types)
            edge_feat[edge_types.index(edge_type)] = 1
            edge_feats.append(edge_feat)
    u_ids, v_ids = th.tensor(u), th.tensor(v)
    node_feats, edge_feats = th.tensor(node_feats), th.tensor(edge_feats)
    g = dgl.graph((u_ids, v_ids), idtype=th.int32)
    g.ndata['feat'] = node_feats
    g.edata['feat'] = edge_feats
    # Add reverse edges to eliminate 0-in-degree nodes.
    bg = dgl.add_reverse_edges(g, copy_ndata=True, copy_edata=True)
    return bg
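# A quick illustration of why the reverse edges are added at the end: on a toy
# two-edge chain, node 0 has no incoming edge until the graph is mirrored.
import dgl
import torch as th

g = dgl.graph((th.tensor([0, 1]), th.tensor([1, 2])))
print(g.in_degrees())   # tensor([0, 1, 1]) -> node 0 has in-degree 0
bg = dgl.add_reverse_edges(g)
print(bg.in_degrees())  # tensor([1, 2, 1]) -> no 0-in-degree nodes remain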
def get_current_ts(pos_graph, neg_graph): with pos_graph.local_scope(): pos_graph_ = dgl.add_reverse_edges(pos_graph, copy_edata=True) pos_graph_.update_all(fn.copy_e('timestamp', 'times'), fn.max('times', 'ts')) current_ts = pos_ts = pos_graph_.ndata['ts'] num_pos_nodes = pos_graph_.num_nodes() with neg_graph.local_scope(): neg_graph_ = dgl.add_reverse_edges(neg_graph) neg_graph_.edata['timestamp'] = pos_graph_.edata['timestamp'] neg_graph_.update_all(fn.copy_e('timestamp', 'times'), fn.max('times', 'ts')) num_pos_nodes = torch.where(pos_graph_.ndata['ts'] > 0)[0].shape[0] pos_ts = pos_graph_.ndata['ts'][:num_pos_nodes] neg_ts = neg_graph_.ndata['ts'][num_pos_nodes:] current_ts = torch.cat([pos_ts, neg_ts]) return current_ts, pos_ts, num_pos_nodes
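# A small sketch of what the function computes: after mirroring the edges, each
# node's 'ts' becomes the latest timestamp among its incident edges (toy values).
import dgl
import dgl.function as fn
import torch

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
g.edata['timestamp'] = torch.tensor([5.0, 9.0])
g_ = dgl.add_reverse_edges(g, copy_edata=True)
g_.update_all(fn.copy_e('timestamp', 'times'), fn.max('times', 'ts'))
print(g_.ndata['ts'])  # tensor([5., 9., 9.])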
def processItem(item):
    graph, bidirectional, key, categories, tolerance = item
    graph_dict = {}
    vertices = graphVertices(graph)
    edges = graphEdges(graph)
    graph_dict["num_nodes"] = len(vertices)
    graph_dict["src"] = []
    graph_dict["dst"] = []
    graph_dict["node_labels"] = {}
    nodes = []
    graph_edges = []
    # This is a hack, please replace
    test_list = []
    for i in range(len(vertices)):
        vDict = vertices[i].GetDictionary()
        vLabel = DictionaryValueAtKey.processItem([vDict, key])
        graph_dict["node_labels"][i] = vLabel
        nodes.append(i)
        # This is a hack, please replace
        test_list.append(vLabel)
    # Here we need to call oneHotEncode to create the one-hot encoding.
    # What is the input list we need here?
    # This is a hack, please replace.
    one_hot_encoded_list = oneHotEncode(test_list, categories)
    print("Categories", categories)
    print("Test List", test_list)
    print("One-Hot-Encoded List", one_hot_encoded_list)
    # Do something with the one_hot_encoded list
    for i in range(len(edges)):
        e = edges[i]
        sv = e.StartVertex()
        ev = e.EndVertex()
        sn = nodes[vertexIndex(sv, vertices, tolerance)]
        en = nodes[vertexIndex(ev, vertices, tolerance)]
        if [sn, en] not in graph_edges and [en, sn] not in graph_edges:
            graph_edges.append([sn, en])
    for anEdge in graph_edges:
        graph_dict["src"].append(anEdge[0])
        graph_dict["dst"].append(anEdge[1])
    # Create the DGL graph
    src = np.array(graph_dict["src"])
    dst = np.array(graph_dict["dst"])
    num_nodes = graph_dict["num_nodes"]
    dgl_graph = dgl.graph((src, dst), num_nodes=num_nodes)
    dgl_graph.ndata['attr'] = torch.ones(num_nodes, 1)
    if bidirectional:
        dgl_graph = dgl.add_reverse_edges(dgl_graph)
    return dgl_graph
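# The oneHotEncode helper is not shown in this excerpt; one plausible shape for the
# list-of-labels call above could look like the following (purely illustrative,
# not the original implementation).
def oneHotEncode(labels, categories):
    # One row of 0/1 indicators per label, ordered by `categories`.
    return [[1 if label == category else 0 for category in categories] for label in labels]

print(oneHotEncode(["wall", "door"], ["door", "wall", "window"]))
# [[0, 1, 0], [1, 0, 0]]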
def processItem(item): graphs_file_path, edges_file_path, nodes_file_path, graph_id_header, graph_label_header, num_nodes_header, src_header, dst_header, node_label_header, node_attr_key, categories, bidirectional = item graphs = pd.read_csv(graphs_file_path) edges = pd.read_csv(edges_file_path) nodes = pd.read_csv(nodes_file_path) dgl_graphs = [] labels = [] # Create a graph for each graph ID from the edges table. # First process the graphs table into two dictionaries with graph IDs as keys. # The label and number of nodes are values. label_dict = {} num_nodes_dict = {} for _, row in graphs.iterrows(): label_dict[row[graph_id_header]] = row[graph_label_header] num_nodes_dict[row[graph_id_header]] = row[num_nodes_header] # For the edges, first group the table by graph IDs. edges_group = edges.groupby(graph_id_header) # For the nodes, first group the table by graph IDs. nodes_group = nodes.groupby(graph_id_header) # For each graph ID... for graph_id in edges_group.groups: graph_dict = {} graph_dict[src_header] = [] graph_dict[dst_header] = [] graph_dict[node_label_header] = {} graph_dict["node_features"] = [] num_nodes = num_nodes_dict[graph_id] graph_label = label_dict[graph_id] labels.append(graph_label) # Find the edges as well as the number of nodes and its label. edges_of_id = edges_group.get_group(graph_id) src = edges_of_id[src_header].to_numpy() dst = edges_of_id[dst_header].to_numpy() # Find the nodes and their labels and features nodes_of_id = nodes_group.get_group(graph_id) node_labels = nodes_of_id[node_label_header] #graph_dict["node_labels"][graph_id] = node_labels for node_label in node_labels: graph_dict["node_features"].append(torch.tensor(oneHotEncode(node_label, categories))) # Create a graph and add it to the list of graphs and labels. dgl_graph = dgl.graph((src, dst), num_nodes=num_nodes) # Setting the node features as node_attr_key using onehotencoding of node_label dgl_graph.ndata[node_attr_key] = torch.stack(graph_dict["node_features"]) if bidirectional: dgl_graph = dgl.add_reverse_edges(dgl_graph) dgl_graphs.append(dgl_graph) return [dgl_graphs, labels]
def gen_mail(self, args, emb, input_nodes, pair_graph, frontier, mode='train'): pair_graph.ndata['feat'] = emb pair_graph = dgl.add_reverse_edges(pair_graph, copy_edata=True) pair_graph.update_all(MSG.get_edge_msg, fn.mean('m','msg')) frontier.ndata['msg'] = torch.zeros((frontier.num_nodes(), self.nfeat_dim + 2)) frontier.ndata['msg'][pair_graph.ndata[dgl.NID]] = pair_graph.ndata['msg'].to('cpu') for _ in range(args.n_layer): frontier.update_all(fn.copy_u('msg','m'), fn.mean('m','msg')) mail = MSG.msg2mail(frontier.ndata['mail'][input_nodes], frontier.ndata['msg'][input_nodes]) return mail
def processItem(item):
    file_path, categories, bidirectional = item
    graphs = []
    labels = []
    file = open(file_path)
    if file:
        lines = file.readlines()
        n_graphs = int(lines[0])
        index = 1
        for i in range(n_graphs):
            graph_dict = {}
            graph_dict["src"] = []
            graph_dict["dst"] = []
            graph_dict["node_labels"] = {}
            graph_dict["node_features"] = []
            line = lines[index].split()
            n_nodes = int(line[0])
            graph_dict["num_nodes"] = n_nodes
            graph_label = int(line[1])
            labels.append(graph_label)
            index += 1
            for j in range(n_nodes):
                line = lines[index + j].split()
                node_label = int(line[0])
                graph_dict["node_labels"][j] = node_label
                graph_dict["node_features"].append(
                    torch.tensor(oneHotEncode(node_label, categories)))
                adj_vertices = line[2:]
                for adj_vertex in adj_vertices:
                    graph_dict["src"].append(j)
                    graph_dict["dst"].append(int(adj_vertex))
            # Create the DGL graph
            src = np.array(graph_dict["src"])
            dst = np.array(graph_dict["dst"])
            dgl_graph = dgl.graph((src, dst), num_nodes=graph_dict["num_nodes"])
            # Set the node features as 'node_attr' using the one-hot encoding of the node label
            dgl_graph.ndata['node_attr'] = torch.stack(graph_dict["node_features"])
            if bidirectional:
                dgl_graph = dgl.add_reverse_edges(dgl_graph)
            graphs.append(dgl_graph)
            index += n_nodes
        file.close()
    return [graphs, labels]
def load_dataset(name, device): """ Load dataset and move graph and features to device """ if name not in ["ogbn-products", "ogbn-arxiv", "ogbn-mag"]: raise RuntimeError("Dataset {} is not supported".format(name)) dataset = DglNodePropPredDataset(name=name) splitted_idx = dataset.get_idx_split() train_nid = splitted_idx["train"] val_nid = splitted_idx["valid"] test_nid = splitted_idx["test"] g, labels = dataset[0] g = g.to(device) if name == "ogbn-arxiv": g = dgl.add_reverse_edges(g, copy_ndata=True) g = dgl.add_self_loop(g) g.ndata['feat'] = g.ndata['feat'].float() elif name == "ogbn-mag": # MAG is a heterogeneous graph. The task is to make prediction for # paper nodes labels = labels["paper"] train_nid = train_nid["paper"] val_nid = val_nid["paper"] test_nid = test_nid["paper"] g = convert_mag_to_homograph(g, device) else: g.ndata['feat'] = g.ndata['feat'].float() n_classes = dataset.num_classes labels = labels.squeeze() evaluator = get_ogb_evaluator(name) print(f"# Nodes: {g.number_of_nodes()}\n" f"# Edges: {g.number_of_edges()}\n" f"# Train: {len(train_nid)}\n" f"# Val: {len(val_nid)}\n" f"# Test: {len(test_nid)}\n" f"# Classes: {n_classes}") return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
    device = th.device('cuda:%d' % args.gpu)
else:
    device = th.device('cpu')

# load ogbn-products data
# data = DglNodePropPredDataset(name='ogbn-products')
data = DglNodePropPredDataset(name="ogbn-" + args.dataset, root='torch_geometric_data/')
splitted_idx = data.get_idx_split()
train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
graph, labels = data[0]
n_classes = (labels.max() + 1).item()
graph = graph.to(device)
if args.dataset == "arxiv":
    graph = dgl.add_reverse_edges(graph, copy_ndata=True)
    graph = dgl.add_self_loop(graph)
    graph.ndata['feat'] = graph.ndata['feat'].float()
    labels = labels[:, 0].to(device)
elif args.dataset == "ogbn-mag":
    labels = labels["paper"]
    train_idx = train_idx["paper"]
    val_idx = val_idx["paper"]
    test_idx = test_idx["paper"]
    graph = convert_mag_to_homograph(graph, device)
    labels = labels[:, 0].to(device)
elif args.dataset == "proteins":
    n_classes = labels.shape[1]
    graph.update_all(fn.copy_e("feat", "feat_copy"), fn.sum("feat_copy", "feat"))
    # one_hot = th.zeros(graph.number_of_nodes(), n_classes)
def train(args, logger):
    task_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    Path("./saved_models/").mkdir(parents=True, exist_ok=True)
    Path("./pretrained_models/").mkdir(parents=True, exist_ok=True)
    MODEL_SAVE_PATH = './saved_models/'
    Pretrained_MODEL_PATH = './pretrained_models/'
    get_model_name = lambda part: f'{part}-{args.data}-{args.tasks}-{args.prefix}.pth'
    get_pretrain_model_name = lambda part: f'{part}-{args.data}-LP-{args.prefix}.pth'

    device_string = 'cuda:{}'.format(args.gpu) if torch.cuda.is_available() and args.gpu >= 0 else 'cpu'
    print('Model training with ' + device_string)
    device = torch.device(device_string)

    g = load_graphs(f"./data/{args.data}.dgl")[0][0]
    efeat_dim = g.edata['feat'].shape[1]
    nfeat_dim = efeat_dim

    train_loader, val_loader, test_loader, num_val_samples, num_test_samples = dataloader(args, g)

    encoder = Encoder(args, nfeat_dim, n_head=args.n_head, dropout=args.dropout).to(device)
    decoder = Decoder(args, nfeat_dim).to(device)
    msg2mail = Msg2Mail(args, nfeat_dim)
    fraud_sampler = frauder_sampler(g)

    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay)
    scheduler_lr = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40)
    if args.warmup:
        scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=3, after_scheduler=scheduler_lr)
        optimizer.zero_grad()
        optimizer.step()

    loss_fcn = torch.nn.BCEWithLogitsLoss()
    loss_fcn = loss_fcn.to(device)

    early_stopper = EarlyStopMonitor(logger=logger, max_round=args.patience, higher_better=True)

    if args.pretrain:
        logger.info(f'Loading the linkpred pretrained attention based encoder model')
        encoder.load_state_dict(torch.load(Pretrained_MODEL_PATH + get_pretrain_model_name('Encoder')))

    for epoch in range(args.n_epoch):
        # Reset node state.
        g.ndata['mail'] = torch.zeros((g.num_nodes(), args.n_mail, nfeat_dim + 2), dtype=torch.float32)
        g.ndata['feat'] = torch.zeros((g.num_nodes(), nfeat_dim), dtype=torch.float32)  # initialised to zero; other initialisations are possible
g.ndata['last_update'] = torch.zeros((g.num_nodes()), dtype=torch.float32) encoder.train() decoder.train() start_epoch = time.time() m_loss = [] logger.info('start {} epoch, current optim lr is {}'.format(epoch, optimizer.param_groups[0]['lr'])) for batch_idx, (input_nodes, pos_graph, neg_graph, blocks, frontier, current_ts) in enumerate(train_loader): pos_graph = pos_graph.to(device) neg_graph = neg_graph.to(device) if neg_graph is not None else None if not args.no_time or not args.no_pos: current_ts, pos_ts, num_pos_nodes = get_current_ts(args, pos_graph, neg_graph) pos_graph.ndata['ts'] = current_ts else: current_ts, pos_ts, num_pos_nodes = None, None, None _ = dgl.add_reverse_edges(neg_graph) if neg_graph is not None else None emb, _ = encoder(dgl.add_reverse_edges(pos_graph), _, num_pos_nodes) if batch_idx != 0: if 'LP' not in args.tasks and args.balance: neg_graph = fraud_sampler.sample_fraud_event(g, args.bs//5, current_ts.max().cpu()).to(device) logits, labels = decoder(emb, pos_graph, neg_graph) loss = loss_fcn(logits, labels) optimizer.zero_grad() loss.backward() optimizer.step() m_loss.append(loss.item()) # MSG Passing with torch.no_grad(): mail = msg2mail.gen_mail(args, emb, input_nodes, pos_graph, frontier, 'train') if not args.no_time: g.ndata['last_update'][pos_graph.ndata[dgl.NID][:num_pos_nodes]] = pos_ts.to('cpu') g.ndata['feat'][pos_graph.ndata[dgl.NID]] = emb.to('cpu') g.ndata['mail'][input_nodes] = mail if batch_idx % 100 == 1: gpu_mem = torch.cuda.max_memory_allocated() / 1.074e9 if torch.cuda.is_available() and args.gpu >= 0 else 0 torch.cuda.empty_cache() mem_perc = psutil.virtual_memory().percent cpu_perc = psutil.cpu_percent(interval=None) output_string = f'Epoch {epoch} | Step {batch_idx}/{len(train_loader)} | CPU {cpu_perc:.1f}% | Sys Mem {mem_perc:.1f}% | GPU Mem {gpu_mem:.4f}GB ' output_string += f'| {args.tasks} Loss {np.mean(m_loss):.4f}' logger.info(output_string) total_epoch_time = time.time() - start_epoch logger.info(' training epoch: {} took {:.4f}s'.format(epoch, total_epoch_time)) val_ap, val_auc, val_acc, val_loss = eval_epoch(args, logger, g, val_loader, encoder, decoder, msg2mail, loss_fcn, device, num_val_samples) logger.info('Val {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(args.tasks, val_ap, val_auc, val_acc, val_loss)) if args.warmup: scheduler_warmup.step(epoch) else: scheduler_lr.step() early_stopper_metric = val_ap if 'LP' in args.tasks else val_auc if early_stopper.early_stop_check(early_stopper_metric): logger.info('No improvement over {} epochs, stop training'.format(early_stopper.max_round)) logger.info(f'Loading the best model at epoch {early_stopper.best_epoch}') encoder.load_state_dict(torch.load(MODEL_SAVE_PATH+get_model_name('Encoder'))) decoder.load_state_dict(torch.load(MODEL_SAVE_PATH+get_model_name('Decoder'))) test_result = [early_stopper.best_ap, early_stopper.best_auc, early_stopper.best_acc, early_stopper.best_loss] break test_ap, test_auc, test_acc, test_loss = eval_epoch(args, logger, g, test_loader, encoder, decoder, msg2mail, loss_fcn, device, num_test_samples) logger.info('Test {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(args.tasks, test_ap, test_auc, test_acc, test_loss)) test_result = [test_ap, test_auc, test_acc, test_loss] if early_stopper.best_epoch == epoch: early_stopper.best_ap = test_ap early_stopper.best_auc = test_auc early_stopper.best_acc = test_acc early_stopper.best_loss = test_loss logger.info(f'Saving the best model at epoch 
{early_stopper.best_epoch}') torch.save(encoder.state_dict(), MODEL_SAVE_PATH+get_model_name('Encoder')) torch.save(decoder.state_dict(), MODEL_SAVE_PATH+get_model_name('Decoder'))
# # This tutorial loads the dataset from the ``ogb`` package as in the # :doc:`previous tutorial <L1_large_node_classification>`. # import dgl import torch import numpy as np from ogb.nodeproppred import DglNodePropPredDataset dataset = DglNodePropPredDataset('ogbn-arxiv') device = 'cpu' # change to 'cuda' for GPU graph, node_labels = dataset[0] # Add reverse edges since ogbn-arxiv is unidirectional. graph = dgl.add_reverse_edges(graph) print(graph) print(node_labels) node_features = graph.ndata['feat'] node_labels = node_labels[:, 0] num_features = node_features.shape[1] num_classes = (node_labels.max() + 1).item() print('Number of classes:', num_classes) idx_split = dataset.get_idx_split() train_nids = idx_split['train'] valid_nids = idx_split['valid'] test_nids = idx_split['test'] ######################################################################
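# Optional sanity check (not part of the original tutorial): after adding the
# reverse edges the citation graph is symmetric, i.e. every edge (u, v) now has
# a matching edge (v, u).
src, dst = graph.edges()
assert graph.has_edges_between(dst, src).all()
print('Number of directed edges after adding reverses:', graph.num_edges())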
def eval_epoch(args, logger, g, dataloader, encoder, decoder, msg2mail, loss_fcn, device, num_samples):
    m_ap, m_auc, m_acc = [[], [], []] if 'LP' in args.tasks else [0, 0, 0]

    labels_all = torch.zeros((num_samples)).long()
    logits_all = torch.zeros((num_samples))
    attn_weight_all = torch.zeros((num_samples, args.n_mail))
    m_loss = []
    m_infer_time = []
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        loss = torch.tensor(0)
        for batch_idx, (input_nodes, pos_graph, neg_graph, blocks, frontier, current_ts) in enumerate(dataloader):
            n_sample = pos_graph.num_edges()
            start_idx = batch_idx * n_sample
            end_idx = min(num_samples, start_idx + n_sample)

            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device) if neg_graph is not None else None

            if not args.no_time or not args.no_pos:
                current_ts, pos_ts, num_pos_nodes = get_current_ts(args, pos_graph, neg_graph)
                pos_graph.ndata['ts'] = current_ts
            else:
                current_ts, pos_ts, num_pos_nodes = None, None, None

            _ = dgl.add_reverse_edges(neg_graph) if neg_graph is not None else None

            start = time.time()
            emb, attn_weight = encoder(dgl.add_reverse_edges(pos_graph), _, num_pos_nodes)
            # attn_weight_all[start_idx:end_idx] = attn_weight[:n_sample]
            logits, labels = decoder(emb, pos_graph, neg_graph)
            end = time.time() - start
            m_infer_time.append(end)

            loss = loss_fcn(logits, labels)
            m_loss.append(loss.item())

            mail = msg2mail.gen_mail(args, emb, input_nodes, pos_graph, frontier, 'val')
            if not args.no_time:
                g.ndata['last_update'][pos_graph.ndata[dgl.NID][:num_pos_nodes]] = pos_ts.to('cpu')
            g.ndata['feat'][pos_graph.ndata[dgl.NID]] = emb.to('cpu')
            g.ndata['mail'][input_nodes] = mail

            labels = labels.long()
            logits = logits.sigmoid()

            if 'LP' in args.tasks:
                pred = logits > 0.5
                m_ap.append(average_precision(logits, labels).cpu().numpy())
                m_auc.append(auroc(logits, labels).cpu().numpy())
                m_acc.append(accuracy(pred, labels).cpu().numpy())
            else:
                labels_all[start_idx:end_idx] = labels
                logits_all[start_idx:end_idx] = logits

        if 'LP' in args.tasks:
            ap, auc, acc = np.mean(m_ap), np.mean(m_auc), np.mean(m_acc)
        else:
            pred_all = logits_all > 0.5
            ap = average_precision(logits_all, labels_all).cpu().item()
            auc = auroc(logits_all, labels_all).cpu().item()
            acc = accuracy(pred_all, labels_all).cpu().item()
            fprs, tprs, thresholds = roc(logits_all, labels_all)
            fpr_l, tpr_l, thres_l = get_TPR_FPR_metrics(fprs, tprs, thresholds)
            print_tp_fp_thres(args.tasks, logger, fpr_l, tpr_l, thres_l)

    print('total inference time:', np.sum(m_infer_time))
    logger.info(attn_weight_all.mean(0))
    encoder.train()
    decoder.train()
    return ap, auc, acc, np.mean(m_loss)
# The original edge feature of each node in sg2 print("original edge feature of each node in sg2: ") print(sg2.edata['a']) ###################################################################### # Another common transformation is to add a reverse edge for each edge in # the original graph with ``dgl.add_reverse_edges``. # # .. note:: # # If you have an undirected graph, it is better to convert it # into a bidirectional graph first via adding reverse edges. # print("add reverse edges: ") newg = dgl.add_reverse_edges(g) newg.edges() ###################################################################### # Loading and Saving Graphs # ------------------------- # # You can save a graph or a list of graphs via ``dgl.save_graphs`` and # load them back with ``dgl.load_graphs``. # # Save graphs print( "-----------------------------------------------------------------------------------" ) print("Step 5: Loading and Saving Graphs: ")
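# For example, the bidirectional graph built above can be written to disk and read
# back; the file name here is arbitrary.
dgl.save_graphs('demo_graph.bin', [newg])
loaded_graphs, _ = dgl.load_graphs('demo_graph.bin')
print(loaded_graphs[0])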
def main(): # check cuda device = f'cuda:{args.gpu}' if torch.cuda.is_available( ) and args.gpu >= 0 else 'cpu' # load data dataset = DglNodePropPredDataset(name=args.dataset) evaluator = Evaluator(name=args.dataset) split_idx = dataset.get_idx_split() g, labels = dataset[ 0] # graph: DGLGraph object, label: torch tensor of shape (num_nodes, num_tasks) if args.dataset == 'ogbn-arxiv': if args.model == 'gat': g = dgl.add_reverse_edges(g, copy_ndata=True) g = g.add_self_loop() else: g = dgl.to_bidirected(g, copy_ndata=True) feat = g.ndata['feat'] feat = (feat - feat.mean(0)) / feat.std(0) g.ndata['feat'] = feat g = g.to(device) feats = g.ndata['feat'] labels = labels.to(device) # load masks for train / validation / test train_idx = split_idx["train"].to(device) valid_idx = split_idx["valid"].to(device) test_idx = split_idx["test"].to(device) n_features = feats.size()[-1] n_classes = dataset.num_classes # load model if args.model == 'mlp': model = MLP(n_features, args.hid_dim, n_classes, args.num_layers, args.dropout) elif args.model == 'linear': model = MLPLinear(n_features, n_classes) elif args.model == 'gat': model = GAT(in_feats=n_features, n_classes=n_classes, n_hidden=args.hid_dim, n_layers=args.num_layers, n_heads=args.n_heads, activation=F.relu, dropout=args.dropout, attn_drop=args.attn_drop) else: raise NotImplementedError(f'Model {args.model} is not supported.') model = model.to(device) print(f'Model parameters: {sum(p.numel() for p in model.parameters())}') if args.pretrain: print('---------- Before ----------') model.load_state_dict( torch.load(f'base/{args.dataset}-{args.model}.pt')) model.eval() if args.model == 'gat': y_soft = model(g, feats).exp() else: y_soft = model(feats).exp() y_pred = y_soft.argmax(dim=-1, keepdim=True) valid_acc = evaluate(y_pred, labels, valid_idx, evaluator) test_acc = evaluate(y_pred, labels, test_idx, evaluator) print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}') print('---------- Correct & Smoothing ----------') cs = CorrectAndSmooth(num_correction_layers=args.num_correction_layers, correction_alpha=args.correction_alpha, correction_adj=args.correction_adj, num_smoothing_layers=args.num_smoothing_layers, smoothing_alpha=args.smoothing_alpha, smoothing_adj=args.smoothing_adj, scale=args.scale) mask_idx = torch.cat([train_idx, valid_idx]) if args.model != 'gat': y_soft = cs.correct(g, y_soft, labels[mask_idx], mask_idx) y_soft = cs.smooth(g, y_soft, labels[mask_idx], mask_idx) y_pred = y_soft.argmax(dim=-1, keepdim=True) valid_acc = evaluate(y_pred, labels, valid_idx, evaluator) test_acc = evaluate(y_pred, labels, test_idx, evaluator) print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}') else: if args.model == 'gat': opt = optim.RMSprop(model.parameters(), lr=args.lr) else: opt = optim.Adam(model.parameters(), lr=args.lr) best_acc = 0 best_model = copy.deepcopy(model) # training print('---------- Training ----------') for i in range(args.epochs): if args.model == 'gat': adjust_learning_rate(opt, args.lr, i) model.train() opt.zero_grad() if args.model == 'gat': logits = model(g, feats) else: logits = model(feats) train_loss = F.nll_loss(logits[train_idx], labels.squeeze(1)[train_idx]) train_loss.backward() opt.step() model.eval() with torch.no_grad(): if args.model == 'gat': logits = model(g, feats) else: logits = model(feats) y_pred = logits.argmax(dim=-1, keepdim=True) train_acc = evaluate(y_pred, labels, train_idx, evaluator) valid_acc = evaluate(y_pred, labels, valid_idx, evaluator) print( f'Epoch {i} | Train loss: 
{train_loss.item():.4f} | Train acc: {train_acc:.4f} | Valid acc {valid_acc:.4f}' ) if valid_acc > best_acc: best_acc = valid_acc best_model = copy.deepcopy(model) # testing & saving model print('---------- Testing ----------') best_model.eval() if args.model == 'gat': logits = best_model(g, feats) else: logits = best_model(feats) y_pred = logits.argmax(dim=-1, keepdim=True) test_acc = evaluate(y_pred, labels, test_idx, evaluator) print(f'Test acc: {test_acc:.4f}') if not os.path.exists('base'): os.makedirs('base') torch.save(best_model.state_dict(), f'base/{args.dataset}-{args.model}.pt')
def load_dataset(device, args): """ Load dataset and move graph and features to device """ if args.dataset in [ "reddit", "cora", "ppi", "ppi_large", "yelp", "flickr" ]: # raise RuntimeError("Dataset {} is not supported".format(name)) if args.dataset == "reddit": from dgl.data import RedditDataset data = RedditDataset(self_loop=True) g = data[0] g = dgl.add_self_loop(g) n_classes = data.num_classes elif args.dataset == "cora": from dgl.data import CitationGraphDataset data = CitationGraphDataset('cora', raw_dir=os.path.join( args.data_dir, 'cora')) g = data[0] g = dgl.remove_self_loop(g) g = dgl.add_self_loop(g) n_classes = data.num_classes elif args.dataset == "ppi": data = load_ppi_data(args.data_dir) g = data.g n_classes = data.num_classes elif args.dataset == "ppi_large": data = load_ppi_large_data() g = data.g n_classes = data.num_classes elif args.dataset == "yelp": from torch_geometric.datasets import Yelp pyg_data = Yelp(os.path.join(args.data_dir, 'yelp'))[0] feat = pyg_data.x labels = pyg_data.y u, v = pyg_data.edge_index g = dgl.graph((u, v)) g.ndata['feat'] = feat g.ndata['label'] = labels g.ndata['train_mask'] = pyg_data.train_mask g.ndata['val_mask'] = pyg_data.val_mask g.ndata['test_mask'] = pyg_data.test_mask n_classes = labels.size(1) elif args.dataset == "flickr": from torch_geometric.datasets import Flickr pyg_data = Flickr(os.path.join(args.data_dir, "flickr"))[0] feat = pyg_data.x labels = pyg_data.y # labels = torch.argmax(labels, dim=1) u, v = pyg_data.edge_index g = dgl.graph((u, v)) g.ndata['feat'] = feat g.ndata['label'] = labels g.ndata['train_mask'] = pyg_data.train_mask g.ndata['val_mask'] = pyg_data.val_mask g.ndata['test_mask'] = pyg_data.test_mask n_classes = labels.max().item() + 1 train_mask = g.ndata['train_mask'] val_mask = g.ndata['val_mask'] test_mask = g.ndata['test_mask'] train_nid = train_mask.nonzero().squeeze().long() val_nid = val_mask.nonzero().squeeze().long() test_nid = test_mask.nonzero().squeeze().long() g = g.to(device) labels = g.ndata['label'] else: dataset = DglNodePropPredDataset(name=args.dataset, root=args.data_dir) splitted_idx = dataset.get_idx_split() train_nid = splitted_idx["train"] val_nid = splitted_idx["valid"] test_nid = splitted_idx["test"] g, labels = dataset[0] n_classes = dataset.num_classes g = g.to(device) if args.dataset == "ogbn-arxiv": g = dgl.add_reverse_edges(g, copy_ndata=True) g = dgl.add_self_loop(g) g.ndata['feat'] = g.ndata['feat'].float() elif args.dataset == "ogbn-papers100M": g = dgl.add_reverse_edges(g, copy_ndata=True) g.ndata['feat'] = g.ndata['feat'].float() labels = labels.long() elif args.dataset == "ogbn-mag": # MAG is a heterogeneous graph. 
The task is to make prediction for # paper nodes path = os.path.join(args.emb_path, f"{args.pretrain_model}_mag") labels = labels["paper"] train_nid = train_nid["paper"] val_nid = val_nid["paper"] test_nid = test_nid["paper"] features = g.nodes['paper'].data['feat'] author_emb = torch.load(os.path.join(path, "author.pt"), map_location=torch.device("cpu")).float() topic_emb = torch.load(os.path.join(path, "field_of_study.pt"), map_location=torch.device("cpu")).float() institution_emb = torch.load( os.path.join(path, "institution.pt"), map_location=torch.device("cpu")).float() g.nodes["author"].data["feat"] = author_emb.to(device) g.nodes["institution"].data["feat"] = institution_emb.to(device) g.nodes["field_of_study"].data["feat"] = topic_emb.to(device) g.nodes["paper"].data["feat"] = features.to(device) paper_dim = g.nodes["paper"].data["feat"].shape[1] author_dim = g.nodes["author"].data["feat"].shape[1] if paper_dim != author_dim: paper_feat = g.nodes["paper"].data.pop("feat") rand_weight = torch.Tensor(paper_dim, author_dim).uniform_(-0.5, 0.5) g.nodes["paper"].data["feat"] = torch.matmul( paper_feat, rand_weight.to(device)) print( f"Randomly project paper feature from dimension {paper_dim} to {author_dim}" ) labels = labels.to(device).squeeze() n_classes = int(labels.max() - labels.min()) + 1 else: g.ndata['feat'] = g.ndata['feat'].float() labels = labels.squeeze() evaluator = get_evaluator(args.dataset) print(f"# Nodes: {g.number_of_nodes()}\n" f"# Edges: {g.number_of_edges()}\n" f"# Train: {len(train_nid)}\n" f"# Val: {len(val_nid)}\n" f"# Test: {len(test_nid)}\n" f"# Classes: {n_classes}") return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
else: sampler = TemporalSampler(k=args.n_neighbors) edge_collator = TemporalEdgeCollator neg_sampler = dgl.dataloading.negative_sampler.Uniform( k=args.num_negative_samples) # Set Train, validation, test and new node test id train_seed = torch.arange(int(TRAIN_SPLIT * graph_no_new_node.num_edges())) valid_seed = torch.arange(int(TRAIN_SPLIT * graph_no_new_node.num_edges()), trainval_div - new_node_eid_delete.size(0)) test_seed = torch.arange(trainval_div - new_node_eid_delete.size(0), graph_no_new_node.num_edges()) test_new_node_seed = torch.arange( trainval_div - new_node_eid_delete.size(0), graph_new_node.num_edges()) g_sampling = None if args.fast_mode else dgl.add_reverse_edges( graph_no_new_node, copy_edata=True) new_node_g_sampling = None if args.fast_mode else dgl.add_reverse_edges( graph_new_node, copy_edata=True) if not args.fast_mode: new_node_g_sampling.ndata[dgl.NID] = new_node_g_sampling.nodes() g_sampling.ndata[dgl.NID] = new_node_g_sampling.nodes() # we highly recommend that you always set the num_workers=0, otherwise the sampled subgraph may not be correct. train_dataloader = TemporalEdgeDataLoader(graph_no_new_node, train_seed, sampler, batch_size=args.batch_size, negative_sampler=neg_sampler, shuffle=False, drop_last=False, num_workers=0,
""" import os graph_list = [] label_list = [] homograph = "dataset/homograph" scenarios = os.listdir(homograph) for scenario in scenarios: filepath = "dataset/homograph/"+scenario graphs = os.listdir(filepath) for graph in graphs: glist, label_dict = dgl.load_graphs(filepath+'/'+graph) graph_list.append(glist[0]) for key, value in label_dict.items(): if key != 'Attack': label_list.append(0) else: label_list.append(1) print(len(graph_list)) print(label_list) """ # graph_list, label_list = dgl.load_graphs("dataset/homograph/YouTube/0.bin") u, v = th.tensor([0, 1, 2, 3]), th.tensor([1, 2, 3, 4]) g = dgl.graph((u,v), idtype=th.int32) g.ndata['feat'] = th.ones(5,2) g.edata['feat'] = th.ones(4,3) bg = dgl.add_reverse_edges(g, copy_ndata=True, copy_edata=True) print(bg)