Example 1
    def __init__(self, dataset):
        self.dataset = dataset
        cfg = self.CONFIG[dataset]

        rating = pd.read_csv(
            cfg['rating_path'], sep=cfg['rating_sep'], names=['user_id', 'item_id', 'rating'],
            usecols=[0, 1, 2], skiprows=1
        )
        kg = pd.read_csv(cfg['kg_path'], sep='\t', names=['head', 'relation', 'tail'])
        item2entity = pd.read_csv(cfg['item2id_path'], sep='\t', names=['item_id', 'entity_id'])

        rating = rating[rating['item_id'].isin(item2entity['item_id'])]
        rating.reset_index(drop=True, inplace=True)
        rating['user_id'] = LabelEncoder().fit_transform(rating['user_id'])
        item2entity = dict(zip(item2entity['item_id'], item2entity['entity_id']))
        rating['item_id'] = rating['item_id'].map(item2entity)
        rating['label'] = rating['rating'].apply(lambda r: int(r >= cfg['threshold']))
        rating = rating[rating['label'] == 1]
        user_item_graph = dgl.heterograph({
            ('user', 'rate', 'item'): (rating['user_id'].to_numpy(), rating['item_id'].to_numpy())
        })

        # Negative sampling
        neg_sampler = Uniform(1)
        nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
        u, v = user_item_graph.edges()
        self.user_item_graph = dgl.heterograph({('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))})
        self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])

        kg['relation'] = LabelEncoder().fit_transform(kg['relation'])
        # There are parallel edges: two entities may be linked by multiple edges with different relation types
        knowledge_graph = dgl.graph((kg['head'], kg['tail']))
        knowledge_graph.edata['relation'] = torch.tensor(kg['relation'].tolist())
        self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
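For reference, the positive/negative labelling pattern above can be reproduced on a toy graph. A minimal sketch, assuming only dgl and torch; the toy graph below is illustrative and not part of the original dataset:

import dgl
import torch
from dgl.dataloading.negative_sampler import Uniform

toy = dgl.heterograph({('user', 'rate', 'item'):
                       (torch.tensor([0, 0, 1]), torch.tensor([0, 1, 2]))})
neg_sampler = Uniform(1)  # one negative edge per positive edge
nu, nv = neg_sampler(toy, torch.arange(toy.num_edges()))
u, v = toy.edges()
# positives first, negatives after, matching the edata['label'] layout above
labeled = dgl.heterograph({('user', 'rate', 'item'):
                           (torch.cat([u, nu]), torch.cat([v, nv]))})
labeled.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])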
Example 2
def to_bidirected_with_reverse_mapping(g):
    """Makes a graph bidirectional, and returns a mapping array ``mapping`` where ``mapping[i]``
    is the reverse edge of edge ID ``i``.
    Does not work with graphs that have self-loops.
    """
    g_simple, mapping = dgl.to_simple(dgl.add_reverse_edges(g),
                                      return_counts='count',
                                      writeback_mapping=True)
    c = g_simple.edata['count']
    num_edges = g.num_edges()
    mapping_offset = torch.zeros(g_simple.num_edges() + 1,
                                 dtype=g_simple.idtype)
    mapping_offset[1:] = c.cumsum(0)
    idx = mapping.argsort()
    idx_uniq = idx[mapping_offset[:-1]]
    reverse_idx = torch.where(idx_uniq >= num_edges, idx_uniq - num_edges,
                              idx_uniq + num_edges)
    reverse_mapping = mapping[reverse_idx]

    # Correctness check
    src1, dst1 = g_simple.edges()
    src2, dst2 = g_simple.find_edges(reverse_mapping)
    assert torch.equal(src1, dst2)
    assert torch.equal(src2, dst1)
    return g_simple, reverse_mapping
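A quick usage sketch for the helper above, on an assumed toy graph; it exercises the same invariant the internal correctness check asserts:

import dgl
import torch

g = dgl.graph((torch.tensor([0, 1]), torch.tensor([1, 2])))
g_bi, rev = to_bidirected_with_reverse_mapping(g)
src, dst = g_bi.edges()
rsrc, rdst = g_bi.find_edges(rev)
assert torch.equal(src, rdst) and torch.equal(dst, rsrc)  # each edge's reverse points back at it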
Example 3
    def to_dgl(self: GraphFeaturiser, mol: Mol) -> dgl.DGLGraph:
        """Generates a DGL graph from a molecule.

        Args:
            mol: The molecule to featurise.

        Returns:
            A DGL graph of the featurised molecule.
        """
        num_atoms = mol.GetNumAtoms()
        bonds = mol.GetBonds()
        bond_from = [bond.GetBeginAtomIdx() for bond in bonds]
        bond_to = [bond.GetEndAtomIdx() for bond in bonds]

        g = dgl.graph((torch.tensor(bond_from), torch.tensor(bond_to)),
                      num_nodes=num_atoms)

        for key, atom_featuriser in self.atom_featurisers.items():
            atom_features = atom_featuriser.process_molecule(mol)
            g.ndata[key] = torch.tensor(atom_features, dtype=torch.float)

        for key, bond_featuriser in self.bond_featurisers.items():
            bond_features = [
                bond_featuriser.process_bond(bond) for bond in bonds
            ]
            g.edata[key] = torch.tensor(bond_features, dtype=torch.float)

        g = dgl.add_reverse_edges(g, copy_edata=True)

        if self.add_self_loops:
            g = dgl.add_self_loop(g)

        return g
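A hypothetical call site for to_dgl; the GraphFeaturiser construction is assumed to happen elsewhere, and the SMILES string is purely illustrative:

from rdkit import Chem

mol = Chem.MolFromSmiles('CCO')  # ethanol: 3 heavy atoms, 2 bonds
g = featuriser.to_dgl(mol)       # featuriser: a configured GraphFeaturiser (assumed)
assert g.num_nodes() == mol.GetNumAtoms()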
Example 4
    def __init__(self):
        g = OAGCoreDataset()[0]
        author_rank = load_author_rank()
        rating = pd.DataFrame(
            [[i, a] for i, (f, r) in enumerate(author_rank.items()) for a in r],
            columns=['user_id', 'item_id']
        )
        user_item_graph = dgl.heterograph(
            {('user', 'rate', 'item'): (rating['user_id'], rating['item_id'])},
            num_nodes_dict={'user': len(author_rank), 'item': g.num_nodes('author')}
        )

        # Negative sampling
        neg_sampler = Uniform(1)
        nu, nv = neg_sampler(user_item_graph, torch.arange(user_item_graph.num_edges()))
        u, v = user_item_graph.edges()
        self.user_item_graph = dgl.heterograph(
            {('user', 'rate', 'item'): (torch.cat([u, nu]), torch.cat([v, nv]))},
            num_nodes_dict={ntype: user_item_graph.num_nodes(ntype) for ntype in user_item_graph.ntypes}
        )
        self.user_item_graph.edata['label'] = torch.cat([torch.ones(u.shape[0]), torch.zeros(nu.shape[0])])

        knowledge_graph = dgl.to_homogeneous(dgl.node_type_subgraph(g, ['author', 'institution', 'paper']))
        knowledge_graph.edata['relation'] = knowledge_graph.edata[dgl.ETYPE]
        self.knowledge_graph = dgl.add_reverse_edges(knowledge_graph, copy_edata=True)
Example 5
def convert_mag_to_homograph(g, device):
    """
    Featurize node types that don't have input features (i.e. author,
    institution, field_of_study) by averaging their neighbor features.
    Then convert the graph to an undirected homogeneous graph.
    """
    src_writes, dst_writes = g.all_edges(etype="writes")
    src_topic, dst_topic = g.all_edges(etype="has_topic")
    src_aff, dst_aff = g.all_edges(etype="affiliated_with")
    new_g = dgl.heterograph({
        ("paper", "written", "author"): (dst_writes, src_writes),
        ("paper", "has_topic", "field"): (src_topic, dst_topic),
        ("author", "aff", "inst"): (src_aff, dst_aff)
    })
    new_g = new_g.to(device)
    new_g.nodes["paper"].data["feat"] = g.nodes["paper"].data["feat"]
    new_g["written"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    new_g["has_topic"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    new_g["aff"].update_all(fn.copy_u("feat", "m"), fn.mean("m", "feat"))
    g.nodes["author"].data["feat"] = new_g.nodes["author"].data["feat"]
    g.nodes["institution"].data["feat"] = new_g.nodes["inst"].data["feat"]
    g.nodes["field_of_study"].data["feat"] = new_g.nodes["field"].data["feat"]

    # Convert to homogeneous graph
    # Get DGL type id for paper type
    target_type_id = g.get_ntype_id("paper")
    g = dgl.to_homogeneous(g, ndata=["feat"])
    g = dgl.add_reverse_edges(g, copy_ndata=True)
    # Mask for paper nodes
    g.ndata["target_mask"] = g.ndata[dgl.NTYPE] == target_type_id
    return g
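A hedged follow-up sketch (not from the original file): the target_mask set above can be used to pull out the paper rows of the homogeneous graph. hetero_g is assumed to be an ogbn-mag style heterogeneous graph loaded elsewhere:

import torch

homo_g = convert_mag_to_homograph(hetero_g, "cpu")
paper_idx = torch.nonzero(homo_g.ndata["target_mask"], as_tuple=True)[0]
paper_feat = homo_g.ndata["feat"][paper_idx]  # features of paper nodes only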
Example 6
def same_direction(scenario, graph_id):
    # Edges for processes reading and writing files are treated as having the same direction
    edge_types = ['execve', 'access', 'mmap2', 'open', 'fstat', 'close', 'read', 'stat', 'write', 'unlink', 'clone',
                  'waitpid', 'bind', 'listen', 'chmod', 'connect', 'writev', 'recv', 'ftruncate', 'sendmsg', 'send',
                  'recvmsg', 'accept', 'sendto', 'recvfrom', 'truncate']
    node_types = ['process', 'file', 'MAP_ANONYMOUS', 'stdin', 'stdout', 'stderr', 'NA', 'thread']

    data_path = 'dataset/split_data/' + scenario + '/' + str(graph_id) + '.csv'
    # data_entry: source-id, source-type, destination-id, destination-type, edge-type, timestamp, graph-id

    # Indexes in this list are node IDs in the graph; values are the original IDs from the raw data
    node_original_id = []

    # One-hot encoding for node type and edge type
    node_feats, edge_feats = [], []

    # Source and destination nodes of the homogeneous graph
    u, v = [], []

    with open(data_path, 'r') as file:
        reader = csv.reader(file)
        for line in reader:
            src_id = int(line[0])
            src_type = line[1]
            dst_id = int(line[2])
            dst_type = line[3]
            edge_type = line[4]
            timestamp = int(line[5])
            if src_id not in node_original_id:
                node_original_id.append(src_id)
            u.append(node_original_id.index(src_id))
            if dst_id not in node_original_id:
                node_original_id.append(dst_id)
            v.append(node_original_id.index(dst_id))

            # one-hot encoding for node and edge features
            src_node_feat = [0]*len(node_types)
            src_node_feat[node_types.index(src_type)] = 1
            if node_original_id.index(src_id)+1 > len(node_feats):
                node_feats[len(node_feats) : node_original_id.index(src_id)+1] = [[0]*len(node_types)]
                node_feats[node_original_id.index(src_id)] = src_node_feat
            dst_node_feat = [0]*len(node_types)
            dst_node_feat[node_types.index(dst_type)] = 1
            if node_original_id.index(dst_id)+1 > len(node_feats):
                node_feats[len(node_feats) : node_original_id.index(dst_id)+1] = [[0]*len(node_types)]
                node_feats[node_original_id.index(dst_id)] = dst_node_feat
            edge_feat = [0]*len(edge_types)
            edge_feat[edge_types.index(edge_type)] = 1
            edge_feats.append(edge_feat)

    u_ids, v_ids = th.tensor(u), th.tensor(v)
    node_feats, edge_feats = th.tensor(node_feats), th.tensor(edge_feats)
    g = dgl.graph((u_ids, v_ids), idtype=th.int32)
    g.ndata['feat'] = node_feats
    g.edata['feat'] = edge_feats

    # Add reverse edges to eliminate 0-in-degree nodes
    bg = dgl.add_reverse_edges(g, copy_ndata=True, copy_edata=True)
    return bg
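The list.index() lookups above cost O(n) per call, which is quadratic over the whole file. A dict-based remapping is an equivalent, faster idiom; this is a sketch of the swapped-in technique, not code from the original file:

node_ids = {}  # raw id -> contiguous graph id

def local_id(raw_id):
    # assigns the next free id on first sight, then reuses it afterwards
    return node_ids.setdefault(raw_id, len(node_ids))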
Example 7
def get_current_ts(pos_graph, neg_graph):
    with pos_graph.local_scope():
        pos_graph_ = dgl.add_reverse_edges(pos_graph, copy_edata=True)
        pos_graph_.update_all(fn.copy_e('timestamp', 'times'),
                              fn.max('times', 'ts'))
        current_ts = pos_ts = pos_graph_.ndata['ts']
        num_pos_nodes = pos_graph_.num_nodes()
    with neg_graph.local_scope():
        neg_graph_ = dgl.add_reverse_edges(neg_graph)
        neg_graph_.edata['timestamp'] = pos_graph_.edata['timestamp']
        neg_graph_.update_all(fn.copy_e('timestamp', 'times'),
                              fn.max('times', 'ts'))
        num_pos_nodes = torch.where(pos_graph_.ndata['ts'] > 0)[0].shape[0]
        pos_ts = pos_graph_.ndata['ts'][:num_pos_nodes]
        neg_ts = neg_graph_.ndata['ts'][num_pos_nodes:]
        current_ts = torch.cat([pos_ts, neg_ts])
    return current_ts, pos_ts, num_pos_nodes
Example 8
def processItem(item):
	graph, bidirectional, key, categories, tolerance = item
	graph_dict = {}
	vertices = graphVertices(graph)
	edges = graphEdges(graph)
	graph_dict["num_nodes"] = len(vertices)
	graph_dict["src"] = []
	graph_dict["dst"] = []
	graph_dict["node_labels"] = {}
	nodes = []
	graph_edges = []

	# This is a hack, please replace
	test_list = []
	for i in range(len(vertices)):
		vDict = vertices[i].GetDictionary()
		vLabel = DictionaryValueAtKey.processItem([vDict, key])
		graph_dict["node_labels"][i] = vLabel
		nodes.append(i)
		# This is a hack, please replace
		test_list.append(vLabel)

	# Here we need to call oneHotEncode to create the one-hot encoding.
	# What is the input list we need here?
	# This is a hack, please replace.
	one_hot_encoded_list = oneHotEncode(test_list, categories)
	print("Categories", categories)
	print("Test List", test_list)
	print("One-Hot-Encoded List",one_hot_encoded_list)
	# Do something with the one_hot_encoded list

	for i in range(len(edges)):
		e = edges[i]
		sv = e.StartVertex()
		ev = e.EndVertex()
		sn = nodes[vertexIndex(sv, vertices, tolerance)]
		en = nodes[vertexIndex(ev, vertices, tolerance)]
		if [sn, en] not in graph_edges and [en, sn] not in graph_edges:
			graph_edges.append([sn,en])

	for anEdge in graph_edges:
		graph_dict["src"].append(anEdge[0])
		graph_dict["dst"].append(anEdge[1])

	# Create DGL graph
	src = np.array(graph_dict["src"])
	dst = np.array(graph_dict["dst"])
	num_nodes = graph_dict["num_nodes"]
	# Create a graph
	dgl_graph = dgl.graph((src, dst), num_nodes=num_nodes)
	dgl_graph.ndata['attr'] = torch.ones(num_nodes, 1)
	if bidirectional:
		dgl_graph = dgl.add_reverse_edges(dgl_graph)
	return dgl_graph
Example 9
def processItem(item):
	graphs_file_path, edges_file_path, nodes_file_path, graph_id_header, graph_label_header, num_nodes_header, src_header, dst_header, node_label_header, node_attr_key, categories, bidirectional = item

	graphs = pd.read_csv(graphs_file_path)
	edges = pd.read_csv(edges_file_path)
	nodes = pd.read_csv(nodes_file_path)
	dgl_graphs = []
	labels = []

	# Create a graph for each graph ID from the edges table.
	# First process the graphs table into two dictionaries with graph IDs as keys.
	# The label and number of nodes are values.
	label_dict = {}
	num_nodes_dict = {}
	for _, row in graphs.iterrows():
		label_dict[row[graph_id_header]] = row[graph_label_header]
		num_nodes_dict[row[graph_id_header]] = row[num_nodes_header]
	# For the edges, first group the table by graph IDs.
	edges_group = edges.groupby(graph_id_header)
	# For the nodes, first group the table by graph IDs.
	nodes_group = nodes.groupby(graph_id_header)
	# For each graph ID...
	for graph_id in edges_group.groups:
		graph_dict = {}
		graph_dict[src_header] = []
		graph_dict[dst_header] = []
		graph_dict[node_label_header] = {}
		graph_dict["node_features"] = []
		num_nodes = num_nodes_dict[graph_id]
		graph_label = label_dict[graph_id]
		labels.append(graph_label)

		# Find the edges of this graph (its node count and label were read above).
		edges_of_id = edges_group.get_group(graph_id)
		src = edges_of_id[src_header].to_numpy()
		dst = edges_of_id[dst_header].to_numpy()

		# Find the nodes and their labels and features
		nodes_of_id = nodes_group.get_group(graph_id)
		node_labels = nodes_of_id[node_label_header]
		#graph_dict["node_labels"][graph_id] = node_labels

		for node_label in node_labels:
			graph_dict["node_features"].append(torch.tensor(oneHotEncode(node_label, categories)))
		# Create a graph and add it to the list of graphs and labels.
		dgl_graph = dgl.graph((src, dst), num_nodes=num_nodes)
		# Setting the node features as node_attr_key using onehotencoding of node_label
		dgl_graph.ndata[node_attr_key] = torch.stack(graph_dict["node_features"])
		if bidirectional:
			dgl_graph = dgl.add_reverse_edges(dgl_graph)		
		dgl_graphs.append(dgl_graph)
	return [dgl_graphs, labels]
Example 10
    def gen_mail(self, args, emb, input_nodes, pair_graph, frontier, mode='train'):
        pair_graph.ndata['feat'] = emb

        pair_graph = dgl.add_reverse_edges(pair_graph, copy_edata=True)

        pair_graph.update_all(MSG.get_edge_msg, fn.mean('m','msg')) 
        frontier.ndata['msg'] = torch.zeros((frontier.num_nodes(), self.nfeat_dim + 2))
        frontier.ndata['msg'][pair_graph.ndata[dgl.NID]] = pair_graph.ndata['msg'].to('cpu')

        for _ in range(args.n_layer):
            frontier.update_all(fn.copy_u('msg','m'), fn.mean('m','msg'))

        mail = MSG.msg2mail(frontier.ndata['mail'][input_nodes], frontier.ndata['msg'][input_nodes])
        return mail
Example 11
def processItem(item):
    file_path, categories, bidirectional = item
    graphs = []
    labels = []
    file = open(file_path)
    if file:
        lines = file.readlines()
        n_graphs = int(lines[0])
        index = 1
        for i in range(n_graphs):
            graph_dict = {}
            graph_dict["src"] = []
            graph_dict["dst"] = []
            graph_dict["node_labels"] = {}
            graph_dict["node_features"] = []
            line = lines[index].split()
            n_nodes = int(line[0])
            graph_dict["num_nodes"] = n_nodes
            graph_label = int(line[1])
            labels.append(graph_label)
            index += 1
            for j in range(n_nodes):
                line = lines[index + j].split()
                node_label = int(line[0])
                graph_dict["node_labels"][j] = node_label
                graph_dict["node_features"].append(
                    torch.tensor(oneHotEncode(node_label, categories)))
                adj_vertices = line[2:]
                for adj_vertex in adj_vertices:
                    graph_dict["src"].append(j)
                    graph_dict["dst"].append(int(adj_vertex))

            # Create DGL graph
            src = np.array(graph_dict["src"])
            dst = np.array(graph_dict["dst"])
            # Create a graph
            dgl_graph = dgl.graph((src, dst),
                                  num_nodes=graph_dict["num_nodes"])
            # Set the node features as 'node_attr' using a one-hot encoding of the node label
            dgl_graph.ndata['node_attr'] = torch.stack(
                graph_dict["node_features"])
            if bidirectional:
                dgl_graph = dgl.add_reverse_edges(dgl_graph)
            graphs.append(dgl_graph)
            index += n_nodes
        file.close()
    return [graphs, labels]
Example 12
def load_dataset(name, device):
    """
    Load dataset and move graph and features to device
    """
    if name not in ["ogbn-products", "ogbn-arxiv", "ogbn-mag"]:
        raise RuntimeError("Dataset {} is not supported".format(name))
    dataset = DglNodePropPredDataset(name=name)
    splitted_idx = dataset.get_idx_split()
    train_nid = splitted_idx["train"]
    val_nid = splitted_idx["valid"]
    test_nid = splitted_idx["test"]
    g, labels = dataset[0]
    g = g.to(device)
    if name == "ogbn-arxiv":
        g = dgl.add_reverse_edges(g, copy_ndata=True)
        g = dgl.add_self_loop(g)
        g.ndata['feat'] = g.ndata['feat'].float()
    elif name == "ogbn-mag":
        # MAG is a heterogeneous graph. The task is to make prediction for
        # paper nodes
        labels = labels["paper"]
        train_nid = train_nid["paper"]
        val_nid = val_nid["paper"]
        test_nid = test_nid["paper"]
        g = convert_mag_to_homograph(g, device)
    else:
        g.ndata['feat'] = g.ndata['feat'].float()
    n_classes = dataset.num_classes
    labels = labels.squeeze()
    evaluator = get_ogb_evaluator(name)

    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")

    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
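A hypothetical call site for the loader above; the dataset name and device are assumptions:

g, labels, n_classes, train_nid, val_nid, test_nid, evaluator = \
    load_dataset("ogbn-arxiv", "cpu")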
Example 13
        device = th.device('cuda:%d' % args.gpu)
    else:
        device = th.device('cpu')

    # load ogbn-products data
    #data = DglNodePropPredDataset(name='ogbn-products')
    data = DglNodePropPredDataset(name="ogbn-" + args.dataset,
                                  root='torch_geometric_data/')
    splitted_idx = data.get_idx_split()
    train_idx, val_idx, test_idx = splitted_idx['train'], splitted_idx[
        'valid'], splitted_idx['test']
    graph, labels = data[0]
    n_classes = (labels.max() + 1).item()
    graph = graph.to(device)
    if args.dataset == "arxiv":
        graph = dgl.add_reverse_edges(graph, copy_ndata=True)
        graph = dgl.add_self_loop(graph)
        graph.ndata['feat'] = graph.ndata['feat'].float()
        labels = labels[:, 0].to(device)
    elif args.dataset == "ogbn-mag":
        labels = labels["paper"]
        train_idx = train_idx["paper"]
        val_idx = val_idx["paper"]
        test_idx = test_idx["paper"]
        graph = convert_mag_to_homograph(graph, device)
        labels = labels[:, 0].to(device)
    elif args.dataset == "proteins":
        n_classes = labels.shape[1]
        graph.update_all(fn.copy_e("feat", "feat_copy"),
                         fn.sum("feat_copy", "feat"))
        #one_hot = th.zeros(graph.number_of_nodes(), n_classes)
Example 14
def train(args, logger):
    task_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    Path("./saved_models/").mkdir(parents=True, exist_ok=True)
    Path("./pretrained_models/").mkdir(parents=True, exist_ok=True)
    MODEL_SAVE_PATH = './saved_models/'
    Pretrained_MODEL_PATH = './pretrained_models/'
    get_model_name = lambda part: f'{part}-{args.data}-{args.tasks}-{args.prefix}.pth'
    get_pretrain_model_name = lambda part: f'{part}-{args.data}-LP-{args.prefix}.pth'
    device_string = 'cuda:{}'.format(args.gpu) if torch.cuda.is_available() and args.gpu >=0 else 'cpu'
    print('Model training with ' + device_string)
    device = torch.device(device_string)
    


    g = load_graphs(f"./data/{args.data}.dgl")[0][0]
    
    efeat_dim = g.edata['feat'].shape[1]
    nfeat_dim = efeat_dim


    train_loader, val_loader, test_loader, num_val_samples, num_test_samples = dataloader(args, g)


    encoder = Encoder(args, nfeat_dim, n_head=args.n_head, dropout=args.dropout).to(device)
    decoder = Decoder(args, nfeat_dim).to(device)
    msg2mail = Msg2Mail(args, nfeat_dim)
    fraud_sampler = frauder_sampler(g)

    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay)
    scheduler_lr = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40)
    if args.warmup:
        scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=3, after_scheduler=scheduler_lr)
        optimizer.zero_grad()
        optimizer.step()
    loss_fcn = torch.nn.BCEWithLogitsLoss()

    loss_fcn = loss_fcn.to(device)

    early_stopper = EarlyStopMonitor(logger=logger, max_round=args.patience, higher_better=True)

    if args.pretrain:
        logger.info(f'Loading the linkpred pretrained attention based encoder model')
        encoder.load_state_dict(torch.load(Pretrained_MODEL_PATH+get_pretrain_model_name('Encoder')))

    for epoch in range(args.n_epoch):
        # reset node state
        g.ndata['mail'] = torch.zeros((g.num_nodes(), args.n_mail, nfeat_dim+2), dtype=torch.float32) 
        g.ndata['feat'] = torch.zeros((g.num_nodes(), nfeat_dim), dtype=torch.float32)  # initialised to zero; other initialisations are possible
        g.ndata['last_update'] = torch.zeros((g.num_nodes()), dtype=torch.float32) 
        encoder.train()
        decoder.train()
        start_epoch = time.time()
        m_loss = []
        logger.info('start {} epoch, current optim lr is {}'.format(epoch, optimizer.param_groups[0]['lr']))
        for batch_idx, (input_nodes, pos_graph, neg_graph, blocks, frontier, current_ts) in enumerate(train_loader):
            

            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device) if neg_graph is not None else None
            

            if not args.no_time or not args.no_pos:
                current_ts, pos_ts, num_pos_nodes = get_current_ts(args, pos_graph, neg_graph)
                pos_graph.ndata['ts'] = current_ts
            else:
                current_ts, pos_ts, num_pos_nodes = None, None, None
            
            neg_graph_rev = dgl.add_reverse_edges(neg_graph) if neg_graph is not None else None
            emb, _ = encoder(dgl.add_reverse_edges(pos_graph), neg_graph_rev, num_pos_nodes)
            if batch_idx != 0:
                if 'LP' not in args.tasks and args.balance:
                    neg_graph = fraud_sampler.sample_fraud_event(g, args.bs//5, current_ts.max().cpu()).to(device)
                logits, labels = decoder(emb, pos_graph, neg_graph)

                loss = loss_fcn(logits, labels)
                
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                m_loss.append(loss.item())


            # MSG Passing
            with torch.no_grad():
                mail = msg2mail.gen_mail(args, emb, input_nodes, pos_graph, frontier, 'train')

                if not args.no_time:
                    g.ndata['last_update'][pos_graph.ndata[dgl.NID][:num_pos_nodes]] = pos_ts.to('cpu')
                g.ndata['feat'][pos_graph.ndata[dgl.NID]] = emb.to('cpu')
                g.ndata['mail'][input_nodes] = mail
            if batch_idx % 100 == 1:
                gpu_mem = torch.cuda.max_memory_allocated() / 1.074e9 if torch.cuda.is_available() and args.gpu >= 0 else 0
                torch.cuda.empty_cache()
                mem_perc = psutil.virtual_memory().percent
                cpu_perc = psutil.cpu_percent(interval=None)
                output_string = f'Epoch {epoch} | Step {batch_idx}/{len(train_loader)} | CPU {cpu_perc:.1f}% | Sys Mem {mem_perc:.1f}% | GPU Mem {gpu_mem:.4f}GB '
                
                output_string += f'| {args.tasks} Loss {np.mean(m_loss):.4f}'

                logger.info(output_string)

        total_epoch_time = time.time() - start_epoch
        logger.info(' training epoch: {} took {:.4f}s'.format(epoch, total_epoch_time))
        val_ap, val_auc, val_acc, val_loss = eval_epoch(args, logger, g, val_loader, encoder, decoder, msg2mail, loss_fcn, device, num_val_samples)
        logger.info('Val {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(args.tasks, val_ap, val_auc, val_acc, val_loss))

        if args.warmup:
            scheduler_warmup.step(epoch)
        else:
            scheduler_lr.step()

        early_stopper_metric = val_ap if 'LP' in args.tasks else val_auc

        if early_stopper.early_stop_check(early_stopper_metric):
            logger.info('No improvement over {} epochs, stop training'.format(early_stopper.max_round))
            logger.info(f'Loading the best model at epoch {early_stopper.best_epoch}')
            encoder.load_state_dict(torch.load(MODEL_SAVE_PATH+get_model_name('Encoder')))
            decoder.load_state_dict(torch.load(MODEL_SAVE_PATH+get_model_name('Decoder')))

            test_result = [early_stopper.best_ap, early_stopper.best_auc, early_stopper.best_acc, early_stopper.best_loss]
            break

        test_ap, test_auc, test_acc, test_loss = eval_epoch(args, logger, g, test_loader, encoder, decoder, msg2mail, loss_fcn, device, num_test_samples)
        logger.info('Test {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(args.tasks, test_ap, test_auc, test_acc, test_loss))
        test_result = [test_ap, test_auc, test_acc, test_loss]

        if early_stopper.best_epoch == epoch: 
            early_stopper.best_ap = test_ap
            early_stopper.best_auc = test_auc
            early_stopper.best_acc = test_acc
            early_stopper.best_loss = test_loss
            logger.info(f'Saving the best model at epoch {early_stopper.best_epoch}')
            torch.save(encoder.state_dict(), MODEL_SAVE_PATH+get_model_name('Encoder'))
            torch.save(decoder.state_dict(), MODEL_SAVE_PATH+get_model_name('Decoder'))
Example 15
#
# This tutorial loads the dataset from the ``ogb`` package as in the
# :doc:`previous tutorial <L1_large_node_classification>`.
#

import dgl
import torch
import numpy as np
from ogb.nodeproppred import DglNodePropPredDataset

dataset = DglNodePropPredDataset('ogbn-arxiv')
device = 'cpu'  # change to 'cuda' for GPU

graph, node_labels = dataset[0]
# Add reverse edges since ogbn-arxiv is unidirectional.
graph = dgl.add_reverse_edges(graph)
print(graph)
print(node_labels)

node_features = graph.ndata['feat']
node_labels = node_labels[:, 0]
num_features = node_features.shape[1]
num_classes = (node_labels.max() + 1).item()
print('Number of classes:', num_classes)

idx_split = dataset.get_idx_split()
train_nids = idx_split['train']
valid_nids = idx_split['valid']
test_nids = idx_split['test']

######################################################################
Example 16
def eval_epoch(args, logger, g, dataloader, encoder, decoder, msg2mail,
               loss_fcn, device, num_samples):

    m_ap, m_auc, m_acc = [[], [], []] if 'LP' in args.tasks else [0, 0, 0]

    labels_all = torch.zeros((num_samples)).long()
    logits_all = torch.zeros((num_samples))

    attn_weight_all = torch.zeros((num_samples, args.n_mail))

    m_loss = []
    m_infer_time = []
    with torch.no_grad():
        encoder.eval()
        decoder.eval()
        loss = torch.tensor(0)
        for batch_idx, (input_nodes, pos_graph, neg_graph, blocks, frontier,
                        current_ts) in enumerate(dataloader):
            n_sample = pos_graph.num_edges()
            start_idx = batch_idx * n_sample
            end_idx = min(num_samples, start_idx + n_sample)

            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device) if neg_graph is not None else None
            if not args.no_time or not args.no_pos:
                current_ts, pos_ts, num_pos_nodes = get_current_ts(
                    args, pos_graph, neg_graph)
                pos_graph.ndata['ts'] = current_ts
            else:
                current_ts, pos_ts, num_pos_nodes = None, None, None

            neg_graph_rev = dgl.add_reverse_edges(
                neg_graph) if neg_graph is not None else None

            start = time.time()
            emb, attn_weight = encoder(dgl.add_reverse_edges(pos_graph),
                                       neg_graph_rev, num_pos_nodes)
            #attn_weight_all[start_idx:end_idx] = attn_weight[:n_sample]

            logits, labels = decoder(emb, pos_graph, neg_graph)
            end = time.time() - start
            m_infer_time.append(end)

            loss = loss_fcn(logits, labels)
            m_loss.append(loss.item())
            mail = msg2mail.gen_mail(args, emb, input_nodes, pos_graph,
                                     frontier, 'val')
            if not args.no_time:
                g.ndata['last_update'][pos_graph.ndata[dgl.NID]
                                       [:num_pos_nodes]] = pos_ts.to('cpu')
            g.ndata['feat'][pos_graph.ndata[dgl.NID]] = emb.to('cpu')
            g.ndata['mail'][input_nodes] = mail

            labels = labels.long()
            logits = logits.sigmoid()
            if 'LP' in args.tasks:
                pred = logits > 0.5
                m_ap.append(average_precision(logits, labels).cpu().numpy())
                m_auc.append(auroc(logits, labels).cpu().numpy())
                m_acc.append(accuracy(pred, labels).cpu().numpy())
            else:
                labels_all[start_idx:end_idx] = labels
                logits_all[start_idx:end_idx] = logits

    if 'LP' in args.tasks:
        ap, auc, acc = np.mean(m_ap), np.mean(m_auc), np.mean(m_acc)
    else:
        pred_all = logits_all > 0.5
        ap = average_precision(logits_all, labels_all).cpu().item()
        auc = auroc(logits_all, labels_all).cpu().item()
        acc = accuracy(pred_all, labels_all).cpu().item()

        fprs, tprs, thresholds = roc(logits_all, labels_all)
        fpr_l, tpr_l, thres_l = get_TPR_FPR_metrics(fprs, tprs, thresholds)
        print_tp_fp_thres(args.tasks, logger, fpr_l, tpr_l, thres_l)

    print('Total inference time', np.sum(m_infer_time))
    logger.info(attn_weight_all.mean(0))
    encoder.train()
    decoder.train()
    return ap, auc, acc, np.mean(m_loss)
Example 17
# The original edge feature of each node in sg2
print("original edge feature of each node in sg2: ")
print(sg2.edata['a'])

######################################################################
# Another common transformation is to add a reverse edge for each edge in
# the original graph with ``dgl.add_reverse_edges``.
#
# .. note::
#
#    If you have an undirected graph, it is better to convert it
#    into a bidirected graph first by adding reverse edges.
#

print("add reverse edges: ")
newg = dgl.add_reverse_edges(g)
newg.edges()
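# A minimal added sketch (assumption: ``g`` might already contain some
# reverse edges): ``dgl.to_bidirected`` also deduplicates the resulting
# parallel edges, which ``dgl.add_reverse_edges`` does not.
bidirected = dgl.to_bidirected(g)
print(bidirected.edges())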

######################################################################
# Loading and Saving Graphs
# -------------------------
#
# You can save a graph or a list of graphs via ``dgl.save_graphs`` and
# load them back with ``dgl.load_graphs``.
#

# Save graphs
print(
    "-----------------------------------------------------------------------------------"
)
print("Step 5: Loading and Saving Graphs: ")
Example 18
def main():
    # check cuda
    device = f'cuda:{args.gpu}' if torch.cuda.is_available(
    ) and args.gpu >= 0 else 'cpu'
    # load data
    dataset = DglNodePropPredDataset(name=args.dataset)
    evaluator = Evaluator(name=args.dataset)

    split_idx = dataset.get_idx_split()
    g, labels = dataset[
        0]  # graph: DGLGraph object, label: torch tensor of shape (num_nodes, num_tasks)

    if args.dataset == 'ogbn-arxiv':
        if args.model == 'gat':
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g = g.add_self_loop()
        else:
            g = dgl.to_bidirected(g, copy_ndata=True)

        feat = g.ndata['feat']
        feat = (feat - feat.mean(0)) / feat.std(0)
        g.ndata['feat'] = feat

    g = g.to(device)
    feats = g.ndata['feat']
    labels = labels.to(device)

    # load masks for train / validation / test
    train_idx = split_idx["train"].to(device)
    valid_idx = split_idx["valid"].to(device)
    test_idx = split_idx["test"].to(device)

    n_features = feats.size()[-1]
    n_classes = dataset.num_classes

    # load model
    if args.model == 'mlp':
        model = MLP(n_features, args.hid_dim, n_classes, args.num_layers,
                    args.dropout)
    elif args.model == 'linear':
        model = MLPLinear(n_features, n_classes)
    elif args.model == 'gat':
        model = GAT(in_feats=n_features,
                    n_classes=n_classes,
                    n_hidden=args.hid_dim,
                    n_layers=args.num_layers,
                    n_heads=args.n_heads,
                    activation=F.relu,
                    dropout=args.dropout,
                    attn_drop=args.attn_drop)
    else:
        raise NotImplementedError(f'Model {args.model} is not supported.')

    model = model.to(device)
    print(f'Model parameters: {sum(p.numel() for p in model.parameters())}')

    if args.pretrain:
        print('---------- Before ----------')
        model.load_state_dict(
            torch.load(f'base/{args.dataset}-{args.model}.pt'))
        model.eval()

        if args.model == 'gat':
            y_soft = model(g, feats).exp()
        else:
            y_soft = model(feats).exp()

        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')

        print('---------- Correct & Smoothing ----------')
        cs = CorrectAndSmooth(num_correction_layers=args.num_correction_layers,
                              correction_alpha=args.correction_alpha,
                              correction_adj=args.correction_adj,
                              num_smoothing_layers=args.num_smoothing_layers,
                              smoothing_alpha=args.smoothing_alpha,
                              smoothing_adj=args.smoothing_adj,
                              scale=args.scale)

        mask_idx = torch.cat([train_idx, valid_idx])
        if args.model != 'gat':
            y_soft = cs.correct(g, y_soft, labels[mask_idx], mask_idx)
        y_soft = cs.smooth(g, y_soft, labels[mask_idx], mask_idx)
        y_pred = y_soft.argmax(dim=-1, keepdim=True)
        valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Valid acc: {valid_acc:.4f} | Test acc: {test_acc:.4f}')
    else:
        if args.model == 'gat':
            opt = optim.RMSprop(model.parameters(), lr=args.lr)
        else:
            opt = optim.Adam(model.parameters(), lr=args.lr)

        best_acc = 0
        best_model = copy.deepcopy(model)

        # training
        print('---------- Training ----------')
        for i in range(args.epochs):
            if args.model == 'gat':
                adjust_learning_rate(opt, args.lr, i)

            model.train()
            opt.zero_grad()

            if args.model == 'gat':
                logits = model(g, feats)
            else:
                logits = model(feats)

            train_loss = F.nll_loss(logits[train_idx],
                                    labels.squeeze(1)[train_idx])
            train_loss.backward()

            opt.step()

            model.eval()
            with torch.no_grad():
                if args.model == 'gat':
                    logits = model(g, feats)
                else:
                    logits = model(feats)

                y_pred = logits.argmax(dim=-1, keepdim=True)

                train_acc = evaluate(y_pred, labels, train_idx, evaluator)
                valid_acc = evaluate(y_pred, labels, valid_idx, evaluator)

                print(
                    f'Epoch {i} | Train loss: {train_loss.item():.4f} | Train acc: {train_acc:.4f} | Valid acc {valid_acc:.4f}'
                )

                if valid_acc > best_acc:
                    best_acc = valid_acc
                    best_model = copy.deepcopy(model)

        # testing & saving model
        print('---------- Testing ----------')
        best_model.eval()

        if args.model == 'gat':
            logits = best_model(g, feats)
        else:
            logits = best_model(feats)

        y_pred = logits.argmax(dim=-1, keepdim=True)
        test_acc = evaluate(y_pred, labels, test_idx, evaluator)
        print(f'Test acc: {test_acc:.4f}')

        if not os.path.exists('base'):
            os.makedirs('base')

        torch.save(best_model.state_dict(),
                   f'base/{args.dataset}-{args.model}.pt')
Example 19
def load_dataset(device, args):
    """
    Load dataset and move graph and features to device
    """
    if args.dataset in [
            "reddit", "cora", "ppi", "ppi_large", "yelp", "flickr"
    ]:
        # raise RuntimeError("Dataset {} is not supported".format(name))
        if args.dataset == "reddit":
            from dgl.data import RedditDataset
            data = RedditDataset(self_loop=True)
            g = data[0]
            g = dgl.add_self_loop(g)
            n_classes = data.num_classes
        elif args.dataset == "cora":
            from dgl.data import CitationGraphDataset
            data = CitationGraphDataset('cora',
                                        raw_dir=os.path.join(
                                            args.data_dir, 'cora'))
            g = data[0]
            g = dgl.remove_self_loop(g)
            g = dgl.add_self_loop(g)
            n_classes = data.num_classes
        elif args.dataset == "ppi":
            data = load_ppi_data(args.data_dir)
            g = data.g
            n_classes = data.num_classes
        elif args.dataset == "ppi_large":
            data = load_ppi_large_data()
            g = data.g
            n_classes = data.num_classes
        elif args.dataset == "yelp":
            from torch_geometric.datasets import Yelp
            pyg_data = Yelp(os.path.join(args.data_dir, 'yelp'))[0]
            feat = pyg_data.x
            labels = pyg_data.y
            u, v = pyg_data.edge_index
            g = dgl.graph((u, v))
            g.ndata['feat'] = feat
            g.ndata['label'] = labels
            g.ndata['train_mask'] = pyg_data.train_mask
            g.ndata['val_mask'] = pyg_data.val_mask
            g.ndata['test_mask'] = pyg_data.test_mask
            n_classes = labels.size(1)
        elif args.dataset == "flickr":
            from torch_geometric.datasets import Flickr
            pyg_data = Flickr(os.path.join(args.data_dir, "flickr"))[0]
            feat = pyg_data.x
            labels = pyg_data.y
            # labels = torch.argmax(labels, dim=1)
            u, v = pyg_data.edge_index
            g = dgl.graph((u, v))
            g.ndata['feat'] = feat
            g.ndata['label'] = labels
            g.ndata['train_mask'] = pyg_data.train_mask
            g.ndata['val_mask'] = pyg_data.val_mask
            g.ndata['test_mask'] = pyg_data.test_mask
            n_classes = labels.max().item() + 1

        train_mask = g.ndata['train_mask']
        val_mask = g.ndata['val_mask']
        test_mask = g.ndata['test_mask']
        train_nid = train_mask.nonzero().squeeze().long()
        val_nid = val_mask.nonzero().squeeze().long()
        test_nid = test_mask.nonzero().squeeze().long()
        g = g.to(device)
        labels = g.ndata['label']

    else:
        dataset = DglNodePropPredDataset(name=args.dataset, root=args.data_dir)
        splitted_idx = dataset.get_idx_split()
        train_nid = splitted_idx["train"]
        val_nid = splitted_idx["valid"]
        test_nid = splitted_idx["test"]
        g, labels = dataset[0]
        n_classes = dataset.num_classes
        g = g.to(device)

        if args.dataset == "ogbn-arxiv":
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g = dgl.add_self_loop(g)
            g.ndata['feat'] = g.ndata['feat'].float()

        elif args.dataset == "ogbn-papers100M":
            g = dgl.add_reverse_edges(g, copy_ndata=True)
            g.ndata['feat'] = g.ndata['feat'].float()
            labels = labels.long()

        elif args.dataset == "ogbn-mag":
            # MAG is a heterogeneous graph. The task is to make prediction for
            # paper nodes
            path = os.path.join(args.emb_path, f"{args.pretrain_model}_mag")
            labels = labels["paper"]
            train_nid = train_nid["paper"]
            val_nid = val_nid["paper"]
            test_nid = test_nid["paper"]
            features = g.nodes['paper'].data['feat']
            author_emb = torch.load(os.path.join(path, "author.pt"),
                                    map_location=torch.device("cpu")).float()
            topic_emb = torch.load(os.path.join(path, "field_of_study.pt"),
                                   map_location=torch.device("cpu")).float()
            institution_emb = torch.load(
                os.path.join(path, "institution.pt"),
                map_location=torch.device("cpu")).float()

            g.nodes["author"].data["feat"] = author_emb.to(device)
            g.nodes["institution"].data["feat"] = institution_emb.to(device)
            g.nodes["field_of_study"].data["feat"] = topic_emb.to(device)
            g.nodes["paper"].data["feat"] = features.to(device)
            paper_dim = g.nodes["paper"].data["feat"].shape[1]
            author_dim = g.nodes["author"].data["feat"].shape[1]
            if paper_dim != author_dim:
                paper_feat = g.nodes["paper"].data.pop("feat")
                rand_weight = torch.Tensor(paper_dim,
                                           author_dim).uniform_(-0.5, 0.5)
                g.nodes["paper"].data["feat"] = torch.matmul(
                    paper_feat, rand_weight.to(device))
                print(
                    f"Randomly project paper feature from dimension {paper_dim} to {author_dim}"
                )

            labels = labels.to(device).squeeze()
            n_classes = int(labels.max() - labels.min()) + 1

        else:
            g.ndata['feat'] = g.ndata['feat'].float()

        labels = labels.squeeze()

    evaluator = get_evaluator(args.dataset)

    print(f"# Nodes: {g.number_of_nodes()}\n"
          f"# Edges: {g.number_of_edges()}\n"
          f"# Train: {len(train_nid)}\n"
          f"# Val: {len(val_nid)}\n"
          f"# Test: {len(test_nid)}\n"
          f"# Classes: {n_classes}")

    return g, labels, n_classes, train_nid, val_nid, test_nid, evaluator
Example 20
    else:
        sampler = TemporalSampler(k=args.n_neighbors)
        edge_collator = TemporalEdgeCollator

    neg_sampler = dgl.dataloading.negative_sampler.Uniform(
        k=args.num_negative_samples)
    # Set train, validation, test and new-node test ids
    train_seed = torch.arange(int(TRAIN_SPLIT * graph_no_new_node.num_edges()))
    valid_seed = torch.arange(int(TRAIN_SPLIT * graph_no_new_node.num_edges()),
                              trainval_div - new_node_eid_delete.size(0))
    test_seed = torch.arange(trainval_div - new_node_eid_delete.size(0),
                             graph_no_new_node.num_edges())
    test_new_node_seed = torch.arange(
        trainval_div - new_node_eid_delete.size(0), graph_new_node.num_edges())

    g_sampling = None if args.fast_mode else dgl.add_reverse_edges(
        graph_no_new_node, copy_edata=True)
    new_node_g_sampling = None if args.fast_mode else dgl.add_reverse_edges(
        graph_new_node, copy_edata=True)
    if not args.fast_mode:
        new_node_g_sampling.ndata[dgl.NID] = new_node_g_sampling.nodes()
        g_sampling.ndata[dgl.NID] = g_sampling.nodes()

    # We highly recommend setting num_workers=0; otherwise the sampled subgraph may not be correct.
    train_dataloader = TemporalEdgeDataLoader(graph_no_new_node,
                                              train_seed,
                                              sampler,
                                              batch_size=args.batch_size,
                                              negative_sampler=neg_sampler,
                                              shuffle=False,
                                              drop_last=False,
                                              num_workers=0,
Example 21
"""
import os

graph_list = []
label_list = []

homograph = "dataset/homograph"
scenarios = os.listdir(homograph)
for scenario in scenarios:
    filepath = "dataset/homograph/"+scenario
    graphs = os.listdir(filepath)
    for graph in graphs:
        glist, label_dict = dgl.load_graphs(filepath+'/'+graph)
        graph_list.append(glist[0])
        for key, value in label_dict.items():
            if key != 'Attack':
                label_list.append(0)
            else:
                label_list.append(1)
print(len(graph_list))
print(label_list)
"""

# graph_list, label_list = dgl.load_graphs("dataset/homograph/YouTube/0.bin")
u, v = th.tensor([0, 1, 2, 3]), th.tensor([1, 2, 3, 4])
g = dgl.graph((u,v), idtype=th.int32)
g.ndata['feat'] = th.ones(5,2)
g.edata['feat'] = th.ones(4,3)
bg = dgl.add_reverse_edges(g, copy_ndata=True, copy_edata=True)
print(bg)