def check_score_func(func_name):
    batch_size = 10
    neg_sample_size = 10
    g, entity_emb, rel_emb = generate_rand_graph(100, func_name)
    hidden_dim = entity_emb.shape[1]
    ke_score_func = ke_score_funcs[func_name]
    model = BaseKEModel(ke_score_func, entity_emb, rel_emb)

    EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
    sampler = EdgeSampler(g,
                          batch_size=batch_size,
                          neg_sample_size=neg_sample_size,
                          negative_mode='PBG-head',
                          num_workers=1,
                          shuffle=False,
                          exclude_positive=False,
                          return_false_neg=False)

    for pos_g, neg_g in sampler:
        neg_g = create_neg_subgraph(pos_g, neg_g, True, True, g.number_of_nodes())
        pos_g.copy_from_parent()
        neg_g.copy_from_parent()
        score1 = F.reshape(model.predict_score(neg_g), (batch_size, -1))
        score2 = model.predict_neg_score(pos_g, neg_g)
        score2 = F.reshape(score2, (batch_size, -1))
        np.testing.assert_allclose(F.asnumpy(score1), F.asnumpy(score2),
                                   rtol=1e-5, atol=1e-5)
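# Hedged driver sketch: assuming ke_score_funcs is the dict of registered score
# functions referenced in check_score_func above, this runs the positive-vs-
# negative scoring consistency check for every score function it contains.
def check_all_score_funcs():
    for name in ke_score_funcs:
        check_score_func(name)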
def forward_test(self, pos_g, neg_g, logs, gpu_id=-1): """Do the forward and generate ranking results. Parameters ---------- pos_g : DGLGraph Graph holding positive edges. neg_g : DGLGraph Graph holding negative edges. logs : List Where to put results in. gpu_id : int Which gpu to accelerate the calculation. if -1 is provided, cpu is used. """ pos_g.ndata['emb'] = self.entity_emb(pos_g.ndata['id'], gpu_id, False) pos_g.edata['emb'] = self.relation_emb(pos_g.edata['id'], gpu_id, False) self.score_func.prepare(pos_g, gpu_id, False) batch_size = pos_g.number_of_edges() pos_scores = self.predict_score(pos_g) pos_scores = reshape(pos_scores, batch_size, -1) neg_scores = self.predict_neg_score( pos_g, neg_g, to_device=cuda, gpu_id=gpu_id, trace=False, neg_deg_sample=self.args.neg_deg_sample_eval) neg_scores = reshape(neg_scores, batch_size, -1) # We need to filter the positive edges in the negative graph. if self.args.eval_filter: filter_bias = reshape(neg_g.edata['bias'], batch_size, -1) if gpu_id >= 0: filter_bias = cuda(filter_bias, gpu_id) # find all indices where it is not false negative sample mask = filter_bias != -1 # To compute the rank of a positive edge among all negative edges, # we need to know how many negative edges have higher scores than # the positive edge. for i in range(batch_size): if self.args.eval_filter: # select all the true negative samples where its score >= positive sample ranking = F.asnumpy( F.sum(masked_select(neg_scores[i] >= pos_scores[i], mask[i]), dim=0) + 1) else: ranking = F.asnumpy( F.sum(neg_scores[i] >= pos_scores[i], dim=0) + 1) logs.append({ 'MRR': 1.0 / ranking, 'MR': float(ranking), 'HITS@1': 1.0 if ranking <= 1 else 0.0, 'HITS@3': 1.0 if ranking <= 3 else 0.0, 'HITS@10': 1.0 if ranking <= 10 else 0.0 })
def start(self):
    """Start the service loop of KVServer."""
    server_ip, server_port = self._addr.split(':')
    _receiver_wait(self._receiver, server_ip, int(server_port), self._client_count)

    _network_wait()  # wait for clients to start

    for ID, addr in self._client_namebook.items():
        client_ip, client_port = addr.split(':')
        _add_receiver_addr(self._sender, client_ip, int(client_port), ID)

    _sender_connect(self._sender)

    # Service loop
    while True:
        msg = _recv_kv_msg(self._receiver)
        if msg.type == KVMsgType.INIT:
            if msg.name not in self._is_init:
                # We hack the msg format here:
                # msg.id stores the shape of the target tensor;
                # msg.data has two rows: the first row is the init_type
                # ([0, 0] means 'zero' and [1, 1] means 'uniform'),
                # and the second row holds the min & max thresholds.
                data_shape = F.asnumpy(msg.id).tolist()
                row_0 = (F.asnumpy(msg.data).tolist())[0]
                row_1 = (F.asnumpy(msg.data).tolist())[1]
                init_type = 'zero' if row_0[0] == 0.0 else 'uniform'
                self._init_data(name=msg.name,
                                shape=data_shape,
                                init_type=init_type,
                                low=row_1[0],
                                high=row_1[1])
                self._is_init.add(msg.name)
        elif msg.type == KVMsgType.PUSH:
            self._push_handler(msg.name, msg.id, msg.data)
        elif msg.type == KVMsgType.PULL:
            res_tensor = self._pull_handler(msg.name, msg.id)
            back_msg = KVStoreMsg(type=KVMsgType.PULL_BACK,
                                  rank=self._server_id,
                                  name=msg.name,
                                  id=msg.id,
                                  data=res_tensor)
            _send_kv_msg(self._sender, back_msg, msg.rank)
        elif msg.type == KVMsgType.BARRIER:
            self._barrier_count += 1
            if self._barrier_count == self._client_count:
                back_msg = KVStoreMsg(type=KVMsgType.BARRIER,
                                      rank=self._server_id,
                                      name=None,
                                      id=None,
                                      data=None)
                for i in range(self._client_count):
                    _send_kv_msg(self._sender, back_msg, i)
                self._barrier_count = 0
        elif msg.type == KVMsgType.FINAL:
            print("Exit KVStore service, server ID: %d" % self._server_id)
            break  # exit loop
        else:
            raise RuntimeError('Unknown type of kvstore message: %d' % msg.type.value)
def save_cache(self, mg, src, dst, ntid, etid, ntypes, etypes):
    nx.write_gpickle(mg, os.path.join(self._dir, 'cached_mg.gpickle'))
    np.save(os.path.join(self._dir, 'cached_src.npy'), src)
    np.save(os.path.join(self._dir, 'cached_dst.npy'), dst)
    np.save(os.path.join(self._dir, 'cached_ntid.npy'), ntid)
    np.save(os.path.join(self._dir, 'cached_etid.npy'), etid)
    save_strlist(os.path.join(self._dir, 'cached_ntypes.txt'), ntypes)
    save_strlist(os.path.join(self._dir, 'cached_etypes.txt'), etypes)
    np.save(os.path.join(self._dir, 'cached_train_idx.npy'), F.asnumpy(self.train_idx))
    np.save(os.path.join(self._dir, 'cached_test_idx.npy'), F.asnumpy(self.test_idx))
    np.save(os.path.join(self._dir, 'cached_labels.npy'), F.asnumpy(self.labels))
def knn_graphE(x, k, istrain=False):
    """Transforms the given point set, whose coordinates are given as a matrix,
    to a directed graph. The predecessors of each point are its k-nearest
    neighbors.

    If a 3D tensor is given instead, each 2D submatrix is transformed into a
    separate graph and the graphs are then unioned.

    Parameters
    ----------
    x : Tensor
        The input tensor.

        If 2D, each row of ``x`` corresponds to a node.

        If 3D, a k-NN graph is constructed for each 2D submatrix and the graphs
        are then unioned.
    k : int
        The number of neighbors.
    istrain : bool
        If True, with probability 0.5 the nearest point is kept and the other
        k-1 neighbors are randomly sampled from the 1.5k nearest points.

    Returns
    -------
    DGLGraph
        The graph. The node IDs are in the same order as ``x``.
    """
    if F.ndim(x) == 2:
        x = F.unsqueeze(x, 0)
    n_samples, n_points, _ = F.shape(x)

    dist = pairwise_squared_distance(x)
    if istrain and np.random.rand() > 0.5:
        k_indices = F.argtopk(dist, round(1.5 * k), 2, descending=False)
        # keep index 0 (the nearest point) and randomly pick k-1 of the
        # remaining 1.5k-1 candidates
        rand_k = np.random.permutation(round(1.5 * k) - 1)[0:k - 1] + 1
        rand_k = np.append(rand_k, 0)
        k_indices = k_indices[:, :, rand_k]
    else:
        k_indices = F.argtopk(dist, k, 2, descending=False)

    dst = F.copy_to(k_indices, F.cpu())
    src = F.zeros_like(dst) + F.reshape(F.arange(0, n_points), (1, -1, 1))

    per_sample_offset = F.reshape(F.arange(0, n_samples) * n_points, (-1, 1, 1))
    dst += per_sample_offset
    src += per_sample_offset
    dst = F.reshape(dst, (-1,))
    src = F.reshape(src, (-1,))
    adj = sparse.csr_matrix(
        (F.asnumpy(F.zeros_like(dst) + 1), (F.asnumpy(dst), F.asnumpy(src))))

    g = DGLGraph(adj, readonly=True)
    return g
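# Hedged usage sketch: assuming the DGL backend F used above, this builds a 5-NN
# graph over 100 random 3-D points. With istrain=True the neighborhood is randomly
# perturbed (roughly half the time) as described in knn_graphE.
def knn_graphE_example():
    points = F.uniform((100, 3), F.float32, F.cpu(), -1, 1)
    g = knn_graphE(points, k=5, istrain=True)
    print(g.number_of_nodes(), g.number_of_edges())  # 100 nodes, 100 * 5 edges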
def main():
    parser = argparse.ArgumentParser(description='Partition a knowledge graph')
    parser.add_argument('--data_path', type=str, default='data',
                        help='root path of all datasets')
    parser.add_argument('--dataset', type=str, default='FB15k',
                        help='dataset name, under data_path')
    parser.add_argument('--data_files', type=str, default=None, nargs='+',
                        help='a list of data files, e.g. entity relation train valid test')
    parser.add_argument('--format', type=str, default='built_in',
                        help='the format of the dataset; it can be built_in, '
                             'raw_udd_{htr} or udd_{htr}')
    parser.add_argument('-k', '--num-parts', required=True, type=int,
                        help='The number of partitions')
    args = parser.parse_args()
    num_parts = args.num_parts

    print('load dataset..')
    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)

    print('construct graph...')
    src, etype_id, dst = dataset.train
    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
                               shape=[dataset.n_entities, dataset.n_entities])
    g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    g.edata['tid'] = F.tensor(etype_id, F.int64)

    print('partition graph...')
    part_dict = dgl.transform.metis_partition(g, num_parts, 1)

    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]

        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0])
        print('part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition'.format(
            part_id, part.number_of_nodes(), part.number_of_edges(),
            num_inner_nodes, num_inner_edges))
        tot_num_inner_edges += num_inner_edges

        part.copy_from_parent()

    print('write graph to txt file...')
    txt_file_graph = os.path.join(args.data_path, args.dataset)
    txt_file_graph = os.path.join(txt_file_graph, 'partition_')
    write_txt_graph(txt_file_graph, 'train.txt', part_dict,
                    g.number_of_nodes(), dataset.n_relations)

    print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
        g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
def push(self, name, id_tensor, data_tensor):
    """Push message to KVServer.

    Note that push() is an async operation that returns immediately after calling.

    Parameters
    ----------
    name : str
        data name
    id_tensor : tensor (mx.ndarray or torch.tensor)
        a vector storing the global data IDs
    data_tensor : tensor (mx.ndarray or torch.tensor)
        a tensor with the same row size as the data IDs
    """
    assert len(name) > 0, 'name cannot be empty.'
    assert F.ndim(id_tensor) == 1, 'ID must be a vector.'
    assert F.shape(id_tensor)[0] == F.shape(data_tensor)[0], \
        'The data must have the same row size as ID.'

    # partition data (we can move this part of code into C-api if needed)
    server_id = self._data_store[name + '-part-'][id_tensor]
    # sort index by server id
    sorted_id = F.tensor(np.argsort(F.asnumpy(server_id)))
    id_tensor = id_tensor[sorted_id]
    data_tensor = data_tensor[sorted_id]
    server, count = np.unique(F.asnumpy(server_id), return_counts=True)

    # push data to each server in order
    start = 0
    for idx in range(len(server)):
        end = start + count[idx]
        if start == end:  # no data for this server
            continue
        partial_id = id_tensor[start:end]
        partial_data = data_tensor[start:end]

        if server[idx] in self._local_server_id and not self._close_shared_mem:
            if name + '-g2l-' in self._has_data:
                local_id = self._data_store[name + '-g2l-'][partial_id]
            else:
                local_id = partial_id
            # push only the rows belonging to the local server
            self._push_handler(name + '-data-', local_id, partial_data, self._data_store)
        else:
            msg = KVStoreMsg(
                type=KVMsgType.PUSH,
                rank=self._client_id,
                name=name,
                id=partial_id,
                data=partial_data)
            _send_kv_msg(self._sender, msg, server[idx])

        start += count[idx]
def pull_model(self, client, pos_g, neg_g):
    with th.no_grad():
        entity_id = F.cat(seq=[pos_g.ndata["id"], neg_g.ndata["id"]], dim=0)
        relation_id = pos_g.edata["id"]
        entity_id = F.tensor(np.unique(F.asnumpy(entity_id)))
        relation_id = F.tensor(np.unique(F.asnumpy(relation_id)))

        l2g = client.get_local2global()
        global_entity_id = l2g[entity_id]

        entity_data = client.pull(name="entity_emb", id_tensor=global_entity_id)
        relation_data = client.pull(name="relation_emb", id_tensor=relation_id)

        self.entity_emb.emb[entity_id] = entity_data
        self.relation_emb.emb[relation_id] = relation_data
def k_fold_split(dataset, labels, task_id, k=5, log=True):
    """Sort molecules based on their label values for a task and then split them
    for k-fold cross validation by taking consecutive chunks.

    Parameters
    ----------
    dataset
        We assume ``len(dataset)`` gives the size for the dataset, ``dataset[i]``
        gives the ith datapoint and ``dataset.smiles[i]`` gives the SMILES for
        the ith datapoint.
    labels : tensor of shape (N, T)
        Dataset labels for all tasks. N for the number of datapoints and T for
        the number of tasks.
    task_id : int
        Index for the task.
    k : int
        Number of folds to use and should be no smaller than 2. Default to be 5.
    log : bool
        Whether to print a message at the start of preparing each fold.

    Returns
    -------
    list of 2-tuples
        Each element of the list represents a fold and is a 2-tuple
        ``(train_set, val_set)``. ``train_set`` and ``val_set`` also have
        ``len(dataset)`` and ``dataset[i]`` behaviors.
    """
    if not isinstance(labels, np.ndarray):
        labels = F.asnumpy(labels)
    task_labels = labels[:, task_id]
    sorted_indices = np.argsort(task_labels).tolist()

    return base_k_fold_split(partial(indices_split, indices=sorted_indices), dataset, k, log)
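# Hedged usage sketch: assuming `dataset` follows the conventions in the docstring
# above (len(), indexing, and a .smiles attribute) and `labels` is an (N, T) tensor,
# this splits on task 0 and iterates over the five (train, val) folds.
def k_fold_example(dataset, labels):
    for fold_id, (train_set, val_set) in enumerate(k_fold_split(dataset, labels, task_id=0, k=5)):
        print('fold', fold_id, '-', len(train_set), 'train /', len(val_set), 'val datapoints')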
def forward_test(self, pos_g, neg_g, logs, gpu_id=-1):
    pos_g.ndata['emb'] = self.entity_emb(pos_g.ndata['id'], gpu_id, False)
    pos_g.edata['emb'] = self.relation_emb(pos_g.edata['id'], gpu_id, False)

    batch_size = pos_g.number_of_edges()
    pos_scores = self.predict_score(pos_g)
    pos_scores = reshape(logsigmoid(pos_scores), batch_size, -1)

    neg_scores = self.predict_neg_score(pos_g, neg_g, to_device=cuda,
                                        gpu_id=gpu_id, trace=False)
    neg_scores = reshape(logsigmoid(neg_scores), batch_size, -1)

    # We need to filter the positive edges in the negative graph.
    filter_bias = reshape(neg_g.edata['bias'], batch_size, -1)
    if self.args.gpu >= 0:
        filter_bias = cuda(filter_bias, self.args.gpu)
    neg_scores += filter_bias

    # To compute the rank of a positive edge among all negative edges,
    # we need to know how many negative edges have higher scores than
    # the positive edge.
    rankings = F.sum(neg_scores > pos_scores, dim=1) + 1
    rankings = F.asnumpy(rankings)
    for i in range(batch_size):
        ranking = rankings[i]
        logs.append({
            'MRR': 1.0 / ranking,
            'MR': float(ranking),
            'HITS@1': 1.0 if ranking <= 1 else 0.0,
            'HITS@3': 1.0 if ranking <= 3 else 0.0,
            'HITS@10': 1.0 if ranking <= 10 else 0.0
        })
def write_txt_graph(path, file_name, part_dict, total_nodes, total_relations): partition_book = [0] * total_nodes for part_id in part_dict: print('write graph %d...' % part_id) # Get (h,r,t) triples partition_path = path + str(part_id) if not os.path.exists(partition_path): os.mkdir(partition_path) triple_file = os.path.join(partition_path, file_name) f = open(triple_file, 'w') graph = part_dict[part_id] src, dst = graph.all_edges(form='uv', order='eid') rel = graph.edata['tid'] assert len(src) == len(rel) src = F.asnumpy(src) dst = F.asnumpy(dst) rel = F.asnumpy(rel) for i in range(len(src)): f.write( str(src[i]) + '\t' + str(rel[i]) + '\t' + str(dst[i]) + '\n') f.close() # Get local2global l2g_file = os.path.join(partition_path, 'local_to_global.txt') f = open(l2g_file, 'w') pid = F.asnumpy(graph.parent_nid) for i in range(len(pid)): f.write(str(pid[i]) + '\n') f.close() # Update partition_book partition = F.asnumpy(graph.ndata['part_id']) for i in range(len(pid)): partition_book[pid[i]] = partition[i] # Write partition_book.txt for part_id in part_dict: partition_path = path + str(part_id) pb_file = os.path.join(partition_path, 'partition_book.txt') f = open(pb_file, 'w') for i in range(len(partition_book)): f.write(str(partition_book[i]) + '\n') f.close() # Write relation_count.txt for part_id in part_dict: partition_path = path + str(part_id) rel_count_file = os.path.join(partition_path, 'relation_count.txt') f = open(rel_count_file, 'w') f.write(str(total_relations) + '\n') f.close()
def main(): parser = argparse.ArgumentParser(description='Partition a graph') parser.add_argument('--data', required=True, type=str, help='The file path of the input graph in the DGL format.') parser.add_argument('-k', '--num-parts', required=True, type=int, help='The number of partitions') parser.add_argument('--num-hops', type=int, default=1, help='The number of hops of HALO nodes we include in a partition') parser.add_argument('-m', '--method', required=True, type=str, help='The partitioning method: random, metis') parser.add_argument('-o', '--output', required=True, type=str, help='The output directory of the partitioned results') args = parser.parse_args() data_path = args.data num_parts = args.num_parts num_hops = args.num_hops method = args.method output = args.output glist, _ = load_graphs(data_path) g = glist[0] if args.method == 'metis': part_dict = dgl.transform.metis_partition(g, num_parts, num_hops) elif args.method == 'random': node_parts = np.random.choice(num_parts, g.number_of_nodes()) part_dict = dgl.transform.partition_graph_with_halo(g, node_parts, num_hops) else: raise Exception('unknown partitioning method: ' + args.method) tot_num_inner_edges = 0 for part_id in part_dict: part = part_dict[part_id] num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0]) num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0]) print('part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition'.format( part_id, part.number_of_nodes(), part.number_of_edges(), num_inner_nodes, num_inner_edges)) tot_num_inner_edges += num_inner_edges # TODO I duplicate some node features. part.copy_from_parent() save_graphs(output + '/' + str(part_id) + '.dgl', [part]) print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format( g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
def get_partition_list(g, psize):
    p_gs = metis_partition(g, psize)
    graphs = []
    for k, val in p_gs.items():
        nids = val.ndata[dgl.NID]
        nids = F.asnumpy(nids)
        graphs.append(nids)
    return graphs
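# Hedged usage sketch: assuming `g` is a DGLGraph and metis_partition/dgl/F are
# imported as used in get_partition_list above, this splits the node IDs into
# four METIS partitions and reports the size of each part.
def partition_list_example(g):
    partition_list = get_partition_list(g, psize=4)
    for part_id, nids in enumerate(partition_list):
        print('partition', part_id, 'has', len(nids), 'nodes')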
def segmented_knn_graph(x, k, segs):
    """Transforms the given point set, whose coordinates are given as a matrix,
    to a directed graph. The predecessors of each point are its k-nearest
    neighbors.

    The matrices are concatenated along the first axis and segmented by ``segs``.
    Each block is transformed into a separate graph and the graphs are then
    unioned.

    Parameters
    ----------
    x : Tensor
        The input tensor.
    k : int
        The number of neighbors.
    segs : iterable of int
        Number of points in each point set. Must sum up to the number of rows
        in ``x``.

    Returns
    -------
    DGLGraph
        The graph. The node IDs are in the same order as ``x``.
    """
    n_total_points, _ = F.shape(x)
    offset = np.insert(np.cumsum(segs), 0, 0)

    h_list = F.split(x, segs, 0)
    dst = [
        F.argtopk(pairwise_squared_distance(h_g), k, 1, descending=False) + offset[i]
        for i, h_g in enumerate(h_list)
    ]
    dst = F.cat(dst, 0)
    src = F.arange(0, n_total_points).unsqueeze(1).expand(n_total_points, k)

    dst = F.reshape(dst, (-1,))
    src = F.reshape(src, (-1,))
    # specify the shape explicitly so nodes that never appear as neighbors are kept
    adj = sparse.csr_matrix(
        (F.asnumpy(F.zeros_like(dst) + 1), (F.asnumpy(dst), F.asnumpy(src))),
        shape=(n_total_points, n_total_points))

    g = DGLGraph(adj, readonly=True)
    return g
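# Hedged usage sketch: two point sets of 30 and 50 points stacked along the first
# axis; a 3-NN graph is built inside each segment, so node IDs 0-29 belong to the
# first set and 30-79 to the second. Uses the same DGL backend F as above.
def segmented_knn_example():
    x = F.uniform((80, 3), F.float32, F.cpu(), -1, 1)
    g = segmented_knn_graph(x, k=3, segs=[30, 50])
    print(g.number_of_nodes(), g.number_of_edges())  # 80 nodes, 80 * 3 edges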
def check(self, eval_type):
    edges = self.get_edges(eval_type)
    subg = self.g.edge_subgraph(edges)
    if eval_type == 'valid':
        data = self.valid
    elif eval_type == 'test':
        data = self.test

    subg.copy_from_parent()
    src, dst, eid = subg.all_edges('all', order='eid')
    src_id = subg.ndata['id'][src]
    dst_id = subg.ndata['id'][dst]
    etype = subg.edata['id'][eid]

    orig_src = np.array([t[0] for t in data])
    orig_etype = np.array([t[1] for t in data])
    orig_dst = np.array([t[2] for t in data])

    np.testing.assert_equal(F.asnumpy(src_id), orig_src)
    np.testing.assert_equal(F.asnumpy(dst_id), orig_dst)
    np.testing.assert_equal(F.asnumpy(etype), orig_etype)
def pull(self, name, ID):
    """Pull sparse message from KVServer.

    Note that we assume the row IDs in ID are in ascending order.

    Parameters
    ----------
    name : str
        data name
    ID : tensor (mx.ndarray or torch.tensor)
        a vector storing the IDs

    Returns
    -------
    tensor
        a tensor with the same row size as ID
    """
    assert F.ndim(ID) == 1, 'ID must be a vector.'

    group_size = [0] * self._server_count
    numpy_id = F.asnumpy(ID)
    count = math.ceil(self._data_size[name] / self._server_count)
    server_id = numpy_id / count
    id_list, id_count = np.unique(server_id, return_counts=True)
    for idx in range(len(id_list)):
        group_size[int(id_list[idx])] += id_count[idx]

    min_idx = 0
    max_idx = 0
    server_count = 0
    for idx in range(self._server_count):
        if group_size[idx] == 0:
            continue
        server_count += 1
        max_idx += group_size[idx]
        range_id = ID[min_idx:max_idx]
        min_idx = max_idx
        msg = KVStoreMsg(
            type=KVMsgType.PULL,
            rank=self._client_id,
            name=name,
            id=range_id,
            data=None)
        _send_kv_msg(self._sender, msg, idx)

    # Recv back messages
    msg_list = []
    for idx in range(self._server_count):
        if group_size[idx] == 0:
            continue
        msg = _recv_kv_msg(self._receiver)
        assert msg.type == KVMsgType.PULL_BACK, 'Recv kv msg error.'
        msg_list.append(msg)

    return self._merge_msg(msg_list)
def push(self, name, ID, data):
    """Push sparse message to KVServer.

    The push() API partitions the message across the KVServer nodes
    automatically. Note that we assume the row IDs in ID are in ascending order.

    Parameters
    ----------
    name : str
        data name
    ID : tensor (mx.ndarray or torch.tensor)
        a vector storing the global IDs
    data : tensor (mx.ndarray or torch.tensor)
        a tensor with the same row size as ID
    """
    assert F.ndim(ID) == 1, 'ID must be a vector.'
    assert F.shape(ID)[0] == F.shape(data)[0], \
        'The data must have the same row size as ID.'

    group_size = [0] * self._server_count
    numpy_id = F.asnumpy(ID)
    count = math.ceil(self._data_size[name] / self._server_count)
    server_id = numpy_id / count
    id_list, id_count = np.unique(server_id, return_counts=True)
    for idx in range(len(id_list)):
        group_size[int(id_list[idx])] += id_count[idx]

    min_idx = 0
    max_idx = 0
    for idx in range(self._server_count):
        if group_size[idx] == 0:
            continue
        max_idx += group_size[idx]
        range_id = ID[min_idx:max_idx]
        range_data = data[min_idx:max_idx]
        min_idx = max_idx
        msg = KVStoreMsg(
            type=KVMsgType.PUSH,
            rank=self._client_id,
            name=name,
            id=range_id,
            data=range_data)
        _send_kv_msg(self._sender, msg, idx)
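# Hedged usage sketch: assuming `client` is a connected instance of the KVClient
# class owning the push()/pull() methods above, and the servers already host a
# float tensor registered under the name 'entity_emb'. push() slices the rows by
# the ID range each server owns and sends one message per server; pull() gathers
# and merges the per-server replies.
def kvstore_round_trip_example(client):
    ids = F.tensor(np.array([0, 3, 8], dtype=np.int64))
    rows = F.zeros((3, 16), F.float32, F.cpu())
    client.push('entity_emb', ids, rows)      # async, returns immediately
    pulled = client.pull('entity_emb', ids)   # blocks until all replies arrive
    print(F.shape(pulled))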
def forward_test_wikikg(self, query, ans, candidate, mode, logs, gpu_id=-1):
    """Do the forward and generate ranking results.

    Parameters
    ----------
    query : Tensor
        input head and relation for test or valid
    ans : Tensor
        the correct tail entity index
    candidate : Tensor
        negative sampled tail entities
    """
    scores = self.predict_score_wikikg(query, candidate, mode, to_device=cuda,
                                       gpu_id=gpu_id, trace=False)
    if mode == "Valid":
        batch_size = query.shape[0]
        neg_scores = reshape(scores, batch_size, -1)
        for i in range(batch_size):
            ranking = F.asnumpy(F.sum(neg_scores[i] >= neg_scores[i][ans[i]], dim=0) + 1)
            logs.append({
                'MRR': 1.0 / ranking,
                'MR': float(ranking),
                'HITS@1': 1.0 if ranking <= 1 else 0.0,
                'HITS@3': 1.0 if ranking <= 3 else 0.0,
                'HITS@10': 1.0 if ranking <= 10 else 0.0
            })
    else:
        argsort = F.argsort(scores, dim=1, descending=True)
        logs.append(argsort[:, :10])
def process_raw_tuples(self, raw_rdf_graphs): triplets = OrderedDict() mg = nx.MultiDiGraph() ent_classes = OrderedDict() rel_classes = OrderedDict() entities = OrderedDict() id2entity = {} labels = OrderedDict() id2label = {} dataset_pairs = [] src = [] dst = [] ntid = [] etid = [] mutag_graph = raw_rdf_graphs[0] ts = [] # make triplets sorted each time we load the graph for t in mutag_graph: ts.append(t) ts.sort() for (sbj, pred, obj) in ts: if pred in triplets: triplets[pred].append((sbj, pred, obj)) else: triplets[pred] = [] triplets[pred].append((sbj, pred, obj)) for key, triples in triplets.items(): if key == self.is_mutagenic: continue for (sbj, pred, obj) in triples: sbjent = self.parse_sbj(sbj) rel = self.parse_pred(pred) objent = self.parse_obj(obj) processed = self.process_tuples(sbjent, rel, objent) if processed is None: # ignored continue sbjclsid = _get_id(ent_classes, sbjent.n_type) objclsid = _get_id(ent_classes, objent.n_type) relclsid = _get_id(rel_classes, rel.r_type) mg.add_edge(sbjent.n_type, objent.n_type, key=rel.r_type) if self._insert_reverse: mg.add_edge(objent.n_type, sbjent.n_type, key='rev-%s' % rel.r_type) # instance graph src_id = _get_id(entities, str(sbjent)) _map_object(id2entity, src_id, sbjent) if len(entities) > len(ntid): # found new entity ntid.append(sbjclsid) dst_id = _get_id(entities, str(objent)) _map_object(id2entity, src_id, objent) if len(entities) > len(ntid): # found new entity ntid.append(objclsid) src.append(src_id) dst.append(dst_id) etid.append(relclsid) # handle label is_mutagenic_triplets = triplets[self.is_mutagenic] for (sbj, pred, obj) in is_mutagenic_triplets: #print("{} {} {}".format(sbj, pred, obj)) sbj_id = _get_id(entities, str(self.parse_sbj(sbj))) label = _get_id(labels, str(obj)) _map_object(id2label, label, obj) dataset_pairs.append((sbj_id, label)) src = np.array(src) dst = np.array(dst) ntid = np.array(ntid) etid = np.array(etid) ntypes = list(ent_classes.keys()) etypes = list(rel_classes.keys()) # add reverse edge with reverse relation if self._insert_reverse: print('Adding reverse edges ...') newsrc = np.hstack([src, dst]) newdst = np.hstack([dst, src]) src = newsrc dst = newdst etid = np.hstack([etid, etid + len(etypes)]) etypes.extend(['rev-%s' % t for t in etypes]) self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes) # get global to subgraph local id mapping idmap = F.asnumpy(self.graph.nodes[self.predict_category].data[dgl.NID]) glb2lcl = {glbid : lclid for lclid, glbid in enumerate(idmap)} lcl2glb = {lclid : glbid for lclid, glbid in enumerate(idmap)} self.split_dataset(dataset_pairs, labels, glb2lcl) self.lcl2glb = lcl2glb self.id2entity = id2entity self.id2label = id2label
def _check_topk_score2(score_model, g, num_entity, num_rels, exclude_mode): hidden_dim = 32 num_entity = 40 num_rels = 4 with tempfile.TemporaryDirectory() as tmpdirname: entity_emb, rel_emb = generate_rand_emb(score_model.model_name, num_entity, num_rels, hidden_dim, 'none') create_emb_file(Path(tmpdirname), 'entity.npy', entity_emb.numpy()) create_emb_file(Path(tmpdirname), 'relation.npy', rel_emb.numpy()) score_model.load(Path(tmpdirname)) score_model.attach_graph(g) score_func = score_model._score_func head = F.arange(0, num_entity // 2) rel = F.arange(0, num_rels) tail = F.arange(num_entity // 2, num_entity) # exec_model==triplet_wise tw_rel = np.random.randint(0, num_rels, num_entity // 2) tw_rel = F.tensor(tw_rel) result1 = score_model.link_predict(head, tw_rel, tail, exec_mode='triplet_wise', exclude_mode=exclude_mode, batch_size=16) assert len(result1) == 1 scores = [] head_ids = [] rel_ids = [] tail_ids = [] for i in range(head.shape[0]): hemb = F.take(entity_emb, head[i], 0) remb = F.take(rel_emb, tw_rel[i], 0) temb = F.unsqueeze(F.take(entity_emb, tail[i], 0), dim=0) edge = FakeEdge(hemb, temb, remb) score = F.asnumpy(score_func.edge_func(edge)['score']) scores.append(score) head_ids.append(F.asnumpy(head[i])) rel_ids.append(F.asnumpy(tw_rel[i])) tail_ids.append(F.asnumpy(tail[i])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) rel_ids = np.asarray(rel_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] if exclude_mode is None or exclude_mode == 'mask': idx = idx[:10] head_ids = head_ids[idx] rel_ids = rel_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] if exclude_mode == 'mask': mask = np.zeros((10,)) for i in range(10): if (head_ids[i] + 1) % num_entity == tail_ids[i] or \ (head_ids[i] - 1) % num_entity == tail_ids[i]: mask[i] = 1 else: c_head_idx = [] c_rel_idx = [] c_tail_idx = [] c_score_topk = [] cur_idx = 0 while len(c_head_idx) < 10: c_idx = idx[cur_idx] cur_idx += 1 if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \ (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]: continue c_head_idx.append(head_ids[c_idx]) c_tail_idx.append(tail_ids[c_idx]) c_rel_idx.append(rel_ids[c_idx]) c_score_topk.append(scores[c_idx]) head_ids = F.tensor(c_head_idx) rel_ids = F.tensor(c_rel_idx) tail_ids = F.tensor(c_tail_idx) score_topk = F.tensor(c_score_topk) r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[0] np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r1_rel, rel_ids) np.testing.assert_allclose(r1_tail, tail_ids) np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) if exclude_mode == 'mask': np.testing.assert_allclose(r1_mask, mask) else: assert r1_mask is None # exec_mode==all result1 = score_model.link_predict(head, rel, tail, topk=20, exclude_mode=exclude_mode, batch_size=16) result2 = score_model.link_predict(head=head, tail=tail, topk=20, exclude_mode=exclude_mode, batch_size=16) assert len(result1) == 1 assert len(result2) == 1 scores = [] head_ids = [] rel_ids = [] tail_ids = [] for i in range(head.shape[0]): for j in range(rel.shape[0]): for k in range(tail.shape[0]): hemb = F.take(entity_emb, head[i], 0) remb = F.take(rel_emb, rel[j], 0) temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0) edge = FakeEdge(hemb, temb, remb) score = F.asnumpy(score_func.edge_func(edge)['score']) scores.append(score) head_ids.append(F.asnumpy(head[i])) rel_ids.append(F.asnumpy(rel[j])) tail_ids.append(F.asnumpy(tail[k])) scores = 
np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) rel_ids = np.asarray(rel_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] if exclude_mode is None or exclude_mode == 'mask': idx = idx[:20] head_ids = head_ids[idx] rel_ids = rel_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] if exclude_mode == 'mask': mask = np.zeros((20,)) for i in range(20): if (head_ids[i] + 1) % num_entity == tail_ids[i] or \ (head_ids[i] - 1) % num_entity == tail_ids[i]: mask[i] = 1 else: c_head_idx = [] c_rel_idx = [] c_tail_idx = [] c_score_topk = [] cur_idx = 0 while len(c_head_idx) < 20: c_idx = idx[cur_idx] cur_idx += 1 if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \ (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]: continue c_head_idx.append(head_ids[c_idx]) c_tail_idx.append(tail_ids[c_idx]) c_rel_idx.append(rel_ids[c_idx]) c_score_topk.append(scores[c_idx]) head_ids = F.tensor(c_head_idx) rel_ids = F.tensor(c_rel_idx) tail_ids = F.tensor(c_tail_idx) score_topk = F.tensor(c_score_topk) r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[0] r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[0] np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r2_head, head_ids) np.testing.assert_allclose(r1_rel, rel_ids) np.testing.assert_allclose(r2_rel, rel_ids) np.testing.assert_allclose(r1_tail, tail_ids) np.testing.assert_allclose(r2_tail, tail_ids) if exclude_mode == 'mask': np.testing.assert_allclose(r1_mask, mask) np.testing.assert_allclose(r2_mask, mask) else: assert r1_mask is None assert r2_mask is None result1 = score_model.link_predict(head, rel, tail, exec_mode='batch_rel', exclude_mode=exclude_mode, batch_size=16) result2 = score_model.link_predict(head=head, tail=tail, exec_mode='batch_rel', exclude_mode=exclude_mode, batch_size=16) assert len(result1) == num_rels assert len(result2) == num_rels for j in range(rel.shape[0]): scores = [] head_ids = [] rel_ids = [] tail_ids = [] for i in range(head.shape[0]): for k in range(tail.shape[0]): hemb = F.take(entity_emb, head[i], 0) remb = F.take(rel_emb, rel[j], 0) temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0) edge = FakeEdge(hemb, temb, remb) score = F.asnumpy(score_func.edge_func(edge)['score']) scores.append(score) head_ids.append(F.asnumpy(head[i])) rel_ids.append(F.asnumpy(rel[j])) tail_ids.append(F.asnumpy(tail[k])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) rel_ids = np.asarray(rel_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] if exclude_mode is None or exclude_mode == 'mask': idx = idx[:10] head_ids = head_ids[idx] rel_ids = rel_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] if exclude_mode == 'mask': mask = np.full((10,), False) for i in range(10): if (head_ids[i] + 1) % num_entity == tail_ids[i] or \ (head_ids[i] - 1) % num_entity == tail_ids[i]: mask[i] = True else: c_head_idx = [] c_rel_idx = [] c_tail_idx = [] c_score_topk = [] cur_idx = 0 while len(c_head_idx) < 10: c_idx = idx[cur_idx] cur_idx += 1 if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \ (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]: continue c_head_idx.append(head_ids[c_idx]) c_tail_idx.append(tail_ids[c_idx]) c_rel_idx.append(rel_ids[c_idx]) c_score_topk.append(scores[c_idx]) head_ids = 
F.tensor(c_head_idx) rel_ids = F.tensor(c_rel_idx) tail_ids = F.tensor(c_tail_idx) score_topk = F.tensor(c_score_topk) r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[j] r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[j] np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r2_head, head_ids) np.testing.assert_allclose(r1_rel, rel_ids) np.testing.assert_allclose(r2_rel, rel_ids) np.testing.assert_allclose(r1_tail, tail_ids) np.testing.assert_allclose(r2_tail, tail_ids) if exclude_mode == 'mask': np.testing.assert_allclose(r1_mask, mask) np.testing.assert_allclose(r2_mask, mask) else: assert r1_mask is None assert r2_mask is None head = F.arange(0, num_entity) rel = F.arange(0, num_rels) tail = F.arange(0, num_entity) result1 = score_model.link_predict(head, rel, tail, exec_mode='batch_head', exclude_mode=exclude_mode, batch_size=16) result2 = score_model.link_predict(exec_mode='batch_head', exclude_mode=exclude_mode, batch_size=16) assert len(result1) == num_entity assert len(result2) == num_entity for i in range(head.shape[0]): scores = [] head_ids = [] rel_ids = [] tail_ids = [] for j in range(rel.shape[0]): for k in range(tail.shape[0]): hemb = F.take(entity_emb, head[i], 0) remb = F.take(rel_emb, rel[j], 0) temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0) edge = FakeEdge(hemb, temb, remb) score = F.asnumpy(score_func.edge_func(edge)['score']) scores.append(score) head_ids.append(F.asnumpy(head[i])) rel_ids.append(F.asnumpy(rel[j])) tail_ids.append(F.asnumpy(tail[k])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) rel_ids = np.asarray(rel_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] if exclude_mode is None or exclude_mode == 'mask': idx = idx[:10] head_ids = head_ids[idx] rel_ids = rel_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] if exclude_mode == 'mask': mask = np.full((10,), False) for l in range(10): if (head_ids[l] + 1) % num_entity == tail_ids[l] or \ (head_ids[l] - 1) % num_entity == tail_ids[l]: mask[l] = True else: c_head_idx = [] c_rel_idx = [] c_tail_idx = [] c_score_topk = [] cur_idx = 0 while len(c_head_idx) < 10: c_idx = idx[cur_idx] cur_idx += 1 if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \ (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]: continue c_head_idx.append(head_ids[c_idx]) c_tail_idx.append(tail_ids[c_idx]) c_rel_idx.append(rel_ids[c_idx]) c_score_topk.append(scores[c_idx]) head_ids = F.tensor(c_head_idx) rel_ids = F.tensor(c_rel_idx) tail_ids = F.tensor(c_tail_idx) score_topk = F.tensor(c_score_topk) r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[i] r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[i] np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r2_head, head_ids) np.testing.assert_allclose(r1_rel, rel_ids) np.testing.assert_allclose(r2_rel, rel_ids) np.testing.assert_allclose(r1_tail, tail_ids) np.testing.assert_allclose(r2_tail, tail_ids) np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5) if exclude_mode == 'mask': np.testing.assert_allclose(r1_mask, mask) np.testing.assert_allclose(r2_mask, mask) else: assert r1_mask is None assert r2_mask is None result1 = score_model.link_predict(head, rel, tail, exec_mode='batch_tail', 
exclude_mode=exclude_mode) result2 = score_model.link_predict(exec_mode='batch_tail', exclude_mode=exclude_mode) assert len(result1) == num_entity assert len(result2) == num_entity for k in range(tail.shape[0]): scores = [] head_ids = [] rel_ids = [] tail_ids = [] for i in range(head.shape[0]): for j in range(rel.shape[0]): hemb = F.take(entity_emb, head[i], 0) remb = F.take(rel_emb, rel[j], 0) temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0) edge = FakeEdge(hemb, temb, remb) score = F.asnumpy(score_func.edge_func(edge)['score']) scores.append(score) head_ids.append(F.asnumpy(head[i])) rel_ids.append(F.asnumpy(rel[j])) tail_ids.append(F.asnumpy(tail[k])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) rel_ids = np.asarray(rel_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] if exclude_mode is None or exclude_mode == 'mask': idx = idx[:10] head_ids = head_ids[idx] rel_ids = rel_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] if exclude_mode == 'mask': mask = np.full((10,), False) for l in range(10): if (head_ids[l] + 1) % num_entity == tail_ids[l] or \ (head_ids[l] - 1) % num_entity == tail_ids[l]: mask[l] = True else: c_head_idx = [] c_rel_idx = [] c_tail_idx = [] c_score_topk = [] cur_idx = 0 while len(c_head_idx) < 10: c_idx = idx[cur_idx] cur_idx += 1 if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \ (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]: continue c_head_idx.append(head_ids[c_idx]) c_tail_idx.append(tail_ids[c_idx]) c_rel_idx.append(rel_ids[c_idx]) c_score_topk.append(scores[c_idx]) head_ids = F.tensor(c_head_idx) rel_ids = F.tensor(c_rel_idx) tail_ids = F.tensor(c_tail_idx) score_topk = F.tensor(c_score_topk) r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[k] r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[k] np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r2_head, head_ids) np.testing.assert_allclose(r1_rel, rel_ids) np.testing.assert_allclose(r2_rel, rel_ids) np.testing.assert_allclose(r1_tail, tail_ids) np.testing.assert_allclose(r2_tail, tail_ids) np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5) if exclude_mode == 'mask': np.testing.assert_allclose(r1_mask, mask) np.testing.assert_allclose(r2_mask, mask) else: assert r1_mask is None assert r2_mask is None
def run_topk_emb(sfunc, sim_func, create_emb_sim=create_kge_emb_sim): hidden_dim = 32 num_head = 40 num_tail = 40 num_emb = 80 emb = F.uniform((num_emb, hidden_dim), F.float32, F.cpu(), -1, 1) head = F.arange(0, num_head) tail = F.arange(num_head, num_head+num_tail) sim_infer = create_emb_sim(emb, sfunc) result1 = sim_infer.topK(head, tail, pair_ws=True) scores = [] head_ids = [] tail_ids = [] for i in range(head.shape[0]): j = i hemb = F.take(emb, head[i], 0) temb = F.take(emb, tail[j], 0) score = sim_func(hemb, temb) scores.append(F.asnumpy(score)) head_ids.append(F.asnumpy(head[i])) tail_ids.append(F.asnumpy(tail[j])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] idx = idx[:10] head_ids = head_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] r1_head, r1_tail, r1_score = result1[0] np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r1_tail, tail_ids) print('pass pair wise') head = F.arange(0, num_head) tail = F.arange(num_head, num_head+num_tail) result1 = sim_infer.topK(head, tail) assert len(result1) == 1 scores = [] head_ids = [] tail_ids = [] for i in range(head.shape[0]): for j in range(tail.shape[0]): hemb = F.take(emb, head[i], 0) temb = F.take(emb, tail[j], 0) score = sim_func(hemb, temb) scores.append(F.asnumpy(score)) head_ids.append(F.asnumpy(head[i])) tail_ids.append(F.asnumpy(tail[j])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) head_ids = np.asarray(head_ids) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] idx = idx[:10] head_ids = head_ids[idx] tail_ids = tail_ids[idx] score_topk = scores[idx] r1_head, r1_tail, r1_score = result1[0] np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r1_tail, tail_ids) emb_ids = F.arange(0, num_emb) result1 = sim_infer.topK(emb_ids, emb_ids, bcast=True) result2 = sim_infer.topK(bcast=True) assert len(result1) == emb_ids.shape[0] assert len(result2) == emb_ids.shape[0] for i in range(emb_ids.shape[0]): scores = [] head_ids = [] tail_ids = [] for j in range(emb_ids.shape[0]): hemb = F.take(emb, emb_ids[i], 0) temb = F.take(emb, emb_ids[j], 0) score = sim_func(hemb, temb) score = F.asnumpy(score) scores.append(score) tail_ids.append(F.asnumpy(emb_ids[j])) scores = np.asarray(scores) scores = scores.reshape(scores.shape[0]) tail_ids = np.asarray(tail_ids) idx = np.argsort(scores) idx = idx[::-1] idx = idx[:10] head_ids = np.full((10,), F.asnumpy(emb_ids[i])) tail_ids = tail_ids[idx] score_topk = scores[idx] r1_head, r1_tail, r1_score = result1[i] np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r1_head, head_ids) np.testing.assert_allclose(r1_tail, tail_ids) r2_head, r2_tail, r2_score = result2[i] np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5) np.testing.assert_allclose(r2_head, head_ids) np.testing.assert_allclose(r2_tail, tail_ids) print('pass all')
def main(): parser = argparse.ArgumentParser(description="Partition a knowledge graph") parser.add_argument( "--data_path", type=str, default="data", help="The path of the directory where DGL-KE loads knowledge graph data.", ) parser.add_argument( "--dataset", type=str, default="FB15k", help="dataset name, under data_path" ) parser.add_argument( "--data_files", type=str, default=None, nargs="+", help="A list of data file names. This is used if users want to train KGE" "on their own datasets. If the format is raw_udd_{htr}," "users need to provide train_file [valid_file] [test_file]." "If the format is udd_{htr}, users need to provide" "entity_file relation_file train_file [valid_file] [test_file]." "In both cases, valid_file and test_file are optional.", ) parser.add_argument( "--delimiter", type=str, default="\t", help="Delimiter used in data files. Note all files should use the same delimiter.", ) parser.add_argument( "--format", type=str, default="built_in", help="The format of the dataset. For builtin knowledge graphs," "the foramt should be built_in. For users own knowledge graphs," "it needs to be raw_udd_{htr} or udd_{htr}.", ) parser.add_argument( "-k", "--num-parts", required=True, type=int, help="The number of partitions" ) args = parser.parse_args() num_parts = args.num_parts print("load dataset..") # load dataset and samplers dataset = get_dataset( args.data_path, args.dataset, args.format, args.delimiter, args.data_files ) print("construct graph...") src, etype_id, dst = dataset.train coo = sp.sparse.coo_matrix( (np.ones(len(src)), (src, dst)), shape=[dataset.n_entities, dataset.n_entities] ) g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True) g.edata["tid"] = F.tensor(etype_id, F.int64) print("partition graph...") part_dict = dgl.transform.metis_partition(g, num_parts, 1) tot_num_inner_edges = 0 for part_id in part_dict: part = part_dict[part_id] num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata["inner_node"]))[0]) num_inner_edges = len(np.nonzero(F.asnumpy(part.edata["inner_edge"]))[0]) print( "part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition".format( part_id, part.number_of_nodes(), part.number_of_edges(), num_inner_nodes, num_inner_edges, ) ) tot_num_inner_edges += num_inner_edges part.copy_from_parent() print("write graph to txt file...") txt_file_graph = os.path.join(args.data_path, args.dataset) txt_file_graph = os.path.join(txt_file_graph, "partition_") write_txt_graph( txt_file_graph, "train.txt", part_dict, g.number_of_nodes(), dataset.n_relations ) print( "there are {} edges in the graph and {} edge cuts for {} partitions.".format( g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict), ) )
def process_raw_tuples(self, raw_tuples, root_path): """Processing raw RDF dataset Parameters ---------- raw_tuples: Raw rdf tuples root_path: str Root path containing the data """ mg = nx.MultiDiGraph() ent_classes = OrderedDict() rel_classes = OrderedDict() entities = OrderedDict() src = [] dst = [] ntid = [] etid = [] sorted_tuples = [] for t in raw_tuples: sorted_tuples.append(t) sorted_tuples.sort() for i, (sbj, pred, obj) in enumerate(sorted_tuples): if self.verbose and i % self._print_every == 0: print('Processed %d tuples, found %d valid tuples.' % (i, len(src))) sbjent = self.parse_entity(sbj) rel = self.parse_relation(pred) objent = self.parse_entity(obj) processed = self.process_tuple((sbj, pred, obj), sbjent, rel, objent) if processed is None: # ignored continue # meta graph sbjclsid = _get_id(ent_classes, sbjent.cls) objclsid = _get_id(ent_classes, objent.cls) relclsid = _get_id(rel_classes, rel.cls) mg.add_edge(sbjent.cls, objent.cls, key=rel.cls) if self._insert_reverse: mg.add_edge(objent.cls, sbjent.cls, key='rev-%s' % rel.cls) # instance graph src_id = _get_id(entities, str(sbjent)) if len(entities) > len(ntid): # found new entity ntid.append(sbjclsid) dst_id = _get_id(entities, str(objent)) if len(entities) > len(ntid): # found new entity ntid.append(objclsid) src.append(src_id) dst.append(dst_id) etid.append(relclsid) src = np.asarray(src) dst = np.asarray(dst) ntid = np.asarray(ntid) etid = np.asarray(etid) ntypes = list(ent_classes.keys()) etypes = list(rel_classes.keys()) # add reverse edge with reverse relation if self._insert_reverse: if self.verbose: print('Adding reverse edges ...') newsrc = np.hstack([src, dst]) newdst = np.hstack([dst, src]) src = newsrc dst = newdst etid = np.hstack([etid, etid + len(etypes)]) etypes.extend(['rev-%s' % t for t in etypes]) hg = self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes) if self.verbose: print('Load training/validation/testing split ...') idmap = F.asnumpy(hg.nodes[self.predict_category].data[dgl.NID]) glb2lcl = {glbid: lclid for lclid, glbid in enumerate(idmap)} def findidfn(ent): if ent not in entities: return None else: return glb2lcl[entities[ent]] self._hg = hg train_idx, test_idx, labels, num_classes = self.load_data_split( findidfn, root_path) train_mask = idx2mask(train_idx, self._hg.number_of_nodes(self.predict_category)) test_mask = idx2mask(test_idx, self._hg.number_of_nodes(self.predict_category)) labels = F.tensor(labels, F.data_type_dict['int64']) train_mask = generate_mask_tensor(train_mask) test_mask = generate_mask_tensor(test_mask) self._hg.nodes[self.predict_category].data['train_mask'] = train_mask self._hg.nodes[self.predict_category].data['test_mask'] = test_mask self._hg.nodes[self.predict_category].data['labels'] = labels self._num_classes = num_classes # save for compatability self._train_idx = F.tensor(train_idx) self._test_idx = F.tensor(test_idx) self._labels = labels
def process_raw_tuples(self, raw_tuples): mg = nx.MultiDiGraph() ent_classes = OrderedDict() rel_classes = OrderedDict() entities = OrderedDict() src = [] dst = [] ntid = [] etid = [] sorted_tuples = [] for t in raw_tuples: sorted_tuples.append(t) sorted_tuples.sort() for i, (sbj, pred, obj) in enumerate(sorted_tuples): if i % self._print_every == 0: print('Processed %d tuples, found %d valid tuples.' % (i, len(src))) sbjent = self.parse_entity(sbj) rel = self.parse_relation(pred) objent = self.parse_entity(obj) processed = self.process_tuple((sbj, pred, obj), sbjent, rel, objent) if processed is None: # ignored continue # meta graph sbjclsid = _get_id(ent_classes, sbjent.cls) objclsid = _get_id(ent_classes, objent.cls) relclsid = _get_id(rel_classes, rel.cls) mg.add_edge(sbjent.cls, objent.cls, key=rel.cls) if self._insert_reverse: mg.add_edge(objent.cls, sbjent.cls, key='rev-%s' % rel.cls) # instance graph src_id = _get_id(entities, str(sbjent)) if len(entities) > len(ntid): # found new entity ntid.append(sbjclsid) dst_id = _get_id(entities, str(objent)) if len(entities) > len(ntid): # found new entity ntid.append(objclsid) src.append(src_id) dst.append(dst_id) etid.append(relclsid) src = np.asarray(src) dst = np.asarray(dst) ntid = np.asarray(ntid) etid = np.asarray(etid) ntypes = list(ent_classes.keys()) etypes = list(rel_classes.keys()) # add reverse edge with reverse relation if self._insert_reverse: print('Adding reverse edges ...') newsrc = np.hstack([src, dst]) newdst = np.hstack([dst, src]) src = newsrc dst = newdst etid = np.hstack([etid, etid + len(etypes)]) etypes.extend(['rev-%s' % t for t in etypes]) self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes) print('Load training/validation/testing split ...') idmap = F.asnumpy( self.graph.nodes[self.predict_category].data[dgl.NID]) glb2lcl = {glbid: lclid for lclid, glbid in enumerate(idmap)} def findidfn(ent): if ent not in entities: return None else: return glb2lcl[entities[ent]] self.load_data_split(findidfn) self.save_cache(mg, src, dst, ntid, etid, ntypes, etypes)
def val_mask(self):
    deprecate_property('dataset.val_mask', 'graph.ndata[\'val_mask\']')
    return F.asnumpy(self._graph.ndata['val_mask'])
def train_val_test_split(dataset, labels, task_id, frac_train=0.8, frac_val=0.1, frac_test=0.1, bucket_size=10, random_state=None): """Split the dataset into training, validation and test subsets as stated above. Parameters ---------- dataset We assume ``len(dataset)`` gives the size for the dataset, ``dataset[i]`` gives the ith datapoint and ``dataset.smiles[i]`` gives the SMILES for the ith datapoint. labels : tensor of shape (N, T) Dataset labels all tasks. N for the number of datapoints and T for the number of tasks. task_id : int Index for the task. frac_train : float Fraction of data to use for training. By default, we set this to be 0.8, i.e. 80% of the dataset is used for training. frac_val : float Fraction of data to use for validation. By default, we set this to be 0.1, i.e. 10% of the dataset is used for validation. frac_test : float Fraction of data to use for test. By default, we set this to be 0.1, i.e. 10% of the dataset is used for test. bucket_size : int Size of bucket of datapoints. Default to 10. random_state : None, int or array_like, optional Random seed used to initialize the pseudo-random number generator. Can be any integer between 0 and 2**32 - 1 inclusive, an array (or other sequence) of such integers, or None (the default). If seed is None, then RandomState will try to read data from /dev/urandom (or the Windows analogue) if available or seed from the clock otherwise. Returns ------- list of length 3 Subsets for training, validation and test, which also have ``len(dataset)`` and ``dataset[i]`` behaviors """ train_val_test_sanity_check(frac_train, frac_val, frac_test) if random_state is not None: np.random.seed(random_state) if not isinstance(labels, np.ndarray): labels = F.asnumpy(labels) task_labels = labels[:, task_id] sorted_indices = np.argsort(task_labels) train_bucket_cutoff = int(np.round(frac_train * bucket_size)) val_bucket_cutoff = int(np.round( frac_val * bucket_size)) + train_bucket_cutoff train_indices, val_indices, test_indices = [], [], [] while sorted_indices.shape[0] >= bucket_size: current_batch, sorted_indices = np.split(sorted_indices, [bucket_size]) shuffled = np.random.permutation(range(bucket_size)) train_indices.extend( current_batch[shuffled[:train_bucket_cutoff]].tolist()) val_indices.extend(current_batch[ shuffled[train_bucket_cutoff:val_bucket_cutoff]].tolist()) test_indices.extend( current_batch[shuffled[val_bucket_cutoff:]].tolist()) # Place rest samples in the training set. train_indices.extend(sorted_indices.tolist()) return [ Subset(dataset, train_indices), Subset(dataset, val_indices), Subset(dataset, test_indices) ]
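# Hedged usage sketch: same dataset/labels conventions as k_fold_split above.
# Datapoints are sorted by the task-0 label and shuffled within buckets of 10,
# so each subset spans the full label range; random_state makes the bucket
# shuffles repeatable.
def train_val_test_example(dataset, labels):
    train_set, val_set, test_set = train_val_test_split(
        dataset, labels, task_id=0, frac_train=0.8, frac_val=0.1, frac_test=0.1,
        bucket_size=10, random_state=42)
    print(len(train_set), len(val_set), len(test_set))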
def check_infer_score(func_name): batch_size = 10 ke_score_func = ke_infer_funcs[func_name] # normal head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'none') if args is None: score_func = ke_score_func() elif type(args) is tuple: score_func = ke_score_func(*list(args)) else: score_func = ke_score_func(args) score1 = score_func.infer(head_emb, rel_emb, tail_emb) assert(score1.shape[0] == head_emb.shape[0]) h_score = [] for i in range(head_emb.shape[0]): r_score = [] for j in range(rel_emb.shape[0]): t_score = [] for k in range(tail_emb.shape[0]): hemb = head_emb[i] remb = rel_emb[j] temb = F.unsqueeze(tail_emb[k], dim=0) edge = FakeEdge(hemb, temb, remb) score = score_func.edge_func(edge)['score'] t_score.append(F.asnumpy(score)) r_score.append(t_score) h_score.append(r_score) score2 = np.asarray(h_score).reshape(head_emb.shape[0], rel_emb.shape[0], tail_emb.shape[0]) np.testing.assert_allclose(F.asnumpy(score1), score2, rtol=1e-5, atol=1e-5) # bcast head head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'head') if args is None: score_func = ke_score_func() elif type(args) is tuple: score_func = ke_score_func(*list(args)) else: score_func = ke_score_func(args) score1 = score_func.infer(head_emb, rel_emb, tail_emb) assert(score1.shape[0] == head_emb.shape[0]) h_score = [] for i in range(head_emb.shape[0]): r_score = [] for j in range(rel_emb.shape[0]): t_score = [] for k in range(tail_emb.shape[0]): hemb = head_emb[i] remb = rel_emb[j] temb = F.unsqueeze(tail_emb[k], dim=0) edge = FakeEdge(hemb, temb, remb) score = score_func.edge_func(edge)['score'] t_score.append(F.asnumpy(score)) r_score.append(t_score) h_score.append(r_score) score2 = np.asarray(h_score).reshape(1, rel_emb.shape[0], tail_emb.shape[0]) np.testing.assert_allclose(F.asnumpy(score1), score2, rtol=1e-5, atol=1e-5) # bcast rel head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'rel') if args is None: score_func = ke_score_func() elif type(args) is tuple: score_func = ke_score_func(*list(args)) else: score_func = ke_score_func(args) score1 = score_func.infer(head_emb, rel_emb, tail_emb) assert(score1.shape[0] == head_emb.shape[0]) h_score = [] for i in range(head_emb.shape[0]): r_score = [] for j in range(rel_emb.shape[0]): t_score = [] for k in range(tail_emb.shape[0]): hemb = head_emb[i] remb = rel_emb[j] temb = F.unsqueeze(tail_emb[k], dim=0) edge = FakeEdge(hemb, temb, remb) score = score_func.edge_func(edge)['score'] t_score.append(F.asnumpy(score)) r_score.append(t_score) h_score.append(r_score) score2 = np.asarray(h_score).reshape(head_emb.shape[0], 1, tail_emb.shape[0]) np.testing.assert_allclose(F.asnumpy(score1), score2, rtol=1e-5, atol=1e-5) # bcast tail head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'tail') if args is None: score_func = ke_score_func() elif type(args) is tuple: score_func = ke_score_func(*list(args)) else: score_func = ke_score_func(args) score1 = score_func.infer(head_emb, rel_emb, tail_emb) assert(score1.shape[0] == head_emb.shape[0]) h_score = [] for i in range(head_emb.shape[0]): r_score = [] for j in range(rel_emb.shape[0]): t_score = [] for k in range(tail_emb.shape[0]): hemb = head_emb[i] remb = rel_emb[j] temb = F.unsqueeze(tail_emb[k], dim=0) edge = FakeEdge(hemb, temb, remb) score = score_func.edge_func(edge)['score'] t_score.append(F.asnumpy(score)) r_score.append(t_score) h_score.append(r_score) score2 = np.asarray(h_score).reshape(head_emb.shape[0], rel_emb.shape[0], 1) np.testing.assert_allclose(F.asnumpy(score1), score2, 
rtol=1e-5, atol=1e-5)
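# Hedged driver sketch: assuming ke_infer_funcs is the dict of score-function
# classes used by check_infer_score above, this runs the normal and broadcast
# checks for every registered score function.
def check_all_infer_scores():
    for name in ke_infer_funcs:
        check_infer_score(name)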
def topK(self, head=None, tail=None, bcast=False, pair_ws=False, k=10): if head is None: head = F.arange(0, self.emb.shape[0]) else: head = F.tensor(head) if tail is None: tail = F.arange(0, self.emb.shape[0]) else: tail = F.tensor(tail) head_emb = self.emb[head] tail_emb = self.emb[tail] if pair_ws is True: result = [] batch_size = self.batch_size # chunked cal score score = [] num_head = head.shape[0] num_tail = tail.shape[0] for i in range((num_head + batch_size - 1) // batch_size): sh_emb = head_emb[i * batch_size : (i + 1) * batch_size \ if (i + 1) * batch_size < num_head \ else num_head] sh_emb = F.copy_to(sh_emb, self.device) st_emb = tail_emb[i * batch_size : (i + 1) * batch_size \ if (i + 1) * batch_size < num_head \ else num_head] st_emb = F.copy_to(st_emb, self.device) score.append(F.copy_to(self.sim_func(sh_emb, st_emb, pw=True), F.cpu())) score = F.cat(score, dim=0) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] result.append((F.asnumpy(head[sidx]), F.asnumpy(tail[sidx]), F.asnumpy(score))) else: num_head = head.shape[0] num_tail = tail.shape[0] batch_size = self.batch_size # chunked cal score score = [] for i in range((num_head + batch_size - 1) // batch_size): sh_emb = head_emb[i * batch_size : (i + 1) * batch_size \ if (i + 1) * batch_size < num_head \ else num_head] sh_emb = F.copy_to(sh_emb, self.device) s_score = [] for j in range((num_tail + batch_size - 1) // batch_size): st_emb = tail_emb[j * batch_size : (j + 1) * batch_size \ if (j + 1) * batch_size < num_tail \ else num_tail] st_emb = F.copy_to(st_emb, self.device) s_score.append(F.copy_to(self.sim_func(sh_emb, st_emb), F.cpu())) score.append(F.cat(s_score, dim=1)) score = F.cat(score, dim=0) if bcast is False: result = [] idx = F.arange(0, num_head * num_tail) score = F.reshape(score, (num_head * num_tail, )) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] sidx = sidx idx = idx[sidx] tail_idx = idx % num_tail idx = floor_divide(idx, num_tail) head_idx = idx % num_head result.append((F.asnumpy(head[head_idx]), F.asnumpy(tail[tail_idx]), F.asnumpy(score))) else: # bcast at head result = [] for i in range(num_head): i_score = score[i] sidx = F.argsort(i_score, dim=0, descending=True) idx = F.arange(0, num_tail) i_idx = sidx[:k] i_score = i_score[i_idx] idx = idx[i_idx] result.append((np.full((k,), F.asnumpy(head[i])), F.asnumpy(tail[idx]), F.asnumpy(i_score))) return result
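# Hedged usage sketch: assuming `sim_infer` is an embedding-similarity object
# exposing the topK() method above (as constructed via create_emb_sim in
# run_topk_emb). pair_ws=True scores head[i] against tail[i] only; bcast=True
# returns one top-k list per head entity.
def emb_sim_topk_example(sim_infer):
    head = F.arange(0, 10)
    tail = F.arange(10, 20)
    pairwise_topk = sim_infer.topK(head, tail, pair_ws=True, k=5)
    per_head_topk = sim_infer.topK(head, tail, bcast=True, k=5)
    return pairwise_topk, per_head_topk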
def test_mask(self):
    deprecate_property('dataset.test_mask', 'graph.ndata[\'test_mask\']')
    return F.asnumpy(self._graph.ndata['test_mask'])
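# Hedged usage note: the two deprecated properties above still work but emit a
# deprecation warning; reading the masks straight from the graph's node data is
# the preferred path. Assumes `dataset` exposes the underlying graph as the
# properties above do.
def read_masks_example(dataset):
    graph = dataset._graph
    val_mask = F.asnumpy(graph.ndata['val_mask'])
    test_mask = F.asnumpy(graph.ndata['test_mask'])
    return val_mask, test_mask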
def topK(self, head=None, rel=None, tail=None, exec_mode='all', k=10): if head is None: head = F.arange(0, self.model.num_entity) else: head = F.tensor(head) if rel is None: rel = F.arange(0, self.model.num_rel) else: rel = F.tensor(rel) if tail is None: tail = F.arange(0, self.model.num_entity) else: tail = F.tensor(tail) num_head = F.shape(head)[0] num_rel = F.shape(rel)[0] num_tail = F.shape(tail)[0] if exec_mode == 'triplet_wise': result = [] assert num_head == num_rel, \ 'For triplet wise exection mode, head, relation and tail lists should have same length' assert num_head == num_tail, \ 'For triplet wise exection mode, head, relation and tail lists should have same length' raw_score = self.model.score(head, rel, tail, triplet_wise=True) score = self.score_func(raw_score) idx = F.arange(0, num_head) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] idx = idx[sidx] result.append((F.asnumpy(head[idx]), F.asnumpy(rel[idx]), F.asnumpy(tail[idx]), F.asnumpy(score))) elif exec_mode == 'all': result = [] raw_score = self.model.score(head, rel, tail) score = self.score_func(raw_score) idx = F.arange(0, num_head * num_rel * num_tail) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] idx = idx[sidx] tail_idx = idx % num_tail idx = floor_divide(idx, num_tail) rel_idx = idx % num_rel idx = floor_divide(idx, num_rel) head_idx = idx % num_head result.append((F.asnumpy(head[head_idx]), F.asnumpy(rel[rel_idx]), F.asnumpy(tail[tail_idx]), F.asnumpy(score))) elif exec_mode == 'batch_head': result = [] for i in range(num_head): raw_score = self.model.score(F.unsqueeze(head[i], 0), rel, tail) score = self.score_func(raw_score) idx = F.arange(0, num_rel * num_tail) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] idx = idx[sidx] tail_idx = idx % num_tail idx = floor_divide(idx, num_tail) rel_idx = idx % num_rel result.append((np.full((k,), F.asnumpy(head[i])), F.asnumpy(rel[rel_idx]), F.asnumpy(tail[tail_idx]), F.asnumpy(score))) elif exec_mode == 'batch_rel': result = [] for i in range(num_rel): raw_score = self.model.score(head, F.unsqueeze(rel[i], 0), tail) score = self.score_func(raw_score) idx = F.arange(0, num_head * num_tail) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] idx = idx[sidx] tail_idx = idx % num_tail idx = floor_divide(idx, num_tail) head_idx = idx % num_head result.append((F.asnumpy(head[head_idx]), np.full((k,), F.asnumpy(rel[i])), F.asnumpy(tail[tail_idx]), F.asnumpy(score))) elif exec_mode == 'batch_tail': result = [] for i in range(num_tail): raw_score = self.model.score(head, rel, F.unsqueeze(tail[i], 0)) score = self.score_func(raw_score) idx = F.arange(0, num_head * num_rel) sidx = F.argsort(score, dim=0, descending=True) sidx = sidx[:k] score = score[sidx] idx = idx[sidx] rel_idx = idx % num_rel idx = floor_divide(idx, num_rel) head_idx = idx % num_head result.append((F.asnumpy(head[head_idx]), F.asnumpy(rel[rel_idx]), np.full((k,), F.asnumpy(tail[i])), F.asnumpy(score))) else: assert False, 'unknow execution mode type {}'.format(exec_mode) return result
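# Hedged usage sketch: assuming `score_model` is a DGL-KE inference model whose
# link_predict() wraps the topK() method above (as exercised in _check_topk_score2).
# exec_mode='batch_head' returns one ranked candidate list per head entity.
def link_predict_example(score_model):
    head = F.arange(0, 5)
    rel = F.arange(0, 2)
    tail = F.arange(5, 10)
    return score_model.link_predict(head, rel, tail, exec_mode='batch_head', topk=10)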