Example #1
def check_score_func(func_name):
    batch_size = 10
    neg_sample_size = 10
    g, entity_emb, rel_emb = generate_rand_graph(100, func_name)
    hidden_dim = entity_emb.shape[1]
    ke_score_func = ke_score_funcs[func_name]
    model = BaseKEModel(ke_score_func, entity_emb, rel_emb)

    EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
    sampler = EdgeSampler(g,
                          batch_size=batch_size,
                          neg_sample_size=neg_sample_size,
                          negative_mode='PBG-head',
                          num_workers=1,
                          shuffle=False,
                          exclude_positive=False,
                          return_false_neg=False)

    for pos_g, neg_g in sampler:
        neg_g = create_neg_subgraph(pos_g, neg_g, True, True,
                                    g.number_of_nodes())
        pos_g.copy_from_parent()
        neg_g.copy_from_parent()
        score1 = F.reshape(model.predict_score(neg_g), (batch_size, -1))
        score2 = model.predict_neg_score(pos_g, neg_g)
        score2 = F.reshape(score2, (batch_size, -1))
        np.testing.assert_allclose(F.asnumpy(score1),
                                   F.asnumpy(score2),
                                   rtol=1e-5,
                                   atol=1e-5)
Example #2
    def forward_test(self, pos_g, neg_g, logs, gpu_id=-1):
        """Do the forward and generate ranking results.

        Parameters
        ----------
        pos_g : DGLGraph
            Graph holding positive edges.
        neg_g : DGLGraph
            Graph holding negative edges.
        logs : List
            Where to append the ranking results.
        gpu_id : int
            Which GPU to use for the calculation; if -1 is provided, the CPU is used.
        """
        pos_g.ndata['emb'] = self.entity_emb(pos_g.ndata['id'], gpu_id, False)
        pos_g.edata['emb'] = self.relation_emb(pos_g.edata['id'], gpu_id,
                                               False)
        self.score_func.prepare(pos_g, gpu_id, False)

        batch_size = pos_g.number_of_edges()
        pos_scores = self.predict_score(pos_g)
        pos_scores = reshape(pos_scores, batch_size, -1)

        neg_scores = self.predict_neg_score(
            pos_g,
            neg_g,
            to_device=cuda,
            gpu_id=gpu_id,
            trace=False,
            neg_deg_sample=self.args.neg_deg_sample_eval)
        neg_scores = reshape(neg_scores, batch_size, -1)
        # We need to filter the positive edges in the negative graph.
        if self.args.eval_filter:
            filter_bias = reshape(neg_g.edata['bias'], batch_size, -1)
            if gpu_id >= 0:
                filter_bias = cuda(filter_bias, gpu_id)
            # find all indices that are not false negative samples
            mask = filter_bias != -1

        # To compute the rank of a positive edge among all negative edges,
        # we need to know how many negative edges have higher scores than
        # the positive edge.
        for i in range(batch_size):
            if self.args.eval_filter:
                # count the true negative samples whose score >= the positive sample's score
                ranking = F.asnumpy(
                    F.sum(masked_select(neg_scores[i] >= pos_scores[i],
                                        mask[i]),
                          dim=0) + 1)
            else:
                ranking = F.asnumpy(
                    F.sum(neg_scores[i] >= pos_scores[i], dim=0) + 1)
            logs.append({
                'MRR': 1.0 / ranking,
                'MR': float(ranking),
                'HITS@1': 1.0 if ranking <= 1 else 0.0,
                'HITS@3': 1.0 if ranking <= 3 else 0.0,
                'HITS@10': 1.0 if ranking <= 10 else 0.0
            })
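The ranking logic above reduces to a single count: a positive edge's rank is one plus the number of (filtered) negative edges whose score is at least as large as the positive edge's score. A minimal NumPy-only sketch of that step and the metrics derived from it, using invented scores rather than real model output:

import numpy as np

# Hypothetical scores for one positive edge and its negative candidates.
pos_score = 0.7
neg_scores = np.array([0.9, 0.4, 0.8, 0.1, 0.7])
# Filter mask: True keeps true negatives, False drops false negatives.
mask = np.array([True, True, False, True, True])

ranking = int(np.sum(neg_scores[mask] >= pos_score) + 1)  # rank among kept negatives
log = {
    'MRR': 1.0 / ranking,
    'MR': float(ranking),
    'HITS@1': 1.0 if ranking <= 1 else 0.0,
    'HITS@3': 1.0 if ranking <= 3 else 0.0,
    'HITS@10': 1.0 if ranking <= 10 else 0.0,
}
print(log)  # ranking == 3 here: two kept negatives score >= 0.7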
Example #3
 def start(self):
     """Start service of KVServer
     """
     server_ip, server_port = self._addr.split(':')
     _receiver_wait(self._receiver, server_ip, int(server_port),
                    self._client_count)
     _network_wait()  # wait for clients to start
     for ID, addr in self._client_namebook.items():
         client_ip, client_port = addr.split(':')
         _add_receiver_addr(self._sender, client_ip, int(client_port), ID)
     _sender_connect(self._sender)
     # Service loop
     while True:
         msg = _recv_kv_msg(self._receiver)
         if msg.type == KVMsgType.INIT:
             if msg.name not in self._is_init:
                 # We hack the msg format here:
                 # msg.id stores the shape of the target tensor.
                 # msg.data has two rows: the first row encodes the
                 # init_type ([0, 0] means 'zero' and [1, 1] means
                 # 'uniform'), and the second row stores the min & max thresholds.
                 data_shape = F.asnumpy(msg.id).tolist()
                 row_0 = (F.asnumpy(msg.data).tolist())[0]
                 row_1 = (F.asnumpy(msg.data).tolist())[1]
                 init_type = 'zero' if row_0[0] == 0.0 else 'uniform'
                 self._init_data(name=msg.name,
                                 shape=data_shape,
                                 init_type=init_type,
                                 low=row_1[0],
                                 high=row_1[1])
                 self._is_init.add(msg.name)
         elif msg.type == KVMsgType.PUSH:
             self._push_handler(msg.name, msg.id, msg.data)
         elif msg.type == KVMsgType.PULL:
             res_tensor = self._pull_handler(msg.name, msg.id)
             back_msg = KVStoreMsg(type=KVMsgType.PULL_BACK,
                                   rank=self._server_id,
                                   name=msg.name,
                                   id=msg.id,
                                   data=res_tensor)
             _send_kv_msg(self._sender, back_msg, msg.rank)
         elif msg.type == KVMsgType.BARRIER:
             self._barrier_count += 1
             if self._barrier_count == self._client_count:
                 back_msg = KVStoreMsg(type=KVMsgType.BARRIER,
                                       rank=self._server_id,
                                       name=None,
                                       id=None,
                                       data=None)
                 for i in range(self._client_count):
                     _send_kv_msg(self._sender, back_msg, i)
                 self._barrier_count = 0
         elif msg.type == KVMsgType.FINAL:
             print("Exit KVStore service, server ID: %d" % self._server_id)
             break  # exit loop
         else:
             raise RuntimeError('Unknown type of kvstore message: %d' %
                                msg.type.value)
Example #4
File: rdf.py Project: zhoujf620/dgl
 def save_cache(self, mg, src, dst, ntid, etid, ntypes, etypes):
     nx.write_gpickle(mg, os.path.join(self._dir, 'cached_mg.gpickle'))
     np.save(os.path.join(self._dir, 'cached_src.npy'), src)
     np.save(os.path.join(self._dir, 'cached_dst.npy'), dst)
     np.save(os.path.join(self._dir, 'cached_ntid.npy'), ntid)
     np.save(os.path.join(self._dir, 'cached_etid.npy'), etid)
     save_strlist(os.path.join(self._dir, 'cached_ntypes.txt'), ntypes)
     save_strlist(os.path.join(self._dir, 'cached_etypes.txt'), etypes)
     np.save(os.path.join(self._dir, 'cached_train_idx.npy'), F.asnumpy(self.train_idx))
     np.save(os.path.join(self._dir, 'cached_test_idx.npy'), F.asnumpy(self.test_idx))
     np.save(os.path.join(self._dir, 'cached_labels.npy'), F.asnumpy(self.labels))
Example #5
def knn_graphE(x, k, istrain=False):
    """Transforms the given point set to a directed graph, whose coordinates
    are given as a matrix. The predecessors of each point are its k-nearest
    neighbors.

    If a 3D tensor is given instead, then each row would be transformed into
    a separate graph.  The graphs will be unioned.

    Parameters
    ----------
    x : Tensor
        The input tensor.

        If 2D, each row of ``x`` corresponds to a node.

        If 3D, a k-NN graph would be constructed for each row.  Then
        the graphs are unioned.
    k : int
        The number of neighbors.
    istrain : bool
        If True, then with probability 0.5 the neighbors are chosen by keeping
        the nearest point (the point itself) and randomly sampling k - 1 of the
        remaining round(1.5 * k) - 1 nearest points.

    Returns
    -------
    DGLGraph
        The graph.  The node IDs are in the same order as ``x``.
    """
    if F.ndim(x) == 2:
        x = F.unsqueeze(x, 0)
    n_samples, n_points, _ = F.shape(x)

    dist = pairwise_squared_distance(x)
    if istrain and np.random.rand() > 0.5:
        # During training, with probability 0.5, keep the point itself (index 0)
        # plus k - 1 neighbors sampled from the round(1.5 * k) nearest points.
        k_indices = F.argtopk(dist, round(1.5 * k), 2, descending=False)
        rand_k = np.random.permutation(round(1.5 * k) - 1)[0:k - 1] + 1
        rand_k = np.append(rand_k, 0)  # always include index 0 (the point itself)
        k_indices = k_indices[:, :, rand_k]
    else:
        k_indices = F.argtopk(dist, k, 2, descending=False)

    dst = F.copy_to(k_indices, F.cpu())

    src = F.zeros_like(dst) + F.reshape(F.arange(0, n_points), (1, -1, 1))

    per_sample_offset = F.reshape(
        F.arange(0, n_samples) * n_points, (-1, 1, 1))
    dst += per_sample_offset
    src += per_sample_offset
    dst = F.reshape(dst, (-1, ))
    src = F.reshape(src, (-1, ))
    adj = sparse.csr_matrix(
        (F.asnumpy(F.zeros_like(dst) + 1), (F.asnumpy(dst), F.asnumpy(src))))

    g = DGLGraph(adj, readonly=True)
    return g
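For intuition, here is a small self-contained sketch of the same batched k-NN construction using only NumPy and SciPy (no DGL): nearest neighbors are found within each sample, and a per-sample offset turns local indices into global node IDs before a single adjacency matrix is built. The point set and k below are invented.

import numpy as np
from scipy import sparse

n_samples, n_points, k = 2, 5, 3
x = np.random.rand(n_samples, n_points, 2)

# pairwise squared distances within each sample: shape (B, N, N)
diff = x[:, :, None, :] - x[:, None, :, :]
dist = (diff ** 2).sum(-1)

# k nearest neighbors (the point itself included) for every point
dst = np.argsort(dist, axis=2)[:, :, :k]                      # (B, N, k)
src = np.broadcast_to(np.arange(n_points)[None, :, None], dst.shape)

# offset node IDs so each sample occupies its own contiguous ID range
offset = (np.arange(n_samples) * n_points)[:, None, None]
src_g = (src + offset).reshape(-1)
dst_g = (dst + offset).reshape(-1)

adj = sparse.csr_matrix((np.ones(len(dst_g)), (dst_g, src_g)),
                        shape=(n_samples * n_points, n_samples * n_points))
print(adj.shape, adj.nnz)  # (10, 10) with 2 * 5 * 3 = 30 edges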
Example #6
def main():
    parser = argparse.ArgumentParser(description='Partition a knowledge graph')
    parser.add_argument('--data_path', type=str, default='data',
                        help='root path of all datasets')
    parser.add_argument('--dataset', type=str, default='FB15k',
                        help='dataset name, under data_path')
    parser.add_argument('--data_files', type=str, default=None, nargs='+',
                        help='a list of data files, e.g. entity relation train valid test')
    parser.add_argument('--format', type=str, default='built_in',
                        help='the format of the dataset; it can be built_in, '\
                                'raw_udd_{htr} or udd_{htr}')
    parser.add_argument('-k', '--num-parts', required=True, type=int,
                        help='The number of partitions')
    args = parser.parse_args()
    num_parts = args.num_parts

    print('load dataset..')

    # load dataset and samplers
    dataset = get_dataset(args.data_path, args.dataset, args.format, args.data_files)

    print('construct graph...')

    src, etype_id, dst = dataset.train
    coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)),
            shape=[dataset.n_entities, dataset.n_entities])
    g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    g.edata['tid'] = F.tensor(etype_id, F.int64)

    print('partition graph...')

    part_dict = dgl.transform.metis_partition(g, num_parts, 1)

    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]

        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0])
        print('part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition'.format(
              part_id, part.number_of_nodes(), part.number_of_edges(),
              num_inner_nodes, num_inner_edges))
        tot_num_inner_edges += num_inner_edges

        part.copy_from_parent()

    print('write graph to txt file...')

    txt_file_graph = os.path.join(args.data_path, args.dataset)
    txt_file_graph = os.path.join(txt_file_graph, 'partition_')
    write_txt_graph(txt_file_graph, 'train.txt', part_dict, g.number_of_nodes(), dataset.n_relations)

    print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
        g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
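The edge-cut number printed above is simply the total edge count minus the edges whose two endpoints land in the same partition. A NumPy-only sketch of that bookkeeping on an invented edge list and partition assignment (not the METIS output used in the script):

import numpy as np

num_nodes, num_parts = 8, 2
src = np.array([0, 1, 2, 3, 4, 5, 6, 7])
dst = np.array([1, 2, 3, 0, 5, 6, 7, 0])

# assign each node to a partition (round-robin here, purely for illustration)
node_part = np.arange(num_nodes) % num_parts

inner = node_part[src] == node_part[dst]      # edge stays inside one partition
tot_num_inner_edges = int(inner.sum())
edge_cuts = len(src) - tot_num_inner_edges
print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
    len(src), edge_cuts, num_parts))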
Example #7
    def push(self, name, id_tensor, data_tensor):
        """Push message to KVServer.

        Note that push() is an async operation that returns immediately after it is called.

        Parameters
        ----------
        name : str
            data name
        id_tensor : tensor (mx.ndarray or torch.tensor)
            a vector storing the global data ID
        data_tensor : tensor (mx.ndarray or torch.tensor)
            a tensor with the same number of rows as id_tensor
        """
        assert len(name) > 0, 'name cannot be empty.'
        assert F.ndim(id_tensor) == 1, 'ID must be a vector.'
        assert F.shape(id_tensor)[0] == F.shape(
            data_tensor)[0], 'The data must have the same number of rows as the ID.'

        # partition data (we can move this part of code into C-api if needed)
        server_id = self._data_store[name + '-part-'][id_tensor]
        # sort index by server id
        sorted_id = F.tensor(np.argsort(F.asnumpy(server_id)))
        id_tensor = id_tensor[sorted_id]
        data_tensor = data_tensor[sorted_id]
        server, count = np.unique(F.asnumpy(server_id), return_counts=True)
        # push data to server by order
        start = 0
        for idx in range(len(server)):
            end = start + count[idx]
            if start == end:  # don't have any data for target server
                continue
            partial_id = id_tensor[start:end]
            partial_data = data_tensor[start:end]

            if server[idx] in self._local_server_id and not self._close_shared_mem:
                if name + '-g2l-' in self._has_data:
                    local_id = self._data_store[name + '-g2l-'][partial_id]
                else:
                    local_id = partial_id
                # push only the rows destined for this local server
                self._push_handler(name + '-data-', local_id, partial_data,
                                   self._data_store)
            else:
                msg = KVStoreMsg(type=KVMsgType.PUSH,
                                 rank=self._client_id,
                                 name=name,
                                 id=partial_id,
                                 data=partial_data)
                _send_kv_msg(self._sender, msg, server[idx])

            start += count[idx]
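The routing step above boils down to: look up each row's target server, sort the rows so every server's slice is contiguous, then walk the slices. A minimal plain-NumPy sketch of that grouping (the partition table and data below are invented, not the '-part-' tensor from the store):

import numpy as np

id_tensor = np.array([3, 7, 1, 5, 0])
data_tensor = np.arange(5 * 2).reshape(5, 2).astype(np.float32)
part_table = np.array([0, 0, 0, 0, 1, 1, 1, 1])   # global ID -> server ID

server_id = part_table[id_tensor]                 # target server for each row
order = np.argsort(server_id)                     # group rows by server
id_tensor, data_tensor = id_tensor[order], data_tensor[order]

servers, counts = np.unique(server_id, return_counts=True)
start = 0
for srv, cnt in zip(servers, counts):
    end = start + cnt
    partial_id = id_tensor[start:end]
    partial_data = data_tensor[start:end]
    print('send to server', srv, 'ids', partial_id, 'rows', partial_data.shape[0])
    start = end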
Example #8
    def pull_model(self, client, pos_g, neg_g):
        with th.no_grad():
            entity_id = F.cat(seq=[pos_g.ndata["id"], neg_g.ndata["id"]], dim=0)
            relation_id = pos_g.edata["id"]
            entity_id = F.tensor(np.unique(F.asnumpy(entity_id)))
            relation_id = F.tensor(np.unique(F.asnumpy(relation_id)))

            l2g = client.get_local2global()
            global_entity_id = l2g[entity_id]

            entity_data = client.pull(name="entity_emb", id_tensor=global_entity_id)
            relation_data = client.pull(name="relation_emb", id_tensor=relation_id)

            self.entity_emb.emb[entity_id] = entity_data
            self.relation_emb.emb[relation_id] = relation_data
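Before the pull, the IDs appearing in the batch are deduplicated and local entity IDs are translated to the global IDs the KV store understands. A tiny NumPy sketch of that preprocessing (the local-to-global map and batch IDs are invented):

import numpy as np

# local-to-global ID map of this partition (index = local ID, value = global ID)
l2g = np.array([10, 11, 12, 13, 14])

# local entity IDs appearing in the positive and negative graphs (with repeats)
batch_ids = np.array([2, 0, 2, 4, 0])

entity_id = np.unique(batch_ids)      # deduplicate before pulling
global_entity_id = l2g[entity_id]     # IDs understood by the KV server
print(entity_id, global_entity_id)    # [0 2 4] [10 12 14]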
Example #9
    def k_fold_split(dataset, labels, task_id, k=5, log=True):
        """Sort molecules based on their label values for a task and then split them
        for k-fold cross validation by taking consecutive chunks.

        Parameters
        ----------
        dataset
            We assume ``len(dataset)`` gives the size for the dataset, ``dataset[i]``
            gives the ith datapoint and ``dataset.smiles[i]`` gives the SMILES for the
            ith datapoint.
        labels : tensor of shape (N, T)
            Dataset labels for all tasks. N is the number of datapoints and T is the
            number of tasks.
        task_id : int
            Index for the task.
        k : int
            Number of folds to use; should be no smaller than 2. Defaults to 5.
        log : bool
            Whether to print a message at the start of preparing each fold.

        Returns
        -------
        list of 2-tuples
            Each element of the list represents a fold and is a 2-tuple ``(train_set, val_set)``.
            ``train_set`` and ``val_set`` also have ``len(dataset)`` and ``dataset[i]`` behaviors.
        """
        if not isinstance(labels, np.ndarray):
            labels = F.asnumpy(labels)
        task_labels = labels[:, task_id]
        sorted_indices = np.argsort(task_labels).tolist()

        return base_k_fold_split(
            partial(indices_split, indices=sorted_indices), dataset, k, log)
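A short standalone sketch of the split strategy the docstring describes: sort the datapoints by their label for the chosen task, then cut the sorted order into k consecutive chunks, each chunk serving once as the validation fold. The labels and k below are invented, and the chunking is a simplified stand-in for base_k_fold_split:

import numpy as np

labels = np.array([0.9, 0.1, 0.5, 0.3, 0.7, 0.2])   # labels of one task
k = 3

sorted_indices = np.argsort(labels).tolist()         # [1, 5, 3, 2, 4, 0]
fold_size = len(sorted_indices) // k

folds = []
for fold in range(k):
    val = sorted_indices[fold * fold_size:(fold + 1) * fold_size]
    train = [i for i in sorted_indices if i not in val]
    folds.append((train, val))

for train, val in folds:
    print('train:', train, 'val:', val)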
Example #10
    def forward_test(self, pos_g, neg_g, logs, gpu_id=-1):
        pos_g.ndata['emb'] = self.entity_emb(pos_g.ndata['id'], gpu_id, False)
        pos_g.edata['emb'] = self.relation_emb(pos_g.edata['id'], gpu_id,
                                               False)

        batch_size = pos_g.number_of_edges()
        pos_scores = self.predict_score(pos_g)
        pos_scores = reshape(logsigmoid(pos_scores), batch_size, -1)

        neg_scores = self.predict_neg_score(pos_g,
                                            neg_g,
                                            to_device=cuda,
                                            gpu_id=gpu_id,
                                            trace=False)
        neg_scores = reshape(logsigmoid(neg_scores), batch_size, -1)

        # We need to filter the positive edges in the negative graph.
        filter_bias = reshape(neg_g.edata['bias'], batch_size, -1)
        if self.args.gpu >= 0:
            filter_bias = cuda(filter_bias, self.args.gpu)
        neg_scores += filter_bias
        # To compute the rank of a positive edge among all negative edges,
        # we need to know how many negative edges have higher scores than
        # the positive edge.
        rankings = F.sum(neg_scores > pos_scores, dim=1) + 1
        rankings = F.asnumpy(rankings)
        for i in range(batch_size):
            ranking = rankings[i]
            logs.append({
                'MRR': 1.0 / ranking,
                'MR': float(ranking),
                'HITS@1': 1.0 if ranking <= 1 else 0.0,
                'HITS@3': 1.0 if ranking <= 3 else 0.0,
                'HITS@10': 1.0 if ranking <= 10 else 0.0
            })
Example #11
def write_txt_graph(path, file_name, part_dict, total_nodes, total_relations):
    partition_book = [0] * total_nodes
    for part_id in part_dict:
        print('write graph %d...' % part_id)
        # Get (h,r,t) triples
        partition_path = path + str(part_id)
        if not os.path.exists(partition_path):
            os.mkdir(partition_path)
        triple_file = os.path.join(partition_path, file_name)
        f = open(triple_file, 'w')
        graph = part_dict[part_id]
        src, dst = graph.all_edges(form='uv', order='eid')
        rel = graph.edata['tid']
        assert len(src) == len(rel)
        src = F.asnumpy(src)
        dst = F.asnumpy(dst)
        rel = F.asnumpy(rel)
        for i in range(len(src)):
            f.write(
                str(src[i]) + '\t' + str(rel[i]) + '\t' + str(dst[i]) + '\n')
        f.close()
        # Get local2global
        l2g_file = os.path.join(partition_path, 'local_to_global.txt')
        f = open(l2g_file, 'w')
        pid = F.asnumpy(graph.parent_nid)
        for i in range(len(pid)):
            f.write(str(pid[i]) + '\n')
        f.close()
        # Update partition_book
        partition = F.asnumpy(graph.ndata['part_id'])
        for i in range(len(pid)):
            partition_book[pid[i]] = partition[i]
    # Write partition_book.txt
    for part_id in part_dict:
        partition_path = path + str(part_id)
        pb_file = os.path.join(partition_path, 'partition_book.txt')
        f = open(pb_file, 'w')
        for i in range(len(partition_book)):
            f.write(str(partition_book[i]) + '\n')
        f.close()
    # Write relation_count.txt
    for part_id in part_dict:
        partition_path = path + str(part_id)
        rel_count_file = os.path.join(partition_path, 'relation_count.txt')
        f = open(rel_count_file, 'w')
        f.write(str(total_relations) + '\n')
        f.close()
Example #12
def main():
    parser = argparse.ArgumentParser(description='Partition a graph')
    parser.add_argument('--data', required=True, type=str,
                        help='The file path of the input graph in the DGL format.')
    parser.add_argument('-k', '--num-parts', required=True, type=int,
                        help='The number of partitions')
    parser.add_argument('--num-hops', type=int, default=1,
                        help='The number of hops of HALO nodes we include in a partition')
    parser.add_argument('-m', '--method', required=True, type=str,
                        help='The partitioning method: random, metis')
    parser.add_argument('-o', '--output', required=True, type=str,
                        help='The output directory of the partitioned results')
    args = parser.parse_args()
    data_path = args.data
    num_parts = args.num_parts
    num_hops = args.num_hops
    method = args.method
    output = args.output

    glist, _ = load_graphs(data_path)
    g = glist[0]

    if args.method == 'metis':
        part_dict = dgl.transform.metis_partition(g, num_parts, num_hops)
    elif args.method == 'random':
        node_parts = np.random.choice(num_parts, g.number_of_nodes())
        part_dict = dgl.transform.partition_graph_with_halo(g, node_parts, num_hops)
    else:
        raise Exception('unknown partitioning method: ' + args.method)

    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]

        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata['inner_node']))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata['inner_edge']))[0])
        print('part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition'.format(
              part_id, part.number_of_nodes(), part.number_of_edges(),
              num_inner_nodes, num_inner_edges))
        tot_num_inner_edges += num_inner_edges

        # TODO I duplicate some node features.
        part.copy_from_parent()
        save_graphs(output + '/' + str(part_id) + '.dgl', [part])
    print('there are {} edges in the graph and {} edge cuts for {} partitions.'.format(
        g.number_of_edges(), g.number_of_edges() - tot_num_inner_edges, len(part_dict)))
Example #13
def get_partition_list(g, psize):
    p_gs = metis_partition(g, psize)
    graphs = []
    for k, val in p_gs.items():
        nids = val.ndata[dgl.NID]
        nids = F.asnumpy(nids)
        graphs.append(nids)
    return graphs
Example #14
def segmented_knn_graph(x, k, segs):
    """Transforms the given point set to a directed graph, whose coordinates
    are given as a matrix.  The predecessors of each point are its k-nearest
    neighbors.

    The matrices are concatenated along the first axis, and are segmented by
    ``segs``.  Each block would be transformed into a separate graph.  The
    graphs will be unioned.

    Parameters
    ----------
    x : Tensor
        The input tensor.
    k : int
        The number of neighbors
    segs : iterable of int
        Number of points of each point set.
        Must sum up to the number of rows in ``x``.

    Returns
    -------
    DGLGraph
        The graph.  The node IDs are in the same order as ``x``.
    """
    n_total_points, _ = F.shape(x)
    offset = np.insert(np.cumsum(segs), 0, 0)

    h_list = F.split(x, segs, 0)
    dst = [
        F.argtopk(pairwise_squared_distance(h_g), k, 1, descending=False) +
        offset[i] for i, h_g in enumerate(h_list)
    ]
    dst = F.cat(dst, 0)
    src = F.arange(0, n_total_points).unsqueeze(1).expand(n_total_points, k)

    dst = F.reshape(dst, (-1, ))
    src = F.reshape(src, (-1, ))
    # Specify the shape explicitly so the adjacency matrix covers all n_total_points nodes.
    adj = sparse.csr_matrix(
        (F.asnumpy(F.zeros_like(dst) + 1), (F.asnumpy(dst), F.asnumpy(src))),
        shape=(n_total_points, n_total_points))

    g = DGLGraph(adj, readonly=True)
    return g
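The explicit shape argument matters because SciPy otherwise infers the matrix size from the largest index it sees, which can be smaller than the number of nodes. A tiny sketch with invented indices showing the difference:

import numpy as np
from scipy import sparse

n_total_points = 5
# toy edge list in which node 4 never appears as an endpoint
dst = np.array([0, 1, 2])
src = np.array([1, 2, 3])
vals = np.ones(len(dst))

inferred = sparse.csr_matrix((vals, (dst, src)))
explicit = sparse.csr_matrix((vals, (dst, src)),
                             shape=(n_total_points, n_total_points))
print(inferred.shape)   # (3, 4) -- too small, node 4 is not covered
print(explicit.shape)   # (5, 5)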
Example #15
    def check(self, eval_type):
        edges = self.get_edges(eval_type)
        subg = self.g.edge_subgraph(edges)
        if eval_type == 'valid':
            data = self.valid
        elif eval_type == 'test':
            data = self.test

        subg.copy_from_parent()
        src, dst, eid = subg.all_edges('all', order='eid')
        src_id = subg.ndata['id'][src]
        dst_id = subg.ndata['id'][dst]
        etype = subg.edata['id'][eid]

        orig_src = np.array([t[0] for t in data])
        orig_etype = np.array([t[1] for t in data])
        orig_dst = np.array([t[2] for t in data])
        np.testing.assert_equal(F.asnumpy(src_id), orig_src)
        np.testing.assert_equal(F.asnumpy(dst_id), orig_dst)
        np.testing.assert_equal(F.asnumpy(etype), orig_etype)
Example #16
    def pull(self, name, ID):
        """Pull sparse message from KVServer

        Note that we assume the row IDs in ID are in ascending order.

        Parameters
        ----------
        name : str
            data name
        ID : tensor (mx.ndarray or torch.tensor)
            a vector storing the IDs

        Returns
        -------
        tensor
            a tensor with the same number of rows as ID
        """
        assert F.ndim(ID) == 1, 'ID must be a vector.'
        group_size = [0] * self._server_count
        numpy_id = F.asnumpy(ID)
        count = math.ceil(self._data_size[name] / self._server_count)
        server_id = numpy_id // count  # each server owns a contiguous block of 'count' rows
        id_list, id_count = np.unique(server_id, return_counts=True)
        for idx in range(len(id_list)):
            group_size[int(id_list[idx])] += id_count[idx]
        min_idx = 0
        max_idx = 0
        server_count = 0
        for idx in range(self._server_count):
            if group_size[idx] == 0:
                continue
            server_count += 1
            max_idx += group_size[idx]
            range_id = ID[min_idx:max_idx]
            min_idx = max_idx
            msg = KVStoreMsg(type=KVMsgType.PULL,
                             rank=self._client_id,
                             name=name,
                             id=range_id,
                             data=None)
            _send_kv_msg(self._sender, msg, idx)
        # Recv back message
        msg_list = []
        for idx in range(self._server_count):
            if group_size[idx] == 0:
                continue
            msg = _recv_kv_msg(self._receiver)
            assert msg.type == KVMsgType.PULL_BACK, 'Recv kv msg error.'
            msg_list.append(msg)

        return self._merge_msg(msg_list)
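The server assignment used above is a range partition: every server owns a contiguous block of ceil(data_size / server_count) rows, so a global ID's server is its ID divided by the block size. A small sketch of grouping a sorted ID vector into per-server ranges (sizes below are invented):

import math
import numpy as np

data_size, server_count = 10, 3
ids = np.array([0, 2, 3, 4, 7, 9])                 # assumed sorted ascending

count = math.ceil(data_size / server_count)        # rows per server (4 here)
server_id = ids // count                           # [0 0 0 1 1 2]

group_size = [0] * server_count
uniq, uniq_count = np.unique(server_id, return_counts=True)
for s, c in zip(uniq, uniq_count):
    group_size[int(s)] += int(c)

start = 0
for srv in range(server_count):
    if group_size[srv] == 0:
        continue
    end = start + group_size[srv]
    print('server', srv, 'gets ids', ids[start:end])
    start = end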
Example #17
    def push(self, name, ID, data):
        """Push sparse message to KVServer

        The push() API automatically partitions the message across
        different KVServer nodes.

        Note that we assume the row IDs in ID are in ascending order.

        Parameters
        ----------
        name : str
            data name
        ID : tensor (mx.ndarray or torch.tensor)
            a vector storing the global IDs
        data : tensor (mx.ndarray or torch.tensor)
            a tensor with the same number of rows as ID
        """
        assert F.ndim(ID) == 1, 'ID must be a vector.'
        assert F.shape(ID)[0] == F.shape(
            data)[0], 'The data must have the same number of rows as the ID.'
        group_size = [0] * self._server_count
        numpy_id = F.asnumpy(ID)
        count = math.ceil(self._data_size[name] / self._server_count)
        server_id = numpy_id // count  # each server owns a contiguous block of 'count' rows
        id_list, id_count = np.unique(server_id, return_counts=True)
        for idx in range(len(id_list)):
            group_size[int(id_list[idx])] += id_count[idx]
        min_idx = 0
        max_idx = 0
        for idx in range(self._server_count):
            if group_size[idx] == 0:
                continue
            max_idx += group_size[idx]
            range_id = ID[min_idx:max_idx]
            range_data = data[min_idx:max_idx]
            min_idx = max_idx
            msg = KVStoreMsg(type=KVMsgType.PUSH,
                             rank=self._client_id,
                             name=name,
                             id=range_id,
                             data=range_data)
            _send_kv_msg(self._sender, msg, idx)
Example #18
    def forward_test_wikikg(self,
                            query,
                            ans,
                            candidate,
                            mode,
                            logs,
                            gpu_id=-1):
        """Do the forward and generate ranking results.

        Parameters
        ----------
        query : Tensor
            input head and relation pairs for test or validation
        ans : Tensor
            the correct tail entity index
        candidate : Tensor
            negatively sampled tail entities
        mode : str
            'Valid' computes ranking metrics; otherwise the top-10 predictions are logged
        logs : List
            Where to append the results
        gpu_id : int
            Which GPU to use for the calculation; if -1 is provided, the CPU is used
        """
        scores = self.predict_score_wikikg(query,
                                           candidate,
                                           mode,
                                           to_device=cuda,
                                           gpu_id=gpu_id,
                                           trace=False)
        if mode == "Valid":
            batch_size = query.shape[0]
            neg_scores = reshape(scores, batch_size, -1)
            for i in range(batch_size):
                ranking = F.asnumpy(
                    F.sum(neg_scores[i] >= neg_scores[i][ans[i]], dim=0) + 1)
                logs.append({
                    'MRR': 1.0 / ranking,
                    'MR': float(ranking),
                    'HITS@1': 1.0 if ranking <= 1 else 0.0,
                    'HITS@3': 1.0 if ranking <= 3 else 0.0,
                    'HITS@10': 1.0 if ranking <= 10 else 0.0
                })
        else:
            argsort = F.argsort(scores, dim=1, descending=True)
            logs.append(argsort[:, :10])
Example #19
    def process_raw_tuples(self, raw_rdf_graphs):
        triplets = OrderedDict()
        mg = nx.MultiDiGraph()
        ent_classes = OrderedDict()
        rel_classes = OrderedDict()
        entities = OrderedDict()
        id2entity = {}
        labels = OrderedDict()
        id2label = {}
        dataset_pairs = []

        src = []
        dst = []
        ntid = []
        etid = []

        mutag_graph = raw_rdf_graphs[0]
        ts = []
        # make triplets sorted each time we load the graph
        for t in mutag_graph:
            ts.append(t)
        ts.sort()
        
        for (sbj, pred, obj) in ts:
            if pred in triplets:
                triplets[pred].append((sbj, pred, obj))
            else:
                triplets[pred] = []
                triplets[pred].append((sbj, pred, obj))

        for key, triples in triplets.items():
            if key == self.is_mutagenic:
                continue
            for (sbj, pred, obj) in triples:
                sbjent = self.parse_sbj(sbj)
                rel = self.parse_pred(pred)
                objent = self.parse_obj(obj)

                processed = self.process_tuples(sbjent, rel, objent)
                if processed is None:
                    # ignored
                    continue

                sbjclsid = _get_id(ent_classes, sbjent.n_type)
                objclsid = _get_id(ent_classes, objent.n_type)
                relclsid = _get_id(rel_classes, rel.r_type)
                mg.add_edge(sbjent.n_type, objent.n_type, key=rel.r_type)
                if self._insert_reverse:
                    mg.add_edge(objent.n_type, sbjent.n_type, key='rev-%s' % rel.r_type)
                # instance graph
                src_id = _get_id(entities, str(sbjent))
                _map_object(id2entity, src_id, sbjent)
                if len(entities) > len(ntid):  # found new entity
                    ntid.append(sbjclsid)
                dst_id = _get_id(entities, str(objent))
                _map_object(id2entity, dst_id, objent)  # map the destination entity to its own ID
                if len(entities) > len(ntid):  # found new entity
                    ntid.append(objclsid)
                src.append(src_id)
                dst.append(dst_id)
                etid.append(relclsid)

        # handle label
        is_mutagenic_triplets = triplets[self.is_mutagenic]
        for (sbj, pred, obj) in is_mutagenic_triplets:
            #print("{} {} {}".format(sbj, pred, obj))
            sbj_id = _get_id(entities, str(self.parse_sbj(sbj)))
            label = _get_id(labels, str(obj))
            _map_object(id2label, label, obj)
            dataset_pairs.append((sbj_id, label))

        src = np.array(src)
        dst = np.array(dst)
        ntid = np.array(ntid)
        etid = np.array(etid)
        ntypes = list(ent_classes.keys())
        etypes = list(rel_classes.keys())

        # add reverse edge with reverse relation
        if self._insert_reverse:
            print('Adding reverse edges ...')
            newsrc = np.hstack([src, dst])
            newdst = np.hstack([dst, src])
            src = newsrc
            dst = newdst
            etid = np.hstack([etid, etid + len(etypes)])
            etypes.extend(['rev-%s' % t for t in etypes])

        self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)
        # get global to subgraph local id mapping
        idmap = F.asnumpy(self.graph.nodes[self.predict_category].data[dgl.NID])
        glb2lcl = {glbid : lclid for lclid, glbid in enumerate(idmap)}
        lcl2glb = {lclid : glbid for lclid, glbid in enumerate(idmap)}
        self.split_dataset(dataset_pairs, labels, glb2lcl)
        self.lcl2glb = lcl2glb
        self.id2entity = id2entity
        self.id2label = id2label
Example #20
def _check_topk_score2(score_model, g, num_entity, num_rels, exclude_mode):
    hidden_dim = 32
    num_entity = 40
    num_rels = 4
    with tempfile.TemporaryDirectory() as tmpdirname:
        entity_emb, rel_emb = generate_rand_emb(score_model.model_name, num_entity, num_rels, hidden_dim, 'none')
        create_emb_file(Path(tmpdirname), 'entity.npy', entity_emb.numpy())
        create_emb_file(Path(tmpdirname), 'relation.npy', rel_emb.numpy())

        score_model.load(Path(tmpdirname))
        score_model.attach_graph(g)
        score_func = score_model._score_func

    head = F.arange(0, num_entity // 2)
    rel = F.arange(0, num_rels)
    tail = F.arange(num_entity // 2, num_entity)

    # exec_mode == 'triplet_wise'
    tw_rel = np.random.randint(0, num_rels, num_entity // 2)
    tw_rel = F.tensor(tw_rel)
    result1 = score_model.link_predict(head, tw_rel, tail, exec_mode='triplet_wise', exclude_mode=exclude_mode, batch_size=16)
    assert len(result1) == 1
    scores = []
    head_ids = []
    rel_ids = []
    tail_ids = []
    for i in range(head.shape[0]):
        hemb = F.take(entity_emb, head[i], 0)
        remb = F.take(rel_emb, tw_rel[i], 0)
        temb = F.unsqueeze(F.take(entity_emb, tail[i], 0), dim=0)
        edge = FakeEdge(hemb, temb, remb)
        score = F.asnumpy(score_func.edge_func(edge)['score'])
        scores.append(score)
        head_ids.append(F.asnumpy(head[i]))
        rel_ids.append(F.asnumpy(tw_rel[i]))
        tail_ids.append(F.asnumpy(tail[i]))
    scores = np.asarray(scores)
    scores = scores.reshape(scores.shape[0])
    head_ids = np.asarray(head_ids)
    rel_ids = np.asarray(rel_ids)
    tail_ids = np.asarray(tail_ids)
    idx = np.argsort(scores)
    idx = idx[::-1]
    if exclude_mode is None or exclude_mode == 'mask':
        idx = idx[:10]
        head_ids = head_ids[idx]
        rel_ids = rel_ids[idx]
        tail_ids = tail_ids[idx]
        score_topk = scores[idx]
        if exclude_mode == 'mask':
            mask = np.zeros((10,))
            for i in range(10):
                if (head_ids[i] + 1) % num_entity == tail_ids[i] or \
                    (head_ids[i] - 1) % num_entity == tail_ids[i]:
                    mask[i] = 1
    else:
        c_head_idx = []
        c_rel_idx = []
        c_tail_idx = []
        c_score_topk = []
        cur_idx = 0
        while len(c_head_idx) < 10:
            c_idx = idx[cur_idx]
            cur_idx += 1
            if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \
                (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]:
                continue
            c_head_idx.append(head_ids[c_idx])
            c_tail_idx.append(tail_ids[c_idx])
            c_rel_idx.append(rel_ids[c_idx])
            c_score_topk.append(scores[c_idx])
        head_ids = F.tensor(c_head_idx)
        rel_ids = F.tensor(c_rel_idx)
        tail_ids = F.tensor(c_tail_idx)
        score_topk = F.tensor(c_score_topk)

    r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[0]
    np.testing.assert_allclose(r1_head, head_ids)
    np.testing.assert_allclose(r1_rel, rel_ids)
    np.testing.assert_allclose(r1_tail, tail_ids)
    np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
    if exclude_mode == 'mask':
        np.testing.assert_allclose(r1_mask, mask)
    else:
        assert r1_mask is None

    # exec_mode == 'all'
    result1 = score_model.link_predict(head, rel, tail, topk=20, exclude_mode=exclude_mode, batch_size=16)
    result2 = score_model.link_predict(head=head, tail=tail, topk=20, exclude_mode=exclude_mode, batch_size=16)
    assert len(result1) == 1
    assert len(result2) == 1

    scores = []
    head_ids = []
    rel_ids = []
    tail_ids = []
    for i in range(head.shape[0]):
        for j in range(rel.shape[0]):
            for k in range(tail.shape[0]):
                hemb = F.take(entity_emb, head[i], 0)
                remb = F.take(rel_emb, rel[j], 0)
                temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = F.asnumpy(score_func.edge_func(edge)['score'])
                scores.append(score)
                head_ids.append(F.asnumpy(head[i]))
                rel_ids.append(F.asnumpy(rel[j]))
                tail_ids.append(F.asnumpy(tail[k]))

    scores = np.asarray(scores)
    scores = scores.reshape(scores.shape[0])
    head_ids = np.asarray(head_ids)
    rel_ids = np.asarray(rel_ids)
    tail_ids = np.asarray(tail_ids)
    idx = np.argsort(scores)
    idx = idx[::-1]
    if exclude_mode is None or exclude_mode == 'mask':
        idx = idx[:20]
        head_ids = head_ids[idx]
        rel_ids = rel_ids[idx]
        tail_ids = tail_ids[idx]
        score_topk = scores[idx]
        if exclude_mode == 'mask':
            mask = np.zeros((20,))
            for i in range(20):
                if (head_ids[i] + 1) % num_entity == tail_ids[i] or \
                    (head_ids[i] - 1) % num_entity == tail_ids[i]:
                    mask[i] = 1
    else:
        c_head_idx = []
        c_rel_idx = []
        c_tail_idx = []
        c_score_topk = []
        cur_idx = 0
        while len(c_head_idx) < 20:
            c_idx = idx[cur_idx]
            cur_idx += 1
            if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \
                (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]:
                continue
            c_head_idx.append(head_ids[c_idx])
            c_tail_idx.append(tail_ids[c_idx])
            c_rel_idx.append(rel_ids[c_idx])
            c_score_topk.append(scores[c_idx])
        head_ids = F.tensor(c_head_idx)
        rel_ids = F.tensor(c_rel_idx)
        tail_ids = F.tensor(c_tail_idx)
        score_topk = F.tensor(c_score_topk)

    r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[0]
    r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[0]
    np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
    np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5)
    np.testing.assert_allclose(r1_head, head_ids)
    np.testing.assert_allclose(r2_head, head_ids)
    np.testing.assert_allclose(r1_rel, rel_ids)
    np.testing.assert_allclose(r2_rel, rel_ids)
    np.testing.assert_allclose(r1_tail, tail_ids)
    np.testing.assert_allclose(r2_tail, tail_ids)
    if exclude_mode == 'mask':
        np.testing.assert_allclose(r1_mask, mask)
        np.testing.assert_allclose(r2_mask, mask)
    else:
        assert r1_mask is None
        assert r2_mask is None

    result1 = score_model.link_predict(head, rel, tail, exec_mode='batch_rel', exclude_mode=exclude_mode, batch_size=16)
    result2 = score_model.link_predict(head=head, tail=tail, exec_mode='batch_rel', exclude_mode=exclude_mode, batch_size=16)
    assert len(result1) == num_rels
    assert len(result2) == num_rels
    for j in range(rel.shape[0]):
        scores = []
        head_ids = []
        rel_ids = []
        tail_ids = []
        for i in range(head.shape[0]):
            for k in range(tail.shape[0]):
                hemb = F.take(entity_emb, head[i], 0)
                remb = F.take(rel_emb, rel[j], 0)
                temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = F.asnumpy(score_func.edge_func(edge)['score'])
                scores.append(score)
                head_ids.append(F.asnumpy(head[i]))
                rel_ids.append(F.asnumpy(rel[j]))
                tail_ids.append(F.asnumpy(tail[k]))

        scores = np.asarray(scores)
        scores = scores.reshape(scores.shape[0])
        head_ids = np.asarray(head_ids)
        rel_ids = np.asarray(rel_ids)
        tail_ids = np.asarray(tail_ids)
        idx = np.argsort(scores)
        idx = idx[::-1]
        if exclude_mode is None or exclude_mode == 'mask':
            idx = idx[:10]
            head_ids = head_ids[idx]
            rel_ids = rel_ids[idx]
            tail_ids = tail_ids[idx]
            score_topk = scores[idx]
            if exclude_mode == 'mask':
                mask = np.full((10,), False)
                for i in range(10):
                    if (head_ids[i] + 1) % num_entity == tail_ids[i] or \
                        (head_ids[i] - 1) % num_entity == tail_ids[i]:
                        mask[i] = True
        else:
            c_head_idx = []
            c_rel_idx = []
            c_tail_idx = []
            c_score_topk = []
            cur_idx = 0
            while len(c_head_idx) < 10:
                c_idx = idx[cur_idx]
                cur_idx += 1
                if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \
                    (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]:
                    continue
                c_head_idx.append(head_ids[c_idx])
                c_tail_idx.append(tail_ids[c_idx])
                c_rel_idx.append(rel_ids[c_idx])
                c_score_topk.append(scores[c_idx])
            head_ids = F.tensor(c_head_idx)
            rel_ids = F.tensor(c_rel_idx)
            tail_ids = F.tensor(c_tail_idx)
            score_topk = F.tensor(c_score_topk)

        r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[j]
        r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[j]
        np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(r1_head, head_ids)
        np.testing.assert_allclose(r2_head, head_ids)
        np.testing.assert_allclose(r1_rel, rel_ids)
        np.testing.assert_allclose(r2_rel, rel_ids)
        np.testing.assert_allclose(r1_tail, tail_ids)
        np.testing.assert_allclose(r2_tail, tail_ids)
        if exclude_mode == 'mask':
            np.testing.assert_allclose(r1_mask, mask)
            np.testing.assert_allclose(r2_mask, mask)
        else:
            assert r1_mask is None
            assert r2_mask is None


    head = F.arange(0, num_entity)
    rel = F.arange(0, num_rels)
    tail = F.arange(0, num_entity)
    result1 = score_model.link_predict(head, rel, tail, exec_mode='batch_head', exclude_mode=exclude_mode, batch_size=16)
    result2 = score_model.link_predict(exec_mode='batch_head', exclude_mode=exclude_mode, batch_size=16)
    assert len(result1) == num_entity
    assert len(result2) == num_entity

    for i in range(head.shape[0]):
        scores = []
        head_ids = []
        rel_ids = []
        tail_ids = []
        for j in range(rel.shape[0]):
            for k in range(tail.shape[0]):
                hemb = F.take(entity_emb, head[i], 0)
                remb = F.take(rel_emb, rel[j], 0)
                temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = F.asnumpy(score_func.edge_func(edge)['score'])
                scores.append(score)
                head_ids.append(F.asnumpy(head[i]))
                rel_ids.append(F.asnumpy(rel[j]))
                tail_ids.append(F.asnumpy(tail[k]))

        scores = np.asarray(scores)
        scores = scores.reshape(scores.shape[0])
        head_ids = np.asarray(head_ids)
        rel_ids = np.asarray(rel_ids)
        tail_ids = np.asarray(tail_ids)
        idx = np.argsort(scores)
        idx = idx[::-1]
        if exclude_mode is None or exclude_mode == 'mask':
            idx = idx[:10]
            head_ids = head_ids[idx]
            rel_ids = rel_ids[idx]
            tail_ids = tail_ids[idx]
            score_topk = scores[idx]
            if exclude_mode == 'mask':
                mask = np.full((10,), False)
                for l in range(10):
                    if (head_ids[l] + 1) % num_entity == tail_ids[l] or \
                        (head_ids[l] - 1) % num_entity == tail_ids[l]:
                        mask[l] = True
        else:
            c_head_idx = []
            c_rel_idx = []
            c_tail_idx = []
            c_score_topk = []
            cur_idx = 0
            while len(c_head_idx) < 10:
                c_idx = idx[cur_idx]
                cur_idx += 1
                if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \
                    (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]:
                    continue
                c_head_idx.append(head_ids[c_idx])
                c_tail_idx.append(tail_ids[c_idx])
                c_rel_idx.append(rel_ids[c_idx])
                c_score_topk.append(scores[c_idx])
            head_ids = F.tensor(c_head_idx)
            rel_ids = F.tensor(c_rel_idx)
            tail_ids = F.tensor(c_tail_idx)
            score_topk = F.tensor(c_score_topk)

        r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[i]
        r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[i]
        np.testing.assert_allclose(r1_head, head_ids)
        np.testing.assert_allclose(r2_head, head_ids)
        np.testing.assert_allclose(r1_rel, rel_ids)
        np.testing.assert_allclose(r2_rel, rel_ids)
        np.testing.assert_allclose(r1_tail, tail_ids)
        np.testing.assert_allclose(r2_tail, tail_ids)
        np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5)
        if exclude_mode == 'mask':
            np.testing.assert_allclose(r1_mask, mask)
            np.testing.assert_allclose(r2_mask, mask)
        else:
            assert r1_mask is None
            assert r2_mask is None

    result1 = score_model.link_predict(head, rel, tail, exec_mode='batch_tail', exclude_mode=exclude_mode)
    result2 = score_model.link_predict(exec_mode='batch_tail', exclude_mode=exclude_mode)
    assert len(result1) == num_entity
    assert len(result2) == num_entity
    for k in range(tail.shape[0]):
        scores = []
        head_ids = []
        rel_ids = []
        tail_ids = []
        for i in range(head.shape[0]):
            for j in range(rel.shape[0]):
                hemb = F.take(entity_emb, head[i], 0)
                remb = F.take(rel_emb, rel[j], 0)
                temb = F.unsqueeze(F.take(entity_emb, tail[k], 0), dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = F.asnumpy(score_func.edge_func(edge)['score'])
                scores.append(score)
                head_ids.append(F.asnumpy(head[i]))
                rel_ids.append(F.asnumpy(rel[j]))
                tail_ids.append(F.asnumpy(tail[k]))

        scores = np.asarray(scores)
        scores = scores.reshape(scores.shape[0])
        head_ids = np.asarray(head_ids)
        rel_ids = np.asarray(rel_ids)
        tail_ids = np.asarray(tail_ids)
        idx = np.argsort(scores)
        idx = idx[::-1]
        if exclude_mode is None or exclude_mode == 'mask':
            idx = idx[:10]
            head_ids = head_ids[idx]
            rel_ids = rel_ids[idx]
            tail_ids = tail_ids[idx]
            score_topk = scores[idx]
            if exclude_mode == 'mask':
                mask = np.full((10,), False)
                for l in range(10):
                    if (head_ids[l] + 1) % num_entity == tail_ids[l] or \
                        (head_ids[l] - 1) % num_entity == tail_ids[l]:
                        mask[l] = True
        else:
            c_head_idx = []
            c_rel_idx = []
            c_tail_idx = []
            c_score_topk = []
            cur_idx = 0
            while len(c_head_idx) < 10:
                c_idx = idx[cur_idx]
                cur_idx += 1
                if (head_ids[c_idx] + 1) % num_entity == tail_ids[c_idx] or \
                    (head_ids[c_idx] - 1) % num_entity == tail_ids[c_idx]:
                    continue
                c_head_idx.append(head_ids[c_idx])
                c_tail_idx.append(tail_ids[c_idx])
                c_rel_idx.append(rel_ids[c_idx])
                c_score_topk.append(scores[c_idx])
            head_ids = F.tensor(c_head_idx)
            rel_ids = F.tensor(c_rel_idx)
            tail_ids = F.tensor(c_tail_idx)
            score_topk = F.tensor(c_score_topk)

        r1_head, r1_rel, r1_tail, r1_score, r1_mask = result1[k]
        r2_head, r2_rel, r2_tail, r2_score, r2_mask = result2[k]
        np.testing.assert_allclose(r1_head, head_ids)
        np.testing.assert_allclose(r2_head, head_ids)
        np.testing.assert_allclose(r1_rel, rel_ids)
        np.testing.assert_allclose(r2_rel, rel_ids)
        np.testing.assert_allclose(r1_tail, tail_ids)
        np.testing.assert_allclose(r2_tail, tail_ids)
        np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5)
        if exclude_mode == 'mask':
            np.testing.assert_allclose(r1_mask, mask)
            np.testing.assert_allclose(r2_mask, mask)
        else:
            assert r1_mask is None
            assert r2_mask is None
Example #21
def run_topk_emb(sfunc, sim_func, create_emb_sim=create_kge_emb_sim):
    hidden_dim = 32
    num_head = 40
    num_tail = 40
    num_emb = 80

    emb = F.uniform((num_emb, hidden_dim), F.float32, F.cpu(), -1, 1)
    head = F.arange(0, num_head)
    tail = F.arange(num_head, num_head+num_tail)
    sim_infer = create_emb_sim(emb, sfunc)

    result1 = sim_infer.topK(head, tail, pair_ws=True)
    scores = []
    head_ids = []
    tail_ids = []
    for i in range(head.shape[0]):
        j = i
        hemb = F.take(emb, head[i], 0)
        temb = F.take(emb, tail[j], 0)

        score = sim_func(hemb, temb)
        scores.append(F.asnumpy(score))
        head_ids.append(F.asnumpy(head[i]))
        tail_ids.append(F.asnumpy(tail[j]))
    scores = np.asarray(scores)
    scores = scores.reshape(scores.shape[0])
    head_ids = np.asarray(head_ids)
    tail_ids = np.asarray(tail_ids)
    idx = np.argsort(scores)
    idx = idx[::-1]
    idx = idx[:10]
    head_ids = head_ids[idx]
    tail_ids = tail_ids[idx]
    score_topk = scores[idx]

    r1_head, r1_tail, r1_score = result1[0]
    np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
    np.testing.assert_allclose(r1_head, head_ids)
    np.testing.assert_allclose(r1_tail, tail_ids)
    print('pass pair wise')

    head = F.arange(0, num_head)
    tail = F.arange(num_head, num_head+num_tail)
    result1 = sim_infer.topK(head, tail)
    assert len(result1) == 1
    scores = []
    head_ids = []
    tail_ids = []
    for i in range(head.shape[0]):
        for j in range(tail.shape[0]):
            hemb = F.take(emb, head[i], 0)
            temb = F.take(emb, tail[j], 0)

            score = sim_func(hemb, temb)
            scores.append(F.asnumpy(score))
            head_ids.append(F.asnumpy(head[i]))
            tail_ids.append(F.asnumpy(tail[j]))
    scores = np.asarray(scores)
    scores = scores.reshape(scores.shape[0])
    head_ids = np.asarray(head_ids)
    tail_ids = np.asarray(tail_ids)
    idx = np.argsort(scores)
    idx = idx[::-1]
    idx = idx[:10]
    head_ids = head_ids[idx]
    tail_ids = tail_ids[idx]
    score_topk = scores[idx]

    r1_head, r1_tail, r1_score = result1[0]
    np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
    np.testing.assert_allclose(r1_head, head_ids)
    np.testing.assert_allclose(r1_tail, tail_ids)

    emb_ids = F.arange(0, num_emb)
    result1 = sim_infer.topK(emb_ids, emb_ids, bcast=True)
    result2 = sim_infer.topK(bcast=True)
    assert len(result1) == emb_ids.shape[0]
    assert len(result2) == emb_ids.shape[0]

    for i in range(emb_ids.shape[0]):
        scores = []
        head_ids = []
        tail_ids = []
        for j in range(emb_ids.shape[0]):
            hemb = F.take(emb, emb_ids[i], 0)
            temb = F.take(emb, emb_ids[j], 0)

            score = sim_func(hemb, temb)
            score = F.asnumpy(score)
            scores.append(score)
            tail_ids.append(F.asnumpy(emb_ids[j]))
        scores = np.asarray(scores)
        scores = scores.reshape(scores.shape[0])
        tail_ids = np.asarray(tail_ids)
        idx = np.argsort(scores)
        idx = idx[::-1]
        idx = idx[:10]
        head_ids = np.full((10,), F.asnumpy(emb_ids[i]))
        tail_ids = tail_ids[idx]
        score_topk = scores[idx]

        r1_head, r1_tail, r1_score = result1[i]
        np.testing.assert_allclose(r1_score, score_topk, rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(r1_head, head_ids)
        np.testing.assert_allclose(r1_tail, tail_ids)
        r2_head, r2_tail, r2_score = result2[i]
        np.testing.assert_allclose(r2_score, score_topk, rtol=1e-5, atol=1e-5)
        np.testing.assert_allclose(r2_head, head_ids)
        np.testing.assert_allclose(r2_tail, tail_ids)
    print('pass all')
Example #22
def main():
    parser = argparse.ArgumentParser(description="Partition a knowledge graph")
    parser.add_argument(
        "--data_path",
        type=str,
        default="data",
        help="The path of the directory where DGL-KE loads knowledge graph data.",
    )
    parser.add_argument(
        "--dataset", type=str, default="FB15k", help="dataset name, under data_path"
    )
    parser.add_argument(
        "--data_files",
        type=str,
        default=None,
        nargs="+",
        help="A list of data file names. This is used if users want to train KGE"
        "on their own datasets. If the format is raw_udd_{htr},"
        "users need to provide train_file [valid_file] [test_file]."
        "If the format is udd_{htr}, users need to provide"
        "entity_file relation_file train_file [valid_file] [test_file]."
        "In both cases, valid_file and test_file are optional.",
    )
    parser.add_argument(
        "--delimiter",
        type=str,
        default="\t",
        help="Delimiter used in data files. Note all files should use the same delimiter.",
    )
    parser.add_argument(
        "--format",
        type=str,
        default="built_in",
        help="The format of the dataset. For builtin knowledge graphs,"
        "the foramt should be built_in. For users own knowledge graphs,"
        "it needs to be raw_udd_{htr} or udd_{htr}.",
    )
    parser.add_argument(
        "-k", "--num-parts", required=True, type=int, help="The number of partitions"
    )
    args = parser.parse_args()
    num_parts = args.num_parts

    print("load dataset..")

    # load dataset and samplers
    dataset = get_dataset(
        args.data_path, args.dataset, args.format, args.delimiter, args.data_files
    )

    print("construct graph...")

    src, etype_id, dst = dataset.train
    coo = sp.sparse.coo_matrix(
        (np.ones(len(src)), (src, dst)), shape=[dataset.n_entities, dataset.n_entities]
    )
    g = dgl.DGLGraph(coo, readonly=True, multigraph=True, sort_csr=True)
    g.edata["tid"] = F.tensor(etype_id, F.int64)

    print("partition graph...")

    part_dict = dgl.transform.metis_partition(g, num_parts, 1)

    tot_num_inner_edges = 0
    for part_id in part_dict:
        part = part_dict[part_id]

        num_inner_nodes = len(np.nonzero(F.asnumpy(part.ndata["inner_node"]))[0])
        num_inner_edges = len(np.nonzero(F.asnumpy(part.edata["inner_edge"]))[0])
        print(
            "part {} has {} nodes and {} edges. {} nodes and {} edges are inside the partition".format(
                part_id,
                part.number_of_nodes(),
                part.number_of_edges(),
                num_inner_nodes,
                num_inner_edges,
            )
        )
        tot_num_inner_edges += num_inner_edges

        part.copy_from_parent()

    print("write graph to txt file...")

    txt_file_graph = os.path.join(args.data_path, args.dataset)
    txt_file_graph = os.path.join(txt_file_graph, "partition_")
    write_txt_graph(
        txt_file_graph, "train.txt", part_dict, g.number_of_nodes(), dataset.n_relations
    )

    print(
        "there are {} edges in the graph and {} edge cuts for {} partitions.".format(
            g.number_of_edges(),
            g.number_of_edges() - tot_num_inner_edges,
            len(part_dict),
        )
    )
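
The core of the script above is turning the training triples into a sparse adjacency matrix whose edges carry relation ids, which DGL then partitions with METIS. A minimal, self-contained sketch of just that construction step (toy triples, NumPy/SciPy only; the DGL graph creation and partition calls are omitted here because their exact API depends on the DGL version):

import numpy as np
from scipy.sparse import coo_matrix

# toy training triples: (head entity, relation id, tail entity)
src = np.array([0, 1, 2, 3])
etype_id = np.array([0, 1, 0, 2])
dst = np.array([1, 2, 3, 0])
n_entities = 4

# one entry per edge, duplicates allowed, mirroring the multigraph built above
coo = coo_matrix((np.ones(len(src)), (src, dst)),
                 shape=(n_entities, n_entities))
print(coo.nnz, 'edges over', n_entities, 'entities')
# etype_id would be attached as g.edata['tid'] once the DGL graph is created from coo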
Example #23
File: rdf.py Project: xnuohz/dgl
    def process_raw_tuples(self, raw_tuples, root_path):
        """Processing raw RDF dataset

        Parameters
        ----------
        raw_tuples:
            Raw rdf tuples
        root_path: str
            Root path containing the data
        """
        mg = nx.MultiDiGraph()
        ent_classes = OrderedDict()
        rel_classes = OrderedDict()
        entities = OrderedDict()
        src = []
        dst = []
        ntid = []
        etid = []
        sorted_tuples = []
        for t in raw_tuples:
            sorted_tuples.append(t)
        sorted_tuples.sort()

        for i, (sbj, pred, obj) in enumerate(sorted_tuples):
            if self.verbose and i % self._print_every == 0:
                print('Processed %d tuples, found %d valid tuples.' %
                      (i, len(src)))
            sbjent = self.parse_entity(sbj)
            rel = self.parse_relation(pred)
            objent = self.parse_entity(obj)
            processed = self.process_tuple((sbj, pred, obj), sbjent, rel,
                                           objent)
            if processed is None:
                # ignored
                continue
            # meta graph
            sbjclsid = _get_id(ent_classes, sbjent.cls)
            objclsid = _get_id(ent_classes, objent.cls)
            relclsid = _get_id(rel_classes, rel.cls)
            mg.add_edge(sbjent.cls, objent.cls, key=rel.cls)
            if self._insert_reverse:
                mg.add_edge(objent.cls, sbjent.cls, key='rev-%s' % rel.cls)
            # instance graph
            src_id = _get_id(entities, str(sbjent))
            if len(entities) > len(ntid):  # found new entity
                ntid.append(sbjclsid)
            dst_id = _get_id(entities, str(objent))
            if len(entities) > len(ntid):  # found new entity
                ntid.append(objclsid)
            src.append(src_id)
            dst.append(dst_id)
            etid.append(relclsid)

        src = np.asarray(src)
        dst = np.asarray(dst)
        ntid = np.asarray(ntid)
        etid = np.asarray(etid)
        ntypes = list(ent_classes.keys())
        etypes = list(rel_classes.keys())

        # add reverse edge with reverse relation
        if self._insert_reverse:
            if self.verbose:
                print('Adding reverse edges ...')
            newsrc = np.hstack([src, dst])
            newdst = np.hstack([dst, src])
            src = newsrc
            dst = newdst
            etid = np.hstack([etid, etid + len(etypes)])
            etypes.extend(['rev-%s' % t for t in etypes])

        hg = self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)

        if self.verbose:
            print('Load training/validation/testing split ...')
        idmap = F.asnumpy(hg.nodes[self.predict_category].data[dgl.NID])
        glb2lcl = {glbid: lclid for lclid, glbid in enumerate(idmap)}

        def findidfn(ent):
            if ent not in entities:
                return None
            else:
                return glb2lcl[entities[ent]]

        self._hg = hg
        train_idx, test_idx, labels, num_classes = self.load_data_split(
            findidfn, root_path)

        train_mask = idx2mask(train_idx,
                              self._hg.number_of_nodes(self.predict_category))
        test_mask = idx2mask(test_idx,
                             self._hg.number_of_nodes(self.predict_category))
        labels = F.tensor(labels, F.data_type_dict['int64'])

        train_mask = generate_mask_tensor(train_mask)
        test_mask = generate_mask_tensor(test_mask)
        self._hg.nodes[self.predict_category].data['train_mask'] = train_mask
        self._hg.nodes[self.predict_category].data['test_mask'] = test_mask
        self._hg.nodes[self.predict_category].data['labels'] = labels
        self._num_classes = num_classes

        # save for compatibility
        self._train_idx = F.tensor(train_idx)
        self._test_idx = F.tensor(test_idx)
        self._labels = labels
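
Both RDF loaders in this section rely on a small _get_id helper to map entity and relation class names to consecutive integer ids, and on the "found new entity" check to keep ntid aligned with the entity dictionary. A plausible stand-in for the helper (my own sketch; the actual helper in dgl.data.rdf may differ in detail) plus a tiny demonstration of that bookkeeping:

from collections import OrderedDict

def _get_id(d, key):
    # return the id of key, assigning the next free id on first sight
    # (sketch of the helper; the real implementation may differ)
    if key not in d:
        d[key] = len(d)
    return d[key]

entities = OrderedDict()
ntid = []
src = []
for ent, cls_id in [('Alice', 0), ('Bob', 1), ('Alice', 0)]:
    src.append(_get_id(entities, ent))
    if len(entities) > len(ntid):   # a new entity was just inserted
        ntid.append(cls_id)

print(src)    # [0, 1, 0] -- repeated entities reuse their node id
print(ntid)   # [0, 1] -- one class id per distinct entity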
Example #24
    def process_raw_tuples(self, raw_tuples):
        mg = nx.MultiDiGraph()
        ent_classes = OrderedDict()
        rel_classes = OrderedDict()
        entities = OrderedDict()
        src = []
        dst = []
        ntid = []
        etid = []
        sorted_tuples = []
        for t in raw_tuples:
            sorted_tuples.append(t)
        sorted_tuples.sort()

        for i, (sbj, pred, obj) in enumerate(sorted_tuples):
            if i % self._print_every == 0:
                print('Processed %d tuples, found %d valid tuples.' %
                      (i, len(src)))
            sbjent = self.parse_entity(sbj)
            rel = self.parse_relation(pred)
            objent = self.parse_entity(obj)
            processed = self.process_tuple((sbj, pred, obj), sbjent, rel,
                                           objent)
            if processed is None:
                # ignored
                continue
            # meta graph
            sbjclsid = _get_id(ent_classes, sbjent.cls)
            objclsid = _get_id(ent_classes, objent.cls)
            relclsid = _get_id(rel_classes, rel.cls)
            mg.add_edge(sbjent.cls, objent.cls, key=rel.cls)
            if self._insert_reverse:
                mg.add_edge(objent.cls, sbjent.cls, key='rev-%s' % rel.cls)
            # instance graph
            src_id = _get_id(entities, str(sbjent))
            if len(entities) > len(ntid):  # found new entity
                ntid.append(sbjclsid)
            dst_id = _get_id(entities, str(objent))
            if len(entities) > len(ntid):  # found new entity
                ntid.append(objclsid)
            src.append(src_id)
            dst.append(dst_id)
            etid.append(relclsid)

        src = np.asarray(src)
        dst = np.asarray(dst)
        ntid = np.asarray(ntid)
        etid = np.asarray(etid)
        ntypes = list(ent_classes.keys())
        etypes = list(rel_classes.keys())

        # add reverse edge with reverse relation
        if self._insert_reverse:
            print('Adding reverse edges ...')
            newsrc = np.hstack([src, dst])
            newdst = np.hstack([dst, src])
            src = newsrc
            dst = newdst
            etid = np.hstack([etid, etid + len(etypes)])
            etypes.extend(['rev-%s' % t for t in etypes])

        self.build_graph(mg, src, dst, ntid, etid, ntypes, etypes)

        print('Load training/validation/testing split ...')
        idmap = F.asnumpy(
            self.graph.nodes[self.predict_category].data[dgl.NID])
        glb2lcl = {glbid: lclid for lclid, glbid in enumerate(idmap)}

        def findidfn(ent):
            if ent not in entities:
                return None
            else:
                return glb2lcl[entities[ent]]

        self.load_data_split(findidfn)

        self.save_cache(mg, src, dst, ntid, etid, ntypes, etypes)
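
The _insert_reverse branch in both loaders doubles the edge set by appending a reversed copy of every edge and giving it a new relation type offset by the number of original types. In isolation (toy arrays, NumPy only), the transformation looks like this:

import numpy as np

src = np.array([0, 1, 2])
dst = np.array([1, 2, 0])
etid = np.array([0, 1, 0])
etypes = ['knows', 'likes']

src, dst = np.hstack([src, dst]), np.hstack([dst, src])   # reversed copies appended
etid = np.hstack([etid, etid + len(etypes)])              # 'rev-*' types get new ids
etypes = etypes + ['rev-%s' % t for t in etypes]

print(src, dst)   # [0 1 2 1 2 0] [1 2 0 0 1 2]
print(etid)       # [0 1 0 2 3 2]
print(etypes)     # ['knows', 'likes', 'rev-knows', 'rev-likes']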
Example #25
 def val_mask(self):
     deprecate_property('dataset.val_mask', 'graph.ndata[\'val_mask\']')
     return F.asnumpy(self._graph.ndata['val_mask'])
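
The val_mask property above (and the test_mask one in Example #29 below) follow the same deprecation pattern: a read-only property that warns and then forwards to the data stored on the graph. A minimal sketch of how such a property can be written with the standard warnings module (my own toy class; the real deprecate_property helper in DGL may behave differently):

import warnings

class ToyDataset:
    def __init__(self, ndata):
        self._ndata = ndata   # stands in for self._graph.ndata

    @property
    def val_mask(self):
        # warn once per access, then forward to the graph-side storage
        warnings.warn("dataset.val_mask is deprecated, use graph.ndata['val_mask']",
                      DeprecationWarning)
        return self._ndata['val_mask']

ds = ToyDataset({'val_mask': [True, False, True]})
print(ds.val_mask)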
Example #26
    def train_val_test_split(dataset,
                             labels,
                             task_id,
                             frac_train=0.8,
                             frac_val=0.1,
                             frac_test=0.1,
                             bucket_size=10,
                             random_state=None):
        """Split the dataset into training, validation and test subsets as stated above.

        Parameters
        ----------
        dataset
            We assume ``len(dataset)`` gives the size for the dataset, ``dataset[i]``
            gives the ith datapoint and ``dataset.smiles[i]`` gives the SMILES for the
            ith datapoint.
        labels : tensor of shape (N, T)
            Dataset labels all tasks. N for the number of datapoints and T for the number
            of tasks.
        task_id : int
            Index for the task.
        frac_train : float
            Fraction of data to use for training. By default, we set this to be 0.8, i.e.
            80% of the dataset is used for training.
        frac_val : float
            Fraction of data to use for validation. By default, we set this to be 0.1, i.e.
            10% of the dataset is used for validation.
        frac_test : float
            Fraction of data to use for test. By default, we set this to be 0.1, i.e.
            10% of the dataset is used for test.
        bucket_size : int
            Size of bucket of datapoints. Default to 10.
        random_state : None, int or array_like, optional
            Random seed used to initialize the pseudo-random number generator.
            Can be any integer between 0 and 2**32 - 1 inclusive, an array
            (or other sequence) of such integers, or None (the default).
            If seed is None, then RandomState will try to read data from /dev/urandom
            (or the Windows analogue) if available or seed from the clock otherwise.

        Returns
        -------
        list of length 3
            Subsets for training, validation and test, which also have ``len(dataset)``
            and ``dataset[i]`` behaviors
        """
        train_val_test_sanity_check(frac_train, frac_val, frac_test)

        if random_state is not None:
            np.random.seed(random_state)

        if not isinstance(labels, np.ndarray):
            labels = F.asnumpy(labels)
        task_labels = labels[:, task_id]
        sorted_indices = np.argsort(task_labels)

        train_bucket_cutoff = int(np.round(frac_train * bucket_size))
        val_bucket_cutoff = int(np.round(
            frac_val * bucket_size)) + train_bucket_cutoff

        train_indices, val_indices, test_indices = [], [], []

        while sorted_indices.shape[0] >= bucket_size:
            current_batch, sorted_indices = np.split(sorted_indices,
                                                     [bucket_size])
            shuffled = np.random.permutation(range(bucket_size))
            train_indices.extend(
                current_batch[shuffled[:train_bucket_cutoff]].tolist())
            val_indices.extend(current_batch[
                shuffled[train_bucket_cutoff:val_bucket_cutoff]].tolist())
            test_indices.extend(
                current_batch[shuffled[val_bucket_cutoff:]].tolist())

        # Place rest samples in the training set.
        train_indices.extend(sorted_indices.tolist())

        return [
            Subset(dataset, train_indices),
            Subset(dataset, val_indices),
            Subset(dataset, test_indices)
        ]
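
The split above sorts the datapoints by a task label and then, bucket by bucket, assigns a fixed fraction of each bucket to train/val/test, so every subset covers the full label range. A stripped-down sketch of the bucketing logic (indices only, NumPy only; Subset wrapping and the sanity checks are left out):

import numpy as np

labels = np.random.rand(25)                # one task's labels
bucket_size, frac_train, frac_val = 10, 0.8, 0.1
sorted_indices = np.argsort(labels)

train_cut = int(round(frac_train * bucket_size))           # 8
val_cut = train_cut + int(round(frac_val * bucket_size))   # 9

train_idx, val_idx, test_idx = [], [], []
while sorted_indices.shape[0] >= bucket_size:
    bucket, sorted_indices = np.split(sorted_indices, [bucket_size])
    shuffled = np.random.permutation(bucket_size)
    train_idx.extend(bucket[shuffled[:train_cut]].tolist())
    val_idx.extend(bucket[shuffled[train_cut:val_cut]].tolist())
    test_idx.extend(bucket[shuffled[val_cut:]].tolist())
train_idx.extend(sorted_indices.tolist())   # leftover (<10) samples go to train

print(len(train_idx), len(val_idx), len(test_idx))   # 21 2 2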
Example #27
def check_infer_score(func_name):
    batch_size = 10

    ke_score_func = ke_infer_funcs[func_name]

    # normal
    head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'none')
    if args is None:
        score_func = ke_score_func()
    elif type(args) is tuple:
        score_func = ke_score_func(*list(args))
    else:
        score_func = ke_score_func(args)
    score1 = score_func.infer(head_emb, rel_emb, tail_emb)
    assert(score1.shape[0] == head_emb.shape[0])
    h_score = []
    for i in range(head_emb.shape[0]):
        r_score = []
        for j in range(rel_emb.shape[0]):
            t_score = []
            for k in range(tail_emb.shape[0]):
                hemb = head_emb[i]
                remb = rel_emb[j]
                temb = F.unsqueeze(tail_emb[k], dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = score_func.edge_func(edge)['score']
                t_score.append(F.asnumpy(score))
            r_score.append(t_score)
        h_score.append(r_score)
    score2 = np.asarray(h_score).reshape(head_emb.shape[0], rel_emb.shape[0], tail_emb.shape[0])
    np.testing.assert_allclose(F.asnumpy(score1), score2,
                                   rtol=1e-5, atol=1e-5)

    # bcast head
    head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'head')
    if args is None:
        score_func = ke_score_func()
    elif type(args) is tuple:
        score_func = ke_score_func(*list(args))
    else:
        score_func = ke_score_func(args)
    score1 = score_func.infer(head_emb, rel_emb, tail_emb)
    assert(score1.shape[0] == head_emb.shape[0])
    h_score = []
    for i in range(head_emb.shape[0]):
        r_score = []
        for j in range(rel_emb.shape[0]):
            t_score = []
            for k in range(tail_emb.shape[0]):
                hemb = head_emb[i]
                remb = rel_emb[j]
                temb = F.unsqueeze(tail_emb[k], dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = score_func.edge_func(edge)['score']
                t_score.append(F.asnumpy(score))
            r_score.append(t_score)
        h_score.append(r_score)
    score2 = np.asarray(h_score).reshape(1, rel_emb.shape[0], tail_emb.shape[0])
    np.testing.assert_allclose(F.asnumpy(score1), score2,
                                   rtol=1e-5, atol=1e-5)

    # bcast rel
    head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'rel')
    if args is None:
        score_func = ke_score_func()
    elif type(args) is tuple:
        score_func = ke_score_func(*list(args))
    else:
        score_func = ke_score_func(args)
    score1 = score_func.infer(head_emb, rel_emb, tail_emb)
    assert(score1.shape[0] == head_emb.shape[0])
    h_score = []
    for i in range(head_emb.shape[0]):
        r_score = []
        for j in range(rel_emb.shape[0]):
            t_score = []
            for k in range(tail_emb.shape[0]):
                hemb = head_emb[i]
                remb = rel_emb[j]
                temb = F.unsqueeze(tail_emb[k], dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = score_func.edge_func(edge)['score']
                t_score.append(F.asnumpy(score))
            r_score.append(t_score)
        h_score.append(r_score)
    score2 = np.asarray(h_score).reshape(head_emb.shape[0], 1, tail_emb.shape[0])
    np.testing.assert_allclose(F.asnumpy(score1), score2,
                                   rtol=1e-5, atol=1e-5)

    # bcast tail
    head_emb, rel_emb, tail_emb, args = generate_rand_emb(func_name, 'tail')
    if args is None:
        score_func = ke_score_func()
    elif type(args) is tuple:
        score_func = ke_score_func(*list(args))
    else:
        score_func = ke_score_func(args)
    score1 = score_func.infer(head_emb, rel_emb, tail_emb)
    assert(score1.shape[0] == head_emb.shape[0])
    h_score = []
    for i in range(head_emb.shape[0]):
        r_score = []
        for j in range(rel_emb.shape[0]):
            t_score = []
            for k in range(tail_emb.shape[0]):
                hemb = head_emb[i]
                remb = rel_emb[j]
                temb = F.unsqueeze(tail_emb[k], dim=0)
                edge = FakeEdge(hemb, temb, remb)
                score = score_func.edge_func(edge)['score']
                t_score.append(F.asnumpy(score))
            r_score.append(t_score)
        h_score.append(r_score)
    score2 = np.asarray(h_score).reshape(head_emb.shape[0], rel_emb.shape[0], 1)
    np.testing.assert_allclose(F.asnumpy(score1), score2,
                                   rtol=1e-5, atol=1e-5)
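
The test above checks a score function's batched infer against a brute-force loop over every (head, relation, tail) combination. The same verification pattern, with a hand-rolled TransE-style score standing in for the library's score function (the function name and shapes below are my own toy choices, not DGL-KE's API):

import numpy as np

def infer(head, rel, tail):
    # broadcast to a (num_head, num_rel, num_tail) score tensor
    return -np.linalg.norm(
        head[:, None, None, :] + rel[None, :, None, :] - tail[None, None, :, :],
        axis=-1)

head = np.random.randn(3, 4)
rel = np.random.randn(2, 4)
tail = np.random.randn(5, 4)

score1 = infer(head, rel, tail)
score2 = np.empty((3, 2, 5))
for i in range(3):              # brute force, one triple at a time
    for j in range(2):
        for k in range(5):
            score2[i, j, k] = -np.linalg.norm(head[i] + rel[j] - tail[k])

np.testing.assert_allclose(score1, score2, rtol=1e-5, atol=1e-5)
print('vectorized and looped scores agree')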
Example #28
    def topK(self, head=None, tail=None, bcast=False, pair_ws=False, k=10):
        if head is None:
            head = F.arange(0, self.emb.shape[0])
        else:
            head = F.tensor(head)
        if tail is None:
            tail = F.arange(0, self.emb.shape[0])
        else:
            tail = F.tensor(tail)

        head_emb = self.emb[head]
        tail_emb = self.emb[tail]
        if pair_ws is True:
            # pair-wise mode: head[i] is scored against tail[i], so the head
            # and tail lists are assumed to have the same length
            result = []
            batch_size = self.batch_size
            # chunked cal score
            score = []
            num_head = head.shape[0]
            num_tail = tail.shape[0]
            for i in range((num_head + batch_size - 1) // batch_size):
                sh_emb = head_emb[i * batch_size : (i + 1) * batch_size \
                                                   if (i + 1) * batch_size < num_head \
                                                   else num_head]
                sh_emb = F.copy_to(sh_emb, self.device)
                st_emb = tail_emb[i * batch_size : (i + 1) * batch_size \
                                                   if (i + 1) * batch_size < num_head \
                                                   else num_head]
                st_emb = F.copy_to(st_emb, self.device)
                score.append(F.copy_to(self.sim_func(sh_emb, st_emb, pw=True), F.cpu()))
            score = F.cat(score, dim=0)

            sidx = F.argsort(score, dim=0, descending=True)
            sidx = sidx[:k]
            score = score[sidx]
            result.append((F.asnumpy(head[sidx]),
                           F.asnumpy(tail[sidx]),
                           F.asnumpy(score)))
        else:
            num_head = head.shape[0]
            num_tail = tail.shape[0]
            batch_size = self.batch_size

            # chunked cal score
            score = []
            for i in range((num_head + batch_size - 1) // batch_size):
                sh_emb = head_emb[i * batch_size : (i + 1) * batch_size \
                                            if (i + 1) * batch_size < num_head \
                                            else num_head]
                sh_emb = F.copy_to(sh_emb, self.device)
                s_score = []
                for j in range((num_tail + batch_size - 1) // batch_size):
                    st_emb = tail_emb[j * batch_size : (j + 1) * batch_size \
                                                    if (j + 1) * batch_size < num_tail \
                                                    else num_tail]
                    st_emb = F.copy_to(st_emb, self.device)
                    s_score.append(F.copy_to(self.sim_func(sh_emb, st_emb), F.cpu()))
                score.append(F.cat(s_score, dim=1))
            score = F.cat(score, dim=0)

            if bcast is False:
                result = []
                idx = F.arange(0, num_head * num_tail)
                score = F.reshape(score, (num_head * num_tail, ))

                sidx = F.argsort(score, dim=0, descending=True)
                sidx = sidx[:k]
                score = score[sidx]
                idx = idx[sidx]
                tail_idx = idx % num_tail
                idx = floor_divide(idx, num_tail)
                head_idx = idx % num_head

                result.append((F.asnumpy(head[head_idx]),
                           F.asnumpy(tail[tail_idx]),
                           F.asnumpy(score)))

            else: # bcast at head
                result = []
                for i in range(num_head):
                    i_score = score[i]

                    sidx = F.argsort(i_score, dim=0, descending=True)
                    idx = F.arange(0, num_tail)
                    i_idx = sidx[:k]
                    i_score = i_score[i_idx]
                    idx = idx[i_idx]

                    result.append((np.full((k,), F.asnumpy(head[i])),
                                  F.asnumpy(tail[idx]),
                                  F.asnumpy(i_score)))

        return result
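
The embedding-similarity topK above scores every head against every tail in batches to bound memory, concatenates the partial score matrices, and then takes a global top-k by flattening. A compact NumPy-only sketch of that pattern, using a plain dot product as a stand-in for sim_func:

import numpy as np

emb = np.random.randn(100, 16)
head, tail = np.arange(100), np.arange(100)
batch_size, k = 32, 10

rows = []
for i in range(0, len(head), batch_size):
    h = emb[head[i:i + batch_size]]
    cols = []
    for j in range(0, len(tail), batch_size):
        t = emb[tail[j:j + batch_size]]
        cols.append(h @ t.T)                 # scores for this (head, tail) chunk
    rows.append(np.concatenate(cols, axis=1))
score = np.concatenate(rows, axis=0)         # (num_head, num_tail)

flat = score.reshape(-1)
top = np.argsort(flat)[::-1][:k]             # indices of the k largest scores
head_idx, tail_idx = top // len(tail), top % len(tail)
print(list(zip(head[head_idx], tail[tail_idx], flat[top])))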
Example #29
 def test_mask(self):
     deprecate_property('dataset.test_mask', 'graph.ndata[\'test_mask\']')
     return F.asnumpy(self._graph.ndata['test_mask'])
Example #30
    def topK(self, head=None, rel=None, tail=None, exec_mode='all', k=10):
        if head is None:
            head = F.arange(0, self.model.num_entity)
        else:
            head = F.tensor(head)
        if rel is None:
            rel = F.arange(0, self.model.num_rel)
        else:
            rel = F.tensor(rel)
        if tail is None:
            tail = F.arange(0, self.model.num_entity)
        else:
            tail = F.tensor(tail)

        num_head = F.shape(head)[0]
        num_rel = F.shape(rel)[0]
        num_tail = F.shape(tail)[0]

        if exec_mode == 'triplet_wise':
            result = []
            assert num_head == num_rel, \
                'For triplet-wise execution mode, head, relation and tail lists should have the same length'
            assert num_head == num_tail, \
                'For triplet-wise execution mode, head, relation and tail lists should have the same length'

            raw_score = self.model.score(head, rel, tail, triplet_wise=True)
            score = self.score_func(raw_score)
            idx = F.arange(0, num_head)

            sidx = F.argsort(score, dim=0, descending=True)
            sidx = sidx[:k]
            score = score[sidx]
            idx = idx[sidx]

            result.append((F.asnumpy(head[idx]),
                           F.asnumpy(rel[idx]),
                           F.asnumpy(tail[idx]),
                           F.asnumpy(score)))
        elif exec_mode == 'all':
            result = []
            raw_score = self.model.score(head, rel, tail)
            score = self.score_func(raw_score)
            idx = F.arange(0, num_head * num_rel * num_tail)

            sidx = F.argsort(score, dim=0, descending=True)
            sidx = sidx[:k]
            score = score[sidx]
            idx = idx[sidx]

            tail_idx = idx % num_tail
            idx = floor_divide(idx, num_tail)
            rel_idx = idx % num_rel
            idx = floor_divide(idx, num_rel)
            head_idx = idx % num_head

            result.append((F.asnumpy(head[head_idx]),
                           F.asnumpy(rel[rel_idx]),
                           F.asnumpy(tail[tail_idx]),
                           F.asnumpy(score)))
        elif exec_mode == 'batch_head':
            result = []
            for i in range(num_head):
                raw_score = self.model.score(F.unsqueeze(head[i], 0), rel, tail)
                score = self.score_func(raw_score)
                idx = F.arange(0, num_rel * num_tail)

                sidx = F.argsort(score, dim=0, descending=True)
                sidx = sidx[:k]
                score = score[sidx]
                idx = idx[sidx]
                tail_idx = idx % num_tail
                idx = floor_divide(idx, num_tail)
                rel_idx = idx % num_rel

                result.append((np.full((k,), F.asnumpy(head[i])),
                               F.asnumpy(rel[rel_idx]),
                               F.asnumpy(tail[tail_idx]),
                               F.asnumpy(score)))
        elif exec_mode == 'batch_rel':
            result = []
            for i in range(num_rel):
                raw_score = self.model.score(head, F.unsqueeze(rel[i], 0), tail)
                score = self.score_func(raw_score)
                idx = F.arange(0, num_head * num_tail)

                sidx = F.argsort(score, dim=0, descending=True)
                sidx = sidx[:k]
                score = score[sidx]
                idx = idx[sidx]
                tail_idx = idx % num_tail
                idx = floor_divide(idx, num_tail)
                head_idx = idx % num_head

                result.append((F.asnumpy(head[head_idx]),
                               np.full((k,), F.asnumpy(rel[i])),
                               F.asnumpy(tail[tail_idx]),
                               F.asnumpy(score)))
        elif exec_mode == 'batch_tail':
            result = []
            for i in range(num_tail):
                raw_score = self.model.score(head, rel, F.unsqueeze(tail[i], 0))
                score = self.score_func(raw_score)
                idx = F.arange(0, num_head * num_rel)

                sidx = F.argsort(score, dim=0, descending=True)
                sidx = sidx[:k]
                score = score[sidx]
                idx = idx[sidx]
                rel_idx = idx % num_rel
                idx = floor_divide(idx, num_rel)
                head_idx = idx % num_head
                result.append((F.asnumpy(head[head_idx]),
                               F.asnumpy(rel[rel_idx]),
                               np.full((k,), F.asnumpy(tail[i])),
                               F.asnumpy(score)))
        else:
            assert False, 'unknown execution mode type {}'.format(exec_mode)

        return result
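
In the 'all' execution mode, the top-k indices come from a score tensor flattened over (head, rel, tail), so they are decoded back into per-axis indices with modulo and floor division. That decoding is equivalent to NumPy's unravel_index, which the small check below confirms on toy sizes:

import numpy as np

num_head, num_rel, num_tail = 4, 3, 5
idx = np.arange(num_head * num_rel * num_tail)   # flat indices, tail varies fastest

tail_idx = idx % num_tail
idx2 = idx // num_tail
rel_idx = idx2 % num_rel
head_idx = (idx2 // num_rel) % num_head

h, r, t = np.unravel_index(idx, (num_head, num_rel, num_tail))
assert (h == head_idx).all() and (r == rel_idx).all() and (t == tail_idx).all()
print('manual decode matches np.unravel_index')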