Example #1
def to_batches(
    instances: List[Instance], batch_size: int, relation_to_idx: Dict[str, int]
) -> List[Tuple[Batch, List[int], Tensor, List[Instance]]]:
    nb_instances, res = len(instances), []
    batches = make_batches(nb_instances, batch_size)
    for batch_start, batch_end in batches:
        batch_instances = instances[batch_start:batch_end]
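        # the largest graph in this batch sets max_nb_entities passed to to_data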
        max_nb_entities = max(i.nb_nodes for i in batch_instances)
        this_batch_size = len(batch_instances)

        batch_pairs = [
            to_data(i, relation_to_idx, max_nb_entities)
            for i in batch_instances
        ]
        batch_data: List[GeometricData] = [d for d, _ in batch_pairs]
        batch_targets: List[List[int]] = [[p[0], p[1]] for _, p in batch_pairs]

        # 1 + the largest node id in the batch (node ids live in column 0 of x)
        max_node = max(n + 1 for b in batch_data
                       for n in b.x[:, 0].cpu().numpy())
        batch = Batch.from_data_list(batch_data)
        slices = [max_node for _ in batch_data]
        targets = torch.LongTensor(batch_targets).view(this_batch_size, 1, 2)

        res += [(batch, slices, targets, batch_instances)]
    return res
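
All of these examples drive their loops with make_batches. A minimal sketch of that helper, inferred from its call sites on this page (an assumption, not necessarily the project's exact code): it splits size items into contiguous (start, end) index pairs, the last possibly shorter.

from typing import List, Tuple

# Hypothetical reconstruction of make_batches from its usage above:
# contiguous [start, end) windows over `size` items; the last may be short.
def make_batches(size: int, batch_size: int) -> List[Tuple[int, int]]:
    nb_batches = (size + batch_size - 1) // batch_size  # ceil(size / batch_size)
    return [(i * batch_size, min(size, (i + 1) * batch_size))
            for i in range(nb_batches)]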
Example #2
    def __init__(self, batch_size: int, nb_examples: int, nb_epochs: int,
                 random_state: np.random.RandomState):
        self.batch_size = batch_size
        self.nb_examples = nb_examples
        self.nb_epochs = nb_epochs
        self.random_state = random_state

        size = self.nb_epochs * self.nb_examples
        self.curriculum = np.zeros(size, dtype=np.int32)

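        # one independent shuffle of the example indices per epoch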
        for epoch_no in range(nb_epochs):
            start, end = epoch_no * nb_examples, (epoch_no + 1) * nb_examples
            self.curriculum[start:end] = self.random_state.permutation(
                nb_examples)

        self.batches = make_batches(self.curriculum.shape[0], self.batch_size)
        self.nb_batches = len(self.batches)
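
A sketch of how a batcher like this is typically consumed (hypothetical driver code; the class name Batcher and the instances list are assumptions for illustration):

# Each (start, end) pair slices the precomputed curriculum, which stores
# shuffled example indices for every epoch back to back.
batcher = Batcher(batch_size=32, nb_examples=len(instances), nb_epochs=10,
                  random_state=np.random.RandomState(0))
for start, end in batcher.batches:
    indices = batcher.curriculum[start:end]
    batch = [instances[i] for i in indices]
    # ... forward pass, loss, optimiser step ...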
Example #3
def accuracy_b(scoring_function: Callable[[List[Instance], List[str]],
                                          Tuple[Tensor, Any]],
               instances: List[Instance],
               relation_to_predicate: Dict[str, str],
               predicate_to_relations: Dict[str, List[str]],
               sample_size: Optional[int] = None,
               batch_size: Optional[int] = None) -> float:
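    # relation-prediction accuracy (%): score every candidate relation for
    # each instance and compare the arg-max against the gold relation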
    predicate_lst = sorted(predicate_to_relations.keys())
    relation_lst = [predicate_to_relations[p][0] for p in predicate_lst]

    if sample_size is not None:
        instances = instances[:sample_size]

    nb_instances = len(instances)

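    # default: a single batch covering all instances (None slice bounds select everything)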
    batches = [(None, None)]
    if batch_size is not None:
        batches = make_batches(nb_instances, batch_size)

    nb_relations = len(relation_lst)

    is_correct_lst = []

    for batch_start, batch_end in batches:
        batch = instances[batch_start:batch_end]
        this_batch_size = len(batch)

        with torch.no_grad():
            scores, _ = scoring_function(batch, relation_lst)
            scores = scores.view(this_batch_size, nb_relations)
            scores_np = scores.cpu().numpy()
        predicted = np.argmax(scores_np, axis=1)

        def norm(a: str) -> str:
            return predicate_to_relations[relation_to_predicate[a]][0]

        true = np.array([relation_lst.index(norm(i.target[1])) for i in batch],
                        dtype=predicted.dtype)
        is_correct_lst += (predicted == true).tolist()

    return np.mean(is_correct_lst).item() * 100.0
Example #4
    def __init__(self, data: Data, batch_size: int, nb_epochs: int,
                 random_state: np.random.RandomState) -> None:
        self.data = data
        self.batch_size = batch_size
        self.random_state = random_state

        size = nb_epochs * data.nb_examples
        self.curriculum_Xs = np.zeros(size, dtype=np.int32)
        self.curriculum_Xp = np.zeros(size, dtype=np.int32)
        self.curriculum_Xo = np.zeros(size, dtype=np.int32)
        self.curriculum_Xi = np.zeros(size, dtype=np.int32)

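        # one independent shuffle of the training triples per epoch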
        for epoch_no in range(nb_epochs):
            curriculum_order = self.random_state.permutation(data.nb_examples)
            start = epoch_no * data.nb_examples
            end = (epoch_no + 1) * data.nb_examples
            self.curriculum_Xs[start:end] = data.Xs[curriculum_order]
            self.curriculum_Xp[start:end] = data.Xp[curriculum_order]
            self.curriculum_Xo[start:end] = data.Xo[curriculum_order]
            self.curriculum_Xi[start:end] = data.Xi[curriculum_order]

        self.batches = make_batches(self.curriculum_Xs.shape[0], batch_size)
        self.nb_batches = len(self.batches)
Example #5
def test_clutrr_v3():
    embedding_size = 20
    batch_size = 8

    torch.manual_seed(0)

    triples, hops = [], []

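    # synthetic 2-hop task: each r(a, c) should follow from p(a, b) and q(b, c)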
    for i in range(32):
        triples += [(f'a{i}', 'p', f'b{i}'), (f'b{i}', 'q', f'c{i}')]
        hops += [(f'a{i}', 'r', f'c{i}')]

    entity_lst = sorted({s for (s, _, _) in triples + hops}
                        | {o for (_, _, o) in triples + hops})
    predicate_lst = sorted({p for (_, p, _) in triples + hops})

    nb_entities, nb_predicates = len(entity_lst), len(predicate_lst)

    entity_to_index = {e: i for i, e in enumerate(entity_lst)}
    predicate_to_index = {p: i for i, p in enumerate(predicate_lst)}

    kernel = GaussianKernel(slope=None)

    entity_embeddings = nn.Embedding(nb_entities, embedding_size, sparse=True)
    predicate_embeddings = nn.Embedding(nb_predicates,
                                        embedding_size,
                                        sparse=True)

    # _hops = LinearReformulator(2, embedding_size)
    _hops = AttentiveReformulator(2, predicate_embeddings)

    model = NeuralKB(kernel=kernel, scoring_type='concat')
    hoppy = Hoppy(model, hops_lst=[(_hops, False)], depth=1)

    params = [
        p for p in hoppy.parameters()
        if not torch.equal(p, entity_embeddings.weight)
        and not torch.equal(p, predicate_embeddings.weight)
    ]

    for tensor in params:
        print(f'\t{tensor.size()}\t{tensor.device}')

    loss_function = nn.BCELoss()

    optimizer = optim.Adagrad(params, lr=0.1)

    hops_data = []
    for i in range(64):
        hops_data += hops

    batches = make_batches(len(hops_data), batch_size)

    # seeded so negative sampling, and hence the test outcome, is deterministic
    rs = np.random.RandomState(0)

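    # c, d track how closely the two learned hops match the true predicates p and q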
    c, d = 0.0, 0.0
    p_emb = predicate_embeddings(
        torch.from_numpy(np.array([predicate_to_index['p']])))
    q_emb = predicate_embeddings(
        torch.from_numpy(np.array([predicate_to_index['q']])))

    for batch_start, batch_end in batches:
        hops_batch = hops_data[batch_start:batch_end]

        s_lst = [s for (s, _, _) in hops_batch]
        p_lst = [p for (_, p, _) in hops_batch]
        o_lst = [o for (_, _, o) in hops_batch]

        nb_positives = len(s_lst)
        nb_negatives = nb_positives * 3

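        # negative sampling: corrupt subjects and objects independently,
        # aiming for 3 negatives per positive (capped at nb_entities)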
        s_n_lst = rs.permutation(nb_entities)[:nb_negatives].tolist()
        nb_negatives = len(s_n_lst)
        o_n_lst = rs.permutation(nb_entities)[:nb_negatives].tolist()
        p_n_lst = list(islice(cycle(p_lst), nb_negatives))

        xs_np = np.array([entity_to_index[s] for s in s_lst] + s_n_lst)
        xp_np = np.array([predicate_to_index[p] for p in p_lst + p_n_lst])
        xo_np = np.array([entity_to_index[o] for o in o_lst] + o_n_lst)

        xs_emb = entity_embeddings(torch.from_numpy(xs_np))
        xp_emb = predicate_embeddings(torch.from_numpy(xp_np))
        xo_emb = entity_embeddings(torch.from_numpy(xo_np))

        rel_emb = encode_relation(facts=triples,
                                  relation_embeddings=predicate_embeddings,
                                  relation_to_idx=predicate_to_index)
        arg1_emb, arg2_emb = encode_arguments(
            facts=triples,
            entity_embeddings=entity_embeddings,
            entity_to_idx=entity_to_index)

        facts = [rel_emb, arg1_emb, arg2_emb]

        scores = hoppy.score(xp_emb,
                             xs_emb,
                             xo_emb,
                             facts=facts,
                             entity_embeddings=entity_embeddings.weight)

        labels_np = np.zeros(xs_np.shape[0])
        labels_np[:nb_positives] = 1
        labels = torch.from_numpy(labels_np).float()

        # for s, p, o, l in zip(xs_np, xp_np, xo_np, labels):
        #     print(s, p, o, l)

        loss = loss_function(scores, labels)

        hop_1_emb = hoppy.hops_lst[0][0].hops_lst[0](xp_emb)
        hop_2_emb = hoppy.hops_lst[0][0].hops_lst[1](xp_emb)

        c = kernel.pairwise(p_emb, hop_1_emb).mean().cpu().detach().numpy()
        d = kernel.pairwise(q_emb, hop_2_emb).mean().cpu().detach().numpy()

        print(c, d)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    assert c > 0.95 and d > 0.95
Example #6
File: slow.py Project: alex4321/ctp
def evaluate_slow(entity_embeddings: nn.Embedding,
                  predicate_embeddings: nn.Embedding,
                  test_triples: List[Tuple[str, str, str]],
                  all_triples: List[Tuple[str, str, str]],
                  entity_to_index: Dict[str, int],
                  predicate_to_index: Dict[str, int],
                  model: BaseLatentFeatureModel,
                  batch_size: int,
                  device: torch.device):

    xs = np.array([entity_to_index.get(s) for (s, _, _) in test_triples])
    xp = np.array([predicate_to_index.get(p) for (_, p, _) in test_triples])
    xo = np.array([entity_to_index.get(o) for (_, _, o) in test_triples])

    sp_to_o, po_to_s = {}, {}
    for s, p, o in all_triples:
        s_idx, p_idx, o_idx = (entity_to_index.get(s),
                               predicate_to_index.get(p),
                               entity_to_index.get(o))
        sp_key = (s_idx, p_idx)
        po_key = (p_idx, o_idx)

        if sp_key not in sp_to_o:
            sp_to_o[sp_key] = []
        if po_key not in po_to_s:
            po_to_s[po_key] = []

        sp_to_o[sp_key] += [o_idx]
        po_to_s[po_key] += [s_idx]

    assert xs.shape == xp.shape == xo.shape
    nb_test_triples = xs.shape[0]

    batches = make_batches(nb_test_triples, batch_size)

    hits = dict()
    hits_at = [1, 3, 5, 10]

    for hits_at_value in hits_at:
        hits[hits_at_value] = 0.0

    def hits_at_n(n_, rank):
        if rank <= n_:
            hits[n_] = hits.get(n_, 0) + 1

    counter = 0
    mrr = 0.0

    ranks_l, ranks_r = [], []
    for start, end in batches:
        batch_xs = xs[start:end]
        batch_xp = xp[start:end]
        batch_xo = xo[start:end]

        this_batch_size = batch_xs.shape[0]
        counter += this_batch_size * 2

        with torch.no_grad():
            tensor_xs = torch.LongTensor(batch_xs).to(device)
            tensor_xp = torch.LongTensor(batch_xp).to(device)
            tensor_xo = torch.LongTensor(batch_xo).to(device)

            tensor_xs_emb = entity_embeddings(tensor_xs)
            tensor_xp_emb = predicate_embeddings(tensor_xp)
            tensor_xo_emb = entity_embeddings(tensor_xo)
            # print(entity_embeddings.weight.shape)

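            # pick between the two forward variants based on the number of
            # facts (hard-coded threshold)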
            if model.model.facts[0].shape[0] < 90000:
                res_sp, res_po = model.forward_(tensor_xp_emb, tensor_xs_emb,
                                                tensor_xo_emb)
            else:
                res_sp, res_po = model.forward__(tensor_xp_emb, tensor_xs_emb,
                                                 tensor_xo_emb)

            _scores_sp, _ = res_sp
            _scores_po, _ = res_po

            scores_sp = _scores_sp.cpu().numpy()
            scores_po = _scores_po.cpu().numpy()

            del _scores_sp, _scores_po
            del tensor_xs, tensor_xp, tensor_xo
            del tensor_xs_emb, tensor_xp_emb, tensor_xo_emb
            del res_sp, res_po
            # print(scores_sp.shape, scores_po.shape)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        for elem_idx in range(this_batch_size):
            s_idx, p_idx, o_idx = (batch_xs[elem_idx], batch_xp[elem_idx],
                                   batch_xo[elem_idx])

            # Code for the filtered setting
            sp_key = (s_idx, p_idx)
            po_key = (p_idx, o_idx)

            o_to_remove = sp_to_o[sp_key]
            s_to_remove = po_to_s[po_key]

            for tmp_o_idx in o_to_remove:
                if tmp_o_idx != o_idx:
                    scores_sp[elem_idx, tmp_o_idx] = -np.inf

            for tmp_s_idx in s_to_remove:
                if tmp_s_idx != s_idx:
                    scores_po[elem_idx, tmp_s_idx] = -np.inf
            # End of code for the filtered setting

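            # np.argsort(np.argsort(-scores)) gives each entity's 0-based rank
            # by descending score; +1 makes the rank of the true entity 1-based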
            rank_l = 1 + np.argsort(np.argsort(-scores_po[elem_idx, :]))[s_idx]
            rank_r = 1 + np.argsort(np.argsort(-scores_sp[elem_idx, :]))[o_idx]

            ranks_l += [rank_l]
            ranks_r += [rank_r]

            mrr += 1.0 / rank_l
            mrr += 1.0 / rank_r

            for n in hits_at:
                hits_at_n(n, rank_l)

            for n in hits_at:
                hits_at_n(n, rank_r)

    counter = float(counter)

    mrr /= counter

    for n in hits_at:
        hits[n] /= counter

    metrics = dict()
    metrics['MRR'] = mrr
    for n in hits_at:
        metrics['hits@{}'.format(n)] = hits[n]

    return metrics
Example #7
def test_learning_v3():
    embedding_size = 10
    batch_size = 16

    triples, hops = [], []

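    # synthetic 2-hop task: each r(a, c) is explained by p(a, b) and q(b, c)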
    for i in range(16):
        triples += [(f'a{i}', 'p', f'b{i}'), (f'b{i}', 'q', f'c{i}')]
        hops += [(f'a{i}', 'r', f'c{i}')]

    entity_lst = sorted({s for (s, _, _) in triples + hops}
                        | {o for (_, _, o) in triples + hops})
    predicate_lst = sorted({p for (_, p, _) in triples + hops})

    nb_entities, nb_predicates = len(entity_lst), len(predicate_lst)

    entity_to_index = {e: i for i, e in enumerate(entity_lst)}
    predicate_to_index = {p: i for i, p in enumerate(predicate_lst)}

    torch.manual_seed(0)

    kernel = GaussianKernel()

    entity_embeddings = nn.Embedding(nb_entities,
                                     embedding_size * 2,
                                     sparse=True)
    predicate_embeddings = nn.Embedding(nb_predicates,
                                        embedding_size * 2,
                                        sparse=True)

    fact_rel = torch.LongTensor(
        np.array([predicate_to_index[p] for (_, p, _) in triples]))
    fact_arg1 = torch.LongTensor(
        np.array([entity_to_index[s] for (s, _, _) in triples]))
    fact_arg2 = torch.LongTensor(
        np.array([entity_to_index[o] for (_, _, o) in triples]))
    facts = [fact_rel, fact_arg1, fact_arg2]

    model = NeuralKB(entity_embeddings=entity_embeddings,
                     predicate_embeddings=predicate_embeddings,
                     kernel=kernel,
                     facts=facts)

    reformulator = AttentiveReformulator(2, predicate_embeddings)
    hoppy = SimpleHoppy(model, entity_embeddings, hops=reformulator)

    N3_reg = N3()

    params = [
        p for p in hoppy.parameters()
        if not torch.equal(p, entity_embeddings.weight)
        and not torch.equal(p, predicate_embeddings.weight)
    ]

    loss_function = nn.CrossEntropyLoss(reduction='mean')

    p_emb = predicate_embeddings(
        torch.LongTensor(np.array([predicate_to_index['p']])))
    q_emb = predicate_embeddings(
        torch.LongTensor(np.array([predicate_to_index['q']])))
    # r_emb = predicate_embeddings(torch.LongTensor(np.array([predicate_to_index['r']])))

    optimizer = optim.Adagrad(params, lr=0.1)

    hops_data = []
    for i in range(128):
        hops_data += hops

    batches = make_batches(len(hops_data), batch_size)

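    # c, d measure how closely the learned hop reformulations match p and q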
    c, d = 0.0, 0.0

    for batch_start, batch_end in batches:
        hops_batch = hops_data[batch_start:batch_end]

        s_lst = [s for (s, _, _) in hops_batch]
        p_lst = [p for (_, p, _) in hops_batch]
        o_lst = [o for (_, _, o) in hops_batch]

        xs_np = np.array([entity_to_index[s] for s in s_lst])
        xp_np = np.array([predicate_to_index[p] for p in p_lst])
        xo_np = np.array([entity_to_index[o] for o in o_lst])

        xs = torch.LongTensor(xs_np)
        xp = torch.LongTensor(xp_np)
        xo = torch.LongTensor(xo_np)

        xs_emb = entity_embeddings(xs)
        xp_emb = predicate_embeddings(xp)
        xo_emb = entity_embeddings(xo)

        sp_scores, po_scores = hoppy.forward(xp_emb, xs_emb, xo_emb)

        loss = loss_function(sp_scores, xo) + loss_function(po_scores, xs)

        factors = [hoppy.factor(e) for e in [xp_emb, xs_emb, xo_emb]]
        loss += 0.1 * N3_reg(factors)

        tmp = hoppy.hops(xp_emb)
        hop_1_emb = tmp[0]
        hop_2_emb = tmp[1]

        c = kernel.pairwise(p_emb, hop_1_emb).mean().cpu().detach().numpy()
        d = kernel.pairwise(q_emb, hop_2_emb).mean().cpu().detach().numpy()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    assert c > 0.95
    assert d > 0.95
Example #8
def evaluate_naive(entity_embeddings: nn.Embedding,
                   predicate_embeddings: nn.Embedding,
                   test_triples: List[Tuple[str, str, str]],
                   all_triples: List[Tuple[str, str, str]],
                   entity_to_index: Dict[str, int],
                   predicate_to_index: Dict[str, int],
                   model: BaseLatentFeatureModel,
                   batch_size: int,
                   device: torch.device):

    index_to_entity = {index: entity
                       for entity, index in entity_to_index.items()}
    index_to_predicate = {index: predicate
                          for predicate, index in predicate_to_index.items()}

    test_triples = {(entity_to_index[s], predicate_to_index[p],
                     entity_to_index[o]) for s, p, o in test_triples}
    all_triples = {(entity_to_index[s], predicate_to_index[p],
                    entity_to_index[o]) for s, p, o in all_triples}

    entities = sorted(index_to_entity.keys())

    hits = dict()
    hits_at = [1, 3, 5, 10]

    for hits_at_value in hits_at:
        hits[hits_at_value] = 0.0

    def hits_at_n(n_, rank):
        if rank <= n_:
            hits[n_] = hits.get(n_, 0) + 1

    counter = 0
    mrr = 0.0

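    # filtered setting: keep a corruption only if it is not a known true triple
    # (the test triple itself is kept, so its rank is well defined)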
    for s_idx, p_idx, o_idx in test_triples:
        corrupted_subject = [(entity, p_idx, o_idx) for entity in entities
                             if (entity, p_idx, o_idx) not in all_triples
                             or entity == s_idx]
        corrupted_object = [(s_idx, p_idx, entity) for entity in entities
                            if (s_idx, p_idx, entity) not in all_triples
                            or entity == o_idx]

        index_l = corrupted_subject.index((s_idx, p_idx, o_idx))
        index_r = corrupted_object.index((s_idx, p_idx, o_idx))

        nb_corrupted_l = len(corrupted_subject)
        # nb_corrupted_r = len(corrupted_object)

        corrupted = corrupted_subject + corrupted_object

        nb_corrupted = len(corrupted)

        batches = make_batches(nb_corrupted, batch_size)

        scores_lst = []
        for start, end in batches:
            batch = np.array(corrupted[start:end])
            x_sub, x_pred, x_obj = batch[:, 0], batch[:, 1], batch[:, 2]

            with torch.no_grad():
                tensor_xs = torch.from_numpy(x_sub).to(device)
                tensor_xp = torch.from_numpy(x_pred).to(device)
                tensor_xo = torch.from_numpy(x_obj).to(device)

                tensor_xs_emb = entity_embeddings(tensor_xs)
                tensor_xp_emb = predicate_embeddings(tensor_xp)
                tensor_xo_emb = entity_embeddings(tensor_xo)

                scores_np = model.score(tensor_xp_emb, tensor_xs_emb,
                                        tensor_xo_emb).cpu().numpy()
                scores_lst += scores_np.tolist()

        scores_l = scores_lst[:nb_corrupted_l]
        scores_r = scores_lst[nb_corrupted_l:]

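        # 1-based rank of the true triple among its corruptions (double argsort)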
        rank_l = 1 + np.argsort(np.argsort(- np.array(scores_l)))[index_l]
        counter += 1

        for n in hits_at:
            hits_at_n(n, rank_l)

        mrr += 1.0 / rank_l

        rank_r = 1 + np.argsort(np.argsort(- np.array(scores_r)))[index_r]
        counter += 1

        for n in hits_at:
            hits_at_n(n, rank_r)

        mrr += 1.0 / rank_r

    counter = float(counter)

    mrr /= counter

    for n in hits_at:
        hits[n] /= counter

    metrics = dict()
    metrics['MRR'] = mrr
    for n in hits_at:
        metrics['hits@{}'.format(n)] = hits[n]

    return metrics