def to_batches(instances: List[Instance],
               batch_size: int,
               relation_to_idx: Dict[str, int]) -> List[Tuple[Batch, List[int], Tensor, List[Instance]]]:
    nb_instances, res = len(instances), []
    batches = make_batches(nb_instances, batch_size)

    for batch_start, batch_end in batches:
        batch_instances = instances[batch_start:batch_end]
        max_nb_entities = max(i.nb_nodes for i in batch_instances)
        this_batch_size = len(batch_instances)

        # Convert each instance into a (graph data, target node pair) tuple,
        # sized against the largest graph in the batch.
        batch_pairs = [to_data(i, relation_to_idx, max_nb_entities) for i in batch_instances]
        batch_data: List[GeometricData] = [d for d, _ in batch_pairs]
        batch_targets: List[List[int]] = [[p[0], p[1]] for _, p in batch_pairs]

        # Largest node identifier (+1) across the batch, used as the slice size for every graph.
        max_node = max(i + 1 for b in batch_data for i in b.x[:, 0].cpu().numpy())

        batch = Batch.from_data_list(batch_data)
        slices = [max_node for _ in batch_data]
        targets = torch.LongTensor(batch_targets).view(this_batch_size, 1, 2)

        res += [(batch, slices, targets, batch_instances)]

    return res
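
# `make_batches` is used throughout this section but not defined here. Below is
# a minimal sketch of what it is assumed to do, based on how `to_batches`
# consumes it: split `nb_items` indices into consecutive (start, end) slices of
# at most `batch_size` elements. Illustrative only; the name
# `_make_batches_sketch` is not part of the codebase.
def _make_batches_sketch(nb_items: int, batch_size: int) -> List[Tuple[int, int]]:
    return [(start, min(start + batch_size, nb_items))
            for start in range(0, nb_items, batch_size)]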
def __init__(self, batch_size: int, nb_examples: int, nb_epochs: int,
             random_state: np.random.RandomState):
    self.batch_size = batch_size
    self.nb_examples = nb_examples
    self.nb_epochs = nb_epochs
    self.random_state = random_state

    size = self.nb_epochs * self.nb_examples
    self.curriculum = np.zeros(size, dtype=np.int32)

    for epoch_no in range(nb_epochs):
        start, end = epoch_no * nb_examples, (epoch_no + 1) * nb_examples
        self.curriculum[start:end] = self.random_state.permutation(nb_examples)

    self.batches = make_batches(self.curriculum.shape[0], self.batch_size)
    self.nb_batches = len(self.batches)
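
# A self-contained sketch of the curriculum built in the constructor above: one
# random permutation of the example indices per epoch, concatenated into a flat
# array that fixed-size batches are later sliced from. Name and signature are
# illustrative, not part of the codebase.
def _curriculum_sketch(nb_examples: int, nb_epochs: int, seed: int = 0) -> np.ndarray:
    rs = np.random.RandomState(seed)
    return np.concatenate([rs.permutation(nb_examples) for _ in range(nb_epochs)])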
def accuracy_b(scoring_function: Callable[[List[Instance], List[str]], Tuple[Tensor, Any]],
               instances: List[Instance],
               relation_to_predicate: Dict[str, str],
               predicate_to_relations: Dict[str, List[str]],
               sample_size: Optional[int] = None,
               batch_size: Optional[int] = None) -> float:
    # One canonical relation per predicate, in a deterministic order.
    predicate_lst = sorted(predicate_to_relations.keys())
    relation_lst = [predicate_to_relations[p][0] for p in predicate_lst]

    if sample_size is not None:
        instances = instances[:sample_size]

    nb_instances = len(instances)

    # A single (None, None) slice covers all instances when no batch size is given.
    batches = [(None, None)]
    if batch_size is not None:
        batches = make_batches(nb_instances, batch_size)

    nb_relations = len(relation_lst)

    is_correct_lst = []
    for batch_start, batch_end in batches:
        batch = instances[batch_start:batch_end]
        batch_size = len(batch)

        with torch.no_grad():
            scores, _ = scoring_function(batch, relation_lst)
            scores = scores.view(batch_size, nb_relations)

        scores_np = scores.cpu().numpy()
        predicted = np.argmax(scores_np, axis=1)

        def norm(a: str) -> str:
            # Map a relation to the canonical relation of its predicate.
            return predicate_to_relations[relation_to_predicate[a]][0]

        true = np.array([relation_lst.index(norm(i.target[1])) for i in batch],
                        dtype=predicted.dtype)

        is_correct_lst += (predicted == true).tolist()

    return np.mean(is_correct_lst).item() * 100.0
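
# A compact sketch of the accuracy computation performed by `accuracy_b`: take
# the arg-max over per-relation scores and compare against gold relation
# indices. Purely illustrative; the helper name is not part of the codebase.
def _argmax_accuracy_sketch(scores_np: np.ndarray, true_idx: np.ndarray) -> float:
    predicted = np.argmax(scores_np, axis=1)
    return float(np.mean(predicted == true_idx)) * 100.0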
def __init__(self, data: Data, batch_size: int, nb_epochs: int,
             random_state: np.random.RandomState) -> None:
    self.data = data
    self.batch_size = batch_size
    self.random_state = random_state

    size = nb_epochs * data.nb_examples
    self.curriculum_Xs = np.zeros(size, dtype=np.int32)
    self.curriculum_Xp = np.zeros(size, dtype=np.int32)
    self.curriculum_Xo = np.zeros(size, dtype=np.int32)
    self.curriculum_Xi = np.zeros(size, dtype=np.int32)

    for epoch_no in range(nb_epochs):
        curriculum_order = self.random_state.permutation(data.nb_examples)
        start = epoch_no * data.nb_examples
        end = (epoch_no + 1) * data.nb_examples
        self.curriculum_Xs[start:end] = data.Xs[curriculum_order]
        self.curriculum_Xp[start:end] = data.Xp[curriculum_order]
        self.curriculum_Xo[start:end] = data.Xo[curriculum_order]
        self.curriculum_Xi[start:end] = data.Xi[curriculum_order]

    self.batches = make_batches(self.curriculum_Xs.shape[0], batch_size)
    self.nb_batches = len(self.batches)
def test_clutrr_v3():
    embedding_size = 20
    batch_size = 8

    torch.manual_seed(0)

    triples, hops = [], []

    for i in range(32):
        triples += [(f'a{i}', 'p', f'b{i}'), (f'b{i}', 'q', f'c{i}')]
        hops += [(f'a{i}', 'r', f'c{i}')]

    entity_lst = sorted({s for (s, _, _) in triples + hops} | {o for (_, _, o) in triples + hops})
    predicate_lst = sorted({p for (_, p, _) in triples + hops})

    nb_entities, nb_predicates = len(entity_lst), len(predicate_lst)

    entity_to_index = {e: i for i, e in enumerate(entity_lst)}
    predicate_to_index = {p: i for i, p in enumerate(predicate_lst)}

    kernel = GaussianKernel(slope=None)

    entity_embeddings = nn.Embedding(nb_entities, embedding_size, sparse=True)
    predicate_embeddings = nn.Embedding(nb_predicates, embedding_size, sparse=True)

    # _hops = LinearReformulator(2, embedding_size)
    _hops = AttentiveReformulator(2, predicate_embeddings)

    model = NeuralKB(kernel=kernel, scoring_type='concat')
    hoppy = Hoppy(model, hops_lst=[(_hops, False)], depth=1)

    params = [p for p in hoppy.parameters()
              if not torch.equal(p, entity_embeddings.weight)
              and not torch.equal(p, predicate_embeddings.weight)]

    for tensor in params:
        print(f'\t{tensor.size()}\t{tensor.device}')

    loss_function = nn.BCELoss()

    optimizer = optim.Adagrad(params, lr=0.1)

    hops_data = []
    for i in range(64):
        hops_data += hops

    batches = make_batches(len(hops_data), batch_size)

    rs = np.random.RandomState()

    c, d = 0.0, 0.0

    p_emb = predicate_embeddings(torch.from_numpy(np.array([predicate_to_index['p']])))
    q_emb = predicate_embeddings(torch.from_numpy(np.array([predicate_to_index['q']])))

    for batch_start, batch_end in batches:
        hops_batch = hops_data[batch_start:batch_end]

        s_lst = [s for (s, _, _) in hops_batch]
        p_lst = [p for (_, p, _) in hops_batch]
        o_lst = [o for (_, _, o) in hops_batch]

        nb_positives = len(s_lst)
        nb_negatives = nb_positives * 3

        s_n_lst = rs.permutation(nb_entities)[:nb_negatives].tolist()
        nb_negatives = len(s_n_lst)
        o_n_lst = rs.permutation(nb_entities)[:nb_negatives].tolist()
        p_n_lst = list(islice(cycle(p_lst), nb_negatives))

        xs_np = np.array([entity_to_index[s] for s in s_lst] + s_n_lst)
        xp_np = np.array([predicate_to_index[p] for p in p_lst + p_n_lst])
        xo_np = np.array([entity_to_index[o] for o in o_lst] + o_n_lst)

        xs_emb = entity_embeddings(torch.from_numpy(xs_np))
        xp_emb = predicate_embeddings(torch.from_numpy(xp_np))
        xo_emb = entity_embeddings(torch.from_numpy(xo_np))

        rel_emb = encode_relation(facts=triples,
                                  relation_embeddings=predicate_embeddings,
                                  relation_to_idx=predicate_to_index)
        arg1_emb, arg2_emb = encode_arguments(facts=triples,
                                              entity_embeddings=entity_embeddings,
                                              entity_to_idx=entity_to_index)

        facts = [rel_emb, arg1_emb, arg2_emb]

        scores = hoppy.score(xp_emb, xs_emb, xo_emb, facts=facts,
                             entity_embeddings=entity_embeddings.weight)

        labels_np = np.zeros(xs_np.shape[0])
        labels_np[:nb_positives] = 1
        labels = torch.from_numpy(labels_np).float()

        # for s, p, o, l in zip(xs_np, xp_np, xo_np, labels):
        #     print(s, p, o, l)

        loss = loss_function(scores, labels)

        hop_1_emb = hoppy.hops_lst[0][0].hops_lst[0](xp_emb)
        hop_2_emb = hoppy.hops_lst[0][0].hops_lst[1](xp_emb)

        c = kernel.pairwise(p_emb, hop_1_emb).mean().cpu().detach().numpy()
        d = kernel.pairwise(q_emb, hop_2_emb).mean().cpu().detach().numpy()

        print(c, d)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    assert c > 0.95 and d > 0.95
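
# A self-contained sketch of the negative-sampling scheme used in the test
# above: build `ratio` times as many negatives as positives (capped at the
# number of entities), each with a randomly drawn subject and object sampled
# without replacement from the entity set, and a predicate cycled from the
# positive batch. The helper name and signature are illustrative only.
def _negative_sampling_sketch(p_idx_lst: List[int], nb_entities: int,
                              rs: np.random.RandomState,
                              ratio: int = 3) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    nb_negatives = min(len(p_idx_lst) * ratio, nb_entities)
    s_neg = rs.permutation(nb_entities)[:nb_negatives]
    o_neg = rs.permutation(nb_entities)[:nb_negatives]
    p_neg = np.array(list(islice(cycle(p_idx_lst), nb_negatives)))
    return s_neg, p_neg, o_neg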
def evaluate_slow(entity_embeddings: nn.Embedding,
                  predicate_embeddings: nn.Embedding,
                  test_triples: List[Tuple[str, str, str]],
                  all_triples: List[Tuple[str, str, str]],
                  entity_to_index: Dict[str, int],
                  predicate_to_index: Dict[str, int],
                  model: BaseLatentFeatureModel,
                  batch_size: int,
                  device: torch.device):
    xs = np.array([entity_to_index.get(s) for (s, _, _) in test_triples])
    xp = np.array([predicate_to_index.get(p) for (_, p, _) in test_triples])
    xo = np.array([entity_to_index.get(o) for (_, _, o) in test_triples])

    # Indexes for the filtered setting: all known objects for each (subject, predicate)
    # pair, and all known subjects for each (predicate, object) pair.
    sp_to_o, po_to_s = {}, {}
    for s, p, o in all_triples:
        s_idx, p_idx, o_idx = entity_to_index.get(s), predicate_to_index.get(p), entity_to_index.get(o)
        sp_key = (s_idx, p_idx)
        po_key = (p_idx, o_idx)

        if sp_key not in sp_to_o:
            sp_to_o[sp_key] = []
        if po_key not in po_to_s:
            po_to_s[po_key] = []

        sp_to_o[sp_key] += [o_idx]
        po_to_s[po_key] += [s_idx]

    assert xs.shape == xp.shape == xo.shape
    nb_test_triples = xs.shape[0]

    batches = make_batches(nb_test_triples, batch_size)

    hits = dict()
    hits_at = [1, 3, 5, 10]

    for hits_at_value in hits_at:
        hits[hits_at_value] = 0.0

    def hits_at_n(n_, rank):
        if rank <= n_:
            hits[n_] = hits.get(n_, 0) + 1

    counter = 0
    mrr = 0.0

    ranks_l, ranks_r = [], []
    for start, end in batches:
        batch_xs = xs[start:end]
        batch_xp = xp[start:end]
        batch_xo = xo[start:end]

        batch_size = batch_xs.shape[0]
        counter += batch_size * 2

        with torch.no_grad():
            tensor_xs = torch.LongTensor(batch_xs).to(device)
            tensor_xp = torch.LongTensor(batch_xp).to(device)
            tensor_xo = torch.LongTensor(batch_xo).to(device)

            tensor_xs_emb = entity_embeddings(tensor_xs)
            tensor_xp_emb = predicate_embeddings(tensor_xp)
            tensor_xo_emb = entity_embeddings(tensor_xo)
            # print(entity_embeddings.weight.shape)

            if model.model.facts[0].shape[0] < 90000:
                res_sp, res_po = model.forward_(tensor_xp_emb, tensor_xs_emb, tensor_xo_emb)
            else:
                res_sp, res_po = model.forward__(tensor_xp_emb, tensor_xs_emb, tensor_xo_emb)

            _scores_sp, _ = res_sp
            _scores_po, _ = res_po

            scores_sp, scores_po = _scores_sp.cpu().numpy(), _scores_po.cpu().numpy()

            del _scores_sp, _scores_po
            del tensor_xs, tensor_xp, tensor_xo
            del tensor_xs_emb, tensor_xp_emb, tensor_xo_emb
            del res_sp, res_po
            # print(scores_sp.shape, scores_po.shape)

        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        batch_size = batch_xs.shape[0]
        for elem_idx in range(batch_size):
            s_idx, p_idx, o_idx = batch_xs[elem_idx], batch_xp[elem_idx], batch_xo[elem_idx]

            # Code for the filtered setting
            sp_key = (s_idx, p_idx)
            po_key = (p_idx, o_idx)

            o_to_remove = sp_to_o[sp_key]
            s_to_remove = po_to_s[po_key]

            for tmp_o_idx in o_to_remove:
                if tmp_o_idx != o_idx:
                    scores_sp[elem_idx, tmp_o_idx] = -np.inf

            for tmp_s_idx in s_to_remove:
                if tmp_s_idx != s_idx:
                    scores_po[elem_idx, tmp_s_idx] = -np.inf
            # End of code for the filtered setting

            rank_l = 1 + np.argsort(np.argsort(-scores_po[elem_idx, :]))[s_idx]
            rank_r = 1 + np.argsort(np.argsort(-scores_sp[elem_idx, :]))[o_idx]

            ranks_l += [rank_l]
            ranks_r += [rank_r]

            mrr += 1.0 / rank_l
            mrr += 1.0 / rank_r

            for n in hits_at:
                hits_at_n(n, rank_l)

            for n in hits_at:
                hits_at_n(n, rank_r)

    counter = float(counter)

    mrr /= counter
    for n in hits_at:
        hits[n] /= counter

    metrics = dict()
    metrics['MRR'] = mrr
    for n in hits_at:
        metrics['hits@{}'.format(n)] = hits[n]

    return metrics
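
# Sketch of the filtered rank computation used above: known true candidates
# (other than the test answer) are masked with -inf, and the double argsort
# converts the remaining scores into a 1-based rank for the target index.
# Illustrative only; the helper name is not part of the codebase.
def _filtered_rank_sketch(scores_row: np.ndarray, target_idx: int,
                          to_filter: List[int]) -> int:
    scores_row = scores_row.copy()
    for idx in to_filter:
        if idx != target_idx:
            scores_row[idx] = -np.inf
    return int(1 + np.argsort(np.argsort(-scores_row))[target_idx])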
def test_learning_v3():
    embedding_size = 10
    batch_size = 16

    triples, hops = [], []

    for i in range(16):
        triples += [(f'a{i}', 'p', f'b{i}'), (f'b{i}', 'q', f'c{i}')]
        hops += [(f'a{i}', 'r', f'c{i}')]

    entity_lst = sorted({s for (s, _, _) in triples + hops} | {o for (_, _, o) in triples + hops})
    predicate_lst = sorted({p for (_, p, _) in triples + hops})

    nb_entities, nb_predicates = len(entity_lst), len(predicate_lst)

    entity_to_index = {e: i for i, e in enumerate(entity_lst)}
    predicate_to_index = {p: i for i, p in enumerate(predicate_lst)}

    torch.manual_seed(0)

    kernel = GaussianKernel()

    entity_embeddings = nn.Embedding(nb_entities, embedding_size * 2, sparse=True)
    predicate_embeddings = nn.Embedding(nb_predicates, embedding_size * 2, sparse=True)

    fact_rel = torch.LongTensor(np.array([predicate_to_index[p] for (_, p, _) in triples]))
    fact_arg1 = torch.LongTensor(np.array([entity_to_index[s] for (s, _, _) in triples]))
    fact_arg2 = torch.LongTensor(np.array([entity_to_index[o] for (_, _, o) in triples]))
    facts = [fact_rel, fact_arg1, fact_arg2]

    model = NeuralKB(entity_embeddings=entity_embeddings,
                     predicate_embeddings=predicate_embeddings,
                     kernel=kernel,
                     facts=facts)

    reformulator = AttentiveReformulator(2, predicate_embeddings)
    hoppy = SimpleHoppy(model, entity_embeddings, hops=reformulator)

    N3_reg = N3()

    params = [p for p in hoppy.parameters()
              if not torch.equal(p, entity_embeddings.weight)
              and not torch.equal(p, predicate_embeddings.weight)]

    loss_function = nn.CrossEntropyLoss(reduction='mean')

    p_emb = predicate_embeddings(torch.LongTensor(np.array([predicate_to_index['p']])))
    q_emb = predicate_embeddings(torch.LongTensor(np.array([predicate_to_index['q']])))
    # r_emb = predicate_embeddings(torch.LongTensor(np.array([predicate_to_index['r']])))

    optimizer = optim.Adagrad(params, lr=0.1)

    hops_data = []
    for i in range(128):
        hops_data += hops

    batches = make_batches(len(hops_data), batch_size)

    c, d = 0.0, 0.0

    for batch_start, batch_end in batches:
        hops_batch = hops_data[batch_start:batch_end]

        s_lst = [s for (s, _, _) in hops_batch]
        p_lst = [p for (_, p, _) in hops_batch]
        o_lst = [o for (_, _, o) in hops_batch]

        xs_np = np.array([entity_to_index[s] for s in s_lst])
        xp_np = np.array([predicate_to_index[p] for p in p_lst])
        xo_np = np.array([entity_to_index[o] for o in o_lst])

        xs = torch.LongTensor(xs_np)
        xp = torch.LongTensor(xp_np)
        xo = torch.LongTensor(xo_np)

        xs_emb = entity_embeddings(xs)
        xp_emb = predicate_embeddings(xp)
        xo_emb = entity_embeddings(xo)

        sp_scores, po_scores = hoppy.forward(xp_emb, xs_emb, xo_emb)

        loss = loss_function(sp_scores, xo) + loss_function(po_scores, xs)

        factors = [hoppy.factor(e) for e in [xp_emb, xs_emb, xo_emb]]
        loss += 0.1 * N3_reg(factors)

        tmp = hoppy.hops(xp_emb)
        hop_1_emb = tmp[0]
        hop_2_emb = tmp[1]

        c = kernel.pairwise(p_emb, hop_1_emb).mean().cpu().detach().numpy()
        d = kernel.pairwise(q_emb, hop_2_emb).mean().cpu().detach().numpy()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    assert c > 0.95
    assert d > 0.95
def evaluate_naive(entity_embeddings: nn.Embedding,
                   predicate_embeddings: nn.Embedding,
                   test_triples: List[Tuple[str, str, str]],
                   all_triples: List[Tuple[str, str, str]],
                   entity_to_index: Dict[str, int],
                   predicate_to_index: Dict[str, int],
                   model: BaseLatentFeatureModel,
                   batch_size: int,
                   device: torch.device):
    index_to_entity = {index: entity for entity, index in entity_to_index.items()}
    index_to_predicate = {index: predicate for predicate, index in predicate_to_index.items()}

    test_triples = {(entity_to_index[s], predicate_to_index[p], entity_to_index[o])
                    for s, p, o in test_triples}
    all_triples = {(entity_to_index[s], predicate_to_index[p], entity_to_index[o])
                   for s, p, o in all_triples}

    entities = sorted(index_to_entity.keys())

    hits = dict()
    hits_at = [1, 3, 5, 10]

    for hits_at_value in hits_at:
        hits[hits_at_value] = 0.0

    def hits_at_n(n_, rank):
        if rank <= n_:
            hits[n_] = hits.get(n_, 0) + 1

    counter = 0
    mrr = 0.0

    for s_idx, p_idx, o_idx in test_triples:
        # Corrupt the subject and the object; keep candidates that are either
        # unseen in the knowledge graph or the test triple itself.
        corrupted_subject = [(entity, p_idx, o_idx) for entity in entities
                             if (entity, p_idx, o_idx) not in all_triples or entity == s_idx]
        corrupted_object = [(s_idx, p_idx, entity) for entity in entities
                            if (s_idx, p_idx, entity) not in all_triples or entity == o_idx]

        index_l = corrupted_subject.index((s_idx, p_idx, o_idx))
        index_r = corrupted_object.index((s_idx, p_idx, o_idx))

        nb_corrupted_l = len(corrupted_subject)
        # nb_corrupted_r = len(corrupted_object)

        corrupted = corrupted_subject + corrupted_object
        nb_corrupted = len(corrupted)

        batches = make_batches(nb_corrupted, batch_size)

        scores_lst = []
        for start, end in batches:
            batch = np.array(corrupted[start:end])
            x_sub, x_pred, x_obj = batch[:, 0], batch[:, 1], batch[:, 2]

            with torch.no_grad():
                tensor_xs = torch.from_numpy(x_sub).to(device)
                tensor_xp = torch.from_numpy(x_pred).to(device)
                tensor_xo = torch.from_numpy(x_obj).to(device)

                tensor_xs_emb = entity_embeddings(tensor_xs)
                tensor_xp_emb = predicate_embeddings(tensor_xp)
                tensor_xo_emb = entity_embeddings(tensor_xo)

                scores_np = model.score(tensor_xp_emb, tensor_xs_emb, tensor_xo_emb).cpu().numpy()

            scores_lst += scores_np.tolist()

        scores_l = scores_lst[:nb_corrupted_l]
        scores_r = scores_lst[nb_corrupted_l:]

        rank_l = 1 + np.argsort(np.argsort(-np.array(scores_l)))[index_l]
        counter += 1

        for n in hits_at:
            hits_at_n(n, rank_l)

        mrr += 1.0 / rank_l

        rank_r = 1 + np.argsort(np.argsort(-np.array(scores_r)))[index_r]
        counter += 1

        for n in hits_at:
            hits_at_n(n, rank_r)

        mrr += 1.0 / rank_r

    counter = float(counter)

    mrr /= counter
    for n in hits_at:
        hits[n] /= counter

    metrics = dict()
    metrics['MRR'] = mrr
    for n in hits_at:
        metrics['hits@{}'.format(n)] = hits[n]

    return metrics
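
# A compact sketch of how the per-triple ranks gathered by the evaluation
# routines above translate into the reported figures: MRR is the mean
# reciprocal rank over both corruption sides, and hits@N is the fraction of
# ranks at or below N. The helper name is illustrative only.
def _ranking_metrics_sketch(ranks: List[int], hits_at=(1, 3, 5, 10)) -> Dict[str, float]:
    metrics = {'MRR': float(np.mean([1.0 / r for r in ranks]))}
    for n in hits_at:
        metrics['hits@{}'.format(n)] = float(np.mean([1.0 if r <= n else 0.0 for r in ranks]))
    return metrics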