def evaluate_subgraph_extraction(nhops, e_field, p_field, limit=None, show_errors=False):
    '''
    Check how many of the gold answers are contained in the subgraph that is
    extracted around the annotated entities.

    nhops <int> number of hops handed to the HDT `configure_hops` call
    e_field, p_field <str> names of the fields in MongoDB to look up the IDs
    limit is passed through to `mongo.get_sample`
    show_errors <bool> print question and annotations of every sample whose
        answers were not fully covered

    Returns a list with one accuracy value per scored sample: the fraction of
    the sample's answer ids that appear among the extracted entities.
    Samples without a non-empty 'answers_ids' field are skipped.
    '''
    samples = mongo.get_sample(limit=limit)
    accs = []
    # iterate over the cursor
    for doc in samples:
        # a sample without gold answers cannot be scored (and would divide by
        # zero below), so skip it before paying for the subgraph extraction
        correct_answers_ids = set(doc.get('answers_ids') or ())
        if not correct_answers_ids:
            continue

        # get correct entities and predicates from the GS annotations
        e_ids = doc[e_field]
        p_uris = doc[p_field]

        # extract the subgraph; always release the KG handle, even on failure
        kg = HDTDocument(hdt_path + hdt_file)
        try:
            kg.configure_hops(nhops, p_uris, namespace, True)
            entities, _, _ = kg.compute_hops(e_ids)
        finally:
            kg.remove()

        # check if we hit the answer set
        n_hits = len(correct_answers_ids & set(entities))
        acc = n_hits / len(correct_answers_ids)
        accs.append(acc)

        if show_errors and acc < 1:
            print(doc['question'])
            print(doc['entity_ids'])
            print(doc['predicate_uris'])
    return accs
def filter_answer_by_class(classes, answers_ids):
    '''
    Keep only those candidate answers that survive the KG type filter.

    classes: iterable of collections; iterating each one yields class ids
    answers_ids: list of dicts {answer id: score}

    Returns a list of single-entry dicts {answer id: score}, in the order of
    the input, for every answer id returned by `kg.filter_types`.
    '''
    classes_ids = [_id for e in classes for _id in e]
    candidate_ids = [_id for e in answers_ids for _id in e]

    # always release the KG handle, even if the type filter fails
    kg = HDTDocument(hdt_path + hdt_file)
    try:
        filtered = kg.filter_types(candidate_ids, classes_ids)
    finally:
        kg.remove()

    # the result is nested: flatten it into a set so that the membership test
    # below does not rescan a list for every candidate
    kept_ids = {_id for group in filtered for _id in group}

    return [{_id: a_score}
            for e in answers_ids
            for _id, a_score in e.items()
            if _id in kept_ids]
def hop(activations, constraints, predicates_ids, verbose=False, _bool_answer=False, max_triples=500000):
    '''
    Spread activation one hop from the selected entities along the selected
    predicates and return the ids of the entities that were activated.

    activations, constraints: lists (they are concatenated) of collections;
        iterating each one yields entity ids
    predicates_ids: iterable of collections; iterating each one yields
        predicate ids
    verbose <bool> print the size of every retrieved subgraph partition
    _bool_answer <bool> when set, the constraint-based threshold is not applied
    max_triples <int> size of one subgraph partition requested from the KG

    Returns a list of entity ids whose accumulated activation is above the
    threshold (number of constraints, reduced by one when the predicate lists
    differ; 0 for boolean questions or when there are no constraints).
    '''
    # extract the subgraph for the selected entities
    top_entities_ids = [_id for e in activations + constraints for _id in e]
    # exclude types predicate
    top_predicates_ids = [
        _id for p in predicates_ids for _id in p if _id != 68655
    ]

    # accumulated activation per entity id across all partitions
    scores = defaultdict(int)
    offset = 0
    # iteratively call the HDT API to retrieve all subgraph partitions
    while True:
        # get the subgraph for selected predicates only;
        # always release the KG handle, even if the request fails
        kg = HDTDocument(hdt_path + hdt_file)
        try:
            kg.configure_hops(1, top_predicates_ids, namespace, True)
            entities, predicate_ids, adjacencies = kg.compute_hops(
                top_entities_ids, max_triples, offset)
        finally:
            kg.remove()

        if not entities:
            # no more partitions: filter out the answers by min activation scores
            if not _bool_answer and constraints:
                # normalize activations by checking the 'must' constraints:
                # number of constraints * weights
                min_a = len(constraints) * 1
                # NOTE(review): this compares the nested input with the
                # flattened id list, so it looks like it is true whenever
                # predicates_ids is non-empty -- confirm the intent
                if predicates_ids != top_predicates_ids:
                    min_a -= 1
            else:
                min_a = 0
            # return HDT ids of the activated entities
            return [
                a_id for a_id, a_score in scores.items() if a_score > min_a
            ]

        if verbose:
            print("Subgraph extracted:")
            print("%d entities" % len(entities))
            print("%d predicates" % len(predicate_ids))
            print("Loading adjacencies..")

        offset += max_triples

        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        adj_shape = (len(entities), len(entities))
        # generate a list of adjacency matrices per predicate assuming the
        # graph is undirected wo self-loops
        A = generate_adj_sp(adjacencies, adj_shape, include_inverse=True)

        # local positions of the selected entities and predicates
        e_ids = [
            entities_dict[entity_id] for entity_id in top_entities_ids
            if entity_id in entities_dict
        ]
        p_ids = [
            predicate_ids.index(predicate_id)
            for predicate_id in top_predicates_ids
            if predicate_id in predicate_ids
        ]
        if not p_ids:
            # none of the selected predicates occurs in this partition
            continue

        # graph activation vectors
        x = np.zeros(len(entities))
        x[e_ids] = 1
        p = np.zeros(len(predicate_ids))
        p[p_ids] = 1

        # slice A by the selected predicates and concatenate edge lists
        y = (x @ sp.hstack(A * p)).reshape(
            [len(predicate_ids), len(entities)]).sum(0)
        # check output size
        assert y.shape[0] == len(entities)

        # harvest activations
        top = np.argwhere(y > 0).T.tolist()[0]
        if top:
            # a set keeps the membership test constant-time instead of
            # scanning an array once per entity
            activated = {entities[i] for i in top}
            # store the activation values per answer id
            for i, e in enumerate(entities):
                if e in activated:
                    scores[e] += y[i]
torch.tensor(p_attention_masks), torch.tensor(all_predicate_ids)], torch.tensor([answer_id]), first_question]) # carry over history to the next dialogue turn dialogue_history.extend([question, answer_label]) del entity_ids, predicate_ids, adjacencies print("Compiled dataset with %d samples" % len(train_dataset)) return train_dataset train_dataset = prepare_dataset(train_conversations_path) valid_dataset = prepare_dataset(dev_conversations_path) # remove everything from memory but model and tensors for training/validaton kg.remove() del kg print("Dataset loaded") # train model (matching nodes and relations with a Transformer with subgraph sampling) n_batches = 1000 import random import numpy as np from functools import reduce import sys import os import gc import psutil
def entity_linking(spans_field, save, show_errors=True, add_nieghbours=True, lookup_embeddings=True):
    '''
    Link the entity spans of every sampled question to KG entity ids and
    report precision / recall / F-score against the annotated 'entity_ids'.

    spans_field <str> name of the MongoDB field that holds the entity spans
    save <bool> write the guesses back to MongoDB ('entity_ids_guess' and
        'entity_guess' fields)
    show_errors <bool> currently not used by the function body
    add_nieghbours <bool> extend the candidates with the entities of the 1-hop
        subgraph around the (not too high-degree) index matches
    lookup_embeddings <bool> rank the candidate labels with `e_vectors` and
        keep the closest `semantic_cutoff` ones

    Relies on names defined elsewhere in the file: mongo, limit, e_index,
    string_cutoff, max_degree, hdt_path, hdt_file, namespace, max_triples,
    e_vectors, semantic_cutoff.

    Returns nothing; per-question and averaged scores are printed.
    '''
    # iterate over the cursor
    cursor = mongo.get_sample(limit=limit)
    count = 0
    # hold macro-average stats for the model performance over the samples
    ps, rs, fs = [], [], []
    with cursor:
        for doc in cursor:
            correct_uris = doc['entity_uris']
            print(set(correct_uris))
            # get entity spans
            e_spans = doc[spans_field]
            # get entity matches TODO save scores
            top_ids = []
            top_entities = {}
            for span in e_spans:
                print("Span: %s" % span)
                print("Index lookup..")
                guessed_labels, guessed_ids, look_up_ids = [], [], []
                for match in e_index.match_label(span, top=string_cutoff):
                    source = match['_source']
                    label = source['label_exact']
                    _id = source['id']
                    # avoid expanding heavy hitters
                    if int(source['count']) < max_degree:
                        look_up_ids.append(_id)
                    guessed_ids.append(_id)
                    if label not in guessed_labels:
                        guessed_labels.append(label)
                print("%d candidate labels" % len(guessed_labels))

                if add_nieghbours:
                    print("KG lookup..")
                    # always release the KG handle, even if the request fails
                    kg = HDTDocument(hdt_path + hdt_file)
                    try:
                        kg.configure_hops(1, [], namespace, True)
                        # get a sample of the subgraph: the first <max_triples> only
                        entities, _, _ = kg.compute_hops(
                            look_up_ids, max_triples, 0)
                    finally:
                        kg.remove()
                    # look up labels
                    for e_id in entities:
                        match = e_index.look_up_by_id(e_id)
                        if match:
                            label = match[0]['_source']['label_exact']
                            if label not in guessed_labels:
                                guessed_labels.append(label)
                    guessed_ids.extend(entities)

                # score with embeddings: only labels known to the embeddings
                guessed_labels = [
                    label for label in guessed_labels if label in e_vectors
                ]
                print("%d candidate labels" % len(guessed_labels))

                if guessed_labels and lookup_embeddings:
                    print("Embeddings lookup..")
                    dists = e_vectors.distance(span, guessed_labels)
                    top = np.argsort(dists)[:semantic_cutoff].tolist()
                    top_labels = [guessed_labels[i] for i in top]
                    print("selected labels: %s" % top_labels)
                    print("Index lookup..")
                    top_entities[span] = []
                    for i, label in enumerate(top_labels):
                        print(label)
                        distance = float(dists[top[i]])
                        for match in e_index.look_up_by_label(label):
                            source = match['_source']
                            _id = source['id']
                            uri = source['uri']
                            print(uri)
                            top_entities[span].append({
                                'rank': i + 1,
                                'distance': distance,
                                'degree': source['count'],
                                'id': _id,
                                'uri': uri
                            })
                            top_ids.append(_id)
                else:
                    # no embedding ranking: keep every candidate id
                    top_ids.extend(guessed_ids)

            # evaluate against the correct entity ids
            top_ids = list(set(top_ids))
            correct_ids = set(doc['entity_ids'])
            n_hits = len(correct_ids & set(top_ids))

            if correct_ids:
                r = n_hits / len(correct_ids)
            else:
                # no gold entities annotated: report the question and score
                # recall as 0 instead of leaving r unset (or reusing the value
                # of the previous question)
                print(doc['question'])
                r = 0
            p = n_hits / len(top_ids) if top_ids else 0
            f = 2 * p * r / (p + r) if (p + r) else 0
            print("P: %.2f R: %.2f F: %.2f" % (p, r, f))

            # add stats
            ps.append(p)
            rs.append(r)
            fs.append(f)

            # save to MongoDB
            if save:
                doc['entity_ids_guess'] = top_ids
                doc['entity_guess'] = top_entities
                mongo.col.update_one({'_id': doc['_id']}, {"$set": doc},
                                     upsert=True)
                count += 1

    print("P: %.2f R: %.2f F: %.2f" % (np.mean(ps), np.mean(rs), np.mean(fs)))
    print("Fin. Results for %d questions" % len(ps))
    if save:
        print("%d documents annotated with entity ids guess" % count)
def hop(entities, constraints, top_predicates, verbose=False, max_triples=500000):
    '''
    Extract the subgraph for the selected entities and score its entities by
    spreading the entity and predicate scores over one hop.

    entities, constraints: lists (they are concatenated) of dicts
        {entity id: score}
    top_predicates: list of dicts {predicate id: score}; the entries of one
        dict are treated as synonyms of the same predicate
    verbose <bool> currently not used by the function body
    max_triples <int> size of one subgraph partition requested from the KG

    Returns a list of single-entry dicts {entity id: accumulated score}.
    '''
    n_constraints = len(constraints)
    if entities:
        n_constraints += 1
    top_entities = entities + constraints
    all_entities_ids = [_id for e in top_entities for _id in e]
    top_predicates_ids = [_id for p in top_predicates for _id in p if _id]
    # number of components the scores are normalized by
    n_components = len(top_predicates) + n_constraints

    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0
    while True:
        # get the subgraph for selected predicates only;
        # always release the KG handle, even if the request fails
        kg = HDTDocument(hdt_path + hdt_file)
        try:
            kg.configure_hops(1, top_predicates_ids, namespace, True)
            sub_entities, predicate_ids, adjacencies = kg.compute_hops(
                all_entities_ids, max_triples, offset)
        finally:
            kg.remove()

        if not sub_entities:
            # no more partitions: report everything collected so far
            return [{a_id: a_score} for a_id, a_score in activations.items()]

        offset += max_triples
        n_entities = len(sub_entities)
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(sub_entities)}
        # local position of every predicate (first occurrence, like list.index),
        # built once instead of searching the list for every synonym
        predicate_pos = {}
        for pos, p_id in enumerate(predicate_ids):
            predicate_pos.setdefault(p_id, pos)

        # generate a list of adjacency matrices per predicate assuming the
        # graph is undirected wo self-loops
        A = generate_adj_sp(adjacencies, n_entities, include_inverse=True)

        # activate entities -- build sparse matrix (one row per input concept)
        row, col, data = [], [], []
        for i, concept_ids in enumerate(top_entities):
            for entity_id, score in concept_ids.items():
                if entity_id in entities_dict:
                    row.append(i)
                    col.append(entities_dict[entity_id])
                    data.append(score)
        x = sp.csr_matrix((data, (row, col)),
                          shape=(len(top_entities), n_entities))

        if top_predicates_ids:
            # iterate over predicates
            ye = sp.csr_matrix((len(top_entities), n_entities))
            yp = sp.csr_matrix((len(top_predicates), n_entities))
            for i, concept_ids in enumerate(top_predicates):
                # activate predicates
                p = np.zeros([len(predicate_ids)])
                # iterate over synonyms
                for p_id, score in concept_ids.items():
                    local_id = predicate_pos.get(p_id)
                    if local_id is not None:
                        p[local_id] = score
                # slice A by the selected predicates
                _A = sum(p * A)
                _y = x @ _A
                # normalize: cut top to 1
                _y[_y > 1] = 1
                yp[i] = _y.sum(0)
                ye += _y
            y = sp.vstack([ye, yp])
        else:
            # fall back to evaluate all predicates
            y = x @ sum(A)

        sum_a = sum(y)
        sum_a_norm = sum_a.toarray()[0] / n_components
        # normalize: cut top to 1
        sum_a_norm[sum_a_norm > 1] = 1

        # activations across components
        y_counts = binarize(y, threshold=0.0)
        count_a = sum(y_counts).toarray()[0]
        # final scores
        y = (sum_a_norm + count_a) / (n_components + 1)

        # check output size
        assert y.shape[0] == n_entities

        top = np.argwhere(y > 0).T.tolist()[0]
        if not top:
            # if no such answer is found fall back to the answers satisfying
            # the maximum number of constraints
            max_cs = y[np.argmax(y)]
            # at least some activation (evidence from min one constraint)
            if max_cs != 0:
                top = np.argwhere(y == max_cs).T.tolist()[0]

        if top:
            # a set keeps the membership test constant-time instead of
            # scanning an array once per entity
            activated = {sub_entities[i] for i in top}
            # store the activation values per answer id
            for i, e in enumerate(sub_entities):
                if e in activated:
                    activations[e] += y[i]
def forward(self, e_scores, entity_ids, p_scores, answer=None, all_predicate_ids=all_predicate_ids):
    '''
    Sample a subgraph around the best-scored entities and project the scores
    onto it.

    Inputs:
        *e_scores*: entity scores from Transformer
        *entity_ids*: global entity ids to request the KG for adjacencies
        *p_scores*: predicate scores from Transformer
        *answer*: optional entity id that has to be part of the subgraph
        *all_predicate_ids*: predicate ids used to build the predicate look-up table
    Outputs:
        *subgraph*: tuple (indices, e_scores, p_scores, relation_mask)
        *s_entity_ids*: entity ids of the retrieved subgraph
        `((), None)` is returned when the KG yields no entities or when
        *answer* is given but is not among them.
    '''
    # get the top-k entities based on the score vector
    _, ranking = torch.sort(e_scores.view(-1), descending=True)
    best = ranking[:self.top_e]
    sampled_entities = entity_ids[best].tolist()
    # predicates are not sampled: an empty list is handed to the KG
    sampled_predicates = []

    with torch.no_grad():
        # request the KG through the HDT API for a subgraph given the entity
        # and relation subsets
        kg = HDTDocument(self.hdt_path)
        kg.configure_hops(1, sampled_predicates,
                          'predef-wikidata2018-09-all', True, False)
        s_entity_ids, s_predicate_ids, adjacencies = kg.compute_hops(
            sampled_entities, 5000, 0)
        kg.remove()
        del kg

    # give up when there is no subgraph or it does not contain the answer
    answer_missing = answer is not None and answer not in s_entity_ids
    if not s_entity_ids or answer_missing:
        return (), None

    # build a lookup table for entity & predicate scores
    e_table = build_look_up(entity_ids)
    p_table = build_look_up(all_predicate_ids)
    del all_predicate_ids

    # load subgraph into tensor
    n_entities = len(s_entity_ids)
    n_predicates = len(s_predicate_ids)
    indices, relation_mask = adj(adjacencies, n_entities, n_predicates)

    # lookup local scores to activate respective entities & predicates
    e_scores = look_up(e_table, s_entity_ids, e_scores)
    p_scores = look_up(p_table, s_predicate_ids, p_scores)

    # clean up
    del p_table, s_predicate_ids, e_table, adjacencies
    gc.collect()
    torch.cuda.empty_cache()

    subgraph = (indices, e_scores, p_scores, relation_mask)
    return subgraph, s_entity_ids