def evaluate_subgraph_extraction(nhops, e_field, p_field, limit=None, show_errors=False):
    '''
    e_field, p_field <str> names of the fields in MongoDB to look up the IDs
    '''
    samples = mongo.get_sample(limit=limit)
    # iterate over the cursor
    accs = []
    for doc in samples:
        # get the correct entities and predicates from the GS annotations
        e_ids = doc[e_field]
        p_uris = doc[p_field]
        # extract the subgraph
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(nhops, p_uris, namespace, True)
        entities, _, _ = kg.compute_hops(e_ids)
        kg.remove()
        # check if we hit the answer set
        if 'answers_ids' in doc:
            correct_answers_ids = set(doc['answers_ids'])
            n_hits = len(correct_answers_ids & set(entities))
            # accuracy: fraction of gold answers recovered in the subgraph
            acc = float(n_hits) / len(correct_answers_ids)
            accs.append(acc)
            if show_errors and acc < 1:
                print(doc['question'])
                print(doc['entity_ids'])
                print(doc['predicate_uris'])
    return accs
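# Example usage (a sketch: `mongo`, `hdt_path`, `hdt_file` and `namespace` are
# module-level globals configured elsewhere; the field names follow the GS
# annotations printed above):
# accs = evaluate_subgraph_extraction(2, 'entity_ids', 'predicate_uris', limit=100)
# print("Mean answer recall: %.2f" % np.mean(accs))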
def hop(activations, constraints, predicates_ids, verbose=False, _bool_answer=False, max_triples=500000):
    # extract the subgraph for the selected entities
    top_entities_ids = [_id for e in activations + constraints for _id in e]
    # exclude the types predicate (HDT id 68655): it is too expensive to expand
    top_predicates_ids = [_id for p in predicates_ids for _id in p if _id != 68655]
    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0
    while True:
        # get the subgraph for the selected predicates only
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(1, top_predicates_ids, namespace, True)
        entities, predicate_ids, adjacencies = kg.compute_hops(top_entities_ids, max_triples, offset)
        kg.remove()

        if not entities:
            # filter the answers by the minimum activation score
            if not _bool_answer and constraints:
                # normalize activations by checking the 'must' constraints: number of constraints * weights
                min_a = len(constraints) * 1
                if predicates_ids != top_predicates_ids:
                    min_a -= 1
            else:
                min_a = 0
            # return the HDT ids of the activated entities
            return [a_id for a_id, a_score in activations.items() if a_score > min_a]

        if verbose:
            print("Subgraph extracted:")
            print("%d entities" % len(entities))
            print("%d predicates" % len(predicate_ids))
            print("Loading adjacencies..")

        offset += max_triples
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        adj_shape = (len(entities), len(entities))
        # generate a list of adjacency matrices per predicate assuming the graph is undirected w/o self-loops
        A = generate_adj_sp(adjacencies, adj_shape, include_inverse=True)
        # activations of entities and predicates
        e_ids = [entities_dict[entity_id] for entity_id in top_entities_ids
                 if entity_id in entities_dict]
        p_ids = [predicate_ids.index(predicate_id) for predicate_id in top_predicates_ids
                 if predicate_id in predicate_ids]
        if p_ids:
            # graph activation vectors
            x = np.zeros(len(entities))
            x[e_ids] = 1
            p = np.zeros(len(predicate_ids))
            p[p_ids] = 1
            # slice A by the selected predicates and concatenate the edge lists
            y = (x @ sp.hstack(A * p)).reshape([len(predicate_ids), len(entities)]).sum(0)
            # check output size
            assert y.shape[0] == len(entities)
            # harvest activations
            top = np.argwhere(y > 0).T.tolist()[0]
            if len(top) > 0:
                activations1 = np.asarray(entities)[top]
                # accumulate the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
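# A toy sketch (all names local to this example) of the message-passing step in
# hop(): per-predicate adjacency matrices are scaled by the predicate activation
# vector, concatenated, and multiplied by the entity activation vector to
# propagate activations one hop.
def _demo_hop_step():
    import numpy as np
    import scipy.sparse as sp
    n_entities = 3
    # predicate 0 links entities 0 <-> 1, predicate 1 links entities 1 <-> 2
    A = [sp.csr_matrix(([1, 1], ([0, 1], [1, 0])), shape=(n_entities, n_entities)),
         sp.csr_matrix(([1, 1], ([1, 2], [2, 1])), shape=(n_entities, n_entities))]
    x = np.array([1, 0, 0])  # activate entity 0
    p = np.array([1, 0])     # activate predicate 0 only
    # scale each adjacency matrix by its predicate activation and concatenate
    merged = sp.hstack([a * w for a, w in zip(A, p)]).toarray()  # shape (3, 6)
    y = (x @ merged).reshape(len(p), n_entities).sum(axis=0)
    print(y)  # [0. 1. 0.] -> entity 1 is reached via predicate 0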
# (fragment of the evaluation loop: `doc` iterates over the MongoDB sample and
# `accs` collects the per-question scores, as in evaluate_subgraph_extraction)
top_properties_ids = list(set([e_candidate['uri'] for e in top_properties.values()
                               for e_candidate in e]))
top_p_scores = {e_candidate['id']: e_candidate['score'] for e in top_properties.values()
                for e_candidate in e}
n_e_activations = len(top_entities_ids)
n_p_activations = len(top_properties_ids)

# extract the subgraph
kg = HDTDocument(hdt_path + hdt_file)
kg.configure_hops(nhops, top_properties_ids, namespace, True)
entities, predicate_ids, adjacencies = kg.compute_hops(top_entities_ids)
kg.remove()

if not max_x:
    max_x = len(entities)
if not max_p:
    max_p = len(predicate_ids)

# check if we hit the answer set
correct_answers_ids = set(doc['answers_ids'])
n_gs_answers = len(correct_answers_ids)
n_hits = len(correct_answers_ids & set(entities))
# accuracy
acc = float(n_hits) / len(correct_answers_ids)
accs.append(acc)
# pick only the samples where we find the correct subgraph
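# A toy illustration (assumed values) of the answer-set metric above: the
# "accuracy" is the fraction of gold answers recovered in the extracted subgraph.
# correct_answers_ids = {1, 2, 3}
# entities = [2, 3, 5, 7]
# acc = len(correct_answers_ids & set(entities)) / len(correct_answers_ids)  # 2/3 ~ 0.67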
class KBQA():
    def __init__(self, dataset_name='lcquad'):
        '''
        Set up models, indices, embeddings and the connection to the KG through the HDT API
        '''
        # connect to the entity and predicate catalogs
        self.e_index = IndexSearch('dbpedia201604e')
        self.p_index = IndexSearch('dbpedia201604p')

        # load embeddings
        self.word_vectors = load_embeddings(embeddings_path, embeddings_choice)
        self.p_vectors = load_embeddings(embeddings_path, 'fasttext_p_labels')

        # load the pre-trained question type classification model
        with open(model_path + 'qtype_lcquad_%s.pkl' % embeddings_choice, 'rb') as f:
            self.model_settings = pkl.load(f)
        self.qt_model = build_qt_inference_model(self.model_settings)
        self.qt_model.load_weights(model_path + '_qtype_weights.best.hdf5', by_name=True)

        # load the pre-trained question parsing model
        with open(model_path + 'lcquad_%s.pkl' % embeddings_choice, 'rb') as f:
            ep_model_settings = pkl.load(f)
        self.ep_model = build_ep_inference_model(ep_model_settings)
        self.ep_model.load_weights(model_path + '2hops-types.h5', by_name=True)

        # connect to the knowledge graph hdt file
        self.kg = HDTDocument(hdt_path + hdt_file)

    # functions for entity linking and relation detection

    def entity_linking(self, e_spans, verbose=False, cutoff=500, threshold=0):
        guessed_ids = []
        for span in e_spans:
            span_ids = self.e_index.label_scores(span, top=cutoff, threshold=threshold,
                                                 verbose=verbose, scale=0.3, max_degree=50000)
            guessed_ids.append(span_ids)
        return guessed_ids

    def relation_detection(self, p_spans, verbose=False, cutoff=500, threshold=0.0):
        guessed_ids = []
        for span in p_spans:
            span_ids = {}
            guessed_labels = []
            if span in self.p_vectors:
                guessed_labels.append([span, 1])
            for p, score in self.p_vectors.most_similar(span, topn=cutoff):
                if score >= threshold:
                    guessed_labels.append([p, score])
            for label, score in guessed_labels:
                for match in self.p_index.look_up_by_label(label):
                    _id = match['_source']['id']
                    span_ids[_id] = score
                    if verbose:
                        print(match['_source']['uri'])
                        print(score)
            guessed_ids.append(span_ids)
        return guessed_ids

    # MP functions

    def generate_adj_sp(self, adjacencies, n_entities, include_inverse):
        '''
        Build a sparse adjacency matrix per predicate
        '''
        adj_shape = (n_entities, n_entities)
        # collect all predicate matrices separately into a list
        sp_adjacencies = []
        for edges in adjacencies:
            # split subject (row) and object (col) node URIs
            n_edges = len(edges)
            row, col = np.transpose(edges)
            # duplicate edges in the opposite direction
            if include_inverse:
                _row = np.hstack([row, col])
                col = np.hstack([col, row])
                row = _row
                n_edges *= 2
            # create the adjacency matrix for this predicate
            data = np.ones(n_edges)
            adj = sp.csr_matrix((data, (row, col)), shape=adj_shape)
            sp_adjacencies.append(adj)
        return np.asarray(sp_adjacencies)

    def hop(self, entities, constraints, top_predicates, verbose=False, max_triples=500000, bl_p=[68655]):
        '''
        Extract the subgraph for the selected entities

        bl_p -- the list of predicates to ignore (e.g. the type predicate is too expensive to expand)
        '''
        n_constraints = len(constraints)
        if entities:
            n_constraints += 1

        top_entities = entities + constraints
        all_entities_ids = [_id for e in top_entities for _id in e]
        top_predicates_ids = [_id for p in top_predicates for _id in p if _id not in bl_p]

        # iteratively call the HDT API to retrieve all subgraph partitions
        activations = defaultdict(int)
        offset = 0
        while True:
            # get the subgraph for the selected predicates only
            self.kg.configure_hops(1, top_predicates_ids, namespace, True)
            entities, predicate_ids, adjacencies = self.kg.compute_hops(all_entities_ids, max_triples, offset)

            if not entities:
                answers = [{a_id: a_score} for a_id, a_score in activations.items()]
                return answers

            if verbose:
                print("Subgraph extracted:")
                print("%d entities" % len(entities))
                print("%d predicates" % len(predicate_ids))
                print("Loading adjacencies..")

            offset += max_triples
            # index entity ids global -> local
            entities_dict = {k: v for v, k in enumerate(entities)}
            # generate a list of adjacency matrices per predicate assuming the graph is undirected w/o self-loops
            A = self.generate_adj_sp(adjacencies, len(entities), include_inverse=True)

            # activate entities -- build a sparse matrix
            row, col, data = [], [], []
            for i, concept_ids in enumerate(top_entities):
                for entity_id, score in concept_ids.items():
                    if entity_id in entities_dict:
                        local_id = entities_dict[entity_id]
                        row.append(i)
                        col.append(local_id)
                        data.append(score)
            x = sp.csr_matrix((data, (row, col)), shape=(len(top_entities), len(entities)))

            # iterate over predicates
            ye = sp.csr_matrix((len(top_entities), len(entities)))
            # activate predicates
            if top_predicates_ids:
                yp = sp.csr_matrix((len(top_predicates), len(entities)))
                for i, concept_ids in enumerate(top_predicates):
                    # activate this predicate and its synonyms
                    p = np.zeros([len(predicate_ids)])
                    for p_id, score in concept_ids.items():
                        if p_id in predicate_ids:
                            local_id = predicate_ids.index(p_id)
                            p[local_id] = score
                    # slice A by the selected predicates
                    _A = sum(p * A)
                    _y = x @ _A
                    # normalize: cap activations at 1
                    _y[_y > 1] = 1
                    yp[i] = _y.sum(0)
                    ye += _y
                y = sp.vstack([ye, yp])
            # fall back to evaluating all predicates
            else:
                y = x @ sum(A)

            sum_a = sum(y)
            sum_a_norm = sum_a.toarray()[0] / (len(top_predicates) + n_constraints)
            # normalize: cap activations at 1
            sum_a_norm[sum_a_norm > 1] = 1
            # count activations across components
            y_counts = binarize(y, threshold=0.0)
            count_a = sum(y_counts).toarray()[0]
            # final scores
            y = (sum_a_norm + count_a) / (len(top_predicates) + n_constraints + 1)

            # check output size
            assert y.shape[0] == len(entities)

            top = np.argwhere(y > 0).T.tolist()[0]
            if len(top) > 0:
                activations1 = np.asarray(entities)[top]
                # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
            # if no such answer is found, fall back to the answers satisfying the maximum number of constraints
            else:
                # select the answers that satisfy the maximum number of constraints
                y_p = np.argmax(y)
                # maximum number of satisfied constraints
                max_cs = y[y_p]
                # require at least some activation (evidence from at least one constraint)
                if max_cs != 0:
                    # select answers
                    top = np.argwhere(y == max_cs).T.tolist()[0]
                    activations1 = np.asarray(entities)[top]
                    # store the activation values per answer id
                    for i, e in enumerate(entities):
                        if e in activations1:
                            activations[e] += y[i]

    def request(self, question, top_n=3, verbose=False):
        # parse the question into words and embed them
        x_test_sent = np.zeros((self.model_settings['max_len'], self.model_settings['emb_dim']))
        q_words = text_to_word_sequence(question)
        for i, word in enumerate(q_words):
            x_test_sent[i] = self.word_vectors.query(word)

        # predict the question type
        if verbose:
            print(x_test_sent)
        y_p = self.qt_model.predict(np.array([x_test_sent]))
        y_p = np.argmax(y_p, axis=-1)[0]
        p_qt = question_types[y_p]
        ask_question = p_qt == 'ASK'
        print(p_qt)

        # predict entity and predicate spans
        y_p = self.ep_model.predict(np.array([x_test_sent]))
        y_p = np.argmax(y_p, axis=-1)[0]
        e_spans1 = collect_mentions(q_words, y_p, 1)
        p_spans1 = collect_mentions(q_words, y_p, 2)
        p_spans2 = collect_mentions(q_words, y_p, 3)

        # match predicates
        top_predicates_ids1 = self.relation_detection(p_spans1, threshold=0)
        top_predicates_ids2 = self.relation_detection(p_spans2, threshold=0)

        # match entities
        top_entities_ids1 = self.entity_linking(e_spans1, threshold=0.7)

        if ask_question:
            a_threshold = 0.0
        else:
            a_threshold = 0.5

        # MP
        answers_ids = []

        # 1st hop
        answers_ids1 = self.hop([], top_entities_ids1, top_predicates_ids1, verbose)
        answers1 = [{a_id: a_score} for activations in answers_ids1
                    for a_id, a_score in activations.items() if a_score > a_threshold]

        # 2nd hop
        if top_predicates_ids1 and top_predicates_ids2:
            answers_ids = self.hop(answers1, [], top_predicates_ids2, verbose)
            answers = [{a_id: a_score} for activations in answers_ids
                       for a_id, a_score in activations.items() if a_score > a_threshold]
        else:
            answers = answers1

        answers_ids = [_id for a in answers for _id in a]

        # show spans
        print(e_spans1)
        print(p_spans1)
        print(p_spans2)

        # show matches
        print([{self.e_index.look_up_by_id(_id)[0]['_source']['uri']: score}
               for answer in top_entities_ids1 for _id, score in answer.items()
               if self.e_index.look_up_by_id(_id)][:top_n])
        print([{self.p_index.look_up_by_id(_id)[0]['_source']['uri']: score}
               for answer in top_predicates_ids1 for _id, score in answer.items()
               if self.p_index.look_up_by_id(_id)][:top_n])
        print([{self.p_index.look_up_by_id(_id)[0]['_source']['uri']: score}
               for answer in top_predicates_ids2 for _id, score in answer.items()
               if self.p_index.look_up_by_id(_id)][:top_n])

        # show intermediate answers if there was a second hop
        if top_predicates_ids2:
            print([{self.e_index.look_up_by_id(_id)[0]['_source']['uri']: score}
                   for answer in answers1 for _id, score in answer.items()
                   if self.e_index.look_up_by_id(_id)][:top_n])

        if ask_question:
            # make sure the output matches every input basket
            all_entities_baskets = [set(e.keys()) for e in top_entities_ids1]
            answers = all(x & set(answers_ids) for x in all_entities_baskets)
        else:
            # show answers
            answers = [{self.e_index.look_up_by_id(_id)[0]['_source']['uri']: score}
                       for answer in answers for _id, score in answer.items()
                       if self.e_index.look_up_by_id(_id)][:top_n]

        if verbose:
            print(answers)

        return answers

    def test_request(self):
        question = "What are some other works of the author of The Phantom of the Opera?"
        self.request(question, verbose=True)
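# Example usage (a sketch; requires the index servers, embeddings and model
# checkpoints referenced in __init__; the question is hypothetical):
# kbqa = KBQA()
# answers = kbqa.request("Who is the mayor of Berlin?", top_n=3)
# `answers` holds up to top_n {uri: score} dicts, or a boolean for ASK questions.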
def entity_linking(spans_field, save, show_errors=True, add_neighbours=True, lookup_embeddings=True):
    # iterate over the cursor
    cursor = mongo.get_sample(limit=limit)
    count = 0
    # hold macro-average stats for the model performance over the samples
    ps, rs, fs = [], [], []
    with cursor:
        for doc in cursor:
            correct_uris = doc['entity_uris']
            print(set(correct_uris))
            # get entity spans
            e_spans = doc[spans_field]
            # get entity matches (TODO: save scores)
            top_ids = []
            top_entities = {}
            for span in e_spans:
                print("Span: %s" % span)
                print("Index lookup..")
                guessed_labels, guessed_ids, look_up_ids = [], [], []
                for match in e_index.match_label(span, top=string_cutoff):
                    label = match['_source']['label_exact']
                    degree = match['_source']['count']
                    _id = match['_source']['id']
                    # avoid expanding heavy hitters
                    if int(degree) < max_degree:
                        look_up_ids.append(_id)
                    guessed_ids.append(_id)
                    if label not in guessed_labels:
                        guessed_labels.append(label)
                print("%d candidate labels" % len(guessed_labels))

                if add_neighbours:
                    print("KG lookup..")
                    kg = HDTDocument(hdt_path + hdt_file)
                    kg.configure_hops(1, [], namespace, True)
                    # get a sample of the subgraph: the first <max_triples> only
                    entities, predicate_ids, adjacencies = kg.compute_hops(look_up_ids, max_triples, 0)
                    kg.remove()
                    # look up labels
                    for e_id in entities:
                        match = e_index.look_up_by_id(e_id)
                        if match:
                            label = match[0]['_source']['label_exact']
                            if label not in guessed_labels:
                                guessed_labels.append(label)
                    guessed_ids.extend(entities)

                # score with embeddings
                guessed_labels = [label for label in guessed_labels if label in e_vectors]
                print("%d candidate labels" % len(guessed_labels))
                if guessed_labels and lookup_embeddings:
                    print("Embeddings lookup..")
                    dists = e_vectors.distance(span, guessed_labels)
                    top = np.argsort(dists)[:semantic_cutoff].tolist()
                    top_labels = [guessed_labels[i] for i in top]
                    print("selected labels: %s" % top_labels)
                    print("Index lookup..")
                    top_entities[span] = []
                    for i, label in enumerate(top_labels):
                        print(label)
                        for match in e_index.look_up_by_label(label):
                            distance = float(dists[top[i]])
                            degree = match['_source']['count']
                            _id = match['_source']['id']
                            uri = match['_source']['uri']
                            print(uri)
                            top_entities[span].append({'rank': i + 1, 'distance': distance,
                                                       'degree': degree, 'id': _id, 'uri': uri})
                            top_ids.append(_id)
                else:
                    top_labels = guessed_labels
                    top_ids.extend(guessed_ids)

            # evaluate against the correct entity ids
            top_ids = list(set(top_ids))
            correct_ids = set(doc['entity_ids'])
            n_hits = len(correct_ids & set(top_ids))
            try:
                r = float(n_hits) / len(correct_ids)
            except ZeroDivisionError:
                print(doc['question'])
                r = 0
            try:
                p = float(n_hits) / len(top_ids)
            except ZeroDivisionError:
                p = 0
            try:
                f = 2 * p * r / (p + r)
            except ZeroDivisionError:
                f = 0
            print("P: %.2f R: %.2f F: %.2f" % (p, r, f))
            # add stats
            ps.append(p)
            rs.append(r)
            fs.append(f)

            # save to MongoDB
            if save:
                doc['entity_ids_guess'] = top_ids
                doc['entity_guess'] = top_entities
                mongo.col.update_one({'_id': doc['_id']}, {"$set": doc}, upsert=True)
                count += 1

    print("P: %.2f R: %.2f F: %.2f" % (np.mean(ps), np.mean(rs), np.mean(fs)))
    print("Fin. Results for %d questions" % len(ps))
    if save:
        print("%d documents annotated with entity ids guess" % count)
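# A toy sketch (assumed labels and distances) of the embedding re-ranking step
# above: candidate labels are sorted by embedding distance to the span and cut
# off at semantic_cutoff.
def _demo_semantic_cutoff():
    import numpy as np
    guessed_labels = ['Berlin', 'Berlin Wall', 'Berliner']
    dists = np.array([0.1, 0.7, 0.3])  # smaller distance = closer to the span
    semantic_cutoff = 2
    top = np.argsort(dists)[:semantic_cutoff].tolist()
    top_labels = [guessed_labels[i] for i in top]
    print(top_labels)                  # ['Berlin', 'Berliner']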
def hop(entities, constraints, top_predicates, verbose=False, max_triples=500000):
    '''
    Extract the subgraph for the selected entities
    '''
    n_constraints = len(constraints)
    if entities:
        n_constraints += 1

    top_entities = entities + constraints
    all_entities_ids = [_id for e in top_entities for _id in e]
    top_predicates_ids = [_id for p in top_predicates for _id in p if _id]

    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0
    while True:
        # get the subgraph for the selected predicates only
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(1, top_predicates_ids, namespace, True)
        entities, predicate_ids, adjacencies = kg.compute_hops(all_entities_ids, max_triples, offset)
        kg.remove()

        if not entities:
            answers = [{a_id: a_score} for a_id, a_score in activations.items()]
            return answers

        if verbose:
            print("Subgraph extracted:")
            print("%d entities" % len(entities))
            print("%d predicates" % len(predicate_ids))
            print("Loading adjacencies..")

        offset += max_triples
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        # generate a list of adjacency matrices per predicate assuming the graph is undirected w/o self-loops
        A = generate_adj_sp(adjacencies, len(entities), include_inverse=True)

        # activate entities -- build a sparse matrix
        row, col, data = [], [], []
        for i, concept_ids in enumerate(top_entities):
            for entity_id, score in concept_ids.items():
                if entity_id in entities_dict:
                    local_id = entities_dict[entity_id]
                    row.append(i)
                    col.append(local_id)
                    data.append(score)
        x = sp.csr_matrix((data, (row, col)), shape=(len(top_entities), len(entities)))

        # iterate over predicates
        ye = sp.csr_matrix((len(top_entities), len(entities)))
        # activate predicates
        if top_predicates_ids:
            yp = sp.csr_matrix((len(top_predicates), len(entities)))
            for i, concept_ids in enumerate(top_predicates):
                # activate this predicate and its synonyms
                p = np.zeros([len(predicate_ids)])
                for p_id, score in concept_ids.items():
                    if p_id in predicate_ids:
                        local_id = predicate_ids.index(p_id)
                        p[local_id] = score
                # slice A by the selected predicates
                _A = sum(p * A)
                _y = x @ _A
                # normalize: cap activations at 1
                _y[_y > 1] = 1
                yp[i] = _y.sum(0)
                ye += _y
            y = sp.vstack([ye, yp])
        # fall back to evaluating all predicates
        else:
            y = x @ sum(A)

        sum_a = sum(y)
        sum_a_norm = sum_a.toarray()[0] / (len(top_predicates) + n_constraints)
        # normalize: cap activations at 1
        sum_a_norm[sum_a_norm > 1] = 1
        # count activations across components
        y_counts = binarize(y, threshold=0.0)
        count_a = sum(y_counts).toarray()[0]
        # final scores
        y = (sum_a_norm + count_a) / (len(top_predicates) + n_constraints + 1)

        # check output size
        assert y.shape[0] == len(entities)

        top = np.argwhere(y > 0).T.tolist()[0]
        if len(top) > 0:
            activations1 = np.asarray(entities)[top]
            # store the activation values per answer id
            for i, e in enumerate(entities):
                if e in activations1:
                    activations[e] += y[i]
        # if no such answer is found, fall back to the answers satisfying the maximum number of constraints
        else:
            # select the answers that satisfy the maximum number of constraints
            y_p = np.argmax(y)
            # maximum number of satisfied constraints
            max_cs = y[y_p]
            # require at least some activation (evidence from at least one constraint)
            if max_cs != 0:
                # select answers
                top = np.argwhere(y == max_cs).T.tolist()[0]
                activations1 = np.asarray(entities)[top]
                # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
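# A toy sketch (assumed values, names local to this example) of the score
# combination above: activations are summed across components, normalized, and
# combined with the count of components that activated each entity.
def _demo_score_combination():
    import numpy as np
    import scipy.sparse as sp
    from sklearn.preprocessing import binarize
    # activation matrix y: 2 components (rows) x 3 candidate entities (columns)
    y = sp.csr_matrix(np.array([[0.5, 0.0, 1.0],
                                [0.5, 0.0, 0.0]]))
    n_components = y.shape[0]
    sum_a_norm = sum(y).toarray()[0] / n_components          # normalized activation mass
    sum_a_norm[sum_a_norm > 1] = 1
    count_a = sum(binarize(y, threshold=0.0)).toarray()[0]   # how many components fired
    scores = (sum_a_norm + count_a) / (n_components + 1)
    print(scores)  # entity 0 gets the top score: it was activated by both components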
def forward(self, e_scores, entity_ids, p_scores, answer=None, all_predicate_ids=all_predicate_ids):
    '''
    Inputs:
        e_scores -- entity scores from the Transformer
        entity_ids -- global entity ids used to request the KG for adjacencies
        p_scores -- predicate scores from the Transformer
    Outputs:
        subgraph -- subgraph edges and entities
    '''
    # get the top-k entities based on the score vector
    weights, indices = torch.sort(e_scores.view(-1), descending=True)
    sampled_entities = entity_ids[indices[:self.top_e]].tolist()  # choose the top-k matching entities
    # predicates are not sampled: the subgraph is retrieved for all of them
    sampled_predicates = []

    with torch.no_grad():
        # initialise the connection to the Wikidata KG through the HDT API
        kg = HDTDocument(self.hdt_path)
        # request the KG for a subgraph given the entity and relation subsets
        kg.configure_hops(1, sampled_predicates, 'predef-wikidata2018-09-all', True, False)
        s_entity_ids, s_predicate_ids, adjacencies = kg.compute_hops(sampled_entities, 5000, 0)
        kg.remove()
        del kg

        # check that the subgraph exists
        if not s_entity_ids:
            return (), None

        # check that we are in the right subgraph
        if answer is not None and answer not in s_entity_ids:
            return (), None

        # build lookup tables for the entity & predicate scores
        e_table = build_look_up(entity_ids)
        p_table = build_look_up(all_predicate_ids)
        del all_predicate_ids

        # load the subgraph into a tensor
        indices, relation_mask = adj(adjacencies, len(s_entity_ids), len(s_predicate_ids))

        # look up the local scores to activate the respective entities & predicates
        e_scores = look_up(e_table, s_entity_ids, e_scores)
        p_scores = look_up(p_table, s_predicate_ids, p_scores)
        del p_table, s_predicate_ids, e_table, adjacencies

    # clean up
    gc.collect()
    torch.cuda.empty_cache()

    return (indices, e_scores, p_scores, relation_mask), s_entity_ids
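# A minimal usage sketch (hypothetical shapes and call sites; `build_look_up`,
# `look_up` and `adj` are project helpers not shown in this excerpt):
# e_scores = entity_scoring_head(question)     # (n_linked_entities, 1), hypothetical
# p_scores = predicate_scoring_head(question)  # (n_predicates, 1), hypothetical
# (edge_indices, e_act, p_act, rel_mask), s_entity_ids = \
#     model.forward(e_scores, entity_ids, p_scores, answer=gold_answer_id)
# if s_entity_ids is None:
#     pass  # the gold answer was not reachable within one hop of the sampled entities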