Example #1
def evaluate_subgraph_extraction(nhops,
                                 e_field,
                                 p_field,
                                 limit=None,
                                 show_errors=False):
    '''
    e_field, p_field <str> names of the fields in MongoDB to look up the IDs
    '''
    samples = mongo.get_sample(limit=limit)
    # iterate over the cursor
    accs = []
    for doc in samples:
        # get correct entities and predicates from the GS annotations
        e_ids = doc[e_field]
        p_uris = doc[p_field]

        # extract the subgraph
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(nhops, p_uris, namespace, True)
        entities, _, _ = kg.compute_hops(e_ids)
        kg.remove()

        # check if we hit the answer set
        if 'answers_ids' in doc:
            correct_answers_ids = set(doc['answers_ids'])
            #         print(correct_answers_ids)
            n_hits = len(correct_answers_ids & set(entities))
            # accuracy
            acc = float(n_hits) / len(correct_answers_ids)
            accs.append(acc)
            if show_errors and acc < 1:
                print(doc['question'])
                print(doc['entity_ids'])
                print(doc['predicate_uris'])
    return accs
def filter_answer_by_class(classes, answers_ids):
    classes_ids = [_id for e in classes for _id in e]
    kg = HDTDocument(hdt_path+hdt_file)
    a_ids = [_id for e in answers_ids for _id in e]
    a_ids = kg.filter_types(a_ids, classes_ids)
    kg.remove()
    a_ids = [_id for _a_ids in a_ids for _id in _a_ids]
    answers_ids = [{_id: a_score} for e in answers_ids for _id, a_score in e.items() if _id in a_ids]
    return answers_ids
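
The per-question score computed in `evaluate_subgraph_extraction` is just the fraction of gold answer ids that appear among the extracted subgraph entities. A minimal, self-contained sketch of that metric with made-up ids:

correct_answers_ids = {11, 42, 97}        # gold answer ids (illustrative)
extracted_entities = [3, 42, 97, 100]     # entities returned by compute_hops (illustrative)

n_hits = len(correct_answers_ids & set(extracted_entities))
acc = float(n_hits) / len(correct_answers_ids)
print("Answer recall in subgraph: %.2f" % acc)   # 2/3 -> 0.67
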
Example #3
def hop(activations,
        constraints,
        predicates_ids,
        verbose=False,
        _bool_answer=False,
        max_triples=500000):
    # extract the subgraph for the selected entities
    top_entities_ids = [_id for e in activations + constraints for _id in e]
    # exclude types predicate
    top_predicates_ids = [
        _id for p in predicates_ids for _id in p if _id != 68655
    ]

    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0
    while True:
        # get the subgraph for selected predicates only
        kg = HDTDocument(hdt_path + hdt_file)
        kg.configure_hops(1, top_predicates_ids, namespace, True)
        entities, predicate_ids, adjacencies = kg.compute_hops(
            top_entities_ids, max_triples, offset)
        kg.remove()

        if not entities:
            # filter out the answers by min activation scores
            if not _bool_answer and constraints:
                # normalize activations by checking the 'must' constraints: number of constraints * weights
                min_a = len(constraints) * 1
                if predicates_ids != top_predicates_ids:
                    min_a -= 1
            else:
                min_a = 0
            # return HDT ids of the activated entities
            return [
                a_id for a_id, a_score in activations.items()
                if a_score > min_a
            ]

        if verbose:
            print("Subgraph extracted:")
            print("%d entities" % len(entities))
            print("%d predicates" % len(predicate_ids))
            print("Loading adjacencies..")

        offset += max_triples
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        adj_shape = (len(entities), len(entities))
        # generate a list of adjacency matrices per predicate, assuming the graph is undirected without self-loops
        A = generate_adj_sp(adjacencies, adj_shape, include_inverse=True)

        # activations of entities and predicates
        e_ids = [
            entities_dict[entity_id] for entity_id in top_entities_ids
            if entity_id in entities_dict
        ]
        #     assert len(top_entities_ids) == len(e_ids)
        p_ids = [
            predicate_ids.index(entity_id) for entity_id in top_predicates_ids
            if entity_id in predicate_ids
        ]
        #     assert len(top_predicates_ids) == len(p_ids)
        if p_ids:
            # graph activation vectors
            x = np.zeros(len(entities))
            x[e_ids] = 1
            p = np.zeros(len(predicate_ids))
            p[p_ids] = 1

            # slice A by the selected predicates and concatenate edge lists
            y = (x @ sp.hstack(A * p)).reshape(
                [len(predicate_ids), len(entities)]).sum(0)
            # check output size
            assert y.shape[0] == len(entities)

            # harvest activations
            top = np.argwhere(y > 0).T.tolist()[0]
            if len(top) > 0:
                activations1 = np.asarray(entities)[top]
            # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
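
The core of `hop` is one round of sparse message passing: the seed entities are encoded as a one-hot vector, multiplied with the (per-predicate) adjacency matrices, and the non-zero outputs become the next activations. A toy sketch of that single step with an invented 4-entity graph, using only numpy/scipy:

import numpy as np
import scipy.sparse as sp

# toy 4-entity graph for a single predicate (illustrative values)
A = sp.csr_matrix(np.array([[0, 1, 0, 0],
                            [1, 0, 1, 0],
                            [0, 1, 0, 1],
                            [0, 0, 1, 0]]))

x = np.zeros(4)
x[0] = 1                                  # activate the seed entity (local id 0)

y = x @ A                                 # one hop over the graph
top = np.argwhere(y > 0).T.tolist()[0]    # local ids of the activated entities
print(top)                                # -> [1]
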
Example #4
                                           torch.tensor(p_attention_masks), torch.tensor(all_predicate_ids)],
                                           torch.tensor([answer_id]), first_question])
            # carry over history to the next dialogue turn
            dialogue_history.extend([question, answer_label])

    del entity_ids, predicate_ids, adjacencies

    print("Compiled dataset with %d samples" % len(train_dataset))
    return train_dataset


train_dataset = prepare_dataset(train_conversations_path)
valid_dataset = prepare_dataset(dev_conversations_path)

# remove everything from memory but the model and tensors for training/validation
kg.remove()
del kg

print("Dataset loaded")

# train the model (match nodes and relations with a Transformer using subgraph sampling)
n_batches = 1000

import random
import numpy as np
from functools import reduce
import sys
import os
import gc
import psutil
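
Example #4 is only a fragment of the dataset-building code, but the surrounding lines (deleting the `kg` handle and the `gc`/`psutil` imports) suggest memory bookkeeping before training. A small sketch of that kind of check, not taken from the original, using an invented placeholder object in place of the KG structures:

import gc
import psutil

def report_memory(tag):
    # resident set size of the current process in MB
    rss_mb = psutil.Process().memory_info().rss / 1024 ** 2
    print("%s: resident memory %.1f MB" % (tag, rss_mb))

report_memory("before cleanup")
big_buffer = list(range(5_000_000))   # stand-in for the KG handle / adjacency lists
del big_buffer
gc.collect()
report_memory("after cleanup")
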
Example #5
def entity_linking(spans_field,
                   save,
                   show_errors=True,
                   add_nieghbours=True,
                   lookup_embeddings=True):
    # iterate over the cursor
    cursor = mongo.get_sample(limit=limit)
    count = 0
    # hold macro-average stats for the model performance over the samples
    ps, rs, fs = [], [], []
    with cursor:
        for doc in cursor:
            # if 'entity_ids_guess' not in doc:
            correct_uris = doc['entity_uris']
            print(set(correct_uris))
            # get entity spans
            e_spans = doc[spans_field]
            #         e_spans = doc[spans_field+'_guess']
            #     print(e_spans)
            # get entity matches TODO save scores
            top_ids = []
            top_entities = {}
            for span in e_spans:
                print("Span: %s" % span)
                print("Index lookup..")
                guessed_labels, guessed_ids, look_up_ids = [], [], []
                for match in e_index.match_label(span, top=string_cutoff):
                    label = match['_source']['label_exact']
                    degree = match['_source']['count']
                    #                 print(degree)
                    _id = match['_source']['id']
                    # avoid expanding heavy hitters
                    if int(degree) < max_degree:
                        look_up_ids.append(_id)
                    guessed_ids.append(_id)
                    if label not in guessed_labels:
                        guessed_labels.append(label)
                    uri = match['_source']['uri']
    #                 print(uri)

                print("%d candidate labels" % len(guessed_labels))
                if add_nieghbours:
                    print("KG lookup..")
                    kg = HDTDocument(hdt_path + hdt_file)
                    kg.configure_hops(1, [], namespace, True)
                    # get a sample of the subgraph: the first <max_triples> only
                    entities, predicate_ids, adjacencies = kg.compute_hops(
                        look_up_ids, max_triples, 0)
                    kg.remove()
                    # look up labels
                    for e_id in entities:
                        match = e_index.look_up_by_id(e_id)
                        if match:
                            label = match[0]['_source']['label_exact']
                            if label not in guessed_labels:
                                guessed_labels.append(label)
                    guessed_ids.extend(entities)

                # score with embeddings
                guessed_labels = [
                    label for label in guessed_labels if label in e_vectors
                ]
                print("%d candidate labels" % len(guessed_labels))
                if guessed_labels and lookup_embeddings:
                    print("Embeddings lookup..")
                    dists = e_vectors.distance(span, [
                        label for label in guessed_labels if label in e_vectors
                    ])
                    top = np.argsort(dists)[:semantic_cutoff].tolist()
                    top_labels = [guessed_labels[i] for i in top]
                    print("selected labels: %s" % top_labels)
                    print("Index lookup..")
                    top_entities[span] = []
                    for i, label in enumerate(top_labels):
                        print(label)
                        for match in e_index.look_up_by_label(label):
                            distance = float(dists[top[i]])
                            degree = match['_source']['count']
                            _id = match['_source']['id']
                            uri = match['_source']['uri']
                            print(uri)
                            top_entities[span].append({
                                'rank': i + 1,
                                'distance': distance,
                                'degree': degree,
                                'id': _id,
                                'uri': uri
                            })
                            top_ids.append(_id)
                else:
                    top_labels = guessed_labels
                    top_ids.extend(guessed_ids)

            # evaluate against the correct entity ids
            top_ids = list(set(top_ids))
            correct_ids = set(doc['entity_ids'])
            n_hits = len(correct_ids & set(top_ids))
            try:
                r = float(n_hits) / len(correct_ids)
            except ZeroDivisionError:
                r = 0
                print(doc['question'])
            try:
                p = float(n_hits) / len(top_ids)
            except ZeroDivisionError:
                p = 0
            try:
                f = 2 * p * r / (p + r)
            except ZeroDivisionError:
                f = 0
            print("P: %.2f R: %.2f F: %.2f" % (p, r, f))

            # add stats
            ps.append(p)
            rs.append(r)
            fs.append(f)

            # save to MongoDB
            if save:
                doc['entity_ids_guess'] = top_ids
                doc['entity_guess'] = top_entities
                mongo.col.update_one({'_id': doc['_id']}, {"$set": doc},
                                     upsert=True)
                count += 1

    print("P: %.2f R: %.2f F: %.2f" % (np.mean(ps), np.mean(rs), np.mean(fs)))
    print("Fin. Results for %d questions" % len(ps))
    if save:
        print("%d documents annotated with entity ids guess" % count)
Example #6
def hop(entities,
        constraints,
        top_predicates,
        verbose=False,
        max_triples=500000):
    '''
    Extract the subgraph for the selected entities
    '''
    #     print(top_predicates)
    n_constraints = len(constraints)
    if entities:
        n_constraints += 1

    top_entities = entities + constraints
    all_entities_ids = [_id for e in top_entities for _id in e]
    top_predicates_ids = [_id for p in top_predicates for _id in p if _id]

    # iteratively call the HDT API to retrieve all subgraph partitions
    activations = defaultdict(int)
    offset = 0

    while True:
        # get the subgraph for selected predicates only
        kg = HDTDocument(hdt_path + hdt_file)
        #         print(top_predicates_ids)
        kg.configure_hops(1, top_predicates_ids, namespace, True)
        entities, predicate_ids, adjacencies = kg.compute_hops(
            all_entities_ids, max_triples, offset)
        kg.remove()
        #         print(adjacencies)
        # show subgraph entities
        #         print([e_index.look_up_by_id(e)[0]['_source']['uri'] for e in entities])

        if not entities:
            answers = [{
                a_id: a_score
            } for a_id, a_score in activations.items()]
            return answers

        # if verbose:
        # print("Subgraph extracted:")
        # print("%d entities"%len(entities))
        # print("%d predicates"%len(predicate_ids))
        # print("Loading adjacencies..")

        offset += max_triples
        # index entity ids global -> local
        entities_dict = {k: v for v, k in enumerate(entities)}
        # generate a list of adjacency matrices per predicate, assuming the graph is undirected without self-loops
        A = generate_adj_sp(adjacencies, len(entities), include_inverse=True)
        #         print(predicate_ids)
        # activate entities -- build sparse matrix
        row, col, data = [], [], []
        for i, concept_ids in enumerate(top_entities):
            for entity_id, score in concept_ids.items():
                if entity_id in entities_dict:
                    #                     print(e_index.look_up_by_id(entity_id)[0]['_source']['uri'])
                    #                     print(score)
                    local_id = entities_dict[entity_id]
                    row.append(i)
                    col.append(local_id)
                    data.append(score)
        x = sp.csr_matrix((data, (row, col)),
                          shape=(len(top_entities), len(entities)))

        # iterate over predicates
        ye = sp.csr_matrix((len(top_entities), len(entities)))
        # activate predicates
        if top_predicates_ids:
            yp = sp.csr_matrix((len(top_predicates), len(entities)))
            for i, concept_ids in enumerate(top_predicates):
                # activate predicates
                p = np.zeros([len(predicate_ids)])
                # iterate over synonyms
                for p_id, score in concept_ids.items():
                    if p_id in predicate_ids:
                        local_id = predicate_ids.index(p_id)
                        p[local_id] = score
                # slice A by the selected predicates
                _A = sum(p * A)
                _y = x @ _A
                # normalize: cut top to 1
                _y[_y > 1] = 1
                yp[i] = _y.sum(0)
                ye += _y
            y = sp.vstack([ye, yp])
        # fall back to evaluate all predicates
        else:
            y = x @ sum(A)
        sum_a = sum(y)
        sum_a_norm = sum_a.toarray()[0] / (
            len(top_predicates) + n_constraints
        )  #normalize(sum_a, norm='max', axis=1).toarray()[0]
        # normalize: cut top to 1
        sum_a_norm[sum_a_norm > 1] = 1
        # activations across components
        y_counts = binarize(y, threshold=0.0)
        count_a = sum(y_counts).toarray()[0]
        # final scores
        y = (sum_a_norm + count_a) / (len(top_predicates) + n_constraints + 1)

        # check output size
        assert y.shape[0] == len(entities)

        top = np.argwhere(y > 0).T.tolist()[0]
        if len(top) > 0:
            activations1 = np.asarray(entities)[top]
            # store the activation values per answer id
            for i, e in enumerate(entities):
                if e in activations1:
                    activations[e] += y[i]
        # if no such answer is found, fall back to returning the answers that satisfy the maximum number of constraints
        else:
            # select answers that satisfy maximum number of constraints
            y_p = np.argmax(y)
            # maximum number of satisfied constraints
            max_cs = y[y_p]
            # at least some activation (evidence from min one constraint)
            if max_cs != 0:
                # select answers
                top = np.argwhere(y == max_cs).T.tolist()[0]
                activations1 = np.asarray(entities)[top]
                # store the activation values per answer id
                for i, e in enumerate(entities):
                    if e in activations1:
                        activations[e] += y[i]
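
This version of `hop` replaces the binary seed vector with confidence-weighted scores, builds it as a `csr_matrix` from `(data, (row, col))`, and caps the propagated activations at 1. A toy sketch of that weighted step with an invented 4-entity graph and made-up scores:

import numpy as np
import scipy.sparse as sp

n_entities = 4
row, col, data = [0, 0], [1, 2], [0.7, 0.4]   # two weighted seeds for one concept
x = sp.csr_matrix((data, (row, col)), shape=(1, n_entities))

# toy adjacency matrix for a single predicate (illustrative)
A = sp.csr_matrix(np.array([[0, 1, 1, 0],
                            [1, 0, 0, 1],
                            [1, 0, 0, 1],
                            [0, 1, 1, 0]]))

y = x @ A                                     # propagate the weighted activations
y[y > 1] = 1                                  # normalize: cut top to 1
print(y.toarray()[0])                         # -> [1. 0. 0. 1.]
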
Example #7
    def forward(self,
                e_scores,
                entity_ids,
                p_scores,
                answer=None,
                all_predicate_ids=all_predicate_ids):
        '''
        Inputs:
            *e_scores*: entity scores from Transformer
            *entity_ids*: global entity ids to request the KG for adjacencies
            *p_scores*: predicate scores from Transformer
        Outputs:
            *subgraph*: subgraph edges and entities
        '''
        #         with torch.autograd.detect_anomaly():
        # get the top-k (predicates/)entities based on the score vectors
        weights, indices = torch.sort(e_scores.view(-1), descending=True)
        sampled_entities = entity_ids[
            indices[:self.top_e]].tolist()  # choose top-k matching entities
        #         print("Retrieving adjacencies for %d entities"%len(sampled_entities))
        # sample predicates?
        sampled_predicates = []  # predicate_ids.tolist()
        #         weights, indices = torch.sort(p_scores.view(-1), descending=True)
        #         sampled_predicates = predicate_ids[indices[:self.top_p]].tolist()

        with torch.no_grad():

            # initialise connection to the Wikidata KG through the HDT API
            kg = HDTDocument(self.hdt_path)
            # request kg through hdt api for a subgraph given entity and relation subsets
            kg.configure_hops(1, sampled_predicates,
                              'predef-wikidata2018-09-all', True, False)
            s_entity_ids, s_predicate_ids, adjacencies = kg.compute_hops(
                sampled_entities, 5000, 0)
            kg.remove()
            del kg
            #         print("Retrieved new subgraph with %d entities and %d relations" % (len(s_entity_ids), len(s_predicate_ids)))

            # check subgraph exists
            if not s_entity_ids:
                return (), None

            # check we are in the right subgraph
            if answer is not None and answer not in s_entity_ids:
                return (), None

            # build a lookup table for entity & predicate scores
            e_table = build_look_up(entity_ids)
            p_table = build_look_up(all_predicate_ids)
            del all_predicate_ids

        # load subgraph into tensor
        indices, relation_mask = adj(adjacencies, len(s_entity_ids),
                                     len(s_predicate_ids))
        #         print("%d triples" % len(indices))

        # lookup local scores to activate respective entities & predicates
        e_scores = look_up(e_table, s_entity_ids, e_scores)
        p_scores = look_up(p_table, s_predicate_ids, p_scores)
        del p_table, s_predicate_ids, e_table, adjacencies

        # clean up
        gc.collect()
        torch.cuda.empty_cache()

        return (indices, e_scores, p_scores, relation_mask), s_entity_ids
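
The first step of `forward` (sorting the Transformer scores and keeping the top-k global entity ids for the KG request) can be shown in isolation. A minimal sketch with invented scores and ids:

import torch

e_scores = torch.tensor([0.1, 0.9, 0.3, 0.7])     # entity scores from the Transformer (illustrative)
entity_ids = torch.tensor([101, 202, 303, 404])   # global KG ids (illustrative)
top_e = 2

weights, indices = torch.sort(e_scores.view(-1), descending=True)
sampled_entities = entity_ids[indices[:top_e]].tolist()
print(sampled_entities)                           # -> [202, 404]
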