Example #1
    def answer(self, question):
        print '[BetaAnswer.answer]'
        question, entity_link_res, query_graphs = self.pipeline.gen_candidate_query_graph_for_prediction(
            question)
        for i in xrange(len(entity_link_res)):
            mid = entity_link_res[i]['topic']
            entity_link_res[i]['description'] = DBManager.get_description(mid)
            types = []
            for t in DBManager.get_notable_type(mid):
                name = DBManager.get_name(t)[0]
                if name:
                    types.append(name)
            entity_link_res[i]['notable_type'] = ' '.join(types)
            entity_link_res[i]['topic_name'] = DBManager.get_name(mid)[0]
        print ' rank query pattern'
        if len(query_graphs) == 0:
            return question, [], []
        hash_to_score = self.ranker.rank_queries(query_graphs)
        for i in range(len(query_graphs)):
            hcode = query_graphs[i]['hash']
            query_graphs[i]['score'] = float(hash_to_score.get(hcode, -2.))

        query_graphs = sorted(query_graphs,
                              key=lambda x: x['score'],
                              reverse=True)
        return question, entity_link_res, query_graphs
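
A minimal call sketch (hedged: the zero-argument construction and the question are invented for illustration; see the __init__ in Example #2 for the real options). answer returns the question as normalized by the linker, the enriched entity-link results, and the candidate query graphs sorted by descending score:

answerer = BetaAnswer()  # hypothetical default construction
question, links, graphs = answerer.answer('who is the wife of barack obama')
for g in graphs[:3]:
    print g['score'], g['relation'], g['pattern']  # top-ranked query graphs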
Example #2
    def __init__(self, use_aqqu=False, use_relation_matcher=False):
        self.use_relation_matcher = use_relation_matcher
        if use_relation_matcher:
            self.relation_matcher = RelationMatcher(
                globals.config.get('BetaAnswer', 'relation-matcher'))
        if use_aqqu:
            print "Start initializing Aqqu entity linker..."
            self.entity_linker = AqquEntityLinker()
            print "Finish"
        else:
            print "Start initializing deep CRF entity linker..."
            mention_tagger = EntityMentionTagger(
                globals.config.get('BetaAnswer', 'entity-mention-tagger'))
            self.entity_linker = EntityLinker(mention_tagger)

        # self.freebase = FreebaseClient()
        self.db_manger = DBManager()
        self.lemma = dict()
        self.lemmatiser = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words('english'))
        self.qwords = {'what', 'where', 'when', 'who', 'which', 'how'}
        self.qwords_rel_cooccur = dict()
        with open(
                globals.config.get("FREEBASE",
                                   "question-word-relation-cooccur")) as fin:
            for line in fin:
                qw_rel, cooccur = line.strip().split('\t')
                self.qwords_rel_cooccur[qw_rel] = float(cooccur)
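
The constructor pulls three paths from a global INI-style config. A sketch of the expected sections, with placeholder paths (the actual file names are not shown in the snippet); judging from _get_qword_relation_cooccur in Example #5, the co-occurrence file is tab-separated with lines like "who people.person<TAB>0.42":

[BetaAnswer]
relation-matcher = /path/to/relation_matcher.model
entity-mention-tagger = /path/to/mention_tagger.model

[FREEBASE]
question-word-relation-cooccur = /path/to/qword_rel_cooccur.tsv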
Example #3
    def get_candidate_topic_entities_given_mention(self, question, mention):
        entities = DBManager.get_candidate_entities(mention, 0.1)
        candidates = []
        for e in entities:
            mid = e[0]
            entity_score = e[1]
            c = dict()
            c['mention'] = mention
            c['entity_score'] = entity_score
            c['topic'] = mid
            question, c['mention_score'] = self.entity_mention_tagger.get_mention_likelihood(
                question, mention)
            candidates.append(c)
        return question, candidates
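
A hedged usage sketch (linker construction as in Example #2; the question and mention are invented). The method returns the question, possibly re-tokenized by the mention tagger, plus one candidate dict per entity the mention can link to:

question, candidates = linker.get_candidate_topic_entities_given_mention(
    'where was natalie portman born', 'natalie portman')
for c in candidates:
    print c['topic'], c['entity_score'], c['mention_score']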
Example #4
    def add_answer_constraints(self, question, queries):
        question = question.split()
        male = {
            'dad', 'father', 'brother', 'grandfather', 'grandson', 'son',
            'husband'
        }
        female = {
            'mom', 'mother', 'sister', 'grandmother', 'granddaughter',
            'daughter', 'wife'
        }
        # initialize answer constraints
        for i in xrange(len(queries)):
            queries[i]['answer_constraints'] = []
            queries[i]['gender_consistency'] = 0.

        for w in question:
            # gender consistency
            if w in male or w in female:
                for i in xrange(len(queries)):
                    gender_constraints = DBManager.get_property(
                        queries[i]['answer'], "gender")
                    if len(gender_constraints) > 0:
                        queries[i]['answer_constraints'].extend(
                            gender_constraints)
                        queries[i]['gender_consistency'] = float(
                            ("m.05zppz" == gender_constraints[0][-1]
                             and w in male)
                            or ('m.02zsn' == gender_constraints[0][-1]
                                and w in female))

        question_lemma = set([self._get_lemma(w) for w in question])

        for i in xrange(len(queries)):
            queries[i]['type_consistency'] = 0.
            answer = queries[i]['answer']
            types = self.get_type(answer)
            # fraction of the words in the candidate type's name that appear in the question
            best_type_index = -1
            best_type_overlap = 0.
            for j, t in enumerate(types):
                type_name = (DBManager.get_name(t)[0].lower()
                             if t.startswith('m.') else t)
                num_overlap = 0
                for w_ in type_name.split():
                    w_ = self._get_lemma(w_)
                    if w_ in question_lemma:
                        num_overlap += 1
                if best_type_overlap < (num_overlap * 1. /
                                        len(type_name.split())):
                    best_type_overlap = (num_overlap * 1. /
                                         len(type_name.split()))
                    best_type_index = j

            if best_type_overlap > 0.:
                queries[i]['type_consistency'] = best_type_overlap
                queries[i]['answer_constraints'].append([
                    answer, 'common.topic.notable_types',
                    types[best_type_index]
                ])
                # print "$" * 40, question, self.get_name(types[best_type_index])

        return queries
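
The two hard-coded MIDs are Freebase's gender entities (m.05zppz = male, m.02zsn = female). A self-contained sketch of the consistency signal the loop above computes, with the DBManager.get_property(..., 'gender') lookup replaced by a plain argument:

MALE_MID, FEMALE_MID = 'm.05zppz', 'm.02zsn'
male_words = {'dad', 'father', 'brother', 'grandfather', 'grandson', 'son', 'husband'}
female_words = {'mom', 'mother', 'sister', 'grandmother', 'granddaughter', 'daughter', 'wife'}

def gender_consistency(word, answer_gender_mid):
    # 1.0 when the kinship word in the question agrees with the gender of the answer entity
    return float((answer_gender_mid == MALE_MID and word in male_words) or
                 (answer_gender_mid == FEMALE_MID and word in female_words))

print gender_consistency('father', MALE_MID)   # 1.0
print gender_consistency('mother', MALE_MID)   # 0.0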
Example #5
class Pipeline(object):
    def __init__(self, use_aqqu=False, use_relation_matcher=False):
        self.use_relation_matcher = use_relation_matcher
        if use_relation_matcher:
            self.relation_matcher = RelationMatcher(
                globals.config.get('BetaAnswer', 'relation-matcher'))
        if use_aqqu:
            print "Start initializing Aqqu entity linker..."
            self.entity_linker = AqquEntityLinker()
            print "Finish"
        else:
            print "Start initializing deep CRF entity linker..."
            mention_tagger = EntityMentionTagger(
                globals.config.get('BetaAnswer', 'entity-mention-tagger'))
            self.entity_linker = EntityLinker(mention_tagger)

        # self.freebase = FreebaseClient()
        self.db_manger = DBManager()
        self.lemma = dict()
        self.lemmatiser = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.stopwords = set(stopwords.words('english'))
        self.qwords = {'what', 'where', 'when', 'who', 'which', 'how'}
        self.qwords_rel_cooccur = dict()
        with open(
                globals.config.get("FREEBASE",
                                   "question-word-relation-cooccur")) as fin:
            for line in fin:
                qw_rel, cooccur = line.strip().split('\t')
                self.qwords_rel_cooccur[qw_rel] = float(cooccur)

    def _get_lemma(self, word):
        if word not in self.lemma:
            self.lemma[word] = self.lemmatiser.lemmatize(word)
        return self.lemma[word]

    def get_qword(self, question):
        question = question.split(' ')
        length = len(question)
        for i in xrange(length):
            w = question[i]
            if w in self.qwords:
                # return w
                if w == 'which':
                    return (question[i + 1]
                            if question[i + 1] != '<$>' else question[i + 2])
                else:
                    return w
        return "none"

    def add_topic_feature(self, question, queries):
        """
        :param question:
        :return:
            queries: a list of query
                      a feature have keys
                      ["topic", "topic_name", "mention", "mention score", "entity_score"]
        """

        ret_queries = []
        for i in xrange(len(queries)):
            name = self.get_name(queries[i]['topic'])
            if name is None:
                continue
                # raise ValueError("Topic name is None")
            queries[i]['topic_name'] = name
            queries[i]['topic_notable_type'] = self.db_manger.get_notable_type(
                queries[i]['topic'])
            # queries[i]['topic_type'] = self.db_manger.get_type(queries[i]['topic'])
            ret_queries.append(queries[i])

        # print '[Pipeline.add_topic_feature]', question
        return ret_queries

    def _get_qword_relation_cooccur(self, question, relation):
        rel = ".".join(relation.split('.')[-3:-1])
        for w in question.split():
            if w in self.qwords:
                return self.qwords_rel_cooccur.get(w + " " + rel, 0.)
        return 0.

    def add_path_feature(self, question, queries, topk=-1, debug=False):
        """
        :param question:
        :return:
            queries: a list of query
                      the increased feature keys are
                      ["path", "relation", "pattern", "relation_score", "answer"]
        """
        print '[Pipeline.add_path_feature]', question
        new_queries = []
        pattern_relation_set = set()

        for f in queries:
            mid = f['topic']
            paths = self.db_manger.get_subgraph(mid)
            # if debug:
            #     "DEBUG"
            #     print "topic", mid
            for path in paths:
                relation = path[-1][1]
                answer = path[-1][2]
                # if debug:
                #     print relation, answer
                if not self.get_name(answer):
                    # print "{} has no name, {}!!!".format(answer, relation)
                    continue
                feat = copy.deepcopy(f)
                feat['path'] = path
                feat['pattern'] = question.replace(f['mention'], '<$>')
                feat['relation'] = relation
                # relation_lemmas = [self.lemmatiser.lemmatize(w) for w in relation.split('.')[-1].split('_')]
                # pattern_lemmas = [self.lemmatiser.lemmatize(w) for w in feat['pattern'].split()]
                # feat['pattern_lemma'] = ' '.join(pattern_lemmas)
                # feat['relation_lemma'] = ' '.join(relation_lemmas)
                # feat['rel_pat_overlap'] = 1. if (set(relation_lemmas) - self.stopwords).intersection(
                #     set(pattern_lemmas)) else 0.
                relation_stem = [
                    self.stemmer.stem(w)
                    for w in relation.split('.')[-1].split('_')
                ]
                pattern_stem = [
                    self.stemmer.stem(w) for w in feat['pattern'].split()
                ]
                feat['relation_stem'] = ' '.join(relation_stem)
                feat['pattern_stem'] = ' '.join(pattern_stem)
                rel_content_stems = set(relation_stem) - self.stopwords
                feat['rel_pat_overlap'] = (
                    1. if rel_content_stems.intersection(pattern_stem) else 0.)
                pattern_relation_set.add((feat['pattern'], feat['relation']))
                feat['answer'] = answer
                feat["qw_rel_occur"] = self._get_qword_relation_cooccur(
                    question, relation)
                new_queries.append(feat)

        if len(new_queries) == 0:
            return []
        queries = None
        # generate pattern-relation match score, distributed representations of pattern and relation
        if self.use_relation_matcher:
            patterns = []
            relations = []
            for p, r in pattern_relation_set:
                patterns.append(p)
                relations.append(r)

            scores, pattern_reprs, relation_reprs = self.relation_matcher.get_batch_match_score(
                patterns, relations)
            # pattern_reprs = dict(zip(patterns, pattern_reprs))
            # relation_reprs = dict(zip(relations, relation_reprs))
            relation_match_score = dict()
            if topk > 0:
                # use pattern-relation score to filter
                pq = []
                for i, s in enumerate(scores):
                    s = float(s)

                    if len(pq) < topk:
                        heapq.heappush(pq, [s, i])
                    elif pq[0][0] < s:
                        heapq.heapreplace(pq, [s, i])
                for s, i in pq:
                    relation_match_score[(patterns[i], relations[i])] = s
            else:
                # print '[Pipeline.add_relation_match_feature] generate pattern-relation score'
                for p, r, s in itertools.izip(patterns, relations, scores):
                    relation_match_score[(p, r)] = float(s)
            ret_queries = []
            for i in xrange(len(new_queries)):
                key = (new_queries[i]['pattern'], new_queries[i]['relation'])
                if key in relation_match_score:
                    new_queries[i]['relation_score'] = relation_match_score[key]
                    ret_queries.append(new_queries[i])
                # features[i]['pattern_repr'] = pattern_reprs[features[i]['pattern']]
                # features[i]['relation_repr'] = relation_reprs[features[i]['relation']]
            return ret_queries
        else:
            return new_queries

    def get_name(self, entry):
        if entry.startswith('m.'):
            return self.db_manger.get_name(entry)[0]

        elif entry.endswith('^^gYear'):
            return entry[1:-8]
        elif entry.endswith('^^date'):
            return entry[1:5]
        elif entry.isdigit() and len(entry) == 4:  # isYear
            return entry
        else:
            # print entry, "has no name"
            return None

    def hash_query(self, path, constraints, mention, answer_constraints):
        """path contains mediator, constraints is list of triple"""
        sequence = []

        if len(path) == 2:
            # ignore mediator and answer
            sequence.append(path[0][0])  # subject
            sequence.append(path[0][1])  # first relation
            sequence.append(path[1][1])  # second relation
            sequence.append(mention)

            consts = set()
            # ignore mediator
            for c in constraints:  # order makes no sense
                consts.add((c[1], c[2]))
            for c in consts:
                sequence.extend(c)
        else:
            # ignore answer
            sequence.append(path[0][0])  # subject
            sequence.append(path[0][1])  # first relation
            sequence.append(mention)

        # add answer constraints
        consts = set()  # order makes no sense
        for c in answer_constraints:
            consts.add((c[1], c[2]))
        for c in consts:
            sequence.extend(c)
        return hash(" ".join(sequence))

    def add_answer_constraints(self, question, queries):
        question = question.split()
        male = {
            'dad', 'father', 'brother', 'grandfather', 'grandson', 'son',
            'husband'
        }
        female = {
            'mom', 'mother', 'sister', 'grandmother', 'granddaughter',
            'daughter', 'wife'
        }
        # initialize answer constraints
        for i in xrange(len(queries)):
            queries[i]['answer_constraints'] = []
            queries[i]['gender_consistency'] = 0.

        for w in question:
            # gender consistency
            if w in male or w in female:
                for i in xrange(len(queries)):
                    gender_constraints = DBManager.get_property(
                        queries[i]['answer'], "gender")
                    if len(gender_constraints) > 0:
                        queries[i]['answer_constraints'].extend(
                            gender_constraints)
                        queries[i]['gender_consistency'] = float(
                            ("m.05zppz" == gender_constraints[0][-1]
                             and w in male)
                            or ('m.02zsn' == gender_constraints[0][-1]
                                and w in female))

        question_lemma = set([self._get_lemma(w) for w in question])

        for i in xrange(len(queries)):
            queries[i]['type_consistency'] = 0.
            answer = queries[i]['answer']
            types = self.get_type(answer)
            # fraction of the words in the candidate type's name that appear in the question
            best_type_index = -1
            best_type_overlap = 0.
            for j, t in enumerate(types):
                type_name = (DBManager.get_name(t)[0].lower()
                             if t.startswith('m.') else t)
                num_overlap = 0
                for w_ in type_name.split():
                    w_ = self._get_lemma(w_)
                    if w_ in question_lemma:
                        num_overlap += 1
                if best_type_overlap < (num_overlap * 1. /
                                        len(type_name.split())):
                    best_type_overlap = (num_overlap * 1. /
                                         len(type_name.split()))
                    best_type_index = j

            if best_type_overlap > 0.:
                queries[i]['type_consistency'] = best_type_overlap
                queries[i]['answer_constraints'].append([
                    answer, 'common.topic.notable_types',
                    types[best_type_index]
                ])
                # print "$" * 40, question, self.get_name(types[best_type_index])

        return queries

    def add_cvt_constraints(self, question, queries):
        pass

    def add_constraints(self, question, queries, debug=False):
        word_in_question = set(question.split())
        candidates_topics = set()
        topic_to_mention = dict()
        for i in xrange(len(queries)):
            candidates_topics.add(queries[i]['topic'])
            topic_to_mention[queries[i]['topic']] = queries[i]['mention']

        for i in xrange(len(queries)):
            queries[i]['constraint_entity_in_q'] = 0
            queries[i]['constraint_entity_word'] = 0
            queries[i]['constraints'] = []
            # Add constraint feature for CVT
            if len(queries[i]['path']) == 2:
                cvt = queries[i]['path'][0][-1]
                cons_paths = self.db_manger.get_one_hop_path(cvt)

                num_name_cross = 0

                for _, rel, obj in cons_paths:
                    if obj in (queries[i]['answer'], queries[i]['topic']):
                        continue
                    # The constraint entity occurs in the question
                    # TODO: constraint entity can't overlap with topic entity
                    if (obj in candidates_topics and topic_to_mention[obj] !=
                            topic_to_mention[queries[i]['topic']]):
                        queries[i]['constraint_entity_in_q'] += 1
                        queries[i]['constraints'].append([cvt, rel, obj])
                    # fraction of the words in the constraint entity's name
                    # that appear in the question

                    # TODO:
                    name = self.get_name(obj)

                    if name:
                        cons_words = set(name.lower().split())
                        intersect_per = len(
                            cons_words.intersection(
                                word_in_question)) * 1.0 / len(cons_words)
                        queries[i]['constraint_entity_word'] += intersect_per

                        if intersect_per > 0:
                            num_name_cross += 1
                            # queries[i]['constraint_entity_word_detail'] += '\t'+name

                if num_name_cross > 0:
                    queries[i]['constraint_entity_word'] *= 1.0 / num_name_cross

            # Add constraint feature for answer
            # name = self.get_name(queries[i]['answer'])
            # if not name:
            #     print queries[i]['answer'], "has no name!!"
            #     queries[i]['answer_word'] = 0
            # else:
            #     answer_words = set(name.lower().split())
            #     queries[i]['answer_word'] = len(answer_words.intersection(qwords)) * 1.0 / len(answer_words)

        queries = self.add_answer_constraints(question, queries)
        return queries

    def get_type(self, entry):
        if entry.startswith('m.'):
            types = self.db_manger.get_notable_type(entry)
            return types if types else ['none']
        if entry.endswith('^^gYear'):
            return ['year']
        if entry.endswith('^^date'):
            return ['date']
        else:
            return ['illegal']

    def add_answer_feature(self, question, queries):
        """queries: this is extracted query pattern """
        qword = self.get_qword(question)
        for i in xrange(len(queries)):
            type_dist = dict()  # type distribution
            for answer in queries[i]['pattern_answer']:
                for t in self.get_type(answer):
                    type_dist[t] = type_dist.get(t, 0) + 1
            sum_ = sum(type_dist.itervalues())
            for t in type_dist.iterkeys():
                type_dist[t] *= 1. / sum_
            queries[i]['answer_types'] = type_dist
            queries[i]['qword'] = qword
        return queries

    def create_query_graph_given_topic(self, question, mention):
        question, queries = self.entity_linker.get_candidate_topic_entities_given_mention(
            question, mention)
        queries = self.add_path_feature(question, queries)
        return question, queries

    def gen_candidate_relations(self, question, debug=False):
        question, candidates = self.entity_linker.get_candidate_topic_entities(
            question)
        candidate_relations = set()
        # if debug:
        #     print question
        #     for c in candidates:
        #         print c
        for f in candidates:
            mid = f['topic']
            rels = [
                r[-1] for r in self.db_manger.get_multiple_hop_relations(mid)
            ]
            # if debug:
            #     print mid
            #     for r in rels:
            #         print r
            candidate_relations.update(rels)
        return question, candidate_relations

    def gen_candidate_query_graph(self, question, debug=False):
        # generate entity feature
        question, queries = self.entity_linker.get_candidate_topic_entities(
            question)
        queries = self.add_topic_feature(question, queries)
        # if debug:
        #     print queries
        #     for q in queries:
        #         print q
        queries = self.add_path_feature(question,
                                        queries,
                                        topk=-1,
                                        debug=debug)
        queries = self.add_constraints(question, queries, debug)

        for i in xrange(len(queries)):
            queries[i]['hash'] = self.hash_query(
                queries[i]['path'], queries[i].get('constraints', []),
                queries[i]['mention'], queries[i]['answer_constraints'])

        # if debug:
        #     for q in queries:
        #         print q
        return question, queries

    def gen_candidate_query_graph_for_prediction(self, question):
        # print '[Pipeline.gen_candidate_query_graph_for_prediction]'
        # print ' entity_linker.get_candidate_topic_entities({})'.format(question)
        question, entity_link_res = self.entity_linker.get_candidate_topic_entities(
            question)
        # print ' add_topic_feature'
        queries = self.add_topic_feature(question, entity_link_res)
        # print ' add_path_feature'
        queries = self.add_path_feature(question, queries, topk=-1)
        # print ' add_constraints'
        queries = self.add_constraints(question, queries)

        for i in xrange(len(queries)):
            queries[i]['hash'] = self.hash_query(
                queries[i]['path'], queries[i].get('constraints', []),
                queries[i]['mention'], queries[i]['answer_constraints'])

        # print ' extract query pattern'
        query_hash_to_pattern = dict()
        query_hash_to_answers = dict()
        for i in xrange(len(queries)):
            code = queries[i]['hash']

            if code not in query_hash_to_pattern:
                query_hash_to_answers[code] = set()
                query_hash_to_pattern[code] = queries[i]  # select one representative
            query_hash_to_answers[code].add(queries[i]['answer'])

        for hash_code in query_hash_to_answers.keys():
            query_hash_to_pattern[hash_code]['pattern_answer'] = list(
                query_hash_to_answers[hash_code])
            query_hash_to_pattern[hash_code]['num_answer'] = len(
                query_hash_to_answers[hash_code])
        return question, entity_link_res, query_hash_to_pattern.values()

    def extract_query_pattern_and_f1(self, queries, gold_answers, debug=False):
        if not isinstance(gold_answers, set):
            gold_answers = set(gold_answers)
        query_hash_to_answers = dict()
        query_hash_to_pattern = dict()
        for i in xrange(len(queries)):
            code = queries[i]['hash']

            if code not in query_hash_to_pattern:
                query_hash_to_answers[code] = set()
                query_hash_to_pattern[code] = queries[i]  # select one representative
            query_hash_to_answers[code].add(queries[i]['answer'])

        for hash_code in query_hash_to_answers.keys():
            _, _, f1 = compute_f1(gold_answers,
                                  query_hash_to_answers[hash_code])
            query_hash_to_pattern[hash_code]['f1'] = f1
            query_hash_to_pattern[hash_code]['pattern_answer'] = list(
                query_hash_to_answers[hash_code])
            query_hash_to_pattern[hash_code]['num_answer'] = len(
                query_hash_to_answers[hash_code])
        return query_hash_to_pattern.values()

    @staticmethod
    def to_svm_ranker_input(query_pattern, keys):
        """query_pattern must contain 'qid' """
        f = ' '.join([
            "{}:{}".format(j + 1, query_pattern.get(k, 0))
            for j, k in enumerate(keys)
        ])
        return "{} qid:{} {}".format(query_pattern['f1'], query_pattern['qid'],
                                     f)
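
End to end, a hedged sketch of the prediction path and of the SVMrank line that to_svm_ranker_input emits (the question, the qid, and the feature-key list are invented; f1 is only meaningful at training time, so it is zeroed here):

pipeline = Pipeline(use_aqqu=False, use_relation_matcher=True)
question, links, patterns = pipeline.gen_candidate_query_graph_for_prediction(
    'who plays gandalf in the lord of the rings')
keys = ['entity_score', 'mention_score', 'relation_score',
        'constraint_entity_in_q', 'constraint_entity_word']
for p in patterns:
    p['qid'], p['f1'] = 1, 0.  # to_svm_ranker_input reads both
    print Pipeline.to_svm_ranker_input(p, keys)
    # -> e.g. "0.0 qid:1 1:0.97 2:0.83 3:0.41 4:1 5:0.5"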
Example #6
    def get_candidate_topic_entities(self, sentence):
        # TODO: support gYear gDate
        """
        Returns:
            candidates: list of dict
        """
        # TODO (optimization): after finding all entities and mentions, compute
        # mention likelihoods in a single batch
        res = self.entity_mention_tagger.tag(sentence)
        print res['mentions']
        sentence = res['sentence']
        candidates = dict()
        for (surface, start), likelihood in res['mentions'].items():
            # print '-' * 20
            surface_ = surface.lower().replace(' ', '')
            entity_res = DBManager.get_candidate_entities(surface_, 0.1)
            # print "key = {}, likelihood = {}, find entities {}".format(surface, likelihood, entity_res)
            for e in entity_res:
                mid = e[0]
                entity_score = e[1]
                if entity_score >= 1.1:  # alias
                    continue
                if (mid not in candidates
                        or entity_score > candidates[mid]['entity_score']):
                    candidates[mid] = dict()
                    candidates[mid]['topic'] = mid
                    candidates[mid]['mention'] = surface
                    candidates[mid]['entity_score'] = entity_score
                    candidates[mid]['mention_score'] = likelihood
                    candidates[mid]['start'] = start
        # fall back to n-grams of the tagged mentions
        if len(candidates) == 0:
            # print '[get_candidate_topic_entities] use ngram of tagged mention'
            # all_pos = res['pos']
            for surface, start in res['mentions'].keys():
                surface = surface.lower().split()  # WHY lowercase

                if len(surface) == 0:
                    continue
                # start = find_word(sentence.split(), surface)
                # print sentence, surface, start
                # if start == -1:  # WHY?
                #     continue
                l = len(surface)
                found = False
                for j in range(l, 0, -1):
                    # if found:
                    #     break
                    for i in range(l - j + 1):
                        # if self.is_entity_occurrence(all_pos, sentence, start + i, start + i + j):
                        s = ''.join(surface[i:i + j])
                        s = ''.join(
                            [c for c in s if c not in self.punctuations])
                        entity_res = DBManager.get_candidate_entities(s, 0.1)
                        print surface[i:i + j], entity_res
                        for mid, entity_score in entity_res:

                            if entity_score < 1.1 and (
                                    mid not in candidates or entity_score >
                                    candidates[mid]['entity_score']):
                                candidates[mid] = dict()
                                candidates[mid]['topic'] = mid
                                candidates[mid]['mention'] = ' '.join(
                                    surface[i:i + j])
                                candidates[mid]['entity_score'] = entity_score
                                candidates[mid]['start'] = start + i
                                _, candidates[mid]['mention_score'] = (
                                    self.entity_mention_tagger.get_mention_likelihood(
                                        sentence, ' '.join(surface[i:i + j])))
                        found = len(entity_res) > 0
        # print '[EntityLinker.get_candidate_topic_entities] conclude'
        # for mid, info in candidates.iteritems():
        #     print mid, info
        return sentence, candidates.values()
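
The fallback branch enumerates every sub-span of a tagged mention, longest span first, and looks each one up in the entity index. The span ordering in isolation (mention invented):

surface = 'the lord of the rings'.split()
l = len(surface)
for j in range(l, 0, -1):       # span length, longest first
    for i in range(l - j + 1):  # span start
        print ' '.join(surface[i:i + j])
# 'the lord of the rings', 'the lord of the', 'lord of the rings', ...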
Example #7
    def __init__(self):
        self.relation_matcher = RelationMatcher(globals.config.get('BetaAnswer', 'relation-matcher'))
        mention_tagger = EntityMentionTagger(globals.config.get('BetaAnswer', 'entity-mention-tagger'))
        self.entity_linker = EntityLinker(mention_tagger)
        self.db_manger = DBManager()
Example #8
class FeatureExtractor(object):

    def __init__(self):
        self.relation_matcher = RelationMatcher(globals.config.get('BetaAnswer', 'relation-matcher'))
        mention_tagger = EntityMentionTagger(globals.config.get('BetaAnswer', 'entity-mention-tagger'))
        self.entity_linker = EntityLinker(mention_tagger)
        self.db_manger = DBManager()

    def add_topic_feature(self, qid, question):
        # generate entity feature
        question, features = self.entity_linker.get_candidate_topic_entities(question)
        print '[FeatureExtractor.add_topic_feature]', question
        for i in xrange(len(features)):
            features[i]['qid'] = qid
        return question, features

    def add_relation_feature(self, question, features, topk):
        extended_features = []
        pat_rel_set = set()
        relations = []
        patterns = []
        for i in xrange(len(features)):
            subject = features[i]['topic']
            core_paths = self.db_manger.get_core_paths_without_object(subject)
            for path in core_paths:
                feat = copy.deepcopy(features[i])

                feat['path'] = path
                feat['relation'] = path[-1]
                feat['pattern'] = question.replace(feat['mention'], '<$>')
                if (feat['pattern'], feat['relation']) not in pat_rel_set:
                    pat_rel_set.add((feat['pattern'], feat['relation']))
                    patterns.append(feat['pattern'])
                    relations.append(feat['relation'])
                extended_features.append(feat)
        if len(extended_features) == 0:
            return []
        features = None
        # generate pattern-relation match score, distributed representations of pattern and relation
        scores, pattern_reprs, relation_reprs = self.relation_matcher.get_batch_match_score(patterns, relations)
        # pattern_reprs = dict(zip(patterns, pattern_reprs))
        # relation_reprs = dict(zip(relations, relation_reprs))
        relation_match_score = dict()
        if topk > 0:
            # use pattern-relation score to filter
            pq = []
            for i, s in enumerate(scores):
                s = float(s)
                # if s < 0:
                #     continue
                if len(pq) < topk:
                    heapq.heappush(pq, [s, i])
                elif pq[0][0] < s:
                    heapq.heapreplace(pq, [s, i])
            for s, i in pq:
                relation_match_score[(patterns[i], relations[i])] = s
        else:
            print '[FeatureExtractor.add_relation_feature] generate pattern-relation score'
            for p, r, s in itertools.izip(patterns, relations, scores):
                relation_match_score[(p, r)] = float(s)
        ret_features = []
        for i in xrange(len(extended_features)):
            key = (extended_features[i]['pattern'],
                   extended_features[i]['relation'])
            if key in relation_match_score:
                extended_features[i]['relation_score'] = relation_match_score[key]
                ret_features.append(extended_features[i])
            # features[i]['pattern_repr'] = pattern_reprs[features[i]['pattern']]
            # features[i]['relation_repr'] = relation_reprs[features[i]['relation']]

        return ret_features

    def get_name(self, entry):
        if entry.startswith('m.'):
            return self.db_manger.get_name(entry)[0]

        elif entry.endswith('^^gYear'):
            return entry[1:-8]
        elif entry.endswith('^^date'):
            return entry[1:5]
        else:
            print entry, "has no name"
            return None

    def add_constraints(self, question, features):
        qwords = set(question.split())
        # TODO: use aqqu to find constraint entities
        candidates_topics = set()
        for i in xrange(len(features)):
            candidates_topics.add(features[i]['topic'])

        for i in xrange(len(features)):
            # Add constraint feature for CVT
            if len(features[i]['path']) == 4:
                cvt = features[i]['path'][2]
                cons_paths = self.db_manger.get_one_hop_path(cvt)
                features[i]['constraint_entity_in_q'] = 0
                features[i]['constraint_entity_word'] = 0
                features[i]['constraints'] = []
                # features[i]['constraint_entity_word_detail'] = ""
                num_name_cross = 0
                # if len(cons_paths) > 10:
                #     continue

                for _, rel, obj in cons_paths:
                    if rel == features[i]['relation'] or obj == features[i]['topic']:  # avoid constraint node being answer
                        continue
                    # The constraint entity occurs in the question
                    if obj in candidates_topics:
                        features[i]['constraint_entity_in_q'] += 1
                        features[i]['constraints'].append({
                            "source_node_index": 0,
                            "node_predicate": rel,
                            "argument": obj
                        })
                    # fraction of the words in the constraint entity's name
                    # that appear in the question
                    name = self.get_name(obj)
                    if not name:
                        print obj, "has no name!"
                    else:
                        cons_words = set(name.lower().split())
                        intersect_per = len(cons_words.intersection(qwords)) * 1.0 / len(cons_words)
                        features[i]['constraint_entity_word'] += intersect_per

                        if intersect_per > 0:
                            num_name_cross += 1
                            # features[i]['constraint_entity_word_detail'] += '\t'+name

                if num_name_cross > 0:
                    features[i]['constraint_entity_word'] *= 1.0 / num_name_cross

        return features

    def get_answer(self, path, constraints):
        return self.db_manger.get_object(path, constraints)

    def add_f1_score(self, path, constraints, gold_topic):
        pass

    @staticmethod
    def add_rank(features, gold_topic, gold_relations):
        # if e_pred == e_gold and r_pred == r_gold => rank = 3
        # else if e_pred == e_gold or r_pred == r_gold => rank = 2
        # else rank = 1
        if not isinstance(gold_relations, set):
            gold_relations = set(gold_relations)

        for i in xrange(len(features)):
            if features[i]['relation'] in gold_relations and features[i]['topic'] == gold_topic:
                features[i]['rank'] = 3
            elif features[i]['relation'] in gold_relations or features[i]['topic'] == gold_topic:
                features[i]['rank'] = 2
            else:
                features[i]['rank'] = 1
        return features

    @staticmethod
    def to_svm_ranker_input(features, keys):
        ranker_input = []
        for i in xrange(len(features)):
            f = ' '.join(["{}:{}".format(j + 1, features[i].get(k, 0)) for j, k in enumerate(keys)])
            ranker_input.append("{} qid:{} {}".format(features[i]['rank'], features[i]['qid'], f))
        return ranker_input

    def extract_query_feature(self, question, qid, gold_topic, gold_relations):
        question, features = self.add_topic_feature(qid, question)
        features = self.add_relation_feature(question, features, topk=10)
        features = self.add_constraints(question, features)
        features = self.add_rank(features, gold_topic, gold_relations)
        return question, features
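
A hedged training-time sketch (question, qid, gold MID, and gold relation are all invented): extract features for one question, then serialize them with the rank label that add_rank assigned:

extractor = FeatureExtractor()
question, feats = extractor.extract_query_feature(
    'who played gandalf', qid=1,
    gold_topic='m.0123x',                      # hypothetical gold entity MID
    gold_relations={'film.film.starring'})
keys = ['entity_score', 'mention_score', 'relation_score']
for line in FeatureExtractor.to_svm_ranker_input(feats, keys):
    print line  # e.g. "3 qid:1 1:0.97 2:0.83 3:0.41" (rank 3: topic and relation both match)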