# Example #1
    def extend_candidates(self, candidates):
        """
        Return an extended candidate list: the original candidates plus extra
        ones derived from them, e.g. copies restricted by answer notable type
        when an n-gram/notable-type NPMI model is available.
        :param candidates: A list of candidates to extend.
        :return: A new list of candidates (originals followed by extras).
        """
        extra_candidates = []
        # The extension is gated by a config flag and requires a loaded NPMI model.
        filters_enabled = globals.config.get(
            'QueryCandidateExtender', 'add-notable-types-filter-templates', '') == "True"
        if not (filters_enabled and self.ngram_notable_types_npmi and candidates):
            return candidates + extra_candidates

        for candidate in candidates:
            n_grams = set(get_n_grams_features(candidate))
            notable_types = set(t for t in candidate.get_answer_notable_types() if t)
            # A single notable type gives nothing to discriminate on.
            if len(notable_types) <= 1:
                continue

            # For every notable type collect the NPMI score of each n-gram;
            # unseen (n-gram, type) pairs contribute a ("", 0.0) placeholder.
            type_scores = dict()
            for n_gram, ntype in product(n_grams, notable_types):
                pair = (n_gram, ntype)
                scored = ((n_gram, self.ngram_notable_types_npmi[pair])
                          if pair in self.ngram_notable_types_npmi
                          else ("", 0.0))
                type_scores.setdefault(ntype, []).append(scored)

            for ntype, ngram_scores in type_scores.iteritems():
                scores = [s for _, s in ngram_scores]
                max_score = max(scores)
                # Only create a filtered copy when the best n-gram association
                # clears the configured NPMI threshold.
                if max_score > self.notable_types_npmi_threshold:
                    avg_score = avg(scores)
                    logger.info("Extending candidate with type filter:")
                    logger.info(candidate)
                    logger.info(ntype)
                    logger.info(ngram_scores)
                    new_candidate = copy.deepcopy(candidate)
                    new_candidate.filter_answers_by_type(ntype,
                                                         [max_score, avg_score])
                    extra_candidates.append(new_candidate)
                    logger.info(candidate.get_results_text())
                    logger.info(new_candidate.get_results_text())
        return candidates + extra_candidates
# Example #2
def extract_npmi_ngram_type_pairs():
    globals.read_configuration('config.cfg')
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    n_gram_type_counts = dict()
    type_counts = dict()
    n_gram_counts = dict()
    total = 0
    year_pattern = re.compile("[0-9]+")
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            if query.oracle_position != -1 and query.oracle_position <= len(query.eval_candidates):
                correct_candidate = query.eval_candidates[query.oracle_position - 1]
                logger.info(query.utterance)
                logger.info(correct_candidate.query_candidate)

                n_grams = set(get_n_grams_features(correct_candidate.query_candidate))

                answer_entities = [mid for answer in query.target_result
                                   if year_pattern.match(answer) is None
                                   for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
                correct_notable_types = set(filter(lambda x: x,
                                                   [KBEntity.get_notable_type(entity_mid)
                                                    for entity_mid in answer_entities]))

                for notable_type in correct_notable_types:
                    if notable_type not in type_counts:
                        type_counts[notable_type] = 0
                    type_counts[notable_type] += 1

                for n_gram in n_grams:
                    if n_gram not in n_gram_counts:
                        n_gram_counts[n_gram] = 0
                    n_gram_counts[n_gram] += 1

                    for notable_type in correct_notable_types:
                        pair = (n_gram, notable_type)
                        if pair not in n_gram_type_counts:
                            n_gram_type_counts[pair] = 0
                        n_gram_type_counts[pair] += 1

                total += 1

    npmi = dict()
    from math import log
    for n_gram_type_pair, n_gram_type_count in n_gram_type_counts.iteritems():
        if n_gram_type_count > 4:
            n_gram, type = n_gram_type_pair
            npmi[n_gram_type_pair] = (log(n_gram_type_count) - log(n_gram_counts[n_gram]) - log(type_counts[type]) +
                                        log(total)) / (-log(n_gram_type_count) + log(total))

    with open("type_model_npmi.pickle", 'wb') as out:
        pickle.dump(npmi, out)

    import operator
    npmi = sorted(npmi.items(), key=operator.itemgetter(1), reverse=True)
    print "\n".join(map(str, npmi[:50]))