コード例 #1
0
ファイル: learn_notable_types.py プロジェクト: DenXX/aqqu
def train_type_model():
    """
    Train a classifier that scores answer notable types and pickle it.

    For every evaluated query of each dataset, n-gram features are built from
    the lemmas of the parsed question and combined with each notable type that
    occurs among the target answers (label 1) or only among the candidate
    predictions (label 0).

    Empty notable types are dropped on both sides, so that an entity without a
    notable type never turns into a negative training example.

    Side effects:
      * ``type_model_data.pickle`` receives the raw ``(features, labels)``.
      * ``type-model.pickle`` receives the fitted ``(vectorizer, classifier)``.
    """
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    def notable_types_of(entity_names):
        # Resolve every name to its entity ids, then collect the notable type
        # of each id, skipping ids for which no notable type is returned.
        mids = [mid for name in entity_names
                for mid in KBEntity.get_entityid_by_name(name, keep_most_triples=True)]
        return set(notable_type
                   for notable_type in (KBEntity.get_notable_type(mid) for mid in mids)
                   if notable_type)

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for query in queries:
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            correct_notable_types = notable_types_of(query.target_result)

            # Types reachable through any candidate answer; whatever is not a
            # correct type becomes a negative example.
            candidate_notable_types = set()
            for candidate in query.eval_candidates:
                candidate_notable_types.update(notable_types_of(candidate.prediction))
            incorrect_notable_types = candidate_notable_types - correct_notable_types

            for notable_type in correct_notable_types | incorrect_notable_types:
                labels.append(1 if notable_type in correct_notable_types else 0)
                features.append(
                    feature_extractor.extract_ngram_features(n_grams, [notable_type, ], "type"))

    # Keep the raw training data so the model can be refit without recomputing it.
    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # Keep only the top 5 percent of features by chi-squared score and shrink
    # the vectorizer to match, so it can be applied directly at prediction time.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    # NOTE(review): `class_weight='auto'`, `n_iter` and `loss='log'` look like
    # the spelling of older scikit-learn releases - confirm against the pinned
    # version before upgrading the dependency.
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
コード例 #2
0
ファイル: utils.py プロジェクト: DenXX/aqqu
def get_entity_idf(entity):
    """
    Get the entity IDF based on Google's annotation of ClueWeb corpus.

    The counts file named by the ``entity-webcounts-file`` option of the
    ``WebSearchFeatures`` configuration section is read on the first call and
    cached in the module-level ``_entity_counts``.

    :param entity: The entity to lookup.
    :return: IDF of the entity based on ClueWeb collection: the smallest IDF
        over all ids the name resolves to, 1.0 if the name resolves to no id,
        and 0.0 if no counts are available.
    """
    global _entity_counts
    if _entity_counts is None:
        _entity_counts = dict()
        with gzip.open(globals.config.get('WebSearchFeatures', 'entity-webcounts-file'), 'r') as input_file:
            logger.info("Reading entity ClueWeb counts...")
            for line in input_file:
                # Use a dedicated name here: unpacking into `entity` would
                # overwrite the argument with the last id of the file, and the
                # lookup below would then be done for the wrong entity.
                counted_entity, count = line.strip().split('\t')
                _entity_counts[counted_entity] = int(count)
            logger.info("Reading entity ClueWeb counts done!")

    if not _entity_counts:
        return 0.0

    # Entity ids use dots, while the counts are keyed by slash-separated paths.
    mids = ["/" + mid.replace(".", "/")
            for mid in KBEntity.get_entityid_by_name(entity, keep_most_triples=True)]
    if not mids:
        return 1.0

    # Force true division so that an integer document count is not
    # floor-divided under Python 2.
    documents_count = float(CLUEWEB_DOCUMENTS_COUNT)

    def idf_of(mid):
        # Ids that are missing or have a non-positive count are treated as if
        # they occurred once, which gives them the largest possible IDF.
        count = _entity_counts.get(mid, 0)
        divisor = count if count > 0 else 1.0
        return log(max(1.0, documents_count / divisor))

    return min(idf_of(mid) for mid in mids)
コード例 #3
0
ファイル: dump_qa_entity_pairs.py プロジェクト: DenXX/aqqu
            #     else:
            #         for relation in query.eval_candidates[query.oracle_position - 1].query_candidate.relations:
            #             if relation.name not in correct_relations:
            #                 print query.utterance
            #                 print relation.name
            #                 print query.eval_candidates[query.oracle_position - 1].query_candidate
            #                 print "-----"

            # This loop will print questions without a good candidate
            # if query.oracle_position == -1:
            #     entities = set()
            #     for candidate in query.eval_candidates:
            #         for entity in candidate.query_candidate.matched_entities:
            #             if isinstance(entity.entity.entity, KBEntity):
            #                 entities.add((entity.entity.name, entity.entity.entity.id))
            #     print ">>>", query.utterance
            #     print entities

            for candidate in query.eval_candidates:
                answer_entities = set(mid for entity_name in candidate.prediction
                                      for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True))
                question_entities = set(mid for entity in candidate.query_candidate.matched_entities
                                        for mid in KBEntity.get_entityid_by_name(entity.entity.name,
                                                                                 keep_most_triples=True))
                for question_entity in question_entities:
                    for answer_entity in answer_entities:
                        print question_entity + "\t" + answer_entity

            if index % 100 == 0:
                print >> stderr, "Processed %d queries" % index
コード例 #4
0
ファイル: learn_notable_types.py プロジェクト: DenXX/aqqu
def extract_npmi_ngram_type_pairs():
    """
    Compute normalized PMI between question n-grams and answer notable types.

    Only queries whose oracle position points at an existing candidate are
    counted. For each of them, the n-grams of the oracle candidate are paired
    with the notable types of the target answers, skipping answers that start
    with a digit. Pairs seen more than 4 times get an NPMI score.

    Side effects:
      * ``type_model_npmi.pickle`` receives the ``{(n_gram, type): npmi}`` dict.
      * The 50 highest-scoring pairs are printed to standard output.
    """
    from collections import Counter
    from math import log
    import operator

    globals.read_configuration('config.cfg')
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    n_gram_type_counts = Counter()
    type_counts = Counter()
    n_gram_counts = Counter()
    total = 0
    # NOTE(review): despite the name, this matches any answer that starts with
    # a digit, not only years - confirm that this is intended.
    year_pattern = re.compile("[0-9]+")
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for query in queries:
            # -1 marks a query without an oracle candidate; the position is
            # used as a 1-based index into eval_candidates below.
            if query.oracle_position == -1 or query.oracle_position > len(query.eval_candidates):
                continue

            correct_candidate = query.eval_candidates[query.oracle_position - 1]
            logger.info(query.utterance)
            logger.info(correct_candidate.query_candidate)

            n_grams = set(get_n_grams_features(correct_candidate.query_candidate))

            answer_entities = [mid for answer in query.target_result
                               if year_pattern.match(answer) is None
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(
                notable_type
                for notable_type in (KBEntity.get_notable_type(mid) for mid in answer_entities)
                if notable_type)

            # Both collections are sets, so every n-gram, type and pair is
            # counted at most once per query.
            type_counts.update(correct_notable_types)
            n_gram_counts.update(n_grams)
            n_gram_type_counts.update((n_gram, notable_type)
                                      for n_gram in n_grams
                                      for notable_type in correct_notable_types)

            total += 1

    npmi = dict()
    for pair, pair_count in n_gram_type_counts.items():
        if pair_count <= 4:
            continue
        n_gram, notable_type = pair
        denominator = -log(pair_count) + log(total)
        if denominator == 0:
            # The pair occurred in every counted query: the ratio would be
            # 0 / 0, so use the NPMI value for complete co-occurrence.
            npmi[pair] = 1.0
            continue
        npmi[pair] = (log(pair_count) - log(n_gram_counts[n_gram]) - log(type_counts[notable_type]) +
                      log(total)) / denominator

    with open("type_model_npmi.pickle", 'wb') as out:
        pickle.dump(npmi, out)

    ranked = sorted(npmi.items(), key=operator.itemgetter(1), reverse=True)
    # A single parenthesized argument prints identically on Python 2 and 3.
    print("\n".join(map(str, ranked[:50])))
コード例 #5
0
ファイル: learn_notable_types.py プロジェクト: DenXX/aqqu
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)


def _dump_question_tokens_notable_types():
    """
    Pickle the tokens of every question together with its answer notable types.

    This used to sit in the script entry point after an unconditional
    ``exit()`` and therefore never ran. It is kept here as a function so it can
    be called explicitly when the dump is needed.

    Side effect: ``question_tokens_notable_types.pickle`` receives a list of
    ``(tokens, notable_types)`` tuples, one per query.
    """
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]
    # Alternative dataset selections:
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities",]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3",]

    data = []
    for dataset in datasets:
        for query in load_eval_queries(dataset):
            tokens = [token.token for token in parser.parse(query.utterance).tokens]
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            notable_types = [KBEntity.get_notable_type(entity_mid) for entity_mid in answer_entities]
            data.append((tokens, notable_types))
            logger.info(tokens)
            logger.info([KBEntity.get_entity_name(notable_type) for notable_type in notable_types])

    with open("question_tokens_notable_types.pickle", 'wb') as out:
        pickle.dump(data, out)


if __name__ == "__main__":
    # Only the NPMI extraction runs; the script stops right after it, exactly
    # as before. Call _dump_question_tokens_notable_types() instead to produce
    # the token / notable type dump.
    extract_npmi_ngram_type_pairs()
    exit()