def print_sparql_queries():
    import argparse
    parser = argparse.ArgumentParser(description="Dump qa entity pairs.")
    parser.add_argument("--config",
                        default="config.cfg",
                        help="The configuration file to use.")
    parser.add_argument("--output",
                        help="The file to dump results to.")
    args = parser.parse_args()
    globals.read_configuration(args.config)
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    dataset = "webquestions_test_filter"

    sparql_backend = globals.get_sparql_backend(globals.config)
    queries = get_evaluated_queries(dataset, True, parameters)
    for index, query in enumerate(queries):
        print "--------------------------------------------"
        print query.utterance
        # Print the entities identified in the question.
        print "\n".join([str((entity.__class__, entity.entity))
                         for entity in query.eval_candidates[0].query_candidate.query.identified_entities])
        # Print graph and SPARQL query for every evaluated candidate.
        for eval_candidate in query.eval_candidates:
            query_candidate = eval_candidate.query_candidate
            query_candidate.sparql_backend = sparql_backend
            notable_types = query_candidate.get_answers_notable_types()
            if notable_types:
                print notable_types
            print query_candidate.graph_as_simple_string().encode("utf-8")
            print query_candidate.to_sparql_query().encode("utf-8")
            print "\n\n"
def get_number_of_external_entities():
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    parser = CoreNLPParser.init_from_config()
    entity_linker = WebSearchResultsExtenderEntityLinker.init_from_config()
    entity_linker.topn_entities = 100000
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    datasets = ["webquestions_split_train", "webquestions_split_dev", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities", ]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3", ]

    external_entities_count = []
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            entities = entity_linker.identify_entities_in_tokens(
                parser.parse(query.utterance).tokens,
                text=query.utterance,
                find_dates=False)
            print "-------------------------"
            print query.utterance
            print "\n".join(map(str, sorted(entities,
                                            key=lambda entity: entity.external_entity_count,
                                            reverse=True)))
            # Count how many of the linked entities came from external (web search) results.
            external_entities_count.append(0)
            for entity in entities:
                if entity.external_entity:
                    external_entities_count[-1] += 1
            if index % 100 == 0:
                print >> sys.stderr, "%s queries processed" % index
    print "========================================="
    print external_entities_count
    print sum(external_entities_count)
    print len(external_entities_count)
def train_type_model():
    globals.read_configuration('config.cfg')
    parser = globals.get_parser()
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    feature_extractor = FeatureExtractor(False, False, n_gram_types_features=True)
    features = []
    labels = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            tokens = [token.lemma for token in parser.parse(query.utterance).tokens]
            n_grams = get_grams_feats(tokens)

            # Notable types of the gold answer entities are positive examples.
            answer_entities = [mid for answer in query.target_result
                               for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
            correct_notable_types = set(filter(lambda x: x,
                                               [KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in answer_entities]))

            # Notable types of all other candidate answers are negative examples.
            other_notable_types = set()
            for candidate in query.eval_candidates:
                entities = [mid for entity_name in candidate.prediction
                            for mid in KBEntity.get_entityid_by_name(entity_name, keep_most_triples=True)]
                other_notable_types.update(set([KBEntity.get_notable_type(entity_mid)
                                                for entity_mid in entities]))
            incorrect_notable_types = other_notable_types.difference(correct_notable_types)

            for type in correct_notable_types.union(incorrect_notable_types):
                if type in correct_notable_types:
                    labels.append(1)
                else:
                    labels.append(0)
                features.append(feature_extractor.extract_ngram_features(n_grams, [type, ], "type"))

    with open("type_model_data.pickle", 'wb') as out:
        pickle.dump((features, labels), out)

    label_encoder = LabelEncoder()
    labels = label_encoder.fit_transform(labels)
    vec = DictVectorizer(sparse=True)
    X = vec.fit_transform(features)
    # Keep only the most discriminative 5% of features and restrict the
    # vectorizer accordingly so it can be reused at prediction time.
    feature_selector = SelectPercentile(chi2, percentile=5).fit(X, labels)
    vec.restrict(feature_selector.get_support())
    X = feature_selector.transform(X)
    type_scorer = SGDClassifier(loss='log', class_weight='auto',
                                n_iter=1000,
                                alpha=1.0,
                                random_state=999,
                                verbose=5)
    type_scorer.fit(X, labels)
    with open("type-model.pickle", 'wb') as out:
        pickle.dump((vec, type_scorer), out)
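
# Illustrative sketch (not part of the original pipeline): how the
# (vectorizer, classifier) pair pickled by train_type_model() could be loaded
# and applied. It relies only on the documented scikit-learn API of
# DictVectorizer and SGDClassifier; the function name and the example feature
# dictionary below are hypothetical.
def score_notable_type_example():
    with open("type-model.pickle", 'rb') as model_file:
        vec, type_scorer = pickle.load(model_file)
    # One feature dict in the same format produced by
    # FeatureExtractor.extract_ngram_features (values here are made up).
    example_features = [{"type:who_city": 1.0}]
    X = vec.transform(example_features)
    # With loss='log' the classifier exposes class probabilities.
    print type_scorer.predict_proba(X)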
def get_question_terms():
    import scorer_globals
    globals.read_configuration('config_webentity.cfg')
    scorer_globals.init()

    datasets = ["webquestionstrain", "webquestionstest", ]

    question_tokens = set()
    for dataset in datasets:
        queries = load_eval_queries(dataset)
        for index, query in enumerate(queries):
            question_tokens.update(token for token in tokenize(query.utterance))
    print question_tokens
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Console based translation.")
    parser.add_argument("ranker_name",
                        default="WQ_Ranker",
                        help="The ranker to use.")
    parser.add_argument("--config",
                        default="config.cfg",
                        help="The configuration file to use.")
    args = parser.parse_args()
    globals.read_configuration(args.config)
    scorer_globals.init()
    if args.ranker_name not in scorer_globals.scorers_dict:
        logger.error("%s is not a valid ranker" % args.ranker_name)
        logger.error("Valid rankers are: %s " % (" ".join(scorer_globals.scorers_dict.keys())))
        return
    logger.info("Using ranker %s" % args.ranker_name)
    ranker = scorer_globals.scorers_dict[args.ranker_name]
    translator = SparqlQueryTranslator.init_from_config()
    translator.set_scorer(ranker)
    while True:
        try:
            sys.stdout.write("enter question> ")
            sys.stdout.flush()
            query = sys.stdin.readline().strip()
            logger.info("Translating query: %s" % query)
            results = translator.translate_and_execute_query(query)
            logger.info("Done translating query: %s" % query)
            logger.info("#candidates: %s" % len(results))
            logger.info("------------------- Candidate features ------------------")
            for rank, result in enumerate(results[:10]):
                logger.info("RANK " + str(rank))
                logger.info(result.query_candidate.relations)
                logger.info(result.query_candidate.get_results_text())
                if result.query_candidate.features:
                    logger.info("Features: " + str(result.query_candidate.features))
            logger.info("---------------------------------------------------------")
            if len(results) > 0:
                best_candidate = results[0].query_candidate
                sparql_query = best_candidate.to_sparql_query()
                result_rows = results[0].query_result_rows
                result = []
                # Usually we get a name + mid.
                for r in result_rows:
                    if len(r) > 1:
                        result.append("%s (%s)" % (r[1], r[0]))
                    else:
                        result.append("%s" % r[0])
                logger.info("SPARQL query: %s" % sparql_query)
                logger.info("Result: %s " % " ".join(result))
        except Exception as e:
            logger.error(e.message)
if __name__ == "__main__":
    # print_sparql_queries()
    # exit()

    import argparse
    parser = argparse.ArgumentParser(description="Dump qa entity pairs.")
    parser.add_argument("--config",
                        default="config.cfg",
                        help="The configuration file to use.")
    parser.add_argument("--output",
                        help="The file to dump results to.")
    args = parser.parse_args()
    globals.read_configuration(args.config)
    scorer_globals.init()

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    # datasets = ["webquestions_split_train", "webquestions_split_dev", ]
    # datasets = ["webquestions_split_train_externalentities", "webquestions_split_dev_externalentities", ]
    # datasets = ["webquestions_split_train_externalentities3", "webquestions_split_dev_externalentities3", ]
    datasets = ["webquestions_train_externalentities_all", "webquestions_test_externalentities_all", ]

    count = 0
    correct_relations = set()
    positions = []
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
def extract_npmi_ngram_type_pairs():
    globals.read_configuration('config.cfg')
    scorer_globals.init()

    datasets = ["webquestions_split_train", ]

    parameters = translator.TranslatorParameters()
    parameters.require_relation_match = False
    parameters.restrict_answer_type = False

    n_gram_type_counts = dict()
    type_counts = dict()
    n_gram_counts = dict()
    total = 0
    year_pattern = re.compile("[0-9]+")
    for dataset in datasets:
        queries = get_evaluated_queries(dataset, True, parameters)
        for index, query in enumerate(queries):
            # Only use queries for which a correct (oracle) candidate exists.
            if query.oracle_position != -1 and query.oracle_position <= len(query.eval_candidates):
                correct_candidate = query.eval_candidates[query.oracle_position - 1]
                logger.info(query.utterance)
                logger.info(correct_candidate.query_candidate)

                n_grams = set(get_n_grams_features(correct_candidate.query_candidate))

                # Notable types of the answer entities, skipping plain numbers/years.
                answer_entities = [mid for answer in query.target_result
                                   if year_pattern.match(answer) is None
                                   for mid in KBEntity.get_entityid_by_name(answer, keep_most_triples=True)]
                correct_notable_types = set(filter(lambda x: x,
                                                   [KBEntity.get_notable_type(entity_mid)
                                                    for entity_mid in answer_entities]))

                for notable_type in correct_notable_types:
                    if notable_type not in type_counts:
                        type_counts[notable_type] = 0
                    type_counts[notable_type] += 1

                for n_gram in n_grams:
                    if n_gram not in n_gram_counts:
                        n_gram_counts[n_gram] = 0
                    n_gram_counts[n_gram] += 1

                    for notable_type in correct_notable_types:
                        pair = (n_gram, notable_type)
                        if pair not in n_gram_type_counts:
                            n_gram_type_counts[pair] = 0
                        n_gram_type_counts[pair] += 1

                total += 1

    # Compute normalized PMI for pairs observed at least 5 times.
    npmi = dict()
    from math import log
    for n_gram_type_pair, n_gram_type_count in n_gram_type_counts.iteritems():
        if n_gram_type_count > 4:
            n_gram, type = n_gram_type_pair
            npmi[n_gram_type_pair] = (log(n_gram_type_count) - log(n_gram_counts[n_gram])
                                      - log(type_counts[type]) + log(total)) / \
                                     (-log(n_gram_type_count) + log(total))

    with open("type_model_npmi.pickle", 'wb') as out:
        pickle.dump(npmi, out)

    import operator
    npmi = sorted(npmi.items(), key=operator.itemgetter(1), reverse=True)
    print "\n".join(map(str, npmi[:50]))
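
# Illustrative sketch (an assumption, not original code): reading back the NPMI
# scores pickled by extract_npmi_ngram_type_pairs() and looking up a single
# (n-gram, notable type) pair. The example key is hypothetical; real keys are
# whatever get_n_grams_features and KBEntity.get_notable_type produced.
def lookup_npmi_example():
    with open("type_model_npmi.pickle", 'rb') as npmi_file:
        npmi = pickle.load(npmi_file)
    example_pair = (("who", "wrote"), "Book")  # hypothetical key
    if example_pair in npmi:
        print "NPMI(%s) = %.3f" % (str(example_pair), npmi[example_pair])
    else:
        print "Pair was not observed at least 5 times in the training data."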
def main():
    import argparse
    parser = argparse.ArgumentParser(description='Learn or test a'
                                                 ' scorer model.')
    parser.add_argument('--no-cached',
                        default=False,
                        action='store_true',
                        help='Don\'t use cached data if available.')
    parser.add_argument('--config',
                        default='config.cfg',
                        help='The configuration file to use.')
    subparsers = parser.add_subparsers(help='command help')
    train_parser = subparsers.add_parser('train', help='Train a scorer.')
    train_parser.add_argument('scorer_name', help='The scorer to train.')
    train_parser.set_defaults(which='train')
    test_parser = subparsers.add_parser('test', help='Test a scorer.')
    test_parser.add_argument('scorer_name', help='The scorer to test.')
    test_parser.add_argument('test_dataset',
                             help='The dataset on which to test the scorer.')
    test_parser.add_argument('--avg_runs',
                             type=int,
                             default=1,
                             help='Over how many runs to average.')
    test_parser.set_defaults(which='test')
    traintest_parser = subparsers.add_parser('traintest',
                                             help='Train and test a scorer.')
    traintest_parser.add_argument('scorer_name', help='The scorer to test.')
    traintest_parser.add_argument('test_dataset',
                                  help='The dataset on which to test the scorer.')
    traintest_parser.add_argument('--avg_runs',
                                  type=int,
                                  default=1,
                                  help='Over how many runs to average.')
    traintest_parser.set_defaults(which='traintest')
    cv_parser = subparsers.add_parser('cv', help='Cross-validate a scorer.')
    cv_parser.add_argument('scorer_name', help='The scorer to test.')
    cv_parser.add_argument('dataset',
                           help='The dataset on which to compute cv scores.')
    cv_parser.add_argument('--n_folds',
                           type=int,
                           default=6,
                           help='The number of folds.')
    cv_parser.add_argument('--avg_runs',
                           type=int,
                           default=1,
                           help='Over how many runs to average.')
    cv_parser.set_defaults(which='cv')
    print_parser = subparsers.add_parser('print', help='Print ranked results.')
    print_parser.add_argument('scorer_name', help='The scorer to test.')
    print_parser.add_argument('dataset',
                              help='The dataset on which to compute cv scores.')
    print_parser.set_defaults(which='print')
    args = parser.parse_args()
    # Read global config.
    globals.read_configuration(args.config)
    scorer_globals.init()
    # Fix randomness.
    random.seed(999)
    use_cache = not args.no_cached
    if args.which == 'train':
        train(args.scorer_name, use_cache)
    elif args.which == 'test':
        test(args.scorer_name, args.test_dataset, use_cache, avg_runs=args.avg_runs)
    elif args.which == 'traintest':
        train(args.scorer_name, use_cache)
        test(args.scorer_name, args.test_dataset, use_cache, avg_runs=args.avg_runs)
    elif args.which == 'cv':
        cv(args.scorer_name, args.dataset, use_cache,
           n_folds=args.n_folds, avg_runs=args.avg_runs)
    elif args.which == 'print':
        eval_print(args.scorer_name, args.dataset, use_cache)
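
# Example invocations of the command-line interface defined in main() above
# (assuming this module is saved as learner.py; the scorer and dataset names
# are placeholders that must exist in scorer_globals / the configured datasets):
#   python learner.py train MyScorer
#   python learner.py test MyScorer webquestions_split_dev --avg_runs 3
#   python learner.py cv MyScorer webquestions_split_train --n_folds 6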