def choose_jm(lm, params, qb_location, num_globals):
    """Return the candidate in ``params`` with the highest summed dev-fold score.

    For every candidate value, each sentence of every "dev" question is
    scored with ``lm[compare].ll`` after ``set_jm_interp(candidate)`` has
    been applied to that model.  ``compare`` is derived from the hash of
    the page modulo ``num_globals``.  Sentences whose score is NaN, or
    whose scoring raises ``OverflowError``, are skipped.

    Args:
        lm: Indexable collection of models offering ``set_jm_interp``,
            ``tokenize_and_censor`` and ``ll``.
        params: Iterable of candidate values to try.
        qb_location: Location handed to ``QuestionDatabase``.
        num_globals: Modulus used to pick the model in ``lm`` for a page.

    Returns:
        The candidate with the largest accumulated score; on ties the
        candidate that received a score first wins.

    Raises:
        ValueError: If no sentence produced a usable score for any candidate.
    """
    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()

    # Neither the page order, the model index, nor the dev subset depends on
    # the candidate being tried, so work them out once up front.
    page_plan = [
        ((hash(pp) + 1) % num_globals,
         [qq for qq in pages[pp] if qq.fold == "dev"])
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True)
    ]

    scores = defaultdict(float)
    for ll in params:
        for compare, dev_questions in page_plan:
            model = lm[compare]
            for qq in dev_questions:
                for ss in qq.text_lines():
                    model.set_jm_interp(ll)
                    text = list(model.tokenize_and_censor(ss["text"]))
                    try:
                        val = model.ll(text)
                    except OverflowError:
                        val = float("nan")
                    if not isnan(val):
                        scores[ll] += val
        # Progress report; skipped while nothing has been scored yet so the
        # failure below can carry a meaningful message instead.
        if scores:
            print(scores, max(scores.values()))

    print(scores)
    if not scores:
        raise ValueError(
            "choose_jm: no dev sentence produced a usable score for any parameter")

    # max() over the keys returns the first key holding the maximum, which
    # matches picking element [0] of "all keys equal to the maximum".
    return max(scores, key=scores.get)
def verbose(self, qb_location):
    """Print a diagnostic dump of language-model scores for dev questions.

    Pages from the question database at ``qb_location`` are visited from
    most to fewest questions.  For every corpus in ``self._lm`` that
    contains the page, each sentence of each "dev" question is printed
    together with its sentence-level score, the score of every n-gram
    chain, the chain with out-of-vocabulary tokens replaced by ``None``,
    and the time spent on the sentence.

    Args:
        qb_location: Location handed to ``QuestionDatabase``.
    """
    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()

    import time

    by_size = sorted(pages, key=lambda k: len(pages[k]), reverse=True)
    for page in by_size:
        # The page banner is printed once, before the first dev question shown.
        banner_pending = True
        compare = (hash(page) + 1) % self._globals

        for corpus in self._lm:
            if page not in self._lm[corpus]:
                continue

            dev_questions = [q for q in pages[page] if q.fold == "dev"]
            for question in dev_questions:
                if banner_pending:
                    print("--------------\t%s\t--------------" % page)
                    banner_pending = False

                for line in question.text_lines():
                    self.set_metadata(
                        question.page,
                        question.category,
                        question.qnum,
                        line["sent"],
                        0,
                        None,
                        question.fold,
                    )
                    began = time.time()

                    print("===============\t%s\t===============" % corpus)
                    print(self.vw_from_title(page, line["text"]))

                    tokens = list(
                        self._lm[corpus][0].tokenize_and_censor(line["text"]))
                    page_ll = self._lm[corpus][page].mean_ll(tokens)
                    background_ll = self._lm[corpus][compare].mean_ll(tokens)
                    sentence_score = self.text_score(corpus, page, tokens)
                    sentence_fields = (
                        page_ll,
                        background_ll,
                        self._sent_mean[corpus],
                        self._sent_var[corpus],
                        sentence_score,
                    )
                    print("sent: ([%f - %f] - %f) / %f = %f" % sentence_fields)

                    for chain in self._lm[corpus][page].ngram_chains(tokens):
                        chain_score = self.ngram_score(corpus, page, chain)
                        chain_ll = self._lm[corpus][page].mean_ll(chain)
                        background_ll = self._lm[corpus][compare].mean_ll(chain)
                        order = len(chain)
                        chain_fields = (
                            display_ngram(chain),
                            chain_ll,
                            background_ll,
                            self._ngram_mean[corpus][order],
                            self._ngram_var[corpus][order],
                            chain_score,
                        )
                        print("ngram, %s: ([%f - %f] - %f) / %f = %f"
                              % chain_fields)

                        # Show which tokens the background model knows about.
                        known = []
                        for token in chain:
                            if token in self._lm[corpus][compare]._vocab:
                                known.append(token)
                            else:
                                known.append(None)
                        print(known)

                    print("TIME: %f" % (time.time() - began))
def _set_stats(self, corpus, lm, qb_location, max_pages):
    """Compute and store sentence and n-gram score statistics for ``corpus``.

    Pages are visited from most to fewest questions.  For each sentence
    of each "dev" question whose page is in ``lm``, the difference between
    the page model's ``mean_ll`` and that of the comparison model
    (``lm[(hash(page) + 1) % self._globals]``) is recorded, both for the
    whole sentence and for every n-gram chain, grouped by chain length.
    The mean and variance of these differences are written to
    ``self._sent_mean``, ``self._sent_var``, ``self._ngram_mean`` and
    ``self._ngram_var`` under ``corpus``; n-gram statistics only use
    differences above ``self._threshold``.

    Args:
        corpus: Key under which the statistics are stored.
        lm: Container of models, indexed by page and by comparison index.
        qb_location: Location handed to ``QuestionDatabase``.
        max_pages: Maximum number of pages to scan; values <= 0 mean no limit.
    """
    sents = []
    ngrams = defaultdict(list)

    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()

    print("Computing stats for %s from %i pages ..." % (corpus, max_pages))
    page_count = 0
    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        page_count += 1
        # Once the page budget is used up no later page can contribute,
        # so stop scanning instead of skipping the remaining pages one by one.
        if max_pages > 0 and page_count > max_pages:
            break

        compare = (hash(pp) + 1) % self._globals
        for qq in [x for x in pages[pp] if x.fold == "dev"]:
            if page_count % 34 == 0:
                print("%i\t%s" % (page_count, pp))
            for ss in qq.text_lines():
                if pp not in lm:
                    continue
                text = list(lm[pp].tokenize_and_censor(ss["text"]))
                sents.append(lm[pp].mean_ll(text) - lm[compare].mean_ll(text))
                for cc in lm[pp].ngram_chains(text):
                    ngrams[len(cc)].append(
                        lm[pp].mean_ll(cc) - lm[compare].mean_ll(cc))
    print("done")

    print("Sents", sents[:10])
    self._sent_mean[corpus] = mean(sents)
    self._sent_var[corpus] = var(sents)

    # Looking up ngrams[2] on the defaultdict creates the entry when it is
    # missing, so the loop below always fills in length 2, which the summary
    # line at the end reads.
    print("Ngrams", ngrams[2][:10])
    for ii in ngrams:
        # Filter once and reuse the result for both statistics.
        kept = [x for x in ngrams[ii] if x > self._threshold]
        self._ngram_mean[corpus][ii] = mean(kept)
        self._ngram_var[corpus][ii] = var(kept)

    print(
        "Stats for %s: SM: %f, SV: %f, NM: %f, NV: %f"
        % (
            corpus,
            self._sent_mean[corpus],
            self._sent_var[corpus],
            self._ngram_mean[corpus][2],
            self._ngram_var[corpus][2],
        )
    )
c.execute(query) answer_count = defaultdict(int) for pp, in c: answer_count[pp] += 1 query = 'select page, id, naqt, fold from questions where page != ""' c = question_database.cursor() c.execute(query) print(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF)) print( len(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF))) # Load the DAN to generate guesses if they're missing from the database deep = instantiate_feature("deep", QuestionDatabase(flags.question_db)) questions = {} question_num = 0 for pp, ii, nn, ff in c: if nn >= 0 or answer_count[pp] < kCOUNT_CUTOFF: continue question_num += 1 question = McScience(pp, ii, ff) question.add_text(question_first_sentence(question_database, ii)) choices = question_top_guesses(question.text, deep, guess_database, ii, pp, flags.num_choices) question.add_choices(choices) questions[ii] = question if question_num % 100 == 0: print(pp, ii, question_num)
parser.add_argument('--neg_weight', type=float, default=0.0, help="Negative example weight") parser.add_argument('--question_out', type=str, default='', help="Where we write out questions for buzzer") parser.add_argument('--finals', type=str, default='', help="Where we write out answer after entire question") parser.add_argument('--expo', type=str, default='', help="The expo file") flags = parser.parse_args() qdb = QuestionDatabase(flags.qbdb) buzz = DictWriter(open(flags.buzzes, 'w'), fieldnames=kBUZZ_OUT) buzz.writeheader() final_out = DictWriter(open(flags.finals, 'w'), fieldnames=["question", "answer"]) final_out.writeheader() # Check file length with open(flags.meta) as infile: meta_lines = sum(1 for line in infile) with open(flags.pred) as infile: pred_lines = sum(1 for line in infile) assert meta_lines == pred_lines, "Prediction and meta files mismatch" + \ "(%s: %i vs %s: %i)" % (flags.meta, meta_lines, flags.pred, pred_lines)
import operator kBAD_ANSWERS = ["", "red river", "the", "figaro", "normal", "s", "p"] if __name__ == "__main__": args = argparse.ArgumentParser('Interactive assign pages to questions') args.add_argument('--database', type=str, default='data/questions.db', help='sqlite3 database of questions') args.add_argument('--titles', type=str, default='data/wiki_index.pkl', help='page title candiates') args.add_argument('--labels', type=str, default='data/map/ans_to_wiki', help='write page assignment answers') args = args.parse_args() # Open up the database d = QuestionDatabase(args.database) page_diversity = d.answer_map(normalize) # Set up the active learner for writing assignments al = ActiveLearner(None, args.labels) existing_labels = set(x[0] for x in al.human_labeled()) # get the candidates we want to assign to pages answers = d.unmatched_answers(existing_labels) print(answers.keys()[:10]) # Open up the title finder tf = TitleFinder(open(args.titles)) for ans, count in sorted(answers.items(), key=lambda x: sum(x[1].values()), reverse=True):
first, rest = ii.split('ID="', 1) id, rest = rest.split('" TITLE="', 1) title, rest = rest.split('"', 1) self.topics[int(id)] = title if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description='Import questions') parser.add_argument('--naqt_path', type=str) parser.add_argument('--db', type=str, default='data/questions.db') flags = parser.parse_args() qdb = QuestionDatabase(flags.db) conn = qdb._conn answer_map = qdb.answer_map() # Find existing naqt questions c = conn.cursor() command = 'SELECT naqt FROM questions WHERE naqt >= 0;' c.execute(command) existing = set(int(x[0]) for x in c) num_skipped = 0 last_id = kNAQT_START if flags.naqt_path: for qq in naqt_reader(flags.naqt_path): if qq.answer in answer_map and len(answer_map[qq.answer]) == 1: page = answer_map[qq.answer].keys()[0]
return seen if __name__ == "__main__": from util import flags flags.define_string("title_index", None, "Pickle of all titles") flags.define_string("label_path", None, "Where we write page associations") flags.define_string("database", None, "Question database") flags.define_string("performance_output", None, "Where we write user performance") flags.define_string("user", None, "User identifier") flags.InitFlags() seen = already_answered(flags.performance_output, flags.user) al = ActiveLearner(None, flags.label_path) print("Loading question db %s" % flags.database) db = QuestionDatabase(flags.database) pw = PerformanceWriter(flags.performance_output, flags.user) tf = TitleFinder(open(flags.title_index)) questions = db.questions_by_tournament("High School Championship") for qid in questions: question = questions[qid] if question.fold == "train" or qid in seen: continue choices = list(tf.query(question.answer)) # Get what and when the human answered wp, idx, ans = get_answer([question.text[x] for x in sorted(question.text)], question.answer, question.page) print("\n".join(question.text.values()))
import pickle import time from page_assignment.active_learning_for_matching import ActiveLearner from util.qdb import QuestionDatabase if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="apply wikipedia pages") parser.add_argument("--db", default='data/questions.db', type=str, help="The question database") parser.add_argument("--match_location", type=str, default='data/map/ans_to_wiki_', help="Where we read matches learned") flags = parser.parse_args() start = time.time() print("Loading db..") db = QuestionDatabase(flags.db) print("Loading classifier...") classifier = ActiveLearner(None, flags.match_location, []) for question, page in classifier.human_labeled(): ans_type = "" db.set_answer_page(question, page, ans_type) print(question, page, "GIVEN", ans_type)
print 'top@', top, 'accuracy: ', corr / len(probs) if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('--question_db', type=str, default='data/questions.db') parser.add_argument('--attribute', type=str, default='category') parser.add_argument('--bigram_thresh', type=int, default=1000) parser.add_argument("--output", type=str, default="data/classifier/", help="Where we write output file") flags = parser.parse_args() questions = QuestionDatabase(flags.question_db) bigram_filename = "%s/bigrams.pkl" % flags.output if os.path.exists(bigram_filename): bgset = pickle.load(open(bigram_filename, 'rb')) print("Using previous bigrams") else: print("computing bigrams...") bgset = compute_frequent_bigrams(flags.bigram_thresh, questions) write_bigrams(bgset, bigram_filename) train_classifier("%s/%s.pkl" % (flags.output, flags.attribute), bgset, questions, flags.attribute) evaluate("%s/%s.pkl" % (flags.output, flags.attribute), bgset, questions, flags.attribute)
from util.qdb import QuestionDatabase from extract_expo_features import add_expo_questions if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('--database', type=str, default='data/questions.db') parser.add_argument('--expo', type=str, default='') parser.add_argument('--min_pages', type=int, default=4) parser.add_argument("--output_directory", type=str, default="data/wikifier/data/input/", help="Where we write output file") flags = parser.parse_args() database = QuestionDatabase(flags.database) if flags.database: pages = database.questions_with_pages() else: pages = defaultdict(set) if flags.expo: add_expo_questions(flags.expo, pages) total = 0 for pp in pages: if len(pages[pp]) >= flags.min_pages: print(pp, len(pages[pp])) for qq in pages[pp]: total += 1 for sentence, word, text in qq.partials():
parser.add_argument('--guess_db', type=str, default='data/guesses.db', help='Where we write/read the guesses') parser.add_argument('--question_db', type=str, default='data/questions.db') parser.add_argument('--feature', type=str, default='', help="Which feature we write out") parser.add_argument("--granularity", type=str, default="sentence") parser.add_argument("--limit", type=int, default=-1, help="How many answer to write to feature files") parser.add_argument("--ans_limit", type=int, default=5, help="minimum answer limit") flags = parser.parse_args() print("Loading database from %s" % flags.question_db) questions = QuestionDatabase(flags.question_db) guess_list = GuessList(flags.guess_db) if flags.guesses: # kFEATURES["ir"] = IrExtractor() # for cc in kIR_CUTOFFS: # kFEATURES["ir"].add_index("wiki_%i" % cc, "%s_%i" % # (flags.whoosh_wiki, cc)) # kFEATURES["ir"].add_index("qb_%i" % cc, "%s_%i" % # (flags.whoosh_qb, cc)) # if kIR_CATEGORIES: # categories = questions.column_options("category") # print("Adding categories %s" % str(categories)) # for cc in categories: # kFEATURES["ir"].add_index("wiki_%s" % cc, "%s_%s" % # (flags.whoosh_wiki, cc))
from util.qdb import QuestionDatabase from extract_expo_features import add_expo_questions if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--database", type=str, default="data/questions.db") parser.add_argument("--expo", type=str, default="") parser.add_argument("--min_pages", type=int, default=4) parser.add_argument( "--output_directory", type=str, default="data/wikifier/data/input/", help="Where we write output file" ) flags = parser.parse_args() database = QuestionDatabase(flags.database) if flags.database: pages = database.questions_with_pages() else: pages = defaultdict(set) if flags.expo: add_expo_questions(flags.expo, pages) total = 0 for pp in pages: if len(pages[pp]) >= flags.min_pages: print(pp, len(pages[pp])) for qq in pages[pp]: total += 1 for sentence, word, text in qq.partials():
if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--buzzes", type=str, default="", help="Where we write resulting buzzes") parser.add_argument("--perf", type=str, default="", help="Where we write performance statistics") parser.add_argument("--pred", type=str, default="", help="Where we read predictions") parser.add_argument("--meta", type=str, default="", help="Where we read metadata values") parser.add_argument("--qbdb", type=str, default="data/questions.db", help="Source of questions") parser.add_argument("--vw_config", type=str, default="", help="Configuration of classifier") parser.add_argument("--neg_weight", type=float, default=0.0, help="Negative example weight") parser.add_argument("--question_out", type=str, default="", help="Where we write out questions for buzzer") parser.add_argument("--finals", type=str, default="", help="Where we write out answer after entire question") parser.add_argument("--expo", type=str, default="", help="The expo file") flags = parser.parse_args() qdb = QuestionDatabase(flags.qbdb) buzz = DictWriter(open(flags.buzzes, "w"), fieldnames=kBUZZ_OUT) buzz.writeheader() final_out = DictWriter(open(flags.finals, "w"), fieldnames=["question", "answer"]) final_out.writeheader() # Check file length with open(flags.meta) as infile: meta_lines = sum(1 for line in infile) with open(flags.pred) as infile: pred_lines = sum(1 for line in infile) assert meta_lines == pred_lines, "Prediction and meta files mismatch" + "(%s: %i vs %s: %i)" % ( flags.meta, meta_lines,
if __name__ == "__main__": from util import flags flags.define_string("title_index", None, "Pickle of all titles") flags.define_string("label_path", None, "Where we write page associations") flags.define_string("database", None, "Question database") flags.define_string("performance_output", None, "Where we write user performance") flags.define_string("user", None, "User identifier") flags.InitFlags() seen = already_answered(flags.performance_output, flags.user) al = ActiveLearner(None, flags.label_path) print("Loading question db %s" % flags.database) db = QuestionDatabase(flags.database) pw = PerformanceWriter(flags.performance_output, flags.user) tf = TitleFinder(open(flags.title_index)) questions = db.questions_by_tournament("High School Championship") for qid in questions: question = questions[qid] if question.fold == "train" or qid in seen: continue choices = list(tf.query(question.answer)) # Get what and when the human answered wp, idx, ans = get_answer( [question.text[x] for x in sorted(question.text)], question.answer, question.page)
parser.add_argument("--params", default="data/deep/params.pkl", help="Location of parameter pickle") parser.add_argument("--vocab", default="data/deep/deep_vocab.pkl", help="Location of vocab pickle") parser.add_argument("--ners", default="data/common/ners.pkl", help="Location of NER pickle") flags = parser.parse_args() import time start = time.time() questions = questions = QuestionDatabase("data/questions.db") page_dict = {} for page in questions.get_all_pages(): page_dict[page.lower().replace(' ', '_')] = page ws = DeepExtractor(flags.classifier, flags.params, flags.vocab, flags.ners, page_dict) print("Startup: %f sec" % (time.time() - start)) tests = {} tests[u"Tannhäuser (opera)"] = u"""He sought out the pope to seek forgiveness of his sins, only to be told that just as the pope's staff would never (*) blossom, his sins are never be forgiven. Three days later, the pope's staff miraculously bore flowers. For 10 points--identify this German folk hero, the subject of an opera by Wagner [VAHG-ner]."""
args.add_argument('--database', type=str, default='data/questions.db', help='sqlite3 database of questions') args.add_argument('--titles', type=str, default='data/wiki_index.pkl', help='page title candiates') args.add_argument('--labels', type=str, default='data/map/ans_to_wiki', help='write page assignment answers') args = args.parse_args() # Open up the database d = QuestionDatabase(args.database) page_diversity = d.answer_map(normalize) # Set up the active learner for writing assignments al = ActiveLearner(None, args.labels) existing_labels = set(x[0] for x in al.human_labeled()) # get the candidates we want to assign to pages answers = d.unmatched_answers(existing_labels) print(answers.keys()[:10]) # Open up the title finder tf = TitleFinder(open(args.titles)) for ans, count in sorted(answers.items(), key=lambda x: sum(x[1].values()),