def choose_jm(lm, params, qb_location, num_globals):
    """Grid-search the Jelinek-Mercer interpolation weight over dev questions.

    For each candidate weight in *params*, accumulate the log-likelihood of
    every dev-fold sentence under the page's background language model and
    return the weight with the highest total.

    Args:
        lm: mapping from background-model index to a language model exposing
            ``set_jm_interp``, ``tokenize_and_censor`` and ``ll``.
        params: iterable of candidate interpolation weights.
        qb_location: path to the question database.
        num_globals: number of background (global) language models.

    Returns:
        The element of *params* with the largest accumulated log-likelihood.
    """
    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()
    scores = defaultdict(float)
    for ll in params:
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            # Deterministically pick a background model for this page.
            compare = (hash(pp) + 1) % num_globals
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                for ss in qq.text_lines():
                    lm[compare].set_jm_interp(ll)
                    text = list(lm[compare].tokenize_and_censor(ss["text"]))
                    try:
                        val = lm[compare].ll(text)
                    except OverflowError:
                        val = float("nan")
                    # Skip sentences whose likelihood is undefined.
                    if not isnan(val):
                        scores[ll] += val
    print(scores)
    # Argmax over candidate weights (the original filtered against
    # max(scores.values()), recomputing the max for every element,
    # and printed the scores twice).
    return max(scores, key=scores.get)
def verbose(self, qb_location):
    """Print detailed scoring diagnostics for every dev-fold question.

    For each page (most-asked first) and each corpus language model that
    covers the page, prints the vw feature line plus the sentence-level and
    n-gram-level score breakdowns (model vs. background log-likelihoods,
    normalized by the corpus mean/variance), and the wall time per sentence.

    Args:
        qb_location: path to the question database.
    """
    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()
    import time
    # Most-asked-about pages first.
    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        need_title = True
        # Deterministically chosen background model for this page.
        compare = (hash(pp) + 1) % self._globals
        for corpus in self._lm:
            if not pp in self._lm[corpus]:
                continue
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                # Print the page header once, on the first question shown.
                if need_title:
                    print("--------------\t%s\t--------------" % pp)
                    need_title = False
                for ss in qq.text_lines():
                    self.set_metadata(qq.page, qq.category, qq.qnum,
                                      ss["sent"], 0, None, qq.fold)
                    start = time.time()
                    print("===============\t%s\t===============" % corpus)
                    print(self.vw_from_title(pp, ss["text"]))
                    # Tokenization taken from model 0; presumably all models
                    # in a corpus share a tokenizer — TODO confirm.
                    text = list(self._lm[corpus][0].tokenize_and_censor(
                        ss["text"]))
                    sent = self._lm[corpus][pp].mean_ll(text)
                    background = \
                        self._lm[corpus][compare].mean_ll(text)
                    score = self.text_score(corpus, pp, text)
                    # Normalized sentence score: (page LL - background LL
                    # - corpus mean) / corpus variance.
                    print("sent: ([%f - %f] - %f) / %f = %f" %
                          (sent, background, self._sent_mean[corpus],
                           self._sent_var[corpus], score))
                    for cc in self._lm[corpus][pp].\
                            ngram_chains(text):
                        ngram_score = self.ngram_score(corpus, pp, cc)
                        vv = self._lm[corpus][pp].mean_ll(cc)
                        background = \
                            self._lm[corpus][compare].mean_ll(cc)
                        # Normalization stats are keyed by chain length.
                        print("ngram, %s: ([%f - %f] - %f) / %f = %f" %
                              (display_ngram(cc), vv, background,
                               self._ngram_mean[corpus][len(cc)],
                               self._ngram_var[corpus][len(cc)],
                               ngram_score))
                        # Show which chain tokens are in the background
                        # model's vocabulary (None marks OOV tokens).
                        print(
                            list(x if x in self._lm[corpus][compare]._vocab
                                 else None for x in cc))
                    print("TIME: %f" % (time.time() - start))
def verbose(self, qb_location):
    """Dump per-sentence and per-n-gram scoring diagnostics for all dev
    questions, one corpus language model at a time.

    Args:
        qb_location: path to the question database.
    """
    import time

    qdb = QuestionDatabase(qb_location)
    question_pages = qdb.questions_with_pages()
    by_popularity = sorted(
        question_pages, key=lambda k: len(question_pages[k]), reverse=True
    )
    for page in by_popularity:
        header_pending = True
        # Deterministic background-model index for this page.
        background_id = (hash(page) + 1) % self._globals
        for corpus in self._lm:
            models = self._lm[corpus]
            if page not in models:
                continue
            dev_questions = (x for x in question_pages[page] if x.fold == "dev")
            for question in dev_questions:
                if header_pending:
                    # Emit the page banner once, before the first question.
                    print("--------------\t%s\t--------------" % page)
                    header_pending = False
                for line in question.text_lines():
                    self.set_metadata(
                        question.page,
                        question.category,
                        question.qnum,
                        line["sent"],
                        0,
                        None,
                        question.fold,
                    )
                    start = time.time()
                    print("===============\t%s\t===============" % corpus)
                    print(self.vw_from_title(page, line["text"]))
                    tokens = list(models[0].tokenize_and_censor(line["text"]))
                    sent = models[page].mean_ll(tokens)
                    background = models[background_id].mean_ll(tokens)
                    score = self.text_score(corpus, page, tokens)
                    # Normalized sentence score breakdown.
                    print(
                        "sent: ([%f - %f] - %f) / %f = %f"
                        % (
                            sent,
                            background,
                            self._sent_mean[corpus],
                            self._sent_var[corpus],
                            score,
                        )
                    )
                    for chain in models[page].ngram_chains(tokens):
                        ngram_score = self.ngram_score(corpus, page, chain)
                        vv = models[page].mean_ll(chain)
                        background = models[background_id].mean_ll(chain)
                        # Stats are keyed by chain length.
                        print(
                            "ngram, %s: ([%f - %f] - %f) / %f = %f"
                            % (
                                display_ngram(chain),
                                vv,
                                background,
                                self._ngram_mean[corpus][len(chain)],
                                self._ngram_var[corpus][len(chain)],
                                ngram_score,
                            )
                        )
                        # None marks chain tokens missing from the
                        # background model's vocabulary.
                        print(
                            list(
                                x if x in models[background_id]._vocab else None
                                for x in chain
                            )
                        )
                    print("TIME: %f" % (time.time() - start))
def _set_stats(self, corpus, lm, qb_location, max_pages):
    """Compute normalization statistics for *corpus* from dev questions.

    Accumulates, over up to *max_pages* pages (most-asked first), the delta
    between the page model's and the background model's mean log-likelihood
    for each sentence and each n-gram chain, then stores the mean/variance
    of those deltas in ``self._sent_mean/_sent_var`` and (per chain length)
    ``self._ngram_mean/_ngram_var``.

    Args:
        corpus: name of the corpus whose stats are being set.
        lm: mapping from page (and background index) to a language model.
        qb_location: path to the question database.
        max_pages: cap on pages scanned; <= 0 means no cap.
    """
    sents = []
    ngrams = defaultdict(list)
    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()
    print("Computing stats for %s from %i pages ..." % (corpus, max_pages))
    page_count = 0
    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # Deterministic background model for this page.
        compare = (hash(pp) + 1) % self._globals
        page_count += 1
        # Stop scanning once the page cap is reached.  (The original only
        # broke out of the inner question loop, so it kept iterating over
        # every remaining page doing no work; output is identical.)
        if max_pages > 0 and page_count > max_pages:
            break
        for qq in [x for x in pages[pp] if x.fold == "dev"]:
            # Periodic progress report.
            if page_count % 34 == 0:
                print("%i\t%s" % (page_count, pp))
            for ss in qq.text_lines():
                if pp in lm:
                    text = list(lm[pp].tokenize_and_censor(ss["text"]))
                    # Sentence-level delta: page model vs. background.
                    sents.append(lm[pp].mean_ll(text) - lm[compare].mean_ll(text))
                    for cc in lm[pp].ngram_chains(text):
                        ngrams[len(cc)].append(lm[pp].mean_ll(cc) - lm[compare].mean_ll(cc))
    print("done")
    print("Sents", sents[:10])
    self._sent_mean[corpus] = mean(sents)
    self._sent_var[corpus] = var(sents)
    print("Ngrams", ngrams[2][:10])
    for ii in ngrams:
        # Only deltas above the threshold contribute to n-gram stats.
        self._ngram_mean[corpus][ii] = mean(list(x for x in ngrams[ii] if x > self._threshold))
        self._ngram_var[corpus][ii] = var(list(x for x in ngrams[ii] if x > self._threshold))
    print(
        "Stats for %s: SM: %f, SV: %f, NM: %f, NV: %f"
        % (
            corpus,
            self._sent_mean[corpus],
            self._sent_var[corpus],
            self._ngram_mean[corpus][2],
            self._ngram_var[corpus][2],
        )
    )
def _set_stats(self, corpus, lm, qb_location, max_pages):
    """Compute normalization statistics for *corpus* from dev questions.

    Collects, for each dev sentence and each n-gram chain, the delta
    between the page model's and the background model's mean
    log-likelihood, then stores the mean/variance of those deltas in
    ``self._sent_mean/_sent_var`` and, keyed by chain length,
    ``self._ngram_mean/_ngram_var``.

    Args:
        corpus: name of the corpus whose stats are being set.
        lm: mapping from page (and background index) to a language model.
        qb_location: path to the question database.
        max_pages: cap on pages scanned; <= 0 means no cap.
    """
    sents = []
    ngrams = defaultdict(list)
    qdb = QuestionDatabase(qb_location)
    pages = qdb.questions_with_pages()
    print("Computing stats for %s from %i pages ..." % (corpus, max_pages))
    page_count = 0
    # Most-asked-about pages first.
    for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # Deterministic background model for this page.
        compare = (hash(pp) + 1) % self._globals
        page_count += 1
        for qq in [x for x in pages[pp] if x.fold == "dev"]:
            # NOTE(review): this only breaks the question loop; once the cap
            # is hit, the outer page loop still iterates every remaining
            # page (doing no work) — consider breaking at page level.
            if max_pages > 0 and page_count > max_pages:
                break
            # Periodic progress report.
            if page_count % 34 == 0:
                print("%i\t%s" % (page_count, pp))
            for ss in qq.text_lines():
                if pp in lm:
                    text = list(lm[pp].tokenize_and_censor(ss["text"]))
                    # Sentence-level delta: page model vs. background.
                    sents.append(lm[pp].mean_ll(text) - lm[compare].mean_ll(text))
                    for cc in lm[pp].ngram_chains(text):
                        ngrams[len(cc)].\
                            append(lm[pp].mean_ll(cc) - lm[compare].mean_ll(cc))
    print("done")
    print("Sents", sents[:10])
    self._sent_mean[corpus] = mean(sents)
    self._sent_var[corpus] = var(sents)
    print("Ngrams", ngrams[2][:10])
    for ii in ngrams:
        # Only deltas above the threshold contribute to n-gram stats.
        self._ngram_mean[corpus][ii] = mean(
            list(x for x in ngrams[ii] if x > self._threshold))
        self._ngram_var[corpus][ii] = var(
            list(x for x in ngrams[ii] if x > self._threshold))
    print("Stats for %s: SM: %f, SV: %f, NM: %f, NV: %f" %
          (corpus, self._sent_mean[corpus],
           self._sent_var[corpus],
           self._ngram_mean[corpus][2],
           self._ngram_var[corpus][2]))
# if kIR_CATEGORIES: # categories = questions.column_options("category") # print("Adding categories %s" % str(categories)) # for cc in categories: # kFEATURES["ir"].add_index("wiki_%s" % cc, "%s_%s" % # (flags.whoosh_wiki, cc)) # kFEATURES["ir"].add_index("qb_%s" % cc, "%s_%s" % # (flags.whoosh_qb, cc)) kFEATURES["deep"] = instantiate_feature("deep", questions) # features_that_guess = set(kFEATURES[x] for x in kHAS_GUESSES) features_that_guess = {"deep": kFEATURES["deep"]} print("Guesses %s" % "\t".join(x for x in features_that_guess)) all_questions = questions.questions_with_pages() page_num = 0 for page in all_questions: if len(all_questions[page]) < flags.ans_limit: continue else: print("%s\t%i" % (page, len(all_questions[page]))) question_num = 0 page_num += 1 for qq in all_questions[page]: # We don't need guesses for train questions if qq.fold == "train": continue question_num += 1 guesses = guesses_for_question(qq, features_that_guess,
"page": guess.strip(), "evidence": "", "final": final, "weight": weight } buzz.writerow(d) if any_buzz: break final_answer = final_guess(positions) d = {"question": question_id, "answer": final_answer} final_out.writerow(d) # Write out the questions questions = qdb.questions_with_pages() if flags.question_out: question_out = DictWriter(open(flags.question_out, 'w'), kQUES_OUT) question_out.writeheader() perf_out = DictWriter(open(flags.perf, 'w'), fieldnames=kPERF_OUT) perf_out.writeheader() for pp in questions: for qq in questions[pp]: if qq.qnum in questions_with_buzzes: # Write text for buzzer if flags.question_out: for ll in qq.text_lines(): question_out.writerow(ll) # Write performance d = questions_with_buzzes[qq.qnum]
if __name__ == "__main__": parser = argparse.ArgumentParser(description='') parser.add_argument('--database', type=str, default='data/questions.db') parser.add_argument('--expo', type=str, default='') parser.add_argument('--min_pages', type=int, default=4) parser.add_argument("--output_directory", type=str, default="data/wikifier/data/input/", help="Where we write output file") flags = parser.parse_args() database = QuestionDatabase(flags.database) if flags.database: pages = database.questions_with_pages() else: pages = defaultdict(set) if flags.expo: add_expo_questions(flags.expo, pages) total = 0 for pp in pages: if len(pages[pp]) >= flags.min_pages: print(pp, len(pages[pp])) for qq in pages[pp]: total += 1 for sentence, word, text in qq.partials(): sentence = sentence - 1 with open( "%s/%i-%i.txt" %
# (flags.whoosh_qb, cc)) # if kIR_CATEGORIES: # categories = questions.column_options("category") # print("Adding categories %s" % str(categories)) # for cc in categories: # kFEATURES["ir"].add_index("wiki_%s" % cc, "%s_%s" % # (flags.whoosh_wiki, cc)) # kFEATURES["ir"].add_index("qb_%s" % cc, "%s_%s" % # (flags.whoosh_qb, cc)) kFEATURES["deep"] = instantiate_feature("deep", questions) # features_that_guess = set(kFEATURES[x] for x in kHAS_GUESSES) features_that_guess = {"deep": kFEATURES["deep"]} print("Guesses %s" % "\t".join(x for x in features_that_guess)) all_questions = questions.questions_with_pages() page_num = 0 for page in all_questions: if len(all_questions[page]) < flags.ans_limit: continue else: print("%s\t%i" % (page, len(all_questions[page]))) question_num = 0 page_num += 1 for qq in all_questions[page]: # We don't need guesses for train questions if qq.fold == "train": continue question_num += 1 guesses = guesses_for_question(qq, features_that_guess,
if __name__ == "__main__": parser = argparse.ArgumentParser(description="") parser.add_argument("--database", type=str, default="data/questions.db") parser.add_argument("--expo", type=str, default="") parser.add_argument("--min_pages", type=int, default=4) parser.add_argument( "--output_directory", type=str, default="data/wikifier/data/input/", help="Where we write output file" ) flags = parser.parse_args() database = QuestionDatabase(flags.database) if flags.database: pages = database.questions_with_pages() else: pages = defaultdict(set) if flags.expo: add_expo_questions(flags.expo, pages) total = 0 for pp in pages: if len(pages[pp]) >= flags.min_pages: print(pp, len(pages[pp])) for qq in pages[pp]: total += 1 for sentence, word, text in qq.partials(): sentence = sentence - 1 with open("%s/%i-%i.txt" % (flags.output_directory, qq.qnum, sentence), "w") as output: output.write("%s\n" % unidecode(text[sentence]))
"page": guess.strip(), "evidence": "", "final": final, "weight": weight, } buzz.writerow(d) if any_buzz: break final_answer = final_guess(positions) d = {"question": question_id, "answer": final_answer} final_out.writerow(d) # Write out the questions questions = qdb.questions_with_pages() if flags.question_out: question_out = DictWriter(open(flags.question_out, "w"), kQUES_OUT) question_out.writeheader() perf_out = DictWriter(open(flags.perf, "w"), fieldnames=kPERF_OUT) perf_out.writeheader() for pp in questions: for qq in questions[pp]: if qq.qnum in questions_with_buzzes: # Write text for buzzer if flags.question_out: for ll in qq.text_lines(): question_out.writerow(ll) # Write performance d = questions_with_buzzes[qq.qnum]