c.execute(query) answer_count = defaultdict(int) for pp, in c: answer_count[pp] += 1 query = 'select page, id, naqt, fold from questions where page != ""' c = question_database.cursor() c.execute(query) print(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF)) print( len(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF))) # Load the DAN to generate guesses if they're missing from the database deep = instantiate_feature("deep", QuestionDatabase(flags.question_db)) questions = {} question_num = 0 for pp, ii, nn, ff in c: if nn >= 0 or answer_count[pp] < kCOUNT_CUTOFF: continue question_num += 1 question = McScience(pp, ii, ff) question.add_text(question_first_sentence(question_database, ii)) choices = question_top_guesses(question.text, deep, guess_database, ii, pp, flags.num_choices) question.add_choices(choices) questions[ii] = question if question_num % 100 == 0: print(pp, ii, question_num)
print(query) c.execute(query) answer_count = defaultdict(int) for pp, in c: answer_count[pp] += 1 query = 'select page, id, naqt, fold from questions where page != ""' c = question_database.cursor() c.execute(query) print(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF)) print(len(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF))) # Load the DAN to generate guesses if they're missing from the database deep = instantiate_feature("deep", QuestionDatabase(flags.question_db)) questions = {} question_num = 0 for pp, ii, nn, ff in c: if nn >= 0 or answer_count[pp] < kCOUNT_CUTOFF: continue question_num += 1 question = McScience(pp, ii, ff) question.add_text(question_first_sentence(question_database, ii)) choices = question_top_guesses(question.text, deep, guess_database, ii, pp, flags.num_choices) question.add_choices(choices) questions[ii] = question if question_num % 100 == 0: print(pp, ii, question_num)
def main(): import argparse parser = argparse.ArgumentParser(description='') default_path = 'data/' parser.add_argument('--question_db', type=str, default=default_path + 'questions.db') parser.add_argument('--guess_db', type=str, default=default_path + 'guesses.db', help="Guess database") parser.add_argument("--num_choices", type=int, default=4, help="How many choices do we write") parser.add_argument("--train_out", type=str, default="sci_train.csv") parser.add_argument("--test_out", type=str, default="sci_test.csv") parser.add_argument("--key_out", type=str, default="sci_key.csv") flags = parser.parse_args() # Create database connections print("Opening %s" % flags.question_db) question_database = sqlite3.connect(flags.question_db) guess_database = sqlite3.connect(flags.guess_db) # First get answers of interest and put them in a dictionary where the value is their count query = 'select page from questions where page != "" and (' query += " or ".join("category='%s'" % x for x in CATEGORIES) query += ")" c = question_database.cursor() print(query) c.execute(query) answer_count = defaultdict(int) for pp, in c: answer_count[pp] += 1 query = 'select page, id, naqt, fold from questions where page != ""' c = question_database.cursor() c.execute(query) print(list(x for x in answer_count if answer_count[x] >= COUNT_CUTOFF)) print(len(list(x for x in answer_count if answer_count[x] >= COUNT_CUTOFF))) # Load the DAN to generate guesses if they're missing from the database deep = instantiate_feature("deep", QuestionDatabase(flags.question_db)) questions = {} question_num = 0 for pp, ii, nn, ff in c: if nn >= 0 or answer_count[pp] < COUNT_CUTOFF: continue question_num += 1 question = McScience(pp, ii, ff) question.add_text(question_first_sentence(question_database, ii)) choices = question_top_guesses(question.text, deep, guess_database, ii, pp, flags.num_choices) question.add_choices(choices) questions[ii] = question if question_num % 100 == 0: print(pp, ii, question_num) print(choices) answer_choices = [ "answer%s" % CHOICEIDS[x] for x in range(flags.num_choices) ] train_out = DictWriter(open(flags.train_out, 'w'), ["id", "question", "correctAnswer"] + answer_choices) train_out.writeheader() test_out = DictWriter(open(flags.test_out, 'w'), ["id", "question"] + answer_choices) test_out.writeheader() key_out = DictWriter(open(flags.key_out, 'w'), ["id", "correctAnswer"]) key_out.writeheader() # Now write the questions out for qq in questions.values(): print(qq.fold) if qq.fold == "devtest": test_out.writerow(qq.csv_line(CHOICEIDS, "test")) key_out.writerow(qq.csv_line(CHOICEIDS, "key")) else: train_out.writerow(qq.csv_line(CHOICEIDS, "train"))
default='results/expo/questions.csv', help="Where we write out questions") flags = parser.parse_args() # Load in the exposition questions questions = add_expo_questions(flags.expo) write_question_text(questions, flags.question_out) # Create question database qdb = QuestionDatabase(flags.question_db) # Create database for guess list guess_list = GuessList(flags.guess_db) # Generate all of the guess and store them in a guess_list features_that_guess = {"deep": instantiate_feature("deep", qdb)} for page in questions: for qq in questions[page]: guesses = guesses_for_question(qq, features_that_guess, guess_list, flags.gap) print(guesses) for guesser in guesses: guess_list.add_guesses(guesser, qq.qnum, "expo", guesses[guesser]) del features_that_guess # Generate the features serially # for ff in ["label", "wikilinks"]: for ff in sorted(["label"] + kFEATURES.keys()):
default='results/expo/questions.csv', help="Where we write out questions") flags = parser.parse_args() # Load in the exposition questions questions = add_expo_questions(flags.expo) write_question_text(questions, flags.question_out) # Create question database qdb = QuestionDatabase(flags.question_db) # Create database for guess list guess_list = GuessList(flags.guess_db) # Generate all of the guess and store them in a guess_list features_that_guess = {"deep": instantiate_feature("deep", qdb)} for page in questions: for qq in questions[page]: guesses = guesses_for_question(qq, features_that_guess, guess_list, flags.gap) print(guesses) for guesser in guesses: guess_list.add_guesses(guesser, qq.qnum, "expo", guesses[guesser]) # Generate the features serially # for ff in ["label", "wikilinks"]: for ff in ["label"] + kFEATURES.keys(): print("Loading %s" % ff)
def main(): import argparse parser = argparse.ArgumentParser(description='') default_path = 'data/' parser.add_argument('--question_db', type=str, default=default_path + 'questions.db') parser.add_argument('--guess_db', type=str, default=default_path + 'guesses.db', help="Guess database") parser.add_argument("--num_choices", type=int, default=4, help="How many choices do we write") parser.add_argument("--train_out", type=str, default="sci_train.csv") parser.add_argument("--test_out", type=str, default="sci_test.csv") parser.add_argument("--key_out", type=str, default="sci_key.csv") flags = parser.parse_args() # Create database connections print("Opening %s" % flags.question_db) question_database = sqlite3.connect(flags.question_db) guess_database = sqlite3.connect(flags.guess_db) # First get answers of interest and put them in a dictionary where the value is their count query = 'select page from questions where page != "" and (' query += " or ".join("category='%s'" % x for x in CATEGORIES) query += ")" c = question_database.cursor() print(query) c.execute(query) answer_count = defaultdict(int) for pp, in c: answer_count[pp] += 1 query = 'select page, id, naqt, fold from questions where page != ""' c = question_database.cursor() c.execute(query) print(list(x for x in answer_count if answer_count[x] >= COUNT_CUTOFF)) print(len(list(x for x in answer_count if answer_count[x] >= COUNT_CUTOFF))) # Load the DAN to generate guesses if they're missing from the database deep = instantiate_feature("deep", QuestionDatabase(flags.question_db)) questions = {} question_num = 0 for pp, ii, nn, ff in c: if nn >= 0 or answer_count[pp] < COUNT_CUTOFF: continue question_num += 1 question = McScience(pp, ii, ff) question.add_text(question_first_sentence(question_database, ii)) choices = question_top_guesses(question.text, deep, guess_database, ii, pp, flags.num_choices) question.add_choices(choices) questions[ii] = question if question_num % 100 == 0: print(pp, ii, question_num) print(choices) answer_choices = ["answer%s" % CHOICEIDS[x] for x in range(flags.num_choices)] train_out = DictWriter(open(flags.train_out, 'w'), ["id", "question", "correctAnswer"] + answer_choices) train_out.writeheader() test_out = DictWriter(open(flags.test_out, 'w'), ["id", "question"] + answer_choices) test_out.writeheader() key_out = DictWriter(open(flags.key_out, 'w'), ["id", "correctAnswer"]) key_out.writeheader() # Now write the questions out for qq in questions.values(): print(qq.fold) if qq.fold == "devtest": test_out.writerow(qq.csv_line(CHOICEIDS, "test")) key_out.writerow(qq.csv_line(CHOICEIDS, "key")) else: train_out.writerow(qq.csv_line(CHOICEIDS, "train"))