Example #1
0
    # Run the query that was assembled earlier in this function (its
    # construction is outside this excerpt).
    c.execute(query)

    # Count how many returned rows carry each page; each row is a 1-tuple.
    answer_count = defaultdict(int)
    for pp, in c:
        answer_count[pp] += 1

    # Second query: page, id, naqt and fold for every question that has a
    # non-empty page.  A fresh cursor is used for this result set.
    query = 'select page, id, naqt, fold from questions where page != ""'
    c = question_database.cursor()
    c.execute(query)

    # Debug output: the pages that reach kCOUNT_CUTOFF, then how many there are.
    print(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF))
    print(
        len(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF)))

    # Load the DAN to generate guesses if they're missing from the database
    deep = instantiate_feature("deep", QuestionDatabase(flags.question_db))

    # questions maps question id -> McScience object for every kept question;
    # question_num counts the kept questions (used only for progress output).
    questions = {}
    question_num = 0
    for pp, ii, nn, ff in c:
        # Skip rows whose naqt value is >= 0 and rows whose page falls below
        # the count cutoff.  Because answer_count is a defaultdict, looking up
        # a page that the first query never returned adds it with count 0.
        if nn >= 0 or answer_count[pp] < kCOUNT_CUTOFF:
            continue
        question_num += 1
        question = McScience(pp, ii, ff)
        question.add_text(question_first_sentence(question_database, ii))
        # The choices are requested for the text that was just attached;
        # flags.num_choices is passed as the final argument.
        choices = question_top_guesses(question.text, deep, guess_database, ii,
                                       pp, flags.num_choices)
        question.add_choices(choices)
        questions[ii] = question
        # Progress report every 100 kept questions.
        if question_num % 100 == 0:
            print(pp, ii, question_num)
Example #2
0
    # Echo and run the query that was assembled earlier in this function (its
    # construction is outside this excerpt).
    print(query)
    c.execute(query)

    # Count how many returned rows carry each page; each row is a 1-tuple.
    answer_count = defaultdict(int)
    for pp, in c:
        answer_count[pp] += 1

    # Second query: page, id, naqt and fold for every question that has a
    # non-empty page.  A fresh cursor is used for this result set.
    query = 'select page, id, naqt, fold from questions where page != ""'
    c = question_database.cursor()
    c.execute(query)

    # Debug output: the pages that reach kCOUNT_CUTOFF, then how many there are.
    print(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF))
    print(len(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF)))

    # Load the DAN to generate guesses if they're missing from the database
    deep = instantiate_feature("deep", QuestionDatabase(flags.question_db))

    # questions maps question id -> McScience object for every kept question;
    # question_num counts the kept questions (used only for progress output).
    questions = {}
    question_num = 0
    for pp, ii, nn, ff in c:
        # Skip rows whose naqt value is >= 0 and rows whose page falls below
        # the count cutoff.  Because answer_count is a defaultdict, looking up
        # a page that the first query never returned adds it with count 0.
        if nn >= 0 or answer_count[pp] < kCOUNT_CUTOFF:
            continue
        question_num += 1
        question = McScience(pp, ii, ff)
        question.add_text(question_first_sentence(question_database, ii))
        # The choices are requested for the text that was just attached;
        # flags.num_choices is passed as the final argument.
        choices = question_top_guesses(question.text, deep, guess_database, ii, pp,
                                       flags.num_choices)
        question.add_choices(choices)
        questions[ii] = question
        # Progress report every 100 kept questions.
        if question_num % 100 == 0:
            print(pp, ii, question_num)
Example #3
0
def main():
    """Generate multiple-choice question CSV files from the question database.

    Reads the command-line flags, selects questions, attaches text and answer
    choices to each one, and writes three CSV files:

    * ``--train_out``: header ``id, question, correctAnswer`` plus one column
      per answer choice; receives every selected question whose fold is not
      ``"devtest"``.
    * ``--test_out``: header ``id, question`` plus the answer-choice columns;
      receives the ``"devtest"`` questions.
    * ``--key_out``: header ``id, correctAnswer``; receives the ``"devtest"``
      questions.

    A question is skipped when its ``naqt`` value is ``>= 0`` or when its page
    occurs fewer than ``COUNT_CUTOFF`` times among the questions whose
    category is listed in ``CATEGORIES``.

    Both database connections and all three output files are closed when the
    function exits, including when an exception is raised part-way through.
    """
    import argparse
    from contextlib import closing

    parser = argparse.ArgumentParser(description='')
    default_path = 'data/'
    parser.add_argument('--question_db',
                        type=str,
                        default=default_path + 'questions.db')
    parser.add_argument('--guess_db',
                        type=str,
                        default=default_path + 'guesses.db',
                        help="Guess database")
    parser.add_argument("--num_choices",
                        type=int,
                        default=4,
                        help="How many choices do we write")
    parser.add_argument("--train_out", type=str, default="sci_train.csv")
    parser.add_argument("--test_out", type=str, default="sci_test.csv")
    parser.add_argument("--key_out", type=str, default="sci_key.csv")
    flags = parser.parse_args()

    # Create database connections.  closing() only guarantees close(); it
    # never commits on our behalf.
    print("Opening %s" % flags.question_db)
    with closing(sqlite3.connect(flags.question_db)) as question_database, \
            closing(sqlite3.connect(flags.guess_db)) as guess_database:

        # First get answers of interest and put them in a dictionary where
        # the value is their count.  CATEGORIES values are interpolated
        # directly into the SQL text, so they must not contain single quotes.
        query = 'select page from questions where page != "" and ('
        query += " or ".join("category='%s'" % x for x in CATEGORIES)
        query += ")"
        c = question_database.cursor()
        print(query)
        c.execute(query)

        answer_count = defaultdict(int)
        for pp, in c:
            answer_count[pp] += 1

        query = 'select page, id, naqt, fold from questions where page != ""'
        c = question_database.cursor()
        c.execute(query)

        # Debug output: the pages that reach the cutoff, then how many there
        # are.  The list is built once and reused for both prints.
        frequent_answers = [
            x for x in answer_count if answer_count[x] >= COUNT_CUTOFF
        ]
        print(frequent_answers)
        print(len(frequent_answers))

        # Load the DAN to generate guesses if they're missing from the database
        deep = instantiate_feature("deep",
                                   QuestionDatabase(flags.question_db))

        # questions maps question id -> McScience object for kept questions.
        questions = {}
        question_num = 0
        for pp, ii, nn, ff in c:
            # Skip rows with naqt >= 0 and pages below the count cutoff.
            if nn >= 0 or answer_count[pp] < COUNT_CUTOFF:
                continue
            question_num += 1
            question = McScience(pp, ii, ff)
            question.add_text(question_first_sentence(question_database, ii))
            choices = question_top_guesses(question.text, deep,
                                           guess_database, ii, pp,
                                           flags.num_choices)
            question.add_choices(choices)
            questions[ii] = question
            # Progress report every 100 kept questions.
            if question_num % 100 == 0:
                print(pp, ii, question_num)
                print(choices)

        answer_choices = [
            "answer%s" % CHOICEIDS[x] for x in range(flags.num_choices)
        ]

        # The output files are opened only after every question has been
        # built, so a failure above leaves existing files untouched.  The
        # with-statement flushes and closes them even if writing fails.
        with open(flags.train_out, 'w') as train_file, \
                open(flags.test_out, 'w') as test_file, \
                open(flags.key_out, 'w') as key_file:
            train_out = DictWriter(
                train_file,
                ["id", "question", "correctAnswer"] + answer_choices)
            train_out.writeheader()

            test_out = DictWriter(test_file,
                                  ["id", "question"] + answer_choices)
            test_out.writeheader()

            key_out = DictWriter(key_file, ["id", "correctAnswer"])
            key_out.writeheader()

            # Now write the questions out
            for qq in questions.values():
                print(qq.fold)
                if qq.fold == "devtest":
                    test_out.writerow(qq.csv_line(CHOICEIDS, "test"))
                    key_out.writerow(qq.csv_line(CHOICEIDS, "key"))
                else:
                    train_out.writerow(qq.csv_line(CHOICEIDS, "train"))
Example #4
0
                        default='results/expo/questions.csv',
                        help="Where we write out questions")

    # Parse the command-line flags declared above (the declarations are
    # outside this excerpt).
    flags = parser.parse_args()
    # Load in the exposition questions
    questions = add_expo_questions(flags.expo)
    # Hand the loaded questions and the flags.question_out path to
    # write_question_text.
    write_question_text(questions, flags.question_out)

    # Create question database
    qdb = QuestionDatabase(flags.question_db)

    # Create database for guess list
    guess_list = GuessList(flags.guess_db)

    # Generate all of the guess and store them in a guess_list
    # Only the "deep" feature is used to produce guesses here.
    features_that_guess = {"deep": instantiate_feature("deep", qdb)}

    # questions is indexed by page; each entry is iterated as a collection of
    # question objects.
    for page in questions:
        for qq in questions[page]:
            guesses = guesses_for_question(qq, features_that_guess,
                                           guess_list, flags.gap)
            print(guesses)

            # guesses is keyed by guesser; each guesser's guesses are stored
            # under the question's qnum with the literal tag "expo".
            for guesser in guesses:
                guess_list.add_guesses(guesser, qq.qnum, "expo",
                                       guesses[guesser])
    # Drop the local reference to the guessing features once all guesses are
    # stored (presumably to release memory -- confirm).
    del features_that_guess

    # Generate the features serially
    # for ff in ["label", "wikilinks"]:
    for ff in sorted(["label"] + kFEATURES.keys()):
Example #5
0
                        default='results/expo/questions.csv',
                        help="Where we write out questions")

    # Parse the command-line flags declared above (the declarations are
    # outside this excerpt).
    flags = parser.parse_args()
    # Load in the exposition questions
    questions = add_expo_questions(flags.expo)
    # Hand the loaded questions and the flags.question_out path to
    # write_question_text.
    write_question_text(questions, flags.question_out)

    # Create question database
    qdb = QuestionDatabase(flags.question_db)

    # Create database for guess list
    guess_list = GuessList(flags.guess_db)

    # Generate all of the guess and store them in a guess_list
    # Only the "deep" feature is used to produce guesses here.
    features_that_guess = {"deep": instantiate_feature("deep", qdb)}

    # questions is indexed by page; each entry is iterated as a collection of
    # question objects.
    for page in questions:
        for qq in questions[page]:
            guesses = guesses_for_question(qq, features_that_guess, guess_list,
                                           flags.gap)
            print(guesses)

            # guesses is keyed by guesser; each guesser's guesses are stored
            # under the question's qnum with the literal tag "expo".
            for guesser in guesses:
                guess_list.add_guesses(guesser, qq.qnum, "expo",
                                       guesses[guesser])

    # Generate the features serially
    # for ff in ["label", "wikilinks"]:
    # NOTE(review): if kFEATURES is a dict and this runs on Python 3,
    # kFEATURES.keys() is a view and list + view raises TypeError; wrapping it
    # in list(...) would be needed -- confirm the target Python version.
    for ff in ["label"] + kFEATURES.keys():
        print("Loading %s" % ff)
Example #6
0
def main():
    """Generate multiple-choice question CSV files from the question database.

    Reads the command-line flags, selects questions, attaches text and answer
    choices to each one, and writes three CSV files:

    * ``--train_out``: header ``id, question, correctAnswer`` plus one column
      per answer choice; receives every selected question whose fold is not
      ``"devtest"``.
    * ``--test_out``: header ``id, question`` plus the answer-choice columns;
      receives the ``"devtest"`` questions.
    * ``--key_out``: header ``id, correctAnswer``; receives the ``"devtest"``
      questions.

    A question is skipped when its ``naqt`` value is ``>= 0`` or when its page
    occurs fewer than ``COUNT_CUTOFF`` times among the questions whose
    category is listed in ``CATEGORIES``.

    Both database connections and all three output files are closed when the
    function exits, including when an exception is raised part-way through.
    """
    import argparse
    from contextlib import closing

    parser = argparse.ArgumentParser(description='')
    default_path = 'data/'
    parser.add_argument('--question_db', type=str, default=default_path + 'questions.db')
    parser.add_argument('--guess_db', type=str, default=default_path + 'guesses.db',
                        help="Guess database")
    parser.add_argument("--num_choices", type=int, default=4,
                        help="How many choices do we write")
    parser.add_argument("--train_out", type=str, default="sci_train.csv")
    parser.add_argument("--test_out", type=str, default="sci_test.csv")
    parser.add_argument("--key_out", type=str, default="sci_key.csv")
    flags = parser.parse_args()

    # Create database connections.  closing() only guarantees close(); it never
    # commits on our behalf.
    print("Opening %s" % flags.question_db)
    with closing(sqlite3.connect(flags.question_db)) as question_database, \
            closing(sqlite3.connect(flags.guess_db)) as guess_database:

        # First get answers of interest and put them in a dictionary where the value is
        # their count.  CATEGORIES values are interpolated directly into the SQL text,
        # so they must not contain single quotes.
        query = 'select page from questions where page != "" and ('
        query += " or ".join("category='%s'" % x for x in CATEGORIES)
        query += ")"
        c = question_database.cursor()
        print(query)
        c.execute(query)

        answer_count = defaultdict(int)
        for pp, in c:
            answer_count[pp] += 1

        query = 'select page, id, naqt, fold from questions where page != ""'
        c = question_database.cursor()
        c.execute(query)

        # Debug output: the pages that reach the cutoff, then how many there are.
        # The list is built once and reused for both prints.
        frequent_answers = [x for x in answer_count if answer_count[x] >= COUNT_CUTOFF]
        print(frequent_answers)
        print(len(frequent_answers))

        # Load the DAN to generate guesses if they're missing from the database
        deep = instantiate_feature("deep", QuestionDatabase(flags.question_db))

        # questions maps question id -> McScience object for kept questions.
        questions = {}
        question_num = 0
        for pp, ii, nn, ff in c:
            # Skip rows with naqt >= 0 and pages below the count cutoff.
            if nn >= 0 or answer_count[pp] < COUNT_CUTOFF:
                continue
            question_num += 1
            question = McScience(pp, ii, ff)
            question.add_text(question_first_sentence(question_database, ii))
            choices = question_top_guesses(question.text, deep, guess_database, ii, pp,
                                           flags.num_choices)
            question.add_choices(choices)
            questions[ii] = question
            # Progress report every 100 kept questions.
            if question_num % 100 == 0:
                print(pp, ii, question_num)
                print(choices)

        answer_choices = ["answer%s" % CHOICEIDS[x] for x in range(flags.num_choices)]

        # The output files are opened only after every question has been built, so a
        # failure above leaves existing files untouched.  The with-statement flushes
        # and closes them even if writing fails.
        with open(flags.train_out, 'w') as train_file, \
                open(flags.test_out, 'w') as test_file, \
                open(flags.key_out, 'w') as key_file:
            train_out = DictWriter(train_file, ["id", "question", "correctAnswer"] +
                                   answer_choices)
            train_out.writeheader()

            test_out = DictWriter(test_file, ["id", "question"] + answer_choices)
            test_out.writeheader()

            key_out = DictWriter(key_file, ["id", "correctAnswer"])
            key_out.writeheader()

            # Now write the questions out
            for qq in questions.values():
                print(qq.fold)
                if qq.fold == "devtest":
                    test_out.writerow(qq.csv_line(CHOICEIDS, "test"))
                    key_out.writerow(qq.csv_line(CHOICEIDS, "key"))
                else:
                    train_out.writerow(qq.csv_line(CHOICEIDS, "train"))