Code example #1
File: lm.py  Project: jankim/qb
# Imports inferred from the other lm.py snippets in this project; the
# original file may organize them differently.
from collections import defaultdict
from math import isnan

from util.qdb import QuestionDatabase


def choose_jm(lm, params, qb_location, num_globals):
    qdb = QuestionDatabase(qb_location)

    pages = qdb.questions_with_pages()
    scores = defaultdict(float)
    for ll in params:
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            compare = (hash(pp) + 1) % num_globals
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                for ss in qq.text_lines():
                    lm[compare].set_jm_interp(ll)
                    text = list(lm[compare].tokenize_and_censor(ss["text"]))
                    try:
                        val = lm[compare].ll(text)
                    except OverflowError:
                        val = float("nan")
                    if isnan(val):
                        continue
                    else:
                        scores[ll] += val

    print(scores, max(scores.values()))
    print(scores)

    return [x for x in scores if scores[x] == max(scores.values())][0]
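Note: the final line is an argmax over the accumulated log-likelihoods. A minimal, self-contained sketch of an equivalent selection (the scores below are made up for illustration and are not from the project):

from collections import defaultdict

# Hypothetical scores, standing in for the dict built by choose_jm above.
scores = defaultdict(float, {0.1: -120.5, 0.5: -98.2, 0.9: -101.7})

# Equivalent to: [x for x in scores if scores[x] == max(scores.values())][0]
best_ll = max(scores, key=scores.get)
print(best_ll)  # -> 0.5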
Code example #2
File: lm.py  Project: jankim/qb
    def verbose(self, qb_location):
        qdb = QuestionDatabase(qb_location)
        pages = qdb.questions_with_pages()
        import time

        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            need_title = True
            compare = (hash(pp) + 1) % self._globals
            for corpus in self._lm:
                if pp not in self._lm[corpus]:
                    continue

                for qq in [x for x in pages[pp] if x.fold == "dev"]:
                    if need_title:
                        print("--------------\t%s\t--------------" % pp)
                        need_title = False
                    for ss in qq.text_lines():
                        self.set_metadata(qq.page, qq.category, qq.qnum, ss["sent"], 0, None, qq.fold)
                        start = time.time()
                        print("===============\t%s\t===============" % corpus)
                        print(self.vw_from_title(pp, ss["text"]))
                        text = list(self._lm[corpus][0].tokenize_and_censor(ss["text"]))
                        sent = self._lm[corpus][pp].mean_ll(text)
                        background = self._lm[corpus][compare].mean_ll(text)
                        score = self.text_score(corpus, pp, text)
                        print(
                            "sent: ([%f - %f] - %f) / %f = %f"
                            % (sent, background, self._sent_mean[corpus], self._sent_var[corpus], score)
                        )

                        for cc in self._lm[corpus][pp].ngram_chains(text):
                            ngram_score = self.ngram_score(corpus, pp, cc)
                            vv = self._lm[corpus][pp].mean_ll(cc)
                            background = self._lm[corpus][compare].mean_ll(cc)
                            print(
                                "ngram, %s: ([%f - %f] - %f) / %f = %f"
                                % (
                                    display_ngram(cc),
                                    vv,
                                    background,
                                    self._ngram_mean[corpus][len(cc)],
                                    self._ngram_var[corpus][len(cc)],
                                    ngram_score,
                                )
                            )
                            print(list(x if x in self._lm[corpus][compare]._vocab else None for x in cc))
                        print("TIME: %f" % (time.time() - start))
Code example #3
File: lm.py  Project: jankim/qb
    def _set_stats(self, corpus, lm, qb_location, max_pages):
        sents = []
        ngrams = defaultdict(list)

        qdb = QuestionDatabase(qb_location)
        pages = qdb.questions_with_pages()

        print("Computing stats for %s from %i pages ..." % (corpus, max_pages))
        page_count = 0
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            compare = (hash(pp) + 1) % self._globals
            page_count += 1
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                if max_pages > 0 and page_count > max_pages:
                    break
                if page_count % 34 == 0:
                    print("%i\t%s" % (page_count, pp))
                for ss in qq.text_lines():
                    if pp in lm:
                        text = list(lm[pp].tokenize_and_censor(ss["text"]))
                        sents.append(lm[pp].mean_ll(text) - lm[compare].mean_ll(text))

                        for cc in lm[pp].ngram_chains(text):
                            ngrams[len(cc)].append(lm[pp].mean_ll(cc) - lm[compare].mean_ll(cc))
        print("done")

        print("Sents", sents[:10])
        self._sent_mean[corpus] = mean(sents)
        self._sent_var[corpus] = var(sents)

        print("Ngrams", ngrams[2][:10])
        for ii in ngrams:
            self._ngram_mean[corpus][ii] = mean(list(x for x in ngrams[ii] if x > self._threshold))
            self._ngram_var[corpus][ii] = var(list(x for x in ngrams[ii] if x > self._threshold))

        print(
            "Stats for %s: SM: %f, SV: %f, NM: %f, NV: %f"
            % (
                corpus,
                self._sent_mean[corpus],
                self._sent_var[corpus],
                self._ngram_mean[corpus][2],
                self._ngram_var[corpus][2],
            )
        )
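The mean and var calls here are presumably numpy's; the stored statistics are just the sample mean and variance of the per-sentence (and per-ngram-length) log-likelihood differences. A small standalone sketch under that assumption, with invented values:

import numpy as np

# Hypothetical page-minus-background mean log-likelihood differences.
diffs = [-0.4, 1.3, 0.7, -0.1, 2.2]
sent_mean, sent_var = np.mean(diffs), np.var(diffs)
print(sent_mean, sent_var)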
Code example #4
File: build_science_mc.py  Project: BinbinBian/qb
    c.execute(query)

    answer_count = defaultdict(int)
    for pp, in c:
        answer_count[pp] += 1

    query = 'select page, id, naqt, fold from questions where page != ""'
    c = question_database.cursor()
    c.execute(query)

    print(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF))
    print(
        len(list(x for x in answer_count if answer_count[x] >= kCOUNT_CUTOFF)))

    # Load the DAN to generate guesses if they're missing from the database
    deep = instantiate_feature("deep", QuestionDatabase(flags.question_db))

    questions = {}
    question_num = 0
    for pp, ii, nn, ff in c:
        if nn >= 0 or answer_count[pp] < kCOUNT_CUTOFF:
            continue
        question_num += 1
        question = McScience(pp, ii, ff)
        question.add_text(question_first_sentence(question_database, ii))
        choices = question_top_guesses(question.text, deep, guess_database, ii,
                                       pp, flags.num_choices)
        question.add_choices(choices)
        questions[ii] = question
        if question_num % 100 == 0:
            print(pp, ii, question_num)
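The `for pp, in c:` loop above relies on the fact that a one-column sqlite3 query yields 1-tuples, which the trailing comma unpacks. A standalone sketch of that counting pattern (the in-memory table here is illustrative, not the project's actual schema):

import sqlite3
from collections import defaultdict

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE questions (page TEXT)")
conn.executemany("INSERT INTO questions VALUES (?)",
                 [("Paris",), ("Paris",), ("Tokyo",)])

answer_count = defaultdict(int)
for pp, in conn.execute("SELECT page FROM questions"):
    answer_count[pp] += 1
print(dict(answer_count))  # -> {'Paris': 2, 'Tokyo': 1}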
Code example #5
File: evaluate_predictions.py  Project: zhimingz/qb
    parser.add_argument('--neg_weight',
                        type=float,
                        default=0.0,
                        help="Negative example weight")
    parser.add_argument('--question_out',
                        type=str,
                        default='',
                        help="Where we write out questions for buzzer")
    parser.add_argument('--finals',
                        type=str,
                        default='',
                        help="Where we write out answer after entire question")
    parser.add_argument('--expo', type=str, default='', help="The expo file")

    flags = parser.parse_args()
    qdb = QuestionDatabase(flags.qbdb)

    buzz = DictWriter(open(flags.buzzes, 'w'), fieldnames=kBUZZ_OUT)
    buzz.writeheader()

    final_out = DictWriter(open(flags.finals, 'w'),
                           fieldnames=["question", "answer"])
    final_out.writeheader()

    # Check file length
    with open(flags.meta) as infile:
        meta_lines = sum(1 for line in infile)
    with open(flags.pred) as infile:
        pred_lines = sum(1 for line in infile)
    assert meta_lines == pred_lines, "Prediction and meta files mismatch " + \
        "(%s: %i vs %s: %i)" % (flags.meta, meta_lines, flags.pred, pred_lines)
Code example #6
File: human_in_loop_assignment.py  Project: jankim/qb
import operator

kBAD_ANSWERS = ["", "red river", "the", "figaro", "normal", "s", "p"]

if __name__ == "__main__":
    args = argparse.ArgumentParser('Interactive assign pages to questions')
    args.add_argument('--database', type=str, default='data/questions.db',
                      help='sqlite3 database of questions')
    args.add_argument('--titles', type=str, default='data/wiki_index.pkl',
                      help='page title candidates')
    args.add_argument('--labels', type=str, default='data/map/ans_to_wiki',
                      help='write page assignment answers')
    args = args.parse_args()

    # Open up the database
    d = QuestionDatabase(args.database)
    page_diversity = d.answer_map(normalize)
    
    # Set up the active learner for writing assignments
    al = ActiveLearner(None, args.labels)
    existing_labels = set(x[0] for x in al.human_labeled())

    # get the candidates we want to assign to pages
    answers = d.unmatched_answers(existing_labels)
    print(list(answers.keys())[:10])

    # Open up the title finder
    tf = TitleFinder(open(args.titles))

    for ans, count in sorted(answers.items(), key=lambda x: sum(x[1].values()),
                             reverse=True):
Code example #7
File: naqt.py  Project: jankim/qb
            first, rest = ii.split('ID="', 1)
            id, rest = rest.split('" TITLE="', 1)
            title, rest = rest.split('"', 1)
            self.topics[int(id)] = title


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Import questions')
    parser.add_argument('--naqt_path', type=str)
    parser.add_argument('--db', type=str, default='data/questions.db')

    flags = parser.parse_args()

    qdb = QuestionDatabase(flags.db)
    conn = qdb._conn
    answer_map = qdb.answer_map()

    # Find existing naqt questions
    c = conn.cursor()
    command = 'SELECT naqt FROM questions WHERE naqt >= 0;'
    c.execute(command)
    existing = set(int(x[0]) for x in c)

    num_skipped = 0
    last_id = kNAQT_START
    if flags.naqt_path:
        for qq in naqt_reader(flags.naqt_path):
            if qq.answer in answer_map and len(answer_map[qq.answer]) == 1:
                page = list(answer_map[qq.answer].keys())[0]
Code example #8
File: client.py  Project: jankim/qb
    return seen

if __name__ == "__main__":
    from util import flags

    flags.define_string("title_index", None, "Pickle of all titles")
    flags.define_string("label_path", None, "Where we write page associations")
    flags.define_string("database", None, "Question database")
    flags.define_string("performance_output", None, "Where we write user performance")
    flags.define_string("user", None, "User identifier")
    flags.InitFlags()

    seen = already_answered(flags.performance_output, flags.user)
    al = ActiveLearner(None, flags.label_path)
    print("Loading question db %s" % flags.database)
    db = QuestionDatabase(flags.database)
    pw = PerformanceWriter(flags.performance_output, flags.user)
    tf = TitleFinder(open(flags.title_index))


    questions = db.questions_by_tournament("High School Championship")
    for qid in questions:
        question = questions[qid]
        if question.fold == "train" or qid in seen:
            continue
        choices = list(tf.query(question.answer))

        # Get what and when the human answered
        wp, idx, ans = get_answer([question.text[x] for x in sorted(question.text)], question.answer, question.page)

        print("\n".join(question.text.values()))
Code example #9
File: apply_wiki.py  Project: zhimingz/qb
import pickle
import time

from page_assignment.active_learning_for_matching import ActiveLearner
from util.qdb import QuestionDatabase


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="apply wikipedia pages")
    parser.add_argument("--db", default='data/questions.db', type=str,
                        help="The question database")
    parser.add_argument("--match_location", type=str,
                        default='data/map/ans_to_wiki_',
                        help="Where we read matches learned")

    flags = parser.parse_args()

    start = time.time()
    print("Loading db..")
    db = QuestionDatabase(flags.db)
    print("Loading classifier...")
    classifier = ActiveLearner(None, flags.match_location, [])

    for question, page in classifier.human_labeled():
        ans_type = ""
        db.set_answer_page(question, page, ans_type)
        print(question, page, "GIVEN", ans_type)
Code example #10
    print('top@', top, 'accuracy: ', corr / len(probs))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--question_db', type=str, default='data/questions.db')
    parser.add_argument('--attribute', type=str, default='category')
    parser.add_argument('--bigram_thresh', type=int, default=1000)
    parser.add_argument("--output",
                        type=str,
                        default="data/classifier/",
                        help="Where we write output file")

    flags = parser.parse_args()

    questions = QuestionDatabase(flags.question_db)
    bigram_filename = "%s/bigrams.pkl" % flags.output
    if os.path.exists(bigram_filename):
        bgset = pickle.load(open(bigram_filename, 'rb'))
        print("Using previous bigrams")
    else:
        print("computing bigrams...")
        bgset = compute_frequent_bigrams(flags.bigram_thresh, questions)
        write_bigrams(bgset, bigram_filename)

    train_classifier("%s/%s.pkl" % (flags.output, flags.attribute), bgset,
                     questions, flags.attribute)
    evaluate("%s/%s.pkl" % (flags.output, flags.attribute), bgset, questions,
             flags.attribute)
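The bigram block above follows a common cache-or-compute pattern: reuse a pickled result when present, otherwise compute and persist it. A generic sketch of that pattern (load_or_compute is a hypothetical helper, not part of the project):

import os
import pickle

def load_or_compute(path, compute):
    # Reuse a cached pickle if it exists; otherwise compute and write it.
    if os.path.exists(path):
        with open(path, "rb") as handle:
            return pickle.load(handle)
    result = compute()
    with open(path, "wb") as handle:
        pickle.dump(result, handle)
    return result

bigrams = load_or_compute("/tmp/bigrams.pkl",
                          lambda: {("of", "the"), ("in", "a")})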
Code example #11
File: wikification.py  Project: zhimingz/qb
from util.qdb import QuestionDatabase
from extract_expo_features import add_expo_questions

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--database', type=str, default='data/questions.db')
    parser.add_argument('--expo', type=str, default='')
    parser.add_argument('--min_pages', type=int, default=4)
    parser.add_argument("--output_directory",
                        type=str,
                        default="data/wikifier/data/input/",
                        help="Where we write output file")

    flags = parser.parse_args()

    database = QuestionDatabase(flags.database)

    if flags.database:
        pages = database.questions_with_pages()
    else:
        pages = defaultdict(set)
    if flags.expo:
        add_expo_questions(flags.expo, pages)

    total = 0
    for pp in pages:
        if len(pages[pp]) >= flags.min_pages:
            print(pp, len(pages[pp]))
            for qq in pages[pp]:
                total += 1
                for sentence, word, text in qq.partials():
Code example #12
    parser.add_argument('--guess_db', type=str, default='data/guesses.db',
                        help='Where we write/read the guesses')
    parser.add_argument('--question_db', type=str, default='data/questions.db')
    parser.add_argument('--feature', type=str, default='',
                        help="Which feature we write out")
    parser.add_argument("--granularity", type=str,
                        default="sentence")
    parser.add_argument("--limit", type=int, default=-1,
                        help="How many answer to write to feature files")
    parser.add_argument("--ans_limit", type=int, default=5,
                        help="minimum answer limit")

    flags = parser.parse_args()

    print("Loading database from %s" % flags.question_db)
    questions = QuestionDatabase(flags.question_db)
    guess_list = GuessList(flags.guess_db)

    if flags.guesses:
        # kFEATURES["ir"] = IrExtractor()
        # for cc in kIR_CUTOFFS:
        #     kFEATURES["ir"].add_index("wiki_%i" % cc, "%s_%i" %
        #                               (flags.whoosh_wiki, cc))
        #     kFEATURES["ir"].add_index("qb_%i" % cc, "%s_%i" %
        #                               (flags.whoosh_qb, cc))
        # if kIR_CATEGORIES:
        #     categories = questions.column_options("category")
        #     print("Adding categories %s" % str(categories))
        #     for cc in categories:
        #         kFEATURES["ir"].add_index("wiki_%s" % cc, "%s_%s" %
        #                                   (flags.whoosh_wiki, cc))
Code example #13
File: wikification.py  Project: jankim/qb
from util.qdb import QuestionDatabase
from extract_expo_features import add_expo_questions

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--database", type=str, default="data/questions.db")
    parser.add_argument("--expo", type=str, default="")
    parser.add_argument("--min_pages", type=int, default=4)
    parser.add_argument(
        "--output_directory", type=str, default="data/wikifier/data/input/", help="Where we write output file"
    )

    flags = parser.parse_args()

    database = QuestionDatabase(flags.database)

    if flags.database:
        pages = database.questions_with_pages()
    else:
        pages = defaultdict(set)
    if flags.expo:
        add_expo_questions(flags.expo, pages)

    total = 0
    for pp in pages:
        if len(pages[pp]) >= flags.min_pages:
            print(pp, len(pages[pp]))
            for qq in pages[pp]:
                total += 1
                for sentence, word, text in qq.partials():
Code example #14
File: evaluate_predictions.py  Project: jankim/qb
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--buzzes", type=str, default="", help="Where we write resulting buzzes")
    parser.add_argument("--perf", type=str, default="", help="Where we write performance statistics")
    parser.add_argument("--pred", type=str, default="", help="Where we read predictions")
    parser.add_argument("--meta", type=str, default="", help="Where we read metadata values")
    parser.add_argument("--qbdb", type=str, default="data/questions.db", help="Source of questions")
    parser.add_argument("--vw_config", type=str, default="", help="Configuration of classifier")
    parser.add_argument("--neg_weight", type=float, default=0.0, help="Negative example weight")
    parser.add_argument("--question_out", type=str, default="", help="Where we write out questions for buzzer")
    parser.add_argument("--finals", type=str, default="", help="Where we write out answer after entire question")
    parser.add_argument("--expo", type=str, default="", help="The expo file")

    flags = parser.parse_args()
    qdb = QuestionDatabase(flags.qbdb)

    buzz = DictWriter(open(flags.buzzes, "w"), fieldnames=kBUZZ_OUT)
    buzz.writeheader()

    final_out = DictWriter(open(flags.finals, "w"), fieldnames=["question", "answer"])
    final_out.writeheader()

    # Check file length
    with open(flags.meta) as infile:
        meta_lines = sum(1 for line in infile)
    with open(flags.pred) as infile:
        pred_lines = sum(1 for line in infile)
    assert meta_lines == pred_lines, "Prediction and meta files mismatch " + "(%s: %i vs %s: %i)" % (
        flags.meta,
        meta_lines,
Code example #15
File: client.py  Project: zhimingz/qb
if __name__ == "__main__":
    from util import flags

    flags.define_string("title_index", None, "Pickle of all titles")
    flags.define_string("label_path", None, "Where we write page associations")
    flags.define_string("database", None, "Question database")
    flags.define_string("performance_output", None,
                        "Where we write user performance")
    flags.define_string("user", None, "User identifier")
    flags.InitFlags()

    seen = already_answered(flags.performance_output, flags.user)
    al = ActiveLearner(None, flags.label_path)
    print("Loading question db %s" % flags.database)
    db = QuestionDatabase(flags.database)
    pw = PerformanceWriter(flags.performance_output, flags.user)
    tf = TitleFinder(open(flags.title_index))

    questions = db.questions_by_tournament("High School Championship")
    for qid in questions:
        question = questions[qid]
        if question.fold == "train" or qid in seen:
            continue
        choices = list(tf.query(question.answer))

        # Get what and when the human answered
        wp, idx, ans = get_answer(
            [question.text[x] for x in sorted(question.text)], question.answer,
            question.page)
Code example #16
File: deep.py  Project: zhimingz/qb
    parser.add_argument("--params",
                        default="data/deep/params.pkl",
                        help="Location of parameter pickle")
    parser.add_argument("--vocab",
                        default="data/deep/deep_vocab.pkl",
                        help="Location of vocab pickle")
    parser.add_argument("--ners",
                        default="data/common/ners.pkl",
                        help="Location of NER pickle")
    flags = parser.parse_args()

    import time

    start = time.time()

    questions = QuestionDatabase("data/questions.db")
    page_dict = {}
    for page in questions.get_all_pages():
        page_dict[page.lower().replace(' ', '_')] = page
    ws = DeepExtractor(flags.classifier, flags.params, flags.vocab, flags.ners,
                       page_dict)

    print("Startup: %f sec" % (time.time() - start))

    tests = {}
    tests[u"Tannhäuser (opera)"] = u"""He sought out the pope to
    seek forgiveness of his sins, only to be told that just as the pope's staff
    would never (*) blossom, his sins are never be forgiven. Three days later,
    the pope's staff miraculously bore flowers. For 10 points--identify this
    German folk hero, the subject of an opera by Wagner [VAHG-ner]."""
Code example #17
    args.add_argument('--database',
                      type=str,
                      default='data/questions.db',
                      help='sqlite3 database of questions')
    args.add_argument('--titles',
                      type=str,
                      default='data/wiki_index.pkl',
                      help='page title candidates')
    args.add_argument('--labels',
                      type=str,
                      default='data/map/ans_to_wiki',
                      help='write page assignment answers')
    args = args.parse_args()

    # Open up the database
    d = QuestionDatabase(args.database)
    page_diversity = d.answer_map(normalize)

    # Set up the active learner for writing assignments
    al = ActiveLearner(None, args.labels)
    existing_labels = set(x[0] for x in al.human_labeled())

    # get the candidates we want to assign to pages
    answers = d.unmatched_answers(existing_labels)
    print(list(answers.keys())[:10])

    # Open up the title finder
    tf = TitleFinder(open(args.titles))

    for ans, count in sorted(answers.items(),
                             key=lambda x: sum(x[1].values()),
Code example #18
File: extract_features.py  Project: EntilZha/qb
    parser.add_argument('--guess_db', type=str, default='data/guesses.db',
                        help='Where we write/read the guesses')
    parser.add_argument('--question_db', type=str, default='data/questions.db')
    parser.add_argument('--feature', type=str, default='',
                        help="Which feature we write out")
    parser.add_argument("--granularity", type=str,
                        default="sentence")
    parser.add_argument("--limit", type=int, default=-1,
                        help="How many answer to write to feature files")
    parser.add_argument("--ans_limit", type=int, default=5,
                        help="minimum answer limit")

    flags = parser.parse_args()

    print("Loading database from %s" % flags.question_db)
    questions = QuestionDatabase(flags.question_db)
    guess_list = GuessList(flags.guess_db)

    if flags.guesses:
        # kFEATURES["ir"] = IrExtractor()
        # for cc in kIR_CUTOFFS:
        #     kFEATURES["ir"].add_index("wiki_%i" % cc, "%s_%i" %
        #                               (flags.whoosh_wiki, cc))
        #     kFEATURES["ir"].add_index("qb_%i" % cc, "%s_%i" %
        #                               (flags.whoosh_qb, cc))
        # if kIR_CATEGORIES:
        #     categories = questions.column_options("category")
        #     print("Adding categories %s" % str(categories))
        #     for cc in categories:
        #         kFEATURES["ir"].add_index("wiki_%s" % cc, "%s_%s" %
        #                                   (flags.whoosh_wiki, cc))
Code example #19
            first, rest = ii.split('ID="', 1)
            id, rest = rest.split('" TITLE="', 1)
            title, rest = rest.split('"', 1)
            self.topics[int(id)] = title


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description='Import questions')
    parser.add_argument('--naqt_path', type=str)
    parser.add_argument('--db', type=str, default='data/questions.db')

    flags = parser.parse_args()

    qdb = QuestionDatabase(flags.db)
    conn = qdb._conn
    answer_map = qdb.answer_map()

    # Find existing naqt questions
    c = conn.cursor()
    command = 'SELECT naqt FROM questions WHERE naqt >= 0;'
    c.execute(command)
    existing = set(int(x[0]) for x in c)

    num_skipped = 0
    last_id = kNAQT_START
    if flags.naqt_path:
        for qq in naqt_reader(flags.naqt_path):
            if qq.answer in answer_map and len(answer_map[qq.answer]) == 1:
                page = list(answer_map[qq.answer].keys())[0]
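Taken together, these snippets share one recurring QuestionDatabase pattern: open the sqlite database, group questions by page with questions_with_pages(), and walk selected folds sentence by sentence. A minimal sketch of that loop, using only calls that appear in the examples above (running it requires the qb repository and its data files):

from util.qdb import QuestionDatabase

qdb = QuestionDatabase("data/questions.db")
pages = qdb.questions_with_pages()

# Largest pages first, dev fold only, one sentence at a time.
for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
    for qq in (x for x in pages[pp] if x.fold == "dev"):
        for ss in qq.text_lines():
            print(pp, ss["text"][:60])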