Code example #1
File: lm.py Project: jankim/qb
def choose_jm(lm, params, qb_location, num_globals):
    """Pick the Jelinek-Mercer interpolation weight with the highest total
    log-likelihood on dev-fold sentences (needs defaultdict and math.isnan)."""
    qdb = QuestionDatabase(qb_location)

    pages = qdb.questions_with_pages()
    scores = defaultdict(float)
    for ll in params:
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            compare = (hash(pp) + 1) % num_globals
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                for ss in qq.text_lines():
                    lm[compare].set_jm_interp(ll)
                    text = list(lm[compare].tokenize_and_censor(ss["text"]))
                    try:
                        val = lm[compare].ll(text)
                    except OverflowError:
                        val = float("nan")
                    if isnan(val):
                        continue
                    else:
                        scores[ll] += val

    print(scores, max(scores.values()))

    # Return the parameter whose accumulated log-likelihood is highest
    return max(scores, key=scores.get)
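The selection logic above is a grid search: accumulate one score per candidate weight over all dev sentences, then take the argmax. A minimal self-contained sketch of that pattern, with a made-up scoring function standing in for the language-model log-likelihood:

    from collections import defaultdict

    def pick_best(params, items, score_fn):
        # Accumulate a total score per candidate parameter, as choose_jm does
        scores = defaultdict(float)
        for p in params:
            for item in items:
                scores[p] += score_fn(p, item)
        # Argmax over the accumulated totals
        return max(scores, key=scores.get)

    # Toy scorer peaking at 0.5, so this prints 0.5
    print(pick_best([0.1, 0.5, 0.9], ["any", "dev", "sentences"],
                    lambda p, s: -abs(p - 0.5)))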
Code example #2
File: lm.py Project: zhimingz/qb
def choose_jm(lm, params, qb_location, num_globals):
    qdb = QuestionDatabase(qb_location)

    pages = qdb.questions_with_pages()
    scores = defaultdict(float)
    for ll in params:
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            compare = (hash(pp) + 1) % num_globals
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                for ss in qq.text_lines():
                    lm[compare].set_jm_interp(ll)
                    text = list(lm[compare].tokenize_and_censor(ss["text"]))
                    try:
                        val = lm[compare].ll(text)
                    except OverflowError:
                        val = float("nan")
                    if isnan(val):
                        continue
                    else:
                        scores[ll] += val

    print(scores, max(scores.values()))

    # Return the parameter whose accumulated log-likelihood is highest
    return max(scores, key=scores.get)
Code example #3
File: lm.py Project: zhimingz/qb
    def verbose(self, qb_location):
        """Print per-sentence and per-n-gram language-model scores for
        dev-fold questions, for manual inspection."""
        qdb = QuestionDatabase(qb_location)
        pages = qdb.questions_with_pages()
        import time

        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            need_title = True
            compare = (hash(pp) + 1) % self._globals
            for corpus in self._lm:
                if pp not in self._lm[corpus]:
                    continue

                for qq in [x for x in pages[pp] if x.fold == "dev"]:
                    if need_title:
                        print("--------------\t%s\t--------------" % pp)
                        need_title = False
                    for ss in qq.text_lines():
                        self.set_metadata(qq.page, qq.category, qq.qnum,
                                          ss["sent"], 0, None, qq.fold)
                        start = time.time()
                        print("===============\t%s\t===============" % corpus)
                        print(self.vw_from_title(pp, ss["text"]))
                        text = list(self._lm[corpus][0].tokenize_and_censor(
                            ss["text"]))
                        sent = self._lm[corpus][pp].mean_ll(text)
                        background = self._lm[corpus][compare].mean_ll(text)
                        score = self.text_score(corpus, pp, text)
                        print("sent: ([%f - %f] - %f) / %f = %f" %
                              (sent, background, self._sent_mean[corpus],
                               self._sent_var[corpus], score))

                        for cc in self._lm[corpus][pp].ngram_chains(text):
                            ngram_score = self.ngram_score(corpus, pp, cc)
                            vv = self._lm[corpus][pp].mean_ll(cc)
                            background = self._lm[corpus][compare].mean_ll(cc)
                            print("ngram, %s: ([%f - %f] - %f) / %f = %f" %
                                  (display_ngram(cc), vv, background,
                                   self._ngram_mean[corpus][len(cc)],
                                   self._ngram_var[corpus][len(cc)],
                                   ngram_score))
                            print(
                                list(x if x in
                                     self._lm[corpus][compare]._vocab else None
                                     for x in cc))
                        print("TIME: %f" % (time.time() - start))
Code example #4
File: lm.py Project: jankim/qb
    def verbose(self, qb_location):
        qdb = QuestionDatabase(qb_location)
        pages = qdb.questions_with_pages()
        import time

        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            need_title = True
            compare = (hash(pp) + 1) % self._globals
            for corpus in self._lm:
                if pp not in self._lm[corpus]:
                    continue

                for qq in [x for x in pages[pp] if x.fold == "dev"]:
                    if need_title:
                        print("--------------\t%s\t--------------" % pp)
                        need_title = False
                    for ss in qq.text_lines():
                        self.set_metadata(qq.page, qq.category, qq.qnum, ss["sent"], 0, None, qq.fold)
                        start = time.time()
                        print("===============\t%s\t===============" % corpus)
                        print(self.vw_from_title(pp, ss["text"]))
                        text = list(self._lm[corpus][0].tokenize_and_censor(ss["text"]))
                        sent = self._lm[corpus][pp].mean_ll(text)
                        background = self._lm[corpus][compare].mean_ll(text)
                        score = self.text_score(corpus, pp, text)
                        print(
                            "sent: ([%f - %f] - %f) / %f = %f"
                            % (sent, background, self._sent_mean[corpus], self._sent_var[corpus], score)
                        )

                        for cc in self._lm[corpus][pp].ngram_chains(text):
                            ngram_score = self.ngram_score(corpus, pp, cc)
                            vv = self._lm[corpus][pp].mean_ll(cc)
                            background = self._lm[corpus][compare].mean_ll(cc)
                            print(
                                "ngram, %s: ([%f - %f] - %f) / %f = %f"
                                % (
                                    display_ngram(cc),
                                    vv,
                                    background,
                                    self._ngram_mean[corpus][len(cc)],
                                    self._ngram_var[corpus][len(cc)],
                                    ngram_score,
                                )
                            )
                            print(list(x if x in self._lm[corpus][compare]._vocab else None for x in cc))
                        print("TIME: %f" % (time.time() - start))
Code example #5
File: lm.py Project: jankim/qb
    def _set_stats(self, corpus, lm, qb_location, max_pages):
        """Estimate the mean and variance of sentence- and n-gram-level
        log-likelihood deltas (page model minus background model) on the
        dev fold; verbose() uses these to normalize its scores."""
        sents = []
        ngrams = defaultdict(list)

        qdb = QuestionDatabase(qb_location)
        pages = qdb.questions_with_pages()

        print("Computing stats for %s from %i pages ..." % (corpus, max_pages))
        page_count = 0
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            compare = (hash(pp) + 1) % self._globals
            page_count += 1
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                if max_pages > 0 and page_count > max_pages:
                    break  # only exits this page's question loop
                if page_count % 34 == 0:
                    print("%i\t%s" % (page_count, pp))
                for ss in qq.text_lines():
                    if pp in lm:
                        text = list(lm[pp].tokenize_and_censor(ss["text"]))
                        sents.append(lm[pp].mean_ll(text) - lm[compare].mean_ll(text))

                        for cc in lm[pp].ngram_chains(text):
                            ngrams[len(cc)].append(lm[pp].mean_ll(cc) - lm[compare].mean_ll(cc))
        print("done")

        print("Sents", sents[:10])
        self._sent_mean[corpus] = mean(sents)
        self._sent_var[corpus] = var(sents)

        print("Ngrams", ngrams[2][:10])
        for ii in ngrams:
            self._ngram_mean[corpus][ii] = mean(list(x for x in ngrams[ii] if x > self._threshold))
            self._ngram_var[corpus][ii] = var(list(x for x in ngrams[ii] if x > self._threshold))

        print(
            "Stats for %s: SM: %f, SV: %f, NM: %f, NV: %f"
            % (
                corpus,
                self._sent_mean[corpus],
                self._sent_var[corpus],
                self._ngram_mean[corpus][2],
                self._ngram_var[corpus][2],
            )
        )
Code example #6
File: lm.py Project: zhimingz/qb
    def _set_stats(self, corpus, lm, qb_location, max_pages):
        sents = []
        ngrams = defaultdict(list)

        qdb = QuestionDatabase(qb_location)
        pages = qdb.questions_with_pages()

        print("Computing stats for %s from %i pages ..." % (corpus, max_pages))
        page_count = 0
        for pp in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
            compare = (hash(pp) + 1) % self._globals
            page_count += 1
            for qq in [x for x in pages[pp] if x.fold == "dev"]:
                if max_pages > 0 and page_count > max_pages:
                    break
                if page_count % 34 == 0:
                    print("%i\t%s" % (page_count, pp))
                for ss in qq.text_lines():
                    if pp in lm:
                        text = list(lm[pp].tokenize_and_censor(ss["text"]))
                        sents.append(lm[pp].mean_ll(text) -
                                     lm[compare].mean_ll(text))

                        for cc in lm[pp].ngram_chains(text):
                            ngrams[len(cc)].append(lm[pp].mean_ll(cc) -
                                                   lm[compare].mean_ll(cc))
        print("done")

        print("Sents", sents[:10])
        self._sent_mean[corpus] = mean(sents)
        self._sent_var[corpus] = var(sents)

        print("Ngrams", ngrams[2][:10])
        for ii in ngrams:
            self._ngram_mean[corpus][ii] = mean(
                list(x for x in ngrams[ii] if x > self._threshold))
            self._ngram_var[corpus][ii] = var(
                list(x for x in ngrams[ii] if x > self._threshold))

        print("Stats for %s: SM: %f, SV: %f, NM: %f, NV: %f" %
              (corpus, self._sent_mean[corpus], self._sent_var[corpus],
               self._ngram_mean[corpus][2], self._ngram_var[corpus][2]))
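A self-contained toy of the statistics being collected, with made-up deltas; mean and var are assumed to be NumPy's, since the excerpt does not show the imports:

    from numpy import mean, var

    deltas = [0.8, 1.3, 0.4, 2.1, 0.9]          # made-up sentence-level deltas
    threshold = 0.5                              # stand-in for self._threshold
    print("sent mean/var:", mean(deltas), var(deltas))
    kept = [x for x in deltas if x > threshold]  # n-gram stats drop small deltas
    print("ngram mean/var:", mean(kept), var(kept))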
Code example #7
File: extract_features.py Project: EntilZha/qb
        # if kIR_CATEGORIES:
        #     categories = questions.column_options("category")
        #     print("Adding categories %s" % str(categories))
        #     for cc in categories:
        #         kFEATURES["ir"].add_index("wiki_%s" % cc, "%s_%s" %
        #                                   (flags.whoosh_wiki, cc))
        #         kFEATURES["ir"].add_index("qb_%s" % cc, "%s_%s" %
        #                                   (flags.whoosh_qb, cc))


        kFEATURES["deep"] = instantiate_feature("deep", questions)
        # features_that_guess = set(kFEATURES[x] for x in kHAS_GUESSES)
        features_that_guess = {"deep": kFEATURES["deep"]}
        print("Guesses %s" % "\t".join(x for x in features_that_guess))

        all_questions = questions.questions_with_pages()

        page_num = 0
        for page in all_questions:
            if len(all_questions[page]) < flags.ans_limit:
                continue
            else:
                print("%s\t%i" % (page, len(all_questions[page])))
                question_num = 0
                page_num += 1
                for qq in all_questions[page]:
                    # We don't need guesses for train questions
                    if qq.fold == "train":
                        continue
                    question_num += 1
                    guesses = guesses_for_question(qq, features_that_guess,
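The excerpt cuts off mid-call, but the surviving control flow is clear: keep only pages with at least flags.ans_limit questions, then generate guesses for everything except the train fold. A toy of that filter with made-up data:

    all_questions = {"Paris": ["dev", "train", "dev"], "Oslo": ["dev"]}
    ans_limit = 2  # stand-in for flags.ans_limit
    for page, folds in all_questions.items():
        if len(folds) < ans_limit:
            continue  # too few questions for this answer page
        for fold in folds:
            if fold == "train":
                continue  # no guesses needed for train questions
            print(page, fold)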
Code example #8
File: evaluate_predictions.py Project: zhimingz/qb
                    "page": guess.strip(),
                    "evidence": "",
                    "final": final,
                    "weight": weight
                }
                buzz.writerow(d)
            if any_buzz:
                break

        final_answer = final_guess(positions)
        d = {"question": question_id, "answer": final_answer}
        final_out.writerow(d)

    # Write out the questions

    questions = qdb.questions_with_pages()
    if flags.question_out:
        question_out = DictWriter(open(flags.question_out, 'w'), kQUES_OUT)
        question_out.writeheader()
    perf_out = DictWriter(open(flags.perf, 'w'), fieldnames=kPERF_OUT)
    perf_out.writeheader()
    for pp in questions:
        for qq in questions[pp]:
            if qq.qnum in questions_with_buzzes:
                # Write text for buzzer
                if flags.question_out:
                    for ll in qq.text_lines():
                        question_out.writerow(ll)

                # Write performance
                d = questions_with_buzzes[qq.qnum]
Code example #9
File: wikification.py Project: zhimingz/qb
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('--database', type=str, default='data/questions.db')
    parser.add_argument('--expo', type=str, default='')
    parser.add_argument('--min_pages', type=int, default=4)
    parser.add_argument("--output_directory",
                        type=str,
                        default="data/wikifier/data/input/",
                        help="Where we write output file")

    flags = parser.parse_args()

    database = QuestionDatabase(flags.database)

    if flags.database:
        pages = database.questions_with_pages()
    else:
        pages = defaultdict(set)
    if flags.expo:
        add_expo_questions(flags.expo, pages)

    total = 0
    for pp in pages:
        if len(pages[pp]) >= flags.min_pages:
            print(pp, len(pages[pp]))
            for qq in pages[pp]:
                total += 1
                for sentence, word, text in qq.partials():
                    sentence = sentence - 1
                    with open("%s/%i-%i.txt" %
                              (flags.output_directory, qq.qnum, sentence),
                              'w') as output:
                        output.write("%s\n" % unidecode(text[sentence]))
Code example #10
File: wikification.py Project: jankim/qb
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--database", type=str, default="data/questions.db")
    parser.add_argument("--expo", type=str, default="")
    parser.add_argument("--min_pages", type=int, default=4)
    parser.add_argument(
        "--output_directory", type=str, default="data/wikifier/data/input/", help="Where we write output file"
    )

    flags = parser.parse_args()

    database = QuestionDatabase(flags.database)

    if flags.database:
        pages = database.questions_with_pages()
    else:
        pages = defaultdict(set)
    if flags.expo:
        add_expo_questions(flags.expo, pages)

    total = 0
    for pp in pages:
        if len(pages[pp]) >= flags.min_pages:
            print(pp, len(pages[pp]))
            for qq in pages[pp]:
                total += 1
                for sentence, word, text in qq.partials():
                    sentence = sentence - 1
                    with open("%s/%i-%i.txt" % (flags.output_directory, qq.qnum, sentence), "w") as output:
                        output.write("%s\n" % unidecode(text[sentence]))
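The loop writes one file per (question number, zero-based sentence index) into the wikifier input directory. A tiny sketch of the resulting path, using the --output_directory default and made-up identifiers:

    import os

    output_directory = "data/wikifier/data/input/"  # the --output_directory default
    qnum, sentence = 1217, 0                        # hypothetical identifiers
    print(os.path.join(output_directory, "%i-%i.txt" % (qnum, sentence)))
    # -> data/wikifier/data/input/1217-0.txt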
Code example #11
File: evaluate_predictions.py Project: jankim/qb
                    "page": guess.strip(),
                    "evidence": "",
                    "final": final,
                    "weight": weight,
                }
                buzz.writerow(d)
            if any_buzz:
                break

        final_answer = final_guess(positions)
        d = {"question": question_id, "answer": final_answer}
        final_out.writerow(d)

    # Write out the questions

    questions = qdb.questions_with_pages()
    if flags.question_out:
        question_out = DictWriter(open(flags.question_out, "w"), kQUES_OUT)
        question_out.writeheader()
    perf_out = DictWriter(open(flags.perf, "w"), fieldnames=kPERF_OUT)
    perf_out.writeheader()
    for pp in questions:
        for qq in questions[pp]:
            if qq.qnum in questions_with_buzzes:
                # Write text for buzzer
                if flags.question_out:
                    for ll in qq.text_lines():
                        question_out.writerow(ll)

                # Write performance
                d = questions_with_buzzes[qq.qnum]
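Both evaluate_predictions excerpts rely on csv.DictWriter with field-name constants (kQUES_OUT, kPERF_OUT) defined elsewhere in the file. A minimal self-contained sketch of the pattern, with hypothetical field names:

    from csv import DictWriter

    fields = ["question", "answer"]  # stand-in for kPERF_OUT and friends
    with open("final_answers.csv", "w", newline="") as handle:
        out = DictWriter(handle, fieldnames=fields)
        out.writeheader()
        out.writerow({"question": 42, "answer": "Oryx_and_Crake"})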