Example #1
    def __init__(self, db, min_pages, lm="data/kenlm.arpa"):
        self._name = "mentions"
        self._refex_count = defaultdict(int)
        self._refex_lookup = defaultdict(set)

        # Get all of the answers
        answers = set(x for x, y in text_iterator(False, "", False, db,
                                                  False, "", limit=-1,
                                                  min_pages=min_pages))
        self.generate_refexs(answers)

        self._text = ""
        self._lm = kenlm.LanguageModel(lm)
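
A note on the kenlm call above: kenlm.LanguageModel(path) loads an ARPA (or binary) model and model.score(text) returns a log10 probability, which is all this feature extractor needs. The helper below is only an illustrative sketch of turning that score into a per-token value; the function name and the normalization are assumptions, not project code.

import kenlm

# Illustrative sketch, not part of the snippet above: load a trained KenLM
# model and return a length-normalized log10 score for a candidate sentence.
def score_mention(model_path, sentence):
    model = kenlm.LanguageModel(model_path)   # real kenlm API
    tokens = sentence.lower().split()
    return model.score(" ".join(tokens)) / max(len(tokens), 1)

# Example usage (assumes the ARPA file from the snippet exists):
# print(score_mention("data/kenlm.arpa", "george washington crossed the delaware"))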
Example #2
File: lm.py  Project: zhimingz/qb
    # Remove QB as part of the training to prevent overfitting in VW
    #
    # TODO: make it so that question counts are removed in generating features
    # on train data
    for corpus, qb, wiki in [
        ("wiki", False, True),
        ("qb", True, False),
    ]:
        num_docs = 0
        lm = {}
        background = defaultdict(int)
        # Build the vocabulary
        for title, text in text_iterator(wiki,
                                         flags.wiki_location,
                                         qb,
                                         flags.question_db,
                                         flags.max_pages,
                                         min_pages=min_answers):
            num_docs += 1
            if title not in lm:
                lm[title] = \
                    JelinekMercerLanguageModel(flags.vocab_size,
                                               normalize_function=
                                               lambda x: unidecode(x.lower()))

            for tt in lm[title].tokenize_without_censor(text):
                background[tt] += 1

        for ii in xrange(flags.global_lms):
            lm[ii] = \
                JelinekMercerLanguageModel(flags.vocab_size,
                                           normalize_function=
                                           lambda x: unidecode(x.lower()))
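
JelinekMercerLanguageModel is defined inside the qb project and its full interface is not shown here. As a rough sketch of the underlying idea only, Jelinek-Mercer smoothing interpolates a per-document maximum-likelihood estimate with the background distribution that the loop above is accumulating; everything below (class name, interpolation weight) is an assumption for illustration.

from collections import defaultdict

# Illustrative Jelinek-Mercer interpolation, not the project's class.
class JelinekMercerSketch(object):
    def __init__(self, interpolation=0.8):
        self._lambda = interpolation      # weight on the document model
        self._counts = defaultdict(int)   # per-document token counts
        self._total = 0

    def add_count(self, token):
        self._counts[token] += 1
        self._total += 1

    def probability(self, token, background_prob):
        # P(w) = lambda * P_ml(w | doc) + (1 - lambda) * P_background(w)
        ml = float(self._counts[token]) / self._total if self._total else 0.0
        return self._lambda * ml + (1.0 - self._lambda) * background_prob

A higher interpolation weight trusts the per-title counts more; a lower one leans on the corpus-wide background counts.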
Example #3
File: lm.py  Project: jankim/qb
    combined = LanguageModel(flags.global_lms)

    min_answers = flags.min_answers
    print("Training language model with pages that appear more than %i times" % min_answers)

    # Remove QB as part of the training to prevent overfitting in VW
    #
    # TODO: make it so that question counts are removed in generating features
    # on train data
    for corpus, qb, wiki in [("wiki", False, True), ("qb", True, False)]:
        num_docs = 0
        lm = {}
        background = defaultdict(int)
        # Build the vocabulary
        for title, text in text_iterator(
            wiki, flags.wiki_location, qb, flags.question_db, flags.max_pages, min_pages=min_answers
        ):
            num_docs += 1
            if title not in lm:
                lm[title] = JelinekMercerLanguageModel(
                    flags.vocab_size, normalize_function=lambda x: unidecode(x.lower())
                )

            for tt in lm[title].tokenize_without_censor(text):
                background[tt] += 1

        for ii in xrange(flags.global_lms):
            lm[ii] = JelinekMercerLanguageModel(flags.vocab_size, normalize_function=lambda x: unidecode(x.lower()))

        # Create the vocabulary
        vocab = None
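
The snippet cuts off right where the shared vocabulary would be built (vocab = None). Purely as an assumption about that step, one plausible way to cap the vocabulary at flags.vocab_size tokens and normalize the background counts into a distribution:

# Illustrative only; not the project's actual vocabulary-building code.
def build_vocab(background, vocab_size):
    most_common = sorted(background.items(), key=lambda kv: kv[1], reverse=True)
    vocab = set(token for token, _ in most_common[:vocab_size])
    total = float(sum(background[token] for token in vocab))
    background_dist = dict((token, background[token] / total) for token in vocab)
    return vocab, background_dist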
Example #4
                        default="/Volumes/Documents/research_data/wikisource/en/*/*",
                        help="Location of wiki cache")
    parser.add_argument("--plot_location", type=str,
                        default="/Volumes/Documents/research_data/plots/*",
                        help="Location of plot summaries")
    parser.add_argument("--min_answers", type=int, default=0,
                        help="How many times does an answer need to appear to be included")
    parser.add_argument("--output_path", type=str, default="data/source",
                        help="How many pages to add to the index")
    flags = parser.parse_args()

    # Get the pages that we want to use

    answers = set(title for title, text
                  in text_iterator(False, "",
                                   True, flags.question_db,
                                   False, "",
                                   -1, min_pages=flags.min_answers))

    pages = defaultdict(str)
    for ii in glob(flags.plot_location):
        text = unidecode(gzip.open(ii, 'r').read())
        pages[ii.split("/")[-1].replace(".txt.gz", "")] = text

    print(pages.keys()[:5], pages[pages.keys()[0]][:60])

    for ii in glob(flags.wiki_location):
        for jj, tt, cc in read_wiki(ii):
            match = match_page(tt, answers)
            if match:
                pages[unidecode(match)] += "\n\n\n"
                pages[unidecode(match)] += unidecode(cc)
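
match_page and read_wiki are helpers defined elsewhere in the project and are not shown on this page. A minimal, hypothetical sketch of the kind of title normalization match_page presumably performs when mapping a wiki title onto the answer set (names and behaviour below are assumptions):

from unidecode import unidecode

# Hypothetical stand-in for match_page: normalize titles once, then look
# wiki titles up against the known answers.
def build_title_lookup(answers):
    return dict((unidecode(a).strip().lower().replace(" ", "_"), a)
                for a in answers)

def match_title(title, lookup):
    return lookup.get(unidecode(title).strip().lower().replace(" ", "_"))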