def __init__(self, db, min_pages, lm="data/kenlm.arpa"):
    self._name = "mentions"
    self._refex_count = defaultdict(int)
    self._refex_lookup = defaultdict(set)

    # Get all of the answers
    answers = set(x for x, y in text_iterator(False, "", False, db,
                                              False, "", limit=-1,
                                              min_pages=min_pages))
    self.generate_refexs(answers)

    self._text = ""
    # Load the kenlm language model from the path given by the lm argument
    self._lm = kenlm.LanguageModel(lm)
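# For context, a minimal sketch of how the loaded kenlm model can be used to
# score candidate mention strings; the score_candidates helper and the example
# phrases are illustrative assumptions, not part of the original class.
import kenlm

def score_candidates(model, candidates):
    # kenlm's score() returns the log10 probability of the whole sequence,
    # including begin/end-of-sentence transitions by default, so higher is better.
    return sorted(candidates, key=lambda text: model.score(text), reverse=True)

# Hypothetical usage (assumes data/kenlm.arpa exists on disk):
# model = kenlm.LanguageModel("data/kenlm.arpa")
# print(score_candidates(model, ["battle of hastings", "of battle the hastings"]))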
combined = LanguageModel(flags.global_lms)

min_answers = flags.min_answers
print("Training language model with pages that appear more than %i times"
      % min_answers)

# Remove QB as part of the training to prevent overfitting in VW
#
# TODO: make it so that question counts are removed in generating features
# on train data
for corpus, qb, wiki in [("wiki", False, True), ("qb", True, False)]:
    num_docs = 0
    lm = {}
    background = defaultdict(int)

    # Build the vocabulary
    for title, text in text_iterator(wiki, flags.wiki_location,
                                     qb, flags.question_db,
                                     flags.max_pages,
                                     min_pages=min_answers):
        num_docs += 1

        if title not in lm:
            lm[title] = JelinekMercerLanguageModel(
                flags.vocab_size,
                normalize_function=lambda x: unidecode(x.lower()))

        for tt in lm[title].tokenize_without_censor(text):
            background[tt] += 1

    for ii in xrange(flags.global_lms):
        lm[ii] = JelinekMercerLanguageModel(
            flags.vocab_size,
            normalize_function=lambda x: unidecode(x.lower()))

    # Create the vocabulary
    vocab = None
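# For reference, Jelinek-Mercer smoothing interpolates a per-document
# maximum-likelihood estimate with the corpus-wide background distribution
# accumulated above.  The standalone sketch below illustrates that
# interpolation; it is not the project's JelinekMercerLanguageModel, and the
# interpolation weight is an arbitrary illustrative value.
from collections import defaultdict

def jm_probability(word, doc_counts, background_counts, interpolation=0.8):
    # P(w | doc) = lambda * P_ML(w | doc) + (1 - lambda) * P_background(w)
    doc_total = float(sum(doc_counts.values())) or 1.0
    bg_total = float(sum(background_counts.values())) or 1.0
    p_doc = doc_counts.get(word, 0) / doc_total
    p_bg = background_counts.get(word, 0) / bg_total
    return interpolation * p_doc + (1.0 - interpolation) * p_bg

# Toy usage with made-up counts
doc = {"battle": 3, "hastings": 2}
background = defaultdict(int, {"battle": 40, "hastings": 5, "the": 1000})
print(jm_probability("hastings", doc, background))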
default="/Volumes/Documents/research_data/wikisource/en/*/*", help="Location of wiki cache") parser.add_argument("--plot_location", type=str, default="/Volumes/Documents/research_data/plots/*", help="Location of plot summaries") parser.add_argument("--min_answers", type=int, default=0, help="How many times does an answer need to appear to be included") parser.add_argument("--output_path", type=str, default="data/source", help="How many pages to add to the index") flags = parser.parse_args() # Get the pages that we want to use answers = set(title for title, text in text_iterator(False, "", True, flags.question_db, False, "", -1, min_pages=flags.min_answers)) pages = defaultdict(str) for ii in glob(flags.plot_location): text = unidecode(gzip.open(ii, 'r').read()) pages[ii.split("/")[-1].replace(".txt.gz", "")] = text print(pages.keys()[:5], pages[pages.keys()[0]][:60]) for ii in glob(flags.wiki_location): for jj, tt, cc in read_wiki(ii): match = match_page(tt, answers) if match: pages[unidecode(match)] += "\n\n\n" pages[unidecode(match)] += unidecode(cc)