def generate_corpus(dump_file, entity_db, out_file, pool_size, chunk_size):
    dump_reader = WikiDumpReader(dump_file)

    # The extractor is kept in a module-level global so that the worker
    # processes forked by the Pool below can reach it from _process_page.
    global _extractor
    _extractor = WikiExtractor()

    with bz2.BZ2File(out_file, mode='w') as f:
        logger.info('Starting to process Wikipedia dump...')

        with closing(Pool(pool_size)) as pool:
            for paragraphs in pool.imap_unordered(_process_page, dump_reader,
                                                  chunksize=chunk_size):
                for paragraph in paragraphs:
                    para_text = u''
                    cur = 0
                    # Walk the paragraph's links in textual order, replacing each
                    # anchor with a MARKER-prefixed entity token and lowercasing
                    # the plain text in between.
                    for link in sorted(paragraph.wiki_links, key=lambda l: l.span[0]):
                        if link.title.startswith('File:'):
                            continue

                        title = entity_db.resolve_redirect(link.title).replace(u' ', u'_')
                        para_text += paragraph.text[cur:link.span[0]].lower()
                        para_text += u' ' + MARKER + title + u' '
                        cur = link.span[1]

                    para_text += paragraph.text[cur:].lower()

                    f.write(para_text.encode('utf-8') + '\n')

def generate_corpus(dump_file, entity_db, out_file, abstract_db, learn_entity,
                    pool_size, chunk_size):
    dump_reader = WikiDumpReader(dump_file)

    global _extractor
    _extractor = WikiExtractor()

    with bz2.BZ2File(out_file, mode='w') as f:
        click.echo('Processing Wikipedia dump...')

        with closing(Pool(pool_size)) as pool:
            for paragraphs in pool.imap_unordered(_process_page, dump_reader,
                                                  chunksize=chunk_size):
                for paragraph in paragraphs:
                    para_words = paragraph.words

                    if learn_entity:
                        # Splice a MARKER-prefixed entity token in place of each
                        # linked anchor, keeping the surrounding words as-is.
                        words = []
                        cur = 0
                        for link in sorted(paragraph.wiki_links,
                                           key=lambda l: l.span[0]):
                            title = entity_db.resolve_redirect(link.title).replace(u' ', u'_')
                            words += para_words[cur:link.span[0]]
                            words.append(MARKER + title)
                            cur = link.span[1]

                        words += para_words[cur:]
                    else:
                        words = para_words

                    f.write(u' '.join(words).encode('utf-8') + '\n')

        if abstract_db is not None:
            click.echo('Processing paragraphs in Abstract DB...')
            tokenizer = RegexpTokenizer()

            for value in abstract_db.itervalues():
                para_text = value['text']
                links = value['links']

                if learn_entity:
                    # Process the links right-to-left so that character offsets
                    # into para_text stay valid while the word list is built up
                    # from the end of the paragraph toward the beginning.
                    cur = len(para_text)
                    words = []
                    for (text, title, span) in sorted(links, key=lambda l: l[2][0],
                                                      reverse=True):
                        words = ([MARKER + entity_db.resolve_redirect(title).replace(u' ', u'_')] +
                                 [t.text.lower() for t in tokenizer.tokenize(para_text[span[1]:cur])] +
                                 words)
                        cur = span[0]

                    words = ([t.text.lower() for t in tokenizer.tokenize(para_text[:cur])] +
                             words)
                else:
                    words = [t.text.lower() for t in tokenizer.tokenize(para_text)]

                f.write(u' '.join(words).encode('utf-8') + '\n')

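# Both generate_corpus variants above hand each dump page to a module-level
# _process_page worker that is defined elsewhere in the repository and not part
# of this excerpt. The sketch below is only a guess at its shape, to illustrate
# the pattern of reading the _extractor global from forked Pool workers; the
# extract_paragraphs method name is hypothetical and not confirmed by this code.
def _process_page(page):
    # _extractor is the module-level global set before the Pool is created,
    # so forked worker processes inherit it without pickling.
    if page.is_redirect:
        return []
    return _extractor.extract_paragraphs(page)  # hypothetical method name
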
def train_embedding(dump_file, dic_file, out_file, **kwargs):
    from wiki_dump_reader import WikiDumpReader

    dump_reader = WikiDumpReader(dump_file)
    dictionary = Dictionary.load(dic_file)

    train_kwargs = dict(parallel=kwargs.pop('parallel'),
                        pool_size=kwargs.pop('pool_size'),
                        chunk_size=kwargs.pop('chunk_size'))

    ent_vec = EntityVector(dictionary, **kwargs)
    ent_vec.train(dump_reader, **train_kwargs)
    ent_vec.save(out_file)

def build(dump_file, pool_size, chunk_size):
    dump_reader = WikiDumpReader(dump_file)

    global _extractor
    _extractor = WikiExtractor()

    titles = []
    redirects = {}
    title_counter = Counter()

    # First pass: collect page titles, redirect pairs, and raw in-link counts.
    with closing(Pool(pool_size)) as pool:
        for (page, links) in pool.imap_unordered(_process_page, dump_reader,
                                                 chunksize=chunk_size):
            titles.append(page.title)
            if page.is_redirect:
                redirects[page.title] = page.redirect

            for link_obj in links:
                title_counter[link_obj.title] += 1

    title_dict = Trie(titles)

    # Map each redirect source title to the index of its destination title.
    redirect_items = []
    for (title, dest_title) in redirects.iteritems():
        if dest_title in title_dict:
            redirect_items.append((title, (title_dict[dest_title],)))

    redirect_dict = RecordTrie('<I', redirect_items)

    # Fold counts recorded under a redirect title into its destination title.
    for (title, count) in title_counter.items():
        dest_obj = redirect_dict.get(title)
        if dest_obj is not None:
            title_counter[title_dict.restore_key(dest_obj[0][0])] += count
            del title_counter[title]

    inlink_arr = np.zeros(len(title_dict), dtype=np.int)
    for (title, count) in title_counter.items():
        title_index = title_dict.get(title)
        if title_index is not None:
            inlink_arr[title_index] = count

    return EntityDB(title_dict, redirect_dict, inlink_arr)

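# The Trie / RecordTrie usage above matches the marisa_trie API; assuming that
# is the library in use, the standalone snippet below illustrates the lookup
# pattern (the titles are made-up sample data, not taken from this code).
import marisa_trie

sample_titles = marisa_trie.Trie([u'Tokyo', u'Kyoto'])
# Each redirect maps a source title to a one-element record holding the integer
# id of its destination title, packed with the '<I' struct format.
sample_redirects = marisa_trie.RecordTrie(
    '<I', [(u'Tokio', (sample_titles[u'Tokyo'],))])

record = sample_redirects.get(u'Tokio')             # [(id,)] if present, else None
if record is not None:
    print(sample_titles.restore_key(record[0][0]))  # prints: Tokyo
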
def build(dataset, entity_db, dump_file, out_file, category, pool_size, chunk_size):
    global _extractor

    dump_reader = WikiDumpReader(dump_file)
    _extractor = WikiExtractor(entity_db)

    if category == 'pro':
        kb_data = dataset.profession_kb
        click.echo('Category: Profession')
    elif category == 'nat':
        kb_data = dataset.nationality_kb
        click.echo('Category: Nationality')
    else:
        raise RuntimeError('Invalid category')

    # Only pages whose redirect-resolved titles appear in the knowledge base
    # are kept.
    target_titles = frozenset(
        [entity_db.resolve_redirect(title) for (title, _) in kb_data])

    paragraph_buf = {}
    link_buf = defaultdict(list)

    with closing(Pool(pool_size)) as pool:
        for (title, paragraphs) in pool.imap_unordered(_process_page, dump_reader,
                                                       chunksize=chunk_size):
            if title in target_titles:
                paragraph_buf[title] = paragraphs

            # Record incoming links that point at any target page.
            for paragraph in paragraphs:
                for link in paragraph.wiki_links:
                    if link.title in target_titles:
                        link_buf[link.title].append(WikiLink(title, link.text, 'in'))

    with closing(PageDB(out_file, protocol=-1)) as db:
        with click.progressbar(paragraph_buf.iteritems(),
                               length=len(paragraph_buf)) as bar:
            for (title, paragraphs) in bar:
                db[title.encode('utf-8')] = dict(paragraphs=paragraphs,
                                                 in_links=link_buf[title])

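# The WikiLink records stored above are constructed with three positional
# arguments; a minimal sketch of such a container is shown below. The field
# names, and in particular the reading of the third field as a link direction,
# are guesses based only on the call WikiLink(title, link.text, 'in') and are
# not confirmed by this excerpt.
from collections import namedtuple

WikiLink = namedtuple('WikiLink', ('title', 'text', 'direction'))  # guessed fields
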
def build_dictionary(dump_file, out_file, **kwargs):
    from wiki_dump_reader import WikiDumpReader

    dump_reader = WikiDumpReader(dump_file)
    dictionary = Dictionary.build(dump_reader, **kwargs)
    dictionary.save(out_file)
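# A minimal usage sketch of the two entry points above: build the dictionary
# first, then train the embedding from it. The file names and the values of
# parallel, pool_size, and chunk_size are hypothetical, and the extra keyword
# arguments accepted by Dictionary.build and EntityVector are omitted here on
# the assumption that they have defaults.
if __name__ == '__main__':
    build_dictionary('enwiki-pages-articles.xml.bz2', 'enwiki.dic')
    train_embedding('enwiki-pages-articles.xml.bz2', 'enwiki.dic',
                    'entity_vectors.out',
                    parallel=True, pool_size=4, chunk_size=100)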