def build(dump_file, pool_size, chunk_size):
    """Build an ``EntityDB`` from a wiki dump.

    Pages read from ``WikiDumpReader(dump_file)`` are processed by
    ``_process_page`` in a pool of worker processes. From the results this
    function builds:

    * a trie of all normalized page titles,
    * a record trie mapping each redirect title to the trie id of its
      destination title, and
    * an integer array holding, for each title id, how many links point at
      that title (links to a redirect are credited to its destination).

    Args:
        dump_file: Passed unchanged to ``WikiDumpReader``.
        pool_size: Number of worker processes in the pool.
        chunk_size: ``chunksize`` handed to ``Pool.imap_unordered``.

    Returns:
        ``EntityDB(title_dict, redirect_dict, inlink_arr)``.
    """
    dump_reader = WikiDumpReader(dump_file)

    # The extractor is stored in a module global before the pool is created;
    # presumably so that ``_process_page`` can reach it inside the workers --
    # TODO confirm against ``_process_page``.
    global _extractor
    _extractor = WikiExtractor()

    titles = []
    redirects = {}
    title_counter = Counter()

    with closing(Pool(pool_size)) as pool:
        for (page, links) in pool.imap_unordered(
            _process_page, dump_reader, chunksize=chunk_size
        ):
            page_title = normalize(page.title)
            titles.append(page_title)
            if page.is_redirect:
                redirects[page_title] = page.redirect

            for link_obj in links:
                title_counter[normalize(link_obj.title)] += 1

    title_dict = Trie(titles)

    # Keep only redirects whose destination is itself a known title, and
    # store the destination as its id in ``title_dict``.
    redirect_items = [
        (title, (title_dict[dest_title],))
        for (title, dest_title) in redirects.items()
        if dest_title in title_dict
    ]
    redirect_dict = RecordTrie('<I', redirect_items)

    # Fold the count of every redirect title into its destination title.
    # Iterate over a snapshot of the keys: entries are added to and deleted
    # from ``title_counter`` inside the loop.
    for title in list(title_counter):
        dest_obj = redirect_dict.get(title)
        if dest_obj is None:
            continue
        count = title_counter[title]
        title_counter[title_dict.restore_key(dest_obj[0][0])] += count
        del title_counter[title]

    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy (1.24+); the builtin yields the same dtype.
    inlink_arr = np.zeros(len(title_dict), dtype=int)
    for (title, count) in title_counter.items():
        title_index = title_dict.get(title)
        # Link targets that are not page titles have no slot in the array.
        if title_index is not None:
            inlink_arr[title_index] = count

    return EntityDB(title_dict, redirect_dict, inlink_arr)
def build(dump_file, pool_size, chunk_size):
    """Build an ``EntityDB`` from a wiki dump.

    Pages read from ``WikiDumpReader(dump_file)`` are processed by
    ``_process_page`` in a pool of worker processes. From the results this
    function builds:

    * a trie of all page titles,
    * a record trie mapping each redirect title to the trie id of its
      destination title, and
    * an integer array holding, for each title id, how many links point at
      that title (links to a redirect are credited to its destination).

    Args:
        dump_file: Passed unchanged to ``WikiDumpReader``.
        pool_size: Number of worker processes in the pool.
        chunk_size: ``chunksize`` handed to ``Pool.imap_unordered``.

    Returns:
        ``EntityDB(title_dict, redirect_dict, inlink_arr)``.
    """
    dump_reader = WikiDumpReader(dump_file)

    # The extractor is stored in a module global before the pool is created;
    # presumably so that ``_process_page`` can reach it inside the workers --
    # TODO confirm against ``_process_page``.
    global _extractor
    _extractor = WikiExtractor()

    titles = []
    redirects = {}
    title_counter = Counter()

    with closing(Pool(pool_size)) as pool:
        for (page, links) in pool.imap_unordered(
            _process_page, dump_reader, chunksize=chunk_size
        ):
            titles.append(page.title)
            if page.is_redirect:
                redirects[page.title] = page.redirect

            for link_obj in links:
                title_counter[link_obj.title] += 1

    title_dict = Trie(titles)

    # Keep only redirects whose destination is itself a known title, and
    # store the destination as its id in ``title_dict``.
    # ``dict.items()`` works on both Python 2 and 3, unlike ``iteritems()``.
    redirect_items = [
        (title, (title_dict[dest_title],))
        for (title, dest_title) in redirects.items()
        if dest_title in title_dict
    ]
    redirect_dict = RecordTrie('<I', redirect_items)

    # Fold the count of every redirect title into its destination title.
    # The loop adds and deletes counter entries, so it must run over a
    # snapshot: iterating the live ``items()`` view raises RuntimeError on
    # Python 3. A list of (title, count) pairs matches what ``items()``
    # returned on Python 2, so the counts used are those at snapshot time.
    for (title, count) in list(title_counter.items()):
        dest_obj = redirect_dict.get(title)
        if dest_obj is None:
            continue
        title_counter[title_dict.restore_key(dest_obj[0][0])] += count
        del title_counter[title]

    # ``np.int`` was only an alias of the builtin ``int`` and has been
    # removed from NumPy (1.24+); the builtin yields the same dtype.
    inlink_arr = np.zeros(len(title_dict), dtype=int)
    for (title, count) in title_counter.items():
        title_index = title_dict.get(title)
        # Link targets that are not page titles have no slot in the array.
        if title_index is not None:
            inlink_arr[title_index] = count

    return EntityDB(title_dict, redirect_dict, inlink_arr)