コード例 #1
0
    def build(dump_file, pool_size, chunk_size):
        """Build an ``EntityDB`` from a wiki dump.

        Pages read from the dump are processed by ``_process_page`` in a
        worker pool. From the results this collects every normalized page
        title, the redirect table, and the number of links pointing at each
        title (with links to a redirect credited to the redirect's target).

        Args:
            dump_file: Dump source handed to ``WikiDumpReader``.
            pool_size: Size passed to ``Pool``.
            chunk_size: ``chunksize`` passed to ``Pool.imap_unordered``.

        Returns:
            ``EntityDB(title_dict, redirect_dict, inlink_arr)`` where
            ``title_dict`` is a ``Trie`` of normalized titles,
            ``redirect_dict`` maps a redirect title to the trie index of its
            destination, and ``inlink_arr[i]`` is the link count of the
            title whose trie index is ``i``.
        """
        dump_reader = WikiDumpReader(dump_file)

        # The module-level extractor is set before the pool is created;
        # presumably _process_page reads it inside the workers.
        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for page, links in pool.imap_unordered(
                _process_page, dump_reader, chunksize=chunk_size
            ):
                title = normalize(page.title)
                titles.append(title)
                if page.is_redirect:
                    # NOTE(review): the redirect target is stored as-is while
                    # titles are normalized -- confirm page.redirect already
                    # comes in normalized form.
                    redirects[title] = page.redirect

                for link_obj in links:
                    title_counter[normalize(link_obj.title)] += 1

        title_dict = Trie(titles)

        # Keep only redirects whose destination is a known title, stored as
        # a one-field record holding the destination's trie index.
        redirect_items = [
            (title, (title_dict[dest_title],))
            for title, dest_title in redirects.items()
            if dest_title in title_dict
        ]
        redirect_dict = RecordTrie('<I', redirect_items)

        # Move the counts of redirect titles onto their destinations.
        # Iterate over a snapshot of the keys because the counter is
        # modified inside the loop; counts are read live so that a count
        # merged into a not-yet-visited redirect is forwarded as well.
        for title in list(title_counter):
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                count = title_counter[title]
                dest_title = title_dict.restore_key(dest_obj[0][0])
                title_counter[dest_title] += count
                del title_counter[title]

        # ``np.int`` was removed in NumPy 1.24; it was an alias of the
        # builtin ``int``, so this yields the same dtype.
        inlink_arr = np.zeros(len(title_dict), dtype=int)
        for title, count in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
コード例 #2
0
ファイル: entity_db.py プロジェクト: studio-ousia/ntee
    def build(dump_file, pool_size, chunk_size):
        """Build an ``EntityDB`` from a wiki dump.

        Pages read from the dump are processed by ``_process_page`` in a
        worker pool. From the results this collects every page title, the
        redirect table, and the number of links pointing at each title (with
        links to a redirect credited to the redirect's target).

        Args:
            dump_file: Dump source handed to ``WikiDumpReader``.
            pool_size: Size passed to ``Pool``.
            chunk_size: ``chunksize`` passed to ``Pool.imap_unordered``.

        Returns:
            ``EntityDB(title_dict, redirect_dict, inlink_arr)`` where
            ``title_dict`` is a ``Trie`` of page titles, ``redirect_dict``
            maps a redirect title to the trie index of its destination, and
            ``inlink_arr[i]`` is the link count of the title whose trie
            index is ``i``.
        """
        dump_reader = WikiDumpReader(dump_file)

        # The module-level extractor is set before the pool is created;
        # presumably _process_page reads it inside the workers.
        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for page, links in pool.imap_unordered(
                _process_page, dump_reader, chunksize=chunk_size
            ):
                titles.append(page.title)
                if page.is_redirect:
                    redirects[page.title] = page.redirect

                for link_obj in links:
                    title_counter[link_obj.title] += 1

        title_dict = Trie(titles)

        # Keep only redirects whose destination is a known title, stored as
        # a one-field record holding the destination's trie index.
        # ``items()`` replaces the Python-2-only ``iteritems()``.
        redirect_items = [
            (title, (title_dict[dest_title],))
            for title, dest_title in redirects.items()
            if dest_title in title_dict
        ]
        redirect_dict = RecordTrie('<I', redirect_items)

        # Move the counts of redirect titles onto their destinations.
        # The counter gains and loses keys inside the loop, so iterate over
        # a snapshot: on Python 3 iterating the live view would raise
        # RuntimeError, and a list is what ``items()`` returned on Python 2.
        for title, count in list(title_counter.items()):
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                dest_title = title_dict.restore_key(dest_obj[0][0])
                title_counter[dest_title] += count
                del title_counter[title]

        # ``np.int`` was removed in NumPy 1.24; it was an alias of the
        # builtin ``int``, so this yields the same dtype.
        inlink_arr = np.zeros(len(title_dict), dtype=int)
        for title, count in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)