Example #1
def generate_corpus(dump_file, entity_db, out_file, pool_size, chunk_size):
    dump_reader = WikiDumpReader(dump_file)

    global _extractor
    _extractor = WikiExtractor()

    with bz2.BZ2File(out_file, mode='w') as f:
        logger.info('Starting to process Wikipedia dump...')
        with closing(Pool(pool_size)) as pool:
            for paragraphs in pool.imap_unordered(_process_page,
                                                  dump_reader,
                                                  chunksize=chunk_size):
                for paragraph in paragraphs:
                    para_text = u''
                    cur = 0
                    # Splice an entity marker into the text at each link span,
                    # walking the links in order of appearance.
                    for link in sorted(paragraph.wiki_links,
                                       key=lambda l: l.span[0]):
                        # Skip links to media (File:) pages.
                        if link.title.startswith('File:'):
                            continue

                        # Resolve redirects so every entity gets its canonical title.
                        title = entity_db.resolve_redirect(link.title).replace(
                            u' ', u'_')
                        para_text += paragraph.text[cur:link.span[0]].lower()
                        para_text += u' ' + MARKER + title + u' '
                        cur = link.span[1]

                    # Append the text after the last link.
                    para_text += paragraph.text[cur:].lower()

                    f.write(para_text.encode('utf-8') + '\n')
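
The span-splicing above is the core of the corpus format: text outside a link is lowercased as-is, and every link span is replaced by a padded marker token built from the resolved entity title. A minimal, self-contained sketch of that idea, using plain strings and (title, span) pairs instead of the project's paragraph objects, and a hypothetical u'ENTITY/' prefix standing in for MARKER:

# Minimal sketch of the span-splicing step above; 'ENTITY/' is a stand-in
# for the MARKER constant, and links are (title, (start, end)) pairs rather
# than parsed wiki_link objects.
MARKER = u'ENTITY/'

def splice_entity_markers(text, links):
    out = u''
    cur = 0
    # Process links in order of their start offset.
    for title, (start, end) in sorted(links, key=lambda l: l[1][0]):
        out += text[cur:start].lower()
        out += u' ' + MARKER + title.replace(u' ', u'_') + u' '
        cur = end
    return out + text[cur:].lower()

print(splice_entity_markers(
    u'Tokyo is the capital of Japan.',
    [(u'Tokyo', (0, 5)), (u'Japan', (24, 29))]))
# prints: " ENTITY/Tokyo  is the capital of  ENTITY/Japan ."
# (markers keep their padding spaces)
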
Example #2
def generate_corpus(dump_file, entity_db, out_file, abstract_db, learn_entity,
                    pool_size, chunk_size):
    dump_reader = WikiDumpReader(dump_file)

    global _extractor
    _extractor = WikiExtractor()

    with bz2.BZ2File(out_file, mode='w') as f:
        click.echo('Processing Wikipedia dump...')
        with closing(Pool(pool_size)) as pool:
            for paragraphs in pool.imap_unordered(
                _process_page, dump_reader, chunksize=chunk_size
            ):
                for paragraph in paragraphs:
                    para_words = paragraph.words

                    if learn_entity:
                        # Replace the words covered by each link with a single
                        # entity marker token (spans here are word offsets).
                        words = []
                        cur = 0
                        for link in sorted(paragraph.wiki_links, key=lambda l: l.span[0]):
                            title = entity_db.resolve_redirect(link.title).replace(u' ', u'_')
                            words += para_words[cur:link.span[0]]
                            words.append(MARKER + title)
                            cur = link.span[1]

                        words += para_words[cur:]

                    else:
                        words = para_words

                    f.write(u' '.join(words).encode('utf-8') + '\n')

        if abstract_db is not None:
            click.echo('Processing paragraphs in Abstract DB...')
            tokenizer = RegexpTokenizer()

            for value in abstract_db.itervalues():
                para_text = value['text']
                links = value['links']

                if learn_entity:
                    # Build the word list right to left: visit the links in
                    # reverse order of position and, at each step, prepend the
                    # entity marker plus the tokens lying between this link and
                    # the text already handled.
                    cur = len(para_text)
                    words = []
                    for (text, title, span) in sorted(links, key=lambda l: l[2][0], reverse=True):
                        words = ([MARKER + entity_db.resolve_redirect(title).replace(u' ', u'_')] +
                                 [t.text.lower() for t in tokenizer.tokenize(para_text[span[1]:cur])] +
                                 words)
                        cur = span[0]

                    # Finally prepend the text that precedes the first link.
                    words = [t.text.lower() for t in tokenizer.tokenize(para_text[:cur])] + words

                else:
                    words = [t.text.lower() for t in tokenizer.tokenize(para_text)]

                f.write(u' '.join(words).encode('utf-8') + '\n')
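
The abstract-DB branch builds its token list in the opposite direction: links are visited from last to first, and each step prepends the entity marker plus the tokens lying between that link and the text already handled. A self-contained sketch of the same right-to-left construction, with str.split() standing in for the project's RegexpTokenizer and the same hypothetical marker prefix as above:

# Right-to-left construction as in the abstract-DB branch above; links are
# (title, (start, end)) pairs and str.split() stands in for RegexpTokenizer.
MARKER = u'ENTITY/'

def words_from_abstract(text, links):
    words = []
    cur = len(text)
    # Visit links from last to first, prepending at every step.
    for title, (start, end) in sorted(links, key=lambda l: l[1][0], reverse=True):
        words = ([MARKER + title.replace(u' ', u'_')] +
                 text[end:cur].lower().split() +
                 words)
        cur = start
    # Prepend the text that precedes the first link.
    return text[:cur].lower().split() + words

print(u' '.join(words_from_abstract(
    u'Tokyo is the capital of Japan.',
    [(u'Tokyo', (0, 5)), (u'Japan', (24, 29))])))
# prints: "ENTITY/Tokyo is the capital of ENTITY/Japan ."
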
Example #3
def train_embedding(dump_file, dic_file, out_file, **kwargs):
    from wiki_dump_reader import WikiDumpReader

    dump_reader = WikiDumpReader(dump_file)
    dictionary = Dictionary.load(dic_file)

    train_kwargs = dict(parallel=kwargs.pop('parallel'),
                        pool_size=kwargs.pop('pool_size'),
                        chunk_size=kwargs.pop('chunk_size'))

    ent_vec = EntityVector(dictionary, **kwargs)
    ent_vec.train(dump_reader, **train_kwargs)

    ent_vec.save(out_file)
Example #4
    def build(dump_file, pool_size, chunk_size):
        dump_reader = WikiDumpReader(dump_file)

        global _extractor
        _extractor = WikiExtractor()

        titles = []
        redirects = {}
        title_counter = Counter()

        with closing(Pool(pool_size)) as pool:
            for (page, links) in pool.imap_unordered(_process_page,
                                                     dump_reader,
                                                     chunksize=chunk_size):
                titles.append(page.title)
                if page.is_redirect:
                    redirects[page.title] = page.redirect

                for link_obj in links:
                    title_counter[link_obj.title] += 1

        # Static trie mapping each page title to an integer id.
        title_dict = Trie(titles)

        # Map each redirect title to the id of its destination title.
        redirect_items = []
        for (title, dest_title) in redirects.iteritems():
            if dest_title in title_dict:
                redirect_items.append((title, (title_dict[dest_title], )))

        redirect_dict = RecordTrie('<I', redirect_items)

        # Fold the link counts of redirect titles into their destinations.
        for (title, count) in title_counter.items():
            dest_obj = redirect_dict.get(title)
            if dest_obj is not None:
                title_counter[title_dict.restore_key(dest_obj[0][0])] += count
                del title_counter[title]

        # Number of incoming links per title, indexed by trie id.
        inlink_arr = np.zeros(len(title_dict), dtype=np.int)
        for (title, count) in title_counter.items():
            title_index = title_dict.get(title)
            if title_index is not None:
                inlink_arr[title_index] = count

        return EntityDB(title_dict, redirect_dict, inlink_arr)
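
The redirect handling above is two bookkeeping passes: first fold every count recorded under a redirect title into its destination title, then project the merged counts onto an array indexed by title id. A dictionary-based sketch of the same logic, with plain dicts and a list standing in for the Trie, RecordTrie and numpy array used here:

from collections import Counter

# Toy data: 'Edo' is a redirect to 'Tokyo'.
titles = [u'Tokyo', u'Edo', u'Japan']
redirects = {u'Edo': u'Tokyo'}
title_counter = Counter({u'Tokyo': 5, u'Edo': 3, u'Japan': 2})

title_index = {t: i for i, t in enumerate(titles)}

# Fold counts recorded under redirect titles into their destinations.
for title, count in list(title_counter.items()):
    dest = redirects.get(title)
    if dest is not None:
        title_counter[dest] += count
        del title_counter[title]

# Project the merged counts onto an array indexed by title id.
inlinks = [0] * len(titles)
for title, count in title_counter.items():
    if title in title_index:
        inlinks[title_index[title]] = count

print(inlinks)  # [8, 0, 2] -> 'Tokyo' absorbed the count recorded under 'Edo'
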
Example #5
    def build(dataset, entity_db, dump_file, out_file, category, pool_size,
              chunk_size):
        global _extractor

        dump_reader = WikiDumpReader(dump_file)
        _extractor = WikiExtractor(entity_db)

        if category == 'pro':
            kb_data = dataset.profession_kb
            click.echo('Category: Profession')

        elif category == 'nat':
            kb_data = dataset.nationality_kb
            click.echo('Category: Nationality')

        else:
            raise RuntimeError('Invalid category')

        target_titles = frozenset(
            [entity_db.resolve_redirect(title) for (title, _) in kb_data])
        paragraph_buf = {}
        link_buf = defaultdict(list)

        with closing(Pool(pool_size)) as pool:
            for (title,
                 paragraphs) in pool.imap_unordered(_process_page,
                                                    dump_reader,
                                                    chunksize=chunk_size):
                # Keep the paragraphs of target pages themselves.
                if title in target_titles:
                    paragraph_buf[title] = paragraphs

                # Record inbound links: for each target title, remember which
                # pages link to it and with what anchor text.
                for paragraph in paragraphs:
                    for link in paragraph.wiki_links:
                        if link.title in target_titles:
                            link_buf[link.title].append(
                                WikiLink(title, link.text, 'in'))

        with closing(PageDB(out_file, protocol=-1)) as db:
            with click.progressbar(paragraph_buf.iteritems(),
                                   length=len(paragraph_buf)) as bar:
                for (title, paragraphs) in bar:
                    db[title.encode('utf-8')] = dict(paragraphs=paragraphs,
                                                     in_links=link_buf[title])
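
The two buffers above form a small inverted index over the dump: paragraph_buf keeps the paragraphs of the target pages themselves, while link_buf records, for every target title, which pages link to it. A stripped-down sketch of the inbound-link collection, with pages given as (title, [linked_title, ...]) pairs instead of parsed paragraphs:

from collections import defaultdict

# Toy pages: each entry is (page title, titles it links to).
target_titles = frozenset([u'Tokyo', u'Japan'])
pages = [
    (u'Kanto', [u'Tokyo', u'Honshu']),
    (u'Edo', [u'Tokyo']),
    (u'Mount Fuji', [u'Japan', u'Honshu']),
]

link_buf = defaultdict(list)
for source_title, linked_titles in pages:
    for linked in linked_titles:
        if linked in target_titles:
            # Record the source page as an inbound link of the target title.
            link_buf[linked].append(source_title)

print(dict(link_buf))
# prints a mapping like {'Tokyo': ['Kanto', 'Edo'], 'Japan': ['Mount Fuji']}
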
Example #6
def build_dictionary(dump_file, out_file, **kwargs):
    from wiki_dump_reader import WikiDumpReader

    dump_reader = WikiDumpReader(dump_file)
    dictionary = Dictionary.build(dump_reader, **kwargs)
    dictionary.save(out_file)