def setUp(self):
    """Prepare a temporary wiki dump file and a reader for it.

    Creates a temporary directory (prefixed ``test_corpora``) inside the
    directory that contains this test module, writes ``WIKITEXT`` to
    ``wikitext.xml.bz2`` in that directory, and stores a ``WikiReader``
    opened on that file as ``self.wikireader``. The directory path is kept
    on ``self.tempdir``.
    """
    # Anchor the temp dir next to this file rather than the system tmp dir.
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)

    dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)

    self.wikireader = WikiReader(dump_path)
Ejemplo n.º 2
0
def parse_and_save():
    """Parse the wiki dump with spaCy and write packed sentences to disk.

    Steps, as performed by the code below:

    1. Load the spaCy ``'en'`` pipeline and open ``WikiReader(wikidump)``.
    2. Stream the ``'text'`` of every section of every record through
       ``en.pipe``.
    3. For each parsed doc that is long enough, pack every sentence with
       ``Preprocessor.pack`` and write it through ``FilePoolWriter``.
    4. Save the vocabulary every 10000 docs and once more at the end.
    5. Drop into an interactive IPython shell.

    ``wikidump``, ``wikidoc_dir`` and ``wikidoc_fn_template`` are read from
    the enclosing module scope.
    """
    en = spacy.load('en')
    reader = WikiReader(wikidump)
    records = reader.records()

    def section_texts_flat(records):
        """Yield the ``'text'`` of each section of each record in turn.

        An ``OSError`` raised while fetching a record is printed and the
        loop tries the next record. Exhaustion of ``records`` ends this
        generator normally.
        """
        while True:
            try:
                record = next(records)
            except StopIteration:
                # Must be handled explicitly: since PEP 479 a StopIteration
                # escaping a generator body is turned into RuntimeError,
                # which would abort the whole run at the end of the dump.
                return
            except OSError as e:
                print('error: %s' % e)
            else:
                for section in record['sections']:
                    yield section['text']

    pipe = en.pipe(section_texts_flat(records),
                   n_threads=cpu_count(),
                   batch_size=1000)
    # Non-batched alternative:
    # pipe = (en(txt) for txt in section_texts_flat(records))
    preproc = Preprocessor(en.vocab)
    with FilePoolWriter(wikidoc_dir, wikidoc_fn_template) as f:
        for i, doc in enumerate(tqdm.tqdm(pipe)):
            # Skip short texts. NOTE(review): _py_tokens is a private
            # attribute, presumably one entry per token -- confirm whether
            # len(doc) can be used instead.
            if len(doc._py_tokens) > 7:
                for sent in doc.sents:
                    f.write(preproc.pack(sent))
            # Checkpoint on every 10000th doc, including skipped ones, so a
            # short doc landing on the boundary cannot suppress the save.
            if i % 10000 == 0:
                print('i=%s, saving vocab' % i)
                save_vocab(en.vocab)
    save_vocab(en.vocab)
    # Leave the user in an interactive shell with the results in scope.
    import IPython
    IPython.embed()