def setUp(self):
    """Create a scratch directory holding a wiki dump file and open a reader on it.

    The directory is created next to this test module. The module-level
    ``WIKITEXT`` fixture is written to ``wikitext.xml.bz2`` inside it, and a
    ``WikiReader`` for that file is stored on ``self.wikireader``.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)

    dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)

    self.wikireader = WikiReader(dump_path)
def parse_and_save():
    """Run spaCy over every section of the wiki dump and write the packed sentences.

    Reads records from ``WikiReader(wikidump)``, feeds each section's text
    through the spaCy pipeline, packs every sentence of the parsed docs with a
    ``Preprocessor`` and writes the result through a ``FilePoolWriter``. The
    vocabulary is saved periodically while processing and once more at the end.
    Finally an interactive IPython shell is opened.
    """
    en = spacy.load('en')
    reader = WikiReader(wikidump)
    records = reader.records()

    def section_texts_flat(records):
        """Yield the text of every section of every record, in order."""
        while True:
            try:
                record = next(records)
            except StopIteration:
                # The source is exhausted. A StopIteration escaping a generator
                # body is converted to RuntimeError (PEP 479), so end the
                # generator explicitly instead of letting it propagate.
                return
            except OSError as e:
                # Best effort: report the read error and try the next record.
                print('error: %s' % e)
            else:
                for section in record['sections']:
                    yield section['text']

    pipe = en.pipe(section_texts_flat(records),
                   n_threads=cpu_count(), batch_size=1000)
    # Non-batched alternative, kept for reference:
    # pipe = (en(txt) for txt in section_texts_flat(records))
    preproc = Preprocessor(en.vocab)
    with FilePoolWriter(wikidoc_dir, wikidoc_fn_template) as f:
        for i, doc in enumerate(tqdm.tqdm(pipe)):
            if len(doc._py_tokens) <= 7:
                # short sentences -- nah
                continue
            for sent in doc.sents:
                packed = preproc.pack(sent)
                f.write(packed)
            # Checkpoint the vocabulary; skipped docs never reach this check.
            if i % 10000 == 0:
                print('i=%s, saving vocab' % i)
                save_vocab(en.vocab)
    save_vocab(en.vocab)

    # Opens an interactive shell once processing has finished.
    import IPython
    IPython.embed()
def setUp(self):
    """Write the wiki fixture into a fresh scratch directory and open a reader on it.

    The directory is created next to this test module. ``WIKITEXT`` is written
    to ``wikitext.xml.bz2`` through ``bzip_open`` and the resulting file is
    handed to a ``WikiReader`` stored on ``self.wikireader``.
    """
    module_dir = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)

    dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    with bzip_open(dump_path, mode='w') as dump_file:
        dump_file.write(WIKITEXT)

    self.wikireader = WikiReader(dump_path)
class WikiReaderTestCase(unittest.TestCase):
    """Tests for ``WikiReader`` run against a dump written to a scratch directory."""

    def setUp(self):
        """Write ``WIKITEXT`` to a temporary ``.bz2`` file and open a reader on it."""
        # The scratch directory lives next to this test module.
        self.tempdir = tempfile.mkdtemp(
            prefix='test_corpora',
            dir=os.path.dirname(os.path.abspath(__file__)))
        wiki_fname = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        with bzip_open(wiki_fname, mode='w') as f:
            f.write(WIKITEXT)
        self.wikireader = WikiReader(wiki_fname)

    # Every item produced by texts() must be a str.
    def test_texts(self):
        texts = list(self.wikireader.texts())
        for text in texts:
            self.assertIsInstance(text, str)

    # With min_len=300 exactly one text is expected from the fixture.
    def test_texts_min_len(self):
        texts = list(self.wikireader.texts(min_len=300))
        self.assertEqual(len(texts), 1)

    # limit=1 must cap the output at a single text.
    def test_texts_limit(self):
        texts = list(self.wikireader.texts(limit=1))
        self.assertEqual(len(texts), 1)

    # Every item produced by pages() must be a dict.
    def test_pages(self):
        pages = list(self.wikireader.pages())
        for page in pages:
            self.assertIsInstance(page, dict)

    # With min_len=300 exactly one page is expected from the fixture.
    def test_pages_min_len(self):
        pages = list(self.wikireader.pages(min_len=300))
        self.assertEqual(len(pages), 1)

    # limit=1 must cap the output at a single page.
    def test_pages_limit(self):
        pages = list(self.wikireader.pages(limit=1))
        self.assertEqual(len(pages), 1)

    def tearDown(self):
        """Remove the scratch directory and everything inside it."""
        # Imported locally so this block does not rely on a module-level import.
        import shutil

        # rmtree also copes with nested directories, which a flat
        # os.remove()/os.rmdir() sweep cannot delete.
        shutil.rmtree(self.tempdir)
class WikiReaderTestCase(unittest.TestCase):
    """Exercise ``WikiReader.texts`` and ``WikiReader.pages`` on a small fixture dump."""

    def setUp(self):
        """Write ``WIKITEXT`` into a fresh scratch directory and open a reader on it."""
        # The scratch directory is created alongside this test module.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)

        dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)

        self.wikireader = WikiReader(dump_path)

    # texts() must only produce unicode strings.
    def test_texts(self):
        all_texts = list(self.wikireader.texts())
        for item in all_texts:
            self.assertIsInstance(item, unicode_type)

    # Only one text of the fixture is expected with min_len=300.
    def test_texts_min_len(self):
        n_texts = len(list(self.wikireader.texts(min_len=300)))
        self.assertEqual(n_texts, 1)

    # limit=1 caps the output at one text.
    def test_texts_limit(self):
        n_texts = len(list(self.wikireader.texts(limit=1)))
        self.assertEqual(n_texts, 1)

    # pages() must only produce dicts.
    def test_pages(self):
        all_pages = list(self.wikireader.pages())
        for item in all_pages:
            self.assertIsInstance(item, dict)

    # Only one page of the fixture is expected with min_len=300.
    def test_pages_min_len(self):
        n_pages = len(list(self.wikireader.pages(min_len=300)))
        self.assertEqual(n_pages, 1)

    # limit=1 caps the output at one page.
    def test_pages_limit(self):
        n_pages = len(list(self.wikireader.pages(limit=1)))
        self.assertEqual(n_pages, 1)

    def tearDown(self):
        """Delete the scratch directory together with its contents."""
        shutil.rmtree(self.tempdir)
class WikiReaderTestCase(unittest.TestCase):
    """Exercise ``WikiReader.texts`` and ``WikiReader.records`` on a small fixture dump."""

    def setUp(self):
        """Write ``WIKITEXT`` into a fresh scratch directory and open a reader on it."""
        # The scratch directory is created alongside this test module.
        module_dir = os.path.dirname(os.path.abspath(__file__))
        self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=module_dir)

        dump_path = os.path.join(self.tempdir, 'wikitext.xml.bz2')
        write_file(WIKITEXT, dump_path, mode='wb', auto_make_dirs=True)

        self.wikireader = WikiReader(dump_path)

    # texts() must only produce unicode strings.
    def test_texts(self):
        all_texts = list(self.wikireader.texts())
        for item in all_texts:
            self.assertIsInstance(item, unicode_type)

    # Only one text of the fixture is expected with min_len=300.
    def test_texts_min_len(self):
        n_texts = len(list(self.wikireader.texts(min_len=300)))
        self.assertEqual(n_texts, 1)

    # limit=1 caps the output at one text.
    def test_texts_limit(self):
        n_texts = len(list(self.wikireader.texts(limit=1)))
        self.assertEqual(n_texts, 1)

    # records() must only produce dicts.
    def test_records(self):
        all_records = list(self.wikireader.records())
        for item in all_records:
            self.assertIsInstance(item, dict)

    # Only one record of the fixture is expected with min_len=300.
    def test_records_min_len(self):
        n_records = len(list(self.wikireader.records(min_len=300)))
        self.assertEqual(n_records, 1)

    # limit=1 caps the output at one record.
    def test_records_limit(self):
        n_records = len(list(self.wikireader.records(limit=1)))
        self.assertEqual(n_records, 1)

    def tearDown(self):
        """Delete the scratch directory together with its contents."""
        shutil.rmtree(self.tempdir)
def setUp(self):
    """Prepare the fixture: a scratch directory, a wiki dump file and a reader.

    A directory prefixed ``test_corpora`` is created beside this test module,
    ``WIKITEXT`` is written to ``wikitext.xml.bz2`` within it, and
    ``self.wikireader`` is set to a ``WikiReader`` for that file.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=this_dir)

    target = os.path.join(self.tempdir, 'wikitext.xml.bz2')
    write_file(WIKITEXT, target, mode='wb', auto_make_dirs=True)

    self.wikireader = WikiReader(target)