Code example #1
def test_download(self):
    dataset = Wikipedia(data_dir=self.tempdir)
    dataset.download()
    self.assertTrue(os.path.exists(dataset.filename))
Code example #2
def test_ioerror(self):
    dataset = Wikipedia(data_dir=self.tempdir)
    with self.assertRaises(IOError):
        _ = list(dataset.texts())
Code example #3
from __future__ import absolute_import, unicode_literals

import os
import shutil
import tempfile
import unittest

from textacy import data_dir
from textacy.compat import unicode_
from textacy.datasets.wikipedia import Wikipedia

DATASET = Wikipedia(lang='en', version='latest')


@unittest.skipUnless(
    DATASET.filename,
    'Wikipedia dataset must be downloaded before running tests')
class WikipediaTestCase(unittest.TestCase):
    def setUp(self):
        self.tempdir = tempfile.mkdtemp(prefix='test_datasets_',
                                        dir=os.path.dirname(
                                            os.path.abspath(__file__)))

    @unittest.skip("No need to download a new dataset every time")
    def test_download(self):
        dataset = Wikipedia(data_dir=self.tempdir)
        dataset.download()
        self.assertTrue(os.path.exists(dataset.filename))

    def test_ioerror(self):
        dataset = Wikipedia(data_dir=self.tempdir)
        with self.assertRaises(IOError):
            _ = list(dataset.texts())
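
The test modules on this page exercise only a small surface of the dataset class: download(), filename, and texts(). A minimal usage sketch outside the test harness, assuming the default textacy data_dir and using only those calls (the early break is purely illustrative):

import os

from textacy.datasets.wikipedia import Wikipedia

dataset = Wikipedia(lang='en', version='latest')

# filename is None until the compressed dump exists locally (see the skip
# conditions in the test modules), so download only when it is missing.
if dataset.filename is None:
    dataset.download()

# Stream plain-text articles, skipping short ones via the same min_len
# filter used in clean.py below; stop after a few for illustration.
for i, text in enumerate(dataset.texts(min_len=300)):
    print(text[:200])
    if i >= 4:
        break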
Code example #4
File: clean.py  Project: richiefrost/lda-frost
            continue
        # skip number-like tokens
        if token.like_num:
            continue
        # skip tokens containing non-ASCII characters
        if not token.is_ascii:
            continue
        # keep only nouns and proper nouns, lemmatized
        if token.pos_ in {u'NOUN', u'PROPN'}:
            words.append(token.lemma_)

    return words


pool_size = 32

p = Pool(pool_size)

wp = Wikipedia(lang='en', version='latest')

with open("lemmatized_nouns/output.txt", "w+") as f:
    batch, batch_max = [], 2**14

    for text in wp.texts(min_len=300):
        batch.append(text)
        if len(batch) >= batch_max:
            # Split the batch into pool_size slices and process them in
            # parallel; each worker returns roughly (batch_max / pool_size)
            # processed documents, each represented as a list of words.
            results = p.map(process_mini_batch,
                            (batch[i::pool_size] for i in range(pool_size)))
            for result in results:
                for entry in result:
                    # Write each document on its own line
                    f.write(' '.join([word.encode('utf-8')
                                      for word in entry]) + "\n")
            # Start a new batch; without this, the same ever-growing batch
            # would be reprocessed on every following iteration.
            batch = []
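
The excerpt from clean.py omits the top of the file, including process_mini_batch itself. A plausible shape for it, consistent with the token filtering shown above (the spaCy model name, the stop-word/punctuation guard, and the tokens_to_nouns helper are assumptions, not the project's actual code):

import spacy

# Assumed: one module-level pipeline; each Pool worker gets its own copy.
nlp = spacy.load('en_core_web_sm')


def tokens_to_nouns(doc):
    # Same filtering as the fragment above: drop number-like and non-ASCII
    # tokens, keep lemmatized nouns and proper nouns.
    words = []
    for token in doc:
        if token.is_stop or token.is_punct:  # assumed guard behind the first `continue`
            continue
        if token.like_num:
            continue
        if not token.is_ascii:
            continue
        if token.pos_ in {u'NOUN', u'PROPN'}:
            words.append(token.lemma_)
    return words


def process_mini_batch(texts):
    # Each worker receives one slice of the batch and returns one word list
    # per document, which is what the write loop above expects.
    return [tokens_to_nouns(doc) for doc in nlp.pipe(texts)]

Loading the pipeline at module level rather than inside the function avoids reloading the model for every mini-batch.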
Code example #5
def test_ioerror(tmpdir):
    dataset = Wikipedia(data_dir=str(tmpdir))
    with pytest.raises(IOError):
        _ = list(dataset.texts())
Code example #6
def test_download(tmpdir):
    dataset = Wikipedia(data_dir=str(tmpdir))
    dataset.download()
    assert os.path.exists(dataset.filename)
Code example #7
from __future__ import absolute_import, unicode_literals

import os

import pytest

from textacy import compat
from textacy.datasets.wikipedia import Wikipedia

DATASET = Wikipedia(lang="en", version="latest")

pytestmark = pytest.mark.skipif(
    DATASET.filename is None,
    reason="Wikipedia dataset must be downloaded before running tests",
)


@pytest.mark.skip("No need to download a new dataset every time")
def test_download(tmpdir):
    dataset = Wikipedia(data_dir=str(tmpdir))
    dataset.download()
    assert os.path.exists(dataset.filename)


def test_ioerror(tmpdir):
    dataset = Wikipedia(data_dir=str(tmpdir))
    with pytest.raises(IOError):
        _ = list(dataset.texts())


def test_texts():
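    # The listing is cut off here. A guess at the body, not the project's
    # actual test: texts() should yield unicode strings (hence the
    # textacy.compat import above).
    for text in DATASET.texts(min_len=300):
        assert isinstance(text, compat.unicode_)
        break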