Example #1
def test_splitting_sentences():
    # Use a temporary directory since mocking is hard to apply to
    # `_tokenize_sentences_worker`.
    input_file = random_filename(tempfile.gettempdir())
    output_file = random_filename(tempfile.gettempdir())

    # Write a dummy document to `input_file`.
    dummy = '안녕하세요. 반갑습니다! 어떠신가요? 괜찮습니다...ㅎㅎ\n\n'
    with open(input_file, 'w') as fp:
        fp.write(dummy)

    # Split the sentences.
    namuwiki._tokenize_sentences_worker(
        input_file, output_file, min_len=0, max_len=100)

    # Check if the sentences were split correctly.
    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert len(lines) == 4
        assert ' '.join([line.strip() for line in lines]) == dummy.strip()

    # Check that writing whole chunks (split_sent=False) works correctly.
    namuwiki._tokenize_sentences_worker(
        input_file, output_file, 0, 100, split_sent=False)

    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert lines[0].strip() == dummy.strip()
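
For reference, a minimal sketch of what the worker's contract implies, assuming the third-party `kss` library as the Korean sentence splitter (an assumption; the project may use a different backend). It reads each chunk, optionally splits it into sentences, filters by length, and writes one unit per line:

import kss  # assumed dependency; any splitter with a similar API would do

def split_sentences_worker(input_file, output_file, min_len, max_len,
                           split_sent=True):
    # Hypothetical stand-in for `_tokenize_sentences_worker`, not the
    # project's actual implementation.
    with open(input_file, 'r') as src, open(output_file, 'w') as dst:
        for chunk in src:
            chunk = chunk.strip()
            if not chunk:
                continue

            if not split_sent:
                # Keep the whole chunk on a single line.
                dst.write(chunk + '\n')
                continue

            for sent in kss.split_sentences(chunk):
                sent = sent.strip()
                if min_len <= len(sent) <= max_len:
                    dst.write(sent + '\n')
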
Example #2
def test_tokenizing_corpus_well():
    # Use a temporary directory since `tokenizers` does not support mocking.
    input_file = random_filename(tempfile.gettempdir())
    vocab_file = random_filename(tempfile.gettempdir())
    output_file = random_filename(tempfile.gettempdir())

    # Copy dummy corpus file to `input_file`.
    shutil.copyfile('tests/res/wikipedia.plain.txt', input_file)

    # First, train the tokenizer on the corpus.
    train_tokenizer(input_file,
                    vocab_file,
                    tempfile.gettempdir(),
                    vocab_size=100)

    # Next, tokenize the corpus using the trained tokenizer.
    tokenize_corpus(input_file, output_file, vocab_file)

    # Check that the corpus was tokenized correctly (line counts must match).
    with open(output_file, 'r') as output, \
            open('tests/res/wikipedia.plain.txt', 'r') as dummy:
        assert (output.read().strip().count('\n') ==
                dummy.read().strip().count('\n'))

    # Remove created temporary files.
    os.remove(vocab_file)
    os.remove(input_file)
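
A rough sketch of what `tokenize_corpus` presumably does, written against Hugging Face's `tokenizers` package (the tokenizer class and the line-per-document output format are assumptions based on the assertions above):

from tokenizers import BertWordPieceTokenizer

def tokenize_corpus_sketch(input_file, output_file, vocab_file):
    # Hypothetical stand-in, not the project's actual implementation.
    tokenizer = BertWordPieceTokenizer(vocab_file)

    with open(input_file, 'r') as src, open(output_file, 'w') as dst:
        for line in src:
            # Write the space-joined subword tokens, preserving the
            # one-line-per-document structure that the test relies on.
            tokens = tokenizer.encode(line.strip()).tokens
            dst.write(' '.join(tokens) + '\n')
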
Example #3
def test_splitting_sentences():
    # Use a temporary directory since mocking is hard to apply to
    # `_tokenize_sentences_worker`.
    input_file = random_filename(tempfile.gettempdir())
    output_file = random_filename(tempfile.gettempdir())

    # Write a dummy document to `input_file`.
    dummy = 'Nice to meet you Dr. John. Welcome! How are you?\n\n'
    with open(input_file, 'w') as fp:
        fp.write(dummy)

    # Prepare the language resources, then split the sentences.
    wikipedia._prepare_tokenizing_sentences('en')
    wikipedia._tokenize_sentences_worker(input_file, output_file, 'en', 0, 100)

    # Check if the sentences were split correctly.
    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert len(lines) == 3
        assert ' '.join([line.strip() for line in lines]) == dummy.strip()

    # Check that writing whole chunks (split_sent=False) works correctly.
    wikipedia._tokenize_sentences_worker(input_file,
                                         output_file,
                                         'en',
                                         0,
                                         100,
                                         split_sent=False)

    with open(output_file, 'r') as fp:
        lines = fp.readlines()
        assert lines[0].strip() == dummy.strip()
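
The dummy text here is deliberately tricky: a naive split on '.' would break after 'Dr.', so a real sentence tokenizer is required (the `_prepare_tokenizing_sentences` call presumably downloads the language resources it needs). A quick check with NLTK's punkt model, an assumed backend rather than the project's confirmed one:

import nltk
from nltk.tokenize import sent_tokenize

nltk.download('punkt', quiet=True)  # newer NLTK releases may require 'punkt_tab'
sents = sent_tokenize('Nice to meet you Dr. John. Welcome! How are you?')
assert len(sents) == 3  # 'Dr.' does not terminate a sentence
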
Example #4
def test_generate_correct_filenames():
    # Check for single generation.
    assert len(random_filename('')) == 16
    assert random_filename('parent').startswith('parent')

    # Check for multiple generations.
    filenames = random_filenames('', n=4)
    assert len(filenames) == 4
    for name in filenames:
        assert len(name) == 16

    filenames = random_filenames('parent', n=4)
    for name in filenames:
        assert name.startswith('parent')
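
A minimal sketch of helpers consistent with these assertions: a 16-character random basename joined onto the given parent directory (the character set and join behavior are assumptions):

import os
import random
import string

def random_filename(parent):
    # Hypothetical implementation matching the assertions above.
    name = ''.join(random.choices(string.ascii_lowercase + string.digits, k=16))
    return os.path.join(parent, name)

def random_filenames(parent, n):
    return [random_filename(parent) for _ in range(n)]
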
Example #5
def test_training_tokenizer_well():
    # Use a temporary directory since `tokenizers` does not support mocking.
    input_file = random_filename(tempfile.gettempdir())
    vocab_file = random_filename(tempfile.gettempdir())

    # Copy dummy corpus file to `input_file`.
    shutil.copyfile('tests/res/wikipedia.plain.txt', input_file)

    # Train tokenizer with dummy corpus file.
    train_tokenizer(input_file,
                    vocab_file,
                    tempfile.gettempdir(),
                    vocab_size=100)

    # Check that the tokenizer was trained correctly (vocabulary size matches).
    with open(vocab_file, 'r') as fp:
        assert len(fp.readlines()) == 100

    # Remove created temporary files.
    os.remove(input_file)
    os.remove(vocab_file)
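
For context, a rough sketch of what `train_tokenizer` might wrap, again using Hugging Face's `tokenizers` package (the tokenizer class, the role of the temporary directory, and the vocab-file handling are assumptions; `save_model` behaves this way only in recent `tokenizers` releases):

import os
import shutil
from tokenizers import BertWordPieceTokenizer

def train_tokenizer_sketch(input_file, vocab_file, tmp_dir, vocab_size):
    # Hypothetical stand-in, not the project's actual implementation.
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train(files=[input_file], vocab_size=vocab_size)

    # `save_model` writes `vocab.txt` (one token per line) into the given
    # directory; move it to the requested vocabulary path.
    tokenizer.save_model(tmp_dir)
    shutil.move(os.path.join(tmp_dir, 'vocab.txt'), vocab_file)
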