def test_iterator():
    """Test the iterator of TextDirectory.

    Loads the test data with the default ``load_files()`` arguments, iterates
    the instance directly, and checks that ten entries come back and that the
    resolved path of the first entry contains ``Text_``.
    """
    td = TextDirectory(directory='textdirectory/data/testdata/')
    td.load_files()

    # Materialise the iterator directly; no per-item work is needed.
    files = list(td)
    assert len(files) == 10

    # Resolve the path once and check it, without echoing it to stdout.
    first_path = str(files[0]['path'].resolve())
    assert 'Text_' in first_path
def test_transformation_expand_contrations():
    """Check the staged transformation that expands English contractions.

    After staging ``transformation_expand_english_contractions`` the
    aggregated text must contain both fully expanded sentences.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.stage_transformation(['transformation_expand_english_contractions'])

    expected_sentences = (
        'She is the one who flew to Mars.',
        'I will finish the spaceship in time.',
    )
    # Each sentence is checked against a fresh aggregation.
    for sentence in expected_sentences:
        assert sentence in text_dir.aggregate_to_memory()
def test_print_pipeline(capsys):
    """Test the print_pipeline function.

    Applies ``filter_by_chars_outliers`` and then checks that the filter's
    name appears in what ``print_pipeline`` writes to stdout.

    Args:
        capsys: pytest fixture used to read the captured stdout/stderr.
    """
    td = TextDirectory(directory='textdirectory/data/testdata/')
    td.load_files(True, 'txt')
    td.filter_by_chars_outliers()
    td.print_pipeline()

    # Only stdout matters here; stderr is deliberately not inspected.
    captured = capsys.readouterr()
    assert 'filter_by_chars_outliers' in captured.out
def test_transformation_test_arguments():
    """Verify that arguments can be passed along to a staged transformation.

    Stages ``transformation_remove_stopwords`` together with extra list items
    and checks that ``dolor`` is absent from the aggregated text.
    """
    corpus = TextDirectory(directory='textdirectory/data/testdata/')
    corpus.load_files(True, 'txt')

    # The transformation name comes first; the remaining items are presumably
    # forwarded to it as arguments (confirm against stage_transformation).
    staged = [
        'transformation_remove_stopwords',
        'internal',
        'en',
        'en_core_web_sm',
        'dolor,dolore,dolores',
    ]
    corpus.stage_transformation(staged)

    aggregated = corpus.aggregate_to_memory()
    assert 'dolor' not in aggregated
def test_tabulation(capsys):
    """Check that the printed aggregation table mentions ``path``.

    Args:
        capsys: pytest fixture used to read the captured stdout/stderr.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.print_aggregation()

    captured = capsys.readouterr()
    assert 'path' in captured.out
def test_filter_by_similar_documents():
    """Check the similarity filter against a fixed reference document.

    With ``Text_2_B.txt`` as reference and a threshold of 0.7, two entries
    are expected to remain in the aggregation.
    """
    reference = 'textdirectory/data/testdata/level_2/Text_2_B.txt'

    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.filter_by_similar_documents(reference_file=reference, threshold=0.7)

    remaining = len(text_dir.aggregation)
    assert remaining == 2
def test_transformation_uppercase():
    """Check that the uppercase transformation yields all-uppercase text."""
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.stage_transformation(['transformation_uppercase'])

    aggregated = text_dir.aggregate_to_memory()
    assert aggregated.isupper()
def test_transformation_remove_nl():
    """Check that no newline survives the remove_nl transformation."""
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.stage_transformation(['transformation_remove_nl'])

    aggregated = text_dir.aggregate_to_memory()
    assert '\n' not in aggregated
def test_filter_by_min_filesize():
    """Check the minimum-filesize filter.

    With ``min_kb=2`` a single entry is expected to remain in the aggregation.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.filter_by_min_filesize(min_kb=2)

    remaining = len(text_dir.aggregation)
    assert remaining == 1
def test_filter_by_chars_outliers():
    """Check the character-count outlier filter.

    Called with the argument ``1``, the filter is expected to leave nine
    entries in the aggregation.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.filter_by_chars_outliers(1)

    remaining = len(text_dir.aggregation)
    assert remaining == 9
def test_simpple_aggregations():
    """Check plain aggregation without any filter or transformation.

    The aggregated text of the test data is expected to have length 4179.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')

    aggregated = text_dir.aggregate_to_memory()
    assert len(aggregated) == 4179
def test_transform_to_memory():
    """Check the in-memory transformation.

    After ``transform_to_memory()`` the first entry returned by
    ``get_aggregation()`` must carry a non-empty ``transformed_text``.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.transform_to_memory()

    entries = list(text_dir.get_aggregation())
    first_text = entries[0]['transformed_text']
    assert len(first_text) > 0
def test_transformation_lemmatize():
    """Check the lemmatize transformation.

    The aggregated text must contain the phrase ``language be complicate``.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.stage_transformation(['transformation_lemmatize'])

    aggregated = text_dir.aggregate_to_memory()
    assert 'language be complicate' in aggregated
def test_transformation_postag():
    """Check the postag transformation.

    The aggregated text must contain the tag ``NN``.
    """
    text_dir = TextDirectory(directory='textdirectory/data/testdata/')
    text_dir.load_files(True, 'txt')
    text_dir.stage_transformation(['transformation_postag'])

    aggregated = text_dir.aggregate_to_memory()
    assert 'NN' in aggregated