def test_filterpunc():
    """filterpunc drops punctuation-only tokens and keeps the word tokens."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\n.\nnot')
        outcome = cli.invoke(filterpunc, [infile])
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, ['Hello', 'World', 'I', 'not'])
def test_words2ngrams():
    """words2ngrams -n 3 emits sliding word trigrams, one per line.

    NOTE(review): a second test_words2ngrams is defined later in this file;
    Python keeps only the last binding, so pytest collects just one of them.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        filename = 'in.txt'
        sentence = 'Hello\nWorld\n!\nI\nlove\ngo\n.'
        expected_tokens = ['Hello World !', 'World ! I', '! I love', 'I love go']
        create_single_output(filename, sentence)
        # Pass the option value as a string: CliRunner.invoke expects string
        # CLI args (an int here fails under newer click releases), and the
        # sibling tests in this file already pass option values as strings.
        result = runner.invoke(words2ngrams, ['-n', '3', filename])
        tokens = result.output.split('\n')
        assert result.exit_code == 0
        compare_results(tokens, expected_tokens)
def test_sentences():
    """text2sentences splits running text into one sentence per line."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello World! I love go.')
        outcome = cli.invoke(text2sentences, [infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello World!', 'I love go.'])
def test_text2words():
    """text2words tokenizes raw text into word and punctuation tokens."""
    # NOTE(review): another test_text2words is defined later in this file;
    # only the last definition is collected by pytest.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello World!\nI.\nnot sure where to go')
        outcome = cli.invoke(text2words, [infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'I.', 'not', 'sure',
                         'where', 'to', 'go'])
def test_punc():
    """text2punc keeps only the punctuation tokens from the input."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\nlove,\ngo\n.')
        outcome = cli.invoke(text2punc, [infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'), ['!', ',', '.'])
def test_uppercase():
    """tokens2upper upper-cases every token; punctuation passes through."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\n.\nnoooo\n')
        outcome = cli.invoke(tokens2upper, [infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['HELLO', 'WORLD', '!', 'I', '.', 'NOOOO'])
def test_punc_multifile():
    """text2punc concatenates punctuation tokens across multiple input files."""
    # NOTE(review): a second test_punc_multifile appears later in this file;
    # only the last definition is collected by pytest.
    cli = CliRunner()
    with cli.isolated_filesystem():
        names = ['in.txt', 'in2.txt']
        texts = ['Hello\nWorld\n!\nI\nlove,\ngo\n.',
                 'Goodbye World!\n I...\n know everything\'s about you?']
        create_multifile_output(names, texts)
        outcome = cli.invoke(text2punc, names)
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['!', ',', '.', '!', '...', "'", '?'])
def test_filterwords():
    """filterwords --language english removes stopwords (I, am, not, a)."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        outcome = cli.invoke(filterwords, ['--language', 'english', infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'crook', '.'])
def test_lowercase():
    """lowercase lower-cases every token; punctuation passes through."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\n.\nnoooo\n')
        outcome = cli.invoke(lowercase, [infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['hello', 'world', '!', 'i', '.', 'noooo'])
def test_text2words():
    """text2words tokenizes raw text into word and punctuation tokens."""
    # NOTE(review): this redefines test_text2words from earlier in the file;
    # pytest will only collect this (last) definition.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello World!\nI.\nnot sure where to go')
        outcome = cli.invoke(text2words, [infile])
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced,
                        ['Hello', 'World', '!', 'I.', 'not', 'sure',
                         'where', 'to', 'go'])
def test_nonewlines():
    """nonewlines joins all tokens into a single space-separated line."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        filename = 'in.txt'
        sentence = 'Hello\nWorld\n!\nI\nam\nin.\n'
        expected_tokens = ['Hello World ! I am in.']
        create_single_output(filename, sentence)
        result = runner.invoke(nonewlines, [filename])
        tokens = result.output.split('\n')
        assert result.exit_code == 0
        # Reuse the already-split output instead of splitting a second time;
        # one content line plus the trailing empty string is expected.
        assert len(tokens) == 2
        compare_results(tokens, expected_tokens)
def test_nonewlines_multifile():
    """nonewlines joins tokens from every input file into one single line."""
    runner = CliRunner()
    with runner.isolated_filesystem():
        filenames = ['in.txt', 'in2.txt']
        sentences = ['Hello\nWorld\n!\nI\nam\nin.',
                     'What are you\na creature\nof mystery']
        expected_tokens = ['Hello World ! I am in. What are you a creature of mystery']
        create_multifile_output(filenames, sentences)
        result = runner.invoke(nonewlines, filenames)
        tokens = result.output.split('\n')
        assert result.exit_code == 0
        # Reuse the already-split output instead of splitting a second time;
        # one content line plus the trailing empty string is expected.
        assert len(tokens) == 2
        compare_results(tokens, expected_tokens)
def test_words2ngrams():
    """words2ngrams -n 3 emits sliding word trigrams, one per line.

    NOTE(review): this redefines test_words2ngrams from earlier in the file;
    pytest will only collect this (last) definition.
    """
    runner = CliRunner()
    with runner.isolated_filesystem():
        filename = 'in.txt'
        sentence = 'Hello\nWorld\n!\nI\nlove\ngo\n.'
        expected_tokens = [
            'Hello World !', 'World ! I', '! I love', 'I love go'
        ]
        create_single_output(filename, sentence)
        # Pass the option value as a string: CliRunner.invoke expects string
        # CLI args (an int here fails under newer click releases).
        result = runner.invoke(words2ngrams, ['-n', '3', filename])
        tokens = result.output.split('\n')
        assert result.exit_code == 0
        compare_results(tokens, expected_tokens)
def test_pos_tokens():
    """tokens2pos emits token,POS-tag pairs, one per line."""
    # NOTE(review): a second test_pos_tokens appears later in this file;
    # only the last definition is collected by pytest.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(
            infile, 'Hello\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        outcome = cli.invoke(tokens2pos, [infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello,NNP', 'world,NN', '!,.', 'I,PRP', 'love,VBP',
                         'this,DT', 'world,NN', 'and,CC', 'love,VB', 'you,PRP'])
def test_punc_multifile():
    """text2punc concatenates punctuation tokens across multiple input files."""
    # NOTE(review): this redefines test_punc_multifile from earlier in the
    # file; pytest will only collect this (last) definition.
    cli = CliRunner()
    with cli.isolated_filesystem():
        names = ['in.txt', 'in2.txt']
        texts = [
            'Hello\nWorld\n!\nI\nlove,\ngo\n.',
            'Goodbye World!\n I...\n know everything\'s about you?'
        ]
        create_multifile_output(names, texts)
        outcome = cli.invoke(text2punc, names)
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, ['!', ',', '.', '!', '...', "'", '?'])
def test_text2words_multifile():
    """text2words concatenates word tokens across multiple input files."""
    # NOTE(review): a second test_text2words_multifile appears later in this
    # file; only the last definition is collected by pytest.
    cli = CliRunner()
    with cli.isolated_filesystem():
        names = ['in.txt', 'in2.txt']
        texts = ('Hello World!\nI.\nnot sure where to go',
                 'Goodbye World!\n I.\n know everything about you')
        create_multifile_output(names, texts)
        outcome = cli.invoke(text2words, names)
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['Hello', 'World', '!', 'I.', 'not', 'sure', 'where',
                         'to', 'go', 'Goodbye', 'World', '!', 'I.', 'know',
                         'everything', 'about', 'you'])
def test_count_tokens():
    """tokens2counts emits token,count pairs; order-insensitive comparison."""
    # NOTE(review): a second test_count_tokens appears later in this file;
    # only the last definition is collected by pytest.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(
            infile, 'Hello,\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        # Both sides are sorted because the output order of equal-count
        # tokens is not fixed.
        expected = sorted(['love,2', 'world,2', 'and,1', 'I,1', 'you,1',
                           'this,1', '"Hello,",1', '!,1', ''])
        outcome = cli.invoke(tokens2counts, [infile])
        produced = sorted(outcome.output.split('\n'))
        assert outcome.exit_code == 0
        compare_results(produced, expected)
def test_pos_tokens():
    """tokens2pos emits token,POS-tag pairs, one per line."""
    # NOTE(review): this redefines test_pos_tokens from earlier in the file;
    # pytest will only collect this (last) definition.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(
            infile, 'Hello\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        outcome = cli.invoke(tokens2pos, [infile])
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, [
            'Hello,NNP', 'world,NN', '!,.', 'I,PRP', 'love,VBP',
            'this,DT', 'world,NN', 'and,CC', 'love,VB', 'you,PRP'
        ])
def test_count_tokens():
    """tokens2counts emits token,count pairs; order-insensitive comparison."""
    # NOTE(review): this redefines test_count_tokens from earlier in the file;
    # pytest will only collect this (last) definition.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(
            infile, 'Hello,\nworld\n!\nI\nlove\nthis\nworld\nand\nlove\nyou')
        # Sort both sides: output order of equal-count tokens is not fixed.
        expected = sorted([
            'love,2', 'world,2', 'and,1', 'I,1', 'you,1', 'this,1',
            '"Hello,",1', '!,1', ''
        ])
        outcome = cli.invoke(tokens2counts, [infile])
        produced = sorted(outcome.output.split('\n'))
        assert outcome.exit_code == 0
        compare_results(produced, expected)
def test_text2words_multifile():
    """text2words concatenates word tokens across multiple input files."""
    # NOTE(review): this redefines test_text2words_multifile from earlier in
    # the file; pytest will only collect this (last) definition.
    cli = CliRunner()
    with cli.isolated_filesystem():
        names = ['in.txt', 'in2.txt']
        texts = ('Hello World!\nI.\nnot sure where to go',
                 'Goodbye World!\n I.\n know everything about you')
        create_multifile_output(names, texts)
        outcome = cli.invoke(text2words, names)
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, [
            'Hello', 'World', '!', 'I.', 'not', 'sure', 'where', 'to', 'go',
            'Goodbye', 'World', '!', 'I.', 'know', 'everything', 'about', 'you'
        ])
def test_filterwords_custom():
    """filterwords --custom filters via a user-supplied stopword file.

    The custom list contains 'hello', and 'Hello' is removed from the
    output — presumably matching is case-insensitive; default stopwords
    (I, am, not, a) also appear to be filtered alongside the custom ones.
    """
    # NOTE(review): a second test_filterwords_custom appears later in this
    # file; only the last definition is collected by pytest.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        stopfile = 'custom.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        create_single_output(stopfile, 'hello\n')
        outcome = cli.invoke(filterwords, ['--custom', 'custom.txt', infile])
        assert outcome.exit_code == 0
        compare_results(outcome.output.split('\n'),
                        ['World', '!', 'crook', '.'])
def test_filterwords_custom():
    """filterwords --custom filters via a user-supplied stopword file.

    The custom list contains 'hello', and 'Hello' is removed from the
    output — presumably matching is case-insensitive; default stopwords
    (I, am, not, a) also appear to be filtered alongside the custom ones.
    """
    # NOTE(review): this redefines test_filterwords_custom from earlier in
    # the file; pytest will only collect this (last) definition.
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\nam\nnot\na\ncrook\n.')
        create_single_output('custom.txt', 'hello\n')
        outcome = cli.invoke(filterwords, ['--custom', 'custom.txt', infile])
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, ['World', '!', 'crook', '.'])
def test_filterlengths():
    """filterlengths keeps only tokens of at least the -m length (default 3)."""
    cli = CliRunner()
    with cli.isolated_filesystem():
        infile = 'in.txt'
        create_single_output(infile, 'Hello\nWorld\n!\nI\n.\nnot\nwin\n')

        # Default minimum length (3): drops '!', 'I', '.'.
        outcome = cli.invoke(filterlengths, [infile])
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, ['Hello', 'World', 'not', 'win'])

        # Minimum length of 4: additionally drops 'not' and 'win'.
        outcome = cli.invoke(filterlengths, ['-m', '4', infile])
        produced = outcome.output.split('\n')
        assert outcome.exit_code == 0
        compare_results(produced, ['Hello', 'World'])
def test_annotate_settings():
    """annotate() applies filter/sort rules when wrapping entities in anchors.

    Candidates include both u'Paris Hilton' (person) and u'Paris' (location);
    with this rule set only the bare 'Paris' in the last paragraph ends up
    wrapped in an anchor tag — the person candidate is left untouched.
    """
    TEXT = """<div><p>Paris Hilton wasn't going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in Paris.</p></div>"""
    RESULT = """<div><p>Paris Hilton wasn't going to let a bit of snowfall ruin her trip to New York City.</p><p>The Stars Are Blind singer, who was born in the Big Apple, was snapped Friday boarding a vehicle amid snowy conditions to check out the on-goings as fashion fever overtakes the city that never sleeps for New York Fashion Week.</p><p>The 35-year-old socialite matched well with a black and red leather jacket in <a class="anchorman" lemma="Paris" type="location">Paris</a>.</p></div>"""

    links = [
        {u'Paris': {'lemma': u'Paris', 'type': 'location'}},
        {u'Paris Hilton': {'lemma': u'Paris Hilton', 'type': 'person'}},
    ]

    cfg = get_config()
    cfg['markup'] = {
        'anchor_pattern':
            '<a class="anchorman" lemma="{lemma}" type="{type}">{token}</a>',
        'decorate_anchor_key': 'the_anchor'
    }

    rules = {
        'return_applied_links': True,
        # apply high score candidates first
        'sort_by_item_value': {
            'key': 'score',
            'default': 0
        },
        # 'replaces_per_element': {
        #     'number': 1,
        #     'key': 'lemma'
        # },
        # 'replaces_at_all': 5,  # self.max_links, not available
        'longest_match_first': False,
        # 'replaces': {
        #     'by_attribute': {
        #         'key': 'type',
        #         # 'value_per_unit': 1
        #         'value_overall': 2  # self.max_per_etype
        #     }
        # },
        'items_per_unit': 4,  # self.links_per_paragraph
        'filter_by_attribute': {
            'attributes': [{
                'key': 'type',
                'value': 'person'
            }]
        }
    }
    settings = {
        # "log_level": "DEBUG",
        "return_applied_links": True,
        # "forbidden_areas": {
        #     "tags": ["img", "a"],
        #     "classes": ["first", "p--heading-3"]
        # }
    }
    cfg['settings'].update(settings)
    cfg['rules'].update(rules)

    annotated, applied, rest = annotate(TEXT, links, config=cfg)

    from tests.utils import compare_results
    # Collapse runs of spaces in the expectation before comparing.
    RESULT = re.sub(" +", " ", RESULT)
    compare_results(annotated, RESULT)
    assert annotated == RESULT