def test_key_phrase_tokenizer_returns_nested_key_phrase_markup_as_tokens(self):
    text = 'Redistribution {{is {{not}} really}} permitted.'
    assert list(key_phrase_tokenizer(text)) == [
        'redistribution', '{{', 'is', '{{', 'not', '}}', 'really', '}}', 'permitted'
    ]

def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_after_newline(self):
    # Note that the underscore in 'IS_RIGHT' acts as a word separator: each
    # occurrence is split into the two tokens 'is' and 'right'.
    text = '{{IS_RIGHT\nThis program is distributed under GPL\n}}IS_RIGHT'
    assert list(key_phrase_tokenizer(text)) == [
        '{{', 'is', 'right', 'this', 'program', 'is', 'distributed', 'under', 'gpl',
        '}}', 'is', 'right'
    ]

def test_key_phrase_tokenizer_lines_on_html_like_texts_2(self, regen=REGEN_TEST_FIXTURES):
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.key_phrase_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)

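# Note: REGEN_TEST_FIXTURES is assumed here to be the shared test-suite flag
# that, when truthy, makes check_results() rewrite the expected JSON fixture
# instead of asserting against it. This is an assumption inferred from how the
# regen= argument is threaded through these tests, not confirmed by this file.
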
def test_key_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self):
    """
    It is important that the `key_phrase_tokenizer` returns the same number
    of tokens (excluding key phrase markup) as the `index_tokenizer` so that
    the Span positions derived from the tokens line up.
    """
    text = 'Redistribution \n\n comma and use in \n\t binary \xe4r till\xe5tet.'
    key_phrase_tokens = key_phrase_tokenizer(text)
    index_tokens = index_tokenizer(text)
    assert list(key_phrase_tokens) == list(index_tokens)

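# Illustration only (not part of the original suite): the docstring above
# states the property that makes key-phrase position extraction possible.
# The sketch below shows one hypothetical way to derive (start, end) word
# positions for each {{...}} region by counting only word tokens, so the
# positions match the index_tokenizer stream. The helper name and logic are
# assumptions for illustration, not the actual scancode implementation.
def _example_key_phrase_positions(text):
    positions = []
    pos = -1  # index of the last word token seen
    start = None
    for token in key_phrase_tokenizer(text):
        if token == '{{':
            start = pos + 1  # key phrase starts at the next word token
        elif token == '}}':
            if start is not None and start <= pos:
                positions.append((start, pos))
            start = None
        else:
            pos += 1  # markers excluded, so pos tracks index_tokenizer positions

    return positions

# For example, _example_key_phrase_positions('Redistribution {{is}} permitted.')
# would return [(1, 1)]: the position of 'is' in the index_tokenizer stream.
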
def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_multiple_token_key_phrases(self):
    text = 'Redistribution and {{use in binary}} is permitted.'
    assert list(key_phrase_tokenizer(text)) == [
        'redistribution', 'and', '{{', 'use', 'in', 'binary', '}}', 'is', 'permitted',
    ]

def test_key_phrase_tokenizer_ignores_invalid_key_phrase_markup(self):
    # A triple brace still yields a single '{{' or '}}' marker token; single
    # braces are not markup and are dropped like other punctuation.
    text = 'Redistribution {{{is not really}}} { {permitted} }, I am {afraid}.'
    assert list(key_phrase_tokenizer(text)) == [
        'redistribution', '{{', 'is', 'not', 'really', '}}', 'permitted', 'i', 'am', 'afraid'
    ]

def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_single_token_key_phrase(self):
    text = 'Redistribution {{is}} permitted.'
    assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '}}', 'permitted']

def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_when_separated_by_space(self):
    text = 'Redistribution {{ is }} permitted.'
    assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '}}', 'permitted']

def test_key_phrase_does_not_crash_on_unicode_rules_text_5(self):
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with io.open(test_file, encoding='utf-8') as test:
        list(key_phrase_tokenizer(test.read()))

def test_key_phrase_tokenizer_handles_empty_lines(self):
    text = '\n\n'
    expected = []
    assert list(key_phrase_tokenizer(text)) == expected

def test_key_phrase_tokenizer_handles_blank_lines2(self):
    text = ' \n\t '
    result = list(key_phrase_tokenizer(text))
    assert result == []

def test_key_phrase_tokenizer_handles_empty_string(self):
    text = ''
    result = list(key_phrase_tokenizer(text))
    assert result == []

def test_key_phrase_tokenizer_on_html_like_texts(self, regen=REGEN_TEST_FIXTURES):
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.key_phrase_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)
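
# Quick interactive illustration (by analogy with the tests above, not an
# additional assertion): words are lowercased, punctuation is dropped, and
# '{{' / '}}' are emitted as standalone tokens:
#     >>> list(key_phrase_tokenizer('Covered {{Work}}.'))
#     ['covered', '{{', 'work', '}}']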