Example 1
def test_key_phrase_tokenizer_returns_nested_key_phrase_markup_as_tokens(self):
    text = 'Redistribution {{is {{not}} really}} permitted.'
    assert list(key_phrase_tokenizer(text)) == [
        'redistribution', '{{', 'is', '{{', 'not', '}}', 'really', '}}',
        'permitted'
    ]
Example 2
def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_after_newline(self):
    text = '{{IS_RIGHT\nThis program is distributed under GPL\n}}IS_RIGHT'
    assert list(key_phrase_tokenizer(text)) == [
        '{{', 'is', 'right', 'this', 'program', 'is', 'distributed',
        'under', 'gpl', '}}', 'is', 'right'
    ]
Example 3
def test_key_phrase_tokenizer_lines_on_html_like_texts_2(self, regen=REGEN_TEST_FIXTURES):
    test_file = self.get_test_loc('tokenize/htmlish.html')
    expected_file = test_file + '.expected.key_phrase_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)
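Fixture-driven tests like this one (and Example 13 below) delegate comparison to a check_results helper with a regen flag that can rewrite the expected JSON fixture in place. The helper's real implementation lives in the test utilities; the following is only a hypothetical sketch of the regen pattern its call sites imply, with the helper name and JSON layout assumed:

import io
import json

# Hypothetical check_results-style helper, inferred from its call
# sites above (not scancode-toolkit's actual test utility). With
# regen=True the current results overwrite the expected JSON fixture;
# either way the fixture is then loaded and compared to the results.
def check_results_sketch(results, expected_file, regen=False):
    if regen:
        with io.open(expected_file, 'w', encoding='utf-8') as out:
            json.dump(results, out, indent=2)
    with io.open(expected_file, encoding='utf-8') as inp:
        expected = json.load(inp)
    assert results == expected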
Example 4
def test_key_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self):
    """
    It is important that the `key_phrase_tokenizer` returns the same number
    of tokens (excluding key_phrase markup) as the `index_tokenizer`, so
    that the Span positions derived from the tokens line up.
    """
    text = 'Redistribution \n\n comma and   use in \n\t binary \xe4r till\xe5tet.'

    key_phrase_tokens = key_phrase_tokenizer(text)
    index_tokens = index_tokenizer(text)

    assert list(key_phrase_tokens) == list(index_tokens)
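The token-count invariant checked in Example 4 is what allows key phrase regions to be mapped onto the same Span positions that index_tokenizer produces. As a hypothetical illustration (the helper name and the inclusive-end convention are assumptions, not scancode's actual API), a consumer only has to skip the markup tokens when counting positions:

def key_phrase_spans_sketch(tokens):
    # Collect (start, end) token positions of the words wrapped in
    # '{{' ... '}}'. Markup tokens do not advance the position
    # counter, so positions match index_tokenizer's numbering.
    spans = []
    pos = 0
    start = None
    for token in tokens:
        if token == '{{':
            start = pos
        elif token == '}}':
            if start is not None:
                spans.append((start, pos - 1))  # inclusive end
                start = None
        else:
            pos += 1
    return spans

# 'use', 'in' and 'binary' sit at positions 2, 3 and 4:
text = 'Redistribution and {{use in binary}} is permitted.'
assert key_phrase_spans_sketch(key_phrase_tokenizer(text)) == [(2, 4)]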
Example 5
def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_multiple_token_key_phrases(self):
    text = 'Redistribution and {{use in binary}} is permitted.'
    assert list(key_phrase_tokenizer(text)) == [
        'redistribution',
        'and',
        '{{',
        'use',
        'in',
        'binary',
        '}}',
        'is',
        'permitted',
    ]
Example 6
def test_key_phrase_tokenizer_ignores_invalid_key_phrase_markup(self):
    text = 'Redistribution {{{is not really}}} { {permitted} }, I am {afraid}.'
    assert list(key_phrase_tokenizer(text)) == [
        'redistribution', '{{', 'is', 'not', 'really', '}}', 'permitted',
        'i', 'am', 'afraid'
    ]
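Taken together, Examples 1 through 6 pin down the tokenizer's observable behavior: words are lowercased, punctuation and underscores split tokens and are dropped, stray single braces are ignored, and only the literal '{{' and '}}' pairs survive as markup tokens. A rough regex-based sketch of that behavior (an illustration only, not scancode-toolkit's actual implementation):

import re

# Emit '{{' or '}}' as standalone tokens and every run of letters or
# digits as a word. Underscores and single braces match nothing and
# are skipped, as in Example 6. A sketch, not the real
# key_phrase_tokenizer.
_TOKENS = re.compile(r'\{\{|\}\}|[^\W_]+')

def key_phrase_tokenizer_sketch(text):
    for match in _TOKENS.finditer(text.lower()):
        yield match.group()

assert list(key_phrase_tokenizer_sketch('Redistribution {{is}} permitted.')) == [
    'redistribution', '{{', 'is', '}}', 'permitted']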
Example 7
def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_for_single_token_key_phrase(self):
    text = 'Redistribution {{is}} permitted.'
    assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '}}', 'permitted']
Example 8
def test_key_phrase_tokenizer_returns_key_phrase_markup_as_tokens_when_separated_by_space(self):
    text = 'Redistribution {{ is }} permitted.'
    assert list(key_phrase_tokenizer(text)) == ['redistribution', '{{', 'is', '}}', 'permitted']
Example 9
def test_key_phrase_does_not_crash_on_unicode_rules_text_5(self):
    test_file = self.get_test_loc('tokenize/unicode/12420.txt')
    with io.open(test_file, encoding='utf-8') as test:
        list(key_phrase_tokenizer(test.read()))
Example 10
def test_key_phrase_tokenizer_handles_empty_lines(self):
    text = '\n\n'
    expected = []
    assert list(key_phrase_tokenizer(text)) == expected
Example 11
def test_key_phrase_tokenizer_handles_blank_lines2(self):
    text = ' \n\t  '
    result = list(key_phrase_tokenizer(text))
    assert result == []
Example 12
def test_key_phrase_tokenizer_handles_empty_string(self):
    text = ''
    result = list(key_phrase_tokenizer(text))
    assert result == []
Example 13
def test_key_phrase_tokenizer_on_html_like_texts(self, regen=False):
    test_file = self.get_test_loc('tokenize/htmlish.txt')
    expected_file = test_file + '.expected.key_phrase_tokenizer.json'
    lines = query_lines(test_file)
    result = [list(key_phrase_tokenizer(line)) for _ln, line in lines]
    check_results(result, expected_file, regen=regen)
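Both fixture tests also iterate query_lines(test_file) as (line_number, line) pairs. The real helper lives elsewhere in scancode and may do more; as a minimal sketch with that shape, assuming a plain UTF-8 text file:

import io

# Hypothetical query_lines-style generator inferred from its use in
# Examples 3 and 13: yield (line_number, line_text) pairs, 1-based.
def query_lines_sketch(location):
    with io.open(location, encoding='utf-8') as lines:
        for number, line in enumerate(lines, start=1):
            yield number, line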