def test_index_tokenizer_lines_on_html_like_texts_2(
        self, regen=REGEN_TEST_FIXTURES):
    """
    Run `index_tokenizer` over every query line of the
    'tokenize/htmlish.html' test file and compare the per-line token
    lists with the '.expected.index_tokenizer.json' file stored next to
    it. `regen` is passed through unchanged to `check_results`.
    """
    location = self.get_test_loc('tokenize/htmlish.html')
    expected_location = location + '.expected.index_tokenizer.json'

    # One list of tokens per query line; the line number is not needed.
    tokens_by_line = []
    for _line_number, text in query_lines(location):
        tokens_by_line.append(list(index_tokenizer(text)))

    check_results(tokens_by_line, expected_location, regen=regen)
    def test_key_phrase_tokenizer_returns_same_word_tokens_as_index_tokenizer(self):
        """
        The `key_phrase_tokenizer` must return the same amount of tokens
        (key_phrase markup excluded) as the `index_tokenizer`, so that the
        Span positions derived from the tokens line up.

        This is checked by asserting that both tokenizers produce equal
        token lists for the same sample text.
        """
        text = 'Redistribution \n\n comma and   use in \n\t binary \xe4r till\xe5tet.'

        # Materialize both token streams so they can be compared as lists.
        from_key_phrase_tokenizer = list(key_phrase_tokenizer(text))
        from_index_tokenizer = list(index_tokenizer(text))

        assert from_key_phrase_tokenizer == from_index_tokenizer
 def test_index_tokenizer_on_html_like_texts(self, regen=False):
     """
     Run `index_tokenizer` over every query line of the
     'tokenize/htmlish.txt' test file and compare the per-line token
     lists with the '.expected.index_tokenizer.json' file stored next to
     it. `regen` is passed through unchanged to `check_results`.
     """
     # NOTE(review): the sibling HTML-like test defaults `regen` to
     # REGEN_TEST_FIXTURES while this one hard-codes False -- confirm
     # whether the difference is intended.
     location = self.get_test_loc('tokenize/htmlish.txt')
     expected_location = location + '.expected.index_tokenizer.json'
     tokens_by_line = [
         list(index_tokenizer(text))
         for _line_number, text in query_lines(location)
     ]
     check_results(tokens_by_line, expected_location, regen=regen)