Example #1
from typing import Iterable


def iter_layout_tokens_for_text(text: str,
                                tail_whitespace: str = ' ',
                                **kwargs) -> Iterable[LayoutToken]:
    # Groups tokenized text into LayoutToken objects, attaching the
    # whitespace that follows each token to that token.
    pending_text = ''
    pending_whitespace = ' '
    for token_text in iter_tokenized_tokens(text, keep_whitespace=True):
        if not token_text.strip():
            # accumulate whitespace until the next real token
            pending_whitespace += token_text
            continue
        if pending_text:
            yield LayoutToken(pending_text,
                              whitespace=pending_whitespace,
                              **kwargs)
        pending_text = token_text
        pending_whitespace = ''
    if pending_text:
        # the final token receives the configured tail whitespace
        pending_whitespace += tail_whitespace
        yield LayoutToken(pending_text,
                          whitespace=pending_whitespace,
                          **kwargs)
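For context, here is a minimal, self-contained sketch of how this generator behaves. `LayoutToken` and `iter_tokenized_tokens` are defined elsewhere in the project; the dataclass and the whitespace-only tokenizer below are simplified stand-ins, not the actual implementations (the tests below show more of the real tokenizer's behavior).

import re
from dataclasses import dataclass
from typing import Iterable


@dataclass
class LayoutToken:
    # stand-in: the project's real LayoutToken carries more layout metadata
    text: str
    whitespace: str = ' '


def iter_tokenized_tokens(text: str, keep_whitespace: bool = False) -> Iterable[str]:
    # stand-in tokenizer: split on runs of whitespace, optionally keeping them
    for token in re.split(r'(\s+)', text):
        if token and (keep_whitespace or token.strip()):
            yield token


for layout_token in iter_layout_tokens_for_text('token1 \n token2'):
    print(repr(layout_token.text), repr(layout_token.whitespace))
# 'token1' ' \n '
# 'token2' ' '

Note how the whitespace between the two tokens is attached to the first token, while the last token gets the tail_whitespace default of a single space.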
Example #2
def test_should_split_on_thin_space(self):
    assert list(iter_tokenized_tokens('token1\u2009token2')) == [
        'token1', 'token2'
    ]
Example #3
def test_should_split_on_regular_space(self):
    assert list(iter_tokenized_tokens('token1 token2')) == ['token1', 'token2']
Example #4
def test_should_preserve_line_feed(self):
    assert list(iter_tokenized_tokens('token1\ntoken2', keep_whitespace=True)) == [
        'token1', '\n', 'token2'
    ]
Example #5
def test_should_preserve_space(self):
    assert list(iter_tokenized_tokens('token1 token2', keep_whitespace=True)) == [
        'token1', ' ', 'token2'
    ]
Example #6
def test_should_split_on_line_feed(self):
    assert list(iter_tokenized_tokens('token1\ntoken2')) == ['token1', 'token2']
Example #7
def get_normalized_key_tokens(text: str):
    # normalize each non-whitespace token, e.g. for building comparison keys
    return [
        get_normalized_key_text(token) for token in iter_tokenized_tokens(text)
        if token.strip()
    ]
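`get_normalized_key_text` is likewise defined elsewhere in the project; a plausible stand-in that lower-cases tokens and strips non-alphanumeric characters is sketched below to illustrate the intent, reusing the stand-in `iter_tokenized_tokens` from the first sketch.

import re


def get_normalized_key_text(text: str) -> str:
    # stand-in: lower-case and drop non-alphanumeric characters
    return re.sub(r'[^a-z0-9]', '', text.lower())


print(get_normalized_key_tokens('Figure 1:'))
# ['figure', '1']  (with the whitespace-only stand-in tokenizer above)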