def test_tokenization_of_symbols_as_word_on_its_own(self):
    """
    Symbols on their own are not considered tokens (e.g. "H & M" would
    generate ["H", "M"])
    """
    # Each pair is (input text, tokens we expect back, order-insensitive).
    cases = (
        ("H & M", ["H", "M"]),
        ("Hello!", ["Hello", "Hello!"]),
    )
    for text, expected in cases:
        self.assertCountEqual(tokenize_content(text), expected)
def test_tokenization_of_special_symbols_within_a_word(self):
    """
    Symbols configured in `PUNCTUATION` and `SPECIAL_SYMBOLS` constants are
    considered stop words characters, but they do not count as token
    themselves. (e.g. "l'oreal" would generate
    ["l", "oreal", "l'oreal", "loreal"])
    """
    # `'` is configured in `SPECIAL_SYMBOLS`.
    # (Was a bare string expression — a no-op statement — now a real comment.)
    text = "l'oreal"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["l", "oreal", "l'oreal", "loreal"])
    text = "H*M"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["H", "M", "HM", "H*M"])
def test_tokenization_of_not_acronyms_pipes(self):
    """
    When a word contains WORD_DOCUMENT_JOIN_STRING special char, it's
    replaced by an EMPTY char.
    """
    result = tokenize_content("Tokenize multiple chars ||")
    self.assertCountEqual(result, ["Tokenize", "multiple", "chars"])
def test_tokenization_of_word_multiple_symbols(self):
    """
    When tokenizing a word, we also save its version with no symbols
    """
    expected = [
        "[[[[Tokenize]]]]",
        "[[[[Tokenize",
        "Tokenize]]]]",
        "Tokenize",
    ]
    self.assertCountEqual(tokenize_content("[[[[Tokenize]]]]"), expected)
def test_tokenization_of_repeated_words(self):
    """
    Repeated words are deduplicated: each distinct word contributes a
    single token regardless of how many times it appears in the text.

    (Previous docstring was copy-pasted from the hyphen/acronym tests and
    did not describe this test.)
    """
    text = "du du du da da da"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["du", "da"])
def test_tokenization_of_generic_symbols_within_a_word(self):
    """
    Symbols that are not listed in `PUNCTUATION` or `SPECIAL_SYMBOLS`
    constants are considered normal characters and they won't be stopwords
    (e.g. "H&M" would generate ["H&M", "HM"] — the original word plus its
    symbol-stripped variant).
    """
    # Docstring example previously claimed only ["H&M"], contradicting the
    # assertion below, which also expects the stripped variant "HM".
    text = "H&M"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["H&M", "HM"])
def test_tokenization_of_not_acronyms(self):
    """
    Words that have more than 1 letter and separated by an hyphen are
    not an acronym.
    """
    # NOTE(review): tests above assign the single return value of
    # tokenize_content directly, while this one unpacks two values
    # (tokens, new_tokens) — both cannot be right; confirm the function's
    # actual return signature.
    text = "This-is-not-an-acronym"
    tokens, new_tokens = tokenize_content(text)
    # Hyphens separating multi-letter words are emitted as standalone "-"
    # tokens rather than being kept inside one acronym-style token.
    self.assertCountEqual(
        tokens + new_tokens,
        ["This", "-", "is", "-", "not", "-", "an", "-", "acronym", ]
    )
def test_tokenization_of_dates(self):
    """
    Hyphens are stop characters except when they are part of a date
    (e.g. 2020-01-01)
    """
    # NOTE(review): tests above assign the single return value of
    # tokenize_content directly, while this one unpacks two values
    # (tokens, new_tokens) — both cannot be right; confirm the function's
    # actual return signature.
    text = "This is a date 2020-01-01"
    tokens, new_tokens = tokenize_content(text)
    # A date keeps its hyphenated form and also gets a digits-only and a
    # dot-separated variant.
    self.assertCountEqual(
        tokens + new_tokens,
        ["This", "is", "a", "date", "2020-01-01", "20200101", "2020.01.01", ]
    )
def test_tokenization_of_acronyms(self):
    """
    Hyphens are stop characters except when they are part of an acronym
    (e.g I-B-M), this handling also covers dates (e.g. 2020-01-01)
    """
    # NOTE(review): tests above assign the single return value of
    # tokenize_content directly, while this one unpacks two values
    # (tokens, new_tokens) — both cannot be right; confirm the function's
    # actual return signature.
    text = "This-is some text with - hyphens. I-B-M"
    tokens, new_tokens = tokenize_content(text)
    # Multi-letter hyphenated words split into word + "-" tokens, while the
    # single-letter acronym I-B-M is kept and gets joined/dotted variants.
    self.assertCountEqual(
        tokens + new_tokens,
        ["This", "-", "is", "some", "text", "with", "-", "hyphens", ".",
         "I-B-M", "IBM", "I.B.M"]
    )