Example #1
    def test_tokenization_of_symbols_as_word_on_its_own(self):
        """
            Symbols on their own are not considered tokens
            (e.g. "H & M" would generate ["H", "M"]), while a word with
            attached punctuation keeps both forms
            (e.g. "Hello!" would generate ["Hello", "Hello!"])
        """
        text = "H & M"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["H", "M"])
        text = "Hello!"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["Hello", "Hello!"])
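
A minimal sketch of the behaviour asserted above, using a hypothetical
sketch_tokenize helper (not the real tokenize_content): standalone symbols
are dropped, while a word with attached punctuation keeps both its stripped
and its original form.

import string

def sketch_tokenize(text):
    # Hypothetical sketch -- mimics the assertions above, not the library.
    tokens = []
    for word in text.split():
        stripped = word.strip(string.punctuation)
        if not stripped:
            continue  # the word was nothing but symbols, e.g. "&"
        tokens.append(stripped)
        if stripped != word:
            tokens.append(word)  # keep the punctuated original too
    return tokens

print(sketch_tokenize("H & M"))   # ['H', 'M']
print(sketch_tokenize("Hello!"))  # ['Hello', 'Hello!']
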
Example #2
    def test_tokenization_of_special_symbols_within_a_word(self):
        """
            Symbols configured in the `PUNCTUATION` and `SPECIAL_SYMBOLS` constants
            are considered stop word characters, but they do not count as tokens
            themselves.
            (e.g. "l'oreal" would generate ["l", "oreal", "l'oreal", "loreal"])
        """

        # `'` is configured in `SPECIAL_SYMBOLS`
        text = "l'oreal"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["l", "oreal", "l'oreal", "loreal"])

        text = "H*M"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["H", "M", "HM", "H*M"])
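
A minimal sketch of the same rule, assuming for illustration that
SPECIAL_SYMBOLS contains "'" and "*" (sketch_tokenize_word is hypothetical,
not the library's implementation): a special symbol inside a word yields the
split parts, the original word, and a joined form with the symbol removed.

SPECIAL_SYMBOLS = "'*"  # illustrative assumption, not the real constant

def sketch_tokenize_word(word):
    if not any(c in SPECIAL_SYMBOLS for c in word):
        return [word]
    parts = word
    for symbol in SPECIAL_SYMBOLS:
        parts = parts.replace(symbol, " ")
    parts = parts.split()
    joined = "".join(parts)  # the word with the symbols stripped out
    return parts + [word, joined]

print(sketch_tokenize_word("l'oreal"))  # ['l', 'oreal', "l'oreal", 'loreal']
print(sketch_tokenize_word("H*M"))      # ['H', 'M', 'H*M', 'HM']
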
Example #3
    def test_tokenization_of_not_acronyms_pipes(self):
        """
            When a word contains the `WORD_DOCUMENT_JOIN_STRING` special char,
            it is replaced by an empty string rather than emitted as a token.
        """
        text = "Tokenize    multiple chars ||"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["Tokenize", "multiple", "chars"])
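
A minimal sketch, assuming WORD_DOCUMENT_JOIN_STRING is "|" (an assumption
for illustration only): the join string is reserved for internal use, so it
is stripped from the input before splitting instead of becoming a token.

WORD_DOCUMENT_JOIN_STRING = "|"  # illustrative assumption

def sketch_strip_join_string(text):
    # Drop the reserved join string, then split on whitespace.
    return text.replace(WORD_DOCUMENT_JOIN_STRING, "").split()

print(sketch_strip_join_string("Tokenize    multiple chars ||"))
# ['Tokenize', 'multiple', 'chars']
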
Example #4
    def test_tokenization_of_word_multiple_symbols(self):
        """
            When tokenizing a word wrapped in symbols, we also save the
            variants with the leading, trailing, and all symbols stripped
        """
        text = "[[[[Tokenize]]]]"
        tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens,
            ["[[[[Tokenize]]]]", "[[[[Tokenize", "Tokenize]]]]", "Tokenize"])
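
A minimal sketch of the variant generation above, using Python's
string.punctuation as a stand-in for the library's symbol set: for a word
wrapped in symbols we keep the original plus the forms with the leading,
the trailing, and all surrounding symbols removed.

import string

def sketch_symbol_variants(word):
    variants = {
        word,                             # original form
        word.lstrip(string.punctuation),  # leading symbols removed
        word.rstrip(string.punctuation),  # trailing symbols removed
        word.strip(string.punctuation),   # all surrounding symbols removed
    }
    return sorted(variants)

print(sketch_symbol_variants("[[[[Tokenize]]]]"))
# ['Tokenize', 'Tokenize]]]]', '[[[[Tokenize', '[[[[Tokenize]]]]']
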
Example #5
    def test_tokenization_of_repeated_words(self):
        """
            Repeated words are deduplicated: each distinct word yields
            a single token (e.g. "du du du da da da" would generate
            ["du", "da"])
        """
        text = "du du du da da da"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["du", "da"])
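
A minimal sketch of the deduplication above (sketch_dedupe is hypothetical):
repeated words collapse to a single token each, which is why
assertCountEqual sees exactly one "du" and one "da".

def sketch_dedupe(text):
    seen = []
    for word in text.split():
        if word not in seen:
            seen.append(word)  # keep the first occurrence only
    return seen

print(sketch_dedupe("du du du da da da"))  # ['du', 'da']
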
Example #6
    def test_tokenization_of_generic_symbols_within_a_word(self):
        """
            Symbols that are not listed in the `PUNCTUATION` or `SPECIAL_SYMBOLS`
            constants are considered normal characters and do not act as stop
            characters
            (e.g. "H&M" would generate ["H&M", "HM"] rather than ["H", "M"])
        """
        text = "H&M"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["H&M", "HM"])
    def test_tokenization_of_not_acronyms(self):
        """
            Words that have more than 1 letter and are separated by a
            hyphen are not an acronym.
        """
        text = "This-is-not-an-acronym"
        tokens, new_tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens + new_tokens,
            ["This", "-", "is", "-", "not", "-", "an", "-", "acronym", ]
        )
    def test_tokenization_of_dates(self):
        """
            Hyphens are stop characters except when they are part
            of a date (e.g. 2020-01-01)
        """
        text = "This is a date 2020-01-01"
        tokens, new_tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens + new_tokens,
            ["This", "is", "a", "date", "2020-01-01", "20200101", "2020.01.01", ]
        )
    def test_tokenization_of_acronyms(self):
        """
            Hyphens are stop characters except when they are part
            of an acronym (e.g. I-B-M); this handling also covers dates
            (e.g. 2020-01-01)
        """
        text = "This-is some text with - hyphens. I-B-M"
        tokens, new_tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens + new_tokens,
            ["This", "-", "is", "some", "text", "with", "-", "hyphens", ".", "I-B-M", "IBM", "I.B.M"]
        )
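
A minimal sketch of the hyphen rule exercised by the three tests above,
using illustrative regexes (not the library's actual patterns): a hyphen
normally splits the word and is kept as its own token, but single-letter
runs like "I-B-M" and dates like "2020-01-01" stay intact and also gain
joined and dotted variants.

import re

ACRONYM_RE = re.compile(r"^(?:[A-Za-z]-)+[A-Za-z]$")  # e.g. I-B-M
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")          # e.g. 2020-01-01

def sketch_hyphen_tokens(word):
    if ACRONYM_RE.match(word) or DATE_RE.match(word):
        parts = word.split("-")
        # Keep the original plus the joined and dotted variants.
        return [word, "".join(parts), ".".join(parts)]
    # Otherwise every hyphen is a stop character and a token of its own.
    out = []
    pieces = word.split("-")
    for i, piece in enumerate(pieces):
        if piece:
            out.append(piece)
        if i < len(pieces) - 1:
            out.append("-")
    return out

print(sketch_hyphen_tokens("I-B-M"))       # ['I-B-M', 'IBM', 'I.B.M']
print(sketch_hyphen_tokens("2020-01-01"))  # ['2020-01-01', '20200101', '2020.01.01']
print(sketch_hyphen_tokens("This-is"))     # ['This', '-', 'is']
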