def test_tokenization_of_symbols_as_word_on_its_own(self):
    """
    Symbols on their own are not considered tokens (e.g. "H & M" would
    generate ["H", "M"])
    """
    # Each pair is (input text, tokens we expect back, order-insensitive).
    cases = (
        ("H & M", ["H", "M"]),
        ("Hello!", ["Hello", "Hello!"]),
    )
    for text, expected in cases:
        self.assertCountEqual(tokenize_content(text), expected)
def test_tokenization_of_special_symbols_within_a_word(self):
    """
    Symbols configured in `PUNCTUATION` and `SPECIAL_SYMBOLS` constants are
    considered stop words characters, but they do not count as token
    themselves. (e.g. "l'oreal" would generate
    ["l", "oreal", "l'oreal", "loreal"])
    """
    # `'` is configured in `SPECIAL_SYMBOLS`.
    # (Was a bare string expression — a no-op statement — now a real comment.)
    text = "l'oreal"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["l", "oreal", "l'oreal", "loreal"])
    text = "H*M"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["H", "M", "HM", "H*M"])
def test_tokenization_of_not_acronyms_pipes(self):
    """
    When a word contains WORD_DOCUMENT_JOIN_STRING special char, it's
    replaced by an EMPTY char.
    """
    result = tokenize_content("Tokenize multiple chars ||")
    self.assertCountEqual(result, ["Tokenize", "multiple", "chars"])
def test_tokenization_of_word_multiple_symbols(self):
    """
    When tokenizing a word, we also save its version with no symbols
    """
    expected = [
        "[[[[Tokenize]]]]",
        "[[[[Tokenize",
        "Tokenize]]]]",
        "Tokenize",
    ]
    self.assertCountEqual(tokenize_content("[[[[Tokenize]]]]"), expected)
def test_tokenization_of_repeated_words(self):
    """
    Repeated words are deduplicated: each distinct word contributes a
    single token regardless of how many times it appears in the text.

    (Previous docstring was copy-pasted from the hyphen/acronym tests and
    did not describe this test.)
    """
    text = "du du du da da da"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["du", "da"])
def test_tokenization_of_generic_symbols_within_a_word(self):
    """
    Symbols that are not listed in `PUNCTUATION` or `SPECIAL_SYMBOLS`
    constants are considered normal characters and they won't be stopwords
    (e.g. "H&M" would generate ["H&M", "HM"] — the original word plus its
    symbol-stripped variant).
    """
    # Docstring example previously claimed only ["H&M"], contradicting the
    # assertion below, which also expects the stripped variant "HM".
    text = "H&M"
    tokens = tokenize_content(text)
    self.assertCountEqual(tokens, ["H&M", "HM"])
def test_tokenization_of_not_acronyms(self):
    """
    Words that have more than 1 letter and separated by an hyphen are
    not an acronym.
    """
    # NOTE(review): tests above assign the single return value of
    # tokenize_content directly, while this one unpacks two values
    # (tokens, new_tokens) — both cannot be right; confirm the function's
    # actual return signature.
    text = "This-is-not-an-acronym"
    tokens, new_tokens = tokenize_content(text)
    # Hyphens separating multi-letter words are emitted as standalone "-"
    # tokens rather than being kept inside one acronym-style token.
    self.assertCountEqual(
        tokens + new_tokens,
        ["This", "-", "is", "-", "not", "-", "an", "-", "acronym", ]
    )
def test_tokenization_of_dates(self):
    """
    Hyphens are stop characters except when they are part of a date
    (e.g. 2020-01-01)
    """
    # NOTE(review): tests above assign the single return value of
    # tokenize_content directly, while this one unpacks two values
    # (tokens, new_tokens) — both cannot be right; confirm the function's
    # actual return signature.
    text = "This is a date 2020-01-01"
    tokens, new_tokens = tokenize_content(text)
    # A date keeps its hyphenated form and also gets a digits-only and a
    # dot-separated variant.
    self.assertCountEqual(
        tokens + new_tokens,
        ["This", "is", "a", "date", "2020-01-01", "20200101", "2020.01.01", ]
    )
def test_tokenization_of_acronyms(self):
    """
    Hyphens are stop characters except when they are part of an acronym
    (e.g I-B-M), this handling also covers dates (e.g. 2020-01-01)
    """
    # NOTE(review): tests above assign the single return value of
    # tokenize_content directly, while this one unpacks two values
    # (tokens, new_tokens) — both cannot be right; confirm the function's
    # actual return signature.
    text = "This-is some text with - hyphens. I-B-M"
    tokens, new_tokens = tokenize_content(text)
    # Multi-letter hyphenated words split into word + "-" tokens, while the
    # single-letter acronym I-B-M is kept and gets joined/dotted variants.
    self.assertCountEqual(
        tokens + new_tokens,
        ["This", "-", "is", "some", "text", "with", "-", "hyphens", ".",
         "I-B-M", "IBM", "I.B.M"]
    )