Example #1
    def test_tokenization_of_symbols_as_word_on_its_own(self):
        """
            Symbols on their own are not considered tokens
            (e.g. "H & M" would generate ["H", "M"]), while a word with
            attached punctuation keeps both forms
            (e.g. "Hello!" would generate ["Hello", "Hello!"])
        """
        text = "H & M"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["H", "M"])
        text = "Hello!"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["Hello", "Hello!"])
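
A minimal sketch of the behaviour asserted above, using a hypothetical
sketch_tokenize helper (not the real tokenize_content): standalone symbols
are dropped, while a word with attached punctuation keeps both its stripped
and its original form.

import string

def sketch_tokenize(text):
    # Hypothetical sketch -- mimics the assertions above, not the library.
    tokens = []
    for word in text.split():
        stripped = word.strip(string.punctuation)
        if not stripped:
            continue  # the word was nothing but symbols, e.g. "&"
        tokens.append(stripped)
        if stripped != word:
            tokens.append(word)  # keep the punctuated original too
    return tokens

print(sketch_tokenize("H & M"))   # ['H', 'M']
print(sketch_tokenize("Hello!"))  # ['Hello', 'Hello!']
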
Example #2
    def test_tokenization_of_special_symbols_within_a_word(self):
        """
            Symbols configured in the `PUNCTUATION` and `SPECIAL_SYMBOLS` constants
            are considered stop word characters, but they do not count as tokens
            themselves.
            (e.g. "l'oreal" would generate ["l", "oreal", "l'oreal", "loreal"])
        """

        # `'` is configured in `SPECIAL_SYMBOLS`
        text = "l'oreal"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["l", "oreal", "l'oreal", "loreal"])

        text = "H*M"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["H", "M", "HM", "H*M"])
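
A minimal sketch of the same rule, assuming for illustration that
SPECIAL_SYMBOLS contains "'" and "*" (sketch_tokenize_word is hypothetical,
not the library's implementation): a special symbol inside a word yields the
split parts, the original word, and a joined form with the symbol removed.

SPECIAL_SYMBOLS = "'*"  # illustrative assumption, not the real constant

def sketch_tokenize_word(word):
    if not any(c in SPECIAL_SYMBOLS for c in word):
        return [word]
    parts = word
    for symbol in SPECIAL_SYMBOLS:
        parts = parts.replace(symbol, " ")
    parts = parts.split()
    joined = "".join(parts)  # the word with the symbols stripped out
    return parts + [word, joined]

print(sketch_tokenize_word("l'oreal"))  # ['l', 'oreal', "l'oreal", 'loreal']
print(sketch_tokenize_word("H*M"))      # ['H', 'M', 'H*M', 'HM']
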
Example #3
    def test_tokenization_of_not_acronyms_pipes(self):
        """
            When a word contains the `WORD_DOCUMENT_JOIN_STRING` special char,
            it is replaced by an empty string rather than emitted as a token.
        """
        text = "Tokenize    multiple chars ||"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["Tokenize", "multiple", "chars"])
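
A minimal sketch, assuming WORD_DOCUMENT_JOIN_STRING is "|" (an assumption
for illustration only): the join string is reserved for internal use, so it
is stripped from the input before splitting instead of becoming a token.

WORD_DOCUMENT_JOIN_STRING = "|"  # illustrative assumption

def sketch_strip_join_string(text):
    # Drop the reserved join string, then split on whitespace.
    return text.replace(WORD_DOCUMENT_JOIN_STRING, "").split()

print(sketch_strip_join_string("Tokenize    multiple chars ||"))
# ['Tokenize', 'multiple', 'chars']
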
Example #4
    def test_tokenization_of_word_multiple_symbols(self):
        """
            When tokenizing a word wrapped in symbols, we also save the
            variants with the leading, trailing, and all symbols stripped
        """
        text = "[[[[Tokenize]]]]"
        tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens,
            ["[[[[Tokenize]]]]", "[[[[Tokenize", "Tokenize]]]]", "Tokenize"])
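
A minimal sketch of the variant generation above, using Python's
string.punctuation as a stand-in for the library's symbol set: for a word
wrapped in symbols we keep the original plus the forms with the leading,
the trailing, and all surrounding symbols removed.

import string

def sketch_symbol_variants(word):
    variants = {
        word,                             # original form
        word.lstrip(string.punctuation),  # leading symbols removed
        word.rstrip(string.punctuation),  # trailing symbols removed
        word.strip(string.punctuation),   # all surrounding symbols removed
    }
    return sorted(variants)

print(sketch_symbol_variants("[[[[Tokenize]]]]"))
# ['Tokenize', 'Tokenize]]]]', '[[[[Tokenize', '[[[[Tokenize]]]]']
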
Example #5
    def test_tokenization_of_repeated_words(self):
        """
            Repeated words are deduplicated: each distinct word yields
            a single token (e.g. "du du du da da da" would generate
            ["du", "da"])
        """
        text = "du du du da da da"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["du", "da"])
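
A minimal sketch of the deduplication above (sketch_dedupe is hypothetical):
repeated words collapse to a single token each, which is why
assertCountEqual sees exactly one "du" and one "da".

def sketch_dedupe(text):
    seen = []
    for word in text.split():
        if word not in seen:
            seen.append(word)  # keep the first occurrence only
    return seen

print(sketch_dedupe("du du du da da da"))  # ['du', 'da']
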
Example #6
    def test_tokenization_of_generic_symbols_within_a_word(self):
        """
            Symbols that are not listed in the `PUNCTUATION` or `SPECIAL_SYMBOLS`
            constants are considered normal characters and do not act as stop
            characters
            (e.g. "H&M" would generate ["H&M", "HM"] rather than ["H", "M"])
        """
        text = "H&M"
        tokens = tokenize_content(text)
        self.assertCountEqual(tokens, ["H&M", "HM"])
    def test_tokenization_of_not_acronyms(self):
        """
            Words that have more than 1 letter and are separated by a
            hyphen are not an acronym.
        """
        text = "This-is-not-an-acronym"
        tokens, new_tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens + new_tokens,
            ["This", "-", "is", "-", "not", "-", "an", "-", "acronym", ]
        )
    def test_tokenization_of_dates(self):
        """
            Hyphens are stop characters except when they are part
            of a date (e.g. 2020-01-01)
        """
        text = "This is a date 2020-01-01"
        tokens, new_tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens + new_tokens,
            ["This", "is", "a", "date", "2020-01-01", "20200101", "2020.01.01", ]
        )
    def test_tokenization_of_acronyms(self):
        """
            Hyphens are stop characters except when they are part
            of an acronym (e.g. I-B-M); this handling also covers dates
            (e.g. 2020-01-01)
        """
        text = "This-is some text with - hyphens. I-B-M"
        tokens, new_tokens = tokenize_content(text)
        self.assertCountEqual(
            tokens + new_tokens,
            ["This", "-", "is", "some", "text", "with", "-", "hyphens", ".", "I-B-M", "IBM", "I.B.M"]
        )
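
A minimal sketch of the hyphen rule exercised by the three tests above,
using illustrative regexes (not the library's actual patterns): a hyphen
normally splits the word and is kept as its own token, but single-letter
runs like "I-B-M" and dates like "2020-01-01" stay intact and also gain
joined and dotted variants.

import re

ACRONYM_RE = re.compile(r"^(?:[A-Za-z]-)+[A-Za-z]$")  # e.g. I-B-M
DATE_RE = re.compile(r"^\d{4}-\d{2}-\d{2}$")          # e.g. 2020-01-01

def sketch_hyphen_tokens(word):
    if ACRONYM_RE.match(word) or DATE_RE.match(word):
        parts = word.split("-")
        # Keep the original plus the joined and dotted variants.
        return [word, "".join(parts), ".".join(parts)]
    # Otherwise every hyphen is a stop character and a token of its own.
    out = []
    pieces = word.split("-")
    for i, piece in enumerate(pieces):
        if piece:
            out.append(piece)
        if i < len(pieces) - 1:
            out.append("-")
    return out

print(sketch_hyphen_tokens("I-B-M"))       # ['I-B-M', 'IBM', 'I.B.M']
print(sketch_hyphen_tokens("2020-01-01"))  # ['2020-01-01', '20200101', '2020.01.01']
print(sketch_hyphen_tokens("This-is"))     # ['This', '-', 'is']
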