Python SentenceTokenizer Examples

Programming Language: Python

Namespace/Package Name: text.tokenizers

Examples at hotexamples.com: 7

Python SentenceTokenizer - 7 examples found. These are the top-rated real-world Python examples of text.tokenizers.SentenceTokenizer extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

SentenceTokenizer(2)

itokenize(1)

tokenize(1)

Example #1

Show file

class TestSentenceTokenizer(unittest.TestCase):
    """Checks how SentenceTokenizer.tokenize splits short English passages."""

    def setUp(self):
        # Every test gets its own tokenizer plus a plain two-sentence fixture.
        self.tokenizer = SentenceTokenizer()
        self.text = (
            "Beautiful is better than ugly. Simple is better than complex."
        )

    def test_tokenize(self):
        # The fixture is expected to split into its two sentences.
        actual = self.tokenizer.tokenize(self.text)
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(actual, expected)

    def test_tokenize_with_multiple_punctuation(self):
        # Runs of closing punctuation ("?!", "...", "!!!") are expected to
        # stay attached to the sentence they terminate.
        mixed = "Hello world. How do you do?! My name's Steve..."
        mixed_sentences = self.tokenizer.tokenize(mixed)
        assert_equal(
            mixed_sentences,
            ["Hello world.", "How do you do?!", "My name's Steve..."],
        )

        shouting = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(shouting)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])

Example #2

Show file

File: test_tokenizers.py Project: AydinSakar/TextBlob

class TestSentenceTokenizer(unittest.TestCase):
    """Checks that SentenceTokenizer.tokenize splits a two-sentence text."""

    def setUp(self):
        # Fresh tokenizer and sample text for each test.
        self.tokenizer = SentenceTokenizer()
        self.text = (
            "Beautiful is better than ugly. Simple is better than complex."
        )

    def test_tokenize(self):
        # Each sentence keeps its trailing full stop.
        sentences = self.tokenizer.tokenize(self.text)
        expected = [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ]
        assert_equal(sentences, expected)

Example #3

Show file

File: test_tokenizers.py Project: nicolargo/TextBlob

class TestSentenceTokenizer(unittest.TestCase):
    """Sentence-splitting tests for SentenceTokenizer.tokenize."""

    def setUp(self):
        # A new tokenizer and a simple two-sentence sample per test.
        self.tokenizer = SentenceTokenizer()
        self.text = (
            "Beautiful is better than ugly. Simple is better than complex."
        )

    def test_tokenize(self):
        # Plain text with two full stops should yield two sentences.
        result = self.tokenizer.tokenize(self.text)
        assert_equal(result, [
            "Beautiful is better than ugly.",
            "Simple is better than complex.",
        ])

    def test_tokenize_with_multiple_punctuation(self):
        # Sentences ending in several punctuation marks should not be split
        # at each individual mark.
        first_sample = "Hello world. How do you do?! My name's Steve..."
        first_result = self.tokenizer.tokenize(first_sample)
        first_expected = [
            "Hello world.",
            "How do you do?!",
            "My name's Steve...",
        ]
        assert_equal(first_result, first_expected)

        second_sample = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(second_sample)
        assert_equal(len(tokens), 2)
        second_expected = ["OMG!", "I am soooo LOL!!!"]
        assert_equal(tokens, second_expected)

Example #4

Show file

File: test_blob.py Project: syllog1sm/TextBlob

 def test_overrides(self):
     # Build a Blobber whose tokenizer and noun-phrase extractor are
     # supplied explicitly, then create two blobs from it.
     sentence_tokenizer = SentenceTokenizer()
     extractor = ConllExtractor()
     blobber = tb.Blobber(tokenizer=sentence_tokenizer,
                          np_extractor=extractor)

     first = blobber("How now? Brown cow?")
     assert_true(isinstance(first.tokenizer, SentenceTokenizer))
     actual_tokens = first.tokens
     expected_tokens = tb.WordList(["How now?", "Brown cow?"])
     assert_equal(actual_tokens, expected_tokens)

     second = blobber("Another blob")
     # Both blobs hold the very same tokenizer instance...
     assert_true(first.tokenizer is second.tokenizer)
     # ...yet the blobs themselves do not compare equal.
     assert_not_equal(first, second)

Example #5

Show file

File: blob.py Project: scraping-xx/TextBlob

 def _create_sentence_objects(self):
     '''Return a list of Sentence objects, one per sentence in ``self.raw``.

     The raw text is split with a SentenceTokenizer, and each sentence is
     located inside ``self.raw`` so that its start and end character
     indices can be stored on the Sentence. Attempts to handle sentences
     that have more than one punctuation mark at the end of the sentence.
     Examples: "An ellipses is no problem..." or "This is awesome!!!"

     Raises ValueError (from ``str.index``) if a tokenized sentence cannot
     be found in ``self.raw`` at or after the current search position.
     '''
     sent_tokenizer = SentenceTokenizer()
     sentence_objects = []
     sentences = sent_tokenizer.itokenize(self.raw)
     char_index = 0  # Position in the blob where the next search starts
     for sent in sentences:
         # Compute the start and end indices of the sentence
         # within the blob
         start_index = self.raw.index(sent, char_index)
         end_index = start_index + len(sent)
         # Resume searching right after the sentence just located.
         # Advancing by len(sent) alone would ignore the whitespace between
         # sentences, so the search position would lag behind and a
         # repeated sentence could match an earlier occurrence.
         char_index = end_index
         # Sentences share the same models as their parent blob
         s = Sentence(sent, start_index=start_index, end_index=end_index,
             tokenizer=self.tokenizer, np_extractor=self.np_extractor,
             pos_tagger=self.pos_tagger, analyzer=self.analyzer,
             parser=self.parser, classifier=self.classifier)
         sentence_objects.append(s)
     return sentence_objects

Example #6

Show file

 def setUp(self):
     # Per-test fixtures: a new tokenizer and a two-sentence sample text.
     self.tokenizer = SentenceTokenizer()
     self.text = (
         "Beautiful is better than ugly. Simple is better than complex."
     )

Example #7

Show file

File: test_tokenizers.py Project: nicolargo/TextBlob

 def setUp(self):
     # Create the tokenizer under test and the text the tests will split.
     self.tokenizer = SentenceTokenizer()
     self.text = (
         "Beautiful is better than ugly. "
         "Simple is better than complex."
     )