import unittest

import textblob as tb
from nose.tools import assert_equal, assert_true, assert_not_equal
from textblob.tokenizers import SentenceTokenizer
from textblob.np_extractors import ConllExtractor


class TestSentenceTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = SentenceTokenizer()
        self.text = "Beautiful is better than ugly. Simple is better than complex."

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
                     ["Beautiful is better than ugly.",
                      "Simple is better than complex."])

    def test_tokenize_with_multiple_punctuation(self):
        text = "Hello world. How do you do?! My name's Steve..."
        assert_equal(self.tokenizer.tokenize(text),
                     ["Hello world.", "How do you do?!", "My name's Steve..."])
        text2 = 'OMG! I am soooo LOL!!!'
        tokens = self.tokenizer.tokenize(text2)
        assert_equal(len(tokens), 2)
        assert_equal(tokens, ["OMG!", "I am soooo LOL!!!"])
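# For reference, a minimal sketch of the behavior these tests exercise.
# TextBlob's shipped SentenceTokenizer is NLTK-backed; the class below is a
# hypothetical stand-in for illustration only, assuming nltk.tokenize.sent_tokenize,
# which keeps trailing punctuation ("?!", "...", "!!!") attached to each sentence.
import nltk


class NaiveSentenceTokenizer(object):

    def tokenize(self, text):
        # Return a list of sentence strings, punctuation included.
        return nltk.tokenize.sent_tokenize(text)

    def itokenize(self, text):
        # Lazy variant of tokenize(), mirroring the itokenize() call used
        # by _create_sentence_objects below.
        return (sentence for sentence in self.tokenize(text))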
def test_overrides(self):
    b = tb.Blobber(tokenizer=SentenceTokenizer(), np_extractor=ConllExtractor())
    blob = b("How now? Brown cow?")
    assert_true(isinstance(blob.tokenizer, SentenceTokenizer))
    assert_equal(blob.tokens, tb.WordList(["How now?", "Brown cow?"]))
    blob2 = b("Another blob")
    # blobs have the same tokenizer
    assert_true(blob.tokenizer is blob2.tokenizer)
    # but aren't the same object
    assert_not_equal(blob, blob2)
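# Usage sketch for the sharing behavior asserted above: a Blobber acts as a
# factory that configures its models once and reuses them for every blob it
# creates, so a resource like ConllExtractor is not rebuilt per document.
# The input strings below are illustrative.
b = tb.Blobber(tokenizer=SentenceTokenizer(), np_extractor=ConllExtractor())
for doc in ("How now? Brown cow?", "Another blob"):
    blob = b(doc)       # each blob reuses the same tokenizer instance
    print(blob.tokens)  # tokens produced by the shared SentenceTokenizer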
def _create_sentence_objects(self):
    '''Returns a list of Sentence objects from the blob's raw text.

    Attempts to handle sentences that have more than one punctuation mark
    at the end of the sentence, e.g. "An ellipsis is no problem..." or
    "This is awesome!!!"
    '''
    sent_tokenizer = SentenceTokenizer()
    sentence_objects = []
    sentences = sent_tokenizer.itokenize(self.raw)
    char_index = 0  # Keeps track of the character index within the blob
    for sent in sentences:
        # Compute the start and end indices of the sentence within the blob
        start_index = self.raw.index(sent, char_index)
        char_index += len(sent)
        end_index = start_index + len(sent)
        # Sentences share the same models as their parent blob
        s = Sentence(sent, start_index=start_index, end_index=end_index,
                     tokenizer=self.tokenizer, np_extractor=self.np_extractor,
                     pos_tagger=self.pos_tagger, analyzer=self.analyzer,
                     parser=self.parser, classifier=self.classifier)
        sentence_objects.append(s)
    return sentence_objects
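# Usage sketch for the indices computed above: each Sentence records
# start_index/end_index into the parent blob's raw string, so slicing the
# raw text with them recovers the sentence verbatim. This assumes the
# public sentences property, which is backed by _create_sentence_objects.
blob = tb.TextBlob("An ellipsis is no problem... This is awesome!!!")
for s in blob.sentences:
    assert blob.raw[s.start_index:s.end_index] == s.raw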