Example #1
 def test_nltk_punkt_sentence_tokenizer_is_used(self):
     text = "The wolf killed a duck. What a pity"
     with mock.patch.object(PunktSentenceTokenizer,
                            'span_tokenize') as nltk_sent:
         nltk_sent.return_value = [(0, 5)]
         en_tokenize_and_segment(text)
         nltk_sent.assert_called_once_with(text)
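The patching pattern above keeps the test independent of the real Punkt model. A minimal sketch of the same pattern, assuming the standard-library unittest.mock (the original suite may import the standalone mock package instead); PunktSentenceTokenizer is the real nltk class:

 from unittest import mock
 from nltk.tokenize.punkt import PunktSentenceTokenizer

 text = "The wolf killed a duck. What a pity"
 with mock.patch.object(PunktSentenceTokenizer, 'span_tokenize') as fake_spans:
     fake_spans.return_value = [(0, 5)]
     # While the patch is active, every PunktSentenceTokenizer instance sees the
     # mock, so nothing here touches a trained sentence model.
     spans = PunktSentenceTokenizer().span_tokenize(text)
     assert list(spans) == [(0, 5)]
     fake_spans.assert_called_once_with(text)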
Example #2
 def test_each_offset_is_the_exact_location_of_the_token_in_the_text(self):
     text = (u"John's bar is cool, right :) XD? "
             u"The wolf (starved to death), killed a duck.")
     tokens = en_tokenize_and_segment(text)['tokens']
     offsets = en_tokenize_and_segment(text)['spans']
     for tkn, off in zip(tokens, offsets):
         self.assertEqual(text[off:len(tkn) + off], tkn)
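The property being tested: each entry in 'spans' is the start offset of its token, so slicing the original text by the token's length reproduces the token exactly. A small sketch of the same arithmetic using nltk's WhitespaceTokenizer rather than the project's pipeline, purely to illustrate the span convention:

 from nltk.tokenize import WhitespaceTokenizer

 text = u"The wolf (starved to death), killed a duck."
 for start, end in WhitespaceTokenizer().span_tokenize(text):
     token = text[start:end]
     # Re-slicing from the start offset by the token's length gives the token back.
     assert text[start:start + len(token)] == token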
Example #3
 def test_sentences_with_big_text(self):
     text = (u"The Bastard Operator From Hell (BOFH), a fictional character "
             u"created by Simon Travaglia, is a rogue system administrator who "
             u"takes out his anger on users (often referred to as lusers), "
             u"colleagues, bosses, and anyone else who pesters him with their "
             u"pitiful user created \"problems\".\n"
             u"The BOFH stories were originally posted in 1992 to Usenet by "
             u"Travaglia, with some being reprinted in Datamation. They were "
             u"published weekly from 1995 to 1999 in Network Week and since 2000"
             u" they have been published most weeks in The Register. They were "
             u"also published in PC Plus magazine for a short time, and several"
             u" books of the stories have also been released.")
     tokenizer = _get_tokenizer()
     expected_sentences = [0]
     sentence_splitter = nltk.data.load("tokenizers/punkt/english.pickle")
     for i, j in sentence_splitter.span_tokenize(text):
         expected_sentences.append(
             len(list(tokenizer.span_tokenize(text[:j]))))
     sents = en_tokenize_and_segment(text)['sentences']
     self.assertEqual(expected_sentences, sents)
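What the loop above builds: for every sentence end reported by the Punkt splitter, the recorded boundary is the number of word tokens that occur before that character offset, so 'sentences' is a list of token indices. A rough sketch of the same computation, with nltk's WhitespaceTokenizer standing in for the project's _get_tokenizer() (assumes the punkt data has been downloaded):

 import nltk
 from nltk.tokenize import WhitespaceTokenizer

 text = u"The wolf killed a duck. What a pity."
 word_tokenizer = WhitespaceTokenizer()
 sentence_splitter = nltk.data.load("tokenizers/punkt/english.pickle")

 boundaries = [0]
 for _, end in sentence_splitter.span_tokenize(text):
     # Boundary = number of word tokens seen up to this sentence's end offset.
     boundaries.append(len(list(word_tokenizer.span_tokenize(text[:end]))))
 print(boundaries)  # something like [0, 5, 8]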
Example #4
 def test_there_is_an_offset_per_token(self):
     text = u"The wolf (starved to death), killed a duck."
     tokens = en_tokenize_and_segment(text)['tokens']
     offsets = en_tokenize_and_segment(text)['spans']
     self.assertEqual(len(tokens), len(offsets))
Example #5
 def check_expected_words_are_in_tokenization(self, text, expected_words):
     words = en_tokenize_and_segment(text)['tokens']
     for expected_word in expected_words:
         self.assertIn(expected_word, words)
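A hypothetical test using the helper above (not from the project): "wolf" and "killed" are plain space-delimited words, so any reasonable English word tokenizer should emit them as individual tokens.

 def test_space_delimited_words_are_tokens(self):
     self.check_expected_words_are_in_tokenization(
         u"The wolf killed a duck.", [u"wolf", u"killed"])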
Example #6
 def test_number_of_tokens_is_always_last(self):
     text = "The wolf killed a duck. What a pitty"
     pieces = en_tokenize_and_segment(text)
     sents = pieces['sentences']
     tkns = pieces['tokens']
     self.assertEqual(sents[-1], len(tkns))
Example #7
 def test_zero_is_all_even_if_no_tokens(self):
     text = ""
     sents = en_tokenize_and_segment(text)['sentences']
     self.assertEqual(sents, [0])
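Because 'sentences' always starts with 0 and ends with the token count, consuming it pairwise stays well defined even for empty input. A short sketch of that reading of the data structure (an assumption about intended use, not code from the project):

 tokens = []
 sentences = [0]
 # Consecutive boundaries delimit one sentence's tokens; an empty document
 # simply produces no sentence slices.
 per_sentence = [tokens[start:end] for start, end in zip(sentences, sentences[1:])]
 assert per_sentence == []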
Example #8
 def test_zero_is_always_included(self):
     text = "The wolf killed a duck. What a pity"
     sents = en_tokenize_and_segment(text)['sentences']
     self.assertEqual(sents[0], 0)
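Taken together, these tests pin down the shape of the dict returned by en_tokenize_and_segment: token strings under 'tokens', parallel start offsets under 'spans', and token-index sentence boundaries under 'sentences'. A hypothetical helper, not part of the project, restating the invariants in one place:

 def check_tokenization_result(text, pieces):
     tokens = pieces['tokens']
     spans = pieces['spans']
     sentences = pieces['sentences']
     assert len(tokens) == len(spans)                      # one offset per token
     for token, offset in zip(tokens, spans):
         assert text[offset:offset + len(token)] == token  # offsets locate tokens
     assert sentences[0] == 0                              # zero is always included
     assert sentences[-1] == len(tokens)                   # token count is always last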