class TestWordTokenizer(unittest.TestCase):
    '''Checks ``WordTokenizer`` against a single fixed sample sentence.'''

    def setUp(self):
        # Every test gets its own tokenizer and the same sample sentence.
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        # Nothing to clean up.
        pass

    def test_tokenize(self):
        # By default the trailing period is expected as its own token.
        expected = [
            'Python', 'is', 'a', 'high-level', 'programming', 'language', '.',
        ]
        tokens = self.tokenizer.tokenize(self.text)
        assert_equal(tokens, expected)

    def test_exclude_punc(self):
        # With include_punc=False the trailing period must not be returned.
        expected = [
            'Python', 'is', 'a', 'high-level', 'programming', 'language',
        ]
        tokens = self.tokenizer.tokenize(self.text, include_punc=False)
        assert_equal(tokens, expected)

    def test_itokenize(self):
        # Only the first two items produced by itokenize() are checked.
        token_iter = self.tokenizer.itokenize(self.text)
        first = next(token_iter)
        assert_equal(first, "Python")
        second = next(token_iter)
        assert_equal(second, "is")
class TestWordTokenizer(unittest.TestCase):
    '''Tokenization checks for ``WordTokenizer`` on one sample sentence.'''

    def setUp(self):
        # Fixture: a fresh tokenizer plus the sentence shared by all tests.
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        # No per-test cleanup is required.
        pass

    def test_tokenize(self):
        # Default call: the final period is expected as a separate token.
        result = self.tokenizer.tokenize(self.text)
        assert_equal(
            result,
            ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.'],
        )

    def test_exclude_punc(self):
        # include_punc=False: the final period is expected to be dropped.
        result = self.tokenizer.tokenize(self.text, include_punc=False)
        assert_equal(
            result,
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'],
        )
def analyze(self, text):
    """Classify the sentiment of ``text``.

    Returns a tuple of the form:
    ``(classification, pos_probability, neg_probability)``
    """
    # Delegate to the parent first; this lazily trains the classifier.
    super(NaiveBayesAnalyzer, self).analyze(text)

    words = WordTokenizer().tokenize(text, include_punc=False)
    # Keep only tokens of three or more characters, lowercased.
    normalized = [word.lower() for word in words if len(word) >= 3]

    features = self._extract_feats(normalized)
    distribution = self._classifier.prob_classify(features)

    label = distribution.max()
    pos_probability = distribution.prob('pos')
    neg_probability = distribution.prob("neg")
    return label, pos_probability, neg_probability
def correct(self):
    '''Attempt to correct the spelling of a blob.

    The raw text is tokenized (punctuation included), each token is passed
    through ``Word(...).correct()``, and the corrected tokens are joined
    back together. A single space is placed before every token except the
    first one and except tokens found in ``pystring.punctuation``; the
    original whitespace of the text is therefore not preserved.

    .. versionadded:: 0.6.0

    :rtype: BaseBlob
    '''
    tok = WordTokenizer()
    corrected = (Word(w).correct()
                 for w in tok.tokenize(self.raw, include_punc=True))
    punctuation = pystring.punctuation
    # Collect the pieces and join once at the end instead of rebuilding the
    # whole result string on every iteration.
    pieces = []
    for i, word in enumerate(corrected):
        # No separator before the first token or before punctuation.
        # NOTE(review): ``in`` is a substring test here, so a
        # multi-character token counts as punctuation only when it occurs
        # contiguously in ``pystring.punctuation`` -- confirm this is the
        # intended rule.
        if i != 0 and word not in punctuation:
            pieces.append(' ')
        pieces.append(word)
    return self.__class__(''.join(pieces))