Example #1
class TestWordTokenizer(unittest.TestCase):

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")

    def test_word_tokenize(self):
        tokens = word_tokenize(self.text)
        assert_true(is_generator(tokens))
        assert_equal(list(tokens), self.tokenizer.tokenize(self.text))
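
These test methods use names whose imports are not shown (assert_equal and assert_true are nose.tools-style helpers, and is_generator looks like a local test utility). As a minimal sketch of the behaviour being tested, assuming only that the textblob package and its NLTK data are installed and that WordTokenizer and word_tokenize come from textblob.tokenizers:

from textblob.tokenizers import WordTokenizer, word_tokenize

text = "Python is a high-level programming language."
tokenizer = WordTokenizer()

# tokenize() returns a list of tokens; punctuation is kept by default
print(tokenizer.tokenize(text))
# ['Python', 'is', 'a', 'high-level', 'programming', 'language', '.']

# include_punc=False drops punctuation tokens
print(tokenizer.tokenize(text, include_punc=False))

# itokenize() and word_tokenize() yield tokens lazily
gen = tokenizer.itokenize(text)
print(next(gen), next(gen))  # Python is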
Example #2
class TestWordTokenizer(unittest.TestCase):

    '''An example unit test case.'''

    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language', '.'])

    def test_exclude_punc(self):
        assert_equal(self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming',
            'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
Example #3
def words(self):
    '''Return a list of word tokens. This excludes punctuation characters.
    If you want to include punctuation characters, access the ``tokens``
    property.
    '''
    # NLTK's word tokenizer expects sentences as input, so tokenize the
    # blob into sentences before tokenizing to words
    tok = WordTokenizer()
    words = chain.from_iterable(tok.itokenize(sent.raw, include_punc=False)
                                for sent in self.sentences)
    return WordList(words)
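
This is the property behind word access on text blobs. A quick usage sketch, assuming the textblob package:

from textblob import TextBlob

blob = TextBlob("Python is a high-level programming language. It is widely used.")
print(blob.words)   # word tokens only, punctuation excluded
print(blob.tokens)  # the ``tokens`` property mentioned above keeps punctuation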
Example #4
def contains_extractor(document):
    '''A basic document feature extractor that returns a dict of words that
    the document contains.
    '''
    tokenizer = WordTokenizer()
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set((strip_punc(w, all=False) for w in document))
    features = dict((u'contains({0})'.format(w), True) for w in tokens)
    return features
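
A short usage sketch of the extractor above; the import path textblob.classifiers is an assumption about where this function lives in current TextBlob releases:

from textblob.classifiers import contains_extractor

features = contains_extractor("Python is a high-level programming language.")
print(features)
# e.g. {'contains(Python)': True, 'contains(is)': True, 'contains(a)': True, ...}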
Example #5
def analyze(self, text):
    """Return the sentiment as a named tuple of the form:
    ``Sentiment(classification, p_pos, p_neg)``
    """
    # Lazily train the classifier
    super(NaiveBayesAnalyzer, self).analyze(text)
    tokenizer = WordTokenizer()
    tokens = tokenizer.itokenize(text, include_punc=False)
    filtered = (t.lower() for t in tokens if len(t) >= 3)
    feats = self._extract_feats(filtered)
    prob_dist = self._classifier.prob_classify(feats)
    return self.RETURN_TYPE(
        classification=prob_dist.max(),
        p_pos=prob_dist.prob("pos"),
        p_neg=prob_dist.prob("neg"),
    )
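
A sketch of how this analyzer is typically used, per TextBlob's documented API; the first run trains on NLTK's movie_reviews corpus, so that data must be available:

from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob("I love this library.", analyzer=NaiveBayesAnalyzer())
print(blob.sentiment)
# Sentiment(classification='pos', p_pos=..., p_neg=...)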
Example #6
def _get_words_from_dataset(dataset):
    '''Return a set of all words in a dataset.

    :param dataset: A list of tuples of the form ``(words, label)`` where
        ``words`` is either a string or a list of tokens.
    '''
    tokenizer = WordTokenizer()
    all_words = []
    for words, classification in dataset:
        # Words may either be a string or an iterable
        if isinstance(words, basestring):
            all_words.extend(tokenizer.itokenize(words, include_punc=False))
        else:
            all_words.extend(words)
    return set(all_words)
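
The ``(words, label)`` format described in the docstring is the same one TextBlob's classifiers accept. A small sketch with made-up training data, assuming NaiveBayesClassifier from textblob.classifiers:

from textblob.classifiers import NaiveBayesClassifier

train_set = [
    ("I love this sandwich.", 'pos'),            # words given as a string
    (["I", "do", "not", "like", "it"], 'neg'),   # or as a list of tokens
]
classifier = NaiveBayesClassifier(train_set)
print(classifier.classify("I love Python"))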
Example #7
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set((strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)))
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict(((u'contains({0})'.format(word), (word in tokens))
                     for word in word_features))
    return features
Example #8
def basic_extractor(document, train_set):
    '''A basic document feature extractor that returns a dict indicating
    what words in ``train_set`` are contained in ``document``.

    :param document: The text to extract features from. Can be a string or an iterable.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    '''
    tokenizer = WordTokenizer()
    word_features = _get_words_from_dataset(train_set)
    if isinstance(document, basestring):
        tokens = set([strip_punc(w, all=False)
                    for w in tokenizer.itokenize(document, include_punc=False)])
    else:
        tokens = set(strip_punc(w, all=False) for w in document)
    features = dict([(u'contains({0})'.format(word), (word in tokens))
                     for word in word_features])
    return features
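
What this extractor produces, as a sketch (same hedged assumption as above about importing from textblob.classifiers; the training data is illustrative):

from textblob.classifiers import basic_extractor

train_set = [("I love Python", 'pos'), ("I dislike Java", 'neg')]
print(basic_extractor("Python is a high-level language", train_set))
# {'contains(I)': False, 'contains(love)': False, 'contains(Python)': True, ...}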
Example #9
class TestWordTokenizer(unittest.TestCase):
    '''An example unit test case.'''
    def setUp(self):
        self.tokenizer = WordTokenizer()
        self.text = "Python is a high-level programming language."

    def tearDown(self):
        pass

    def test_tokenize(self):
        assert_equal(self.tokenizer.tokenize(self.text), [
            'Python', 'is', 'a', 'high-level', 'programming', 'language', '.'
        ])

    def test_exclude_punc(self):
        assert_equal(
            self.tokenizer.tokenize(self.text, include_punc=False),
            ['Python', 'is', 'a', 'high-level', 'programming', 'language'])

    def test_itokenize(self):
        gen = self.tokenizer.itokenize(self.text)
        assert_equal(next(gen), "Python")
        assert_equal(next(gen), "is")
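
To run a test case like this one directly, the standard library runner works once the class sits in a module with the imports it needs (a sketch; the module layout is assumed):

import unittest

if __name__ == '__main__':
    unittest.main()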
Example #10
def test_get_words_from_dataset():
    tok = WordTokenizer()
    all_words = []
    for words, _ in train_set:
        all_words.extend(tok.itokenize(words, include_punc=False))
    assert_equal(_get_words_from_dataset(train_set), set(all_words))