def word_counts(self):
    '''Map each normalized word of this text to its number of occurrences.

    Every item of ``self.words`` is passed through ``lowerstrip`` before it
    is tallied, so the keys are the normalized forms rather than the raw
    tokens.

    :returns: A ``defaultdict(int)`` keyed by normalized word.
    '''
    frequencies = defaultdict(int)
    for raw_word in self.words:
        # Missing keys start at 0 thanks to defaultdict(int).
        frequencies[lowerstrip(raw_word)] += 1
    return frequencies
def contains_extractor(document):
    '''Build a feature dict flagging every distinct word in ``document``.

    :param document: Either a string, which is tokenized via
        ``WordTokenizer().itokenize(..., include_punc=False)`` and
        lowercased, or an iterable of words, each normalized with
        ``lowerstrip(w, all=False)``.
    :returns: A dict mapping ``u'contains(<word>)'`` to ``True`` for each
        distinct word.
    '''
    tokenizer = WordTokenizer()
    if not isinstance(document, basestring):
        # Already an iterable of words: just normalize each one.
        normalized = (lowerstrip(item, all=False) for item in document)
    else:
        raw_tokens = tokenizer.itokenize(document, include_punc=False)
        normalized = (token.lower() for token in raw_tokens)
    unique_words = set(normalized)

    features = {}
    for word in unique_words:
        features[u'contains({0})'.format(word)] = True
    return features
def __init__(self, text, tokenizer=None, pos_tagger=None, np_extractor=None,
             analyzer=None, parser=None, classifier=None, clean_html=False):
    '''Validate ``text``, store it, and set up the associated models.

    :param text: The string to wrap.
    :param tokenizer, pos_tagger, np_extractor, analyzer, parser, classifier:
        Forwarded unchanged to ``_initialize_models``.
    :param clean_html: No longer supported; any truthy value raises.
    :raises TypeError: If ``text`` is not a ``basestring`` instance.
    :raises NotImplementedError: If ``clean_html`` is truthy.
    '''
    text_is_string = isinstance(text, basestring)
    if not text_is_string:
        template = ('The `text` argument passed to `__init__(text)` '
                    'must be a string, not {0}')
        raise TypeError(template.format(type(text)))

    if clean_html:
        deprecation_notice = ("clean_html has been deprecated. "
                              "To remove HTML markup, use BeautifulSoup's "
                              "get_text() function")
        raise NotImplementedError(deprecation_notice)

    # ``raw`` and ``string`` both refer to the text exactly as given.
    self.raw = text
    self.string = text
    self.stripped = lowerstrip(self.raw, all=True)

    _initialize_models(self, tokenizer, pos_tagger, np_extractor,
                       analyzer, parser, classifier)
def basic_extractor(document, train_set):
    '''Report which words of ``train_set`` occur in ``document``.

    :param document: The text to extract features from. Can be a string or
        an iterable. A string is tokenized via
        ``WordTokenizer().itokenize(..., include_punc=False)`` and
        lowercased; an iterable has each item normalized with
        ``lowerstrip(w, all=False)``.
    :param train_set: Training data set, a list of tuples of the form
        ``(words, label)``.
    :returns: A dict mapping ``u'contains(<word>)'`` to a bool for every
        word returned by ``_get_words_from_dataset(train_set)``.
    '''
    tokenizer = WordTokenizer()
    vocabulary = _get_words_from_dataset(train_set)

    if not isinstance(document, basestring):
        # Already an iterable of words: just normalize each one.
        normalized = (lowerstrip(item, all=False) for item in document)
    else:
        raw_tokens = tokenizer.itokenize(document, include_punc=False)
        normalized = (token.lower() for token in raw_tokens)
    tokens = set(normalized)

    features = {}
    for word in vocabulary:
        features[u'contains({0})'.format(word)] = word in tokens
    return features
def test_lowerstrip(self):
    # lowerstrip() with default arguments, applied to the fixture text,
    # must produce exactly the expected string below.
    expected = 'this has punctuation'
    actual = lowerstrip(self.text)
    assert_equal(actual, expected)
def test_lowerstrip(self):
    # lowerstrip() with default arguments, applied to the fixture text,
    # must produce exactly the expected string below.
    expected = 'this. has. punctuation'
    actual = lowerstrip(self.text)
    assert_equal(actual, expected)