def __init__(self, lemmatizer=None, stemmer=None, url_parser=None,
             unicode_form='NFKC', nltk_stop_words="english",
             sentence_tokenizer=('nltk_data', 'tokenizers/punkt/english.pickle'),
             max_char_repeats=3, lru_cache_size=50000, translate_map_inv=None,
             replace_map=None, html_renderer='default', add_abbrev_types=None,
             del_sent_starters=None):
    # Unicode normalization (NFKC by default) and optional in-place replacements
    self._unicode_normalize = partial(unicodedata.normalize, unicode_form)
    self._replace_inplace = InPlaceReplacer(replace_map).replace \
        if replace_map else lambda x: x

    self._tokenize = RegexpFeatureTokenizer().tokenize
    self._stopwords = frozenset(stopwords.words(nltk_stop_words))
    self._url_parser = url_parser

    # sentence tokenizer (e.g. NLTK Punkt), with optional abbreviation and
    # sentence-starter overrides
    self._sentence_tokenizer, self._sentence_tokenize = \
        self.load_sent_tokenizer(sentence_tokenizer, add_abbrev_types,
                                 del_sent_starters)
    self.sentence_tokenizer = None

    # optional lemmatization (LRU-cached) and stemming
    self._lemmatize = lru_wrap(lemmatizer.lemmatize, lru_cache_size) \
        if lemmatizer else None
    self._stem = stemmer.stem if stemmer else None
    self._pos_tag = pos_tag

    # collapse runs of repeated characters longer than `max_char_repeats`
    self._replace_char_repeats = \
        RepeatReplacer(max_repeats=max_char_repeats).replace \
        if max_char_repeats > 0 else self._identity

    # translation of Unicode characters
    translator = Translator(EXTRA_TRANSLATE_MAP, translated=True)
    translator.add_inverse_map(translate_map_inv, translated=False)
    self._replace_chars = translator.replace

    # choose an HTML-stripping backend
    if html_renderer is None:
        self.strip_html = lambda x: x
    elif html_renderer == u'default':
        self.strip_html = HTMLCleaner().clean
    elif html_renderer == u'beautifulsoup':
        self.strip_html = strip_html_bs
    else:
        raise ValueError('Invalid parameter value given for `html_renderer`')

    # tokenize a dummy string b/c the lemmatizer and/or other tools can take
    # a while to initialize, which would skew attempts to measure performance
    self.tokenize(u"dummy string")
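
# Standalone sketch (not part of the class above) of two of the library pieces
# the constructor wires up: NFKC Unicode normalization from the stdlib and
# NLTK's English stop word list. Assumes the NLTK "stopwords" corpus has been
# downloaded (nltk.download('stopwords')).
import unicodedata
from functools import partial
from nltk.corpus import stopwords

_normalize = partial(unicodedata.normalize, 'NFKC')  # as in self._unicode_normalize
_stop_words = frozenset(stopwords.words('english'))  # as in self._stopwords

print(_normalize(u'\ufb01le'))   # the "fi" ligature decomposes: prints "file"
print('the' in _stop_words)      # True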
def setUp(self):
    self.tokenizer = TOKENIZER
    self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
    self.sentence_tokenize = TOKENIZER.sentence_tokenize
    self.base_tokenizer = RegexpFeatureTokenizer(debug=True)
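
# Standalone sketch of the functools.partial pattern used in setUp above: it
# pre-binds remove_stopwords=False so tests can call self.tokenize(text)
# without repeating the keyword argument. The tokenize() function below is a
# hypothetical stand-in, not the project's actual tokenizer.
from functools import partial

def tokenize(text, remove_stopwords=True):
    words = text.split()
    return [w for w in words if not remove_stopwords or w not in ('the', 'a')]

keep_stopwords = partial(tokenize, remove_stopwords=False)
print(keep_stopwords("the quick fox"))   # ['the', 'quick', 'fox']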