Example #1
    def __init__(self, lemmatizer=None, stemmer=None, url_parser=None,
                 unicode_form='NFKC', nltk_stop_words="english",
                 sentence_tokenizer=('nltk_data', 'tokenizers/punkt/english.pickle'),
                 max_char_repeats=3, lru_cache_size=50000, translate_map_inv=None,
                 replace_map=None, html_renderer='default', add_abbrev_types=None,
                 del_sent_starters=None):
        # core normalization, replacement, and word-tokenization helpers
        self._unicode_normalize = partial(unicodedata.normalize, unicode_form)
        self._replace_inplace = InPlaceReplacer(replace_map).replace \
            if replace_map else lambda x: x
        self._tokenize = RegexpFeatureTokenizer().tokenize
        self._stopwords = frozenset(stopwords.words(nltk_stop_words))
        self._url_parser = url_parser

        # sentence tokenizer (NLTK Punkt model by default), with optional
        # abbreviation and sentence-starter overrides
        self._sentence_tokenizer, self._sentence_tokenize = \
            self.load_sent_tokenizer(sentence_tokenizer, add_abbrev_types, del_sent_starters)

        self.sentence_tokenizer = None
        self._lemmatize = lru_wrap(lemmatizer.lemmatize, lru_cache_size) if lemmatizer else None
        self._stem = stemmer.stem if stemmer else None
        self._pos_tag = pos_tag
        # cap runs of repeated characters at `max_char_repeats`; disabled when <= 0
        self._replace_char_repeats = \
            RepeatReplacer(max_repeats=max_char_repeats).replace \
            if max_char_repeats > 0 else self._identity

        # translation of Unicode characters
        translator = Translator(EXTRA_TRANSLATE_MAP, translated=True)
        translator.add_inverse_map(translate_map_inv, translated=False)
        self._replace_chars = translator.replace

        if html_renderer is None:
            self.strip_html = lambda x: x
        elif html_renderer == u'default':
            self.strip_html = HTMLCleaner().clean
        elif html_renderer == u'beautifulsoup':
            self.strip_html = strip_html_bs
        else:
            raise ValueError('Invalid parameter value given for `html_renderer`')

        # tokenize a dummy string because the lemmatizer and other tools can take
        # a while to initialize, which would otherwise skew performance measurements
        self.tokenize(u"dummy string")
Example #2
    def setUp(self):
        self.tokenizer = TOKENIZER
        self.tokenize = partial(TOKENIZER.tokenize, remove_stopwords=False)
        self.sentence_tokenize = TOKENIZER.sentence_tokenize
        self.base_tokenizer = RegexpFeatureTokenizer(debug=True)
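The test fixture above uses functools.partial to fix remove_stopwords=False once, so every call in the test body keeps stop words. A small sketch of that binding, where tokenize_words is a hypothetical stand-in for TOKENIZER.tokenize:

from functools import partial

# hypothetical stand-in for TOKENIZER.tokenize in the fixture above
def tokenize_words(text, remove_stopwords=True):
    tokens = text.lower().split()
    if remove_stopwords:
        tokens = [t for t in tokens if t not in {u'the', u'a', u'an'}]
    return tokens

# partial() pre-binds the keyword argument, so stop words are always kept
tokenize = partial(tokenize_words, remove_stopwords=False)
print(tokenize(u'the quick brown fox'))  # ['the', 'quick', 'brown', 'fox']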