def __init__(self, doc=None, regex=CRE_TOKEN, strip=True, nonwords=False, nonwords_set=None,
             nonwords_regex=RE_NONWORD, lower=None, stem=None, ngrams=1):
    """Store the tokenizer configuration and the initial document.

    Args:
      doc: initial document; stored on the instance as `to_ascii(doc)`.
      regex: token pattern stored as `self.regex`; a pattern string is compiled
        with `re.compile`, anything else is stored unchanged.
      strip: a string (stored as the set of characters to strip), `True` (use
        the built-in character set), a callable (stored as `self.strip`), or a
        falsy value (no strip function).
      nonwords: stored as `self.nonwords` (flag for using the default nonwords
        regex). When truthy and `nonwords_regex` is not a string, it is also
        tried as an iterable of nonwords.
      nonwords_set: initial set of nonwords; a falsy value becomes an empty set.
      nonwords_regex: stored as `self.nonwords_regex`; a pattern string is
        compiled with `re.compile`.
      lower: a callable (stored as `self.lower`), a truthy flag (selects
        `str_lower`), or a falsy value (no lowercasing function).
      stem: handed to `make_named_stemmer`, which supplies both
        `self.stemmer_name` and `self.stem`.
      ngrams: ngram degree; a falsy value falls back to 1.
    """
    # Characters to strip from tokens: caller-supplied string, built-in default, or none.
    if isinstance(strip, basestring):
        chars_to_strip = strip
    elif strip is True:
        chars_to_strip = '-_*`()"' + '"'
    else:
        chars_to_strip = None
    self.strip_chars = chars_to_strip

    # Strip function: a truthy callable is used as-is, any other truthy value
    # selects str_strip, and a falsy value disables it.
    # NOTE(review): a non-empty string `strip` therefore still ends up with
    # str_strip here, although strip_chars already carries the stripping
    # config -- confirm this is intended.
    if not strip:
        self.strip = None
    elif callable(strip):
        self.strip = strip
    else:
        self.strip = str_strip

    self.doc = to_ascii(doc)

    # Accept either a pattern string or an already-compiled pattern.
    self.regex = re.compile(regex) if isinstance(regex, basestring) else regex

    self.nonwords = nonwords  # whether to use the default regex for nonwords
    self.nonwords_set = nonwords_set or set()
    self.nonwords_regex = nonwords_regex

    if callable(lower):
        self.lower = lower
    elif lower:
        self.lower = str_lower
    else:
        self.lower = None

    # `stem` can be a callable Stemmer instance or just a function.
    stemmer_name, stemmer = make_named_stemmer(stem)
    self.stemmer_name = stemmer_name
    self.stem = stemmer

    self.ngrams = ngrams or 1  # ngram degree

    # NOTE(review): the nonwords handling below only runs when nonwords_regex
    # is NOT a string -- confirm that coupling is intended.
    if isinstance(nonwords_regex, basestring):
        self.nonwords_regex = re.compile(nonwords_regex)
    elif nonwords:
        try:
            self.nonwords_set = set(nonwords)
        except TypeError:
            # `nonwords` was truthy but not iterable: fall back to a small
            # built-in set of nonwords.
            self.nonwords_set = {'None', 'none', 'and', 'but'}
            # If a set of nonwords has been provided, don't use the internal
            # nonwords regex?
            # NOTE(review): `nonwords` is truthy on this path, so this always
            # stores False; also confirm the statement belongs inside this
            # except branch.
            self.nonwords = not bool(nonwords)
def __call__(self, doc):
    """Load a new document into the tokenizer and return the tokenizer itself.

    Tokenization is lazy: no tokens are generated here, only when the returned
    instance is iterated.

    >>> list(Tokenizer()('new string to parse'))
    ['new', 'string', 'to', 'parse']
    """
    ascii_doc = to_ascii(doc)
    self.doc = ascii_doc
    # Returning self is what lets the chained form work even though the
    # default doc is None: Tokenizer()('doc (str) to parse')
    return self