def get_tokenizer(tag=None, chunkers=None, filters=None):
    """Locate an appropriate tokenizer by language tag.

    This requires importing the function 'tokenize' from an appropriate
    module.  Modules tried are named after the language tag, in the
    following order:

        * the entire tag (e.g. "en_AU.py")
        * the base language code of the tag (e.g. "en.py")

    If the language tag is None, a default tokenizer (actually the English
    one) is returned.  It is unicode-aware and should work reasonably well
    for most latin-derived languages.

    If a suitable function cannot be found, raises TokenizerNotFoundError.

    If given and not None, 'chunkers' and 'filters' must be lists of chunker
    classes and filter classes respectively.  These will be applied to the
    tokenizer during creation.
    """
    if tag is None:
        tag = "en"
    # "filters" used to be the second argument.  Try to catch cases
    # where it is given positionally and issue a DeprecationWarning.
    if chunkers is not None and filters is None:
        chunkers = list(chunkers)
        if chunkers:
            try:
                chunkers_are_filters = issubclass(chunkers[0], Filter)
            except TypeError:
                pass
            else:
                if chunkers_are_filters:
                    msg = "passing 'filters' as a non-keyword argument " \
                          "to get_tokenizer() is deprecated"
                    warnings.warn(msg, category=DeprecationWarning,
                                  stacklevel=2)
                    filters = chunkers
                    chunkers = None
    # Ensure only '_' is used as the separator.
    tag = tag.replace("-", "_")
    # First try the whole tag.
    tkFunc = _try_tokenizer(tag)
    if tkFunc is None:
        # Fall back to just the base language code.
        base = tag.split("_")[0]
        tkFunc = _try_tokenizer(base)
        if tkFunc is None:
            msg = "No tokenizer found for language '%s'" % (tag,)
            raise TokenizerNotFoundError(msg)
    # Given the language-specific tokenizer, build up the end result
    # as follows:
    #    * chunk the text using any given chunkers in turn
    #    * begin with basic whitespace tokenization
    #    * apply each of the given filters in turn
    #    * apply language-specific rules
    tokenizer = basic_tokenize
    if chunkers is not None:
        chunkers = list(chunkers)
        for i in range(len(chunkers) - 1, -1, -1):
            tokenizer = wrap_tokenizer(chunkers[i], tokenizer)
    if filters is not None:
        for f in filters:
            tokenizer = f(tokenizer)
    tokenizer = wrap_tokenizer(tokenizer, tkFunc)
    return tokenizer
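
# A minimal usage sketch (not part of the original function): it assumes
# the EmailFilter and URLFilter filter classes defined alongside this
# function, and that a tokenizer module is available for "en_US" (or its
# "en" base).  The returned tokenizer is called with a piece of text and
# yields (word, offset) pairs, with email addresses and URLs skipped by
# the wrapped filters.
if __name__ == "__main__":
    tknzr = get_tokenizer("en_US", filters=[EmailFilter, URLFilter])
    text = "Mail bob@example.com or see http://example.com for details."
    for word, pos in tknzr(text):
        print(word, pos)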