Example #1
def get_tokenizer(tag,chunkers=None,filters=None):
    """Locate an appropriate tokenizer by language tag.

    This requires importing the function 'tokenize' from an
    appropriate module.  Modules are named after the language tag
    and are tried in the following order:
        * the entire tag (e.g. "en_AU.py")
        * the base language code of the tag (e.g. "en.py")

    If a suitable function cannot be found, raises TokenizerNotFoundError.
    
    If given and not None, 'chunkers' and 'filters' must be lists of chunker
    classes and filter classes respectively.  These will be applied to the
    tokenizer during creation.
    """
    # "filters" used to be the second argument.  Try to catch cases
    # where it is given positionally and issue a DeprecationWarning.
    if chunkers is not None and filters is None:
        chunkers = list(chunkers)
        if chunkers:
            try:
                chunkers_are_filters = issubclass(chunkers[0],Filter)
            except TypeError:
                pass
            else:
                if chunkers_are_filters:
                    msg = "passing 'filters' as a non-keyword argument "\
                          "to get_tokenizer() is deprecated"
                    warnings.warn(msg,category=DeprecationWarning)
                    filters = chunkers
                    chunkers = None
    # Ensure only '_' used as separator
    tag = tag.replace("-","_")
    # First try the whole tag
    tkFunc = _try_tokenizer(tag)
    if tkFunc is None:
        # Try just the base
        base = tag.split("_")[0]
        tkFunc = _try_tokenizer(base)
        if tkFunc is None:
            msg = "No tokenizer found for language '%s'" % (tag,)
            raise TokenizerNotFoundError(msg)
    # Given the language-specific tokenizer, we now build up the
    # end result as follows:
    #    * chunk the text using any given chunkers in turn
    #    * begin with basic whitespace tokenization
    #    * apply each of the given filters in turn
    #    * apply language-specific rules
    tokenizer = basic_tokenize
    if chunkers is not None:
        chunkers = list(chunkers)
        for chunker in reversed(chunkers):
            tokenizer = wrap_tokenizer(chunker,tokenizer)
    if filters is not None:
        for f in filters:
            tokenizer = f(tokenizer)
    tokenizer = wrap_tokenizer(tokenizer,tkFunc)
    return tokenizer
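
A minimal usage sketch for this first variant, assuming it lives in a PyEnchant-style enchant.tokenize module; the HTMLChunker, URLFilter, and EmailFilter names are taken from that library and are assumptions here, not part of the example above.

# Hedged usage sketch: the enchant.tokenize imports below are assumptions
# based on PyEnchant's module layout.
from enchant.tokenize import get_tokenizer, HTMLChunker, URLFilter, EmailFilter

# Build an English tokenizer that chunks away HTML markup, then drops URLs
# and e-mail addresses before the language-specific word rules run.
tokenizer = get_tokenizer("en_AU",
                          chunkers=(HTMLChunker,),
                          filters=(URLFilter, EmailFilter))

# Each yielded item is a (word, offset) pair.
for word, offset in tokenizer("Mail me at someone@example.com <b>today</b>"):
    print(word, offset)

As the comment block in the function notes, the resulting pipeline runs the chunkers first, then basic whitespace tokenization, then the filters, and finally the language-specific rules.
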
Example #2
def get_tokenizer(tag=None, chunkers=None, filters=None):
    """Locate an appropriate tokenizer by language tag.

    This requires importing the function 'tokenize' from an appropriate
    module.  Modules are named after the language tag and are tried in the
    following order:
        * the entire tag (e.g. "en_AU.py")
        * the base language code of the tag (e.g. "en.py")

    If the language tag is None, a default tokenizer (actually the English
    one) is returned.  It's unicode-aware and should work OK for most
    Latin-derived languages.

    If a suitable function cannot be found, raises TokenizerNotFoundError.
    
    If given and not None, 'chunkers' and 'filters' must be lists of chunker
    classes and filter classes respectively.  These will be applied to the
    tokenizer during creation.
    """
    if tag is None:
        tag = "en"
    # "filters" used to be the second argument.  Try to catch cases
    # where it is given positionally and issue a DeprecationWarning.
    if chunkers is not None and filters is None:
        chunkers = list(chunkers)
        if chunkers:
            try:
                chunkers_are_filters = issubclass(chunkers[0], Filter)
            except TypeError:
                pass
            else:
                if chunkers_are_filters:
                    msg = "passing 'filters' as a non-keyword argument "\
                          "to get_tokenizer() is deprecated"
                    warnings.warn(msg,
                                  category=DeprecationWarning,
                                  stacklevel=2)
                    filters = chunkers
                    chunkers = None
    # Ensure only '_' used as separator
    tag = tag.replace("-", "_")
    # First try the whole tag
    tkFunc = _try_tokenizer(tag)
    if tkFunc is None:
        # Try just the base
        base = tag.split("_")[0]
        tkFunc = _try_tokenizer(base)
        if tkFunc is None:
            msg = "No tokenizer found for language '%s'" % (tag, )
            raise TokenizerNotFoundError(msg)
    # Given the language-specific tokenizer, we now build up the
    # end result as follows:
    #    * chunk the text using any given chunkers in turn
    #    * begin with basic whitespace tokenization
    #    * apply each of the given filters in turn
    #    * apply language-specific rules
    tokenizer = basic_tokenize
    if chunkers is not None:
        chunkers = list(chunkers)
        for chunker in reversed(chunkers):
            tokenizer = wrap_tokenizer(chunker, tokenizer)
    if filters is not None:
        for f in filters:
            tokenizer = f(tokenizer)
    tokenizer = wrap_tokenizer(tokenizer, tkFunc)
    return tokenizer
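
The second variant adds a default tag and passes stacklevel=2 to the deprecation warning; the sketch below exercises both behaviours, again assuming PyEnchant-style names (get_tokenizer and URLFilter are assumptions here).

# Hedged sketch of the default tag and the deprecation path; the
# enchant.tokenize names are assumptions based on PyEnchant.
import warnings
from enchant.tokenize import get_tokenizer, URLFilter

# With no tag, the English tokenizer is returned as a unicode-aware default.
default_tokenizer = get_tokenizer()

# Passing filters positionally (the old second argument) still works: the
# Filter subclass is detected and a DeprecationWarning is issued, and
# stacklevel=2 makes the warning point at this call site rather than at
# get_tokenizer() itself.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy_tokenizer = get_tokenizer("en", [URLFilter])

assert any(issubclass(w.category, DeprecationWarning) for w in caught)
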