# Example no. 1 (score: 0)
        j += 1
    sentences[-1].extend(tokens[i:j])
    return [" ".join(s) for s in sentences if len(s) > 0]


# MBSP's tokenizer.py is pretty fast and a lot more robust so we could try to load it.
# You could also do parser.tokenize = my_module.tokenize
#try: from MBSP.tokenizer import split as tokenize
#except:
#    pass

#### TAGGER ########################################################################################

#--- BRILL TAGGER ----------------------------------------------------------------------------------

# Brill tagger lexicon: LEXICON and lexicon are two module-level aliases bound
# to the same Lexicon() instance, so either name can be passed as the lexicon=
# argument of find_tags() below. Presumably loads Brill_lexicon.txt on first
# access ("lazy") — confirm against the Lexicon class definition.
LEXICON = lexicon = Lexicon()  # Lazy dictionary based on Brill_lexicon.txt.


def find_tags(tokens,
              default="NN",
              light=False,
              lexicon=LEXICON,
              language="en",
              map=None):
    """ Returns a list of [token, tag]-items for the given list of tokens.
        For example: 
         ['That', 'is', 'interesting', '.'] => 
         [['That', 'DT'], ['is', 'VBZ'], ['interesting', 'JJ'], ['.', '.']]
        With light=True uses Brill's lexical and contextual rules to improve token tags.
        With light=False uses a faster set of arbitrary rules (Jason Wiener's rules).
        If map is a function, apply it to each tag after lexical and contextual rules.
# Example no. 2 (score: 0)
    sentences[-1].extend(tokens[i:j])
    return [" ".join(s) for s in sentences if len(s) > 0]


# MBSP's tokenizer.py is pretty fast and a lot more robust so we could try to load it.
# You could also do parser.tokenize = my_module.tokenize
#try: from MBSP.tokenizer import split as tokenize
#except:
#    pass

#### TAGGER ########################################################################################

#--- BRILL TAGGER ----------------------------------------------------------------------------------

# Lazy dictionary based on Brill_lexicon.txt.
# LEXICON and lexicon are two module-level aliases bound to the same Lexicon()
# instance, so either name can be passed as the lexicon= argument of
# find_tags() below.
LEXICON = lexicon = Lexicon()

# By default, numbers are recognized as strings of digits and -,.:/%
# so whole tokens such as "1,000", "-3.14", "2/3", "10:30" and "100%" match.
# The pattern is anchored (^...$): it must cover the entire token.
# NOTE: none of - , . : / % need escaping inside a character class; the
# redundant backslashes of the original were removed (same matched set).
CD = re.compile(r"^[0-9,.:/%-]+$")


def find_tags(tokens,
              default="NN",
              light=False,
              lexicon=LEXICON,
              language="en",
              map=None):
    """ Returns a list of [token, tag]-items for the given list of tokens.
        For example: 
         ['That', 'is', 'interesting', '.'] => 
         [['That', 'DT'], ['is', 'VBZ'], ['interesting', 'JJ'], ['.', '.']]