j += 1 sentences[-1].extend(tokens[i:j]) return [" ".join(s) for s in sentences if len(s) > 0] # MBSP's tokenizer.py is pretty fast and a lot more robust so we could try to load it. # You could also do parser.tokenize = my_module.tokenize #try: from MBSP.tokenizer import split as tokenize #except: # pass #### TAGGER ######################################################################################## #--- BRILL TAGGER ---------------------------------------------------------------------------------- LEXICON = lexicon = Lexicon() # Lazy dictionary based on Brill_lexicon.txt. def find_tags(tokens, default="NN", light=False, lexicon=LEXICON, language="en", map=None): """ Returns a list of [token, tag]-items for the given list of tokens. For example: ['That', 'is', 'interesting', '.'] => [['That', 'DT'], ['is', 'VBZ'], ['interesting', 'JJ'], ['.', '.']] With light=True uses Brill's lexical and contextual rules to improve token tags. With light=False uses a faster set of arbitrary rules (Jason Wiener's rules). If map is a function, apply it to each tag after lexical and contextual rules.
sentences[-1].extend(tokens[i:j]) return [" ".join(s) for s in sentences if len(s) > 0] # MBSP's tokenizer.py is pretty fast and a lot more robust so we could try to load it. # You could also do parser.tokenize = my_module.tokenize #try: from MBSP.tokenizer import split as tokenize #except: # pass #### TAGGER ######################################################################################## #--- BRILL TAGGER ---------------------------------------------------------------------------------- # Lazy dictionary based on Brill_lexicon.txt. LEXICON = lexicon = Lexicon() # By default, numbers are recognized as strings of digits and -,.:/% CD = re.compile(r"^[0-9\-\,\.\:\/\%]+$") def find_tags(tokens, default="NN", light=False, lexicon=LEXICON, language="en", map=None): """ Returns a list of [token, tag]-items for the given list of tokens. For example: ['That', 'is', 'interesting', '.'] => [['That', 'DT'], ['is', 'VBZ'], ['interesting', 'JJ'], ['.', '.']]