def demo(): """ A demonstration that shows the output of several different tokenizers on the same string. """ from en.parser.nltk_lite import tokenize # Define the test string. s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." print 'Input text:' print ` s ` print print 'Tokenize using whitespace:' _display(tokenize.whitespace(s)) print print 'Tokenize sequences of alphanumeric characters:' _display(tokenize.regexp(s, pattern=r'\w+', gaps=False)) print print 'Tokenize sequences of letters and sequences of nonletters:' _display(tokenize.wordpunct(s)) print print 'Tokenize by lines:' _display(tokenize.line(s)) print print 'Tokenize by blank lines:' _display(tokenize.blankline(s)) print print 'A simple sentence tokenizer:' _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True)) print
def raw(files = items):
    """
    Yield word/punctuation tokens from the given State of the Union file(s).

    @param files: a single file identifier (string) or a sequence of
        identifiers; each is resolved to <basedir>/state_union/<id>.txt.
        Defaults to the module-level C{items} list.
    @return: a generator over tokens produced by C{tokenize.wordpunct}.
    """
    # Accept a bare string as a convenience for a single file.
    if isinstance(files, str):
        files = (files,)
    for filename in files:
        path = os.path.join(get_basedir(), "state_union", filename + ".txt")
        # Read the whole file, closing the handle promptly
        # (the original leaked it).
        f = open(path)
        try:
            text = f.read()
        finally:
            f.close()
        for t in tokenize.wordpunct(text):
            yield t
def raw(files=items):
    """
    Yield word/punctuation tokens from the given Project Gutenberg file(s),
    skipping the licence preamble.

    @param files: a single file identifier (string) or a sequence of
        identifiers; each is resolved to <basedir>/gutenberg/<id>.txt.
        Defaults to the module-level C{items} list.
    @return: a generator over tokens produced by C{tokenize.wordpunct},
        starting with the line AFTER the '*END*' marker (the marker line
        itself is not tokenized).
    """
    # Accept a bare string as a convenience for a single file.
    if isinstance(files, str):
        files = (files,)
    for filename in files:
        path = os.path.join(get_basedir(), "gutenberg", filename + ".txt")
        f = open(path)
        try:
            # Everything up to and including the '*END*' line is header.
            preamble = True
            # Iterate the file directly instead of materializing readlines().
            for line in f:
                if not preamble:
                    for t in tokenize.wordpunct(line):
                        yield t
                # Checked after the yield test so the marker line is skipped.
                if line[:5] == '*END*':
                    preamble = False
        finally:
            # Ensure the handle is released (the original never closed it).
            f.close()