def demo(): """ A demonstration that shows the output of several different tokenizers on the same string. """ from en.parser.nltk_lite import tokenize # Define the test string. s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." print 'Input text:' print `s` print print 'Tokenize using whitespace:' _display(tokenize.whitespace(s)) print print 'Tokenize sequences of alphanumeric characters:' _display(tokenize.regexp(s, pattern=r'\w+', gaps=False)) print print 'Tokenize sequences of letters and sequences of nonletters:' _display(tokenize.wordpunct(s)) print print 'Tokenize by lines:' _display(tokenize.line(s)) print print 'Tokenize by blank lines:' _display(tokenize.blankline(s)) print print 'A simple sentence tokenizer:' _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True)) print
def demo(): """ A demonstration that shows the output of several different tokenizers on the same string. """ from en.parser.nltk_lite import tokenize # Define the test string. s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks." print 'Input text:' print ` s ` print print 'Tokenize using whitespace:' _display(tokenize.whitespace(s)) print print 'Tokenize sequences of alphanumeric characters:' _display(tokenize.regexp(s, pattern=r'\w+', gaps=False)) print print 'Tokenize sequences of letters and sequences of nonletters:' _display(tokenize.wordpunct(s)) print print 'Tokenize by lines:' _display(tokenize.line(s)) print print 'Tokenize by blank lines:' _display(tokenize.blankline(s)) print print 'A simple sentence tokenizer:' _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True)) print
def conll_chunk(s, chunk_types=("NP",), top_node="S"):
    """
    @param s: A single sentence encoded as a CONLL 2000 style string
        (one token per line, matched against C{_LINE_RE}).
    @param chunk_types: The chunk tags to keep; any other chunk type is
        treated as "Outside".  C{None} keeps every chunk type.
    @param top_node: Node label for the root of the returned tree.
    @return: A chunk structure for a single sentence encoded in the
        given CONLL 2000 style string.
    @rtype: L{Tree}
    @raise ValueError: If a line does not match the CONLL line format.
    """
    stack = [Tree(top_node, [])]

    for lineno, line in enumerate(tokenize.line(s)):
        # Decode the line.
        match = _LINE_RE.match(line)
        if match is None:
            # Use the call form of raise (the legacy "raise E, msg"
            # statement form is not valid in Python 3).
            raise ValueError('Error on line %d' % lineno)
        (word, tag, state, chunk_type) = match.groups()

        # If it's a chunk type we don't care about, treat it as O.
        if (chunk_types is not None and
            chunk_type not in chunk_types):
            state = 'O'

        # For "Begin"/"Outside", finish any completed chunks -
        # also do so for "Inside" which don't match the previous token.
        mismatch_I = state == 'I' and chunk_type != stack[-1].node
        if state in 'BO' or mismatch_I:
            if len(stack) == 2:
                stack.pop()

        # For "Begin", start a new chunk.
        if state == 'B' or mismatch_I:
            chunk = Tree(chunk_type, [])
            stack[-1].append(chunk)
            stack.append(chunk)

        # Add the new word token.
        stack[-1].append((word, tag))

    return stack[0]
def conll_chunk(s, chunk_types=("NP", ), top_node="S"):
    """
    @return: A chunk structure for a single sentence encoded in the
        given CONLL 2000 style string.
    @rtype: L{Tree}
    """
    root = Tree(top_node, [])
    # stack[-1] is the node that receives the next token: the root, or
    # the currently-open chunk when one exists (stack depth 2).
    stack = [root]

    for lineno, line in enumerate(tokenize.line(s)):
        parsed = _LINE_RE.match(line)
        if parsed is None:
            raise ValueError('Error on line %d' % lineno)
        word, tag, state, chunk_type = parsed.groups()

        # Chunk types outside the requested set are demoted to "Outside".
        if chunk_types is not None and chunk_type not in chunk_types:
            state = 'O'

        # An "I" tag whose type disagrees with the open chunk behaves
        # like a fresh "B" tag.
        broken_inside = (state == 'I' and chunk_type != stack[-1].node)

        # Close the open chunk (if any) on B, O, or a broken I.
        if (state in 'BO' or broken_inside) and len(stack) == 2:
            stack.pop()

        # Open a new chunk on B or a broken I.
        if state == 'B' or broken_inside:
            subtree = Tree(chunk_type, [])
            stack[-1].append(subtree)
            stack.append(subtree)

        # Attach the token to whatever is currently open.
        stack[-1].append((word, tag))

    return root
def _list_sent(sent):
    """Split ``sent`` into lines and whitespace-tokenize each line,
    returning one tokenization per line."""
    tokenized_lines = []
    for line in tokenize.line(sent):
        tokenized_lines.append(tokenize.whitespace(line))
    return tokenized_lines