def chunked(files=items, chunk_types=('NP',)):
    """
    @param files: One or more conll2000 files to be processed
    @type files: L{string} or L{tuple(string)}
    @param chunk_types: The chunk types to be parsed into subtrees
    @rtype: iterator over L{tree}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        # Sentences are separated by blank lines.
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
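# A minimal usage sketch, not part of the original module: print the chunk
# tree of the first sentence of the 'train' file.  The file name 'train' is
# an assumption and must name an installed conll2000 data file; the names
# os, tokenize, tree, get_basedir and items are assumed to be provided by
# the surrounding nltk_lite module.
def _demo_conll_chunked():
    for sent_tree in chunked(files='train', chunk_types=('NP',)):
        print sent_tree
        break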
def demo():
    """
    A demonstration that shows the output of several different
    tokenizers on the same string.
    """
    from nodebox_linguistics_extended.parser.nltk_lite import tokenize

    # Define the test string.
    s = "Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\n\nThanks."
    print 'Input text:'
    print `s`
    print
    print 'Tokenize using whitespace:'
    _display(tokenize.whitespace(s))
    print
    print 'Tokenize sequences of alphanumeric characters:'
    _display(tokenize.regexp(s, pattern=r'\w+', gaps=False))
    print
    print 'Tokenize sequences of letters and sequences of nonletters:'
    _display(tokenize.wordpunct(s))
    print
    print 'Tokenize by lines:'
    _display(tokenize.line(s))
    print
    print 'Tokenize by blank lines:'
    _display(tokenize.blankline(s))
    print
    print 'A simple sentence tokenizer:'
    _display(tokenize.regexp(s, pattern=r'\.(\s+|$)', gaps=True))
    print
def tagged(files=items):
    """
    @param files: One or more conll2000 files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(tuple(string, string))}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            # Keep the (word, tag) pairs; discard the chunk column.
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
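# A minimal usage sketch (assumption: 'train' names an installed conll2000
# data file): show that each yielded sentence is a list of (word, tag) pairs.
def _demo_conll_tagged():
    for sent in tagged(files='train'):
        print sent[:5]
        break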
def _read(files, conversion_function):
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        # Each blank-line-separated block is one sentence.
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
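# Hedged sketch of how a public reader could wrap the Brown _read helper
# above (the module's real wrappers may differ): the conversion function
# receives one blank-line-separated sentence string and may return any
# representation of it.
def _demo_brown_raw(files='a'):
    # 'a' is an illustrative Brown section name; splitting on whitespace is
    # a trivial stand-in for the module's real conversion functions.
    return _read(files, lambda sent: sent.split())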
def chunked(files='chunked'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tree}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        s = open(path).read()
        for t in tokenize.blankline(s):
            yield tree.chunk(t)
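# Usage sketch for the treebank reader above (assumes the default 'chunked'
# data file is installed under the NLTK-lite data directory): print the
# first chunk tree.
def _demo_treebank_chunked():
    for t in chunked():
        print t
        break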
def _read(files, conversion_function):
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    # Strip the <...>_CODE markup and the sentence _ID line before parsing.
    rx_pattern = re.compile(r"""
            <.*>_CODE
            |\s.*_ID
    """, re.VERBOSE | re.UNICODE)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            sent = re.sub(rx_pattern, '', sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
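# Worked example of the markup stripping above.  The sample sentence is
# illustrative, not taken from the corpus: the <...>_CODE token and the
# line-final ..._ID token are removed, leaving only word_TAG pairs for
# conversion_function to split on sep="_".
def _demo_ycoe_strip():
    rx = re.compile(r"""
            <.*>_CODE
            |\s.*_ID
    """, re.VERBOSE | re.UNICODE)
    sample = "<O2.3>_CODE\n+Da_C com_VBDI se_D here_N\ncoadrian.o34,2.9_ID"
    # Prints only the word_TAG tokens: "+Da_C com_VBDI se_D here_N"
    print re.sub(rx, '', sample)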
def raw(files='raw'):
    """
    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process?  If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "treebank", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            # Each sentence becomes a list of whitespace-delimited tokens.
            yield list(tokenize.whitespace(sent))
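# Usage sketch (assumes the default 'raw' treebank data file is installed):
# print the token list of the first sentence.
def _demo_treebank_raw():
    for sent in raw():
        print sent
        break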