def chunked(files=items, chunk_types=('NP',)):
    # Accept a single filename as well as a sequence of filenames.
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        # Sentences are separated by blank lines; parse each into a chunk tree.
        for sent in tokenize.blankline(s):
            yield tree.conll_chunk(sent, chunk_types)
def tagged(files=items):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "conll2000", file + ".txt")
        s = open(path).read()
        for sent in tokenize.blankline(s):
            yield [(word, tag) for (word, tag, chunk) in _list_sent(sent)]
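# Hedged usage sketch (not part of the readers themselves): consuming the
# generators defined above.  The 'train' file name is an assumption; the
# reader appends '.txt' and looks under the conll2000 corpus directory.
if __name__ == "__main__":
    for i, sent in enumerate(tagged(files="train")):
        print(sent[:5])          # first five (word, tag) pairs of the sentence
        if i == 2:               # only peek at a few sentences
            break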
def _read(files, conversion_function):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "brown", file)
        f = open(path).read()
        for sent in tokenize.blankline(f):
            yield conversion_function(sent)
def raw(files='rotokas'):
    """
    @param files: One or more Shoebox dictionary files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """
    # Just one file to process? If so convert to a tuple so we can iterate
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "shoebox", file + ".dic")
        f = open(path).read()
        # Dictionary entries are separated by blank lines.
        for entry in tokenize.blankline(f):
            yield list(_parse_entry(entry))
def _read(files, conversion_function):
    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/pos", file)
        f = open(path).read()
        # Strip corpus mark-up: <...>_CODE markers and the trailing <...>_ID
        # reference token, leaving only the word_TAG tokens of each sentence.
        rx_pattern = re.compile(r"""
                <.*>_CODE
                |\s.*_ID
        """, re.VERBOSE | re.UNICODE)
        mySents = tokenize.blankline(f)
        for sent in mySents:
            sent = re.sub(rx_pattern, "", sent)
            if sent != "":
                yield conversion_function(sent, sep="_")
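# Standalone illustration of what rx_pattern above removes.  The sample line
# is synthetic YCOE-style input (an assumption for demonstration only).
import re

_rx = re.compile(r"""
        <.*>_CODE
        |\s.*_ID
""", re.VERBOSE | re.UNICODE)

_sample = "<T02B0,4.19>_CODE\nFor+ton_P we_PRO sculon_MDPI herian_VB\n<coaelive,+ALS:12.31>_ID"
print(repr(re.sub(_rx, "", _sample)))
# -> '\nFor+ton_P we_PRO sculon_MDPI herian_VB'
# Only the word_TAG tokens survive, ready for a conversion_function with sep="_".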
def processParagraphs(self, corpus):
    paragraphs = tokenize.blankline(corpus)
    return paragraphs
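# Hedged sketch of the behaviour processParagraphs relies on: splitting text
# into paragraphs at runs of blank lines.  This is an assumption about what
# tokenize.blankline() produces, shown with plain re.split so it runs standalone.
import re

def split_on_blank_lines(text):
    # A paragraph boundary is a newline, optional whitespace, and another newline.
    return [p for p in re.split(r"\n\s*\n", text) if p.strip()]

_text = "First paragraph,\nstill the first.\n\nSecond paragraph.\n\n\nThird."
print(split_on_blank_lines(_text))
# -> ['First paragraph,\nstill the first.', 'Second paragraph.', 'Third.']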