def processParagraphs(self, corpus):
    """
    Split a corpus into paragraphs with NLTK's blankline tokenizer.

    @param corpus: the text to split; passed unchanged to
        C{tokenize.blankline}
    @return: whatever C{tokenize.blankline} produces for C{corpus}
    """
    # Imported locally so that nltk is only required when this method runs.
    from nltk import tokenize

    return tokenize.blankline(corpus)
def tabtagged(files = 'chunked', basedir= None):
    """
    Convert tagged treebank files to Malt-TAB input, one sentence at a time.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory containing the C{treebank} folder; when not
        given (or empty), the value of C{get_basedir()} is used
    @type basedir: L{string} or C{None}
    @return: iterator yielding, for each blank-line-separated sentence, a
        list of C{tag2tab} results followed by a C{'\\n'} separator entry
    """
    # Accept a single file name as shorthand for a one-element tuple.
    if isinstance(files, str):
        files = (files,)
    if not basedir:
        basedir = get_basedir()

    for filename in files:
        # Honour the resolved basedir; previously the argument was ignored
        # and get_basedir() was always used to build the path.
        path = os.path.join(basedir, "treebank", filename)
        # Read the whole file up front and close it before yielding, so the
        # handle is not left open while the consumer drives the generator.
        with open(path) as stream:
            text = stream.read()

        for sent in tokenize.blankline(text):
            # Square brackets only mark chunk boundaries; skip them.
            lines = [tag2tab(t) for t in tokenize.whitespace(sent)
                     if t != '[' and t != ']']
            # add a blank line as sentence separator
            lines.append('\n')
            yield lines
def tabtagged(files='chunked', basedir=None):
    """
    Convert tagged treebank files to Malt-TAB input, one sentence at a time.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory containing the C{treebank} folder; when not
        given (or empty), the value of C{get_basedir()} is used
    @type basedir: L{string} or C{None}
    @return: iterator yielding, for each blank-line-separated sentence, a
        list of C{tag2tab} results followed by a C{'\\n'} separator entry
    """
    # Accept a single file name as shorthand for a one-element tuple.
    if isinstance(files, str):
        files = (files, )
    if not basedir:
        basedir = get_basedir()

    for filename in files:
        # Honour the resolved basedir; previously the argument was ignored
        # and get_basedir() was always used to build the path.
        path = os.path.join(basedir, "treebank", filename)
        # Read the whole file up front and close it before yielding, so the
        # handle is not left open while the consumer drives the generator.
        with open(path) as stream:
            text = stream.read()

        for sent in tokenize.blankline(text):
            # Square brackets only mark chunk boundaries; skip them.
            lines = [tag2tab(t) for t in tokenize.whitespace(sent)
                     if t != '[' and t != ']']
            # add a blank line as sentence separator
            lines.append('\n')
            yield lines