Example #1
0
    def processParagraphs(self, corpus):
        """Split ``corpus`` into paragraphs.

        The work is delegated to ``nltk``'s ``tokenize.blankline``; its
        result is handed back to the caller unchanged.  (Judging by the
        helper's name the split happens on blank lines -- confirm against
        the installed NLTK version.)

        @param corpus: the text to split
        @return: whatever ``tokenize.blankline(corpus)`` produces
        """
        # Imported at call time, so nltk is only needed when this runs.
        from nltk import tokenize

        return tokenize.blankline(corpus)
Example #2
0
    def processParagraphs(self, corpus):
        """Return the paragraphs found in ``corpus``.

        This is a thin wrapper: ``corpus`` is passed straight to
        ``tokenize.blankline`` from ``nltk`` and that call's result is
        returned as-is.  (Presumably paragraphs are delimited by blank
        lines, going by the helper's name -- verify against NLTK.)

        @param corpus: the text to split
        @return: the result of ``tokenize.blankline(corpus)``
        """
        # Function-scope import keeps nltk off the module import path.
        from nltk import tokenize

        return tokenize.blankline(corpus)
Example #3
0
def tabtagged(files='chunked', basedir=None):
    """
    Yield the sentences of one or more treebank files, one list per
    sentence, with every token converted by C{tag2tab}.

    Each file is looked up as C{<basedir>/treebank/<file>}.  Its text is
    split into sentences with C{tokenize.blankline} and each sentence into
    tokens with C{tokenize.whitespace}; the bracket tokens C{'['} and
    C{']'} are dropped.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} folder.  A
        false value (C{None}, C{''}) falls back to C{get_basedir()}.
    @type basedir: L{string}
    @return: iterator over lines in Malt-TAB input format; every yielded
        list ends with a C{'\\n'} entry that separates sentences
    """
    # isinstance also accepts str subclasses, unlike an exact type check.
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for name in files:
        # Honour the caller-supplied basedir; the previous code called
        # get_basedir() again here and silently ignored the argument.
        path = os.path.join(basedir, "treebank", name)

        # Read the whole file up front so the handle is closed before the
        # generator is suspended at a yield.
        with open(path) as handle:
            text = handle.read()

        for sent in tokenize.blankline(text):
            row = [tag2tab(tok) for tok in tokenize.whitespace(sent)
                   if tok not in ('[', ']')]
            # add a blank line as sentence separator
            row.append('\n')
            yield row
Example #4
0
def tabtagged(files='chunked', basedir=None):
    """
    Yield the sentences of one or more treebank files, one list per
    sentence, with every token converted by C{tag2tab}.

    Each file is looked up as C{<basedir>/treebank/<file>}.  Its text is
    split into sentences with C{tokenize.blankline} and each sentence into
    tokens with C{tokenize.whitespace}; the bracket tokens C{'['} and
    C{']'} are dropped.

    @param files: One or more treebank files to be processed
    @type files: L{string} or L{tuple(string)}
    @param basedir: Directory that contains the C{treebank} folder.  A
        false value (C{None}, C{''}) falls back to C{get_basedir()}.
    @type basedir: L{string}
    @return: iterator over lines in Malt-TAB input format; every yielded
        list ends with a C{'\\n'} entry that separates sentences
    """
    # isinstance also accepts str subclasses, unlike an exact type check.
    if isinstance(files, str):
        files = (files,)

    if not basedir:
        basedir = get_basedir()

    for name in files:
        # Honour the caller-supplied basedir; the previous code called
        # get_basedir() again here and silently ignored the argument.
        path = os.path.join(basedir, "treebank", name)

        # Read the whole file up front so the handle is closed before the
        # generator is suspended at a yield.
        with open(path) as handle:
            text = handle.read()

        for sent in tokenize.blankline(text):
            row = [tag2tab(tok) for tok in tokenize.whitespace(sent)
                   if tok not in ('[', ']')]
            # add a blank line as sentence separator
            row.append('\n')
            yield row