def raw(files = items):
    """Yield each whitespace-stripped word from the given word-list files.

    ``files`` may be a single file name or a sequence of names; each name
    is resolved under ``<basedir>/words``.
    """
    if type(files) is str:
        files = (files,)
    for fname in files:
        word_path = os.path.join(get_basedir(), "words", fname)
        for line in open(word_path):
            yield line.strip()
def chunked(files = items, chunk_types=('NP',)):
    """Yield a chunk tree for each blank-line-separated sentence in the
    given CoNLL-2000 files, keeping only the requested chunk types."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        text = open(os.path.join(get_basedir(), "conll2000", fname + ".txt")).read()
        for sentence in tokenize.blankline(text):
            yield tree.conll_chunk(sentence, chunk_types)
def tagged(files = items):
    """Yield each CoNLL-2000 sentence as a list of (word, tag) pairs,
    discarding the chunk column."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        text = open(os.path.join(get_basedir(), "conll2000", fname + ".txt")).read()
        for sentence in tokenize.blankline(text):
            yield [(w, t) for (w, t, _chunk) in _list_sent(sentence)]
def raw(files = items):
    """Yield each line of the given ppattach files as a tuple of its
    whitespace-separated fields."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        record_path = os.path.join(get_basedir(), "ppattach", fname)
        for record in open(record_path):
            yield tuple(record.split())
def _read(files, conversion_function):
    """Apply ``conversion_function`` to each blank-line-separated sentence
    of the given Brown-corpus files and yield the results."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        contents = open(os.path.join(get_basedir(), "brown", fname)).read()
        for sentence in tokenize.blankline(contents):
            yield conversion_function(sentence)
def raw(files = items):
    """Yield each <DOC>...</DOC> document found in the given IEER files,
    re-wrapped in its markers with a trailing newline."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        contents = open(os.path.join(get_basedir(), "ieer", fname)).read()
        for fragment in contents.split('</DOC>'):
            # A well-formed fragment contains exactly one opening marker.
            parts = fragment.split('<DOC>')
            if len(parts) == 2:
                yield "<DOC>" + parts[1] + "</DOC>\n"
def bracket_parse(files=items):
    """Yield a bracketed parse tree for each sentence in the given
    YCOE .psd files."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        contents = open(os.path.join(get_basedir(), "ycoe/psd", fname + ".psd")).read()
        for sentence in _parse(contents):
            yield tree.bracket_parse(sentence)
def raw(files = items):
    """Yield each word/punctuation token from the given State of the Union
    files (resolved under ``<basedir>/state_union`` with a ``.txt`` suffix).

    @param files: One or more file names (a single string is accepted).
    @rtype: iterator over tokens produced by ``tokenize.wordpunct``
    """
    if type(files) is str:
        files = (files,)
    for fname in files:
        path = os.path.join(get_basedir(), "state_union", fname + ".txt")
        # Close the handle deterministically instead of leaking it to the GC.
        # (The original also set an unused `preamble` flag, copy-paste residue
        # from the Gutenberg reader; removed.)
        f = open(path)
        try:
            text = f.read()
        finally:
            f.close()
        for token in tokenize.wordpunct(text):
            yield token
def raw(files = items):
    """Yield word/punctuation tokens from the given Gutenberg files,
    skipping everything up to and including the '*END*' header line."""
    if type(files) is str:
        files = (files,)
    for fname in files:
        fpath = os.path.join(get_basedir(), "gutenberg", fname + ".txt")
        in_preamble = True
        for line in open(fpath).readlines():
            if not in_preamble:
                for token in tokenize.wordpunct(line):
                    yield token
            # Flag is flipped *after* the check so the '*END*' line itself
            # is never tokenized.
            if line.startswith('*END*'):
                in_preamble = False
def raw(files = 'cmudict'):
    """Yield (word, variant-number, phoneme-tuple) triples from the given
    cmudict files.

    @param files: One or more cmudict files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{tuple}
    """
    if type(files) is str:
        files = (files,)
    for fname in files:
        dict_path = os.path.join(get_basedir(), "cmudict", fname)
        for entry in open(dict_path):
            parts = entry.strip().split(' ')
            yield (parts[0], int(parts[1]), tuple(parts[2:]))
def raw(files = 'rotokas'):
    """Yield each blank-line-separated entry of the given Shoebox ``.dic``
    files, parsed by ``_parse_entry`` and materialized as a list.

    @param files: One or more shoebox dictionary files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over L{list(string)}
    """
    if type(files) is str:
        files = (files,)
    for fname in files:
        contents = open(os.path.join(get_basedir(), "shoebox", fname + ".dic")).read()
        for record in tokenize.blankline(contents):
            yield list(_parse_entry(record))
def raw(files="english-kjv"):
    """Yield each whitespace-delimited token from the given Genesis files.

    @param files: One or more genesis files to be processed
    @type files: L{string} or L{tuple(string)}
    @rtype: iterator over tokens
    """
    if type(files) is str:
        files = (files,)
    for fname in files:
        contents = open(os.path.join(get_basedir(), "genesis", fname + ".txt")).read()
        for token in tokenize.whitespace(contents):
            yield token
# Esempio n. 13 (Italian: "Example no. 13") — scraper artifact separator
# 0
def _read(files, conversion_function):
    """Strip YCOE ``<...>_CODE`` and ``..._ID`` markup from each
    blank-line-separated sentence of the given files and yield
    ``conversion_function(sent, sep="_")`` for every non-empty result."""
    if type(files) is str:
        files = (files,)
    # Equivalent to the verbose form "<.*>_CODE | \s.*_ID": in VERBOSE mode
    # the literal whitespace is ignored, so the compiled pattern is the same.
    marker = re.compile(r"<.*>_CODE|\s.*_ID", re.UNICODE)
    for fname in files:
        contents = open(os.path.join(get_basedir(), "ycoe/pos", fname)).read()
        for sentence in tokenize.blankline(contents):
            cleaned = marker.sub("", sentence)
            if cleaned != "":
                yield conversion_function(cleaned, sep="_")
    def loadParadigm(self, p_filename ):
        """
        Load the given paradigm (XML file).

        Side effects: replaces self.attributes (dict mapping attribute name
        to a list of its values) and self.data (list of dicts, one per
        <form> element, mapping attribute name to value).

        They can be accessed as follows:
        self.attributes['gender']   # list of genders
        self.data[6]['gender']      # gender for the sixth data object
        self.data[6]['content']     # content for the sixth data object

        Returns None; on a missing file it prints a message and returns
        without touching self.attributes / self.data.

        NOTE(review): Python 2 code (print statements); Sax2 presumably
        comes from PyXML (xml.dom.ext.reader) — confirm the import site.
        """

        from nltk_lite.corpora import get_basedir
        basedir = get_basedir()

        # Look for the file under <basedir>/paradigms; this open/close pair
        # is only an existence check — the reader re-opens the file below.
        try_filename = os.path.join(get_basedir(), "paradigms", p_filename)
        try:
            f = open(try_filename)
            p_filename = try_filename
        except IOError:
            print "Cannot find file"
            return None
        f.close()

        # These variables will be set by this method
        self.attributes = {}  # A new dictionary
        self.data = []        # A new list

        # XML admin: create Reader object, parse document
        reader = Sax2.Reader()
        doc = reader.fromStream(p_filename)

        # Cycle through the given attributes and add them to self.attributes
        # for <name> in <attributes>
        attributes = doc.getElementsByTagName('attributes')[0]
        for name in attributes.getElementsByTagName('name'):

            # Setup a list of attribute values
            tmp_list = []

            # for each value under name, store in list
            for value in name.getElementsByTagName('value'):
                tmp_list.append(value.getAttribute('value'))

            # Store list of values in dictionary
            self.attributes[name.getAttribute('name')] = tmp_list


        # Cycle through data objects and add them to self.data
        # for <form> in <paradigm>
        forms = doc.getElementsByTagName('paradigm')[0]
        for form in forms.getElementsByTagName('form'):
            # Initialise a temporary dictionary
            tmp_dict = {}
            for value in form.getElementsByTagName('attribute'):
                tmp_dict[value.getAttribute('name')] = value.getAttribute('value')
            # Add the new dictionary to the data list
            self.data.append(tmp_dict)

        # Talk to the user: summarize what was imported
        print "Paradigm information successfully loaded from file:", p_filename
        # State the number and print out a list of attributes
        print " "*4 + str(len(self.attributes)) + " attributes imported:",
        for att in self.attributes:
            print att,
        print
        # State the number of paradigm objects imported
        print " "*4 + str(len(self.data)) + " paradigm objects imported."

        return
# Esempio n. 15 (Italian: "Example no. 15") — scraper artifact separator
# 0
def _chunk_parse(files, chunk_types, top_node, partial_match, collapse_partials, cascade):
    # Parse YCOE .psd files into (possibly cascaded) chunk trees, yielding a
    # list of tree.Tree stacks, one per sentence. The bracket-matching below
    # is order-sensitive; the running state per sentence is:
    #   bracket - current bracket nesting depth
    #   stack   - trees under construction (stack[0] is the top_node tree)
    #   inTag   - bracket depths at which an open chunk started
    #   itmType - label of the most recently opened bracketed item

    # allow any kind of bracketing for flexibility
    L_BRACKET = re.compile(r"[\(\[\{<]")
    R_BRACKET = re.compile(r"[\)\]\}>]")

    if type(files) is str:
        files = (files,)
    for file in files:
        path = os.path.join(get_basedir(), "ycoe/psd", file + ".psd")
        s = open(path).read()
        data = _parse(s)
        for s in data:
            bracket = 0
            itmType = None
            stack = [tree.Tree(top_node, [])]
            inTag = []
            for itm in list(tokenize.whitespace(s)):
                if L_BRACKET.match(itm[0]):
                    bracket += 1
                    itm = itm[1:]
                    matched = False
                    # partial_match: a chunk type may be a prefix of the
                    # label; collapse_partials replaces the label with the
                    # matched prefix.
                    if partial_match == True:
                        for eachItm in chunk_types:
                            if len(eachItm) <= len(itm) and eachItm == itm[: len(eachItm)]:
                                matched = True
                                if collapse_partials == True:
                                    itm = eachItm
                    else:
                        if chunk_types is not None and itm in chunk_types:
                            matched = True
                    if matched == True:  # and inTag == 0:
                        chunk = tree.Tree(itm, [])
                        if cascade == True:
                            # Cascading: nest the new chunk on the stack.
                            stack.append(chunk)
                            inTag += [bracket]
                        else:
                            # Flat: only open a chunk when none is open.
                            if len(inTag) == 0:
                                stack[-1].append(chunk)
                                inTag += [bracket]
                    itmType = itm
                if R_BRACKET.match(itm[-1]):
                    # NOTE(review): `split` is not defined in this view —
                    # presumably `string.split` from a Python 2 import
                    # elsewhere in the file; confirm.
                    tmpItm = split(itm, itm[-1])
                    if tmpItm != "":
                        if len(inTag) > 0 and inTag[-1] <= bracket:  # inTag <= bracket:
                            if cascade == True:
                                stack[-1].append((itmType, tmpItm[0]))
                            else:
                                stack[-1][-1].append((itmType, tmpItm[0]))
                        else:
                            if cascade == True:
                                if len(stack) > 1:
                                    # Close the current cascade level.
                                    stack[-2].append(stack[-1])
                                    stack = stack[:-1]
                            stack[-1].append((itmType, tmpItm[0]))
                            inTag = [] + inTag[:-2]
                    # One closing bracket per split fragment beyond the first.
                    bracket -= len(tmpItm) - 1
                    while len(inTag) > 0 and bracket < inTag[-1]:
                        if cascade == True:
                            if len(stack) > 1:
                                stack[-2].append(stack[-1])
                                stack = stack[:-1]
                        inTag = [] + inTag[:-2]
            yield stack