def loadParadigm(self, p_filename): """ Load the given paradigm (XML file) Attributes are stored in self.attributes Data are stored in self.data They can be accessed as follows: self.attributes['gender'] # list of genders self.data[6]['gender'] # gender for the sixth data object self.data[6]['content'] # content for the sixth data object """ from nltk.corpus import find_corpus_file basedir = get_basedir() # Look for the file try_filename = find_corpus_file("paradigms", p_filename) try: f = open(try_filename) p_filename = try_filename except IOError: print "Cannot find file" return None f.close() # These variables will be set by this method self.attributes = {} # A new dictionary self.data = [] # A new list # XML admin: create Reader object, parse document reader = Sax2.Reader() doc = reader.fromStream(p_filename) # Cycle through the given attributes and add them to self.attributes # for <name> in <attributes> attributes = doc.getElementsByTagName('attributes')[0] for name in attributes.getElementsByTagName('name'): # Setup a list of attribute values tmp_list = [] # for each value under name, store in list for value in name.getElementsByTagName('value'): tmp_list.append(value.getAttribute('value')) # Store list of values in dictionary self.attributes[name.getAttribute('name')] = tmp_list # Cycle through data objects and add them to self.data # for <form> in <paradigm> forms = doc.getElementsByTagName('paradigm')[0] for form in forms.getElementsByTagName('form'): # Initialise a temporary dictionary tmp_dict = {} for value in form.getElementsByTagName('attribute'): tmp_dict[value.getAttribute('name')] = value.getAttribute( 'value') # Add the new dictionary to the data list self.data.append(tmp_dict) # Talk to the user print "Paradigm information successfully loaded from file:", p_filename # State the number and print out a list of attributes print " " * 4 + str(len(self.attributes)) + " attributes imported:", for att in self.attributes: print att, print # State the 
number of paradigm objects imported print " " * 4 + str(len(self.data)) + " paradigm objects imported." return
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar describing the structure of a Toolbox lexicon record.
grammar = r"""
    lexfunc: {<lf>(<lv><ln|le>*)*}
    example: {<rf|xv><xn|xe>*}
    sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
    record:  {<lx><hm><sense>+<dt>}
"""

# Parse the sample database with the grammar and write the resulting
# XML tree, indented, to standard output.
database = toolbox.ToolboxData()
database.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = database.chunk_parse(grammar, encoding='utf8')
toolbox.data.indent(lexicon)
ElementTree(lexicon).write(sys.stdout, encoding='utf8')
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar: one rule per node type of a Toolbox lexicon record.
grammar = r"""
    lexfunc: {<lf>(<lv><ln|le>*)*}
    example: {<rf|xv><xn|xe>*}
    sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
    record:  {<lx><hm><sense>+<dt>}
"""

# Open the sample database, chunk-parse it with the grammar above,
# then emit the indented XML tree on stdout.
data_source = toolbox.ToolboxData()
data_source.open(find_corpus_file("toolbox", "iu_mien_samp.db"))
lexicon = data_source.chunk_parse(grammar, encoding="utf8")
toolbox.data.indent(lexicon)
output_tree = ElementTree(lexicon)
output_tree.write(sys.stdout, encoding="utf8")
def loadParadigm(self, p_filename ): """ Load the given paradigm (XML file) Attributes are stored in self.attributes Data are stored in self.data They can be accessed as follows: self.attributes['gender'] # list of genders self.data[6]['gender'] # gender for the sixth data object self.data[6]['content'] # content for the sixth data object """ from nltk.corpus import find_corpus_file basedir = get_basedir() # Look for the file try_filename = find_corpus_file("paradigms", p_filename) try: f = open(try_filename) p_filename = try_filename except IOError: print "Cannot find file" return None f.close() # These variables will be set by this method self.attributes = {} # A new dictionary self.data = [] # A new list # XML admin: create Reader object, parse document reader = Sax2.Reader() doc = reader.fromStream(p_filename) # Cycle through the given attributes and add them to self.attributes # for <name> in <attributes> attributes = doc.getElementsByTagName('attributes')[0] for name in attributes.getElementsByTagName('name'): # Setup a list of attribute values tmp_list = [] # for each value under name, store in list for value in name.getElementsByTagName('value'): tmp_list.append(value.getAttribute('value')) # Store list of values in dictionary self.attributes[name.getAttribute('name')] = tmp_list # Cycle through data objects and add them to self.data # for <form> in <paradigm> forms = doc.getElementsByTagName('paradigm')[0] for form in forms.getElementsByTagName('form'): # Initialise a temporary dictionary tmp_dict = {} for value in form.getElementsByTagName('attribute'): tmp_dict[value.getAttribute('name')] = value.getAttribute('value') # Add the new dictionary to the data list self.data.append(tmp_dict) # Talk to the user print "Paradigm information successfully loaded from file:", p_filename # State the number and print out a list of attributes print " "*4 + str(len(self.attributes)) + " attributes imported:", for att in self.attributes: print att, print # State the 
number of paradigm objects imported print " "*4 + str(len(self.data)) + " paradigm objects imported." return
import os.path grammar = { 'toolbox': (('_sh',), ('_DateStampHasFourDigitYear', 'entry')), 'entry': (('lx',), ('hm', 'sense', 'dt')), 'sense': (('sn', 'ps'), ('pn', 'gv', 'dv', 'gn', 'gp', 'dn', 'rn', 'ge', 'de', 're', 'example', 'lexfunc')), 'example': (('rf', 'xv',), ('xn', 'xe')), 'lexfunc': (('lf',), ('lexvalue',)), 'lexvalue': (('lv',), ('ln', 'le')), } db = toolbox.ToolboxData() db.open(find_corpus_file('toolbox', 'iu_mien_samp.db')) lexicon = db.grammar_parse('toolbox', grammar, encoding='utf8') tree = ElementTree(lexicon) tree.write('iu_mien_samp.xml', encoding='utf8') num_lexemes = 0 num_senses = 0 num_examples = 0 for lexeme in lexicon.findall('entry'): num_lexemes += 1 for sense in lexeme.findall('sense'): num_senses += 1 for example in sense.findall('example'): num_examples += 1 print 'num. lexemes =', num_lexemes print 'num. senses =', num_senses print 'num. examples =', num_examples