def loadParadigm(self, p_filename): """ Load the given paradigm (XML file) Attributes are stored in self.attributes Data are stored in self.data They can be accessed as follows: self.attributes['gender'] # list of genders self.data[6]['gender'] # gender for the sixth data object self.data[6]['content'] # content for the sixth data object """ from nltk.corpus import find_corpus_file basedir = get_basedir() # Look for the file try_filename = find_corpus_file("paradigms", p_filename) try: f = open(try_filename) p_filename = try_filename except IOError: print "Cannot find file" return None f.close() # These variables will be set by this method self.attributes = {} # A new dictionary self.data = [] # A new list # XML admin: create Reader object, parse document reader = Sax2.Reader() doc = reader.fromStream(p_filename) # Cycle through the given attributes and add them to self.attributes # for <name> in <attributes> attributes = doc.getElementsByTagName('attributes')[0] for name in attributes.getElementsByTagName('name'): # Setup a list of attribute values tmp_list = [] # for each value under name, store in list for value in name.getElementsByTagName('value'): tmp_list.append(value.getAttribute('value')) # Store list of values in dictionary self.attributes[name.getAttribute('name')] = tmp_list # Cycle through data objects and add them to self.data # for <form> in <paradigm> forms = doc.getElementsByTagName('paradigm')[0] for form in forms.getElementsByTagName('form'): # Initialise a temporary dictionary tmp_dict = {} for value in form.getElementsByTagName('attribute'): tmp_dict[value.getAttribute('name')] = value.getAttribute( 'value') # Add the new dictionary to the data list self.data.append(tmp_dict) # Talk to the user print "Paradigm information successfully loaded from file:", p_filename # State the number and print out a list of attributes print " " * 4 + str(len(self.attributes)) + " attributes imported:", for att in self.attributes: print att, print # State the 
number of paradigm objects imported print " " * 4 + str(len(self.data)) + " paradigm objects imported." return
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar describing the structure of a Toolbox lexicon record.
grammar = r"""
    lexfunc: {<lf>(<lv><ln|le>*)*}
    example: {<rf|xv><xn|xe>*}
    sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
    record:  {<lx><hm><sense>+<dt>}
"""

# Parse the sample database with the grammar and write the resulting
# XML tree, indented, to standard output.
database = toolbox.ToolboxData()
database.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = database.chunk_parse(grammar, encoding='utf8')
toolbox.data.indent(lexicon)
ElementTree(lexicon).write(sys.stdout, encoding='utf8')
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar: one rule per node type of a Toolbox lexicon record.
grammar = r"""
    lexfunc: {<lf>(<lv><ln|le>*)*}
    example: {<rf|xv><xn|xe>*}
    sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
    record:  {<lx><hm><sense>+<dt>}
"""

# Open the sample database, chunk-parse it with the grammar above,
# then emit the indented XML tree on stdout.
data_source = toolbox.ToolboxData()
data_source.open(find_corpus_file("toolbox", "iu_mien_samp.db"))
lexicon = data_source.chunk_parse(grammar, encoding="utf8")
toolbox.data.indent(lexicon)
output_tree = ElementTree(lexicon)
output_tree.write(sys.stdout, encoding="utf8")
def loadParadigm(self, p_filename ): """ Load the given paradigm (XML file) Attributes are stored in self.attributes Data are stored in self.data They can be accessed as follows: self.attributes['gender'] # list of genders self.data[6]['gender'] # gender for the sixth data object self.data[6]['content'] # content for the sixth data object """ from nltk.corpus import find_corpus_file basedir = get_basedir() # Look for the file try_filename = find_corpus_file("paradigms", p_filename) try: f = open(try_filename) p_filename = try_filename except IOError: print "Cannot find file" return None f.close() # These variables will be set by this method self.attributes = {} # A new dictionary self.data = [] # A new list # XML admin: create Reader object, parse document reader = Sax2.Reader() doc = reader.fromStream(p_filename) # Cycle through the given attributes and add them to self.attributes # for <name> in <attributes> attributes = doc.getElementsByTagName('attributes')[0] for name in attributes.getElementsByTagName('name'): # Setup a list of attribute values tmp_list = [] # for each value under name, store in list for value in name.getElementsByTagName('value'): tmp_list.append(value.getAttribute('value')) # Store list of values in dictionary self.attributes[name.getAttribute('name')] = tmp_list # Cycle through data objects and add them to self.data # for <form> in <paradigm> forms = doc.getElementsByTagName('paradigm')[0] for form in forms.getElementsByTagName('form'): # Initialise a temporary dictionary tmp_dict = {} for value in form.getElementsByTagName('attribute'): tmp_dict[value.getAttribute('name')] = value.getAttribute('value') # Add the new dictionary to the data list self.data.append(tmp_dict) # Talk to the user print "Paradigm information successfully loaded from file:", p_filename # State the number and print out a list of attributes print " "*4 + str(len(self.attributes)) + " attributes imported:", for att in self.attributes: print att, print # State the 
number of paradigm objects imported print " "*4 + str(len(self.data)) + " paradigm objects imported." return
import os.path grammar = { 'toolbox': (('_sh',), ('_DateStampHasFourDigitYear', 'entry')), 'entry': (('lx',), ('hm', 'sense', 'dt')), 'sense': (('sn', 'ps'), ('pn', 'gv', 'dv', 'gn', 'gp', 'dn', 'rn', 'ge', 'de', 're', 'example', 'lexfunc')), 'example': (('rf', 'xv',), ('xn', 'xe')), 'lexfunc': (('lf',), ('lexvalue',)), 'lexvalue': (('lv',), ('ln', 'le')), } db = toolbox.ToolboxData() db.open(find_corpus_file('toolbox', 'iu_mien_samp.db')) lexicon = db.grammar_parse('toolbox', grammar, encoding='utf8') tree = ElementTree(lexicon) tree.write('iu_mien_samp.xml', encoding='utf8') num_lexemes = 0 num_senses = 0 num_examples = 0 for lexeme in lexicon.findall('entry'): num_lexemes += 1 for sense in lexeme.findall('sense'): num_senses += 1 for example in sense.findall('example'): num_examples += 1 print 'num. lexemes =', num_lexemes print 'num. senses =', num_senses print 'num. examples =', num_examples