def demo_flat():
    """Dump the flat-parsed iu_mien_samp.db lexicon to stdout as XML."""
    import sys
    from nltk.etree.ElementTree import ElementTree

    lexicon = toolbox.xml('iu_mien_samp.db', key='lx', encoding='utf8')
    ElementTree(lexicon).write(sys.stdout)
def demo():
    """Parse a Toolbox settings file and round-trip it through XML.

    Reads demos/MDF_AltH.typ (GBK-encoded), prints the paper size it
    declares, writes the parsed tree to test.xml, and echoes the
    re-serialized settings (GBK-encoded) to stdout.
    """
    from nltk.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    settings.open("demos/MDF_AltH.typ")
    # unwrap=False keeps the wrapper elements so the full hierarchy survives.
    tree = settings.parse(unwrap=False, encoding="gbk")
    # Parenthesized single-argument print is identical on Python 2 and
    # valid on Python 3 (the original used py2-only print statements).
    print(tree.find("expset/expMDF/rtfPageSetup/paperSize").text)
    settings_tree = ElementTree(tree)
    settings_tree.write("test.xml")
    print(to_settings_string(settings_tree).encode("gbk"))
def demo():
    """Parse a Toolbox settings file and round-trip it through XML.

    Reads demos/MDF_AltH.typ (GBK-encoded), prints the paper size it
    declares, writes the parsed tree to test.xml, and echoes the
    re-serialized settings (GBK-encoded) to stdout.
    """
    from nltk.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    settings.open('demos/MDF_AltH.typ')
    # unwrap=False keeps the wrapper elements so the full hierarchy survives.
    tree = settings.parse(unwrap=False, encoding='gbk')
    # Parenthesized single-argument print is identical on Python 2 and
    # valid on Python 3 (the original used py2-only print statements).
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    settings_tree.write('test.xml')
    print(to_settings_string(settings_tree).encode('gbk'))
def demo():
    """Demonstrate Toolbox data access.

    Dumps selected fields from the rotokas.dic lexicon, then parses an
    MDF settings file and prints its paper size and serialized form.
    """
    from itertools import islice

    file_path = nltk.data.find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print('first field in fourth record:')
    print(lexicon[3][0].tag)
    print(lexicon[3][0].text)
    print('\nfields in sequential order:')
    # Only the first record's children; islice caps the output at 10.
    for field in islice(lexicon.find('record'), 10):
        # %-formatting reproduces py2's "print tag, text" output exactly.
        print('%s %s' % (field.tag, field.text))
    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)

    from nltk.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    file_path = nltk.data.find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print(tree.find('expset/expMDF/rtfPageSetup/paperSize').text)
    settings_tree = ElementTree(tree)
    print(to_settings_string(settings_tree).encode('utf8'))
def __init__(self, arq):
    """Initialize the instance from a wiki XML export file.

    :param arq: path (or file object) of the XML file to parse.
    """
    # Build a tree from the fields of the XML file.
    arq_xml = ElementTree().parse(arq)
    # Direct harvesting from the ElementTree structure.
    # NOTE(review): the child indices below assume a fixed export layout
    # (siteinfo, page/revision/contributor) -- confirm against the dumps.
    self.sitename = arq_xml[0][0].text
    self.title = arq_xml[1][0].text
    self.alticleid = arq_xml[1][1].text  # (sic) name kept for compatibility
    self.timestamp = arq_xml[1][2][1].text
    self.username = arq_xml[1][2][2][0].text
    self.userid = arq_xml[1][2][2][1].text
    self.texto = arq_xml[1][2][4].text
    # Text processing: split the text into chunks on ' == ' headings.
    self.tokens = re.split(r'\s*==\s*', self.texto)
    # Pair heading/body tokens into Discussao objects.  The original bound
    # range(len(tokens)/2) overran the list when len(tokens) was even
    # (index 2*n+2 == len(tokens)); (len-1)//2 fixes that and keeps the
    # integer-division semantics on Python 3 as well.
    self.discussoes = []
    for n in range((len(self.tokens) - 1) // 2):
        self.discussoes.append(
            Discussao(self.tokens[2 * n + 1], self.tokens[2 * n + 2], n + 1))
def readXML(self, fileName=None):
    """Extract POS, NER and WORD information from a CoreNLP-style XML file.

    :param fileName: XML formatted file name with full directory path to
        read, string.  Defaults to ``self.fileName`` when omitted.

    Return dictionary with:

    * **ner_events_in_folder** -- list of per-token dicts (pos, ner, word,
      sentence_id, token_number and, when present, normalizedNer).
    * **all_sentences** -- list of dicts pairing each sentence id with all
      token dicts of that sentence.
    """
    if fileName is None:
        fileName = self.fileName

    # Prefer the historical NLTK ElementTree wrappers; fall back to the
    # standard library when they are unavailable.
    try:
        from nltk.etree.ElementTree import ElementTree
    except Exception:
        try:
            from nltk.ElementTree import ElementTree
        except Exception:
            import xml.etree.ElementTree as ET

    fileparsing = None  # avoid NameError below if parsing fails outright
    try:
        try:
            fileparsing = ElementTree().parse(fileName)
        except Exception:
            fileparsing = ET.ElementTree().parse(fileName)
    except Exception as e:
        print("{FileHandler} [readXML] fileHandler error.... %s" % e)

    # An Element with no children is falsy, so compare against None
    # explicitly (the original `if fileparsing:` skipped empty documents).
    if fileparsing is not None:
        try:
            doc = fileparsing[0]
            sentences = doc[0]
        except Exception:
            doc = fileparsing.getroot()
            sentences = doc.getroot()
        # list() replaces getchildren(), which was removed in Python 3.9.
        sentences = list(sentences)
        logging.info("{FileHandler} [readXML] Parsing XML: %s" % fileName)
        fileInfo = []
        allSentences = []
        for sentence in sentences:
            tokensInASentence = []
            sentenceId = sentence.attrib.get('id')
            for tokens in sentence.findall('tokens'):
                for a in range(len(tokens)):
                    pos = tokens[a][4].text
                    ner = tokens[a][5].text
                    tokenNumber = tokens[a].attrib.get('id')
                    word = tokens[a][0].text
                    try:
                        # Child 6 (normalized NER) is optional.
                        normalizedNer = tokens[a][6].text
                        event = {'pos': pos, 'ner': ner, 'word': word,
                                 'sentence_id': sentenceId,
                                 'normalizedNer': normalizedNer,
                                 'token_number': tokenNumber}
                    except Exception:
                        # The original referenced the undefined name
                        # `sentence_id` here, raising NameError for every
                        # token lacking a normalized-NER child.
                        event = {'pos': pos, 'ner': ner, 'word': word,
                                 'sentence_id': sentenceId,
                                 'token_number': tokenNumber}
                    fileInfo.append(event)
                    tokensInASentence.append(event)
            allSentences.append({'sentence_id': sentenceId,
                                 'all_tokens': tokensInASentence})
        return {'ner_events_in_folder': fileInfo,
                'all_sentences': allSentences}
    else:
        print("{FileHandler} [readXML] fileparsing error.")
        # Original formatted a string with no conversion specifier,
        # raising TypeError; include the %s it clearly intended.
        logging.info("{FileHandler} [readXML] Parsing XML error: %s\n"
                     % fileName)
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar describing how fields nest into records in the sample db.
grammar = r"""
    lexfunc: {<lf>(<lv><ln|le>*)*}
    example: {<rf|xv><xn|xe>*}
    sense: {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
    record: {<lx><hm><sense>+<dt>}
    """

# Parse the sample lexicon with the grammar and emit indented XML.
db = toolbox.ToolboxData()
db.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = db.chunk_parse(grammar, encoding='utf8')
toolbox.data.indent(lexicon)
ElementTree(lexicon).write(sys.stdout, encoding='utf8')
        # (fragment -- the enclosing function's opening lines fall outside
        # this chunk; the indentation here is a reconstruction)
        pass
    else:
        # No failed removal: treat as a full "exeunt" and reset the stage.
        ppl_on_stage, incidence_list = exeuntAct()
    return ppl_on_stage, incidence_list

def exeuntAct():
    # "Exeunt": everyone leaves the stage -- reset both accumulators.
    return [], []

''' Prep stuff for doing the rest of this... '''

# get the file and such
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
merchant = ElementTree().parse(merchant_file)

# people
speaker_seq = [s.text.upper() for s in \
    merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
speaker_freq = nltk.FreqDist(speaker_seq)
# NOTE(review): slicing .keys() is Python-2-only; Python 3 would need
# list(speaker_freq.keys())[:10].
top10 = speaker_freq.keys()[:10]
# Map each frequent speaker to a 4-letter tag; everyone else is 'OTHE'.
mapping = nltk.defaultdict(lambda: 'OTHE')
for s in top10:
    mapping[s] = s[:4].upper()

# stage actions
stage_seq = [s.text for s in merchant.findall('ACT/SCENE/STAGEDIR')]
for i, stage in enumerate(stage_seq):
    stage_seq[i] = nltk.word_tokenize(stage)

# Tokens worth keeping from stage directions: speaker names plus verbs.
keep_list = speaker_freq.keys()
keep_list.extend(['EXIT', 'ENTER', 'EXEUNT', 'WITH'])
# (loop body continues beyond this chunk)
for i, entry in enumerate(stage_seq):
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
corresponds to 12.3.1 Accessing Toolbox Data in
http://nltk.sourceforge.net/lite/doc/en/data.html
"""

from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')

# Average number of fields per record.
sum_size = num_entries = 0
for entry in lexicon.findall('record'):
    num_entries += 1
    sum_size += len(entry)
# // preserves the original Python 2 integer-division result on Python 3;
# the parenthesized print is valid on both versions (the original used a
# Python-2-only print statement).
print(sum_size // num_entries)

from nltk.etree.ElementTree import ElementTree
import sys

# Dump the fourth record to stdout as XML.
fourth_entry = lexicon.findall('record')[3]
tree = ElementTree(fourth_entry)
tree.write(sys.stdout)
def readXML(self, fileName=None):
    """Extract POS, NER and WORD information from a CoreNLP-style XML file.

    :param fileName: XML formatted file name with full directory path to
        read, string.  Defaults to ``self.fileName`` when omitted.

    Return dictionary with:

    * **ner_events_in_folder** -- list of per-token dicts (pos, ner, word,
      sentence_id, token_number and, when present, normalizedNer).
    * **all_sentences** -- list of dicts pairing each sentence id with all
      token dicts of that sentence.
    """
    if fileName is None:
        fileName = self.fileName

    # Prefer the historical NLTK ElementTree wrappers; fall back to the
    # standard library when they are unavailable.
    try:
        from nltk.etree.ElementTree import ElementTree
    except Exception:
        try:
            from nltk.ElementTree import ElementTree
        except Exception:
            import xml.etree.ElementTree as ET

    fileparsing = None  # avoid NameError below if parsing fails outright
    try:
        try:
            fileparsing = ElementTree().parse(fileName)
        except Exception:
            fileparsing = ET.ElementTree().parse(fileName)
    except Exception as e:
        print("{FileHandler} [readXML] fileHandler error.... %s" % e)

    # An Element with no children is falsy, so compare against None
    # explicitly (the original `if fileparsing:` skipped empty documents).
    if fileparsing is not None:
        try:
            doc = fileparsing[0]
            sentences = doc[0]
        except Exception:
            doc = fileparsing.getroot()
            sentences = doc.getroot()
        # list() replaces getchildren(), which was removed in Python 3.9.
        sentences = list(sentences)
        logging.info("{FileHandler} [readXML] Parsing XML: %s" % fileName)
        fileInfo = []
        allSentences = []
        for sentence in sentences:
            tokensInASentence = []
            sentenceId = sentence.attrib.get('id')
            for tokens in sentence.findall('tokens'):
                for a in range(len(tokens)):
                    pos = tokens[a][4].text
                    ner = tokens[a][5].text
                    tokenNumber = tokens[a].attrib.get('id')
                    word = tokens[a][0].text
                    try:
                        # Child 6 (normalized NER) is optional.
                        normalizedNer = tokens[a][6].text
                        event = {
                            'pos': pos,
                            'ner': ner,
                            'word': word,
                            'sentence_id': sentenceId,
                            'normalizedNer': normalizedNer,
                            'token_number': tokenNumber
                        }
                    except Exception:
                        # The original referenced the undefined name
                        # `sentence_id` here, raising NameError for every
                        # token lacking a normalized-NER child.
                        event = {
                            'pos': pos,
                            'ner': ner,
                            'word': word,
                            'sentence_id': sentenceId,
                            'token_number': tokenNumber
                        }
                    fileInfo.append(event)
                    tokensInASentence.append(event)
            allSentences.append({
                'sentence_id': sentenceId,
                'all_tokens': tokensInASentence
            })
        return {
            'ner_events_in_folder': fileInfo,
            'all_sentences': allSentences
        }
    else:
        print("{FileHandler} [readXML] fileparsing error.")
        # Original formatted a string with no conversion specifier,
        # raising TypeError; include the %s it clearly intended.
        logging.info("{FileHandler} [readXML] Parsing XML error: %s\n"
                     % fileName)
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar describing how fields nest into records in the sample db.
grammar = r"""
    lexfunc: {<lf>(<lv><ln|le>*)*}
    example: {<rf|xv><xn|xe>*}
    sense: {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
    record: {<lx><hm><sense>+<dt>}
    """

# Parse the sample lexicon with the grammar and emit indented XML.
db = toolbox.ToolboxData()
db.open(find_corpus_file("toolbox", "iu_mien_samp.db"))
lexicon = db.chunk_parse(grammar, encoding="utf8")
toolbox.data.indent(lexicon)
ElementTree(lexicon).write(sys.stdout, encoding="utf8")
    # (fragment -- the enclosing function's opening lines fall outside this
    # chunk; the indentation here is a reconstruction.  NOTE(review): the
    # `else:` could bind to the `try` instead of the `for` -- confirm
    # against the full source.)
    for ppl in details:
        try:
            ppl_on_stage.remove(mapping[ppl].upper())
        except ValueError:
            # Speaker was not on stage; nothing to remove.
            pass
    else:
        # Treat as a full "exeunt" and reset the stage.
        ppl_on_stage, incidence_list = exeuntAct()
    return ppl_on_stage, incidence_list

def exeuntAct():
    # "Exeunt": everyone leaves the stage -- reset both accumulators.
    return [], []

''' Prep stuff for doing the rest of this... '''

# get the file and such
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
merchant = ElementTree().parse(merchant_file)

# people
speaker_seq = [s.text.upper() for s in \
    merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
speaker_freq = nltk.FreqDist(speaker_seq)
# NOTE(review): slicing .keys() is Python-2-only; Python 3 would need
# list(speaker_freq.keys())[:10].
top10 = speaker_freq.keys()[:10]
# Map each frequent speaker to a 4-letter tag; everyone else is 'OTHE'.
mapping = nltk.defaultdict(lambda: 'OTHE')
for s in top10:
    mapping[s] = s[:4].upper()

# stage actions
stage_seq = [s.text for s in merchant.findall('ACT/SCENE/STAGEDIR')]
for i, stage in enumerate(stage_seq):
    stage_seq[i] = nltk.word_tokenize(stage)

# Tokens worth keeping from stage directions: speaker names plus verbs.
keep_list = speaker_freq.keys()
keep_list.extend(['EXIT', 'ENTER', 'EXEUNT', 'WITH'])
# (loop body continues beyond this chunk)
for i, entry in enumerate(stage_seq):
# -*- coding: utf-8 -*-
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
corresponds to 12.3.1 Accessing Toolbox Data in
http://nltk.sourceforge.net/lite/doc/en/data.html
"""

from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')

# Average number of fields per record.
sum_size = num_entries = 0
for entry in lexicon.findall('record'):
    num_entries += 1
    sum_size += len(entry)
# // preserves the original Python 2 integer-division result on Python 3;
# the parenthesized print is valid on both versions (the original used a
# Python-2-only print statement).
print(sum_size // num_entries)

from nltk.etree.ElementTree import ElementTree
import sys

# Dump the fourth record to stdout as XML.
fourth_entry = lexicon.findall('record')[3]
tree = ElementTree(fourth_entry)
tree.write(sys.stdout)
    # (fragment -- the opening ``grammar = {`` of this dict literal falls
    # outside this chunk.  Each entry appears to map a parent tag to a pair
    # of child-tag tuples -- confirm against ToolboxData.grammar_parse.)
    'toolbox': (('_sh', ), ('_DateStampHasFourDigitYear', 'entry')),
    'entry': (('lx', ), ('hm', 'sense', 'dt')),
    'sense': (('sn', 'ps'), ('pn', 'gv', 'dv', 'gn', 'gp', 'dn', 'rn',
                             'ge', 'de', 're', 'example', 'lexfunc')),
    'example': (('rf', 'xv', ), ('xn', 'xe')),
    'lexfunc': (('lf', ), ('lexvalue', )),
    'lexvalue': (('lv', ), ('ln', 'le')),
}

# Parse the sample lexicon with the grammar and save it as XML.
db = toolbox.ToolboxData()
db.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = db.grammar_parse('toolbox', grammar, encoding='utf8')
tree = ElementTree(lexicon)
tree.write('iu_mien_samp.xml', encoding='utf8')

# Count lexemes, senses and examples in the parsed hierarchy.
num_lexemes = 0
num_senses = 0
num_examples = 0
for lexeme in lexicon.findall('entry'):
    num_lexemes += 1
    for sense in lexeme.findall('sense'):
        num_senses += 1
        for example in sense.findall('example'):
            num_examples += 1
# NOTE(review): the statements below are Python-2-only print syntax.
print 'num. lexemes =', num_lexemes
print 'num. senses =', num_senses
print 'num. examples =', num_examples

#another approach
def demo_flat():
    """Dump the flat-parsed iu_mien_samp.db lexicon to stdout as XML."""
    import sys
    from nltk.etree.ElementTree import ElementTree

    lexicon = toolbox.xml('iu_mien_samp.db', key='lx', encoding='utf8')
    ElementTree(lexicon).write(sys.stdout)
# Hierarchy grammar: maps each parent tag to a pair of child-tag tuples.
# NOTE(review): the exact semantics of the two tuples come from
# ToolboxData.grammar_parse -- confirm against nltk_contrib.toolbox.
grammar = {
    'toolbox': (('_sh',), ('_DateStampHasFourDigitYear', 'entry')),
    'entry': (('lx',), ('hm', 'sense', 'dt')),
    'sense': (('sn', 'ps'), ('pn', 'gv', 'dv', 'gn', 'gp', 'dn', 'rn',
                             'ge', 'de', 're', 'example', 'lexfunc')),
    'example': (('rf', 'xv'), ('xn', 'xe')),
    'lexfunc': (('lf',), ('lexvalue',)),
    'lexvalue': (('lv',), ('ln', 'le')),
}

# Parse the sample lexicon with the grammar and save it as XML.
db = toolbox.ToolboxData()
db.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = db.grammar_parse('toolbox', grammar, encoding='utf8')
tree = ElementTree(lexicon)
tree.write('iu_mien_samp.xml', encoding='utf8')

# Count lexemes, senses and examples in the parsed hierarchy.
num_lexemes = 0
num_senses = 0
num_examples = 0
for lexeme in lexicon.findall('entry'):
    num_lexemes += 1
    for sense in lexeme.findall('sense'):
        num_senses += 1
        for example in sense.findall('example'):
            num_examples += 1
# %-formatted single-argument prints produce the same output on Python 2
# and 3 (the originals used Python-2-only print statements).
print('num. lexemes = %d' % num_lexemes)
print('num. senses = %d' % num_senses)
print('num. examples = %d' % num_examples)

#another approach