Example #1
0
def demo_flat():
    """Write the sample lexicon, keyed on 'lx', to stdout as flat XML."""
    import sys
    from nltk.etree.ElementTree import ElementTree

    lexicon_xml = toolbox.xml('iu_mien_samp.db', key='lx', encoding='utf8')
    ElementTree(lexicon_xml).write(sys.stdout)
Example #2
0
def demo():
    from nltk.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    settings.open("demos/MDF_AltH.typ")
    tree = settings.parse(unwrap=False, encoding="gbk")
    print tree.find("expset/expMDF/rtfPageSetup/paperSize").text
    settings_tree = ElementTree(tree)
    settings_tree.write("test.xml")
    print to_settings_string(settings_tree).encode("gbk")
Example #3
0
def demo():
    from nltk.etree.ElementTree import ElementTree

    settings = ToolboxSettings()
    settings.open('demos/MDF_AltH.typ')
    tree = settings.parse(unwrap=False, encoding='gbk')
    print tree.find('expset/expMDF/rtfPageSetup/paperSize').text
    settings_tree = ElementTree(tree)
    settings_tree.write('test.xml')
    print to_settings_string(settings_tree).encode('gbk')
Example #4
0
def demo():
    """Demonstrate Toolbox access: dump fields from the rotokas lexicon,
    then parse and re-serialize an MDF settings (.typ) file.

    NOTE(review): Python 2 code; relies on module-level `nltk`,
    `ToolboxData`, `ToolboxSettings` and `to_settings_string`.
    """
    from itertools import islice

#    zip_path = nltk.data.find('corpora/toolbox.zip')
#    lexicon = ToolboxData(ZipFilePathPointer(zip_path, 'toolbox/rotokas.dic')).parse()
    # Parse the Rotokas dictionary into an element tree.
    file_path = nltk.data.find('corpora/toolbox/rotokas.dic')
    lexicon = ToolboxData(file_path).parse()
    print 'first field in fourth record:'
    print lexicon[3][0].tag
    print lexicon[3][0].text
    
    # First ten fields of the first 'record' element, in document order.
    print '\nfields in sequential order:'
    for field in islice(lexicon.find('record'), 10):
        print field.tag, field.text

    # Up to ten lexeme (lx) fields across all records.
    print '\nlx fields:'
    for field in islice(lexicon.findall('record/lx'), 10):
        print field.text

    from nltk.etree.ElementTree import ElementTree
    
    # Parse the MDF settings file (unwrap=False keeps the expset hierarchy),
    # echo one setting, then print the whole file re-serialized.
    settings = ToolboxSettings()
    file_path = nltk.data.find('corpora/toolbox/MDF/MDF_AltH.typ')
    settings.open(file_path)
#    settings.open(ZipFilePathPointer(zip_path, entry='toolbox/MDF/MDF_AltH.typ'))
    tree = settings.parse(unwrap=False, encoding='cp1252')
    print tree.find('expset/expMDF/rtfPageSetup/paperSize').text
    settings_tree = ElementTree(tree)
    print to_settings_string(settings_tree).encode('utf8')
Example #5
0
 def __init__(self,arq): #initialize the instance from an XML file path
   # Build a tree from the fields of the XML file:
     arq_xml = ElementTree().parse(arq)
     # Read values straight off the parsed tree (NLTK's ElementTree module).
     # NOTE(review): fixed child indices assume one specific page-export
     # schema -- confirm against the actual XML dumps.
     self.sitename=arq_xml[0][0].text
     self.title=arq_xml[1][0].text
     # NOTE(review): 'alticleid' looks like a typo for 'articleid' -- kept as-is.
     self.alticleid=arq_xml[1][1].text
     self.timestamp=arq_xml[1][2][1].text
     self.username=arq_xml[1][2][2][0].text
     self.userid=arq_xml[1][2][2][1].text
     self.texto=arq_xml[1][2][4].text
   # Text processing:
     # Split the text into chunks at ' == ' headings and keep the pieces
     # as the instance's token list.
     self.tokens=re.split(r'\s*==\s*',self.texto)
     self.discussoes=[]
     # Python 2 integer division: one Discussao per (title, body) token pair.
     for n in range(len(self.tokens)/2):
             self.discussoes.append(Discussao(self.tokens[2*n+1],self.tokens[2*n+2],n+1))
    def readXML(self, fileName = None):
        """Extract POS, NER and WORD info for every token in an XML file.

        :param fileName:
            XML file name with full directory path to read, string;
            defaults to ``self.fileName``.

        Return dictionary with:
            * **ner_events_in_folder** -- one dict per token (pos, ner, word,
              sentence_id, token_number, plus normalizedNer when present).
            * **all_sentences** -- one dict per sentence (sentence_id plus
              the list of that sentence's token dicts).
            ``None`` when parsing fails.
        """
        if fileName is None:
            fileName = self.fileName
        # Prefer the legacy NLTK-bundled ElementTree; fall back to the stdlib.
        try:
            from nltk.etree.ElementTree import ElementTree
        except ImportError:
            try:
                from nltk.ElementTree import ElementTree
            except ImportError:
                import xml.etree.ElementTree as ET
        # BUGFIX: fileparsing was unbound if every parse attempt failed,
        # which made the `if fileparsing` test below raise NameError.
        fileparsing = None
        try:
            try:
                fileparsing = ElementTree().parse(fileName)
            except NameError:
                # NLTK import failed above; use the stdlib parser instead.
                fileparsing = ET.ElementTree().parse(fileName)
        except Exception as e:
            print("{FileHandler} [readXML] fileHandler error.... %s" % e)
        # Same truthiness as the original: None or a childless root is falsy.
        if fileparsing is not None and len(fileparsing):
            # Expected layout: root -> document -> sentences -> sentence*.
            try:
                doc = fileparsing[0]
                sentences = doc[0]
            except IndexError:
                # NOTE(review): parse() returns an Element, which has no
                # getroot(); this fallback from the original looks
                # unreachable -- confirm against real inputs.
                doc = fileparsing.getroot()
                sentences = doc.getroot()
            # BUGFIX: getchildren() was removed in Python 3.9; list() is
            # equivalent on both Python 2 and 3.
            sentences = list(sentences)
            logging.info("{FileHandler} [readXML] Parsing XML: %s", fileName)
            fileInfo = []      # every token event in the file, flat
            allSentences = []  # per-sentence grouping of the same events
            for sentence in sentences:
                tokensInASentence = []
                sentenceId = sentence.attrib.get('id')
                for tokens in sentence.findall('tokens'):
                    for a in range(len(tokens)):
                        # Fixed child positions: 0=word, 4=POS, 5=NER,
                        # 6=normalized NER (optional).
                        pos = tokens[a][4].text
                        ner = tokens[a][5].text
                        tokenNumber = tokens[a].attrib.get('id')
                        word = tokens[a][0].text
                        try:
                            normalizedNer = tokens[a][6].text
                            event = {'pos': pos, 'ner': ner, 'word': word,
                                     'sentence_id': sentenceId,
                                     'normalizedNer': normalizedNer,
                                     'token_number': tokenNumber}
                        except IndexError:
                            # No normalized-NER child for this token.
                            # BUGFIX: was `sentence_id`, an undefined name
                            # (NameError); the variable is `sentenceId`.
                            event = {'pos': pos, 'ner': ner, 'word': word,
                                     'sentence_id': sentenceId,
                                     'token_number': tokenNumber}
                        fileInfo.append(event)
                        tokensInASentence.append(event)
                allSentences.append({'sentence_id': sentenceId,
                                     'all_tokens': tokensInASentence})
            return {'ner_events_in_folder': fileInfo,
                    'all_sentences': allSentences}
        else:
            print("{FileHandler} [readXML] fileparsing error.")
            # BUGFIX: the original %-formatted a string with no placeholder
            # (TypeError); use logging's lazy argument style instead.
            logging.info("{FileHandler} [readXML] Parsing XML error: %s",
                         fileName)
Example #7
0
# -*- coding: utf-8 -*-

# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar fed to chunk_parse: each line groups Toolbox field tags
# into a named constituent (lexfunc / example / sense / record).
grammar = r"""
      lexfunc: {<lf>(<lv><ln|le>*)*}
      example: {<rf|xv><xn|xe>*}
      sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
      record:   {<lx><hm><sense>+<dt>}
    """

# Parse the sample Toolbox database with the grammar, pretty-indent the
# resulting element tree, and write it to stdout as UTF-8 XML.
db = toolbox.ToolboxData()
db.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = db.chunk_parse(grammar, encoding='utf8')
toolbox.data.indent(lexicon)
tree = ElementTree(lexicon)
tree.write(sys.stdout, encoding='utf8')
Example #8
0
                        pass
    else:
        ppl_on_stage, incidence_list = exeuntAct()
    return ppl_on_stage, incidence_list


def exeuntAct():
    """Clear the stage after an act: no people on stage, no incidences."""
    people_on_stage = []
    incidences = []
    return people_on_stage, incidences


'''
Prep stuff for doing the rest of this...
'''
# get the file and such
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
merchant = ElementTree().parse(merchant_file)
# people
# Speakers in document order, upper-cased, then their frequency distribution.
speaker_seq = [s.text.upper() for s in \
               merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
speaker_freq = nltk.FreqDist(speaker_seq)
# Map the ten most frequent speakers to 4-letter codes, all others to 'OTHE'.
# NOTE(review): relies on FreqDist.keys() being frequency-ordered (old NLTK).
top10 = speaker_freq.keys()[:10]
mapping = nltk.defaultdict(lambda: 'OTHE')
for s in top10:
    mapping[s] = s[:4].upper()
# stage actions
# Collect every stage direction, then tokenize each one in place.
stage_seq = [s.text for s in merchant.findall('ACT/SCENE/STAGEDIR')]
for i, stage in enumerate(stage_seq):
    stage_seq[i] = nltk.word_tokenize(stage)
# Tokens worth keeping: speaker names plus stage-direction keywords.
keep_list = speaker_freq.keys()
keep_list.extend(['EXIT', 'ENTER', 'EXEUNT', 'WITH'])
for i, entry in enumerate(stage_seq):
Example #9
0
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
corresponds to 
12.3.1   Accessing Toolbox Data
in http://nltk.sourceforge.net/lite/doc/en/data.html
"""

from nltk.corpus import toolbox

# Parse the Rotokas dictionary into an element tree.
lexicon = toolbox.xml('rotokas.dic')

# Average number of fields per record (Python 2 integer division).
sum_size = num_entries = 0
for entry in lexicon.findall('record'):
    num_entries += 1
    sum_size += len(entry)
print sum_size/num_entries


# Serialize the fourth record to stdout as XML.
from nltk.etree.ElementTree import ElementTree
import sys
fourth_entry = lexicon.findall('record')[3]
tree = ElementTree(fourth_entry)
tree.write(sys.stdout)
    def readXML(self, fileName=None):
        """Extract POS, NER and WORD info for every token in an XML file.

        :param fileName:
            XML file name with full directory path to read, string;
            defaults to ``self.fileName``.

        Return dictionary with:
            * **ner_events_in_folder** -- one dict per token (pos, ner, word,
              sentence_id, token_number, plus normalizedNer when present).
            * **all_sentences** -- one dict per sentence (sentence_id plus
              the list of that sentence's token dicts).
            ``None`` when parsing fails.
        """
        if fileName is None:
            fileName = self.fileName
        # Prefer the legacy NLTK-bundled ElementTree; fall back to the stdlib.
        try:
            from nltk.etree.ElementTree import ElementTree
        except ImportError:
            try:
                from nltk.ElementTree import ElementTree
            except ImportError:
                import xml.etree.ElementTree as ET
        # BUGFIX: fileparsing was unbound if every parse attempt failed,
        # which made the `if fileparsing` test below raise NameError.
        fileparsing = None
        try:
            try:
                fileparsing = ElementTree().parse(fileName)
            except NameError:
                # NLTK import failed above; use the stdlib parser instead.
                fileparsing = ET.ElementTree().parse(fileName)
        except Exception as e:
            print("{FileHandler} [readXML] fileHandler error.... %s" % e)
        # Same truthiness as the original: None or a childless root is falsy.
        if fileparsing is not None and len(fileparsing):
            # Expected layout: root -> document -> sentences -> sentence*.
            try:
                doc = fileparsing[0]
                sentences = doc[0]
            except IndexError:
                # NOTE(review): parse() returns an Element, which has no
                # getroot(); this fallback from the original looks
                # unreachable -- confirm against real inputs.
                doc = fileparsing.getroot()
                sentences = doc.getroot()
            # BUGFIX: getchildren() was removed in Python 3.9; list() is
            # equivalent on both Python 2 and 3.
            sentences = list(sentences)
            logging.info("{FileHandler} [readXML] Parsing XML: %s", fileName)
            fileInfo = []      # every token event in the file, flat
            allSentences = []  # per-sentence grouping of the same events
            for sentence in sentences:
                tokensInASentence = []
                sentenceId = sentence.attrib.get('id')
                for tokens in sentence.findall('tokens'):
                    for a in range(len(tokens)):
                        # Fixed child positions: 0=word, 4=POS, 5=NER,
                        # 6=normalized NER (optional).
                        pos = tokens[a][4].text
                        ner = tokens[a][5].text
                        tokenNumber = tokens[a].attrib.get('id')
                        word = tokens[a][0].text
                        try:
                            normalizedNer = tokens[a][6].text
                            event = {
                                'pos': pos,
                                'ner': ner,
                                'word': word,
                                'sentence_id': sentenceId,
                                'normalizedNer': normalizedNer,
                                'token_number': tokenNumber
                            }
                        except IndexError:
                            # No normalized-NER child for this token.
                            # BUGFIX: was `sentence_id`, an undefined name
                            # (NameError); the variable is `sentenceId`.
                            event = {
                                'pos': pos,
                                'ner': ner,
                                'word': word,
                                'sentence_id': sentenceId,
                                'token_number': tokenNumber
                            }
                        fileInfo.append(event)
                        tokensInASentence.append(event)
                allSentences.append({
                    'sentence_id': sentenceId,
                    'all_tokens': tokensInASentence
                })
            return {
                'ner_events_in_folder': fileInfo,
                'all_sentences': allSentences
            }
        else:
            print("{FileHandler} [readXML] fileparsing error.")
            # BUGFIX: the original %-formatted a string with no placeholder
            # (TypeError); use logging's lazy argument style instead.
            logging.info("{FileHandler} [readXML] Parsing XML error: %s",
                         fileName)
Example #11
0
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
demonstration of grammar parsing
"""

from nltk.etree.ElementTree import ElementTree
from nltk_contrib import toolbox
from nltk.corpus import find_corpus_file
import os.path, sys

# Chunk grammar fed to chunk_parse: each line groups Toolbox field tags
# into a named constituent (lexfunc / example / sense / record).
grammar = r"""
      lexfunc: {<lf>(<lv><ln|le>*)*}
      example: {<rf|xv><xn|xe>*}
      sense:   {<sn><ps><pn|gv|dv|gn|gp|dn|rn|ge|de|re>*<example>*<lexfunc>*}
      record:   {<lx><hm><sense>+<dt>}
    """

# Parse the sample Toolbox database with the grammar, pretty-indent the
# resulting element tree, and write it to stdout as UTF-8 XML.
db = toolbox.ToolboxData()
db.open(find_corpus_file("toolbox", "iu_mien_samp.db"))
lexicon = db.chunk_parse(grammar, encoding="utf8")
toolbox.data.indent(lexicon)
tree = ElementTree(lexicon)
tree.write(sys.stdout, encoding="utf8")
Example #12
0
                for ppl in details:
                    try: ppl_on_stage.remove(mapping[ppl].upper())
                    except ValueError: pass
    else:
        ppl_on_stage, incidence_list = exeuntAct()
    return ppl_on_stage, incidence_list

def exeuntAct():
    """Everyone leaves the stage: return fresh, empty people/incidence lists."""
    return list(), list()
    
'''
Prep stuff for doing the rest of this...
'''
# get the file and such
merchant_file = nltk.data.find('corpora/shakespeare/merchant.xml')
merchant = ElementTree().parse(merchant_file)
# people
# Speakers in document order, upper-cased, then their frequency distribution.
speaker_seq = [s.text.upper() for s in \
               merchant.findall('ACT/SCENE/SPEECH/SPEAKER')]
speaker_freq = nltk.FreqDist(speaker_seq)
# Map the ten most frequent speakers to 4-letter codes, all others to 'OTHE'.
# NOTE(review): relies on FreqDist.keys() being frequency-ordered (old NLTK).
top10 = speaker_freq.keys()[:10]
mapping = nltk.defaultdict(lambda: 'OTHE')
for s in top10:
    mapping[s] = s[:4].upper()
# stage actions
# Collect every stage direction, then tokenize each one in place.
stage_seq = [s.text for s in merchant.findall('ACT/SCENE/STAGEDIR')]
for i, stage in enumerate(stage_seq):
    stage_seq[i] = nltk.word_tokenize(stage)
# Tokens worth keeping: speaker names plus stage-direction keywords.
keep_list = speaker_freq.keys()
keep_list.extend(['EXIT', 'ENTER', 'EXEUNT', 'WITH'])
for i, entry in enumerate(stage_seq):
Example #13
0
# -*- coding: utf-8 -*-

# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
corresponds to 
12.3.1   Accessing Toolbox Data
in http://nltk.sourceforge.net/lite/doc/en/data.html
"""

from nltk.corpus import toolbox

# Parse the Rotokas dictionary into an element tree.
lexicon = toolbox.xml('rotokas.dic')

# Average number of fields per record (Python 2 integer division).
sum_size = num_entries = 0
for entry in lexicon.findall('record'):
    num_entries += 1
    sum_size += len(entry)
print sum_size / num_entries

# Serialize the fourth record to stdout as XML.
from nltk.etree.ElementTree import ElementTree
import sys
fourth_entry = lexicon.findall('record')[3]
tree = ElementTree(fourth_entry)
tree.write(sys.stdout)
Example #14
0
    'toolbox': (('_sh', ), ('_DateStampHasFourDigitYear', 'entry')),
    'entry': (('lx', ), ('hm', 'sense', 'dt')),
    'sense': (('sn', 'ps'), ('pn', 'gv', 'dv', 'gn', 'gp', 'dn', 'rn', 'ge',
                             'de', 're', 'example', 'lexfunc')),
    'example': ((
        'rf',
        'xv',
    ), ('xn', 'xe')),
    'lexfunc': (('lf', ), ('lexvalue', )),
    'lexvalue': (('lv', ), ('ln', 'le')),
}

# Parse the sample database with the hierarchy grammar (defined above),
# save it as XML, then count entries / senses / examples by walking the tree.
db = toolbox.ToolboxData()
db.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = db.grammar_parse('toolbox', grammar, encoding='utf8')
tree = ElementTree(lexicon)
tree.write('iu_mien_samp.xml', encoding='utf8')
num_lexemes = 0
num_senses = 0
num_examples = 0
for lexeme in lexicon.findall('entry'):
    num_lexemes += 1
    for sense in lexeme.findall('sense'):
        num_senses += 1
        for example in sense.findall('example'):
            num_examples += 1
print 'num. lexemes  =', num_lexemes
print 'num. senses   =', num_senses
print 'num. examples =', num_examples

#another approach
#another approach
Example #15
0
def demo_flat():
    """Dump the iu_mien sample database as flat XML on stdout."""
    from nltk.etree.ElementTree import ElementTree
    import sys

    root = toolbox.xml('iu_mien_samp.db', key='lx', encoding='utf8')
    flat_tree = ElementTree(root)
    flat_tree.write(sys.stdout)
Example #16
0
# Toolbox hierarchy grammar: maps a node tag to a pair of field-tag tuples.
# NOTE(review): the pair appears to be (head fields, body fields) as consumed
# by grammar_parse -- confirm against the ToolboxData implementation.
grammar = {
        'toolbox':      (('_sh',), ('_DateStampHasFourDigitYear', 'entry')),
        'entry':          (('lx',), ('hm', 'sense', 'dt')),
        'sense':          (('sn', 'ps'), ('pn', 'gv', 'dv',
                                   'gn', 'gp', 'dn', 'rn',
                                   'ge', 'de', 're',
                                   'example', 'lexfunc')),
        'example':      (('rf', 'xv',), ('xn', 'xe')),
        'lexfunc':      (('lf',), ('lexvalue',)),
        'lexvalue':    (('lv',), ('ln', 'le')),
}

# Parse the sample database with the grammar, save it as XML, then count
# entries / senses / examples by walking the resulting element tree.
db = toolbox.ToolboxData()
db.open(find_corpus_file('toolbox', 'iu_mien_samp.db'))
lexicon = db.grammar_parse('toolbox', grammar, encoding='utf8')
tree = ElementTree(lexicon)
tree.write('iu_mien_samp.xml', encoding='utf8')
num_lexemes = 0
num_senses = 0
num_examples = 0
for lexeme in lexicon.findall('entry'):
    num_lexemes += 1
    for sense in lexeme.findall('sense'):
        num_senses += 1
        for example in sense.findall('example'):
            num_examples += 1
print 'num. lexemes  =', num_lexemes
print 'num. senses   =', num_senses
print 'num. examples =', num_examples

#another approach