def demo(lexicon=None):
    """Print a short tour of a Toolbox lexicon held in an ElementTree.

    Three things are printed: the tag and text of ``lexicon[3][0]``, the
    tag and text of the first ten fields of the first ``record`` element,
    and the text of the first ten ``record/lx`` fields.

    :param lexicon: root element to explore.  When omitted, the
        ``rotokas.dic`` lexicon is loaded via ``nltk.corpus.toolbox``.
    """
    from itertools import islice

    if lexicon is None:
        # Imported lazily so NLTK is only needed for the default corpus.
        from nltk.corpus import toolbox
        lexicon = toolbox.xml('rotokas.dic')

    # Every print() call receives exactly one argument, so the output is
    # the same under the Python 2 print statement and the Python 3 function.
    print('first field in fourth record:')
    first_field = lexicon[3][0]
    print(first_field.tag)
    print(first_field.text)

    print('\nfields in sequential order:')
    # Iterating an element yields its children, i.e. the record's fields.
    for field in islice(lexicon.find('record'), 10):
        print('%s %s' % (field.tag, field.text))

    print('\nlx fields:')
    for field in islice(lexicon.findall('record/lx'), 10):
        print(field.text)
from collections import defaultdict

# Speakers listed in top5 are shortened to the first four characters of
# their name; any other speaker falls back to the label 'OTH'.
# NOTE(review): top5 is defined outside this excerpt -- presumably
# (speaker, count) pairs; confirm where it is built.
abbreviate = defaultdict(lambda: 'OTH')
abbreviate.update((name, name[:4]) for name, _ in top5)
speaker_seq2 = [abbreviate[name] for name in speaker_seq]

# Conditional frequency distribution over adjacent pairs of abbreviated
# speakers, printed as a table.
speaker_pairs = nltk.bigrams(speaker_seq2)
cfd = nltk.ConditionalFreqDist(speaker_pairs)
cfd.tabulate()

# 11.4.4 Accessing Toolbox data with ElementTree
from nltk.corpus import toolbox

# Two ways to reach the contents of the lexicon object.
# 1) By index
# Index access: lexicon[3] returns entry number 3 (the fourth entry, counting
# from zero) and lexicon[3][0] returns its first field.
# The bare expressions below only display a value in an interactive session.
lexicon = toolbox.xml('rotokas.dic')
first_field = lexicon[3][0]
first_field.tag
first_field.text

# 2) By path
# Path access: take every match of 'record/lx', read the element's text
# content and normalise it to lower case.
[lx_field.text.lower() for lx_field in lexicon.findall('record/lx')]

# Toolbox data is in XML format.
import sys
from nltk.util import elementtree_indent
from xml.etree.ElementTree import ElementTree

elementtree_indent(lexicon)
fourth_entry_tree = ElementTree(lexicon[3])
tree = fourth_entry_tree
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Print the average number of fields per record in the Rotokas lexicon, then
write one record to standard output as XML.

Corresponds to
12.3.1   Accessing Toolbox Data
in http://nltk.sourceforge.net/lite/doc/en/data.html
"""

import sys
# The standard-library ElementTree replaces the bundled nltk.etree copy,
# which current NLTK releases no longer ship.
from xml.etree.ElementTree import ElementTree

from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')

# Average record size, measured in child elements (fields) per record.
records = lexicon.findall('record')
num_entries = len(records)
sum_size = sum(len(entry) for entry in records)
# Floor division keeps the integer result the original Python 2 "/" gave,
# and gives the same value on Python 3.
print(sum_size // num_entries)

# Serialise the record at index 3 of the record list.
fourth_entry = records[3]
tree = ElementTree(fourth_entry)
if sys.version_info[0] >= 3:
    # By default write() emits encoded bytes, which a Python 3 text stream
    # such as sys.stdout rejects; encoding='unicode' makes it write str.
    tree.write(sys.stdout, encoding='unicode')
else:
    # Python 2's ElementTree has no 'unicode' pseudo-encoding.
    tree.write(sys.stdout)
# Natural Language Toolkit: Toolbox Data demonstration
#
# Copyright (C) 2001-2006 NLTK Project
# Author: Greg Aumann <*****@*****.**>
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
Build the list of lower-cased lexemes of the Rotokas lexicon twice: once
with an explicit loop and once with a list comprehension.

Corresponds to
12.3.1   Accessing Toolbox Data
in http://nltk.sourceforge.net/lite/doc/en/data.html
"""

from nltk.corpus import toolbox

lexicon = toolbox.xml('rotokas.dic')

# explicit loop approach
lexemes = []
for lx_field in lexicon.findall('record/lx'):
    lexemes.append(lx_field.text.lower())

# list comprehension approach
lexemes2 = [lx_field.text.lower() for lx_field in lexicon.findall('record/lx')]

# Disabled sanity check that both approaches produce the same list:
# if lexemes != lexemes2:
#     print('error two lists not equal')
# else:
#     print(repr(lexemes))

import re