def __call__(self, tag, *children, **attrib):
    elem = ET.Element(tag, attrib)
    for item in children:
        # Dicts are merged into the element's attributes.
        if isinstance(item, dict):
            elem.attrib.update(item)
        # Strings become the element's text, or the tail of the last child.
        elif isinstance(item, basestring):
            if len(elem):
                elem[-1].tail = (elem[-1].tail or "") + item
            else:
                elem.text = (elem.text or "") + item
        # Elements are appended as children.
        elif ET.iselement(item):
            elem.append(item)
        else:
            raise TypeError("bad argument: %r" % item)
    return elem
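# A minimal usage sketch for the factory __call__ above.  The class name
# `ElementFactory` and its instantiation are assumptions; only the __call__
# behaviour is taken from the code itself.
from nltk.etree import ElementTree as ET

E = ElementFactory()
entry = E('record',
          E('lx', 'kaa'),
          E('ps', 'N'),
          {'id': '1'})          # trailing dict becomes attributes
print ET.tostring(entry)
# -> <record id="1"><lx>kaa</lx><ps>N</ps></record>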
def process(dict_names, gram_fname, xml, encoding):
    """Parse the dictionary files in dict_names with the chunk grammar in
    gram_fname, print marker counts and entry-structure statistics, and
    optionally write the parsed lexicon to the XML file named by xml."""
    gram_file = open(gram_fname, "r")
    gram = gram_file.read()
    gram_file.close()
    lexicon = parse_corpus(dict_names, grammar=gram, encoding=encoding,
                           errors="replace")
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)

    if xml:
        indent(lexicon)
        out_file = open(xml, "w")
        out_file.write(ET.tostring(lexicon, encoding="UTF-8"))
        out_file.close()

    print "analysing files\n%s\n" % "\n".join(dict_names)
    if xml:
        print 'XML lexicon output in file "%s"\n' % xml
    print "====chunk grammar===="
    print gram
    print "\n"

    max_positions = 30
    for structure, patt_dict in analysis.items():
        print "\n\n===%s===: total= %d" % (structure, pattern_count(patt_dict))
        for pattern, positions in sorted(patt_dict.items(),
                                         key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = "Entries: %s" % ", ".join(positions)
            else:
                pos_str = "Too many entries to list."
            print "\t%5d: %s %s" % (len(positions), ":".join(pattern), pos_str)
    print "\n\n"

    print "mkr\tcount\tnonblank"
    for mkr in mkr_counts:
        print "%s\t%5d\t%5d" % (mkr, mkr_counts.get(mkr, 0),
                                nonblank_mkr_counts.get(mkr, 0))
def load_ace_file(textfile, fmt):
    print ' - %s' % os.path.split(textfile)[1]
    annfile = textfile + '.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME':
                continue  # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text) + 1
            entities.append((s, e, typ))

    # Read the text file, and mark the entities.
    text = open(textfile).read()

    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m):
        return ' ' * (m.end() - m.start() - 6)
    text = re.sub('[\s\S]*<TEXT>', subfunc, text)
    text = re.sub('</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s, e, typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s, e, typ) in sorted(entities):
            if s < i:
                s = i  # Overlapping!  Deal with this better?
            if e <= s:
                continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set([d.__class__ for d in docs])

    # If they're all strings, use string concatenation.
    if types.issubset([str, unicode, basestring]):
        return reduce((lambda a, b: a + b), docs, '')

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, AbstractCorpusView):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element('documents')
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
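# Illustrative calls that exercise only the plain-string, list, and tuple
# branches of concat() above (Python 2: `unicode` and `reduce` are builtins).
print concat(['Hello, ', 'world.'])     # -> 'Hello, world.'
print concat([[1, 2], [3], [4, 5]])     # -> [1, 2, 3, 4, 5]
print concat([('a',), ('b', 'c')])      # -> ('a', 'b', 'c')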
def roleset(self, roleset_id):
    """
    @return: the xml description for the given roleset.
    """
    lemma = roleset_id.split('.')[0]
    framefile = 'frames/%s.xml' % lemma
    if framefile not in self._framefiles:
        raise ValueError('Frameset file for %s not found' % roleset_id)

    etree = ElementTree.parse(self.abspath(framefile)).getroot()

    for roleset in etree.findall('predicate/roleset'):
        if roleset.attrib['id'] == roleset_id:
            return roleset
    else:
        raise ValueError('Roleset %s not found in %s' %
                         (roleset_id, framefile))
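# Hedged usage sketch: assumes this method belongs to a PropBank-style corpus
# reader available as `propbank`, and that the frame files use the usual
# <roles>/<role n=... descr=...> layout.
roleset = propbank.roleset('give.01')
for role in roleset.findall('roles/role'):
    print role.attrib['n'], role.attrib['descr']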
def roleset(self, roleset_id):
    """
    @return: the xml description for the given roleset.
    """
    lemma = roleset_id.split('.')[0]
    framefile = 'frames/%s.xml' % lemma
    if framefile not in self._framefiles:
        raise ValueError('Frameset file for %s not found' % roleset_id)

    # n.b.: The encoding for XML fileids is specified by the file
    # itself; so we ignore self._encoding here.
    etree = ElementTree.parse(self.abspath(framefile).open()).getroot()

    for roleset in etree.findall('predicate/roleset'):
        if roleset.attrib['id'] == roleset_id:
            return roleset
    else:
        raise ValueError('Roleset %s not found in %s' %
                         (roleset_id, framefile))
def add_default_fields(elem, default_fields):
    """Add blank elements and subelements specified in default_fields.

    @param elem: toolbox data in an elementtree structure
    @type elem: ElementTree._ElementInterface
    @param default_fields: fields to add to each type of element and subelement
    @type default_fields: dictionary of tuples
    """
    try:
        default = default_fields[elem.tag]
    except KeyError:
        pass
    else:
        for field in default:
            if elem.find(field) is None:
                ET.SubElement(elem, field)
    for child in elem:
        add_default_fields(child, default_fields)
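# Small self-contained example for add_default_fields(); the field names
# ('lx', 'ps', 'ge') are just illustrative Toolbox-style markers.
from nltk.etree import ElementTree as ET

record = ET.Element('record')
ET.SubElement(record, 'lx').text = 'kaa'

add_default_fields(record, {'record': ('lx', 'ps', 'ge')})
print [child.tag for child in record]
# -> ['lx', 'ps', 'ge']   (blank 'ps' and 'ge' added; existing 'lx' kept)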
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set([d.__class__ for d in docs])

    # If they're all strings, use string concatenation.
    if types.issubset([str, unicode, basestring]):
        return reduce((lambda a, b: a + b), docs, '')

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, AbstractCorpusView):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element('documents')
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
def read_block(self, stream):
    # Decide which lexical element we're in.
    lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
    lexelt = self._lexelts[lexelt_num]

    instance_lines = []
    in_instance = False
    while True:
        line = stream.readline()
        if line == '':
            assert instance_lines == []
            return []

        # Start of a lexical element?
        if line.lstrip().startswith('<lexelt'):
            lexelt_num += 1
            m = re.search('item=("[^"]+"|\'[^\']+\')', line)
            assert m is not None  # <lexelt> has no 'item=...'
            lexelt = m.group(1)[1:-1]
            if lexelt_num < len(self._lexelts):
                assert lexelt == self._lexelts[lexelt_num]
            else:
                self._lexelts.append(lexelt)
                self._lexelt_starts.append(stream.tell())

        # Start of an instance?
        if line.lstrip().startswith('<instance'):
            assert instance_lines == []
            in_instance = True

        # Body of an instance?
        if in_instance:
            instance_lines.append(line)

        # End of an instance?
        if line.lstrip().startswith('</instance'):
            xml_block = '\n'.join(instance_lines)
            xml_block = _fixXML(xml_block)
            inst = ElementTree.fromstring(xml_block)
            return [self._parse_instance(inst, lexelt)]
def read_block(self, stream):
    # Decide which lexical element we're in.
    lexelt_num = bisect.bisect_right(self._lexelt_starts, stream.tell()) - 1
    lexelt = self._lexelts[lexelt_num]

    instance_lines = []
    in_instance = False
    while True:
        line = stream.readline()
        if line == '':
            assert instance_lines == []
            return []

        # Start of a lexical element?
        if line.lstrip().startswith('<lexelt'):
            lexelt_num += 1
            m = re.search('item=("[^"]+"|\'[^\']+\')', line)
            assert m is not None  # <lexelt> has no 'item=...'
            lexelt = m.group(1)[1:-1]
            if lexelt_num < len(self._lexelts):
                assert lexelt == self._lexelts[lexelt_num]
            else:
                self._lexelts.append(lexelt)
                self._lexelt_starts.append(stream.tell())

        # Start of an instance?
        if line.lstrip().startswith('<instance'):
            assert instance_lines == []
            in_instance = True

        # Body of an instance?
        if in_instance:
            instance_lines.append(line)

        # End of an instance?
        if line.lstrip().startswith('</instance'):
            xml_block = '\n'.join(instance_lines)
            xml_block = _fixXML(xml_block)
            inst = ElementTree.fromstring(xml_block)
            return [self._parse_instance(inst, lexelt)]
def process(dict_names, gram_fname, xml, encoding):
    """Parse the dictionary files in dict_names with the chunk grammar in
    gram_fname, print marker counts and entry-structure statistics, and
    optionally write the parsed lexicon to the XML file named by xml."""
    gram_file = open(gram_fname, 'r')
    gram = gram_file.read()
    gram_file.close()
    lexicon = parse_corpus(dict_names, grammar=gram, encoding=encoding,
                           errors='replace')
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)

    if xml:
        indent(lexicon)
        out_file = open(xml, "w")
        out_file.write(ET.tostring(lexicon, encoding='UTF-8'))
        out_file.close()

    print 'analysing files\n%s\n' % '\n'.join(dict_names)
    if xml:
        print 'XML lexicon output in file "%s"\n' % xml
    print '====chunk grammar===='
    print gram
    print '\n'

    max_positions = 30
    for structure, patt_dict in analysis.items():
        print '\n\n===%s===: total= %d' % (structure, pattern_count(patt_dict))
        for pattern, positions in sorted(patt_dict.items(),
                                         key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = 'Entries: %s' % ', '.join(positions)
            else:
                pos_str = 'Too many entries to list.'
            print "\t%5d: %s %s" % (len(positions), ':'.join(pattern), pos_str)
    print "\n\n"

    print 'mkr\tcount\tnonblank'
    for mkr in mkr_counts:
        print '%s\t%5d\t%5d' % (mkr, mkr_counts.get(mkr, 0),
                                nonblank_mkr_counts.get(mkr, 0))
def fromstring(s):
    return CRFInfo._read(ElementTree.fromstring(s))
# Fragment from a Unicode-handling walkthrough: `line` is assumed to be a
# unicode string read from a decoded file earlier, and `c` one of its characters.
if ord(c) > 127:
    print '%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c))

print line.find(u'zosta\u0142y')
line = line.lower()
import re
print line.encode('unicode_escape')
m = re.search(u'\u015b\w*', line)
print m.group()

from nltk.tokenize import WordTokenizer
tokenizer = WordTokenizer()
print tokenizer.tokenize(line)

path = nltk.data.find('samples/sinorama-gb.xml')
f = codecs.open(path, encoding='gb2312')
lines = f.readlines()
for l in lines:
    l = l[:-1]
    utf_enc = l.encode('utf8')
    print repr(utf_enc)

path = nltk.data.find('samples/sinorama-utf8.xml')
from nltk.etree import ElementTree as ET
tree = ET.parse(path)
text = tree.findtext('sent')
uni_text = text.encode('utf8')
print repr(uni_text.splitlines()[1])
print "text=", text
#!/usr/bin/env python
"""
Build the corpus package index.  Usage:

    build_pkg_index.py <path-to-packages> <base-url> <output-file>
"""

xml_header = """<?xml version="1.0"?>
<?xml-stylesheet href="index.xsl" type="text/xsl"?>
"""

import sys
from nltk.downloader import build_index
from nltk.etree import ElementTree

if len(sys.argv) != 4:
    print "Usage: "
    print "build_pkg_index.py <path-to-packages> <base-url> <output-file>"
    sys.exit(-1)

ROOT, BASE_URL, OUT = sys.argv[1:]

index = build_index(ROOT, BASE_URL)
s = ElementTree.tostring(index)
out = open(OUT, 'w')
out.write(xml_header)
out.write(s)
out.close()
def parse_from_file(self, filename):
    self._raw = ElementTree.parse(filename)
    self.filename_ = filename
    self.parse_tree(self._raw)
def parse_from_string(self, string):
    raw = ElementTree.fromstring(string)
    self.parse_tree(raw)