Example no. 1
def __call__(self, tag, *children, **attrib):
    elem = ET.Element(tag, attrib)
    for item in children:
        if isinstance(item, dict):
            elem.attrib.update(item)
        elif isinstance(item, basestring):
            if len(elem):
                elem[-1].tail = (elem[-1].tail or "") + item
            else:
                elem.text = (elem.text or "") + item
        elif ET.iselement(item):
            elem.append(item)
        else:
            raise TypeError("bad argument: %r" % item)
    return elem
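A minimal usage sketch (the wrapper class E, the make instance, and the sample tags are made up for illustration; the body is the __call__ above, with str standing in for Python 2's basestring):

import xml.etree.ElementTree as ET

class E(object):
    # Hypothetical host class for the __call__ above.
    def __call__(self, tag, *children, **attrib):
        elem = ET.Element(tag, attrib)
        for item in children:
            if isinstance(item, dict):
                elem.attrib.update(item)        # dict arguments merge into attributes
            elif isinstance(item, str):         # basestring on Python 2
                if len(elem):
                    elem[-1].tail = (elem[-1].tail or "") + item
                else:
                    elem.text = (elem.text or "") + item
            elif ET.iselement(item):
                elem.append(item)               # child elements are appended in order
            else:
                raise TypeError("bad argument: %r" % item)
        return elem

make = E()
node = make("entry", {"id": "1"}, "head ", make("sense", "gloss"), " tail")
print(ET.tostring(node))  # b'<entry id="1">head <sense>gloss</sense> tail</entry>'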
Example no. 2
def process(dict_names, gram_fname, xml, encoding):
    """Parse the dictionaries with the given chunk grammar and report
    marker counts and entry-structure patterns; optionally write the
    parsed lexicon to an XML file."""
    gram_file = open(gram_fname, "r")
    gram = gram_file.read()
    gram_file.close()
    lexicon = parse_corpus(dict_names, grammar=gram, encoding=encoding, errors="replace")
    mkr_counts, nonblank_mkr_counts = count_mkrs(lexicon)
    analysis = analyse_dict(lexicon)
    if xml:
        indent(lexicon)
        out_file = open(xml, "w")
        out_file.write(ET.tostring(lexicon, encoding="UTF-8"))
        out_file.close()

    print "analysing files\n%s\n" % "\n".join(dict_names)
    if xml:
        print 'XML lexicon output in file "%s"\n' % xml
    print "====chunk grammar===="
    print gram
    print "\n"
    max_positions = 30
    for structure, patt_dict in analysis.items():
        print "\n\n===%s===: total= %d" % (structure, pattern_count(patt_dict))
        for pattern, positions in sorted(patt_dict.items(), key=lambda t: (-len(t[1]), t[0])):
            if len(positions) <= max_positions:
                pos_str = "Entries: %s" % ", ".join(positions)
            else:
                pos_str = "Too many entries to list."
            print "\t%5d:  %s %s" % (len(positions), ":".join(pattern), pos_str)
    print "\n\n"
    print "mkr\tcount\tnonblank"
    for mkr in mkr_counts:
        print "%s\t%5d\t%5d" % (mkr, mkr_counts.get(mkr, 0), nonblank_mkr_counts.get(mkr, 0))
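The XML branch leans on an indent() pretty-printer defined elsewhere in the module; the write itself is plain ElementTree serialization. A rough sketch of just that step (element contents are invented):

import xml.etree.ElementTree as ET

lexicon = ET.Element('lexicon')
ET.SubElement(lexicon, 'record').text = 'kali'
with open('lexicon.xml', 'wb') as out_file:  # bytes mode: tostring() with an encoding returns bytes on Python 3
    out_file.write(ET.tostring(lexicon, encoding='UTF-8'))
# file contents: <?xml version='1.0' encoding='UTF-8'?><lexicon><record>kali</record></lexicon>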
Example no. 3
def load_ace_file(textfile, fmt):
    print '  - %s' % os.path.split(textfile)[1]
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )

    # Read the text file, and mark the entities.
    text = open(textfile).read()
    
    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s,e,typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
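The tag-stripping step is the delicate part: the ACE offsets are computed over text with the markup removed, so every tag except <TEXT> and </TEXT> must be deleted before slicing. The same regex in isolation, on an invented sample:

import re

raw = '<DOC><DOCNO>X1</DOCNO><TEXT>Hello <b>World</b></TEXT></DOC>'
stripped = re.sub(r'<(?!/?TEXT)[^>]+>', '', raw)  # negative lookahead spares <TEXT> and </TEXT>
print(stripped)  # <TEXT>Hello World</TEXT>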
Example no. 4
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError('concat() expects at least one object!')

    types = set([d.__class__ for d in docs])

    # If they're all strings, use string concatenation.
    if types.issubset([str, unicode, basestring]):
        return reduce((lambda a, b: a + b), docs, '')

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ, AbstractCorpusView):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(docs[0]):  # check an instance: iselement() expects an element, not the class in typ
            xmltree = ElementTree.Element('documents')
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
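A sketch of how the simple dispatch branches behave, in Python 3 spelling (reduce lives in functools there, the str/unicode/basestring test collapses to str, and the element check tests an instance rather than a class):

import xml.etree.ElementTree as ET
from functools import reduce

print(reduce(lambda a, b: a + b, [['a', 'b'], ['c']], []))  # ['a', 'b', 'c']
print(reduce(lambda a, b: a + b, ['foo', 'bar'], ''))       # foobar

docs = [ET.Element('doc'), ET.Element('doc')]
if all(ET.iselement(d) for d in docs):
    merged = ET.Element('documents')
    for d in docs:
        merged.append(d)
    print(ET.tostring(merged))  # b'<documents><doc /><doc /></documents>'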
Example no. 5
    def roleset(self, roleset_id):
        """
        @return: the xml description for the given roleset.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' %
                             roleset_id)

        etree = ElementTree.parse(self.abspath(framefile)).getroot()
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        else:
            raise ValueError('Roleset %s not found in %s' %
                             (roleset_id, framefile))
Example no. 6
    def roleset(self, roleset_id):
        """
        @return: the xml description for the given roleset.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        else:
            raise ValueError('Roleset %s not found in %s' %
                             (roleset_id, framefile))
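The for/else carries the error handling: the else clause runs only when the loop finishes without returning, i.e. when no roleset id matched. The same pattern as a self-contained sketch over an invented frame file:

import xml.etree.ElementTree as ET

frame = ET.fromstring(
    '<frameset><predicate lemma="run">'
    '<roleset id="run.01"/><roleset id="run.02"/>'
    '</predicate></frameset>')

def find_roleset(etree, roleset_id):
    for roleset in etree.findall('predicate/roleset'):
        if roleset.attrib['id'] == roleset_id:
            return roleset
    else:
        raise ValueError('Roleset %s not found' % roleset_id)

print(find_roleset(frame, 'run.02').attrib)  # {'id': 'run.02'}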
Example no. 7
def add_default_fields(elem, default_fields):
    """Add blank elements and subelements specified in default_fields.
    
    @param elem: toolbox data in an elementtree structure
    @type elem: ElementTree._ElementInterface
    @param default_fields: fields to add to each type of element and subelement
    @type default_fields: dictionary of tuples
    """
    try:
        default = default_fields[elem.tag]
    except KeyError:
        pass
    else:
        for field in default:
            if elem.find(field) is None:
                ET.SubElement(elem, field)
    for child in elem:
        add_default_fields(child, default_fields)
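A self-contained run of the function above on a made-up toolbox fragment, showing the blank subelements being filled in:

import xml.etree.ElementTree as ET

def add_default_fields(elem, default_fields):
    # body repeated from above so the sketch runs on its own
    try:
        default = default_fields[elem.tag]
    except KeyError:
        pass
    else:
        for field in default:
            if elem.find(field) is None:
                ET.SubElement(elem, field)
    for child in elem:
        add_default_fields(child, default_fields)

data = ET.fromstring('<toolbox_data><record><lx>kali</lx></record></toolbox_data>')
add_default_fields(data, {'record': ('ps', 'ge')})
print(ET.tostring(data))
# b'<toolbox_data><record><lx>kali</lx><ps /><ge /></record></toolbox_data>'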
Example no. 8
    def read_block(self, stream):
        # Decide which lexical element we're in.
        lexelt_num = bisect.bisect_right(self._lexelt_starts,
                                         stream.tell()) - 1
        lexelt = self._lexelts[lexelt_num]

        instance_lines = []
        in_instance = False
        while True:
            line = stream.readline()
            if line == '':
                assert instance_lines == []
                return []

            # Start of a lexical element?
            if line.lstrip().startswith('<lexelt'):
                lexelt_num += 1
                m = re.search('item=("[^"]+"|\'[^\']+\')', line)
                assert m is not None  # <lexelt> has no 'item=...'
                lexelt = m.group(1)[1:-1]
                if lexelt_num < len(self._lexelts):
                    assert lexelt == self._lexelts[lexelt_num]
                else:
                    self._lexelts.append(lexelt)
                    self._lexelt_starts.append(stream.tell())

            # Start of an instance?
            if line.lstrip().startswith('<instance'):
                assert instance_lines == []
                in_instance = True

            # Body of an instance?
            if in_instance:
                instance_lines.append(line)

            # End of an instance?
            if line.lstrip().startswith('</instance'):
                xml_block = '\n'.join(instance_lines)
                xml_block = _fixXML(xml_block)
                inst = ElementTree.fromstring(xml_block)
                return [self._parse_instance(inst, lexelt)]
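The bisect call at the top maps a stream offset back to the lexical element whose block starts at or before it. Just that arithmetic, with invented offsets:

import bisect

lexelt_starts = [0, 1200, 5300]   # offsets where each <lexelt> begins
lexelts = ['art.n', 'authority.n', 'bar.n']

for tell in (0, 1199, 1200, 9999):
    num = bisect.bisect_right(lexelt_starts, tell) - 1
    print(tell, lexelts[num])
# 0 art.n / 1199 art.n / 1200 authority.n / 9999 bar.n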
Example no. 9
def fromstring(s):
    return CRFInfo._read(ElementTree.fromstring(s))
Example no. 10
import codecs, unicodedata
import nltk

# `line` is assumed to be a unicode string read earlier from a codecs-opened file.
for c in line:
    if ord(c) > 127:
        print '%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c))

print line.find(u'zosta\u0142y')
line = line.lower()

import re
print line.encode('unicode_escape')
m = re.search(u'\u015b\w*', line)
print m.group()

from nltk.tokenize import WordTokenizer
tokenizer = WordTokenizer()
print tokenizer.tokenize(line)

path = nltk.data.find('samples/sinorama-gb.xml')
f = codecs.open(path, encoding='gb2312')
lines = f.readlines()
for l in lines:
    l = l[:-1]
    utf_enc = l.encode('utf8')
    print repr(utf_enc)

path = nltk.data.find('samples/sinorama-utf8.xml')
from nltk.etree import ElementTree as ET
tree = ET.parse(path)
text = tree.findtext('sent')
uni_text = text.encode('utf8')
print repr(uni_text.splitlines()[1])
print "text=", text
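On Python 3 the same round trip needs no codecs gymnastics, since parsed text is already str; a rough equivalent of the last block with an inline document instead of the sample file:

import xml.etree.ElementTree as ET

tree = ET.fromstring('<doc><sent>\u0141\u00f3d\u017a</sent></doc>')
text = tree.findtext('sent')
print(text)                  # Łódź
print(text.encode('utf8'))   # b'\xc5\x81\xc3\xb3d\xc5\xba'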
Example no. 11
#!/usr/bin/env python

"""
Build the corpus package index.  Usage:

  build_pkg_index.py <path-to-packages> <base-url> <output-file>
"""

xml_header = """<?xml version="1.0"?>
<?xml-stylesheet href="index.xsl" type="text/xsl"?>
"""

import sys
from nltk.downloader import build_index
from nltk.etree import ElementTree

if len(sys.argv) != 4:
    print "Usage: "
    print "build_pkg_index.py <path-to-packages> <base-url> <output-file>"
    sys.exit(-1)

ROOT, BASE_URL, OUT = sys.argv[1:]

index = build_index(ROOT, BASE_URL)
s = ElementTree.tostring(index)
out = open(OUT, 'w')
out.write(xml_header)
out.write(s)
out.close()

Example no. 12
def parse_from_file(self, filename):
    self._raw = ElementTree.parse(filename)
    self.filename_ = filename
    self.parse_tree(self._raw)
Example no. 13
def parse_from_string(self, string):
    raw = ElementTree.fromstring(string)
    self.parse_tree(raw)
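fromstring() and parse() are the two entry points these last two wrappers split between: one takes a string, the other a filename or file object. A minimal contrast:

import xml.etree.ElementTree as ET

root = ET.fromstring('<config><item key="a"/></config>')  # string -> Element
print(root.find('item').get('key'))                       # a

# parse() returns an ElementTree wrapping the root element:
# tree = ET.parse('config.xml'); root = tree.getroot()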