Example #1
# Imports needed by this snippet.
import os
import re

import nltk
from xml.etree import ElementTree as ET


def load_ace_file(textfile, fmt):
    print('  - %s' % os.path.split(textfile)[1])
    annfile = textfile+'.tmx.rdc.xml'

    # Read the xml file, and get a list of entities
    entities = []
    xml = ET.parse(open(annfile)).getroot()
    for entity in xml.findall('document/entity'):
        typ = entity.find('entity_type').text
        for mention in entity.findall('entity_mention'):
            if mention.get('TYPE') != 'NAME': continue # only NEs
            s = int(mention.find('head/charseq/start').text)
            e = int(mention.find('head/charseq/end').text)+1
            entities.append( (s, e, typ) )

    # Read the text file, and mark the entities.
    text = open(textfile).read()
    
    # Strip XML tags, since they don't count towards the indices
    text = re.sub('<(?!/?TEXT)[^>]+>', '', text)

    # Blank out anything before/after <TEXT>
    def subfunc(m): return ' '*(m.end()-m.start()-6)
    text = re.sub(r'[\s\S]*<TEXT>', subfunc, text)
    text = re.sub(r'</TEXT>[\s\S]*', '', text)

    # Simplify quotes
    text = re.sub("``", ' "', text)
    text = re.sub("''", '" ', text)

    entity_types = set(typ for (s,e,typ) in entities)

    # Binary distinction (NE or not NE)
    if fmt == 'binary':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree('NE', text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    # Multiclass distinction (NE type)
    elif fmt == 'multiclass':
        i = 0
        toks = nltk.Tree('S', [])
        for (s,e,typ) in sorted(entities):
            if s < i: s = i # Overlapping!  Deal with this better?
            if e <= s: continue
            toks.extend(nltk.word_tokenize(text[i:s]))
            toks.append(nltk.Tree(typ, text[s:e].split()))
            i = e
        toks.extend(nltk.word_tokenize(text[i:]))
        yield toks

    else:
        raise ValueError('bad fmt value')
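
load_ace_file is a generator: it reads one ACE text file together with its <name>.tmx.rdc.xml annotation file and yields an nltk.Tree whose named-entity spans are wrapped in subtrees ('NE' in binary mode, the entity type in multiclass mode). A minimal driver sketch, assuming the ACE documents sit in a local directory; the directory name and the *.sgm glob pattern are placeholders:

import glob
import os

# Hypothetical data directory; each text file must have a matching
# <name>.tmx.rdc.xml annotation file next to it.
ACE_DIR = 'ace_data'

for textfile in glob.glob(os.path.join(ACE_DIR, '*.sgm')):
    for tree in load_ace_file(textfile, fmt='binary'):
        # A flat 'S' tree whose 'NE' subtrees mark the named entities.
        print(tree)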
Example #2
    def roleset(self, roleset_id):
        """
        @return: the xml description for the given roleset.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' %
                             roleset_id)

        etree = ElementTree.parse(self.abspath(framefile)).getroot()
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        else:
            raise ValueError('Roleset %s not found in %s' %
                             (roleset_id, framefile))
Example #3
    def roleset(self, roleset_id):
        """
        @return: the xml description for the given roleset.
        """
        lemma = roleset_id.split('.')[0]
        framefile = 'frames/%s.xml' % lemma
        if framefile not in self._framefiles:
            raise ValueError('Frameset file for %s not found' % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        etree = ElementTree.parse(self.abspath(framefile).open()).getroot()
        for roleset in etree.findall('predicate/roleset'):
            if roleset.attrib['id'] == roleset_id:
                return roleset
        else:
            raise ValueError('Roleset %s not found in %s' %
                             (roleset_id, framefile))
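
Both versions of roleset above belong to an NLTK frameset-style corpus reader; the PropBank reader that ships with NLTK exposes a method of this shape. A short usage sketch, assuming the propbank corpus data has been installed through the NLTK downloader and using 'give.01' purely as an illustrative roleset id:

from nltk.corpus import propbank

# Fetch one roleset (an ElementTree element) and list its argument roles.
rs = propbank.roleset('give.01')
for role in rs.findall('roles/role'):
    print(role.attrib['n'], role.attrib['descr'])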
Example #4
    def parse_from_file(self, filename):
        # Parse the XML file, remember the raw tree and the filename, then
        # hand the tree to parse_tree() for class-specific processing.
        self._raw = ElementTree.parse(filename)
        self.filename_ = filename
        self.parse_tree(self._raw)
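
parse_from_file is only a fragment of a larger parser class: it stores the raw ElementTree and the filename, then delegates to a parse_tree method defined elsewhere in that class. A minimal sketch of the surrounding shape, where the class name, the stub parse_tree body, and the document.xml path are all hypothetical:

from xml.etree import ElementTree

class XmlDocParser:
    def parse_from_file(self, filename):
        self._raw = ElementTree.parse(filename)
        self.filename_ = filename
        self.parse_tree(self._raw)

    def parse_tree(self, tree):
        # A real implementation would build its own data structures here;
        # this stub just walks the tree and prints the tag names.
        for elem in tree.getroot().iter():
            print(elem.tag, (elem.text or '').strip())

parser = XmlDocParser()
parser.parse_from_file('document.xml')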
Example #5
import codecs
import unicodedata

import nltk

# This fragment assumes `line` is a Unicode string that was read earlier in
# the session (e.g. one line of a non-ASCII text file); characters above the
# ASCII range are reported with their code points and Unicode names.
for c in line:
    if ord(c) > 127:
        print('%r U+%04x %s' % (c.encode('utf8'), ord(c), unicodedata.name(c)))

print(line.find(u'zosta\u0142y'))
line = line.lower()

import re
print(line.encode('unicode_escape'))
m = re.search(u'\u015b\\w*', line)
print(m.group())

from nltk.tokenize import WordTokenizer   # older NLTK API; newer releases provide nltk.word_tokenize instead
tokenizer = WordTokenizer()
print(tokenizer.tokenize(line))

path = nltk.data.find('samples/sinorama-gb.xml')
f = codecs.open(path, encoding='gb2312')
lines = f.readlines()
for l in lines:
    l = l[:-1]                    # drop the trailing newline
    utf_enc = l.encode('utf8')    # re-encode the GB2312 text as UTF-8
    print(repr(utf_enc))

path = nltk.data.find('samples/sinorama-utf8.xml')
from nltk.etree import ElementTree as ET
tree = ET.parse(path)
text = tree.findtext('sent')
uni_text = text.encode('utf8')
print(repr(uni_text.splitlines()[1]))
print("text=", text)