Exemple #1
0
    def load_anno(self, annofilename, annotype='unannotated', basename=''):
        """ Load Glozz annotations from a file into this object

            annofilename : file handed to nxml.load
            annotype     : kept as-is in self.annotype
            basename     : kept as-is in self.basename

            Every 'unit', 'relation' and 'schema' element of the loaded
            document becomes a Record appended to self.units,
            self.relations or self.schemas, unless its id is already in
            self.ids. self.units is replaced by a copy sorted on endPos
            and self.delta is incremented once parsing is done.
        """

        def unique_id(raw_id):
            """ Id under which a record is stored

                Returned unchanged for now; ids may be re-used from one
                file to another, and suffixing them with self.delta is
                currently disabled.
            """
            return raw_id

        def new_record(kind, element):
            """ Build a Record of the given kind from an XML element

                Returns None when the id is already present in self.ids,
                otherwise the record with its common fields filled in.
            """
            record = Record(kind)
            record._base = self
            record.id = unique_id(element.id)
            record.oid = element.id
            charac = element.characterisation
            record.type = charac.type._val()
            features = {}
            for feat in charac.featureSet.all('feature'):
                name = feat.name
                features[name] = feat._val()
            record.features = features
            record.inRelation = []
            record.ordRelation = {}
            record.inSchema = []
            if record.id in self.ids:
                return None
            return record

        def end_position(unit):
            """ Sort key: end offset of a unit """
            return unit.endPos

        self.annoname = annofilename
        self.annotype = annotype
        self.basename = basename
        document = nxml.load(annofilename)

        # Units carry a start and an end position
        for unit_elt in document.all('unit'):
            unit = new_record('unit', unit_elt)
            if unit is None:
                continue
            unit.startPos = int(unit_elt.positioning.start.singlePosition.index)
            unit.endPos = int(unit_elt.positioning.end.singlePosition.index)
            self.units.append(unit)

        # Keep the units ordered by end position
        self.units = sorted(self.units, key=end_position)

        # Relations point at the ids of their terms
        for relation_elt in document.all('relation'):
            relation = new_record('relation', relation_elt)
            if relation is None:
                continue
            relation.nodes = [unique_id(term.id)
                              for term in relation_elt.positioning.all('term')]
            self.relations.append(relation)

        # Schemas may embed units, relations and other schemas
        for schema_elt in document.all('schema'):
            schema = new_record('schema', schema_elt)
            if schema is None:
                continue
            members = chain(
                schema_elt.positioning.all('embedded-unit'),
                schema_elt.positioning.all('embedded-relation'),
                schema_elt.positioning.all('embedded-schema'),
            )
            schema.nodes = [unique_id(member.id) for member in members]
            self.schemas.append(schema)

        # One more annotation file has been processed
        self.delta += 1
Exemple #2
0
    def load_anno(self, annofilename, annotype='unannotated', basename=''):
        """ Read a Glozz annotation file and register its content

            annofilename : file given to nxml.load
            annotype     : stored unchanged in self.annotype
            basename     : stored unchanged in self.basename

            Units, relations and schemas found in the file are wrapped in
            Record objects and appended to self.units, self.relations and
            self.schemas; elements whose id is already in self.ids are
            ignored. self.delta is incremented at the end.
        """
        def stored_id(file_id):
            """ Id used for a record

                Currently the id as found in the file (ids can be re-used
                from one file to another; the self.delta suffix that would
                disambiguate them is disabled).
            """
            return file_id

        def fill_common(source, target):
            """ Copy the fields shared by all record kinds onto target

                Returns True when target.id is already in self.ids.
            """
            target._base = self
            target.id = stored_id(source.id)
            target.oid = source.id
            description = source.characterisation
            target.type = description.type._val()
            target.features = {
                feature.name: feature._val()
                for feature in description.featureSet.all('feature')
            }
            target.inRelation = []
            target.ordRelation = {}
            target.inSchema = []
            known = target.id in self.ids
            return known

        self.annoname = annofilename
        self.annotype = annotype
        self.basename = basename
        root = nxml.load(annofilename)

        for unit_node in root.all('unit'):
            unit = Record('unit')
            if not fill_common(unit_node, unit):
                unit.startPos = int(
                    unit_node.positioning.start.singlePosition.index)
                unit.endPos = int(
                    unit_node.positioning.end.singlePosition.index)
                self.units.append(unit)

        # Units are kept ordered by their end position
        self.units = sorted(self.units, key=lambda unit: unit.endPos)

        for relation_node in root.all('relation'):
            relation = Record('relation')
            if not fill_common(relation_node, relation):
                terms = relation_node.positioning.all('term')
                relation.nodes = [stored_id(term.id) for term in terms]
                self.relations.append(relation)

        for schema_node in root.all('schema'):
            schema = Record('schema')
            if not fill_common(schema_node, schema):
                embedded = chain(
                    schema_node.positioning.all('embedded-unit'),
                    schema_node.positioning.all('embedded-relation'),
                    schema_node.positioning.all('embedded-schema'))
                schema.nodes = [stored_id(item.id) for item in embedded]
                self.schemas.append(schema)

        # This file is done: bump the file counter
        self.delta += 1
Exemple #3
0
 def __init__(self, annofilename):
     """ Class initialiser

     annofilename : filename with XML data, given to nxml.load

     Fills self.tokens with one (word, lemma) tuple per token, both values
     passed through self.unescape, and self.sen_ids with the token count
     reached at the end of each sentence (the list starts with 0).
     """
     self.tokens = []
     self.sen_ids = [0]
     root = nxml.load(annofilename)
     for sentence in root.document.sentences.all('sentence'):
         for token in sentence.tokens.all('token'):
             word = self.unescape(token.one('word')._val())
             lemma = self.unescape(token.one('lemma')._val())
             self.tokens.append((word, lemma))
         # Record where this sentence stops in the token list
         self.sen_ids.append(len(self.tokens))
Exemple #4
0
 def __init__(self, annofilename):
     """ Class initialiser

     annofilename : filename with XML data, given to nxml.load

     self.tokens receives a (word, lemma) tuple for every token of every
     sentence, each value going through self.unescape; self.sen_ids
     starts at [0] and gets the current number of tokens appended after
     each sentence.
     """
     fields = ('word', 'lemma')
     self.tokens = []
     self.sen_ids = [0]
     document = nxml.load(annofilename).document
     for sentence in document.sentences.all('sentence'):
         for token in sentence.tokens.all('token'):
             values = [self.unescape(token.one(field)._val())
                       for field in fields]
             self.tokens.append(tuple(values))
         # Sentence boundary: number of tokens seen so far
         token_count = len(self.tokens)
         self.sen_ids.append(token_count)
Exemple #5
0
    def load_anno(self, annofilename):
        """ Load the annotations of a file into this object

            annofilename : file handed to nxml.load

            'unit', 'relation' and 'schema' elements become Record objects
            appended to self.units, self.relations and self.schemas,
            except those whose id is already in self.ids. Ids are suffixed
            with the current value of self.delta, which is incremented
            once the file has been processed.
        """
        def suffixed_id(raw_id):
            """ Id of the file combined with self.delta """
            return "{0}_{1}".format(raw_id, self.delta)

        def new_record(kind, element):
            """ Build a Record of the given kind from an XML element

                Returns None when the id is already present in self.ids,
                otherwise the record with its common fields filled in.
            """
            record = Record(kind)
            record._base = self
            record.id = suffixed_id(element.id)
            charac = element.characterisation
            record.type = charac.type._val()
            features = {}
            for feat in charac.featureSet.all('feature'):
                name = feat.name
                features[name] = feat._val()
            record.features = features
            record.inRelation = []
            record.ordRelation = {}
            record.inSchema = []
            if record.id in self.ids:
                return None
            return record

        document = nxml.load(annofilename)

        # Units carry a start and an end position
        for unit_elt in document.all('unit'):
            unit = new_record('unit', unit_elt)
            if unit is None:
                continue
            unit.startPos = int(unit_elt.positioning.start.singlePosition.index)
            unit.endPos = int(unit_elt.positioning.end.singlePosition.index)
            self.units.append(unit)

        # Relations point at the ids of their terms
        for relation_elt in document.all('relation'):
            relation = new_record('relation', relation_elt)
            if relation is None:
                continue
            relation.nodes = [suffixed_id(term.id)
                              for term in relation_elt.positioning.all('term')]
            self.relations.append(relation)

        # Schemas point at the ids of their embedded units
        for schema_elt in document.all('schema'):
            schema = new_record('schema', schema_elt)
            if schema is None:
                continue
            embedded = schema_elt.positioning.all('embedded-unit')
            schema.nodes = [suffixed_id(item.id) for item in embedded]
            self.schemas.append(schema)

        # One more annotation file has been processed
        self.delta += 1