def mk_unit(partial, counter):
    """Turn a PartialUnit plus a counter into a full Unit.

    The span, type and features are taken from `partial`; the counter
    drives both the creation date (via `mk_creation_date`) and the
    numeric suffix of the unit id.  `author` is not a parameter: it is
    read from the surrounding scope.
    """
    # Note that Glozz seems to identify items by the pair of author and
    # creation date, ignoring the unit ID altogether (assumed to be
    # author_date)
    stamp = mk_creation_date(counter)
    metadata = {
        'author': author,
        'creation-date': stamp,
        'lastModifier': 'n/a',
        'lastModificationDate': '0',
    }
    identifier = '_'.join((author, str(counter)))
    return Unit(identifier,
                partial.span,
                partial.type,
                partial.features,
                metadata)
def read_node(node, context=None):
    """Convert one element of a Glozz XML tree into a Python value.

    The function dispatches on ``node.tag`` (and, for ``metadata`` and
    ``positioning``, on ``context``) and recurses into child elements.

    :param node: an ElementTree-style element (must provide ``tag``,
        ``attrib``, ``text``, ``findall`` and iteration over children)
    :param context: name of the enclosing construct (``'annotations'``,
        ``'unit'``, ``'relation'`` or ``'schema'``) for the tags whose
        meaning depends on where they occur; ``None`` otherwise

    :returns: depends on the tag:

        * ``annotations``: ``(hashcode, units, relations, schemas)``,
          with ``hashcode`` set to ``None`` when it is empty
        * ``unit`` / ``relation`` / ``schema``: a ``Unit``, ``Relation``
          or ``Schema`` object
        * ``characterisation``: ``(type, features)``
        * ``featureSet``: dict of feature name to value
        * ``feature``: ``(name, value)``, value is ``None`` if no text
        * ``metadata``: the ``corpusHashcode`` attribute in the
          ``'annotations'`` context, else a dict of child tag to text
          (``None`` for children without text)
        * ``positioning``: a ``Span``, a ``RelSpan`` or a triple of
          frozensets of ids, according to ``context``
        * ``start`` / ``end``: the value of the ``singlePosition`` child
        * ``singlePosition``: the ``index`` attribute as an int
        * ``term`` / ``embedded-*``: the ``id`` attribute
        * ``type``: the stripped text (``None`` if there is no text)
        * anything else: ``None``

    :raises GlozzException: if a relation's positioning does not have
        exactly two terms
    """
    def get_one(name, default, ctx=None):
        # on_single_element is defined elsewhere; presumably it applies
        # the reader to the unique child called `name` and falls back to
        # `default` when there is none -- confirm against its definition
        def read_child(child):
            return read_node(child, ctx)
        return on_single_element(node, default, read_child, name)

    def get_all(name):
        return [read_node(child) for child in node.findall(name)]

    def stripped_text(elem):
        # empty elements (<foo/>) have text None rather than ''
        return elem.text.strip() if elem.text else None

    tag = node.tag

    if tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        # compare by value: identity tests against a string literal are
        # implementation-dependent
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif tag == 'feature':
        return (node.attrib['name'], stripped_text(node))

    # TODO throw exception if we see more than one instance of a key
    elif tag == 'featureSet':
        return dict(get_all('feature'))

    elif tag == 'metadata' and context == 'annotations':
        return node.attrib['corpusHashcode']

    elif tag == 'metadata':
        # tolerate empty children, as the 'feature' branch does
        return dict((child.tag, stripped_text(child)) for child in node)

    elif tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException("Was expecting exactly 2 terms, but got %d"
                                 % len(terms))
        return RelSpan(terms[0], terms[1])

    elif tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif tag == 'relation':
        rel_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id, units, rels, schemas, anno_type, fs,
                      metadata=metadata)

    elif tag == 'singlePosition':
        return int(node.attrib['index'])

    elif tag == 'start' or tag == 'end':
        return get_one('singlePosition', None)

    elif tag in ('term',
                 'embedded-unit',
                 'embedded-relation',
                 'embedded-schema'):
        return node.attrib['id']

    elif tag == 'type':
        return stripped_text(node)

    elif tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)

    # unrecognised tags (or 'positioning' without a known context) are
    # silently mapped to None, as before
    return None
def __init__(self, id, start, end):
    """Initialise via Unit.__init__ with a span from `start` to `end`.

    The remaining two positional arguments passed to Unit.__init__ are
    an empty string and an empty dict.
    """
    span = Span(start, end)
    Unit.__init__(self, id, span, '', {})
def _mk_doc(self):
    """
    Create an educe.annotation.Document from this graph

    Discourse unit names are single characters; a name's distance from
    'a' is used both as its character offset and as the numeric part
    of its Glozz id.  The document text is `string.ascii_lowercase`.

    Along the way `self.anno_map` is filled in: EDU and CDU names map
    to their annotation, and `(source, target)` name pairs map to the
    relation between them.
    """
    def offset(name):
        return ord(name) - ord('a')

    def to_glozz_id(name):
        return 'du_' + str(offset(name))

    def is_cdu(name):
        return name in self.cdus

    def relation_label(tag):
        if tag == 'S':
            return 'Q-Elab'
        if tag == 'C':
            return 'Contrast'
        raise ValueError('Unknown tag {0}'.format(tag))

    units = []
    schemas = []
    relations = []

    # EDUs: one 'Segment' and one 'Turn' unit each, over one character
    for name, speakers in self.speakers.items():
        if is_cdu(name):
            continue
        begin = offset(name)
        ident = to_glozz_id(name)
        edu = Unit(ident, Span(begin, begin + 1), 'Segment', {})
        turn_features = {
            'Identifier': begin,
            'Emitter': list(speakers)[0],
        }
        turn = Unit('t' + ident,
                    Span(begin, begin + 1),
                    'Turn',
                    turn_features)
        self.anno_map[name] = edu
        units.extend([edu, turn])

    # CDUs: schemas whose members are split into EDUs and nested CDUs
    for name, members in self.cdus.items():
        cdu = Schema(to_glozz_id(name),
                     set(to_glozz_id(m) for m in members if not is_cdu(m)),
                     set(),
                     set(to_glozz_id(m) for m in members if is_cdu(m)),
                     'Complex_discourse_unit',
                     {})
        self.anno_map[name] = cdu
        schemas.append(cdu)

    # relations: numbered in the order they are encountered
    for source in self.down:
        for target, tag in self.down[source]:
            rel_id = 'rel_' + str(len(relations))
            label = relation_label(tag)
            endpoints = RelSpan(to_glozz_id(source), to_glozz_id(target))
            relation = Relation(rel_id, endpoints, label, {})
            self.anno_map[(source, target)] = relation
            relations.append(relation)

    # a single dialogue stretching to the end of the last unit
    last_char = max(u.text_span().char_end for u in units)
    units.append(Unit('dialogue_0', Span(0, last_char), 'Dialogue', {}))

    return Document(units, relations, schemas, string.ascii_lowercase)