def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """Create a relation instance between a parent and a child unit.

    The relation is attributed to the ``'stacparser'`` annotator. Its
    identifier is computed by `stac_glozz.anno_id_from_tuple` from the
    annotator name and the value returned by ``tstamp.next()``; that same
    value is stored (as a string) as the creation date.

    Parameters
    ----------
    tstamp :
        Object whose ``next()`` method supplies the creation date.
    local_id_parent :
        Identifier used as the first member of the relation span.
    local_id_child :
        Identifier used as the second member of the relation span.
    label :
        Type of the relation.

    Returns
    -------
    Relation
        Relation with empty features and with ``'author'`` and
        ``'creation-date'`` metadata entries.
    """
    span = RelSpan(local_id_parent, local_id_child)
    author = 'stacparser'
    created = tstamp.next()
    metadata = {
        'author': author,
        'creation-date': str(created),
    }
    return Relation(
        rel_id=stac_glozz.anno_id_from_tuple((author, created)),
        span=span,
        rtype=label,
        features={},
        metadata=metadata)
def strip_cdus(self, sloppy=False, mode='head'):
    """Remove every CDU from this graph.

    Relations that involve a CDU are replaced by relations pointing
    to/from elements of that CDU. Non-head modes may add new edges to
    the graph.

    Parameters
    ----------
    sloppy: boolean, default=False
        See `cdu_head`.

    mode: string, default='head'
        Strategy for replacing edges involving CDUs.
        `head` relocates the edge on the recursive head of the CDU (see
        `recursive_cdu_heads`).
        `broadcast` distributes the edge over all EDUs belonging to the
        CDU; one copy of the edge is created for each of them. If both
        the source and the target are distributed, one copy is created
        for each combination of EDUs.
        `custom` (or any other string) distributes or relocates on the
        head depending on the relation label.
    """
    # Labels for which a CDU in source position is distributed
    source_distributive = frozenset([
        'Acknowledgement', 'Explanation', 'Comment', 'Continuation',
        'Narration', 'Contrast', 'Parallel', 'Background'])
    # Labels for which a CDU in target position is distributed
    target_distributive = frozenset([
        'Result', 'Continuation', 'Narration', 'Comment', 'Contrast',
        'Parallel', 'Background', 'Elaboration'])

    # Warning: the keys of this mapping are hyperedges
    cdu_heads = self.recursive_cdu_heads(sloppy)

    def edu_components(node):
        """List all EDUs contained by `node` (the node itself if not a CDU)."""
        if self.is_cdu(node):
            return [member
                    for member in self.cdu_members(node, deep=True)
                    if self.is_edu(member)]
        return [node]

    def attachment_points(node, label, distributive):
        """Nodes that should stand in for `node` on an edge typed `label`."""
        if not self.is_cdu(node):
            return [node]
        distribute = (mode == 'broadcast' or
                      (mode != 'head' and label in distributive))
        if distribute:
            # always taken in broadcast mode
            return edu_components(node)
        # always taken in head mode
        return [cdu_heads[self.mirror(node)]]

    # Convert all edges in order
    for old_edge in self.relations():
        endpoints = self.links(old_edge)
        # A relation edge must connect exactly two nodes
        assert len(endpoints) == 2
        if not any(self.is_cdu(endpoint) for endpoint in endpoints):
            # Nothing to strip on this edge
            continue

        old_attrs = self.edge_attributes(old_edge)
        old_anno = self.annotation(old_edge)
        src_node, tgt_node = endpoints
        src_nodes = attachment_points(src_node, old_anno.type,
                                      source_distributive)
        tgt_nodes = attachment_points(tgt_node, old_anno.type,
                                      target_distributive)

        # Drop the old edge from both the graph and the annotation layer
        self.del_edge(old_edge)
        self.doc.relations.remove(old_anno)

        # One replacement edge per (source, target) combination
        pairs = itertools.product(src_nodes, tgt_nodes)
        for i, (n_src, n_tgt) in enumerate(pairs):
            if n_src == n_tgt:
                print("WARNING: something is pointing to its own CDU : " +
                      str(n_src))
                continue

            # Annotation layer: a new Relation between the two nodes
            n_src_anno = self.annotation(n_src)
            n_tgt_anno = self.annotation(n_tgt)
            new_span = RelSpan(n_src_anno._anno_id, n_tgt_anno._anno_id)
            new_anno = Relation('{0}_{1}'.format(old_anno._anno_id, i),
                                new_span,
                                old_anno.type,
                                {})
            new_anno.source = n_src_anno
            new_anno.target = n_tgt_anno
            self.doc.relations.append(new_anno)

            # Graph layer: a new edge carrying the old attributes
            new_edge = '{0}_{1}'.format(old_edge, i)
            new_attrs = dict(old_attrs)
            new_attrs['annotation'] = new_anno
            self.add_edge(new_edge)
            self.add_edge_attributes(new_edge, new_attrs.items())
            self.link(n_src, new_edge)
            self.link(n_tgt, new_edge)

    # The CDUs are now edge-orphaned: remove them from the graph...
    for e_cdu in self.cdus():
        self.del_node(self.mirror(e_cdu))
        self.del_edge(e_cdu)
    # ... and from the annotation layer
    self.doc.schemas = [s for s in self.doc.schemas if not stac.is_cdu(s)]
def read_node(node, context=None):
    """Convert one Glozz XML element into the matching Python value.

    The element is dispatched on `node.tag` (and, for `metadata` and
    `positioning`, on `context`); child elements are read recursively.

    Parameters
    ----------
    node :
        XML element exposing `tag`, `attrib`, `text`, `findall` and
        iteration over children (ElementTree-style).
    context : string or None
        Tag of the enclosing element when it changes how `node` is read:
        `'annotations'` for the top-level metadata, and `'unit'`,
        `'relation'` or `'schema'` for `positioning`.

    Returns
    -------
    Depends on the tag:

    * `annotations`: tuple `(hashcode, units, relations, schemas)`, with
      `hashcode` set to None when the metadata lookup yields `''`
    * `characterisation`: tuple `(type, features)`
    * `feature`: tuple `(name, value)`, value being None without text
    * `featureSet`: dict built from the features
    * `metadata`: the `corpusHashcode` attribute in the `'annotations'`
      context, otherwise a dict from child tag to stripped text (`''`
      for a child without text)
    * `positioning`: `Span`, `RelSpan` or a triple of frozensets
      depending on `context`
    * `relation`, `schema`, `unit`: `Relation`, `Schema`, `Unit`
    * `singlePosition`: the `index` attribute as an int
    * `start`, `end`: the value of the `singlePosition` child
    * `term`, `embedded-unit`, `embedded-relation`, `embedded-schema`:
      the `id` attribute
    * `type`: the stripped text
    * any other tag (or a `positioning` node without a matching
      context): None

    Raises
    ------
    GlozzException
        If a relation `positioning` does not hold exactly two terms.
    """
    def get_one(name, default, ctx=None):
        """Read the single child `name` via `on_single_element`.

        NOTE(review): `default` is presumably what is returned when the
        child is absent; confirm against `on_single_element`.
        """
        def parse(child):
            return read_node(child, ctx)
        return on_single_element(node, default, parse, name)

    def get_all(name):
        """Read every child `name`, without any context."""
        return [read_node(child) for child in node.findall(name)]

    tag = node.tag

    if tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        # Compare by value: an identity test against a literal only
        # works by accident of string interning.
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif tag == 'feature':
        attr = node.attrib['name']
        val = node.text.strip() if node.text else None
        return (attr, val)

    # TODO throw exception if we see more than one instance of a key
    elif tag == 'featureSet':
        return dict(get_all('feature'))

    elif tag == 'metadata' and context == 'annotations':
        return node.attrib['corpusHashcode']

    elif tag == 'metadata':
        # A child without text has `text` set to None; map it to the
        # empty string rather than failing on `None.strip()`.
        return {child.tag: (child.text or '').strip() for child in node}

    elif tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException(
                "Was expecting exactly 2 terms, but got %d" % len(terms))
        return RelSpan(terms[0], terms[1])

    elif tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif tag == 'relation':
        rel_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id, units, rels, schemas, anno_type, fs,
                      metadata=metadata)

    elif tag == 'singlePosition':
        return int(node.attrib['index'])

    elif tag == 'start' or tag == 'end':
        return get_one('singlePosition', None)

    elif tag in ['term', 'embedded-unit', 'embedded-relation',
                 'embedded-schema']:
        return node.attrib['id']

    elif tag == 'type':
        return node.text.strip()

    elif tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)
def __init__(self, id, start, end):
    """Set this object up as a relation from `start` to `end`.

    Delegates to `Relation.__init__` with `id`, a `RelSpan(start, end)`
    span, an empty string and an empty dict as the remaining two
    positional arguments.
    """
    span = RelSpan(start, end)
    Relation.__init__(self, id, span, '', {})
def strip_cdus(self, sloppy=False, mode='head'):
    """Strip all CDUs out of this graph.

    Every link that involves a CDU is redirected to/from elements of
    that CDU. Non-head modes may add new edges to the graph.

    Parameters
    ----------
    sloppy : boolean, default=False
        See `cdu_head`.

    mode : string, default='head'
        Strategy for replacing edges involving CDUs.
        `head` will relocate the edge on the recursive head of the CDU
        (see `recursive_cdu_heads`).
        `broadcast` will distribute the edge over all EDUs belonging to
        the CDU, creating a copy of the edge for each of them. When both
        the source and the target of the edge are distributed, a copy is
        created for each combination of EDUs.
        `custom` (or any other string) will distribute or relocate on
        the head depending on the relation label.
    """
    # Labels whose source node should be distributed
    src_distributed = frozenset([
        'Acknowledgement', 'Explanation', 'Comment', 'Continuation',
        'Narration', 'Contrast', 'Parallel', 'Background',
    ])
    # Labels whose target node should be distributed
    tgt_distributed = frozenset([
        'Result', 'Continuation', 'Narration', 'Comment', 'Contrast',
        'Parallel', 'Background', 'Elaboration',
    ])

    # Warning: heads.keys() are hyperedges
    heads = self.recursive_cdu_heads(sloppy=sloppy)

    def edu_components(node):
        """Return the list of all EDUs contained by a node."""
        if not self.is_cdu(node):
            return [node]
        members = self.cdu_members(node, deep=True)
        return [member for member in members if self.is_edu(member)]

    def substitutes(node, label, distributive):
        """Nodes to which edges from or to `node` should be distributed."""
        if not self.is_cdu(node):
            return [node]
        if mode != 'head' and (mode == 'broadcast' or
                               label in distributive):
            # Distribute over all components
            # (always the case in broadcast mode)
            return edu_components(node)
        # Link to the CDU recursive head only
        # (always the case in head mode)
        return [heads[self.mirror(node)]]

    def rewire(old_edge, old_anno, old_attrs, src_nodes, tgt_nodes):
        """Create one new relation and edge per source/target pair."""
        for i, (n_src, n_tgt) in enumerate(
                itertools.product(src_nodes, tgt_nodes)):
            if n_src == n_tgt:
                # FIXME find a way to add this to the errors voiced in
                # educe.stac.sanity.checks.graph
                # we should likely squawk() this
                print("WARNING: something is pointing to its own CDU : " +
                      str(n_src))
                continue

            # First, the annotation layer gets a new Relation
            n_src_anno = self.annotation(n_src)
            n_tgt_anno = self.annotation(n_tgt)
            new_anno = Relation(
                '{0}_{1}'.format(old_anno._anno_id, i),
                RelSpan(n_src_anno._anno_id, n_tgt_anno._anno_id),
                old_anno.type,
                {})
            new_anno.source = n_src_anno
            new_anno.target = n_tgt_anno
            self.doc.relations.append(new_anno)

            # Second, the graph gets a new edge
            new_edge = '{0}_{1}'.format(old_edge, i)
            new_attrs = dict(old_attrs)
            new_attrs['annotation'] = new_anno
            self.add_edge(new_edge)
            self.add_edge_attributes(new_edge, new_attrs.items())
            self.link(n_src, new_edge)
            self.link(n_tgt, new_edge)

    # Convert all edges in order
    for old_edge in self.relations():
        links = self.links(old_edge)
        # Verify the edge is well-formed
        assert len(links) == 2
        if all(not self.is_cdu(link) for link in links):
            # No CDU to strip: skip
            continue

        old_attrs = self.edge_attributes(old_edge)
        old_anno = self.annotation(old_edge)
        src_node, tgt_node = links
        label = old_anno.type
        src_nodes = substitutes(src_node, label, src_distributed)
        tgt_nodes = substitutes(tgt_node, label, tgt_distributed)

        # Remove the old edge
        self.del_edge(old_edge)
        self.doc.relations.remove(old_anno)

        # Build a new edge for all new combinations
        rewire(old_edge, old_anno, old_attrs, src_nodes, tgt_nodes)

    # Now all the CDUs are edge-orphaned, remove them from the graph
    for e_cdu in self.cdus():
        self.del_node(self.mirror(e_cdu))
        self.del_edge(e_cdu)

    # Same for annotation-level CDUs
    self.doc.schemas = [s for s in self.doc.schemas if not stac.is_cdu(s)]
def _mk_doc(self):
    """Build an educe.annotation.Document out of this graph.

    Discourse unit names are mapped to character offsets by their
    distance from ``'a'``. Every EDU listed in `self.speakers` yields a
    one-character ``'Segment'`` unit plus a ``'Turn'`` unit over the same
    span; every entry of `self.cdus` yields a
    ``'Complex_discourse_unit'`` schema; every link in `self.down` yields
    a relation (tag ``'S'`` gives ``'Q-Elab'``, tag ``'C'`` gives
    ``'Contrast'``). A ``'Dialogue'`` unit running from 0 to the largest
    unit end offset is added last. The document text is
    `string.ascii_lowercase`.

    As a side effect, `self.anno_map` is filled in: DU names map to their
    segment or schema, and ``(source, target)`` name pairs map to their
    relation.

    Raises
    ------
    ValueError
        If a link carries a tag other than ``'S'`` or ``'C'``.
    """
    origin = ord('a')

    def start(name):
        return ord(name) - origin

    def glozz_id(name):
        return 'du_' + str(start(name))

    def is_edu(name):
        return name not in self.cdus

    anno_units = []
    anno_cdus = []
    anno_rels = []

    # EDUs: one segment and one turn each
    for du_name, speaker_set in self.speakers.items():
        if not is_edu(du_name):
            continue
        du_start = start(du_name)
        du_glozz_id = glozz_id(du_name)
        x_edu = Unit(du_glozz_id,
                     Span(du_start, du_start + 1),
                     'Segment',
                     {})
        speaker = list(speaker_set)[0]
        turn_features = {'Identifier': du_start, 'Emitter': speaker}
        turn = Unit('t' + du_glozz_id,
                    Span(du_start, du_start + 1),
                    'Turn',
                    turn_features)
        self.anno_map[du_name] = x_edu
        anno_units.extend([x_edu, turn])

    # CDUs: one schema each, members split into EDUs and sub-CDUs
    for du_name, sub_names in self.cdus.items():
        cdu_id = glozz_id(du_name)
        edu_ids = set(glozz_id(x) for x in sub_names if is_edu(x))
        sub_cdu_ids = set(glozz_id(x) for x in sub_names if not is_edu(x))
        x_cdu = Schema(cdu_id, edu_ids, set(), sub_cdu_ids,
                       'Complex_discourse_unit', {})
        self.anno_map[du_name] = x_cdu
        anno_cdus.append(x_cdu)

    # Relations, numbered in order of creation
    for src_name in self.down:
        for tgt_name, rel_tag in self.down[src_name]:
            # Exactly one relation is appended per link, so the list
            # length is the running relation count.
            rel_glozz_id = 'rel_' + str(len(anno_rels))
            if rel_tag == 'S':
                rel_name = 'Q-Elab'
            elif rel_tag == 'C':
                rel_name = 'Contrast'
            else:
                raise ValueError('Unknown tag {0}'.format(rel_tag))
            rel_span = RelSpan(glozz_id(src_name), glozz_id(tgt_name))
            rel = Relation(rel_glozz_id, rel_span, rel_name, {})
            self.anno_map[(src_name, tgt_name)] = rel
            anno_rels.append(rel)

    # A single dialogue covering all the units built so far
    dialogue = Unit(
        'dialogue_0',
        Span(0, max(u.text_span().char_end for u in anno_units)),
        'Dialogue',
        {})
    anno_units.append(dialogue)

    return Document(anno_units, anno_rels, anno_cdus,
                    string.ascii_lowercase)