def tgt_html(grandparent, anno, naughty=False):
    """
    Render an HTML description of `anno` under `grandparent`.

    A new span is created under `grandparent` and filled, in order, with
    the annotation code, the annotation type (tagged with the 'naughty'
    class when `naughty` is set), the turn prefix if the annotation has
    an entry in `contexts`, and - for anything that is not a relation
    instance - a text snippet followed by the text span.

    Returns the newly created span.
    """
    parent = h.span(grandparent)
    h.span(parent, anno_code(anno))
    type_label = h.span(parent, '[%s] ' % anno.type)
    if naughty:
        type_label.attrib['class'] = 'naughty'

    if anno in contexts:
        turn_span = contexts[anno].turn.span
        turn_info = stac.split_turn_text(doc.text(turn_span))[0]
        # everything up to the first colon is shown in bold,
        # the remainder as an ordinary span
        head, colon, rest = turn_info.partition(":")
        if colon:
            bold = ET.SubElement(parent, 'b')
            bold.text = head + ":"
            h.span(parent, rest)
        else:
            h.span(parent, turn_info)

    # relation instances get no snippet or span
    if stac.is_relation_instance(anno):
        return parent

    body = text(anno)
    if stac.is_cdu(anno):
        turns = turn_range(anno)
        if turns:
            h.elem(parent, 'b', turns)
    h.span(parent,
           text=snippet(body, 100),
           attrib={'class': 'snippet'})
    h.span(parent, ' %s' % anno.text_span())
    return parent
def in_dialogue(x):
    """
    True if `x` is covered by `units`: an EDU must be in `units`, a
    relation instance needs both its source and target in `units`, and
    a CDU needs all of its terminals in `units`. Anything else is
    rejected.
    """
    if stac.is_edu(x):
        return x in units
    if stac.is_relation_instance(x):
        if x.source not in units:
            return False
        return x.target in units
    if stac.is_cdu(x):
        for terminal in x.terminals():
            if terminal not in units:
                return False
        return True
    return False
def in_dialogue(d_annos, anno):
    """
    True if the given annotation is covered by `d_annos`.

    EDUs are looked up directly; relation instances need both endpoints
    in `d_annos`; CDUs need every terminal in `d_annos`. Any other kind
    of annotation yields False.
    """
    if stac.is_edu(anno):
        return anno in d_annos
    if stac.is_relation_instance(anno):
        return not (anno.source not in d_annos or
                    anno.target not in d_annos)
    if stac.is_cdu(anno):
        return not any(t not in d_annos for t in anno.terminals())
    return False
def search_cdu_overlap(inputs, k, g):
    """
    Return a CduOverlapItem for every member annotation that is held
    by more than one CDU of graph `g` (document key `k`).
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]

    # member annotation -> annotations of the CDUs that contain it
    holders = defaultdict(list)
    for cdu in g.cdus():
        cdu_anno = g.annotation(cdu)
        if stac.is_cdu(cdu_anno):
            for member in g.cdu_members(cdu):
                holders[g.annotation(member)].append(cdu_anno)

    overlaps = []
    for member_anno, cdu_annos in holders.items():
        if len(cdu_annos) > 1:
            overlaps.append(
                CduOverlapItem(doc, contexts, member_anno, cdu_annos))
    return overlaps
def node_speaker(anno):
    """
    Return the designated speaker for an EDU or CDU.

    A CDU only has a speaker if all of its terminals share one; in
    every other case (including non-EDU/CDU annotations) the result
    is None.
    """
    if stac.is_edu(anno):
        return edu_speaker(anno)
    if not stac.is_cdu(anno):
        return None
    speakers = frozenset(edu_speaker(x) for x in anno.terminals())
    if len(speakers) != 1:
        return None
    (only_speaker,) = speakers
    return only_speaker
def node_speaker(n):
    """
    Return the speaker of node annotation `n`, or None.

    For an EDU this is whatever `edu_speaker` reports. For a CDU it is
    the speaker shared by all of its terminals, or None if the
    terminals do not agree on exactly one speaker. Any other kind of
    annotation yields None.
    """
    if stac.is_edu(n):
        return edu_speaker(n)
    if stac.is_cdu(n):
        # terminals are walked once only (the original fetched them
        # twice and threw the first result away)
        speakers = frozenset(edu_speaker(t) for t in n.terminals())
        if len(speakers) == 1:
            return next(iter(speakers))
    return None
def has_non_du_member(anno):
    """
    True if at least one member of `anno` satisfies `is_non_du`.

    The members of a relation instance are its source and target; the
    members of a CDU are its `members`. Other annotations have no
    members to inspect and give False.
    """
    if stac.is_relation_instance(anno):
        members = (anno.source, anno.target)
    elif stac.is_cdu(anno):
        members = anno.members
    else:
        return False

    for member in members:
        if is_non_du(member):
            return True
    return False
def dialogue(anno):
    """
    Return the enclosing dialogue for an EDU/CDU, or None.

    An EDU's dialogue comes from its entry in `contexts` (None if it
    has no entry). A CDU has a dialogue only when it has terminals and
    they all resolve to the same dialogue.
    """
    if stac.is_edu(anno):
        return contexts[anno].dialogue if anno in contexts else None
    if not stac.is_cdu(anno):
        return None

    found = [dialogue(x) for x in anno.terminals()]
    if not found:
        return None
    first = found[0]
    if all(d == first for d in found[1:]):
        return first
    return None
def dialogue(anno):
    """
    Look up the dialogue that `anno` lives in.

    EDUs are resolved through `contexts`; CDUs are resolved through
    their terminals and only get a dialogue when every terminal agrees.
    None is returned whenever no single dialogue can be determined.
    """
    if stac.is_edu(anno):
        if anno in contexts:
            return contexts[anno].dialogue
        return None

    if stac.is_cdu(anno):
        candidates = [dialogue(unit) for unit in anno.terminals()]
        if not candidates:
            return None
        reference = candidates[0]
        for other in candidates[1:]:
            if not (other == reference):
                return None
        return reference

    return None
def is_bad(anno):
    """
    True if the members of `anno` that are expected to live in a
    dialogue resolve to more than one distinct dialogue.
    """
    if stac.is_relation_instance(anno):
        candidates = [anno.source, anno.target]
    elif stac.is_cdu(anno):
        candidates = list(anno.members)
    else:
        candidates = []

    # don't worry about members which are relations
    relevant = [m for m in candidates if expect_dialogue(m)]
    homes = frozenset(dialogue(m) for m in relevant)
    return bool(relevant) and len(homes) > 1
def is_bad(anno):
    "true if the annotation crosses a dialogue boundary"
    if stac.is_relation_instance(anno):
        members = [anno.source, anno.target]
    elif stac.is_cdu(anno):
        members = list(anno.members)
    else:
        # nothing to compare for other kinds of annotation
        return False

    # don't worry about members which are relations
    du_members = list(filter(expect_dialogue, members))
    if not du_members:
        return False
    return len(frozenset(map(dialogue, du_members))) > 1
def search_graph_cdu_overlap(inputs, k, gra):
    """
    Return a ReportItem for every EDU that appears in more than one CDU
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]

    # member annotation -> annotations of every CDU holding it
    containers = defaultdict(list)
    for cdu in gra.cdus():
        cdu_anno = gra.annotation(cdu)
        if stac.is_cdu(cdu_anno):
            for mem in gra.cdu_members(cdu):
                containers[gra.annotation(mem)].append(cdu_anno)

    shared = ((edu_anno, cdu_annos)
              for edu_anno, cdu_annos in containers.items()
              if len(cdu_annos) > 1)
    return [CduOverlapItem(doc, contexts, edu_anno, cdu_annos)
            for edu_anno, cdu_annos in shared]
def turn_range(anno):
    """
    Given a CDU, return a string representing the turns spanned by
    that CDU (or None if no turn id could be found).

    The result is "N: " when all terminals share one turn id and
    "MIN-MAX: " otherwise. Falsy turn ids reported by `turn_id` are
    ignored.

    Raises ValueError if `anno` is not a CDU.
    """
    if not stac.is_cdu(anno):
        # format rather than concatenate: `anno` is not a string, so
        # `"..." + anno` would raise TypeError instead of ValueError
        raise ValueError("not a CDU: %s" % (anno,))

    tids = [tid for tid in (turn_id(t) for t in anno.terminals()) if tid]
    if not tids:
        return None

    min_tid = min(tids)
    max_tid = max(tids)
    if min_tid == max_tid:
        return "%d: " % min_tid
    return "%d-%d: " % (min_tid, max_tid)
def search_graph_cdu_overlap(inputs, k, gra):
    """
    Return a ReportItem for every EDU that appears in more than one CDU
    """
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]

    containers = defaultdict(list)
    for cdu in gra.cdus():
        cdu_anno = gra.annotation(cdu)
        if not stac.is_cdu(cdu_anno):
            continue
        for mem in gra.cdu_members(cdu):
            containers[gra.annotation(mem)].append(cdu_anno)

    report = []
    for edu_anno, cdu_annos in containers.items():
        if len(cdu_annos) < 2:
            # held by a single CDU: nothing to report
            continue
        report.append(CduOverlapItem(doc, contexts, edu_anno, cdu_annos))
    return report
def tgt_html(grandparent, t, naughty=False):
    """
    Describe annotation `t` in HTML underneath `grandparent`.

    A new span is created under `grandparent` and filled with the
    annotation code, its type (tagged with the 'naughty' class when
    `naughty` is set), the turn prefix when `t` has an entry in
    `contexts`, and - unless `t` is a relation instance - the range of
    turn numbers (CDUs only), a text snippet and the text span.

    Returns the newly created span.
    """
    def turn_number(x):
        "turn identifier of `x` as an int, or None if unavailable"
        if x in contexts:
            tid_str = contexts[x].turn.features['Identifier']
            return int(tid_str) if tid_str else None
        else:
            return None

    parent = html_span(grandparent)
    html_span(parent, anno_code(t))
    type_span = html_span(parent, '[%s] ' % t.type)
    if naughty:
        type_span.attrib['class'] = 'naughty'

    if t in contexts:
        turn = contexts[t].turn
        turn_info = stac.split_turn_text(doc.text(turn.span))[0]
        turn_splits = turn_info.split(":")
        if len(turn_splits) > 1:
            # NB: this element must not reuse the helper's name; doing
            # so rebinds the helper and breaks the CDU branch below
            prefix_elem = ET.SubElement(parent, 'b')
            prefix_elem.text = turn_splits[0] + ":"
            html_span(parent, ":".join(turn_splits[1:]))
        else:
            html_span(parent, turn_info)

    if not stac.is_relation_instance(t):
        t_span = t.text_span()
        t_text = doc.text(t_span)
        if stac.is_cdu(t):
            # falsy turn numbers (missing identifiers) are skipped
            tids = [x for x in map(turn_number, t.terminals()) if x]
            if tids:
                tspan = ET.SubElement(parent, 'b')
                min_tid = min(tids)
                max_tid = max(tids)
                if min_tid == max_tid:
                    tspan.text = "%d: " % min_tid
                else:
                    tspan.text = "%d-%d: " % (min_tid, max_tid)
        text_sp = html_span(parent, snippet(t_text, 100))
        text_sp.attrib['class'] = 'snippet'
        html_span(parent, ' %s' % t_span)
    return parent
def without_cdus(self, sloppy=False):
    """
    Return a deep copy of this graph with all CDUs removed.
    Links involving these CDUs will point instead from/to
    their deep heads

    `sloppy` is handed unchanged to `recursive_cdu_heads`
    (its exact meaning is defined there).

    The receiver itself is left untouched: all edits are made on
    the deep copy, both on its graph structure and on the
    annotations of its `doc`.
    """
    g2 = copy.deepcopy(self)
    heads = g2.recursive_cdu_heads(sloppy)
    # same mapping as `heads`, but expressed on annotation objects
    # rather than on graph identifiers
    anno_heads = dict((g2.annotation(k),g2.annotation(v))\
                      for k,v in heads.items())
    # replace all links to/from cdus with to/from their heads
    # NOTE(review): edges are deleted and re-added while looping over
    # g2.relations(); presumably that call returns a snapshot - confirm
    for e_edge in g2.relations():
        # links and attributes must be read before the edge is deleted
        links = g2.links(e_edge)
        attrs = g2.edge_attributes(e_edge)
        if any(g2.is_cdu(l) for l in links):
            # recreate the edge
            g2.del_edge(e_edge)
            g2.add_edge(e_edge)
            g2.add_edge_attributes(e_edge, attrs)
            for l in links:
                # CDU endpoints are swapped for their head; `heads` is
                # indexed by the mirror of the CDU node
                l2 = heads[g2.mirror(l)] if g2.is_cdu(l) else l
                g2.link(l2, e_edge)
    # now that we've pointed everything away, nuke the CDUs
    for e_cdu in g2.cdus():
        g2.del_node(g2.mirror(e_cdu))
        g2.del_edge(e_cdu)
    # to be on the safe side, we should also do similar link-rewriting
    # but on the underlying educe.annotation objects layer
    # (symptom of a yucky design) :-(
    for r in g2.doc.relations:
        if stac.is_relation_instance(r):
            src = r.source
            tgt = r.target
            # endpoints without an entry in anno_heads are kept as is
            src2 = anno_heads.get(src, src)
            tgt2 = anno_heads.get(tgt, tgt)
            r.source = src2
            r.target = tgt2
            r.span = annotation.RelSpan(src2.local_id(),
                                        tgt2.local_id())
    # remove the actual CDU objects too
    g2.doc.schemas = [ s for s in g2.doc.schemas if not stac.is_cdu(s) ]
    return g2
def test_fake_objs():
    """Each fake object must be recognised by its stac predicate."""
    expectations = [
        (stac.is_edu, edu1),
        (stac.is_relation_instance, rel1),
        (stac.is_cdu, cdu1),
    ]
    for predicate, fake in expectations:
        assert predicate(fake)
def is_cdu(self, x):
    """
    `x` counts as a CDU only if the parent class says so and its
    annotation also passes `stac.is_cdu`.
    """
    generic = super(Graph, self).is_cdu(x)
    if not generic:
        return generic
    return stac.is_cdu(self.annotation(x))
def expect_dialogue(anno):
    """True if `anno` is an EDU or a CDU."""
    as_edu = stac.is_edu(anno)
    if as_edu:
        return as_edu
    return stac.is_cdu(anno)
def expect_dialogue(anno):
    """
    True if the annotation should live in a dialogue,
    ie. if it is an EDU or a CDU.
    """
    edu_verdict = stac.is_edu(anno)
    return edu_verdict if edu_verdict else stac.is_cdu(anno)
def are_single_headed_cdus(inputs, k, gra):
    """Check that each CDU has exactly one head DU.

    A direct member of a CDU counts as headed when it is the target of
    a relation whose source lies anywhere under that CDU (a direct
    member, or a DU reached through nested CDUs). A CDU is reported
    when more than one of its direct members is left unheaded.

    Parameters
    ----------
    inputs : object
        Provides `corpus` and `contexts`, both indexed by `k`.
    k : key
        Key of the document being checked.
    gra : Graph
        Graph for the discourse structure.

    Returns
    -------
    report_items : list of ReportItem
        List of report items, one per faulty CDU.
    """
    report_items = []
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]

    # compute the transitive closure of DUs embedded under each CDU
    # * map each CDU to its member EDUs and CDUs, as two lists
    # keys are edge ids eg. 'e_pilot01_07_jhunter_1487683021582',
    # values are node ids eg. 'n_pilot01_07_stac_1464335440'
    cdu2mems = defaultdict(lambda: ([], []))
    for cdu_id in gra.cdus():
        cdu_members = set(gra.cdu_members(cdu_id))
        cdu2mems[cdu_id] = (
            [x for x in cdu_members if stac.is_edu(gra.annotation(x))],
            [x for x in cdu_members if stac.is_cdu(gra.annotation(x))],
        )
    # * replace each nested CDU in the second list with its member DUs
    # (to first list), and mark CDUs for exploration (to second list) ;
    # repeat until fixpoint, ie. transitive closure complete for each CDU
    while any(pending for _, pending in cdu2mems.values()):
        for collected, pending in cdu2mems.values():
            # walk a snapshot: `pending` is extended and pruned below,
            # and mutating a list under iteration skips elements
            for mem_cdu in list(pending):
                # switch between the edge and node representations of
                # CDUs: gra.mirror()
                nested_edus, nested_cdus = cdu2mems[gra.mirror(mem_cdu)]
                # add the nested CDU and its EDU members
                collected.append(mem_cdu)
                collected.extend(nested_edus)
                # store CDU members of the nested CDU for exploration
                pending.extend(nested_cdus)
                # delete current nested CDU from list of CDUs to be
                # explored
                pending.remove(mem_cdu)
    # switch to simple dict, forget list of CDUs for exploration
    cdu2mems = {cdu_id: mems[0] for cdu_id, mems in cdu2mems.items()}
    # end transitive closure

    for cdu_id in gra.cdus():
        cdu = gra.annotation(cdu_id)
        cdu_mems = set(gra.cdu_members(cdu_id))
        cdu_rec_mems = set(cdu2mems[cdu_id])
        # direct members that are the target of an internal relation
        headed = set()
        for cdu_mem in cdu_mems:
            for rel in gra.links(cdu_mem):
                if not gra.is_relation(rel):
                    continue
                src, tgt = gra.rel_links(rel)
                # src can be any DU under the current CDU, eg. even
                # a member of a nested CDU ; this is probably too
                # loose but we'll see later if we need to refine
                if src in cdu_rec_mems and tgt in cdu_mems:
                    headed.add(tgt)
        unheaded_mems = cdu_mems - headed
        if len(unheaded_mems) > 1:
            report_items.append(SchemaItem(doc, contexts, cdu, []))
    return report_items
def are_single_headed_cdus(inputs, k, gra):
    """Check that each CDU has exactly one head DU.

    For every CDU, the direct members that receive a relation from a
    DU located anywhere under the same CDU are considered headed; the
    CDU is flagged if two or more direct members remain unheaded.

    Parameters
    ----------
    inputs : object
        Provides `corpus` and `contexts`, both indexed by `k`.
    k : key
        Key of the document being checked.
    gra : Graph
        Graph for the discourse structure.

    Returns
    -------
    report_items : list of ReportItem
        List of report items, one per faulty CDU.
    """
    report_items = []
    doc = inputs.corpus[k]
    contexts = inputs.contexts[k]

    # compute the transitive closure of DUs embedded under each CDU
    # * map each CDU to its member EDUs and CDUs, as two lists
    # keys are edge ids eg. 'e_pilot01_07_jhunter_1487683021582',
    # values are node ids eg. 'n_pilot01_07_stac_1464335440'
    cdu2mems = defaultdict(lambda: ([], []))
    for cdu_id in gra.cdus():
        direct = set(gra.cdu_members(cdu_id))
        cdu2mems[cdu_id] = (
            [x for x in direct if stac.is_edu(gra.annotation(x))],
            [x for x in direct if stac.is_cdu(gra.annotation(x))],
        )
    # * replace each nested CDU in the second list with its member DUs
    # (to first list), and mark CDUs for exploration (to second list) ;
    # repeat until fixpoint, ie. transitive closure complete for each CDU
    while any(todo for _, todo in cdu2mems.values()):
        for done, todo in cdu2mems.values():
            # iterate over a copy, since `todo` itself is modified in
            # the loop body (in-place mutation would skip entries)
            for mem_cdu in list(todo):
                # switch between the edge and node representations of
                # CDUs: gra.mirror()
                nested_edus, nested_cdus = cdu2mems[gra.mirror(mem_cdu)]
                # add the nested CDU and its EDU members
                done.append(mem_cdu)
                done.extend(nested_edus)
                # store CDU members of the nested CDU for exploration
                todo.extend(nested_cdus)
                # delete current nested CDU from list of CDUs to be
                # explored
                todo.remove(mem_cdu)
    # switch to simple dict, forget list of CDUs for exploration
    cdu2mems = {cdu_id: mems[0] for cdu_id, mems in cdu2mems.items()}
    # end transitive closure

    for cdu_id in gra.cdus():
        cdu = gra.annotation(cdu_id)
        cdu_mems = set(gra.cdu_members(cdu_id))
        cdu_rec_mems = set(cdu2mems[cdu_id])
        # only the headed members matter, not which DU heads them
        headed_mems = set()
        for cdu_mem in cdu_mems:
            for rel in gra.links(cdu_mem):
                if not gra.is_relation(rel):
                    continue
                src, tgt = gra.rel_links(rel)
                # src can be any DU under the current CDU, eg. even
                # a member of a nested CDU ; this is probably too
                # loose but we'll see later if we need to refine
                if src in cdu_rec_mems and tgt in cdu_mems:
                    headed_mems.add(tgt)
        if len(cdu_mems - headed_mems) > 1:
            report_items.append(
                SchemaItem(doc, contexts, cdu, []))
    return report_items