def test_cdu_itself_cross(self):
    """naughty: CDU spanning dialogues"""
    straddler = FakeCDU('c1', [self.edu1_1, self.edu2_1])
    fake_doc = FakeDocument(self.edus1, [], [straddler])
    ctxs = Context.for_edus(fake_doc)
    copies = fake_doc.copies
    # the CDU itself should be flagged as crossing dialogues
    self.assertTrue(is_cross_dialogue(ctxs)(copies[straddler]))
def assertPunctured(self, doc, rel):
    """Assert that the given rel is indeed a puncture in this graph."""
    built = self.mk_graph(doc)
    ctxs = Context.for_edus(doc)
    rel_node = graph_ids(built)[rel.local_id()]
    self.assertTrue(is_puncture(built, ctxs, rel_node),
                    'failed to detect puncture')
def assertIntact(self, doc, rel):
    """Assert that the given rel does not constitute a puncture in
    this graph."""
    built = self.mk_graph(doc)
    ctxs = Context.for_edus(doc)
    rel_node = graph_ids(built)[rel.local_id()]
    self.assertFalse(is_puncture(built, ctxs, rel_node),
                     'unexpected puncture')
def test_simple_segment_cross(self):
    """simple cross-dialogue"""
    # relation from an EDU in dialogue 1 to an EDU in dialogue 2
    rel = FakeRelInst('r', self.edu1_1, self.edu2_1)
    fake_doc = FakeDocument(self.edus1, [rel], [])
    ctxs = Context.for_edus(fake_doc)
    self.assertTrue(is_cross_dialogue(ctxs)(fake_doc.copies[rel]))
def test_cdu_cross(self):
    """naughty: rel to CDU in another dialogue"""
    cdu = FakeCDU('c1', [self.edu1_1, self.edu1_2])
    # relation from the CDU to an EDU in a different dialogue
    rel = FakeRelInst('r', cdu, self.edu2_1)
    fake_doc = FakeDocument(self.edus1, [rel], [cdu])
    ctxs = Context.for_edus(fake_doc)
    self.assertTrue(is_cross_dialogue(ctxs)(fake_doc.copies[rel]))
def test_innocent_cdu(self):
    """innocent: CDU entirely in dialogue"""
    cdu = FakeCDU('c1', [self.edu1_1, self.edu1_2])
    # both the CDU and its outgoing relation stay in dialogue 1
    rel = FakeRelInst('r', cdu, self.edu1_3)
    fake_doc = FakeDocument(self.edus1, [rel], [cdu])
    ctxs = Context.for_edus(fake_doc)
    copies = fake_doc.copies
    self.assertFalse(is_cross_dialogue(ctxs)(copies[rel]))
    self.assertFalse(is_cross_dialogue(ctxs)(copies[cdu]))
def test_innocent(self):
    """no squawking on in-dialogue relation"""
    rel = FakeRelInst('r', self.edu1_1, self.edu1_2)
    fake_doc = FakeDocument(self.edus1, [rel], [])
    ctxs = Context.for_edus(fake_doc)
    copies = fake_doc.copies
    # sanity: the copies are still recognized as EDUs
    self.assertTrue(stac.is_edu(copies[self.edu1_1]))
    self.assertTrue(stac.is_edu(copies[rel].source))
    self.assertFalse(is_cross_dialogue(ctxs)(copies[rel]))
def get_edus_plus(inputs):
    """Generate edus and extra environmental information for each

    Currently:

    * environment
    * contexts
    * edu
    """
    for env in stac_features.mk_envs(inputs, 'unannotated'):
        current_doc = env.current.doc
        ctxs = Context.for_edus(current_doc)
        # one triple per EDU in the document
        for unit in (u for u in current_doc.units
                     if educe.stac.is_edu(u)):
            yield env, ctxs, unit
def __init_read_corpus(self, is_interesting, corpus_dir):
    """Read the corpus specified in our args.

    Sets ``self.anno_files``, ``self.corpus`` and ``self.contexts``
    as side effects.

    Parameters
    ----------
    is_interesting : callable
        Predicate used by ``reader.filter`` to select corpus keys.
    corpus_dir : string
        Path to the corpus to read.
    """
    reader = stac.Reader(corpus_dir)
    all_files = reader.files()
    self.anno_files = reader.filter(all_files, is_interesting)
    # BUG FIX: snapshot the keys before the loop.  On Python 3,
    # `.keys()` returns a live view, and inserting into anno_files
    # while iterating that view raises
    # "RuntimeError: dictionary changed size during iteration"
    interesting = list(self.anno_files)
    for key in interesting:
        # also pull in the unannotated twin of each interesting key
        ukey = twin_key(key, 'unannotated')
        if ukey in all_files:
            self.anno_files[ukey] = all_files[ukey]
    self.corpus = reader.slurp(self.anno_files, verbose=True)
    self.contexts = {k: Context.for_edus(self.corpus[k])
                     for k in self.corpus}
def __init_read_corpus(self, is_interesting, corpus_dir):
    """Read the corpus specified in our args.

    Sets ``self.anno_files``, ``self.corpus`` and ``self.contexts``
    as side effects.
    """
    reader = stac.Reader(corpus_dir)
    available = reader.files()
    self.anno_files = reader.filter(available, is_interesting)
    # iterate over a snapshot of the keys: the loop body inserts
    # new entries into anno_files
    for key in list(self.anno_files):
        ukey = twin_key(key, 'unannotated')
        if ukey in available:
            self.anno_files[ukey] = available[ukey]
    self.corpus = reader.slurp(self.anno_files, verbose=True)
    self.contexts = {key: Context.for_edus(doc)
                     for key, doc in self.corpus.items()}
def _last_nodes(self):
    """Return the dict of node names to the set of last elements
    up to that node (included).
    """
    contexts = Context.for_edus(self._graph.doc)
    all_speakers = frozenset(ctx.speaker()
                             for ctx in contexts.values())
    latest = dict()  # speaker -> most recently seen node
    result = dict()
    for node in self._nodes:
        anno = self._graph.annotation(node)
        # this node becomes the latest contribution of its speaker(s)
        for spkr in speakers(contexts, anno):
            latest[spkr] = node
        result[node] = frozenset(latest[spkr]
                                 for spkr in all_speakers
                                 if spkr in latest)
    return result
def fuse_edus(discourse_doc, unit_doc, postags):
    """Return a copy of the discourse level doc, merging info from both
    the discourse and units stage.

    All EDUs will be converted to higher level EDUs.

    Notes
    -----
    * The discourse stage is primary in that we work by going over what
      EDUs we find in the discourse stage and trying to enhance them
      with information we find on their units-level equivalents.
      Sometimes (rarely but it happens) annotations can go out of synch.
      EDUs missing on the units stage will be silently ignored (we try
      to make do without them).  EDUs that were introduced on the units
      stage but not percolated to discourse will also be ignored.

    * We rely on annotation ids to match EDUs from both stages; it's up
      to you to ensure that the annotations are really in synch.

    * This does not constitute a full merge of the documents. For a full
      merge, you would have to bring over other annotations such as
      Resources, `Preference`, `Anaphor`, `Several_resources`, taking
      care all the while to ensure there are no timestamp clashes with
      pre-existing annotations (it's unlikely but best be on the safe
      side if you ever find yourself with automatically generated
      annotations, where all bets are off time-stamp wise).

    Parameters
    ----------
    discourse_doc : GlozzDocument
        Document from the "discourse" stage.

    unit_doc : GlozzDocument
        Document from the "units" stage.

    postags : list of Token
        Sequence of educe tokens predicted by the POS tagger for this
        document.

    Returns
    -------
    doc : GlozzDocument
        Deep copy of the discourse_doc with info from the units stage
        merged in.
    """
    doc = copy.deepcopy(discourse_doc)

    # first pass: create the EDU objects, pairing each discourse-level
    # annotation with its units-level twin (if any)
    annos = sorted([x for x in doc.units if is_edu(x)],
                   key=lambda x: x.span)
    replacements = {}
    for anno in annos:
        unit_anno = None if unit_doc is None else twin_from(unit_doc, anno)
        edu = EDU(doc, anno, unit_anno)
        replacements[anno] = edu

    # second pass: rewrite doc so that annotations that correspond
    # to EDUs are replaced by their higher-level equivalents
    # (units, relation endpoints and schema members all need rewiring)
    edus = []
    for anno in annos:
        edu = replacements[anno]
        edus.append(edu)
        doc.units.remove(anno)
        doc.units.append(edu)
        for rel in doc.relations:
            if rel.source == anno:
                rel.source = edu
            if rel.target == anno:
                rel.target = edu
        for schema in doc.schemas:
            if anno in schema.units:
                schema.units.remove(anno)
                schema.units.append(edu)

    # third pass: flesh out the EDUs with contextual info
    # now the EDUs should work as contexts too
    contexts = Context.for_edus(doc, postags=postags)
    for edu in edus:
        edu.fleshout(contexts[edu])
    return doc
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
    """
    rows = {anno_type: list()
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}
    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # BUG FIX: merge with dict(base, **extra) instead of
            # dict(d1.items() + d2.items()); on Python 3 dict views
            # don't support `+`, and the replacement keeps the same
            # semantics on Python 2 (extra keys win on clash)
            if is_edu(anno):
                row = dict(common_cols, **edu_feats(doc, ctx, anno))
                rows['edu'].append(row)
            elif is_cdu(anno):
                row = dict(common_cols, **cdu_feats(anno))
                rows['cdu'].append(row)
            elif is_relation_instance(anno):
                row = dict(common_cols, **rel_feats(doc, ctx, anno))
                rows['rel'].append(row)
            elif is_dialogue(anno):
                row = dict(common_cols, **dlg_feats(anno))
                rows['dialogue'].append(row)
            elif is_turn(anno):
                row = dict(common_cols, **turn_feats(anno))
                rows['turn'].append(row)
            elif is_turn_star(anno):
                row = dict(common_cols, **tstar_feats(anno))
                rows['tstar'].append(row)
            elif anno.type in ['paragraph', 'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}
    return res
def fuse_edus(discourse_doc, unit_doc, postags):
    """Return a copy of the discourse level doc, merging info from both
    the discourse and units stage.

    All EDUs will be converted to higher level EDUs.

    Notes
    -----
    * The discourse stage is primary: we walk the EDUs found on the
      discourse stage and enhance them with whatever we find on their
      units-level equivalents.  Annotations can (rarely) go out of
      synch: EDUs missing on the units stage are silently ignored, as
      are EDUs introduced on the units stage but never percolated to
      discourse.

    * Matching between the two stages is by annotation id; keeping the
      annotations in synch is the caller's responsibility.

    * This is not a full merge of the two documents.  A full merge
      would also bring over annotations like Resources, `Preference`,
      `Anaphor`, `Several_resources`, while guarding against timestamp
      clashes with pre-existing annotations (unlikely, but possible
      with automatically generated annotations).
    """
    doc = copy.deepcopy(discourse_doc)

    # build a higher-level EDU for every discourse-level EDU
    # annotation, in span order
    edu_annos = sorted((u for u in doc.units if is_edu(u)),
                       key=lambda u: u.span)
    swap = {}
    for anno in edu_annos:
        twin = twin_from(unit_doc, anno) if unit_doc is not None else None
        swap[anno] = EDU(doc, anno, twin)

    # splice the new EDUs into the document: replace the unit entries
    # and rewire relation endpoints and schema members
    edus = []
    for anno in edu_annos:
        edu = swap[anno]
        edus.append(edu)
        doc.units.remove(anno)
        doc.units.append(edu)
        for rel in doc.relations:
            if rel.source == anno:
                rel.source = edu
            if rel.target == anno:
                rel.target = edu
        for schema in doc.schemas:
            if anno in schema.units:
                schema.units.remove(anno)
                schema.units.append(edu)

    # finally, flesh out each EDU with its contextual info
    # (the EDUs themselves now serve as contexts)
    ctxs = Context.for_edus(doc, postags=postags)
    for edu in edus:
        edu.fleshout(ctxs[edu])
    return doc
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
    """
    rows = {anno_type: list()
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}
    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # BUG FIX: merge with dict(base, **extra) instead of
            # dict(d1.items() + d2.items()); on Python 3 dict views
            # don't support `+`, and the replacement keeps the same
            # semantics on Python 2 (extra keys win on clash)
            if is_edu(anno):
                row = dict(common_cols, **edu_feats(doc, ctx, anno))
                rows['edu'].append(row)
            elif is_cdu(anno):
                row = dict(common_cols, **cdu_feats(anno))
                rows['cdu'].append(row)
            elif is_relation_instance(anno):
                row = dict(common_cols, **rel_feats(doc, ctx, anno))
                rows['rel'].append(row)
            elif is_dialogue(anno):
                row = dict(common_cols, **dlg_feats(anno))
                rows['dialogue'].append(row)
            elif is_turn(anno):
                row = dict(common_cols, **turn_feats(anno))
                rows['turn'].append(row)
            elif is_turn_star(anno):
                row = dict(common_cols, **tstar_feats(anno))
                rows['tstar'].append(row)
            elif anno.type in ['paragraph', 'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}
    return res