def __init__(self, doc, discourse_anno, unit_anno):
    """Build an EDU from its discourse-level and units-level annotations.

    Parameters
    ----------
    doc : ?
        Document this EDU is attached to; stored unchanged
        (NOTE(review): presumably a GlozzDocument -- confirm).
    discourse_anno : ?
        Annotation from the discourse layer. Its local id, text span,
        features, metadata and origin are handed to the parent
        constructor.
    unit_anno : ?
        Annotation from the units layer. May be falsy (e.g. `None`),
        in which case the discourse annotation is used to pick the type.
    """
    self._doc = doc
    self._anno = discourse_anno
    self._unit_anno = unit_anno

    # the units-level annotation wins for the type, but only when it is
    # itself an EDU ; otherwise fall back on the discourse-level type
    type_source = unit_anno or discourse_anno
    if is_edu(type_source):
        unit_type = type_source.type
    else:
        unit_type = discourse_anno.type

    super(EDU, self).__init__(
        discourse_anno.local_id(),
        discourse_anno.text_span(),
        unit_type,
        discourse_anno.features,
        discourse_anno.metadata,
        discourse_anno.origin,
    )

    # to be fleshed out
    for pending_attr in ('turn', 'tstar', 'turn_edus', 'dialogue',
                         'dialogue_turns', 'doc_turns', 'tokens'):
        setattr(self, pending_attr, None)
def is_various(annotation):
    """Tell whether an annotation is none of {edu, turn, paragraph, dialogue}.

    It seems to capture only Resources (to be confirmed).

    Parameters
    ----------
    annotation : Annotation
        Annotation to be tested.

    Returns
    -------
    res : boolean
        True if none of `is_edu`, `is_turn`, `is_paragraph`,
        `is_dialogue` accepts `annotation`.
    """
    known_kinds = (is_edu, is_turn, is_paragraph, is_dialogue)
    # predicates are tried in order and evaluation stops at the first hit
    return not any(is_kind(annotation) for is_kind in known_kinds)
def cdu_feats(anno):
    """Get CDU features that are not immediate.

    Parameters
    ----------
    anno : Schema
        The schema that codes this CDU in the glozz format.

    Returns
    -------
    res : dict(string, int)
        Features on this CDU:
        'members' (number of immediate members of this CDU),
        'members_cdu' (number of CDUs immediately embedded in this CDU),
        'spanned_cdus' (total number of CDUs recursively embedded in
        this CDU),
        'spanned_edus' (total number of EDUs spanned by this CDU),
        'depth' (maximal degree of CDU nesting in this CDU).

    Raises
    ------
    ValueError
        If a (possibly nested) member is neither an EDU nor a CDU.
    """
    direct_members = len(anno.members)
    direct_cdus = sum(1 for m in anno.members if is_cdu(m))

    edu_count = 0
    cdu_count = 0
    deepest = 0
    # explicit stack of (CDU, nesting level of that CDU)
    pending = [(anno, 0)]
    while pending:
        cdu, level = pending.pop()
        child_level = level + 1
        for child in cdu.members:
            if is_edu(child):
                edu_count += 1
                continue
            if not is_cdu(child):
                raise ValueError('Unexpected type for a CDU member')
            cdu_count += 1
            deepest = max(deepest, child_level)
            pending.append((child, child_level))

    # TODO new features:
    # * nb_gaps: CDUs spans can be discontinuous
    # * gap_max_len: max len of a gap (in #EDUs)
    # * over_nb_turns: nb of turns this CDU (partly) spans over
    # * over_nb_tstars: nb of tstars this CDU (partly) spans over
    return {
        'members': direct_members,
        'members_cdu': direct_cdus,
        'spanned_cdus': cdu_count,
        'spanned_edus': edu_count,
        'depth': deepest,
    }
def is_empty_dialogue_act(anno):
    """Return True if anno is an empty dialogue act.

    An annotation is considered an empty dialogue act when all of the
    following hold:
    - it is an EDU whose type is one of `DIALOGUE_ACTS`,
    - `addressees(anno)` is None,
    - its 'Surface_act' feature is still 'Please choose...'
      (NOTE(review): presumably the untouched default of the annotation
      tool -- confirm).

    Parameters
    ----------
    anno : Annotation
        Annotation to be tested

    Returns
    -------
    res : boolean
        True if `anno` is an empty dialogue act.
    """
    is_act = is_edu(anno) and anno.type in DIALOGUE_ACTS
    if not is_act:
        return is_act
    if addressees(anno) is not None:
        return False
    return anno.features.get('Surface_act') == 'Please choose...'
def __init__(self, doc, discourse_anno, unit_anno):
    """Create an EDU out of a discourse annotation and its units twin.

    Parameters
    ----------
    doc : ?
        Surrounding document, kept in `self._doc`.
    discourse_anno : ?
        Annotation from the discourse layer ; provides the id, span,
        features, metadata and origin given to the parent constructor.
    unit_anno : ?
        Annotation from the units layer ; when it is falsy, the
        discourse annotation is used instead to determine the type.
    """
    self._doc = doc
    self._anno = discourse_anno
    self._unit_anno = unit_anno

    # take the type from the units layer only if that annotation is an EDU
    candidate = unit_anno or discourse_anno
    unit_type = discourse_anno.type
    if is_edu(candidate):
        unit_type = candidate.type

    parent_args = (
        discourse_anno.local_id(),
        discourse_anno.text_span(),
        unit_type,
        discourse_anno.features,
        discourse_anno.metadata,
        discourse_anno.origin,
    )
    super(EDU, self).__init__(*parent_args)

    # to be fleshed out
    self.turn = self.tstar = self.turn_edus = None
    self.dialogue = self.dialogue_turns = self.doc_turns = None
    self.tokens = None
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. (The checks themselves are still FIXMEs, so this
        flag currently has no effect.)
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; it is silently ignored otherwise.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from the
        "units" stage.

    Raises
    ------
    ValueError
        On annotations that break the assumptions made here: discourse
        segment or preference with features, schema with unexpected
        features or with relation members, unit of unknown kind,
        relation from a stage other than 'units' or 'discourse'.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # loop-invariant: the only features a dialogue is expected to carry
    expected_dlg_features = {'Dice_rolling', 'Gets', 'Trades'}

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # NB: compare a *list* of the keys ; on python 3,
                # `d.keys() == [...]` is always False because keys() is
                # a view, which made every schema with features raise
                if stage == 'units':
                    if list(anno.features) == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if list(anno.features) == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): hard-coded trap on one specific relation
                # id, looks like a debugging leftover -- confirm whether
                # it can be removed
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)

        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)

            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; `.item()` below
        # raises if that is not the case
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # NOTE(review): `find` returns the *first* occurrence, which may
        # not be this segment if its text is repeated within the turn
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def rel_feats(doc, ctx, anno, debug=False):
    """Get features for relations.

    Parameters
    ----------
    doc : GlozzDocument
        Surrounding document
    ctx : mapping
        Indexed by EDU ; each value must expose a `tstar` attribute.
    anno : relation annotation
        Relation whose `source` and `target` are examined.
    debug : boolean, defaults to False
        If True, print the EDUs of problematic (overlapping or
        interwoven) relations.

    Returns
    -------
    res : dict(string, string?)
        Features for this relation: 'src_type', 'tgt_type', 'direction',
        'edu_dist', 'tstar_dist'. Empty if an endpoint is neither an EDU
        nor a CDU, or if source and target share an EDU.
    """
    def _endpoint(du):
        """Return (type name, EDUs sorted by span) of an endpoint, or None."""
        if is_cdu(du):
            return 'CDU', sorted(du.terminals(), key=lambda e: e.span)
        if is_edu(du):
            return 'EDU', [du]
        return None

    def _dump(label, *edu_groups):
        """Print a header line, then one line per group of EDUs."""
        print('* {}: {} {}'.format(doc.origin, label, anno.type))
        for group in edu_groups:
            print('\t' + ', '.join('[{}] {}'.format(str(e.span),
                                                    doc.text(e.span))
                                   for e in group))

    # get all EDUs from document, sorted by their span
    doc_edus = sorted((u for u in doc.units if is_edu(u)),
                      key=lambda u: u.span)
    # TODO doc_tstars = ...

    src_info = _endpoint(anno.source)
    if src_info is None:
        # covered by stac-check ("non-DU endpoints")
        return {}
    src_type, src_edus = src_info

    tgt_info = _endpoint(anno.target)
    if tgt_info is None:
        # covered by stac-check ("non-DU endpoints")
        return {}
    tgt_type, tgt_edus = tgt_info

    # get the index of the EDUs in the interval between src and tgt
    src_idc = [doc_edus.index(e) for e in src_edus]
    tgt_idc = [doc_edus.index(e) for e in tgt_edus]

    # error case covered at least partially by stac-check, either
    # as "bizarre relation instance" or as "CDU punctures"
    if set(src_idc).intersection(set(tgt_idc)):
        if debug:
            _dump('messed up', src_edus, tgt_edus)
        return {}

    if src_idc[-1] < tgt_idc[0]:
        # src ... tgt
        direction = 'right'
        interv_edus = doc_edus[(src_idc[-1] + 1):tgt_idc[0]]
    elif tgt_idc[-1] < src_idc[0]:
        # tgt ... src
        direction = 'left'
        interv_edus = doc_edus[(tgt_idc[-1] + 1):src_idc[0]]
    else:
        # tgt and src are interwoven
        direction = 'interwoven'  # FIXME
        interv_edus = []
        src_tgt_idc = set(src_idc).union(tgt_idc)
        lowest = min(src_tgt_idc)
        highest = max(src_tgt_idc)
        gap_edus = [e for i, e in enumerate(doc_edus)
                    if i not in src_tgt_idc and lowest < i < highest]
        if debug:
            _dump(direction, src_edus, tgt_edus, gap_edus)

    edu_dist = len(interv_edus) + 1

    # turn-stars distance
    src_tstars = [ctx[e].tstar for e in src_edus]
    tgt_tstars = [ctx[e].tstar for e in tgt_edus]
    interv_tstars = [ctx[e].tstar for e in interv_edus]
    # turn-stars from the interval that don't overlap with src nor tgt
    skipped_tstars = set(interv_tstars) - set(src_tstars) - set(tgt_tstars)
    # we define:
    # * tstar_dist = 0 if (part of) src and tgt belong to the same tstar
    # * tstar_dist = len(skipped_tstars) + 1 otherwise
    if set(src_tstars).intersection(set(tgt_tstars)):
        tstar_dist = 0
    else:
        tstar_dist = len(skipped_tstars) + 1

    return {
        'src_type': src_type,
        'tgt_type': tgt_type,
        'direction': direction,
        'edu_dist': edu_dist,
        'tstar_dist': tstar_dist,
    }
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus : dict
        Maps file ids (exposing `doc`, `subdoc`, `stage`, `annotator`)
        to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus ;
        possible keys are 'edu', 'turn', 'tstar', 'dialogue', 'cdu',
        'rel'. Kinds with no row are left out.
    """
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ['paragraph', 'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

            # merge by copy + update: `a.items() + b.items()` only works
            # on python 2, where items() returns lists ; as before, the
            # specific features win over the common columns on key clash
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}
    return res
def fix_likely_annotation_errors(anno_doc, verbose=1):
    """Fix a document for likely annotation errors due to glozz UX.

    Likely errors are currently defined as:
    - units of span length 0 (delete),
    - empty dialogue acts enclosed in another EDU with a different
      text span (delete),
    - schemas with no member (delete),
    - overflowing units (fix span).

    Parameters
    ----------
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level

    Returns
    -------
    anno_doc : GlozzDocument
        Same document but filtered.

    Raises
    ------
    ValueError
        If an EDU that is not enclosed in exactly one turn does not
        overlap with exactly one turn either.
    """
    def _is_enclosed_empty_act(unit):
        """True for an empty dialogue act sitting inside another EDU."""
        if not is_empty_dialogue_act(unit):
            return False
        return any(other.encloses(unit) for other in anno_doc.units
                   if other.text_span() != unit.text_span() and
                   is_edu(other))

    def _report(header, items):
        """Print a header and one checkbox line per item, if any."""
        if items:
            print(header)
            print('\n'.join(' [ ] {}'.format(str(item)) for item in items))

    # units
    bad_units = [unit for unit in anno_doc.units
                 if (unit.span.char_start == unit.span.char_end or
                     _is_enclosed_empty_act(unit))]
    # schemas
    bad_schemas = [schm for schm in anno_doc.schemas if not schm.members]
    # relations
    # TODO
    bad_relations = []

    # warn about the ignored annotations
    if verbose:
        if bad_units or bad_schemas or bad_relations:
            print('Likely errors due to glozz UX')
            print('-----------------------------')
        _report('|-> Units', bad_units)
        _report('|-> Schemas', bad_schemas)
        _report('|-> Relations', bad_relations)

    # remove detected errors
    bad_units = set(bad_units)
    anno_doc.units = [unit for unit in anno_doc.units
                      if unit not in bad_units]
    bad_schemas = set(bad_schemas)
    anno_doc.schemas = [schm for schm in anno_doc.schemas
                        if schm not in bad_schemas]
    bad_relations = set(bad_relations)
    anno_doc.relations = [rel for rel in anno_doc.relations
                          if rel not in bad_relations]

    # fix span of units that overflow from their turn
    all_turns = [unit for unit in anno_doc.units if is_turn(unit)]
    all_edus = [unit for unit in anno_doc.units if is_edu(unit)]
    for edu in all_edus:
        nb_enclosing = len([t for t in all_turns if t.encloses(edu)])
        if nb_enclosing == 1:
            # properly enclosed in a unique turn: nothing to fix
            continue
        overlapping = [t for t in all_turns if t.overlaps(edu)]
        if len(overlapping) != 1:
            raise ValueError('No unique overlapping turn for {}'.format(edu))
        # clip the EDU to the part it shares with its turn
        shared_span = overlapping[0].overlaps(edu)
        if shared_span != edu.text_span():
            edu.span = shared_span
            if verbose:
                print('Fix span of overflowing unit: {}'.format(edu))
    return anno_doc
def split_annotated(dir_orig, doc, verbose=0):
    """Do the split

    For each `*.aa` file in the `annotated` subfolder of the game, write
    a `discourse/BRONZE` version and a `units/BRONZE` version, each with
    a symlink to the matching `.ac` file from `unannotated`.

    Parameters
    ----------
    dir_orig : string
        Folder of the annotated corpus
    doc : string
        Name of the document
    verbose : int, defaults to 0
        Verbosity level, passed on to `fix_likely_annotation_errors`
        and `infer_resegmentation`.

    Raises
    ------
    ValueError
        If the game folder or its `unannotated` or `annotated` subfolder
        cannot be found, or if some annotation of the cleaned
        `annotated` doc ends up in neither `units` nor `discourse`.
    OSError
        If a symlink to the `.ac` file cannot be created.
    """
    # locate game folder
    dir_orig = os.path.abspath(dir_orig)
    game_dir_orig = os.path.join(dir_orig, doc)
    if not os.path.isdir(game_dir_orig):
        err_msg = 'Unable to find original files {}'.format(game_dir_orig)
        raise ValueError(err_msg)
    # check for unannotated subfolder
    unannotated_dir = os.path.join(game_dir_orig, 'unannotated')
    if not os.path.isdir(unannotated_dir):
        err_msg = 'Unable to find unannotated folder {}'.format(
            unannotated_dir)
        raise ValueError(err_msg)
    # check for annotated subfolder
    annotated_dir = os.path.join(game_dir_orig, 'annotated')
    if not os.path.isdir(annotated_dir):
        err_msg = 'Unable to find annotated folder {}'.format(
            annotated_dir)
        raise ValueError(err_msg)
    # create discourse/BRONZE and units/BRONZE (should it be skar?)
    disc_dir = os.path.join(game_dir_orig, 'discourse', 'BRONZE')
    if not os.path.isdir(disc_dir):
        os.makedirs(disc_dir)
        print('Creating folder {}'.format(disc_dir))
    units_dir = os.path.join(game_dir_orig, 'units', 'BRONZE')
    if not os.path.isdir(units_dir):
        os.makedirs(units_dir)
        print('Creating folder {}'.format(units_dir))

    # process each annotated file
    for anno_file in sorted(glob(os.path.join(annotated_dir, '*.aa'))):
        print('Processing {}'.format(os.path.basename(anno_file)))
        print('=================================')
        # matching text file
        text_file = os.path.splitext(anno_file)[0] + '.ac'
        # read and filter the `annotated` file
        anno_doc = educe.glozz.read_annotation_file(anno_file, text_file)
        anno_doc = fix_likely_annotation_errors(anno_doc, verbose=verbose)
        # read the `unannotated` file
        unanno_file = os.path.join(unannotated_dir,
                                   os.path.basename(anno_file))
        unanno_doc = educe.glozz.read_annotation_file(unanno_file, text_file)
        # infer resegmentation in `annotated`
        anno_doc = infer_resegmentation(unanno_doc, anno_doc,
                                        verbose=verbose)

        # create `units` doc from the cleaned `annotated`
        # port annotations: dialogue acts, resources, preferences
        units_doc = copy.deepcopy(anno_doc)
        # * keep all clean units
        # * relations: anaphors only
        units_doc.relations = [x for x in units_doc.relations
                               if x.type == 'Anaphor']
        # * schemas: 'Several_resources' only
        units_doc.schemas = [x for x in units_doc.schemas
                             if x.type == 'Several_resources']

        # create `discourse` from the cleaned `annotated`
        disc_doc = copy.deepcopy(anno_doc)
        # remove dialogue act annotation from segments, so that they revert
        # to being basic EDUs
        for disc_unit in disc_doc.units:
            if is_edu(disc_unit):
                disc_unit.type = 'Segment'
                disc_unit.features = {}
        # filter anaphoric relations
        disc_doc.relations = [x for x in disc_doc.relations
                              if x.type != 'Anaphor']
        # filter resources schemas
        disc_doc.schemas = [x for x in disc_doc.schemas
                            if x.type != 'Several_resources']

        # dump both files
        bname = os.path.basename(os.path.splitext(anno_file)[0])
        # discourse file
        disc_anno_file = os.path.join(disc_dir, bname + '.aa')
        write_annotation_file(disc_anno_file, disc_doc)
        # units file
        units_anno_file = os.path.join(units_dir, bname + '.aa')
        write_annotation_file(units_anno_file, units_doc)
        # create two symlinks to the same .ac file, for discourse and units
        ac_path = os.path.join(game_dir_orig, 'unannotated', bname + '.ac')
        for subdir in [disc_dir, units_dir]:
            link_src = os.path.relpath(ac_path, subdir)
            link_name = os.path.join(subdir, os.path.basename(ac_path))
            # lexists, not exists: a dangling symlink left from a
            # previous run must be removed too, or symlink() would fail
            if os.path.lexists(link_name):
                os.unlink(link_name)
            try:
                os.symlink(link_src, link_name)
            except OSError:
                print('Unable to create symlink {} to {}'.format(
                    link_src, link_name
                ))
                raise

        # check that all annotations from the filtered annotated doc
        # have been ported to either units or discourse
        anno_all_annos = set(x.local_id() for x in itertools.chain(
            anno_doc.units, anno_doc.relations, anno_doc.schemas
        ))
        # gather all annotations from units_doc and disc_doc
        units_all_annos = set(x.local_id() for x in itertools.chain(
            units_doc.units, units_doc.relations, units_doc.schemas
        ))
        disc_all_annos = set(x.local_id() for x in itertools.chain(
            disc_doc.units, disc_doc.relations, disc_doc.schemas
        ))
        # do the check
        missing_annos = (anno_all_annos - units_all_annos - disc_all_annos)
        if missing_annos:
            print('Missing annotations from {}:\n {}'.format(
                anno_file,
                '\n '.join(str(x) for x
                           in set(anno_doc.units + anno_doc.relations +
                                  anno_doc.schemas)
                           if x.local_id() in missing_annos)
            ))
            print('unanno EDUs:\n {}'.format(
                '\n '.join(str(x) for x in unanno_doc.units
                           if is_edu(x))
            ))
            raise ValueError('Ho?')
        # pretty
        print()
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    Works turn by turn (turns are taken from `unanno_doc`): EDUs of
    `anno_doc` that conflict with each other are classified as
    duplicates, EDU merges, CDUs wrongly coded as segments, or EDU
    splits ; the resulting mapping is then used to rewrite the units,
    relations and schemas of `anno_doc`.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Document from the `unannotated` stage ; its turns and EDUs are
        the reference segmentation.
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.

    Raises
    ------
    ValueError
        If a sequence of segments covers another segment but is neither
        a CDU (no relation support) nor a merge of original EDUs.
    """
    # anno_map: annotation -> annotation that replaces it
    anno_map = dict()
    # cautious_map: subset of replacements that should be reviewed ;
    # relations that touch them get their type prefixed below
    cautious_map = dict()
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [x for x in unanno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [x for x in anno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]

        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        # (the sort key puts segments absent from `unannotated` first, so
        # that in each pair elt_b can be the original segment)
        dup_items = [(elt_a, elt_b) for elt_a, elt_b
                     in itertools.combinations(
                         sorted(a_edus, key=lambda x: (
                             x.local_id() in u_ids, x.local_id())),
                         2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(),
                                 eps=1) and
                         elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]

        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b
                        in itertools.combinations(
                            sorted(a_edus, key=lambda x: (
                                x.type in DIALOGUE_ACTS, x.local_id())),
                            2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(anno_map.get(x, x)
                           for rel in anno_doc.relations
                           for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): groupby only groups *consecutive* items and
        # pw_conflicts follows the order of combinations (first element
        # first), so pairs sharing the same elt_b may end up in several
        # groups -- confirm this cannot happen in practice
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)
            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b) or
                 elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    # (the new schema reuses the local id of elt_b)
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid, sch_units, sch_relas,
                                     sch_schms, sch_stype, sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n {}'.format(
                            new_cdu, elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n {}'.format(
                            elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(err_msg.format(
                        ', '.join(str(y) for y in sorted_a),
                        elt_b
                    ))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b and
                            cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids and
                 approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in set(edu_splits.keys())]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        # whatever is left at this point could not be explained
        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join(
                    ' {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                    for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    # (same mappings as above, but between local ids)
    anno_map_id = {x.local_id(): y.local_id()
                   for x, y in anno_map.items()}
    cautious_map_id = {x.local_id(): y.local_id()
                       for x, y in cautious_map.items()}
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [x for x in anno_doc.units
                      if (not is_edu(x) or
                          x.local_id() not in anno_map_id)]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)
    # rewrite the support of relations and schemas
    objects = {x.local_id(): x
               for x in itertools.chain(anno_doc.units,
                                        anno_doc.relations,
                                        anno_doc.schemas)}
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type
    # * rewrite the support of schemas
    for sch in anno_doc.schemas:
        # sch.id = sch.id
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.type = sch.type
        # sch.features = sch.features
        # sch.metadata = sch.metadata
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
def fuse_edus(discourse_doc, unit_doc, postags):
    """Copy the discourse-stage doc, upgrading its EDUs with units-stage info.

    Every EDU annotation of the discourse document is replaced, in the
    returned copy, by a higher-level `EDU` object built from the discourse
    annotation and its units-stage twin.

    Notes
    -----
    * The discourse stage drives the process: we walk over the EDUs of
      the discourse stage and enrich each of them with what is found on
      its units-stage counterpart. The two stages can occasionally be
      out of synch: an EDU with no units-stage twin is still converted
      (the `EDU` is then built with `None` as its unit annotation), and
      EDUs that only exist on the units stage are never looked at.

    * Twins are looked up with `twin_from`, which is expected to match
      annotations by id; making sure both stages are really in synch is
      the caller's responsibility.

    * This is not a full merge of the two documents. A full merge would
      also need to carry over other annotations such as Resources,
      `Preference`, `Anaphor`, `Several_resources`, while guarding
      against timestamp clashes with pre-existing annotations (unlikely,
      but not impossible with automatically generated annotations).

    Parameters
    ----------
    discourse_doc : GlozzDocument
        Document from the "discourse" stage; it is deep-copied, not
        modified.
    unit_doc : GlozzDocument
        Document from the "units" stage, or None if unavailable.
    postags : list of Token
        Sequence of educe tokens predicted by the POS tagger for this
        document; passed on to `Context.for_edus`.

    Returns
    -------
    doc : GlozzDocument
        Deep copy of `discourse_doc` in which EDU annotations have been
        replaced by fleshed-out `EDU` objects.
    """
    doc = copy.deepcopy(discourse_doc)

    # pass 1: build one high-level EDU per discourse EDU annotation,
    # visiting the annotations in span order
    edu_annos = sorted((unit for unit in doc.units if is_edu(unit)),
                       key=lambda unit: unit.span)
    edu_for = {}
    for old in edu_annos:
        if unit_doc is None:
            twin = None
        else:
            twin = twin_from(unit_doc, old)
        edu_for[old] = EDU(doc, old, twin)

    # pass 2: substitute each original annotation with its EDU wherever
    # the document refers to it (units, relation endpoints, schema units)
    fused = []
    for old in edu_annos:
        new = edu_for[old]
        fused.append(new)
        # drop the old annotation before adding the new one
        doc.units.remove(old)
        doc.units.append(new)
        for rel in doc.relations:
            if rel.source == old:
                rel.source = new
            if rel.target == old:
                rel.target = new
        for schema in doc.schemas:
            if old in schema.units:
                schema.units.remove(old)
                schema.units.append(new)

    # pass 3: contexts are computed on the rewritten doc, so they are
    # keyed by the new EDU objects; use them to flesh out each EDU
    contexts = Context.for_edus(doc, postags=postags)
    for new in fused:
        new.fleshout(contexts[new])
    return doc
def rel_feats(doc, ctx, anno, debug=False):
    """Get features for a relation instance.

    Parameters
    ----------
    doc : GlozzDocument
        Surrounding document; its EDUs (sorted by span) define the
        positions used to measure distances.
    ctx : mapping
        Maps each EDU to a context object exposing a `tstar` attribute
        (presumably the result of `Context.for_edus(doc)`).
    anno : relation annotation
        The relation instance; its `source`, `target` and `type`
        attributes are read.
    debug : bool, optional
        If True, print a description of relations whose source and
        target overlap or are interwoven.

    Returns
    -------
    res : dict(string, string or int)
        Features for this relation: 'src_type' and 'tgt_type' ('EDU' or
        'CDU'), 'direction' ('right', 'left' or 'interwoven'),
        'edu_dist' and 'tstar_dist'. An empty dict is returned when no
        features can be computed: an endpoint is neither an EDU nor a
        CDU, a CDU endpoint spans no EDU, or source and target share an
        EDU.

    Raises
    ------
    ValueError
        If an EDU of the source or target is not among the EDUs of
        `doc` (raised by `list.index`).
    """
    def _print_edus(edus):
        """Print the span and text of each EDU on one tab-indented line."""
        print('\t' + ', '.join(
            '[{}] {}'.format(str(e.span), doc.text(e.span))
            for e in edus
        ))

    # get all EDUs from document, sorted by their span
    doc_edus = sorted((u for u in doc.units if is_edu(u)),
                      key=lambda u: u.span)
    # TODO doc_tstars = ...

    src = anno.source
    if is_cdu(src):
        src_type = 'CDU'
        src_edus = sorted(src.terminals(), key=lambda e: e.span)
    elif is_edu(src):
        src_type = 'EDU'
        src_edus = [src]
    else:
        # covered by stac-check ("non-DU endpoints")
        return {}

    tgt = anno.target
    if is_cdu(tgt):
        tgt_type = 'CDU'
        tgt_edus = sorted(tgt.terminals(), key=lambda e: e.span)
    elif is_edu(tgt):
        tgt_type = 'EDU'
        tgt_edus = [tgt]
    else:
        # covered by stac-check ("non-DU endpoints")
        return {}

    # a CDU that spans no EDU gives nothing to measure distances from;
    # treat it like the other malformed cases instead of failing on
    # the [-1] / [0] lookups below
    if not src_edus or not tgt_edus:
        return {}

    # positions of the source and target EDUs in the document
    src_idc = [doc_edus.index(e) for e in src_edus]
    tgt_idc = [doc_edus.index(e) for e in tgt_edus]

    # error case covered at least partially by stac-check, either
    # as "bizarre relation instance" or as "CDU punctures"
    if set(src_idc).intersection(tgt_idc):
        if debug:
            direction = 'messed up'
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _print_edus(src_edus)
            _print_edus(tgt_edus)
        return {}

    if src_idc[-1] < tgt_idc[0]:
        # src ... tgt
        direction = 'right'
        interv_edus = doc_edus[(src_idc[-1] + 1):tgt_idc[0]]
    elif tgt_idc[-1] < src_idc[0]:
        # tgt ... src
        direction = 'left'
        interv_edus = doc_edus[(tgt_idc[-1] + 1):src_idc[0]]
    else:
        # tgt and src are interwoven
        direction = 'interwoven'  # FIXME
        interv_edus = []
        if debug:
            # EDUs strictly inside the overall span of src and tgt that
            # belong to neither; only needed for the debug output
            src_tgt_idc = set(src_idc).union(tgt_idc)
            lo_idx = min(src_tgt_idc)
            hi_idx = max(src_tgt_idc)
            gap_edus = [
                e for i, e in enumerate(doc_edus)
                if lo_idx < i < hi_idx and i not in src_tgt_idc
            ]
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _print_edus(src_edus)
            _print_edus(tgt_edus)
            _print_edus(gap_edus)

    edu_dist = len(interv_edus) + 1

    # turn-stars distance
    src_tstars = set(ctx[e].tstar for e in src_edus)
    tgt_tstars = set(ctx[e].tstar for e in tgt_edus)
    interv_tstars = set(ctx[e].tstar for e in interv_edus)
    # turn-stars from the interval that don't overlap with src nor tgt
    skipped_tstars = interv_tstars - src_tstars - tgt_tstars
    # we define:
    # * tstar_dist = 0 if (part of) src and tgt belong to the same tstar
    # * tstar_dist = len(skipped_tstars) + 1 otherwise
    if src_tstars.intersection(tgt_tstars):
        tstar_dist = 0
    else:
        tstar_dist = len(skipped_tstars) + 1

    res = {
        'src_type': src_type,
        'tgt_type': tgt_type,
        'direction': direction,
        'edu_dist': edu_dist,
        'tstar_dist': tstar_dist,
    }
    return res
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each annotation of each document yields one row made of columns
    common to all annotations ('anno_id', 'doc', 'subdoc', 'stage',
    'annotator', 'type') plus the features specific to its kind. When a
    feature has the same name as a common column, the feature value
    wins.

    Parameters
    ----------
    corpus : dict(FileId, GlozzDocument)
        Mapping from file ids (exposing `doc`, `subdoc`, `stage` and
        `annotator`) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'. Kinds
        with no row are left out, so an empty corpus gives an empty
        dict.
    """
    # annotation types that are deliberately left out of the tables:
    # each paragraph (normally) corresponds to a Turn so just ignore
    # them; the situation is less clear-cut for 'Resource', 'Anaphora',
    # 'Several_resources'
    ignored_types = [
        'paragraph', 'Resource', 'Anaphora', 'Several_resources',
        'Preference'
    ]

    rows = {
        anno_type: []
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # find out which table the annotation goes to, and compute
            # the features specific to this kind of annotation
            if is_edu(anno):
                table, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                table, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                table, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                table, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                table, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                table, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ignored_types:
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # unsupported annotations are reported, not fatal
                print('W: {}'.format(err_msg))
                continue

            # merge by copy + update rather than concatenating the
            # results of .items(): dict views cannot be added together
            # on Python 3
            row = dict(common_cols)
            row.update(feats)
            rows[table].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items()
        if row_list
    }
    return res