def is_various(annotation):
    """Tell whether `annotation` is none of {edu, turn, paragraph, dialogue}.

    It seems to capture only Resources (to be confirmed).
    """
    # predicates are tried in the same order as the original `or` chain
    structural_kinds = (is_edu, is_turn, is_paragraph, is_dialogue)
    return not any(is_kind(annotation) for is_kind in structural_kinds)
def is_various(annotation):
    """Return True when `annotation` is not an edu, turn, paragraph or dialogue.

    It seems to capture only Resources (to be confirmed).
    """
    # one guard clause per structural kind, checked in the original order
    if is_edu(annotation):
        return False
    if is_turn(annotation):
        return False
    if is_paragraph(annotation):
        return False
    if is_dialogue(annotation):
        return False
    return True
def fix_likely_annotation_errors(anno_doc, verbose=1):
    """Clean a document from likely annotation errors due to glozz UX.

    The following are currently considered likely errors:
    - units of span length 0 (deleted),
    - empty dialogue acts enclosed in an EDU with a different text span
      (deleted),
    - schemas with no member (deleted),
    - EDUs that overflow from their turn (span is trimmed).

    Parameters
    ----------
    anno_doc : GlozzDocument
        Document to filter; it is modified in place.
    verbose : int
        Verbosity level; any truthy value prints a report of what is
        removed or fixed.

    Returns
    -------
    anno_doc : GlozzDocument
        Same document but filtered.

    Raises
    ------
    ValueError
        If an EDU that is not enclosed in exactly one turn does not
        overlap with exactly one turn either.
    """
    def _is_shadowed_empty_act(unit):
        """True if `unit` is an empty dialogue act enclosed in an EDU
        whose text span differs from its own."""
        if not is_empty_dialogue_act(unit):
            return False
        for other in anno_doc.units:
            if (other.text_span() != unit.text_span()
                    and is_edu(other)
                    and other.encloses(unit)):
                return True
        return False

    # detect the annotations to discard
    bad_units = []
    for unit in anno_doc.units:
        if (unit.span.char_start == unit.span.char_end
                or _is_shadowed_empty_act(unit)):
            bad_units.append(unit)
    bad_schemas = [sch for sch in anno_doc.schemas if not sch.members]
    # relations: TODO, nothing is detected yet
    bad_relations = []

    # warn about the ignored annotations
    if verbose:
        report = (
            ('|-> Units', bad_units),
            ('|-> Schemas', bad_schemas),
            ('|-> Relations', bad_relations),
        )
        if any(errs for _, errs in report):
            print('Likely errors due to glozz UX')
            print('-----------------------------')
        for header, errs in report:
            if errs:
                print(header)
                print('\n'.join(' [ ] {}'.format(str(err)) for err in errs))

    # remove detected errors, one kind of annotation at a time
    for attr_name, errs in (('units', bad_units),
                            ('schemas', bad_schemas),
                            ('relations', bad_relations)):
        discarded = set(errs)
        kept = [anno for anno in getattr(anno_doc, attr_name)
                if anno not in discarded]
        setattr(anno_doc, attr_name, kept)

    # fix span of units that overflow from their turn
    all_turns = [unit for unit in anno_doc.units if is_turn(unit)]
    all_edus = [unit for unit in anno_doc.units if is_edu(unit)]
    for edu in all_edus:
        nb_enclosing = len([trn for trn in all_turns if trn.encloses(edu)])
        if nb_enclosing == 1:
            # properly nested EDU: nothing to fix
            continue
        candidates = [trn for trn in all_turns if trn.overlaps(edu)]
        if len(candidates) != 1:
            raise ValueError('No unique overlapping turn for {}'.format(edu))
        # restrict the EDU to the part it shares with its only turn
        shared_span = candidates[0].overlaps(edu)
        if shared_span != edu.text_span():
            edu.span = shared_span
            if verbose:
                print('Fix span of overflowing unit: {}'.format(edu))
    return anno_doc
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    For each turn of `unanno_doc`, the EDUs of `anno_doc` enclosed in
    this turn are compared pairwise to detect duplicated segments, EDU
    merges, CDUs drawn as segments and EDU splits. The resulting mapping
    is then used to rewrite the units, relations and schemas of
    `anno_doc`.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Document that was the starting point for the annotation process;
        it provides the turns and the original EDUs.
    anno_doc : GlozzDocument
        Document to filter; it is modified in place.
    verbose : int
        Verbosity level: any truthy value prints the remaining conflicts,
        values above 1 also print the inferred CDUs and EDU merges.

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.

    Raises
    ------
    ValueError
        If a sequence of segments approximately covers a bigger segment
        but is neither a CDU (no segment supports a relation) nor an EDU
        merge (some segments are not original EDUs).
    """
    anno_map = dict()  # annotation -> annotation it is rewritten to
    cautious_map = dict()  # same, restricted to rewrites to be reviewed
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [x for x in unanno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [x for x in anno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]

        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        dup_items = [
            (elt_a, elt_b)
            for elt_a, elt_b in itertools.combinations(
                sorted(a_edus,
                       key=lambda x: (x.local_id() in u_ids, x.local_id())),
                2)
            if (span_eq(elt_a.text_span(), elt_b.text_span(), eps=1)
                and elt_b.local_id() in u_ids)
        ]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        pw_conflicts = [
            (elt_a, elt_b)
            for elt_a, elt_b in itertools.combinations(
                sorted(a_edus,
                       key=lambda x: (x.type in DIALOGUE_ACTS, x.local_id())),
                2)
            if elt_a.overlaps(elt_b)
        ]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(anno_map.get(x, x)
                           for rel in anno_doc.relations
                           for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): groupby only groups *consecutive* pairs and
        # pw_conflicts is not explicitly sorted on its second element, so
        # a given elt_b could show up in several groups -- to be confirmed
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)
            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if (approximate_cover(sorted_a, elt_b)
                    or elt_b.text_span().encloses(span_seq_a)):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid, sch_units, sch_relas,
                                     sch_schms, sch_stype, sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n {}'.format(
                            new_cdu, elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n {}'.format(
                            elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(
                        err_msg.format(', '.join(str(y) for y in sorted_a),
                                       elt_b))

        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)

        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b
                            and cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if (elt_a.local_id() in u_ids
                    and approximate_cover(sorted_b, elt_a)):
                edu_splits[elt_a] = sorted_b
        # membership is tested on the dict itself: no need to rebuild a
        # set of its keys for each pair
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in edu_splits]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join(
                    ' {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                    for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    anno_map_id = {x.local_id(): y.local_id()
                   for x, y in anno_map.items()}
    cautious_map_id = {x.local_id(): y.local_id()
                       for x, y in cautious_map.items()}
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [x for x in anno_doc.units
                      if (not is_edu(x)
                          or x.local_id() not in anno_map_id)]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    objects = {
        x.local_id(): x
        for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                 anno_doc.schemas)
    }
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        # NOTE(review): the test is made on the *rewritten* endpoints
        # whereas the keys of cautious_map_id are the original ids --
        # confirm this is intended
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type
    # * rewrite the support of schemas ; id, type, features and metadata
    # are left untouched
    for sch in anno_doc.schemas:
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the same
        span as dialogues from `doc_src`.
    updates : set of updates
        Updates computed by `compute_updates`; modified in place.
    gen : int
        Generation of annotations included in `doc_src` and the output.
        For `gen < 3`, dialogue boundaries are aligned on the dialogues
        of `doc_src`; otherwise they are aligned on game turns.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue.
    """
    if gen < 3:
        dlgs_src = sorted(
            [x for x in doc_src.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        dlgs_res = sorted(
            [x for x in doc_res.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        dlgs_tgt_sbeg = np.array(
            [shift_char(x.span.char_start + 1, updates) - 1
             for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (
            np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg, side='right') - 1)
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array(
            [shift_char(x.span.char_end - 1, updates) + 1
             for x in dlgs_res])
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur, beg_nxt) in enumerate(
                zip(tgt2src_end[:-1], tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]

        # find source dialogues included in the shifted+expanded target
        # dialogues ; a plain set is built so that this also works when
        # there is no target dialogue, and does not rely on `reduce`
        src_matched = set(
            i
            for x_beg, x_end in zip(tgt2src_beg, tgt2src_end)
            for i in range(int(x_beg), int(x_end) + 1))

        for dlg_res, adj_start, adj_end in zip(
                dlgs_res, dlgs_tgt_abeg, dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`
    else:
        # situated version: we can rely on game turns
        # 1. get the identifier of the first and last turn of each game turn
        # in _src: these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array([x.features['Identifier']
                                  for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        gturn_idc_end = np.array(
            [i - 1 for i in gturn_idc[1:]] + [len(turns_src) - 1])

        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array([x.features['Identifier']
                                  for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(turns_res_end, dlgs_res_end,
                                          side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]

        # 3. map _res dialogues to _src game turns
        # the list of source turn ids is built once for all lookups
        src_tids = list(turns_src_tid)
        dlgs_res_ti_beg = np.array(
            [src_tids.index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [src_tids.index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg,
                                         side='right') - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res
        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class
        dlg2grp = [0]
        for gturn_end_cur, gturn_beg_nxt in zip(
                dlg2gturn_end[:-1], dlg2gturn_beg[1:]):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)

        # keep one dialogue for each class of dialogues
        for _, grp in itertools.groupby(enumerate(dlg2grp),
                                        key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in grp]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                dlg_res = dlgs_res[i]
                doc_res.units.remove(dlg_res)

        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        src_matched = set(
            i
            for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end)
            for i in range(int(x_beg), int(x_end) + 1))
        # each dialogue in doc_src is a game turn
        dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                          key=lambda x: x.span)

    # remove all source and target dialogues from updates
    for dlg_res in dlgs_res:
        if dlg_res in updates.abnormal_tgt_only:
            updates.abnormal_tgt_only.remove(dlg_res)
    for i, dlg_src in enumerate(dlgs_src):
        if dlg_src in updates.abnormal_src_only:
            updates.abnormal_src_only.remove(dlg_src)
        if (i in src_matched
                and dlg_src in updates.expected_src_only):
            # remove matched source dialogues, leave the unmatched
            # ones in expected_src_only, so that they are added later
            # to the woven document
            updates.expected_src_only.remove(dlg_src)

    return updates
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. NB: these checks are not implemented yet (FIXME).
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; it is ignored otherwise.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game: turns, dialogues, segments,
        dialogue acts, schemas, schema members, discourse relations,
        resources, preferences, relations from the "units" stage.

    Raises
    ------
    ValueError
        If an annotation does not have the expected type, features or
        stage.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    # features a dialogue is allowed to have
    expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in ('BRONZE', 'SILVER', 'GOLD'))
                or (sel_annotator != 'metal'
                    and annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys())
                            == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys())
                        == ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # the keys are converted to a list before the comparison:
                # a view of keys (Python 3) never equals a list
                feat_names = list(anno.features.keys())
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): hard-coded identifier, looks like a
                # debugging leftover -- confirm before removing
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)

        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)

            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus : dict
        Mapping from file identifiers (with attributes `doc`, `subdoc`,
        `stage`, `annotator`) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
        Keys are among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel';
        kinds with no annotation are left out.
    """
    rows = {
        anno_type: []
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            # find the kind of the annotation and its specific features ;
            # the order of the tests is significant
            if is_edu(anno):
                kind = 'edu'
                feats = edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind = 'cdu'
                feats = cdu_feats(anno)
            elif is_relation_instance(anno):
                kind = 'rel'
                feats = rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind = 'dialogue'
                feats = dlg_feats(anno)
            elif is_turn(anno):
                kind = 'turn'
                feats = turn_feats(anno)
            elif is_turn_star(anno):
                kind = 'tstar'
                feats = tstar_feats(anno)
            elif anno.type in ['paragraph', 'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

            # columns common to all kinds of annotations
            row = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # specific features override common columns on key clash ;
            # `update` is used because views of items cannot be
            # concatenated with `+` in Python 3
            row.update(feats)
            rows[kind].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items()
        if row_list
    }
    return res
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Two strategies are used, depending on `gen`:

    - `gen < 3`: the boundaries of each dialogue of `doc_res` are shifted
      with `shift_char`, then snapped to the boundaries of the enclosing
      sequence of dialogues from `doc_src` ;
    - otherwise (situated version): dialogues of `doc_res` are aligned
      with the game turns of `doc_src` (as located by `game_turns`) ;
      dialogues of `doc_res` that overlap the same game turn are merged
      into the first of them, and the others are removed from
      `doc_res.units`.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the same
        span as dialogues from `doc_src`.
    updates : set of updates
        Updates computed by `compute_updates`. Modified in place.
    gen: int
        Generation of annotations included in `doc_src` and the output.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue.
    """
    # NOTE(review): `reduce` is used below as a bare name ; it is a builtin
    # on Python 2 only -- confirm the module imports it from `functools`
    # if Python 3 is targeted. It is also called without an initial value,
    # so it presumably expects at least one dialogue in `doc_res`.
    if gen < 3:
        dlgs_src = sorted(
            [x for x in doc_src.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        dlgs_res = sorted(
            [x for x in doc_res.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        dlgs_tgt_sbeg = np.array(
            [shift_char(x.span.char_start + 1, updates) - 1
             for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (
            np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg, side='right') - 1)
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array(
            [shift_char(x.span.char_end - 1, updates) + 1
             for x in dlgs_res])
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur, beg_nxt) in enumerate(zip(tgt2src_end[:-1],
                                                   tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
        # find source dialogues included in the shifted+expanded target
        # dialogues
        dlgs_src_matched = reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(tgt2src_beg, tgt2src_end)))
        dlgs_src_matched = set(dlgs_src_matched)
        # write the adjusted boundaries back to the dialogues of doc_res
        for dlg_res, adj_start, adj_end in zip(dlgs_res, dlgs_tgt_abeg,
                                               dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`
        # remove all source and target dialogues from updates
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if ((i in dlgs_src_matched and
                 dlg_src in updates.expected_src_only)):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)
    else:
        # situated version: we can rely on game turns
        # 1. get the identifier of the first and last turn of each game turn
        # in _src: these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array([x.features['Identifier']
                                  for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        # a game turn ends right before the next one begins ; the last
        # game turn ends on the last turn
        gturn_idc_end = np.array([i - 1 for i in gturn_idc[1:]] +
                                 [len(turns_src) - 1])
        # ... and finally
        # NOTE(review): these two arrays are not read anywhere below
        gturn_src_tid_beg = turns_src_tid[gturn_idc_beg]
        gturn_src_tid_end = turns_src_tid[gturn_idc_end]
        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array([x.features['Identifier']
                                  for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(
            turns_res_end, dlgs_res_end, side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]
        # 3. map _res dialogues to _src game turns
        # (the turn indices are recomputed here, this time relative to the
        # turns of _src, by looking up the turn identifiers)
        dlgs_res_ti_beg = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (
            np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg, side='right')
            - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]
        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res
        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class
        dlg2grp = [0]
        for i, (gturn_end_cur, gturn_beg_nxt) in enumerate(
                zip(dlg2gturn_end[:-1], dlg2gturn_beg[1:])):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)
        # keep one dialogue for each class of dialogues
        # (class numbers in dlg2grp are non-decreasing, so groupby sees
        # each class as one consecutive run)
        for k, g in itertools.groupby(enumerate(dlg2grp),
                                      key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in g]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                dlg_res = dlgs_res[i]
                doc_res.units.remove(dlg_res)
        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        gturns_matched = reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end)))
        gturns_matched = set(gturns_matched)
        # each dialogue in doc_src is a game turn
        dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                          key=lambda x: x.span)
        # remove all source and target dialogues from updates
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if ((i in gturns_matched and
                 dlg_src in updates.expected_src_only)):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)
    return updates
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Every annotation of every document is turned into a row (a dict) made
    of the columns common to all annotations, completed with the columns
    computed by the feature function that matches its kind.

    Parameters
    ----------
    corpus : mapping
        Mapping from file identifiers (read for their `doc`, `subdoc`,
        `stage` and `annotator` attributes) to documents (read through
        their `annotations()` method).

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus ;
        kinds for which no row was collected are left out.
    """
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu',
                              'rel']}
    # annotation types that are deliberately left out of the dataframes
    ignored_types = ('paragraph', 'Resource', 'Anaphora',
                     'Several_resources', 'Preference')

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ignored_types:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge with copy + update(): `dict.items()` is a view on
            # Python 3 and does not support `+` ; as before, specific
            # features take precedence over the common columns
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}
    return res
def fix_likely_annotation_errors(anno_doc, verbose=1):
    """Clean a document from likely annotation errors due to glozz UX.

    The following are currently considered likely errors:

    - units of span length 0 (deleted),
    - empty dialogue acts enclosed in an EDU with a different text span
      (deleted),
    - schemas with no member (deleted),
    - units that overflow from their turn (span is fixed).

    Parameters
    ----------
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level ; any true value prints a report of what is
        removed or fixed.

    Returns
    -------
    anno_doc : GlozzDocument
        Same document but filtered.

    Raises
    ------
    ValueError
        If an EDU that is not enclosed in exactly one turn does not
        overlap with exactly one turn either.
    """

    def _is_bogus_unit(unit):
        """True if `unit` is empty, or an empty dialogue act in an EDU."""
        if unit.span.char_start == unit.span.char_end:
            return True
        if not is_empty_dialogue_act(unit):
            return False
        for other in anno_doc.units:
            if (other.text_span() != unit.text_span()
                    and is_edu(other)
                    and other.encloses(unit)):
                return True
        return False

    # collect the suspicious annotations of each kind
    bad_units = [unit for unit in anno_doc.units if _is_bogus_unit(unit)]
    bad_schemas = [schm for schm in anno_doc.schemas if not schm.members]
    bad_relations = []  # TODO

    # warn about the ignored annotations
    if verbose:
        report = [
            ('|-> Units', bad_units),
            ('|-> Schemas', bad_schemas),
            ('|-> Relations', bad_relations),
        ]
        if any(found for _, found in report):
            print('Likely errors due to glozz UX')
            print('-----------------------------')
        for title, found in report:
            if found:
                print(title)
                print('\n'.join(' [ ] {}'.format(str(item))
                                for item in found))

    # remove detected errors, one kind of annotation after the other
    for attr, found in (('units', bad_units),
                        ('schemas', bad_schemas),
                        ('relations', bad_relations)):
        rejected = set(found)
        kept = [item for item in getattr(anno_doc, attr)
                if item not in rejected]
        setattr(anno_doc, attr, kept)

    # fix span of units that overflow from their turn
    all_turns = [unit for unit in anno_doc.units if is_turn(unit)]
    all_edus = [unit for unit in anno_doc.units if is_edu(unit)]
    for edu in all_edus:
        nb_enclosing = sum(1 for turn in all_turns if turn.encloses(edu))
        if nb_enclosing == 1:
            # properly nested in its turn: nothing to fix
            continue
        candidates = [turn for turn in all_turns if turn.overlaps(edu)]
        if len(candidates) != 1:
            raise ValueError(
                'No unique overlapping turn for {}'.format(edu))
        turn = candidates[0]
        if turn.overlaps(edu) != edu.text_span():
            # trim the unit to the part that overlaps with its turn
            edu.span = turn.overlaps(edu)
            if verbose:
                print('Fix span of overflowing unit: {}'.format(edu))
    return anno_doc
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    For each turn of `unanno_doc`, the EDUs of `anno_doc` enclosed in the
    span of that turn are compared pairwise, and each overlap is
    tentatively explained as one of:

    - a duplicate of an original segment (almost equal spans): the
      dialogue act annotation is backported to the original segment,
    - a CDU: the overlapping segment is replaced with a new
      'Complex_discourse_unit' schema over the segments it covers,
    - an EDU merge: the merged segments are mapped to the bigger EDU,
    - an EDU split: the split segment is mapped to the first of the
      resulting EDUs.

    Overlaps that remain unexplained are reported as conflicts if
    `verbose`. The mapping is then applied to `anno_doc`: mapped EDUs are
    dropped, the new CDUs are added to its schemas, and the support of
    its relations and schemas is rewritten.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Unannotated document, the starting point of the annotation
        process ; its turns and EDUs serve as reference.
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level ; conflicts are printed if non-zero, inferred
        CDUs and EDU merges if greater than 1.

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.

    Raises
    ------
    ValueError
        If a sequence of segments covers an annotation but is neither
        the support of a relation nor made of original segments only.
    """
    # annotation -> annotation it is replaced with
    anno_map = dict()
    # subset of mappings whose relations get marked for review
    cautious_map = dict()
    # schemas created to replace segments that are inferred to be CDUs
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [x for x in unanno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [x for x in anno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]

        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        # (segments whose id is in u_ids are sorted last, so that the
        # original segment is the second element of each pair)
        dup_items = [(elt_a, elt_b) for elt_a, elt_b
                     in itertools.combinations(
                         sorted(a_edus, key=lambda x: (
                             x.local_id() in u_ids, x.local_id())),
                         2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(),
                                 eps=1) and
                         elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        # NOTE(review): the sort key below is actually
        # (type in DIALOGUE_ACTS, local_id), in ascending order ; the
        # comment above looks stale -- confirm the intended ordering
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b
                        in itertools.combinations(
                            sorted(a_edus, key=lambda x: (
                                x.type in DIALOGUE_ACTS, x.local_id())),
                            2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(anno_map.get(x, x)
                           for rel in anno_doc.relations
                           for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): groupby only groups *consecutive* pairs, and
        # pw_conflicts is ordered on the first element of each pair ;
        # pairs that share the same elt_b but are not adjacent end up in
        # distinct groups -- confirm this cannot happen in practice
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)
            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b) or
                 elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid, sch_units, sch_relas,
                                     sch_schms, sch_stype, sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n {}'.format(
                            new_cdu, elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    # all segments were in `unannotated`: EDU merge
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n {}'.format(
                            elt_b,
                            '\n '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(err_msg.format(
                        ', '.join(str(y) for y in sorted_a),
                        elt_b
                    ))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b and
                            cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids and
                 approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in set(edu_splits.keys())]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        # what remains could not be explained: report
        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join(
                    ' {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                    for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    anno_map_id = {x.local_id(): y.local_id()
                   for x, y in anno_map.items()}
    cautious_map_id = {x.local_id(): y.local_id()
                       for x, y in cautious_map.items()}
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [x for x in anno_doc.units
                      if (not is_edu(x) or
                          x.local_id() not in anno_map_id)]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    objects = {x.local_id(): x
               for x in itertools.chain(anno_doc.units,
                                        anno_doc.relations,
                                        anno_doc.schemas)}
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type
    # * rewrite the support of schemas
    for sch in anno_doc.schemas:
        # sch.id = sch.id
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.type = sch.type  # no-op, listed alongside the other fields
        # sch.features = sch.features
        # sch.metadata = sch.metadata
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. These checks are currently not implemented (see
        the FIXMEs in the body).

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game: turns, dialogues, segments,
        dialogue acts, schemas, schema members, relations, resources,
        preferences (in this order).

    Raises
    ------
    ValueError
        If a discourse segment, a preference or a schema carries
        unexpected features, if a schema has relation members, or if a
        unit is of an unknown kind.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # rows (dicts) are accumulated in lists, then turned into dataframes
    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_rels = []  # relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in doc_val.units:
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(
                        expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get(
                            'Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in set(expected_dlg_features)))
                    warnings.warn(warn_msg)
                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # NB: compare `list(features)`, not `features.keys()`: on
                # Python 3 a keys view never equals a list, so every
                # schema with features would be rejected
                if stage == 'units':
                    if list(anno.features) == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if list(anno.features) == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            rel_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': anno.source.identifier(),
                'target': anno.target.identifier(),
            })
            df_rels.append(rel_dict)

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_rels = pd.DataFrame(df_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn
        # (`.item()` below raises otherwise)
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_rels, df_res, df_pref)
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    All documents of the game are read with `StacReader`, filtered by
    annotator, and their units, schemas and relations are flattened
    into lists of dicts that are finally converted to DataFrames.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        Intended to check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'discourse'
        actually do. The corresponding branches are currently
        placeholders (FIXME), so this flag has no effect yet.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True ;
        the endpoints of relation instances are then replaced with the
        endpoints found in the stripped graph.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; it is silently ignored otherwise.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from
        the "units" stage.

    Raises
    ------
    ValueError
        If an annotation does not match the expectations hard-coded
        below (unknown unit type, segment or preference with features,
        schema with unexpected features or with relation members,
        relation from a stage other than 'units' or 'discourse').
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular ; the
    # counters are shared by all the documents of the game
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
                (sel_annotator != 'metal' and
                 annotator != sel_annotator)):
            continue

        # process annotations in doc
        # UNITS
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(
                        expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get(
                            'Dice_rolling', None),
                    })
                else:
                    # the dialogue is still recorded below, but without
                    # any of its features
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)
                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        # SCHEMAS
        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # NB: compare a *list* of keys ; on Python 3, the view
                # returned by dict.keys() never equals a list, which
                # made every schema with features raise
                feat_names = list(anno.features)
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably be cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): looks like a leftover debugging trap for
                # one specific annotation ; confirm before removing
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)

        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; .item() raises
        # otherwise
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # first occurrence of the segment text in the turn text
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)

    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def _fix_dialogue_boundaries(tcache, doc_ling, doc_situ):
    """Align the dialogues of doc_situ on the boundaries of game turns.

    All the dialogues of `doc_situ` are removed and replaced with:
    - one dialogue per group of _ling dialogues, where dialogues that
      overlap the same game turn are merged and the boundaries are
      extended to the boundaries of the enclosing game turns,
    - one new, feature-less dialogue per game turn that overlaps no
      _ling dialogue.

    Parameters
    ----------
    tcache: TimestampCache
        Timestamp cache to generate unit identifiers for new dialogues.
    doc_ling: GlozzDocument
        Linguistic version of the game.
    doc_situ: GlozzDocument
        Situated version of the game ; modified in place.

    Returns
    -------
    doc_situ: GlozzDocument
        Fixed version of doc_situ (the same object as the argument).
    """
    doc_key = doc_situ.origin

    # 1. get the identifier of the first and last turn of each game turn
    # in _situ: these turns and those in between must end up in the same
    # dialogue
    turns_situ = sorted((x for x in doc_situ.units if is_turn(x)),
                        key=lambda x: x.span)
    turns_situ_tid = np.array([x.features['Identifier']
                               for x in turns_situ])
    turns_situ_beg = np.array([x.span.char_start for x in turns_situ])
    turns_situ_end = np.array([x.span.char_end for x in turns_situ])
    # * locate game turns (index of first and last turn) ; a game turn
    # ends right before the next one begins, the last one ends with the
    # last turn
    gturn_idc = game_turns(doc_situ, turns_situ, gen=3)
    gturn_idc_beg = np.array(gturn_idc)
    gturn_idc_end = np.array(
        [i - 1 for i in gturn_idc[1:]] + [len(turns_situ) - 1])

    # 2. get the identifier of the first and last turn of each dialogue in
    # _ling: these turns and those in between must end up in the same
    # dialogue
    turns_ling = sorted((x for x in doc_ling.units if is_turn(x)),
                        key=lambda x: x.span)
    # DIRTY special processing for pilot02_01
    if doc_key.doc == 'pilot02' and doc_key.subdoc == '01':
        # ignore turns 26-27 that were moved down from _01 to _02
        turns_ling = turns_ling[:-2]
    turns_ling_tid = np.array([x.features['Identifier']
                               for x in turns_ling])
    turns_ling_beg = np.array([x.span.char_start for x in turns_ling])
    turns_ling_end = np.array([x.span.char_end for x in turns_ling])
    # align dialogue spans with turn spans
    dlgs_ling = sorted((x for x in doc_ling.units if is_dialogue(x)),
                       key=lambda x: x.span)
    # DIRTY
    if doc_key.doc == 'pilot02' and doc_key.subdoc == '01':
        # turns 26-27 are in the last dialogue in _01, in _ling
        dlgs_ling = dlgs_ling[:-1]
    dlgs_ling_beg = np.array([x.span.char_start for x in dlgs_ling])
    dlgs_ling_end = np.array([x.span.char_end for x in dlgs_ling])
    dlgs_ling_ti_beg = np.searchsorted(turns_ling_beg, dlgs_ling_beg)
    dlgs_ling_ti_end = np.searchsorted(turns_ling_end, dlgs_ling_end,
                                       side='right') - 1
    # ... and finally
    dlgs_ling_tid_beg = turns_ling_tid[dlgs_ling_ti_beg]
    dlgs_ling_tid_end = turns_ling_tid[dlgs_ling_ti_end]
    # print('dialogues in _ling', zip(dlgs_ling_tid_beg, dlgs_ling_tid_end))

    # 3. map _ling dialogues to _situ game turns
    # * locate the first and last turn of each _ling dialogue in the
    # list of turns in _situ
    # NB: we don't need indices in the list of turns from _ling anymore
    # hence it is safe to overwrite dlgs_ling_ti_{beg,end}
    # (the list of _situ turn ids is built once, not for each lookup)
    situ_tids = list(turns_situ_tid)
    dlgs_ling_ti_beg = np.array(
        [situ_tids.index(x) for x in dlgs_ling_tid_beg])
    dlgs_ling_ti_end = np.array(
        [situ_tids.index(x) for x in dlgs_ling_tid_end])
    # print('game turns (turn_idx)', zip(gturn_idc_beg, gturn_idc_end))
    # print('core dlgs (turn_idx)', zip(dlgs_ling_ti_beg, dlgs_ling_ti_end))

    # * align the beginning (resp. end) indices of game turns and _ling
    # dialogues
    dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_ling_ti_beg,
                                     side='right') - 1)
    dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_ling_ti_end)
    # print('map from dlg to gturn', zip(dlg2gturn_beg, dlg2gturn_end))
    # * turn indices of the adjusted beginning and end of the _ling
    # dialogues
    # initialize along the boundaries of game turns
    dlg_ling_situ_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
    dlg_ling_situ_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

    # 4. make dialogue boundaries coincide with game turn boundaries,
    # which occasionally implies merging dialogues from _ling
    # * compute a partition on dialogues such that any pair of
    # dialogues overlapping a given game turn are in the same
    # class ; no class at all if there is no _ling dialogue
    dlg2grp = [0] if dlgs_ling else []
    for gturn_end_cur, gturn_beg_nxt in zip(dlg2gturn_end[:-1],
                                            dlg2gturn_beg[1:]):
        if gturn_beg_nxt <= gturn_end_cur:
            # two _ling dialogues overlap a single game turn:
            # put in the same class (to merge dialogues)
            dlg2grp.append(dlg2grp[-1])
        else:
            dlg2grp.append(dlg2grp[-1] + 1)

    # remove all dialogues from the units in doc_situ,
    # they will be replaced with (hopefully) clean ones
    dlgs_situ = sorted((x for x in doc_situ.units if is_dialogue(x)),
                       key=lambda x: x.span)
    for dlg_situ in dlgs_situ:
        doc_situ.units.remove(dlg_situ)

    # create one dialogue for each class of dialogues
    for _, grp in itertools.groupby(enumerate(dlg2grp),
                                    key=lambda x: x[1]):
        dlg_idc_merged = [x[0] for x in grp]
        # adjust boundaries of the first dialogue of the group
        # index of first and last dialogues
        di_beg = dlg_idc_merged[0]
        di_end = dlg_idc_merged[-1]
        # index of first and last turns of these dialogues
        ti_beg = dlg_ling_situ_abeg[di_beg]
        ti_end = dlg_ling_situ_aend[di_end]
        # create dialogue, use the 1st _ling dialogue as basis then
        # customize
        dlg0 = dlgs_ling[di_beg]
        new_dlg = copy.deepcopy(dlg0)
        new_dlg.origin = doc_key
        new_dlg.span.char_start = turns_situ_beg[ti_beg]
        new_dlg.span.char_end = turns_situ_end[ti_end]
        dlgs_ling_merged = [dlgs_ling[i] for i in dlg_idc_merged]
        for feat in ['Trades', 'Gets', 'Dice_rolling']:
            new_dlg.features[feat] = _concatenate_features(
                dlgs_ling_merged, feat)
        # add the new dialogue to doc_situ
        doc_situ.units.append(new_dlg)

    # create a new dialogue for each unmatched (non-overlapping) game
    # turn
    # * indices of the game turns covered by at least one _ling
    # dialogue ; accumulating into a set also works when there is no
    # _ling dialogue at all
    gturns_matched = set()
    for gt_beg, gt_end in zip(dlg2gturn_beg, dlg2gturn_end):
        gturns_matched.update(range(int(gt_beg), int(gt_end) + 1))

    for i, (gturn_idx_beg, gturn_idx_end) in enumerate(zip(
            gturn_idc_beg, gturn_idc_end)):
        if i not in gturns_matched:
            new_dlg_span = Span(turns_situ_beg[gturn_idx_beg],
                                turns_situ_end[gturn_idx_end])
            # UGLY this works just like split_dialogue:
            # create a new dialogue by copying an existing dialogue,
            # re-assign it an annotation id and span using a timestamp
            # cache, then erase all features
            # NB: requires at least one dialogue in the original doc_situ
            new_dlg = copy.deepcopy(dlgs_situ[0])
            _set(tcache, new_dlg_span, new_dlg)
            new_dlg.features = {}
            # ... "et voila": add this dialogue to the document
            doc_situ.units.append(new_dlg)
    # TODO restore dialogue features from the game events?

    return doc_situ