def relation_dict(doc, quiet=False):
    """Return the relation instances of a document as a dictionary
    mapping (source id, target id) pairs to relation labels.

    If there is more than one relation between a pair of EDUs we pick
    one of them arbitrarily and ignore the other.

    EDUs that are never the target of any relation additionally get a
    fake (ROOT, edu id) -> ROOT entry.
    """
    relations = {}
    for rel in doc.relations:
        if not is_relation_instance(rel):
            # might be the odd Anaphora link lying around
            continue
        pair = rel.source.identifier(), rel.target.identifier()
        if pair not in relations:
            relations[pair] = rel.type
        elif not quiet:
            print(('Ignoring {type1} relation instance ({edu1} -> {edu2}); '
                   'another of type {type2} already exists'
                   '').format(type1=rel.type,
                              edu1=pair[0],
                              edu2=pair[1],
                              type2=relations[pair]),
                  file=sys.stderr)
    # generate fake root links
    # collect the identifiers of all relation targets once, instead of
    # rescanning doc.relations for every EDU (was O(units * relations))
    # NOTE(review): assumes annotation identifiers are unique within the
    # doc, so identifier membership matches the previous
    # `rel.target == anno` object comparison -- confirm against educe's
    # identifier() contract
    target_ids = set(rel.target.identifier() for rel in doc.relations)
    for anno in doc.units:
        if not educe.stac.is_edu(anno):
            continue
        if anno.identifier() not in target_ids:
            key = ROOT, anno.identifier()
            relations[key] = ROOT
    return relations
def relation_dict(doc, quiet=False):
    """Map (source id, target id) pairs to relation labels for ``doc``.

    When several relations link the same pair of EDUs, one of them is
    kept arbitrarily and the others are dropped (with a warning on
    stderr unless ``quiet``).  EDUs that no relation points to get a
    fake (ROOT, edu id) -> ROOT entry.
    """
    rel_labels = {}
    for rel in doc.relations:
        # skip e.g. stray Anaphora links
        if not is_relation_instance(rel):
            continue
        id_pair = (rel.source.identifier(), rel.target.identifier())
        if id_pair in rel_labels:
            if not quiet:
                msg = ('Ignoring {type1} relation instance '
                       '({edu1} -> {edu2}); '
                       'another of type {type2} already exists'
                       '').format(type1=rel.type,
                                  edu1=id_pair[0],
                                  edu2=id_pair[1],
                                  type2=rel_labels[id_pair])
                print(msg, file=sys.stderr)
        else:
            rel_labels[id_pair] = rel.type
    # generate fake root links
    for anno in doc.units:
        if not educe.stac.is_edu(anno):
            continue
        has_incoming = False
        for rel in doc.relations:
            if rel.target == anno:
                has_incoming = True
                break
        if not has_incoming:
            rel_labels[(ROOT, anno.identifier())] = ROOT
    return rel_labels
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
                (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling',
                                                        None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in set(expected_dlg_features)))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature
            if anno.features:
                if stage == 'units':
                    # list() around keys(): on Python 3 a dict view never
                    # compares equal to a list, so the bare
                    # `anno.features.keys() == ['Operator']` was always
                    # False and wrongly fell through to the error branch
                    if list(anno.features.keys()) == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    # list() around keys(): same Python 3 fix as above
                    if list(anno.features.keys()) == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope',
                                               None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn
        # compute the beg and end (char) positions of the segment in the
        # turn so we can match between the situated and linguistic
        # versions when the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
    """
    rows = {
        anno_type: list()
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }
    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # NB: dict(base, **extra) replaces the Python 2-only
            # dict(base.items() + extra.items()): dict views do not
            # support + on Python 3
            if is_edu(anno):
                row = dict(common_cols, **edu_feats(doc, ctx, anno))
                rows['edu'].append(row)
            elif is_cdu(anno):
                row = dict(common_cols, **cdu_feats(anno))
                rows['cdu'].append(row)
            elif is_relation_instance(anno):
                row = dict(common_cols, **rel_feats(doc, ctx, anno))
                rows['rel'].append(row)
            elif is_dialogue(anno):
                row = dict(common_cols, **dlg_feats(anno))
                rows['dialogue'].append(row)
            elif is_turn(anno):
                row = dict(common_cols, **turn_feats(anno))
                rows['turn'].append(row)
            elif is_turn_star(anno):
                row = dict(common_cols, **tstar_feats(anno))
                rows['tstar'].append(row)
            elif anno.type in ['paragraph',
                               'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
    """
    rows = {anno_type: list()
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}
    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # NB: merge with dict(base, **extra) instead of the
            # Python 2-only dict(base.items() + extra.items()):
            # dict views cannot be concatenated with + on Python 3
            if is_edu(anno):
                row = dict(common_cols, **edu_feats(doc, ctx, anno))
                rows['edu'].append(row)
            elif is_cdu(anno):
                row = dict(common_cols, **cdu_feats(anno))
                rows['cdu'].append(row)
            elif is_relation_instance(anno):
                row = dict(common_cols, **rel_feats(doc, ctx, anno))
                rows['rel'].append(row)
            elif is_dialogue(anno):
                row = dict(common_cols, **dlg_feats(anno))
                rows['dialogue'].append(row)
            elif is_turn(anno):
                row = dict(common_cols, **turn_feats(anno))
                rows['turn'].append(row)
            elif is_turn_star(anno):
                row = dict(common_cols, **tstar_feats(anno))
                rows['tstar'].append(row)
            elif anno.type in ['paragraph', 'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
                (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(
                        anno.features.keys()) == ['Addressee',
                                                  'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(['Dice_rolling', 'Gets',
                                             'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling',
                                                        None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in set(expected_dlg_features)))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature
            if anno.features:
                if stage == 'units':
                    # list() around keys(): on Python 3 a dict view never
                    # compares equal to a list, so the bare
                    # `anno.features.keys() == ['Operator']` was always
                    # False and wrongly raised for well-formed schemas
                    if list(anno.features.keys()) == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    # list() around keys(): same Python 3 fix as above
                    if list(anno.features.keys()) == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope',
                                               None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn
        # compute the beg and end (char) positions of the segment in the
        # turn so we can match between the situated and linguistic
        # versions when the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)