Beispiel #1
0
def relation_dict(doc, quiet=False):
    """
    Map (source id, target id) pairs of a document to relation labels.

    Only annotations accepted by `is_relation_instance` are taken into
    account. If several relation instances link the same ordered pair,
    the first one encountered is kept and the others are dropped; each
    dropped instance is reported on stderr unless `quiet` is true.

    Every EDU that is not the target of any relation of the document
    also gets a fake root link, stored as `(ROOT, edu id) -> ROOT`.
    """
    duplicate_msg = ('Ignoring {type1} relation instance ({edu1} -> {edu2}); '
                     'another of type {type2} already exists')

    labels = {}
    for link in doc.relations:
        if not is_relation_instance(link):
            # e.g. a stray Anaphora link
            continue
        src_id = link.source.identifier()
        tgt_id = link.target.identifier()
        endpoints = (src_id, tgt_id)
        if endpoints in labels:
            # a label is already recorded for this pair: keep it
            if not quiet:
                print(duplicate_msg.format(type1=link.type,
                                           edu1=src_id,
                                           edu2=tgt_id,
                                           type2=labels[endpoints]),
                      file=sys.stderr)
            continue
        labels[endpoints] = link.type

    # fake root links for EDUs that no relation points to
    for unit in doc.units:
        if not educe.stac.is_edu(unit):
            continue
        if any(link.target == unit for link in doc.relations):
            continue
        labels[(ROOT, unit.identifier())] = ROOT
    return labels
Beispiel #2
0
def relation_dict(doc, quiet=False):
    """
    Build a dictionary from (source id, target id) pairs to relation labels.

    Annotations rejected by `is_relation_instance` are skipped. When a
    pair is linked by more than one relation instance, only the first
    one seen is recorded; the later ones are ignored and, unless `quiet`
    is true, a message about each ignored instance is written to stderr.

    EDUs that no relation of the document targets are attached to a fake
    root through an extra `(ROOT, edu id) -> ROOT` entry.
    """
    duplicate_msg = ('Ignoring {type1} relation instance ({edu1} -> {edu2}); '
                     'another of type {type2} already exists')

    labels = {}
    for link in doc.relations:
        if not is_relation_instance(link):
            # e.g. a stray Anaphora link
            continue
        src_id = link.source.identifier()
        tgt_id = link.target.identifier()
        endpoints = (src_id, tgt_id)
        if endpoints in labels:
            # first label wins; optionally report the one we drop
            if not quiet:
                print(duplicate_msg.format(type1=link.type,
                                           edu1=src_id,
                                           edu2=tgt_id,
                                           type2=labels[endpoints]),
                      file=sys.stderr)
            continue
        labels[endpoints] = link.type

    # fake root links for EDUs that are never a relation target
    for unit in doc.units:
        if not educe.stac.is_edu(unit):
            continue
        if any(link.target == unit for link in doc.relations):
            continue
        labels[(ROOT, unit.identifier())] = ROOT
    return labels
Beispiel #3
0
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        Intended to check that annotations from the other stages have
        a strict equivalent in 'discourse'. The corresponding checks
        are not implemented yet (FIXME), so this flag currently has no
        effect.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True,
        and use the resulting endpoints for relation instances.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; it is silently ignored otherwise.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from
        the 'units' stage.

    Raises
    ------
    ValueError
        If an annotation does not match the assumptions made here:
        discourse segment or preference with features, unit of an
        unknown kind, schema with unexpected features or with relation
        members, relation from a stage other than 'units' or
        'discourse'.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # rows of the future dataframes, accumulated as lists of dicts
    turn_rows = []  # turns
    seg_rows = []  # segments: EDUs, EEUs
    dlg_rows = []  # dialogues
    schm_rows = []  # schemas: CDUs
    schm_mbr_rows = []  # schema members
    disc_rel_rows = []  # discourse relations
    act_rows = []  # dialogue acts
    res_rows = []  # resources
    pref_rows = []  # preferences
    unit_rel_rows = []  # relations from the "units" stage (anaphora)

    # features a dialogue annotation is allowed to carry
    expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if sel_annotator == 'metal':
            if annotator not in ('BRONZE', 'SILVER', 'GOLD'):
                continue
        elif annotator != sel_annotator:
            continue

        # UNITS
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    turn_rows.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments ; the counters are
                    # never reset, so they run across all docs read
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    seg_rows.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in
                    # the segments dataframe
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    act_rows.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    # the dialogue is kept, but without its features
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    dlg_rows.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                res_rows.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                pref_rows.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')

        # SCHEMAS
        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get(
                    'lastModificationDate', None),
            }
            # assumption: no feature, except for a single tolerated one
            # that depends on the stage
            if anno.features:
                # materialize the keys: on Python 3, a keys view never
                # compares equal to a list, so comparing the view
                # directly would reject every schema with features
                feat_names = list(anno.features.keys())
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably be cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
            schm_rows.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                schm_mbr_rows.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            # strip CDUs on a copy of the graph
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): hard-coded trap on one specific
                # annotation id, looks like a debugging leftover ;
                # kept as is to preserve behaviour
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature: warn but keep the relation
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                disc_rel_rows.append(rel_dict)
            elif stage == 'units':
                unit_rel_rows.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(turn_rows, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(dlg_rows, columns=DLG_COLS)
    df_segs = pd.DataFrame(seg_rows, columns=SEG_COLS)
    df_acts = pd.DataFrame(act_rows, columns=ACT_COLS)
    df_schms = pd.DataFrame(schm_rows, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(schm_mbr_rows, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(disc_rel_rows, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(unit_rel_rows, columns=REL_COLS)
    df_res = pd.DataFrame(res_rows, columns=RES_COLS)
    df_pref = pd.DataFrame(pref_rows, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        # turns of the same doc and subdoc whose span covers the segment
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; .item() raises
        # otherwise
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # position of the first occurrence of the segment text in the
        # turn text (-1 if it does not occur)
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
Beispiel #4
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each annotation of each document yields at most one row, made of
    columns common to all annotations (id, doc, subdoc, stage,
    annotator, type) plus the columns computed by the feature function
    for its kind. Annotations of type 'paragraph', 'Resource',
    'Anaphora', 'Several_resources' and 'Preference' are skipped
    silently ; any other unsupported annotation is skipped with a
    warning printed on stdout.

    Parameters
    ----------
    corpus: dict
        Mapping from file ids (with `doc`, `subdoc`, `stage` and
        `annotator` attributes) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'. Kinds
        without any row are left out.
    """
    # annotation types deliberately left out of the dataframes
    ignored_types = ('paragraph', 'Resource', 'Anaphora',
                     'Several_resources', 'Preference')
    rows = {
        kind: []
        for kind in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # pick the kind of row and its specific columns ; the order
            # of the tests is significant
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ignored_types:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge by copy + update: concatenating the two .items()
            # with '+' only works on Python 2, where they are lists ;
            # specific columns still override common ones on clash
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {
        kind: pd.DataFrame(data=row_list)
        for kind, row_list in rows.items() if row_list
    }

    return res
Beispiel #5
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each annotation of each document yields at most one row, made of
    columns common to all annotations (id, doc, subdoc, stage,
    annotator, type) plus the columns computed by the feature function
    for its kind. Annotations of type 'paragraph', 'Resource',
    'Anaphora', 'Several_resources' and 'Preference' are skipped
    silently ; any other unsupported annotation is skipped with a
    warning printed on stdout.

    Parameters
    ----------
    corpus: dict
        Mapping from file ids (with `doc`, `subdoc`, `stage` and
        `annotator` attributes) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'. Kinds
        without any row are left out.
    """
    # annotation types deliberately left out of the dataframes
    ignored_types = ('paragraph', 'Resource', 'Anaphora',
                     'Several_resources', 'Preference')
    rows = {kind: []
            for kind in ['edu', 'turn', 'tstar', 'dialogue',
                         'cdu', 'rel']}

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # pick the kind of row and its specific columns ; the order
            # of the tests is significant
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ignored_types:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge by copy + update: concatenating the two .items()
            # with '+' only works on Python 2, where they are lists ;
            # specific columns still override common ones on clash
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {kind: pd.DataFrame(data=row_list)
           for kind, row_list in rows.items()
           if row_list}

    return res
Beispiel #6
0
def read_game_as_dataframes(game_folder,
                            sel_annotator=None,
                            thorough=True,
                            strip_cdus=False,
                            attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in ('BRONZE', 'SILVER', 'GOLD'))
                or (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(
                        anno.features.keys()) == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets':
                        anno.features.get('Gets', None),
                        'trades':
                        anno.features.get('Trades', None),
                        'dice_rolls':
                        anno.features.get('Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in set(expected_dlg_features)))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature
            if anno.features:
                if stage == 'units':
                    if anno.features.keys() == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # it should probably be cleaned out
                    if anno.features.keys() == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope':
                anno.features.get('Argument_scope', None),  # req
                'comments':
                anno.features.get('Comments', None),  # opt
                # endpoints
                'source':
                gid_src,
                'target':
                gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Find the turn that encloses a segment and locate the segment in it.

        Parameters
        ----------
        seg : mapping-like row of segment data
            Read for its 'doc', 'subdoc', 'span_beg', 'span_end' and
            'text' entries.

        Returns
        -------
        pd.Series
            'turn_id' of the enclosing turn, plus 'turn_span_beg' and
            'turn_span_end': the character offsets, in the turn text, of
            the first occurrence of the segment text. If the segment text
            does not occur in the turn text, `str.find` yields -1 and both
            offsets are derived from that value.

        Notes
        -----
        Exactly one candidate turn is expected; `.item()` is relied upon
        to fail otherwise.
        """
        seg_doc = seg['doc']
        seg_subdoc = seg['subdoc']
        beg = seg['span_beg']
        end = seg['span_end']
        # a turn qualifies if its span covers the segment span...
        covers_seg = ((df_turns['span_beg'] <= beg)
                      & (df_turns['span_end'] >= end))
        # ... and it belongs to the same doc and subdoc as the segment
        same_subdoc = ((df_turns['doc'] == seg_doc)
                       & (df_turns['subdoc'] == seg_subdoc))
        enclosing = df_turns[covers_seg & same_subdoc]
        # offsets of the segment relative to the turn text, so that the
        # situated and linguistic versions can be matched even when the
        # segmentation has changed
        enclosing_text = enclosing['text'].item()
        seg_text = seg['text']
        rel_beg = enclosing_text.find(seg_text)
        rel_end = rel_beg + len(seg_text)
        return pd.Series({
            'turn_id': enclosing['turn_id'].item(),
            'turn_span_beg': rel_beg,
            'turn_span_end': rel_end,
        })

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)