Python is_edu Examples, educe.stac.annotation.is_edu Python Examples

Example #1

0

Show file

File: fusion.py Project: irit-melodi/educe

 def __init__(self, doc, discourse_anno, unit_anno):
     """Build an EDU from its discourse-layer and units-layer annotations.

     Parameters
     ----------
     doc : ?
         Kept as-is in `self._doc` (type not visible from here).
     discourse_anno : ?
         Annotation from the discourse layer; it supplies the local id,
         text span, features, metadata and origin handed to the parent
         constructor.
     unit_anno : ?
         Annotation from the units layer; when falsy, the discourse
         annotation stands in for it when choosing the type.
     """
     # keep the raw inputs around
     self._doc = doc
     self._anno = discourse_anno
     self._unit_anno = unit_anno

     # the type is read from the units layer only if that annotation is
     # an EDU; otherwise the discourse annotation provides it
     type_source = unit_anno or discourse_anno
     if is_edu(type_source):
         unit_type = type_source.type
     else:
         unit_type = discourse_anno.type

     super(EDU, self).__init__(
         discourse_anno.local_id(),
         discourse_anno.text_span(),
         unit_type,
         discourse_anno.features,
         discourse_anno.metadata,
         discourse_anno.origin)

     # context attributes, to be fleshed out later
     self.turn = None
     self.tstar = None
     self.turn_edus = None

     self.dialogue = None
     self.dialogue_turns = None

     self.doc_turns = None
     self.tokens = None

Example #2

0

Show file

File: weave.py Project: eipiplusun/educe

    def is_various(annotation):
        """Tell whether `annotation` is none of {edu, turn, paragraph, dialogue}.

        It seems to capture only Resources (to be confirmed).
        """
        # De Morgan form of "not (edu or turn or paragraph or dialogue)";
        # the predicates are still tried in the same order and evaluation
        # stops at the first one that matches
        return (not is_edu(annotation) and
                not is_turn(annotation) and
                not is_paragraph(annotation) and
                not is_dialogue(annotation))

Example #3

0

Show file

File: pd_count.py Project: irit-melodi/educe

def cdu_feats(anno):
    """Get CDU features that are not immediate.

    Parameters
    ----------
    anno: Schema
        The schema that codes this CDU in the glozz format

    Returns
    -------
    res: dict(string, int)
        Features on this CDU, currently
        'members' (number of immediate members of this CDU),
        'members_cdu' (number of CDUs immediately embedded in this CDU),
        'spanned_cdus' (total number of CDUs recursively embedded in this
        CDU),
        'spanned_edus' (total number of EDUs spanned by this CDU),
        'depth' (maximal degree of CDU nesting in this CDU).

    Raises
    ------
    ValueError
        If a (possibly nested) member is neither an EDU nor a CDU.
    """
    nb_members = len(anno.members)
    nb_cdus_imm = sum(1 for m in anno.members if is_cdu(m))

    nb_edus_tot = 0
    nb_cdus_tot = 0
    max_lvl = 0

    # iterative traversal of the nested CDUs ; each stack entry is
    # (nesting level of the CDU, CDU)
    cdus_to_expand = [(0, anno)]
    while cdus_to_expand:
        lvl, cur_cdu = cdus_to_expand.pop()
        mem_lvl = lvl + 1
        for member in cur_cdu.members:
            if is_edu(member):
                nb_edus_tot += 1
            elif is_cdu(member):
                nb_cdus_tot += 1
                max_lvl = max(max_lvl, mem_lvl)
                cdus_to_expand.append((mem_lvl, member))
            else:
                raise ValueError('Unexpected type for a CDU member')

    # TODO new features:
    # * nb_gaps: CDUs spans can be discontinuous
    # * gap_max_len: max len of a gap (in #EDUs)
    # * over_nb_turns: nb of turns this CDU (partly) spans over
    # * over_nb_tstars: nb of tstars this CDU (partly) spans over

    res = {
        'members': nb_members,
        'members_cdu': nb_cdus_imm,
        'spanned_cdus': nb_cdus_tot,
        'spanned_edus': nb_edus_tot,
        'depth': max_lvl,
    }

    return res

Example #4

0

Show file

File: split_annotated.py Project: eipiplusun/irit-stac

def is_empty_dialogue_act(anno):
    """Return True if anno is an empty dialogue act.

    This is defined as an annotation that satisfies all of:
    - it is an EDU (`is_edu`),
    - its type is one of `DIALOGUE_ACTS`,
    - it has no addressee (`addressees(anno)` is None),
    - its 'Surface_act' feature has the value 'Please choose...'
      (presumably the annotation tool's "not set" placeholder).

    The span length is not examined here; callers that also want to
    catch zero-length units need to test for that separately.

    Parameters
    ----------
    anno : Annotation
        Annotation to be tested

    Returns
    -------
    res : boolean
        True if `anno` is an empty dialogue act.
    """
    return (is_edu(anno) and
            anno.type in DIALOGUE_ACTS and
            addressees(anno) is None and
            anno.features.get('Surface_act') == 'Please choose...')

Example #5

0

Show file

File: fusion.py Project: eipiplusun/educe

 def __init__(self, doc,
              discourse_anno,
              unit_anno):
     """Create an EDU backed by a discourse and a units annotation.

     `discourse_anno` provides the local id, text span, features,
     metadata and origin given to the parent constructor. The type is
     taken from `unit_anno` (or from `discourse_anno` if `unit_anno` is
     falsy) when that annotation is an EDU, and from `discourse_anno`
     otherwise.
     """
     self._doc = doc
     self._anno = discourse_anno
     self._unit_anno = unit_anno

     # pick the annotation whose type is trusted
     candidate = unit_anno or discourse_anno
     if is_edu(candidate):
         unit_type = candidate.type
     else:
         unit_type = discourse_anno.type

     parent_args = (discourse_anno.local_id(),
                    discourse_anno.text_span(),
                    unit_type,
                    discourse_anno.features,
                    discourse_anno.metadata,
                    discourse_anno.origin)
     super(EDU, self).__init__(*parent_args)

     # to be fleshed out
     self.turn = None
     self.tstar = None
     self.turn_edus = None
     self.dialogue = None
     self.dialogue_turns = None
     self.doc_turns = None
     self.tokens = None

Example #6

0

Show file

File: situated_stats.py Project: irit-melodi/educe

def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. NB: the corresponding checks are still FIXME
        placeholders, so this flag currently has no effect.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True (it is silently ignored otherwise).

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from the
        "units" stage.

    Raises
    ------
    ValueError
        If an annotation does not match the expectations hard-coded
        below (unknown kind of unit, unexpected features on a segment,
        preference or schema, schema with relation members, relation
        from a stage other than 'units' or 'discourse'), or if a segment
        cannot be matched to exactly one turn.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    # features a dialogue is allowed to carry
    expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError('Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                # NB: the keys are materialized as a list before the
                # comparison ; on Python 3, `d.keys() == [...]` is always
                # False because a keys view never equals a list
                if stage == 'units':
                    if list(anno.features) == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if list(anno.features) == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): looks like a leftover debugging trap for
                # one specific relation ; kept as-is, confirm whether it
                # can be removed
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; .item() raises
        # ValueError otherwise
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # derive the position from the document offsets rather than from
        # a text search: str.find() returns the *first* occurrence, which
        # is wrong when the segment text also appears earlier in the turn
        turn_span_beg = int(seg_beg - cand_turns['span_beg'].item())
        turn_span_end = turn_span_beg + len(seg_text)
        if turn_text[turn_span_beg:turn_span_end] != seg_text:
            # offsets and texts do not line up (not expected as both come
            # from the same document) ; fall back to the text search
            turn_span_beg = turn_text.find(seg_text)
            turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)

Example #7

0

Show file

File: pd_count.py Project: irit-melodi/educe

def rel_feats(doc, ctx, anno, debug=False):
    """Compute features that describe a relation instance.

    Parameters
    ----------
    doc : GlozzDocument
        Surrounding document

    ctx :
        Indexed by EDU ; each entry is read for its `tstar` attribute.

    anno :
        The relation ; its `source` and `target` are examined.

    debug : boolean, defaults to False
        If True, print the EDUs involved when source and target share
        EDUs or are interwoven.

    Returns
    -------
    res : dict(string, string?)
        Features for this relation: 'src_type', 'tgt_type', 'direction',
        'edu_dist', 'tstar_dist'. The dict is empty if an endpoint is
        neither a CDU nor an EDU, or if source and target share an EDU.
    """
    def _expand(endpoint):
        """Return (type label, EDUs sorted by span), None if not a DU."""
        if is_cdu(endpoint):
            return 'CDU', sorted(endpoint.terminals(), key=lambda e: e.span)
        if is_edu(endpoint):
            return 'EDU', [endpoint]
        return None

    def _show(edus):
        """Print one tab-indented line with the span and text of each EDU."""
        print('\t' + ', '.join('[{}] {}'.format(str(e.span),
                                                doc.text(e.span))
                               for e in edus))

    # all EDUs of the document, ordered by span
    doc_edus = [u for u in doc.units if is_edu(u)]
    doc_edus.sort(key=lambda u: u.span)
    # TODO doc_tstars = ...

    # non-DU endpoints are covered by stac-check
    src_info = _expand(anno.source)
    if src_info is None:
        return {}
    src_type, src_edus = src_info

    tgt_info = _expand(anno.target)
    if tgt_info is None:
        return {}
    tgt_type, tgt_edus = tgt_info

    # positions of the source and target EDUs in the document
    src_idc = [doc_edus.index(e) for e in src_edus]
    tgt_idc = [doc_edus.index(e) for e in tgt_edus]

    # source and target share EDUs: error case covered at least
    # partially by stac-check, either as "bizarre relation instance"
    # or as "CDU punctures"
    if set(src_idc) & set(tgt_idc):
        if debug:
            print('* {}: {} {}'.format(doc.origin, 'messed up', anno.type))
            _show(src_edus)
            _show(tgt_edus)
        return {}

    if src_idc[-1] < tgt_idc[0]:
        # src ... tgt
        direction = 'right'
        interv_edus = doc_edus[(src_idc[-1] + 1):tgt_idc[0]]
    elif tgt_idc[-1] < src_idc[0]:
        # tgt ... src
        direction = 'left'
        interv_edus = doc_edus[(tgt_idc[-1] + 1):src_idc[0]]
    else:
        # tgt and src are interwoven
        direction = 'interwoven'  # FIXME
        interv_edus = []
        covered = set(src_idc).union(tgt_idc)
        lowest = min(covered)
        highest = max(covered)
        gap_edus = [e for i, e in enumerate(doc_edus)
                    if i not in covered and lowest < i < highest]
        if debug:
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _show(src_edus)
            _show(tgt_edus)
            _show(gap_edus)
    edu_dist = len(interv_edus) + 1

    # turn-stars distance
    src_tstars = set([ctx[e].tstar for e in src_edus])
    tgt_tstars = set([ctx[e].tstar for e in tgt_edus])
    interv_tstars = set([ctx[e].tstar for e in interv_edus])
    # turn-stars from the interval that overlap neither src nor tgt
    skipped_tstars = interv_tstars - src_tstars - tgt_tstars
    # tstar_dist is 0 if (part of) src and tgt belong to the same tstar,
    # len(skipped_tstars) + 1 otherwise
    if src_tstars & tgt_tstars:
        tstar_dist = 0
    else:
        tstar_dist = len(skipped_tstars) + 1

    return {
        'src_type': src_type,
        'tgt_type': tgt_type,
        'direction': direction,
        'edu_dist': edu_dist,
        'tstar_dist': tstar_dist,
    }

Example #8

0

Show file

File: pd_count.py Project: irit-melodi/educe

def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus : dict
        Maps each file id (read for its `doc`, `subdoc`, `stage` and
        `annotator` attributes) to the corresponding document.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
        Possible keys are 'edu', 'turn', 'tstar', 'dialogue', 'cdu' and
        'rel' ; kinds with no annotation are left out.
    """
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}

    def _add_row(anno_type, common_cols, feats):
        """Store a row made of the common columns plus specific features.

        Copy-then-update works on Python 2 and 3, whereas concatenating
        the results of `items()` raises TypeError on Python 3 ; as
        before, specific features win over common columns on a key clash.
        """
        row = dict(common_cols)
        row.update(feats)
        rows[anno_type].append(row)

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                _add_row('edu', common_cols, edu_feats(doc, ctx, anno))
            elif is_cdu(anno):
                _add_row('cdu', common_cols, cdu_feats(anno))
            elif is_relation_instance(anno):
                _add_row('rel', common_cols, rel_feats(doc, ctx, anno))
            elif is_dialogue(anno):
                _add_row('dialogue', common_cols, dlg_feats(anno))
            elif is_turn(anno):
                _add_row('turn', common_cols, turn_feats(anno))
            elif is_turn_star(anno):
                _add_row('tstar', common_cols, tstar_feats(anno))
            elif anno.type in ['paragraph',
                               'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                # unsupported annotations are reported and skipped
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res

Example #9

0

Show file

File: split_annotated.py Project: eipiplusun/irit-stac

def fix_likely_annotation_errors(anno_doc, verbose=1):
    """Clean a document from likely annotation errors due to glozz UX.

    The following are currently treated as likely errors:
    - units of span length 0 (removed),
    - empty dialogue acts enclosed by an EDU with a different text span
      (removed),
    - schemas with no member (removed),
    - units that overflow from their turn (span is clipped).

    Parameters
    ----------
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level ; any truthy value prints what gets removed or
        fixed.

    Returns
    -------
    anno_doc : GlozzDocument
        Same document but filtered.

    Raises
    ------
    ValueError
        If an EDU that is not enclosed by exactly one turn does not
        overlap with exactly one turn either.
    """
    def _inside_other_edu(unit):
        """True if an EDU with a different text span encloses `unit`."""
        return any(other.encloses(unit) for other in anno_doc.units
                   if other.text_span() != unit.text_span() and
                   is_edu(other))

    # detection: units
    bad_units = []
    for unit in anno_doc.units:
        zero_length = unit.span.char_start == unit.span.char_end
        if zero_length or (is_empty_dialogue_act(unit) and
                           _inside_other_edu(unit)):
            bad_units.append(unit)
    # detection: schemas
    bad_schemas = [schm for schm in anno_doc.schemas if not schm.members]
    # detection: relations
    # TODO
    bad_relations = []

    # warn about the ignored annotations
    if verbose:
        if bad_units or bad_schemas or bad_relations:
            print('Likely errors due to glozz UX')
            print('-----------------------------')
        report = (
            ('|-> Units', bad_units),
            ('|-> Schemas', bad_schemas),
            ('|-> Relations', bad_relations),
        )
        for header, annos in report:
            if not annos:
                continue
            print(header)
            print('\n'.join('  [ ] {}'.format(str(x)) for x in annos))

    # remove detected errors
    unit_blacklist = set(bad_units)
    anno_doc.units = [u for u in anno_doc.units
                      if u not in unit_blacklist]
    schema_blacklist = set(bad_schemas)
    anno_doc.schemas = [s for s in anno_doc.schemas
                        if s not in schema_blacklist]
    relation_blacklist = set(bad_relations)
    anno_doc.relations = [r for r in anno_doc.relations
                          if r not in relation_blacklist]

    # fix span of units that overflow from their turn
    turns = [u for u in anno_doc.units if is_turn(u)]
    edus = [u for u in anno_doc.units if is_edu(u)]
    for edu in edus:
        # an EDU enclosed by exactly one turn needs no fix
        if len([t for t in turns if t.encloses(edu)]) == 1:
            continue

        touching = [t for t in turns if t.overlaps(edu)]
        if len(touching) != 1:
            raise ValueError('No unique overlapping turn for {}'.format(edu))
        sole_turn = touching[0]
        # clip the EDU to the part it shares with its turn
        if sole_turn.overlaps(edu) != edu.text_span():
            edu.span = sole_turn.overlaps(edu)
            if verbose:
                print('Fix span of overflowing unit: {}'.format(edu))

    return anno_doc

Example #10

0

Show file

File: split_annotated.py Project: eipiplusun/irit-stac

def split_annotated(dir_orig, doc, verbose=0):
    """Split the `annotated` files of a game into `units` and `discourse`.

    Each `*.aa` file found in `<dir_orig>/<doc>/annotated` is read,
    cleaned by `fix_likely_annotation_errors`, aligned with its
    `unannotated` counterpart by `infer_resegmentation`, then written
    twice: once under `units/BRONZE` (all units, 'Anaphor' relations,
    'Several_resources' schemas) and once under `discourse/BRONZE`
    (EDUs reverted to plain 'Segment', all other relations and schemas).
    A symlink to the matching `.ac` file is created next to each output.

    Parameters
    ----------
    dir_orig : string
        Folder of the annotated corpus
    doc : string
        Name of the document
    verbose : int
        Verbosity level, passed on to `fix_likely_annotation_errors`
        and `infer_resegmentation`.

    Raises
    ------
    ValueError
        If the game folder, or its `unannotated` or `annotated`
        subfolder, does not exist ; or if an annotation of the cleaned
        `annotated` document is present in neither output document.
    OSError
        If a symlink to the `.ac` file cannot be created.
    """
    # locate game folder
    dir_orig = os.path.abspath(dir_orig)
    game_dir_orig = os.path.join(dir_orig, doc)
    if not os.path.isdir(game_dir_orig):
        err_msg = 'Unable to find original files {}'.format(game_dir_orig)
        raise ValueError(err_msg)

    # check for unannotated subfolder
    unannotated_dir = os.path.join(game_dir_orig, 'unannotated')
    if not os.path.isdir(unannotated_dir):
        err_msg = 'Unable to find unannotated folder {}'.format(
            unannotated_dir)
        raise ValueError(err_msg)

    # check for annotated subfolder
    annotated_dir = os.path.join(game_dir_orig, 'annotated')
    if not os.path.isdir(annotated_dir):
        err_msg = 'Unable to find annotated folder {}'.format(
            annotated_dir)
        raise ValueError(err_msg)

    # create discourse/BRONZE and units/BRONZE (should it be skar?)
    disc_dir = os.path.join(game_dir_orig, 'discourse', 'BRONZE')
    if not os.path.isdir(disc_dir):
        os.makedirs(disc_dir)
        print('Creating folder {}'.format(disc_dir))
    units_dir = os.path.join(game_dir_orig, 'units', 'BRONZE')
    if not os.path.isdir(units_dir):
        os.makedirs(units_dir)
        print('Creating folder {}'.format(units_dir))

    # process each annotated file
    for anno_file in sorted(glob(os.path.join(annotated_dir, '*.aa'))):
        print('Processing {}'.format(os.path.basename(anno_file)))
        print('=================================')
        # matching text file
        text_file = os.path.splitext(anno_file)[0] + '.ac'

        # read and filter the `annotated` file
        anno_doc = educe.glozz.read_annotation_file(anno_file, text_file)
        anno_doc = fix_likely_annotation_errors(anno_doc, verbose=verbose)

        # read the `unannotated` file
        unanno_file = os.path.join(unannotated_dir,
                                   os.path.basename(anno_file))
        unanno_doc = educe.glozz.read_annotation_file(unanno_file, text_file)

        # infer resegmentation in `annotated`
        anno_doc = infer_resegmentation(unanno_doc, anno_doc, verbose=verbose)

        # create `units` doc from the cleaned `annotated`
        # port annotations: dialogue acts, resources, preferences
        units_doc = copy.deepcopy(anno_doc)
        # * keep all clean units
        # * relations: anaphors only
        units_doc.relations = [x for x in units_doc.relations
                               if x.type == 'Anaphor']
        # * schemas: 'Several_resources' only
        units_doc.schemas = [x for x in units_doc.schemas
                             if x.type == 'Several_resources']

        # create `discourse` from the cleaned `annotated`
        disc_doc = copy.deepcopy(anno_doc)
        # remove dialogue act annotation from segments, so that they revert
        # to being basic EDUs
        for disc_unit in disc_doc.units:
            if is_edu(disc_unit):
                disc_unit.type = 'Segment'
                disc_unit.features = {}
        # filter anaphoric relations
        disc_doc.relations = [x for x in disc_doc.relations
                              if x.type != 'Anaphor']
        # filter resources schemas
        disc_doc.schemas = [x for x in disc_doc.schemas
                            if x.type != 'Several_resources']

        # dump both files
        bname = os.path.basename(os.path.splitext(anno_file)[0])
        # discourse file
        disc_anno_file = os.path.join(disc_dir, bname + '.aa')
        write_annotation_file(disc_anno_file, disc_doc)
        # units file
        units_anno_file = os.path.join(units_dir, bname + '.aa')
        write_annotation_file(units_anno_file, units_doc)
        # create two symlinks to the same .ac file, for discourse and units
        ac_path = os.path.join(game_dir_orig, 'unannotated',
                               bname + '.ac')
        for subdir in [disc_dir, units_dir]:
            link_src = os.path.relpath(ac_path, subdir)
            link_name = os.path.join(subdir, os.path.basename(ac_path))
            # lexists (not exists): a dangling symlink left by a previous
            # run must be removed too, otherwise os.symlink fails because
            # the link name is already taken
            if os.path.lexists(link_name):
                os.unlink(link_name)
            try:
                os.symlink(link_src, link_name)
            except OSError:
                print('Unable to create symlink {} to {}'.format(
                    link_src, link_name
                ))
                raise

        # check that all annotations from the filtered annotated doc
        # have been ported to either units or discourse
        anno_all_annos = set(x.local_id() for x in itertools.chain(
            anno_doc.units, anno_doc.relations, anno_doc.schemas
        ))
        # gather all annotations from units_doc and disc_doc
        units_all_annos = set(x.local_id() for x in itertools.chain(
            units_doc.units, units_doc.relations, units_doc.schemas
        ))
        disc_all_annos = set(x.local_id() for x in itertools.chain(
            disc_doc.units, disc_doc.relations, disc_doc.schemas
        ))
        # do the check
        missing_annos = (anno_all_annos - units_all_annos - disc_all_annos)
        if missing_annos:
            print('Missing annotations from {}:\n  {}'.format(
                anno_file,
                '\n  '.join(str(x) for x
                            in set(anno_doc.units + anno_doc.relations +
                                   anno_doc.schemas)
                            if x.local_id() in missing_annos)
            ))
            print('unanno EDUs:\n  {}'.format(
                '\n  '.join(str(x) for x
                            in unanno_doc.units if is_edu(x))
            ))
            raise ValueError('Ho?')
        # pretty
        print()

Example #11

0

Show file

File: split_annotated.py Project: eipiplusun/irit-stac

def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    Turn by turn, the EDUs of `anno_doc` are compared with those of
    `unanno_doc` to detect duplicated segments, EDU merges, segments
    that should be CDUs, and EDU splits. The resulting mapping is then
    used to drop the superseded units from `anno_doc` and to rewrite
    the support of its relations and schemas.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Document from `unannotated` ; it provides the turns and the
        ids of the EDUs that existed before annotation.
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.

    Raises
    ------
    ValueError
        If a sequence of segments approximately covers another segment
        but is neither a CDU candidate nor a merge of original EDUs.
    """
    # anno_map: annotation -> annotation that replaces it
    anno_map = dict()
    # cautious_map: subset of mappings (merges, CDU members, splits) whose
    # relations get their type prefixed with _SPLIT_PREFIX, for review
    cautious_map = dict()
    # schemas created here to replace segments that are really CDUs
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [x for x in unanno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [x for x in anno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        # the sort key puts segments whose id is not in `unannotated`
        # first, so that in each pair the original segment, if any, tends
        # to be elt_b
        dup_items = [(elt_a, elt_b) for elt_a, elt_b
                     in itertools.combinations(
                         sorted(a_edus, key=lambda x: (
                             x.local_id() in u_ids,
                             x.local_id())),
                         2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(),
                                 eps=1) and
                         elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        # NOTE(review): the sort below is ascending on
        # (type in DIALOGUE_ACTS, local_id), not a reverse sort on ids ;
        # the comment above may be out of date -- confirm
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b
                        in itertools.combinations(
                            sorted(a_edus, key=lambda x: (
                                x.type in DIALOGUE_ACTS, x.local_id())),
                            2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        # annotations (after mapping) that are an endpoint of a relation
        rels_support = set(anno_map.get(x, x)
                           for rel in anno_doc.relations
                           for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): itertools.groupby only groups consecutive items and
        # pw_conflicts follows the order of combinations (first element,
        # then second) ; pairs sharing the same elt_b may therefore end up
        # in several groups -- confirm this is acceptable
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            # span from the start of the first segment to the end of the
            # last one
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)

            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b) or
                 elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    # the new schema reuses the id and metadata of elt_b ;
                    # its members are the ids of the covered segments
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid, sch_units, sch_relas,
                                     sch_schms, sch_stype, sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n  {}'.format(
                            new_cdu, elt_b,
                            '\n  '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    # all covered segments come from `unannotated`:
                    # elt_b is a merge of original EDUs
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n  {}'.format(
                            elt_b, '\n  '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(err_msg.format(
                        ', '.join(str(y) for y in sorted_a),
                        elt_b
                    ))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        # (cdu_map is local to the turn and does not feed anno_map: the
        # member segments are kept in the document)
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b and
                            cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        # grouping on the first element is consistent with the order of
        # pw_conflicts, which was filtered but not reordered
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids and
                 approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        # conflicts explained by a split are resolved
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in set(edu_splits.keys())]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        # whatever remains could not be explained: report only
        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join('  {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                                for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    # (same mappings, expressed on local ids)
    anno_map_id = {x.local_id(): y.local_id()
                   for x, y in anno_map.items()}
    cautious_map_id = {x.local_id(): y.local_id()
                       for x, y in cautious_map.items()}
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [x for x in anno_doc.units
                      if (not is_edu(x) or
                          x.local_id() not in anno_map_id)]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    # index of the remaining annotations by local id
    objects = {x.local_id(): x
               for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                        anno_doc.schemas)}
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        # NOTE(review): src and tgt are ids *after* mapping, whereas the
        # keys of cautious_map_id are ids *before* mapping -- confirm this
        # is the intended test
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type

    # * rewrite the support of schemas
    for sch in anno_doc.schemas:
        # sch.id = sch.id
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        # no-op, kept to list every field of the schema
        sch.type = sch.type
        # sch.features = sch.features
        # sch.metadata = sch.metadata
        sch.span = sch.units | sch.relations | sch.schemas
        # presumably resolves the member ids to the objects indexed
        # above -- TODO confirm against Schema.fleshout
        sch.fleshout(objects)

    return anno_doc

Example #12

0

Show file

File: fusion.py Project: irit-melodi/educe

def fuse_edus(discourse_doc, unit_doc, postags):
    """Build a copy of the discourse doc whose EDUs are enriched with
    information from the units stage.

    Every EDU annotation of the copy is replaced with a higher level
    `EDU` object, in the list of units as well as in the relations and
    schemas that refer to it.

    Notes
    -----
    * The discourse stage drives the process: we walk its EDUs and look
      for their counterpart on the units stage. The two stages can
      (rarely) be out of synch ; an EDU with no counterpart on the units
      stage is handled without it, and an EDU that only exists on the
      units stage is ignored.

    * Counterparts are matched on annotation ids ; keeping both stages
      in synch is the responsibility of the caller.

    * This is not a full merge of the two documents: other annotations
      from the units stage (Resources, `Preference`, `Anaphor`,
      `Several_resources`) are not brought over. A full merge would
      additionally have to guard against timestamp clashes with
      existing annotations, in particular for automatically generated
      annotations.

    Parameters
    ----------
    discourse_doc : GlozzDocument
        Document from the "discourse" stage.
    unit_doc : GlozzDocument
        Document from the "units" stage ; can be None.
    postags : list of Token
        Sequence of educe tokens predicted by the POS tagger for this
        document.

    Returns
    -------
    doc : GlozzDocument
        Deep copy of the discourse_doc with info from the units stage
        merged in.
    """
    doc = copy.deepcopy(discourse_doc)

    # pass 1: build one EDU object per EDU annotation, in span order
    edu_annos = [unit for unit in doc.units if is_edu(unit)]
    edu_annos.sort(key=lambda unit: unit.span)
    upgraded = {}
    for old_anno in edu_annos:
        if unit_doc is None:
            twin = None
        else:
            twin = twin_from(unit_doc, old_anno)
        upgraded[old_anno] = EDU(doc, old_anno, twin)

    # pass 2: substitute each annotation with its EDU object everywhere
    # it is referenced in the document
    edus = []
    for old_anno in edu_annos:
        new_edu = upgraded[old_anno]
        edus.append(new_edu)
        # list of units
        doc.units.remove(old_anno)
        doc.units.append(new_edu)
        # endpoints of relations
        for relation in doc.relations:
            if relation.source == old_anno:
                relation.source = new_edu
            if relation.target == old_anno:
                relation.target = new_edu
        # members of schemas
        for sch in doc.schemas:
            if old_anno in sch.units:
                sch.units.remove(old_anno)
                sch.units.append(new_edu)

    # pass 3: now that the doc contains the EDU objects, compute the
    # contexts and attach to each EDU its contextual info
    contexts = Context.for_edus(doc, postags=postags)
    for new_edu in edus:
        new_edu.fleshout(contexts[new_edu])
    return doc

Example #13

0

Show file

File: pd_count.py Project: moreymat/educe

def rel_feats(doc, ctx, anno, debug=False):
    """Compute the features of a relation instance.

    Parameters
    ----------
    doc : GlozzDocument
        Surrounding document
    ctx : mapping
        Contexts, indexed by EDU ; only the `tstar` attribute of each
        context is read here.
    anno :
        Relation annotation ; its `source` and `target` are examined.
    debug : bool
        If True, print details on relations whose endpoints share EDUs
        or are interwoven.

    Returns
    -------
    res : dict(string, string?)
        Features for this relation ; empty if an endpoint is neither an
        EDU nor a CDU, or if source and target have an EDU in common.
    """
    def _endpoint(unit):
        """Return (kind, EDUs) for a discourse unit, None otherwise."""
        if is_cdu(unit):
            return 'CDU', sorted(unit.terminals(), key=lambda e: e.span)
        if is_edu(unit):
            return 'EDU', [unit]
        return None

    def _show(edu_seq):
        """Print the span and text of each EDU, on one indented line."""
        shown = ['[{}] {}'.format(str(e.span), doc.text(e.span))
                 for e in edu_seq]
        print('\t' + ', '.join(shown))

    # all the EDUs of the document, in span order
    doc_edus = [u for u in doc.units if is_edu(u)]
    doc_edus.sort(key=lambda u: u.span)
    # TODO doc_tstars = ...

    # endpoints that are not DUs are covered by stac-check
    # ("non-DU endpoints")
    src_info = _endpoint(anno.source)
    if src_info is None:
        return {}
    src_type, src_edus = src_info

    tgt_info = _endpoint(anno.target)
    if tgt_info is None:
        return {}
    tgt_type, tgt_edus = tgt_info

    # position of each EDU of src and tgt in the document
    src_idc = [doc_edus.index(e) for e in src_edus]
    tgt_idc = [doc_edus.index(e) for e in tgt_edus]

    # src and tgt share an EDU: error case covered at least partially by
    # stac-check, either as "bizarre relation instance" or as
    # "CDU punctures"
    if set(src_idc) & set(tgt_idc):
        if debug:
            direction = 'messed up'
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _show(src_edus)
            _show(tgt_edus)
        return {}

    if src_idc[-1] < tgt_idc[0]:
        # src ... tgt
        direction = 'right'
        interv_edus = doc_edus[(src_idc[-1] + 1):tgt_idc[0]]
    elif tgt_idc[-1] < src_idc[0]:
        # tgt ... src
        direction = 'left'
        interv_edus = doc_edus[(tgt_idc[-1] + 1):src_idc[0]]
    else:
        # tgt and src are interwoven
        direction = 'interwoven'  # FIXME
        interv_edus = []
        covered = set(src_idc).union(tgt_idc)
        lowest = min(covered)
        highest = max(covered)
        # EDUs strictly inside the covered range that belong to neither
        # endpoint
        gap_edus = [e for pos, e in enumerate(doc_edus)
                    if pos not in covered and lowest < pos < highest]
        if debug:
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _show(src_edus)
            _show(tgt_edus)
            _show(gap_edus)
    edu_dist = 1 + len(interv_edus)

    # distance in turn-stars
    src_tstars = set(ctx[e].tstar for e in src_edus)
    tgt_tstars = set(ctx[e].tstar for e in tgt_edus)
    interv_tstars = set(ctx[e].tstar for e in interv_edus)
    # turn-stars of the interval that overlap with neither src nor tgt
    skipped_tstars = interv_tstars - src_tstars - tgt_tstars
    # by definition:
    # * 0 if (part of) src and tgt belong to the same tstar,
    # * number of skipped turn-stars + 1 otherwise
    if src_tstars & tgt_tstars:
        tstar_dist = 0
    else:
        tstar_dist = len(skipped_tstars) + 1

    return {
        'src_type': src_type,
        'tgt_type': tgt_type,
        'direction': direction,
        'edu_dist': edu_dist,
        'tstar_dist': tstar_dist,
    }

Example #14

0

Show file

File: pd_count.py Project: moreymat/educe

def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Parameters
    ----------
    corpus : dict
        Mapping from file ids to documents ; each file id must provide
        the attributes `doc`, `subdoc`, `stage` and `annotator`, each
        document an `annotations()` method.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus.
        Kinds with no row are left out.
    """
    rows = {
        anno_type: list()
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # pick the table and the specific features for this annotation
            if is_edu(anno):
                row_kind = 'edu'
                feats = edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                row_kind = 'cdu'
                feats = cdu_feats(anno)
            elif is_relation_instance(anno):
                row_kind = 'rel'
                feats = rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                row_kind = 'dialogue'
                feats = dlg_feats(anno)
            elif is_turn(anno):
                row_kind = 'turn'
                feats = turn_feats(anno)
            elif is_turn_star(anno):
                row_kind = 'tstar'
                feats = tstar_feats(anno)
            elif anno.type in [
                    'paragraph', 'Resource', 'Anaphora', 'Several_resources',
                    'Preference'
            ]:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

            # merge common and specific columns ; copy + update works on
            # both python 2 and 3, whereas `a.items() + b.items()` fails
            # on python 3 where items() returns views ; as before,
            # specific features win over common columns on a name clash
            row = dict(common_cols)
            row.update(feats)
            rows[row_kind].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items() if row_list
    }

    return res