Example #1
0
    def is_various(annotation):
        """Check `annotation` is none of {edu, turn, paragraph, dialogue}.

        Apparently this only selects Resources (to be confirmed).
        """
        # the predicates are tried in this order, stopping at the first hit
        kind_checks = (is_edu, is_turn, is_paragraph, is_dialogue)
        return not any(check(annotation) for check in kind_checks)
Example #2
0
    def is_various(annotation):
        """Check `annotation` is not an edu, turn, paragraph or dialogue.

        Apparently this only selects Resources (to be confirmed).
        """
        # rule out each known kind in turn, leaving as soon as one matches
        if is_edu(annotation) or is_turn(annotation):
            return False
        if is_paragraph(annotation):
            return False
        return not is_dialogue(annotation)
Example #3
0
def fix_likely_annotation_errors(anno_doc, verbose=1):
    """Clean up annotations that are likely slips caused by the glozz UX.

    The following are currently treated as likely errors:
    - units whose span has length 0 (deleted),
    - empty dialogue acts enclosed in an EDU that has a different span
      (deleted),
    - schemas without any member (deleted),
    - EDUs overflowing from their turn (span is fixed).

    Parameters
    ----------
    anno_doc : GlozzDocument
        Document to clean up ; its units, schemas, relations and the
        spans of overflowing EDUs are modified in place.
    verbose : int
        Verbosity level ; any true value prints the detected errors and
        the fixed spans.

    Returns
    -------
    anno_doc : GlozzDocument
        The same document, filtered.

    Raises
    ------
    ValueError
        If an EDU that is not enclosed in exactly one turn does not
        overlap with exactly one turn either.
    """
    def _is_bad_unit(unit):
        """True for a zero-length unit or an empty, enclosed dialogue act."""
        if unit.span.char_start == unit.span.char_end:
            return True
        if not is_empty_dialogue_act(unit):
            return False
        # an empty dialogue act is only an error if some EDU with a
        # different span encloses it
        for other in anno_doc.units:
            if (other.text_span() != unit.text_span() and is_edu(other)
                    and other.encloses(unit)):
                return True
        return False

    bad_units = [unit for unit in anno_doc.units if _is_bad_unit(unit)]
    bad_schemas = [schm for schm in anno_doc.schemas if not schm.members]
    # TODO relations
    bad_relations = []

    # report what is about to be thrown away
    if verbose:
        report = (
            ('|-> Units', bad_units),
            ('|-> Schemas', bad_schemas),
            ('|-> Relations', bad_relations),
        )
        if any(found for _, found in report):
            print('Likely errors due to glozz UX')
            print('-----------------------------')
        for title, found in report:
            if found:
                print(title)
                print('\n'.join('  [ ] {}'.format(str(item))
                                for item in found))

    # drop the detected errors, one kind of annotation after the other
    for attr_name, rejected in (('units', bad_units),
                                ('schemas', bad_schemas),
                                ('relations', bad_relations)):
        rejected = set(rejected)
        kept = [anno for anno in getattr(anno_doc, attr_name)
                if anno not in rejected]
        setattr(anno_doc, attr_name, kept)

    # shrink EDUs that spill out of their turn
    turns = [unit for unit in anno_doc.units if is_turn(unit)]
    for edu in [unit for unit in anno_doc.units if is_edu(unit)]:
        nb_enclosing = sum(1 for turn in turns if turn.encloses(edu))
        if nb_enclosing == 1:
            # properly nested in a single turn: nothing to fix
            continue

        candidates = [turn for turn in turns if turn.overlaps(edu)]
        if len(candidates) != 1:
            raise ValueError('No unique overlapping turn for {}'.format(edu))
        # restrict the EDU to the part it shares with its only turn
        common_span = candidates[0].overlaps(edu)
        if common_span != edu.text_span():
            edu.span = common_span
            if verbose:
                print('Fix span of overflowing unit: {}'.format(edu))

    return anno_doc
Example #4
0
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    For each turn of `unanno_doc`, the EDUs of `anno_doc` enclosed in
    this turn are compared pairwise. Overlapping pairs are interpreted
    as duplicates of original segments, EDU merges, CDUs or EDU splits,
    and the units, relations and schemas of `anno_doc` are rewritten
    accordingly. Pairs that fit none of these cases are left alone and,
    if `verbose`, printed as conflicts.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Document that was the starting point of the annotation process ;
        its turns and EDUs are used as the reference segmentation.
    anno_doc : GlozzDocument
        Document to filter ; it is modified in place.
    verbose : int
        Verbosity level: a true value prints the remaining conflicts,
        a value > 1 also prints the inferred CDUs and EDU merges.

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.

    Raises
    ------
    ValueError
        If a sequence of segments is (approximately) covered by another
        EDU, but none of them supports a relation and not all of them
        come from `unanno_doc`.
    """
    anno_map = dict()
    cautious_map = dict()
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [
            x for x in unanno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [
            x for x in anno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        dup_items = [(elt_a, elt_b) for elt_a, elt_b in itertools.combinations(
            sorted(a_edus, key=lambda x:
                   (x.local_id() in u_ids, x.local_id())), 2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(), eps=1)
                         and elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        pw_conflicts = [(elt_a, elt_b)
                        for elt_a, elt_b in itertools.combinations(
                            sorted(a_edus,
                                   key=lambda x:
                                   (x.type in DIALOGUE_ACTS, x.local_id())), 2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(
            anno_map.get(x, x) for rel in anno_doc.relations
            for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): `groupby` only groups *adjacent* pairs, and
        # `pw_conflicts` is ordered on its first element ; pairs that
        # share the same elt_b but are not adjacent end up in separate
        # groups. Confirm this cannot happen in practice, or is intended.
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)

            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b)
                 or elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU ; the schema reuses the
                    # local id and the metadata of elt_b
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid,
                                     sch_units,
                                     sch_relas,
                                     sch_schms,
                                     sch_stype,
                                     sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n  {}'.format(
                            new_cdu, elt_b,
                            '\n  '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n  {}'.format(
                            elt_b, '\n  '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(
                        err_msg.format(', '.join(str(y) for y in sorted_a),
                                       elt_b))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b
                            and cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids
                 and approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        # membership is tested on the dict itself, rather than on a set
        # of its keys rebuilt for each pair
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in edu_splits]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join('  {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                                for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    anno_map_id = {x.local_id(): y.local_id() for x, y in anno_map.items()}
    cautious_map_id = {
        x.local_id(): y.local_id()
        for x, y in cautious_map.items()
    }
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [
        x for x in anno_doc.units
        if (not is_edu(x) or x.local_id() not in anno_map_id)
    ]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    objects = {
        x.local_id(): x
        for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                 anno_doc.schemas)
    }
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        old_src = rel.span.t1
        old_tgt = rel.span.t2
        src = anno_map_id.get(old_src, old_src)
        tgt = anno_map_id.get(old_tgt, old_tgt)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review ;
        # the keys of `cautious_map_id` are the ids of segments *before*
        # rewriting: merged and split segments are rewritten by
        # `anno_map_id`, so testing only the rewritten ids would never
        # mark them ; rewritten ids are still tested so that all
        # relations that used to be marked still are
        if any(x in cautious_map_id
               for x in (old_src, old_tgt, src, tgt)):
            rel.type = _SPLIT_PREFIX + rel.type

    # * rewrite the support of schemas ; id, type, features and metadata
    # are left untouched
    for sch in anno_doc.schemas:
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
Example #5
0
def _drop_dialogues_from_updates(updates, dlgs_src, dlgs_res, src_matched):
    """Remove dialogues from `updates`, except unmatched expected ones.

    `src_matched` contains the indices (in `dlgs_src`) of the source
    dialogues that have been matched with a dialogue from the result.
    """
    for dlg_res in dlgs_res:
        if dlg_res in updates.abnormal_tgt_only:
            updates.abnormal_tgt_only.remove(dlg_res)
    for i, dlg_src in enumerate(dlgs_src):
        if dlg_src in updates.abnormal_src_only:
            updates.abnormal_src_only.remove(dlg_src)
        if (i in src_matched
                and dlg_src in updates.expected_src_only):
            # remove matched source dialogues, leave the unmatched
            # ones in expected_src_only, so that they are added later
            # to the woven document
            updates.expected_src_only.remove(dlg_src)


def _align_dialogues_on_src_dialogues(doc_src, doc_res, updates):
    """Stretch dialogues of `doc_res` to the boundaries of source dialogues.

    Returns the sorted source dialogues, the sorted result dialogues and
    the set of indices of the matched source dialogues.
    """
    dlgs_src = sorted([x for x in doc_src.units
                       if x.type.lower() == 'dialogue'],
                      key=lambda y: y.span)
    dlgs_res = sorted([x for x in doc_res.units
                       if x.type.lower() == 'dialogue'],
                      key=lambda y: y.span)

    # NEW 2016-06-15 adjust dialogue boundaries
    # for each target dialogue, find the smallest enclosing sequence of
    # source dialogues and map to it
    dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
    dlgs_tgt_sbeg = np.array([
        shift_char(x.span.char_start + 1, updates) - 1
        for x in dlgs_res])
    # NB: we need to broadcast (- 1) to get the source dialogue whose
    # start immediately precedes the start of the shifted target
    # dialogue
    tgt2src_beg = (np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg,
                                   side='right')
                   - 1)
    dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
    # map the shifted end of each target dialogue to the first larger end
    # of a source dialogue
    dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
    dlgs_tgt_send = np.array([shift_char(x.span.char_end - 1, updates) + 1
                              for x in dlgs_res])
    tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
    dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
    # overwrite the adjusted beginning and end, when a game turn
    # overlaps with two different tgt dialogues ;
    # each overlap in the matching signals a split, in the linguistic
    # version, that happens in the middle of a game turn
    for i, (end_cur, beg_nxt) in enumerate(
            zip(tgt2src_end[:-1], tgt2src_beg[1:])):
        if beg_nxt <= end_cur:
            # linguistic turns from the same game turn, in different
            # target dialogues => use the shifted cut point from tgt
            dlgs_tgt_aend[i] = dlgs_tgt_send[i]
            dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
    # find source dialogues included in the shifted+expanded target
    # dialogues
    dlgs_src_matched = reduce(np.union1d,
                              (np.arange(x_beg, x_end + 1)
                               for x_beg, x_end
                               in zip(tgt2src_beg, tgt2src_end)))
    dlgs_src_matched = set(dlgs_src_matched)

    for dlg_res, adj_start, adj_end in zip(
            dlgs_res, dlgs_tgt_abeg, dlgs_tgt_aend):
        dlg_res.span.char_start = adj_start
        dlg_res.span.char_end = adj_end
        # alt: dlg_res.span = Span(start, end)
        #
        # optionally, update timestamp, id, span as in
        # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`

    return dlgs_src, dlgs_res, dlgs_src_matched


def _align_dialogues_on_game_turns(doc_src, doc_res):
    """Make dialogues of `doc_res` coincide with game turns of `doc_src`.

    Dialogues of `doc_res` that overlap the same game turn are merged
    into the first of them, the others are removed from `doc_res`.

    Returns the sorted source dialogues, the sorted result dialogues
    (including the merged ones) and the set of indices of the matched
    game turns.
    """
    # 1. locate game turns in _src: the turns of a game turn must end up
    # in the same dialogue
    turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                       key=lambda x: x.span)
    turns_src_tid = np.array([x.features['Identifier']
                              for x in turns_src])
    turns_src_beg = np.array([x.span.char_start for x in turns_src])
    turns_src_end = np.array([x.span.char_end for x in turns_src])
    # * locate game turns (index of first and last turn)
    gturn_idc = game_turns(doc_src, turns_src, gen=3)
    gturn_idc_beg = np.array(gturn_idc)
    gturn_idc_end = np.array(
        [i - 1 for i in gturn_idc[1:]] + [len(turns_src) - 1])

    # 2. get the identifier of the first and last turn of each dialogue
    # in _res: these turns and those in between must end up in the same
    # dialogue
    turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                       key=lambda x: x.span)
    turns_res_tid = np.array([x.features['Identifier']
                              for x in turns_res])
    turns_res_beg = np.array([x.span.char_start for x in turns_res])
    turns_res_end = np.array([x.span.char_end for x in turns_res])
    # align dialogue spans with turn spans
    dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                      key=lambda x: x.span)
    dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
    dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
    dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
    dlgs_res_ti_end = np.searchsorted(turns_res_end, dlgs_res_end,
                                      side='right') - 1
    # ... and finally
    dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
    dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]

    # 3. map _res dialogues to _src game turns
    # the list of source turn identifiers is built once, then searched
    # for each dialogue boundary
    src_tids = list(turns_src_tid)
    dlgs_res_ti_beg = np.array(
        [src_tids.index(x) for x in dlgs_res_tid_beg])
    dlgs_res_ti_end = np.array(
        [src_tids.index(x) for x in dlgs_res_tid_end])
    # * align the beginning (resp. end) indices of game turns and _res
    # dialogues
    dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg,
                                     side='right') - 1)
    dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
    # * turn indices of the adjusted beginning and end of the _res
    # dialogues
    # initialize along the boundaries of game turns
    dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
    dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

    # 4. make dialogue boundaries coincide with game turn boundaries,
    # which occasionally implies merging dialogues from _res

    # * compute a partition on dialogues such that any pair of dialogues
    # overlapping a given game turn are in the same class
    dlg2grp = [0]
    for gturn_end_cur, gturn_beg_nxt in zip(
            dlg2gturn_end[:-1], dlg2gturn_beg[1:]):
        if gturn_beg_nxt <= gturn_end_cur:
            # two _res dialogues overlap a single game turn:
            # put in the same class (to merge dialogues)
            dlg2grp.append(dlg2grp[-1])
        else:
            dlg2grp.append(dlg2grp[-1] + 1)

    # keep one dialogue for each class of dialogues
    for _, grp in itertools.groupby(enumerate(dlg2grp),
                                    key=lambda x: x[1]):
        dlg_idc_merged = [x[0] for x in grp]
        # adjust boundaries of the first dialogue of the group
        # index of first and last dialogues
        di_beg = dlg_idc_merged[0]
        di_end = dlg_idc_merged[-1]
        # index of first and last turns of these dialogues
        ti_beg = dlg_res_src_abeg[di_beg]
        ti_end = dlg_res_src_aend[di_end]
        # keep first dialogue, update its features to include those
        # from the other dialogues in the same class
        new_dlg = dlgs_res[di_beg]
        new_dlg.span.char_start = turns_src_beg[ti_beg]
        new_dlg.span.char_end = turns_src_end[ti_end]
        dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
        for feat in ['Trades', 'Gets', 'Dice_rolling']:
            new_dlg.features[feat] = _concatenate_features(
                dlgs_res_merged, feat)
        # remove merged dialogues [1:] from doc_res
        for i in dlg_idc_merged[1:]:
            dlg_res = dlgs_res[i]
            doc_res.units.remove(dlg_res)

    # transfer each unmatched (non-overlapping) game turn as a dialogue
    # (which already exists in doc_src)
    gturns_matched = reduce(np.union1d,
                            (np.arange(x_beg, x_end + 1)
                             for x_beg, x_end
                             in zip(dlg2gturn_beg, dlg2gturn_end)))
    gturns_matched = set(gturns_matched)
    # each dialogue in doc_src is a game turn
    dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                      key=lambda x: x.span)

    return dlgs_src, dlgs_res, gturns_matched


def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates, except for the unmatched source
    dialogues in `updates.expected_src_only`.

    If `gen` < 3, the boundaries of the dialogues from `doc_res` are
    shifted and stretched to the boundaries of dialogues from `doc_src` ;
    otherwise they are aligned with the game turns of `doc_src`, which
    can lead to dialogues from `doc_res` being merged.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the
        same span as dialogues from `doc_src`.
    updates : set of updates
        Updates computed by `compute_updates` ; modified in place.
    gen: int
        Generation of annotations included in `doc_src` and the output.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue.
    """
    if gen < 3:
        dlgs_src, dlgs_res, src_matched = _align_dialogues_on_src_dialogues(
            doc_src, doc_res, updates)
    else:
        # situated version: we can rely on game turns
        dlgs_src, dlgs_res, src_matched = _align_dialogues_on_game_turns(
            doc_src, doc_res)

    # remove all source and target dialogues from updates
    _drop_dialogues_from_updates(updates, dlgs_src, dlgs_res, src_matched)

    return updates
Example #6
0
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError('Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(
                    ['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in set(expected_dlg_features)))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate', None),
            }
            # assumption: no feature
            if anno.features:
                if stage == 'units':
                    if anno.features.keys() == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if anno.features.keys() == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None), # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")
            

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU).

        Parameters
        ----------
        seg : mapping (row of `df_segs`)
            Must provide the keys 'doc', 'subdoc', 'span_beg', 'span_end'
            and 'text'.

        Returns
        -------
        turn_cols : pd.Series
            Entries 'turn_id', 'turn_span_beg' and 'turn_span_end' for
            the turn that contains the segment.
        """
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        # candidate turns: same doc and subdoc, span enclosing the segment
        # (`df_turns` comes from the enclosing scope)
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; `.item()` below
        # relies on it (pandas raises ValueError if the selection does not
        # hold exactly one element)
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # NOTE(review): assuming `turn_text` is a string, `find` returns
        # the *first* occurrence of the segment text, and -1 if it is
        # absent ; neither case is handled here -- confirm this is
        # acceptable for repeated or altered segment texts
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
Example #7
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each annotation of each document is dispatched to a table according
    to its kind (EDU, CDU, relation instance, dialogue, turn, turn-star).
    A row is made of columns common to all kinds (identification of the
    annotation and of its file) plus the columns returned by the feature
    helper for that kind ; on a name clash, the kind-specific column
    wins.

    Parameters
    ----------
    corpus : dict(FileId, Document)
        Mapping from file identifiers (with attributes `doc`, `subdoc`,
        `stage`, `annotator`) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus ;
        kinds with no row are left out.
    """
    rows = {
        anno_type: []
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }
    # each paragraph (normally) corresponds to a Turn so just ignore
    # them ; the situation is less clear-cut for 'Resource', 'Anaphora',
    # 'Several_resources'
    ignored_types = ('paragraph', 'Resource', 'Anaphora',
                     'Several_resources', 'Preference')

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # pick the target table and the kind-specific columns ; the
            # order of the tests matters and is kept as is
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ignored_types:
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge by copy + update rather than `items() + items()`:
            # dict views cannot be concatenated with `+` on python 3
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items() if row_list
    }

    return res
Example #8
0
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Two strategies are implemented, selected by `gen`:

    * `gen < 3`: the shifted boundaries of each dialogue of `doc_res`
      are snapped onto boundaries of dialogues of `doc_src` ; when two
      consecutive dialogues of `doc_res` map onto the same source
      dialogue, the shifted cut point from `doc_res` is used instead.
    * otherwise (situated version): the boundaries of dialogues of
      `doc_res` are aligned with the boundaries of the game turns
      located by `game_turns` ; dialogues of `doc_res` that overlap the
      same game turn are merged into the first one of them.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the
        same span as dialogues from `doc_src`.
    updates : set of updates
        Updates computed by `compute_updates`.
    gen: int
        Generation of annotations included in `doc_src` and the output.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue.

    Notes
    -----
    This function works by side effect: the spans of the dialogues of
    `doc_res` are modified in place ; in the situated branch, the
    'Trades', 'Gets' and 'Dice_rolling' features of merged dialogues are
    rewritten and the dialogues merged away are removed from
    `doc_res.units` ; `updates` is modified in place and returned.

    NOTE(review): both branches call `reduce` without an initializer on
    one item per dialogue of `doc_res`, and the situated branch seeds
    the partition with one class ; a `doc_res` with no dialogue does not
    look supported -- to be confirmed against callers.
    """
    if gen < 3:
        # dialogues of both documents, in textual order
        dlgs_src = sorted(
            [x for x in doc_src.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        dlgs_res = sorted(
            [x for x in doc_res.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)

        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        dlgs_tgt_sbeg = np.array(
            [shift_char(x.span.char_start + 1, updates) - 1 for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (
            np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg, side='right') - 1)
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array(
            [shift_char(x.span.char_end - 1, updates) + 1 for x in dlgs_res])
        # NOTE(review): searchsorted returns len(dlgs_src_end) for a
        # shifted end larger than every source end, in which case the
        # indexing on the next line would fail ; presumably excluded by
        # construction of the documents -- confirm
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur,
                beg_nxt) in enumerate(zip(tgt2src_end[:-1], tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
        # find source dialogues included in the shifted+expanded target
        # dialogues: union of the index ranges [beg, end] over all target
        # dialogues
        dlgs_src_matched = reduce(
            np.union1d, (np.arange(x_beg, x_end + 1)
                         for x_beg, x_end in zip(tgt2src_beg, tgt2src_end)))
        dlgs_src_matched = set(dlgs_src_matched)

        # write the adjusted boundaries back, in place, to the dialogues
        # of doc_res
        for dlg_res, adj_start, adj_end in zip(dlgs_res, dlgs_tgt_abeg,
                                               dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`

        # remove all source and target dialogues from updates
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if ((i in dlgs_src_matched
                 and dlg_src in updates.expected_src_only)):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)

    else:
        # situated version: we can rely on game turns

        # 1. get the identifier of the first and last turn of each game turn
        # in _src: these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array([x.features['Identifier'] for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        # each game turn ends right before the next one begins ; the last
        # one ends on the last turn
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        gturn_idc_end = np.array([i - 1 for i in gturn_idc[1:]] +
                                 [len(turns_src) - 1])
        # ... and finally
        # (these two arrays are not used in the rest of this function)
        gturn_src_tid_beg = turns_src_tid[gturn_idc_beg]
        gturn_src_tid_end = turns_src_tid[gturn_idc_end]

        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array([x.features['Identifier'] for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        # index, in turns_res, of the first turn that starts at or after
        # the start of each dialogue, and of the last turn that ends at or
        # before its end
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(
            turns_res_end, dlgs_res_end, side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]

        # 3. map _res dialogues to _src game turns
        # from here on, dlgs_res_ti_{beg,end} are indices in turns_src
        # (they were indices in turns_res above) ; `list.index` raises
        # ValueError for a turn identifier absent from _src
        dlgs_res_ti_beg = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (
            np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg, side='right') - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res

        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class
        # (dlg2grp[i] is the class of the i-th dialogue of _res ; the
        # first dialogue opens class 0)
        dlg2grp = [0]
        for i, (gturn_end_cur, gturn_beg_nxt) in enumerate(
                zip(dlg2gturn_end[:-1], dlg2gturn_beg[1:])):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)

        # keep one dialogue for each class of dialogues
        # (classes are runs of consecutive equal values in dlg2grp, so
        # groupby needs no prior sort)
        for k, g in itertools.groupby(enumerate(dlg2grp), key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in g]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                dlg_res = dlgs_res[i]
                doc_res.units.remove(dlg_res)

        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        gturns_matched = reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end)))
        gturns_matched = set(gturns_matched)
        # each dialogue in doc_src is a game turn
        # (hence the position of a dialogue in dlgs_src is compared below
        # with game turn indices)
        dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                          key=lambda x: x.span)
        # remove all source and target dialogues from updates
        # (dlgs_res still lists the dialogues merged away above)
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if ((i in gturns_matched
                 and dlg_src in updates.expected_src_only)):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)

    return updates
Example #9
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Every annotation of every document yields at most one row, stored in
    the table of its kind. A row combines the columns shared by all
    kinds (annotation id, doc, subdoc, stage, annotator, type) with the
    columns computed by the feature helper of the kind ; if both define
    the same column, the value from the feature helper is kept.

    Parameters
    ----------
    corpus : dict(FileId, Document)
        Mapping from file identifiers (with attributes `doc`, `subdoc`,
        `stage`, `annotator`) to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus ;
        a kind without any row has no entry.
    """
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}

    def add_row(anno_type, common_cols, feats):
        """Append to table `anno_type` the merge of both column dicts."""
        # `dict(a.items() + b.items())` only works on python 2, where
        # items() returns lists ; copy + update is equivalent there and
        # also works with the dict views of python 3
        row = dict(common_cols)
        row.update(feats)
        rows[anno_type].append(row)

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                add_row('edu', common_cols, edu_feats(doc, ctx, anno))
            elif is_cdu(anno):
                add_row('cdu', common_cols, cdu_feats(anno))
            elif is_relation_instance(anno):
                add_row('rel', common_cols, rel_feats(doc, ctx, anno))
            elif is_dialogue(anno):
                add_row('dialogue', common_cols, dlg_feats(anno))
            elif is_turn(anno):
                add_row('turn', common_cols, turn_feats(anno))
            elif is_turn_star(anno):
                add_row('tstar', common_cols, tstar_feats(anno))
            elif anno.type in ['paragraph',
                               'Resource', 'Anaphora',
                               'Several_resources', 'Preference']:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res
Example #10
0
def fix_likely_annotation_errors(anno_doc, verbose=1):
    """Clean a document from likely annotation errors due to glozz UX.

    The following are considered likely errors:
    - units whose span has length 0 (deleted),
    - empty dialogue acts enclosed in an EDU with a different text span
      (deleted),
    - schemas without any member (deleted),
    - EDUs that overflow from their turn (span trimmed to the overlap).

    Parameters
    ----------
    anno_doc : GlozzDocument
        Document to filter
    verbose : int
        Verbosity level ; any true value prints the annotations that are
        dropped and the units whose span is fixed.

    Returns
    -------
    anno_doc : GlozzDocument
        Same document but filtered.

    Raises
    ------
    ValueError
        If an EDU is not enclosed in exactly one turn and does not
        overlap exactly one turn either.
    """
    def is_faulty_unit(unit):
        """True for a zero-length unit or a redundant empty dialogue act."""
        if unit.span.char_start == unit.span.char_end:
            return True
        if not is_empty_dialogue_act(unit):
            return False
        for other in anno_doc.units:
            if (other.text_span() != unit.text_span() and is_edu(other)
                    and other.encloses(unit)):
                return True
        return False

    # collect the suspicious annotations of each sort
    bad_units = [unit for unit in anno_doc.units if is_faulty_unit(unit)]
    bad_schemas = [schm for schm in anno_doc.schemas if not schm.members]
    # TODO relations
    bad_relations = []

    # report what is about to be dropped
    if verbose:
        if bad_units or bad_schemas or bad_relations:
            print('Likely errors due to glozz UX')
            print('-----------------------------')
        report = (
            ('|-> Units', bad_units),
            ('|-> Schemas', bad_schemas),
            ('|-> Relations', bad_relations),
        )
        for title, culprits in report:
            if culprits:
                print(title)
                print('\n'.join('  [ ] {}'.format(str(anno))
                                for anno in culprits))

    # drop them from the document
    bad_units = set(bad_units)
    anno_doc.units = [unit for unit in anno_doc.units
                      if unit not in bad_units]
    bad_schemas = set(bad_schemas)
    anno_doc.schemas = [schm for schm in anno_doc.schemas
                        if schm not in bad_schemas]
    bad_relations = set(bad_relations)
    anno_doc.relations = [rel for rel in anno_doc.relations
                          if rel not in bad_relations]

    # trim the EDUs that stick out of their turn
    turns = [unit for unit in anno_doc.units if is_turn(unit)]
    edus = [unit for unit in anno_doc.units if is_edu(unit)]
    for edu in edus:
        nb_enclosing = len([turn for turn in turns if turn.encloses(edu)])
        if nb_enclosing == 1:
            # properly nested in a single turn: nothing to fix
            continue

        candidates = [turn for turn in turns if turn.overlaps(edu)]
        if len(candidates) != 1:
            raise ValueError('No unique overlapping turn for {}'.format(edu))
        turn = candidates[0]
        if turn.overlaps(edu) != edu.text_span():
            edu.span = turn.overlaps(edu)
            if verbose:
                print('Fix span of overflowing unit: {}'.format(edu))

    return anno_doc
Example #11
0
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    The EDUs of `anno_doc` are compared, turn by turn, with the EDUs of
    `unanno_doc` (the turns themselves are taken from `unanno_doc`).
    Four situations are recognized:

    1. duplicates: an EDU whose span equals (up to `eps=1`) the span of
       an EDU whose local id exists in `unanno_doc` ; the former is
       mapped to the latter, and its dialogue act annotation is
       backported,
    2. CDUs: a sequence of EDUs approximately covered by another EDU,
       where at least one element of the sequence supports a relation ;
       the covering EDU is rewritten as a new 'Complex_discourse_unit'
       schema,
    3. EDU merges: same as above but no element supports a relation and
       all elements come from `unanno_doc`,
    4. EDU splits: an EDU from `unanno_doc` approximately covered by a
       sequence of other EDUs ; it is mapped to the first of them.

    Relations and schemas whose support is touched by a merge, split or
    CDU guess get their type (or the type of the segments) prefixed
    with `_SPLIT_PREFIX` so they can be reviewed.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Document from the `unannotated` stage, i.e. the starting point
        of the annotation process (presumably the same type as
        `anno_doc` ; only its `units` are read).
    anno_doc : GlozzDocument
        Document to filter. It is modified in place: its units,
        schemas, relations, and the type/features/metadata of some of
        its units are rewritten.
    verbose : int
        Verbosity level ; remaining conflicts are printed if non-zero,
        CDU guesses and EDU merges are printed if > 1.

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten (same object as the `anno_doc` argument).

    Raises
    ------
    ValueError
        If a sequence of EDUs is approximately covered by another EDU
        but is neither a CDU nor an EDU merge.
    """
    anno_map = {}  # annotation -> annotation that replaces it
    cautious_map = {}  # subset of the mappings that call for a review
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [x for x in unanno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [x for x in anno_doc.units
                  if is_edu(x) and turn.span.encloses(x.span)]
        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        # NB: the sort puts EDUs whose id is in `unannotated` last, so
        # that in each pair elt_b can be the original segment
        dup_items = [(elt_a, elt_b) for elt_a, elt_b
                     in itertools.combinations(
                         sorted(a_edus, key=lambda x: (
                             x.local_id() in u_ids,
                             x.local_id())),
                         2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(),
                                 eps=1) and
                         elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b
                        in itertools.combinations(
                            sorted(a_edus, key=lambda x: (
                                x.type in DIALOGUE_ACTS, x.local_id())),
                            2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(anno_map.get(x, x)
                           for rel in anno_doc.relations
                           for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        # NOTE(review): itertools.groupby only groups *consecutive*
        # items, and pw_conflicts is ordered by its first element, not
        # by elt_b ; pairs that share the same elt_b could thus end up
        # in several groups -- confirm this is intended
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)

            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b) or
                 elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid, sch_units, sch_relas,
                                     sch_schms, sch_stype, sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n  {}'.format(
                            new_cdu, elt_b,
                            '\n  '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n  {}'.format(
                            elt_b, '\n  '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(err_msg.format(
                        ', '.join(str(y) for y in sorted_a),
                        elt_b
                    ))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = {}
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b and
                            cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = {}  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids and
                 approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        # conflicts explained by a split are dropped ; membership is
        # tested on the dict itself rather than on a set rebuilt for
        # each pair
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in edu_splits]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join('  {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                                for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    anno_map_id = {x.local_id(): y.local_id()
                   for x, y in anno_map.items()}
    cautious_map_id = {x.local_id(): y.local_id()
                       for x, y in cautious_map.items()}
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [x for x in anno_doc.units
                      if (not is_edu(x) or
                          x.local_id() not in anno_map_id)]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    objects = {x.local_id(): x
               for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                        anno_doc.schemas)}
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type

    # * rewrite the support of schemas ; id, type, features and metadata
    # of the schema are left untouched
    for sch in anno_doc.schemas:
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.span = sch.units | sch.relations | sch.schemas
        # re-resolve the member ids against the updated objects
        sch.fleshout(objects)

    return anno_doc
Example #12
0
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True):
    """Read an annotated game as dataframes.

    The game is read with `StacReader` ; each annotation of each
    selected document is turned into a row (dict) of one of the
    resulting dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.

    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).

    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'discourse'
        actually do. NB: these checks are not implemented yet (see the
        FIXME markers), so this flag currently has no effect.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        relations, resources, preferences.

    Raises
    ------
    ValueError
        If a discourse segment or a preference has features, a unit is
        of an unknown kind, a schema has unexpected features or has
        relations among its members.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_rels = []  # relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in ('BRONZE', 'SILVER', 'GOLD'))
                or (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue
        # process annotations in doc
        for anno in doc_val.units:
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                # turns are only kept from the 'discourse' stage
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(
                        anno.features.keys()) == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets':
                        anno.features.get('Gets', None),
                        'trades':
                        anno.features.get('Trades', None),
                        'dice_rolls':
                        anno.features.get('Dice_rolling', None),
                    })
                else:
                    # unexpected features are reported but not stored
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                # dialogues are only kept from the 'discourse' stage
                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature
            if anno.features:
                # compare a sorted list of keys, not the keys() view: a
                # dict view never equals a list under python 3, which
                # made every schema with features raise
                feat_keys = sorted(anno.features.keys())
                if stage == 'units':
                    if feat_keys == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably be cleaned out
                    if feat_keys == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            rel_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            rel_dict.update({
                # features
                'arg_scope':
                anno.features.get('Argument_scope', None),  # req
                'comments':
                anno.features.get('Comments', None),  # opt
                # endpoints
                'source':
                anno.source.identifier(),
                'target':
                anno.target.identifier(),
            })
            df_rels.append(rel_dict)

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_rels = pd.DataFrame(df_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        # turns of the same (doc, subdoc) whose span encloses the segment
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg)
                              & (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # NOTE(review): find() returns the *first* occurrence of the
        # segment text in the turn text, which may not be the position
        # of this segment if the same text appears earlier in the turn
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_rels, df_res, df_pref)
Example #13
0
def read_game_as_dataframes(game_folder,
                            sel_annotator=None,
                            thorough=True,
                            strip_cdus=False,
                            attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in ('BRONZE', 'SILVER', 'GOLD'))
                or (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue
        # process annotations in doc
        # print(doc, subdoc, stage, annotator)  # verbose
        doc_text = doc_val.text()
        # print(doc_text)
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                # comments = anno.features['Comments']
                # if comments == 'Please write in remarks...':
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(
                        anno.features.keys()) == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets':
                        anno.features.get('Gets', None),
                        'trades':
                        anno.features.get('Trades', None),
                        'dice_rolls':
                        anno.features.get('Dice_rolling', None),
                    })
                else:
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in set(expected_dlg_features)))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')
            # print('Unit', anno)

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature
            if anno.features:
                if stage == 'units':
                    if anno.features.keys() == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # it should probably be cleaned out
                    if anno.features.keys() == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope':
                anno.features.get('Argument_scope', None),  # req
                'comments':
                anno.features.get('Comments', None),  # opt
                # endpoints
                'source':
                gid_src,
                'target':
                gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Find the turn enclosing a segment (EDU, EEU) and locate it there.

        Parameters
        ----------
        seg : pd.Series
            Row of `df_segs` ; the fields 'doc', 'subdoc', 'span_beg',
            'span_end' and 'text' are read.

        Returns
        -------
        turn_cols : pd.Series
            'turn_id' of the enclosing turn, plus 'turn_span_beg' and
            'turn_span_end': the char offsets, in the turn text, of the
            first occurrence of the segment text ('turn_span_beg' is the
            result of `find`, hence -1 if the text does not occur).
        """
        # the turn must belong to the same (doc, subdoc) as the segment...
        in_same_subdoc = ((seg['doc'] == df_turns['doc'])
                          & (seg['subdoc'] == df_turns['subdoc']))
        # ... and its span must enclose the span of the segment
        encloses_seg = ((df_turns['span_beg'] <= seg['span_beg'])
                        & (seg['span_end'] <= df_turns['span_end']))
        # NB: exactly one turn is expected to match ; .item() raises
        # otherwise
        enclosing_turn = df_turns[encloses_seg & in_same_subdoc]
        # position of the segment relative to the turn text, so that the
        # situated and linguistic versions can be matched even when the
        # segmentation has changed
        turn_text = enclosing_turn['text'].item()
        seg_text = seg['text']
        offset_beg = turn_text.find(seg_text)
        return pd.Series({
            'turn_id': enclosing_turn['turn_id'].item(),
            'turn_span_beg': offset_beg,
            'turn_span_end': offset_beg + len(seg_text),
        })

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
def _fix_dialogue_boundaries(tcache, doc_ling, doc_situ):
    """Rebuild the dialogues of the situated version along game turns.

    The dialogues initially in `doc_situ` are removed and replaced:

    * each dialogue of `doc_ling` is mapped to the turns of `doc_situ`
      (through the 'Identifier' feature of its first and last turns) and
      its boundaries are moved to the boundaries of the game turns it
      overlaps ;
    * consecutive _ling dialogues that overlap a same game turn are
      merged into a single dialogue, whose 'Trades', 'Gets' and
      'Dice_rolling' features are the concatenation of theirs ;
    * each game turn that overlaps no _ling dialogue gets a new dialogue
      without any feature.

    Parameters
    ----------
    tcache: TimestampCache
        Timestamp cache to generate unit identifiers for new dialogues.
    doc_ling: GlozzDocument
        Linguistic version of the game.
    doc_situ: GlozzDocument
        Situated version of the game ; its units are modified in place.

    Returns
    -------
    doc_situ: GlozzDocument
        Fixed version of doc_situ (the object passed as parameter).

    Raises
    ------
    ValueError
        If the first or last turn of a _ling dialogue has no turn with
        the same 'Identifier' in _situ.
    IndexError
        Among other cases, if a game turn needs a new dialogue but
        doc_situ initially contained no dialogue to use as a template.
    """
    doc_key = doc_situ.origin
    # DIRTY pilot02_01 requires special processing, see below
    is_pilot02_01 = (doc_key.doc == 'pilot02' and doc_key.subdoc == '01')

    # 1. get the identifier of the first and last turn of each game turn
    # in _situ: these turns and those in between must end up in the same
    # dialogue
    turns_situ = sorted((x for x in doc_situ.units if is_turn(x)),
                        key=lambda x: x.span)
    turns_situ_tid = np.array([x.features['Identifier'] for x in turns_situ])
    turns_situ_beg = np.array([x.span.char_start for x in turns_situ])
    turns_situ_end = np.array([x.span.char_end for x in turns_situ])
    # * locate game turns (index of first and last turn) ; a game turn
    # ends right before the next one begins, the last one ends with the
    # last turn
    gturn_idc = game_turns(doc_situ, turns_situ, gen=3)
    gturn_idc_beg = np.array(gturn_idc)
    gturn_idc_end = np.array(
        [i - 1 for i in gturn_idc[1:]] + [len(turns_situ) - 1])

    # 2. get the identifier of the first and last turn of each dialogue in
    # _ling: these turns and those in between must end up in the same
    # dialogue
    turns_ling = sorted((x for x in doc_ling.units if is_turn(x)),
                        key=lambda x: x.span)
    dlgs_ling = sorted((x for x in doc_ling.units if is_dialogue(x)),
                       key=lambda x: x.span)
    if is_pilot02_01:
        # ignore turns 26-27 that were moved down from _01 to _02 ...
        turns_ling = turns_ling[:-2]
        # ... and the last dialogue of _01 in _ling, that contains them
        dlgs_ling = dlgs_ling[:-1]
    turns_ling_tid = np.array([x.features['Identifier'] for x in turns_ling])
    turns_ling_beg = np.array([x.span.char_start for x in turns_ling])
    turns_ling_end = np.array([x.span.char_end for x in turns_ling])
    # align dialogue spans with turn spans: first turn that begins at or
    # after the beginning of the dialogue, last turn that ends at or
    # before its end
    dlgs_ling_beg = np.array([x.span.char_start for x in dlgs_ling])
    dlgs_ling_end = np.array([x.span.char_end for x in dlgs_ling])
    dlgs_ling_ti_beg = np.searchsorted(turns_ling_beg, dlgs_ling_beg)
    dlgs_ling_ti_end = np.searchsorted(turns_ling_end, dlgs_ling_end,
                                       side='right') - 1
    # ... and finally
    dlgs_ling_tid_beg = turns_ling_tid[dlgs_ling_ti_beg]
    dlgs_ling_tid_end = turns_ling_tid[dlgs_ling_ti_end]

    # 3. map _ling dialogues to _situ game turns
    # * locate the first and last turn of each _ling dialogue in the
    # list of turns in _situ
    # NB: we don't need indices in the list of turns from _ling anymore
    # hence it is safe to overwrite dlgs_ling_ti_{beg,end}
    situ_tids = list(turns_situ_tid)  # built once, searched many times
    dlgs_ling_ti_beg = np.array(
        [situ_tids.index(x) for x in dlgs_ling_tid_beg])
    dlgs_ling_ti_end = np.array(
        [situ_tids.index(x) for x in dlgs_ling_tid_end])
    # * align the beginning (resp. end) indices of game turns and _ling
    # dialogues: last game turn that begins at or before the first turn
    # of the dialogue, first game turn that ends at or after its last turn
    # NOTE(review): a dialogue whose first turn precedes the first game
    # turn gets index -1 here, which wraps around to the last game turn ;
    # presumably game_turns always starts at turn 0 -- to be confirmed
    dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_ling_ti_beg,
                                     side='right') - 1)
    dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_ling_ti_end)
    # * turn indices of the adjusted beginning and end of the _ling
    # dialogues, initialized along the boundaries of game turns
    dlg_ling_situ_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
    dlg_ling_situ_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

    # 4. make dialogue boundaries coincide with game turn boundaries,
    # which occasionally implies merging dialogues from _ling

    # * compute a partition on dialogues such that any pair of
    # dialogues overlapping a given game turn are in the same
    # class ; no dialogue in _ling means no class at all
    dlg2grp = [0] if dlgs_ling else []
    for gturn_end_cur, gturn_beg_nxt in zip(dlg2gturn_end[:-1],
                                            dlg2gturn_beg[1:]):
        if gturn_beg_nxt <= gturn_end_cur:
            # two _ling dialogues overlap a single game turn:
            # put in the same class (to merge dialogues)
            dlg2grp.append(dlg2grp[-1])
        else:
            dlg2grp.append(dlg2grp[-1] + 1)

    # remove all dialogues from the units in doc_situ,
    # they will be replaced with (hopefully) clean ones
    dlgs_situ = sorted((x for x in doc_situ.units if is_dialogue(x)),
                       key=lambda x: x.span)
    for dlg_situ in dlgs_situ:
        doc_situ.units.remove(dlg_situ)

    # create one dialogue for each class of dialogues ; class ids are
    # non-decreasing so groupby yields each class exactly once
    for _, grp in itertools.groupby(enumerate(dlg2grp),
                                    key=lambda x: x[1]):
        dlg_idc_merged = [x[0] for x in grp]
        # index of first and last dialogues of the class
        di_beg = dlg_idc_merged[0]
        di_end = dlg_idc_merged[-1]
        # index of first and last turns of these dialogues
        ti_beg = dlg_ling_situ_abeg[di_beg]
        ti_end = dlg_ling_situ_aend[di_end]
        # create dialogue, use the 1st _ling dialogue as basis then
        # customize
        new_dlg = copy.deepcopy(dlgs_ling[di_beg])
        new_dlg.origin = doc_key
        new_dlg.span.char_start = turns_situ_beg[ti_beg]
        new_dlg.span.char_end = turns_situ_end[ti_end]
        dlgs_ling_merged = [dlgs_ling[i] for i in dlg_idc_merged]
        for feat in ['Trades', 'Gets', 'Dice_rolling']:
            new_dlg.features[feat] = _concatenate_features(
                dlgs_ling_merged, feat)
        # add the new dialogue to doc_situ
        doc_situ.units.append(new_dlg)

    # create a new dialogue for each unmatched (non-overlapping) game
    # turn
    # * indices of the game turns covered by at least one _ling dialogue
    gturns_matched = set()
    for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end):
        gturns_matched.update(range(x_beg, x_end + 1))
    # * the other game turns each get a dialogue of their own
    for i, (gturn_idx_beg, gturn_idx_end) in enumerate(zip(
            gturn_idc_beg, gturn_idc_end)):
        if i in gturns_matched:
            continue
        new_dlg_span = Span(turns_situ_beg[gturn_idx_beg],
                            turns_situ_end[gturn_idx_end])
        # UGLY this works just like split_dialogue:
        # create a new dialogue by copying an existing dialogue,
        # re-assign it an annotation id and span using a timestamp
        # cache, then erase all features
        new_dlg = copy.deepcopy(dlgs_situ[0])
        _set(tcache, new_dlg_span, new_dlg)
        new_dlg.features = {}
        # ... "et voila": add this dialogue to the document
        doc_situ.units.append(new_dlg)

    # TODO restore dialogue features from the game events?
    return doc_situ