Ejemplo n.º 1
0
    def is_various(annotation):
        """Tell whether `annotation` is none of {edu, turn, paragraph, dialogue}.

        In practice this seems to capture only Resources (to be confirmed).

        Returns
        -------
        bool
            True if none of the four predicates accepts `annotation`.
        """
        # guard clauses: bail out as soon as one known kind matches
        if is_edu(annotation) or is_turn(annotation):
            return False
        if is_paragraph(annotation) or is_dialogue(annotation):
            return False
        return True
Ejemplo n.º 2
0
    def is_various(annotation):
        """Return True for annotations outside {edu, turn, paragraph, dialogue}.

        Apparently only Resources end up here (to be confirmed).
        """
        # De Morgan form of "not (edu or turn or paragraph or dialogue)";
        # evaluation order and short-circuiting are unchanged
        return (not is_edu(annotation)
                and not is_turn(annotation)
                and not is_paragraph(annotation)
                and not is_dialogue(annotation))
Ejemplo n.º 3
0
def fix_dialogue_boundaries(dir_ling, dir_situ, doc, seg_path=None):
    """Fix dialogue boundaries in a woven game.

    Dialogue boundaries are adjusted in the woven version, so they
    are tighter around the dialogues that existed in the annotated
    version. Each fixed document is saved back under `dir_situ`.

    Parameters
    ----------
    dir_ling: filepath
        Path to the folder of the original version of the game.
    dir_situ: filepath
        Path to the folder of the woven version of the game ; the fixed
        documents are written back to this folder.
    doc: string
        Name of the game.
    seg_path: optional
        Accepted but not used by this function (TODO: document or
        remove once its intended role is confirmed).
    """
    def is_interesting(k):
        """Select files for this game only, annotator GOLD (or unset)."""
        return (k.doc == doc
                and (k.annotator == 'GOLD' or k.annotator is None))

    # locate and read files of the original (linguistic) version
    dir_ling = os.path.abspath(dir_ling)
    reader_ling = Reader(dir_ling)
    files_ling = reader_ling.filter(reader_ling.files(), is_interesting)
    corpus_ling = reader_ling.slurp(cfiles=files_ling, verbose=True)

    # same for the woven (situated) version
    dir_situ = os.path.abspath(dir_situ)
    reader_situ = Reader(dir_situ)
    files_situ = reader_situ.filter(reader_situ.files(), is_interesting)
    corpus_situ = reader_situ.slurp(cfiles=files_situ, verbose=True)
    # need a TimestampCache to generate unit_id for new dialogues
    tcache = TimestampCache()

    for key, doc_situ in sorted(corpus_situ.items()):
        # the linguistic corpus is expected to have a document under the
        # same key ; a missing key raises KeyError
        doc_ling = corpus_ling[key]
        print(key)
        doc_situ_fixed = _fix_dialogue_boundaries(tcache, doc_ling, doc_situ)
        # DEBUG
        dlgs = sorted((x for x in doc_situ_fixed.units if is_dialogue(x)),
                      key=lambda x: x.span)
        dlg_beg = [x.span.char_start for x in dlgs]
        dlg_end = [x.span.char_end for x in dlgs]
        # materialize the pairs: under Python 3, zip() is a lazy iterator
        # and printing it directly would only show "<zip object at ...>"
        print(list(zip(dlg_beg, dlg_end)))
        # end DEBUG
        save_document(dir_situ, key, doc_situ_fixed)
Ejemplo n.º 4
0
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Two strategies are used, depending on `gen`:

    * `gen < 3`: the boundaries of each dialogue of `doc_res` are shifted
      with `shift_char` and then snapped to the boundaries of the
      dialogues of `doc_src` that enclose them ; when two consecutive
      result dialogues map to overlapping source dialogues, the shifted
      cut point of the result dialogue is used instead.
    * otherwise (situated version): dialogue boundaries are aligned on
      game turns (as located by `game_turns`) ; result dialogues that
      overlap the same game turn are merged into the first one of their
      group, whose 'Trades', 'Gets' and 'Dice_rolling' features are
      rebuilt with `_concatenate_features`, and the other dialogues of
      the group are removed from `doc_res.units`.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the
        same span as dialogues from `doc_src`.
    updates : set of updates
        Updates computed by `compute_updates`. Modified in place: its
        `abnormal_tgt_only`, `abnormal_src_only` and `expected_src_only`
        collections are pruned of dialogues.
    gen: int
        Generation of annotations included in `doc_src` and the output.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue, except for the
        unmatched source dialogues that are left in
        `expected_src_only`. This is the same object as the `updates`
        argument.
    """
    # NOTE(review): both branches appear to assume that `doc_res` contains
    # at least one dialogue (`reduce` is called without an initial value,
    # and the `else` branch seeds `dlg2grp` with one entry) -- confirm that
    # callers guarantee this.
    if gen < 3:
        # source and result dialogues, ordered by span
        dlgs_src = sorted([x for x in doc_src.units
                           if x.type.lower() == 'dialogue'],
                          key=lambda y: y.span)
        dlgs_res = sorted([x for x in doc_res.units
                           if x.type.lower() == 'dialogue'],
                          key=lambda y: y.span)

        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        # shifted start of each target dialogue: the character right after
        # the start is shifted, then 1 is subtracted again
        dlgs_tgt_sbeg = np.array([
            shift_char(x.span.char_start + 1, updates) - 1
            for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg,
                                       side='right')
                       - 1)
        # adjusted start: start of the matching source dialogue
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array([shift_char(x.span.char_end - 1, updates) + 1
                                  for x in dlgs_res])
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        # NOTE(review): this indexing presumably relies on every shifted
        # end being <= the last source end, otherwise the index returned
        # by searchsorted would be out of bounds -- confirm
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur, beg_nxt) in enumerate(
                zip(tgt2src_end[:-1], tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
        # find source dialogues included in the shifted+expanded target
        # dialogues: union of the index ranges [beg, end] of source
        # dialogues matched by each target dialogue
        dlgs_src_matched = reduce(np.union1d,
                                  (np.arange(x_beg, x_end + 1)
                                   for x_beg, x_end
                                   in zip(tgt2src_beg, tgt2src_end)))
        dlgs_src_matched = set(dlgs_src_matched)

        # write the adjusted boundaries to the result dialogues (in place)
        for dlg_res, adj_start, adj_end in zip(
                dlgs_res, dlgs_tgt_abeg, dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`

        # remove all source and target dialogues from updates
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if (i in dlgs_src_matched
                and dlg_src in updates.expected_src_only):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)

    else:
        # situated version: we can rely on game turns

        # 1. get the identifier of the first and last turn of each game turn
        # in _src: these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array([x.features['Identifier']
                                  for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        # NOTE(review): `gen=3` is hard-coded here although this branch
        # handles any `gen >= 3` -- confirm this is intended
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        # a game turn ends right before the next one begins ; the last
        # game turn ends on the last turn
        gturn_idc_end = np.array(
            [i - 1 for i in gturn_idc[1:]] + [len(turns_src) - 1])
        # ... and finally
        # NOTE(review): these two arrays are not read again in this
        # function
        gturn_src_tid_beg = turns_src_tid[gturn_idc_beg]
        gturn_src_tid_end = turns_src_tid[gturn_idc_end]

        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array([x.features['Identifier']
                                  for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        # index (in turns_res) of the first and last turn of each dialogue
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(turns_res_end, dlgs_res_end,
                                          side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]

        # 3. map _res dialogues to _src game turns
        # the turn indices are recomputed here, this time relative to
        # turns_src: position of the first turn of _src with the same
        # identifier (`list.index` raises ValueError if there is none)
        dlgs_res_ti_beg = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg,
                                         side='right') - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res

        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class ;
        # dlg2grp[i] is the class index of the i-th _res dialogue
        dlg2grp = [0]
        for i, (gturn_end_cur, gturn_beg_nxt) in enumerate(zip(
                dlg2gturn_end[:-1], dlg2gturn_beg[1:])):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)

        # keep one dialogue for each class of dialogues ;
        # class indices in dlg2grp are non-decreasing, so groupby yields
        # exactly one group per class
        for k, g in itertools.groupby(enumerate(dlg2grp),
                                      key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in g]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                dlg_res = dlgs_res[i]
                doc_res.units.remove(dlg_res)

        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        gturns_matched = reduce(np.union1d,
                                (np.arange(x_beg, x_end + 1)
                                 for x_beg, x_end
                                 in zip(dlg2gturn_beg, dlg2gturn_end)))
        gturns_matched = set(gturns_matched)
        # each dialogue in doc_src is a game turn
        # NOTE(review): the loop below uses the position of a dialogue in
        # dlgs_src as a game turn index, which presumably relies on the
        # above one-to-one correspondence -- confirm
        dlgs_src =  sorted((x for x in doc_src.units if is_dialogue(x)),
                           key=lambda x: x.span)
        # remove all source and target dialogues from updates
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if (i in gturns_matched
                and dlg_src in updates.expected_src_only):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)

    return updates
Ejemplo n.º 5
0
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True,
                            strip_cdus=False, attach_len=False):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. NB: these checks are not implemented yet (see the
        FIXME placeholders), so this flag currently has no effect.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; it is silently ignored otherwise.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from
        the "units" stage.

    Raises
    ------
    ValueError
        If an annotation does not match the expectations hard-coded
        below: segment with features in 'discourse', preference with
        features, unit of unknown kind, schema with unexpected features
        or with relation members, relation from a stage other than
        'units' or 'discourse'.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'

    # features that a dialogue is allowed to have ; loop-invariant
    expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])

    # rows of the future dataframes, as lists of dicts
    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal' and
             annotator not in ('BRONZE', 'SILVER', 'GOLD')) or
            (sel_annotator != 'metal' and
             annotator != sel_annotator)):
            continue

        # UNITS
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate', None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                # turns are only collected from the 'discourse' stage
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError('Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    #
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(anno.features.keys()) ==
                            ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                if set(anno.features.keys()).issubset(expected_dlg_features):
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    # unexpected features: warn and keep the dialogue,
                    # without any of its features
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(x for x in sorted(anno.features.keys())
                                  if x not in expected_dlg_features))
                    warnings.warn(warn_msg)

                # dialogues are only collected from the 'discourse' stage
                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) ==
                        ['Correctness', 'Kind', 'Quantity', 'Status'])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')

        # SCHEMAS
        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate', None),
            }
            # assumption: no feature, except for the ones tolerated below
            if anno.features:
                # NB: compare a sorted *list* of keys, as for units above ;
                # under Python 3, `keys()` is a view that never compares
                # equal to a list, so a direct comparison would reject
                # every schema with features
                feat_names = sorted(anno.features.keys())
                if stage == 'units':
                    if feat_names == ['Operator']:
                        schm_dict.update({
                            'operator': anno.features['Operator'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    if feat_names == ['default']:
                        schm_dict.update({
                            'default': anno.features['default'],
                        })
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError('{}: schema with *features*'.format(
                            stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            # work on a copy, so that the original graph is left untouched
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): hard-coded check on one specific
                # annotation id, looks like debugging residue ; kept
                # as is to preserve behaviour
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier()
                )
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        # turns of the same doc and subdoc whose span encloses the segment
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg) &
                              (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; `.item()` below
        # raises ValueError otherwise
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # position of the first occurrence of the segment text in the
        # turn text (-1 if not found)
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
Ejemplo n.º 6
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each supported annotation yields one row, made of columns common to
    all annotations (id, doc, subdoc, stage, annotator, type) plus the
    columns returned by the feature function for its kind ; the latter
    take precedence if a column name appears in both.

    Parameters
    ----------
    corpus: dict(FileId, Document)
        Corpus, as a mapping from file identifiers to documents.

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'. Kinds
        with no annotation are absent from the result.
    """
    # annotation types that are deliberately left out:
    # each paragraph (normally) corresponds to a Turn so just ignore
    # them ; the situation is less clear-cut for 'Resource', 'Anaphora',
    # 'Several_resources'
    ignored_types = [
        'paragraph', 'Resource', 'Anaphora', 'Several_resources',
        'Preference'
    ]

    rows = {
        anno_type: list()
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            common_cols = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            # find the kind of the annotation and its specific columns
            if is_edu(anno):
                kind, feats = 'edu', edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                kind, feats = 'cdu', cdu_feats(anno)
            elif is_relation_instance(anno):
                kind, feats = 'rel', rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                kind, feats = 'dialogue', dlg_feats(anno)
            elif is_turn(anno):
                kind, feats = 'turn', turn_feats(anno)
            elif is_turn_star(anno):
                kind, feats = 'tstar', tstar_feats(anno)
            elif anno.type in ignored_types:
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge common and specific columns ; copy + update works the
            # same under Python 2 and 3, whereas adding the results of
            # `items()` fails under Python 3 (dict views do not support +)
            row = dict(common_cols)
            row.update(feats)
            rows[kind].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items() if row_list
    }

    return res
Ejemplo n.º 7
0
def shift_dialogues(doc_src, doc_res, updates, gen):
    """Transpose dialogue split from target to source document.

    Remove all dialogues from updates.

    Two strategies are implemented, selected by `gen`:

    * `gen < 3`: the boundaries of each dialogue of `doc_res` are
      mapped through `shift_char` and then snapped onto the character
      boundaries of the dialogues of `doc_src`;
    * otherwise: the boundaries are aligned on turns, matched between
      both documents through their 'Identifier' feature, and snapped
      onto the game turns located by `game_turns`. Dialogues of
      `doc_res` that overlap the same game turn are merged into the
      first of them, and the others are removed from `doc_res.units`.

    In both cases the spans of the dialogues of `doc_res` are modified
    in place, and `updates` is modified in place as well.

    Parameters
    ----------
    doc_src : Document
        Source (augmented) document.
    doc_res : Document
        Result document, originally a copy of doc_tgt with unshifted
        annotations. This function modifies `doc_res` by shifting the
        boundaries of its dialogues according to `updates`, and
        stretching the first and last dialogues so as to cover the
        same span as dialogues from `doc_src`.
    updates : set of updates
        Updates computed by `compute_updates`. The attributes read here
        are `abnormal_tgt_only`, `abnormal_src_only` and
        `expected_src_only`.
    gen: int
        Generation of annotations included in `doc_src` and the output.
        Only compared against 3 in this function.

    Returns
    -------
    updates : Updates
        Trimmed down set of `updates`: no more dialogue. This is the
        same object as the `updates` argument.
    """
    if gen < 3:
        # dialogues of both documents, ordered by span
        dlgs_src = sorted(
            [x for x in doc_src.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)
        dlgs_res = sorted(
            [x for x in doc_res.units if x.type.lower() == 'dialogue'],
            key=lambda y: y.span)

        # NEW 2016-06-15 adjust dialogue boundaries
        # for each target dialogue, find the smallest enclosing sequence of
        # source dialogues and map to it
        dlgs_src_beg = np.array([x.span.char_start for x in dlgs_src])
        # shifted start of each target dialogue ; the shift is computed on
        # the position right after the start (+ 1) then brought back (- 1)
        dlgs_tgt_sbeg = np.array(
            [shift_char(x.span.char_start + 1, updates) - 1 for x in dlgs_res])
        # NB: we need to broadcast (- 1) to get the source dialogue whose
        # start immediately precedes the start of the shifted target
        # dialogue
        tgt2src_beg = (
            np.searchsorted(dlgs_src_beg, dlgs_tgt_sbeg, side='right') - 1)
        # adjusted beginning: start of the matched source dialogue
        dlgs_tgt_abeg = dlgs_src_beg[tgt2src_beg]
        # map the shifted end of each target dialogue to the first larger end
        # of a source dialogue
        dlgs_src_end = np.array([x.span.char_end for x in dlgs_src])
        dlgs_tgt_send = np.array(
            [shift_char(x.span.char_end - 1, updates) + 1 for x in dlgs_res])
        tgt2src_end = np.searchsorted(dlgs_src_end, dlgs_tgt_send)
        # NOTE(review): searchsorted returns len(dlgs_src_end) when a
        # shifted end is greater than every source end, in which case
        # this indexing would raise IndexError -- confirm it cannot
        # happen with the documents passed by callers
        dlgs_tgt_aend = dlgs_src_end[tgt2src_end]
        # overwrite the adjusted beginning and end, when a game turn
        # overlaps with two different tgt dialogues ;
        # each overlap in the matching signals a split, in the linguistic
        # version, that happens in the middle of a game turn
        for i, (end_cur,
                beg_nxt) in enumerate(zip(tgt2src_end[:-1], tgt2src_beg[1:])):
            if beg_nxt <= end_cur:
                # linguistic turns from the same game turn, in different
                # target dialogues => use the shifted cut point from tgt
                dlgs_tgt_aend[i] = dlgs_tgt_send[i]
                dlgs_tgt_abeg[i + 1] = dlgs_tgt_send[i]
        # find source dialogues included in the shifted+expanded target
        # dialogues
        # NOTE(review): `reduce` is presumably functools.reduce ; without
        # an initial value it raises TypeError on an empty iterable, i.e.
        # when `doc_res` contains no dialogue -- confirm callers never
        # hit this case
        dlgs_src_matched = reduce(
            np.union1d, (np.arange(x_beg, x_end + 1)
                         for x_beg, x_end in zip(tgt2src_beg, tgt2src_end)))
        dlgs_src_matched = set(dlgs_src_matched)

        # write the adjusted boundaries back, in place, on the spans of
        # the dialogues of doc_res
        for dlg_res, adj_start, adj_end in zip(dlgs_res, dlgs_tgt_abeg,
                                               dlgs_tgt_aend):
            dlg_res.span.char_start = adj_start
            dlg_res.span.char_end = adj_end
            # alt: dlg_res.span = Span(start, end)
            #
            # optionally, update timestamp, id, span as in
            # `stac.edit.cmd.split_dialogue.{_actually_split,_set}`

        # remove all source and target dialogues from updates
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if ((i in dlgs_src_matched
                 and dlg_src in updates.expected_src_only)):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)

    else:
        # situated version: we can rely on game turns

        # 1. get the identifier of the first and last turn of each game turn
        # in _src: these turns and those in between must end up in the same
        # dialogue
        turns_src = sorted((x for x in doc_src.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_src_tid = np.array([x.features['Identifier'] for x in turns_src])
        turns_src_beg = np.array([x.span.char_start for x in turns_src])
        turns_src_end = np.array([x.span.char_end for x in turns_src])
        # * locate game turns (index of first and last turn)
        # NOTE(review): `gen=3` is hard-coded here although this branch
        # is taken for any gen >= 3 -- confirm this is intended
        gturn_idc = game_turns(doc_src, turns_src, gen=3)
        gturn_idc_beg = np.array(gturn_idc)
        # a game turn ends right before the next one begins ; the last
        # game turn ends on the last turn
        gturn_idc_end = np.array([i - 1 for i in gturn_idc[1:]] +
                                 [len(turns_src) - 1])
        # ... and finally
        # NOTE(review): these two arrays are not read again in this
        # function
        gturn_src_tid_beg = turns_src_tid[gturn_idc_beg]
        gturn_src_tid_end = turns_src_tid[gturn_idc_end]

        # 2. get the identifier of the first and last turn of each dialogue
        # in _res: these turns and those in between must end up in the same
        # dialogue
        turns_res = sorted((x for x in doc_res.units if is_turn(x)),
                           key=lambda x: x.span)
        turns_res_tid = np.array([x.features['Identifier'] for x in turns_res])
        turns_res_beg = np.array([x.span.char_start for x in turns_res])
        turns_res_end = np.array([x.span.char_end for x in turns_res])
        # align dialogue spans with turn spans
        dlgs_res = sorted((x for x in doc_res.units if is_dialogue(x)),
                          key=lambda x: x.span)
        dlgs_res_beg = np.array([x.span.char_start for x in dlgs_res])
        dlgs_res_end = np.array([x.span.char_end for x in dlgs_res])
        # index, in turns_res, of the first turn that starts at or after
        # the start of each dialogue, and of the last turn that ends at
        # or before its end
        dlgs_res_ti_beg = np.searchsorted(turns_res_beg, dlgs_res_beg)
        dlgs_res_ti_end = np.searchsorted(
            turns_res_end, dlgs_res_end, side='right') - 1
        # ... and finally
        dlgs_res_tid_beg = turns_res_tid[dlgs_res_ti_beg]
        dlgs_res_tid_end = turns_res_tid[dlgs_res_ti_end]

        # 3. map _res dialogues to _src game turns
        # from here on, dlgs_res_ti_{beg,end} are re-bound to indices in
        # turns_src (they were indices in turns_res above) ;
        # list.index raises ValueError if a turn identifier of _res does
        # not exist in _src, and returns the first match otherwise
        dlgs_res_ti_beg = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_beg])
        dlgs_res_ti_end = np.array(
            [list(turns_src_tid).index(x) for x in dlgs_res_tid_end])
        # * align the beginning (resp. end) indices of game turns and _res
        # dialogues
        dlg2gturn_beg = (
            np.searchsorted(gturn_idc_beg, dlgs_res_ti_beg, side='right') - 1)
        dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_res_ti_end)
        # * turn indices of the adjusted beginning and end of the _res
        # dialogues
        # initialize along the boundaries of game turns
        dlg_res_src_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
        dlg_res_src_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

        # 4. make dialogue boundaries coincide with game turn boundaries,
        # which occasionally implies merging dialogues from _res

        # * compute a partition on dialogues such that any pair of dialogues
        # overlapping a given game turn are in the same class
        # dlg2grp[i] is the class of the i-th dialogue of dlgs_res ;
        # NOTE(review): it is seeded with [0] even if dlgs_res is empty
        dlg2grp = [0]
        for i, (gturn_end_cur, gturn_beg_nxt) in enumerate(
                zip(dlg2gturn_end[:-1], dlg2gturn_beg[1:])):
            if gturn_beg_nxt <= gturn_end_cur:
                # two _res dialogues overlap a single game turn:
                # put in the same class (to merge dialogues)
                dlg2grp.append(dlg2grp[-1])
            else:
                dlg2grp.append(dlg2grp[-1] + 1)

        # keep one dialogue for each class of dialogues
        # (class ids are non-decreasing, so groupby yields one group per
        # class)
        for k, g in itertools.groupby(enumerate(dlg2grp), key=lambda x: x[1]):
            dlg_idc_merged = [x[0] for x in g]
            # adjust boundaries of the first dialogue of the group
            # index of first and last dialogues
            di_beg = dlg_idc_merged[0]
            di_end = dlg_idc_merged[-1]
            # index of first and last turns of these dialogues
            ti_beg = dlg_res_src_abeg[di_beg]
            ti_end = dlg_res_src_aend[di_end]
            # keep first dialogue, update its features to include those
            # from the other dialogues in the same class
            new_dlg = dlgs_res[di_beg]
            # the new span is expressed in character offsets of doc_src
            new_dlg.span.char_start = turns_src_beg[ti_beg]
            new_dlg.span.char_end = turns_src_end[ti_end]
            dlgs_res_merged = [dlgs_res[i] for i in dlg_idc_merged]
            for feat in ['Trades', 'Gets', 'Dice_rolling']:
                new_dlg.features[feat] = _concatenate_features(
                    dlgs_res_merged, feat)
            # remove merged dialogues [1:] from doc_res
            for i in dlg_idc_merged[1:]:
                dlg_res = dlgs_res[i]
                doc_res.units.remove(dlg_res)

        # transfer each unmatched (non-overlapping) game turn as a dialogue
        # (which already exists in doc_src)
        # NOTE(review): as in the other branch, `reduce` without an
        # initial value raises TypeError if there is no dialogue in
        # doc_res
        gturns_matched = reduce(
            np.union1d,
            (np.arange(x_beg, x_end + 1)
             for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end)))
        gturns_matched = set(gturns_matched)
        # each dialogue in doc_src is a game turn
        dlgs_src = sorted((x for x in doc_src.units if is_dialogue(x)),
                          key=lambda x: x.span)
        # remove all source and target dialogues from updates
        # (dlgs_res still contains the dialogues merged away above)
        for dlg_res in dlgs_res:
            if dlg_res in updates.abnormal_tgt_only:
                updates.abnormal_tgt_only.remove(dlg_res)
        for i, dlg_src in enumerate(dlgs_src):
            if dlg_src in updates.abnormal_src_only:
                updates.abnormal_src_only.remove(dlg_src)
            if ((i in gturns_matched
                 and dlg_src in updates.expected_src_only)):
                # remove matched source dialogues, leave the unmatched
                # ones in expected_src_only, so that they are added later
                # to the woven document
                updates.expected_src_only.remove(dlg_src)

    return updates
Ejemplo n.º 8
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each supported annotation of each document becomes one row, made of
    columns common to all annotations (identifier, document, subdocument,
    stage, annotator, type) plus the features computed by the extractor
    for its kind. Feature columns take precedence over common columns
    if they share a name.

    Annotations of type 'paragraph', 'Resource', 'Anaphora',
    'Several_resources' and 'Preference' are silently ignored ; any
    other unsupported annotation is reported on stdout and skipped.

    Parameters
    ----------
    corpus: dict
        Mapping from file identifiers (with attributes `doc`, `subdoc`,
        `stage`, `annotator`) to documents (with an `annotations()`
        method).

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'. Kinds
        with no row are left out.
    """
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}
    # each paragraph (normally) corresponds to a Turn so just ignore
    # them ; the situation is less clear-cut for 'Resource',
    # 'Anaphora', 'Several_resources'
    ignored_types = ('paragraph', 'Resource', 'Anaphora',
                     'Several_resources', 'Preference')

    for file_id, doc in corpus.items():
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            # columns common to all kinds of annotations
            row = {
                'anno_id': anno.identifier(),
                'doc': file_id.doc,
                'subdoc': file_id.subdoc,
                'stage': file_id.stage,
                'annotator': file_id.annotator,
                'type': anno.type,  # ? maybe not
            }
            # the order of the tests is significant: keep it
            if is_edu(anno):
                anno_type = 'edu'
                feats = edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                anno_type = 'cdu'
                feats = cdu_feats(anno)
            elif is_relation_instance(anno):
                anno_type = 'rel'
                feats = rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                anno_type = 'dialogue'
                feats = dlg_feats(anno)
            elif is_turn(anno):
                anno_type = 'turn'
                feats = turn_feats(anno)
            elif is_turn_star(anno):
                anno_type = 'tstar'
                feats = tstar_feats(anno)
            elif anno.type in ignored_types:
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge with update() rather than `items() + items()`: dict
            # views cannot be concatenated with `+` on Python 3
            row.update(feats)
            rows[anno_type].append(row)

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res
Ejemplo n.º 9
0
def read_game_as_dataframes(game_folder, sel_annotator=None, thorough=True):
    """Read an annotated game as dataframes.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.

    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).

    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'discourse'
        actually do. These checks are not implemented yet (see the
        FIXME markers), so this parameter currently has no effect.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        relations, resources, preferences.

    Raises
    ------
    ValueError
        If a discourse segment or a preference has features, if a unit
        is of an unknown kind, if a schema has relation members, or if
        a schema from the 'units' (resp. 'discourse') stage has features
        other than exactly 'Operator' (resp. 'default').
    AssertionError
        If an EDU from the 'units' stage or a resource does not have
        exactly the expected features.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'
    metal_annotators = ('BRONZE', 'SILVER', 'GOLD')
    expected_dlg_features = set(['Dice_rolling', 'Gets', 'Trades'])

    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_rels = []  # relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in metal_annotators)
                or (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue
        # process annotations in doc
        for anno in doc_val.units:
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                # turn
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(
                        anno.features.keys()) == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                unexpected_feats = sorted(
                    set(anno.features.keys()) - expected_dlg_features)
                if not unexpected_feats:
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling',
                                                        None),
                    })
                else:
                    # the dialogue is kept, without its feature columns
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(), ', '.join(unexpected_feats))
                    warnings.warn(warn_msg)

                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')

        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature, except for the single feature
            # tolerated in each of the stages below ; features of schemas
            # from other stages are ignored
            if anno.features:
                if stage == 'units':
                    feat_name, col_name = 'Operator', 'operator'
                elif stage == 'discourse':
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    feat_name, col_name = 'default', 'default'
                else:
                    feat_name, col_name = None, None
                if feat_name is not None:
                    # materialize the keys as a list: on Python 3, a
                    # dict view never compares equal to a list
                    if list(anno.features.keys()) == [feat_name]:
                        schm_dict[col_name] = anno.features[feat_name]
                    else:
                        print(anno.origin)
                        print(anno.__dict__)
                        print(anno.features)
                        raise ValueError(
                            '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            rel_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': anno.source.identifier(),
                'target': anno.target.identifier(),
            })
            df_rels.append(rel_dict)

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_rels = pd.DataFrame(df_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU).

        The enclosing turn is looked up in `df_turns` by document,
        subdocument and span inclusion.
        """
        doc = seg['doc']
        subdoc = seg['subdoc']
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg)
                              & (seg_end <= df_turns['span_end']) &
                              (doc == df_turns['doc']) &
                              (subdoc == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; `.item()` fails
        # otherwise
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed
        turn_text = cand_turns['text'].item()
        seg_text = seg['text']
        # position of the first occurrence of the segment text in the
        # turn text (-1 if it does not occur)
        turn_span_beg = turn_text.find(seg_text)
        turn_span_end = turn_span_beg + len(seg_text)
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_rels, df_res, df_pref)
Ejemplo n.º 10
0
def read_game_as_dataframes(game_folder,
                            sel_annotator=None,
                            thorough=True,
                            strip_cdus=False,
                            attach_len=False):
    """Read an annotated game as dataframes.

    Every document of the game (one per subdoc, stage and annotator) is
    traversed; its units, schemas and relations are turned into rows
    that are finally gathered into one DataFrame per kind of annotation.

    Parameters
    ----------
    game_folder : path
        Path to the game folder.
    sel_annotator : str, optional
        Identifier of the annotator whose version we want. If `None`,
        the existing metal annotator will be used (BRONZE|SILVER|GOLD).
    thorough : boolean, defaults to True
        If True, check that annotations in 'units' and 'unannotated'
        that are expected to have a strict equivalent in 'dialogue'
        actually do. NB: these checks are not implemented yet (see the
        FIXME placeholders), so this flag currently has no effect.
    strip_cdus : boolean, defaults to False
        If True, strip CDUs with the "head" strategy and sloppy=True.
    attach_len : boolean, defaults to False
        If True, compute attachment length. This requires
        strip_cdus=True ; otherwise a warning is emitted and the
        attachment length is not computed.

    Returns
    -------
    dfs : tuple of DataFrame
        DataFrames for the annotated game, in this order: turns,
        dialogues, segments, dialogue acts, schemas, schema members,
        discourse relations, resources, preferences, relations from
        the 'units' stage.

    Raises
    ------
    ValueError
        If an annotation does not match the assumptions made here:
        unknown type of unit, discourse segment or preference with
        features, schema with unexpected features or with relation
        members, relation from a stage other than 'units' and
        'discourse', segment that is not included in exactly one turn.
    """
    if sel_annotator is None:
        sel_annotator = 'metal'
    if attach_len and not strip_cdus:
        # the attachment length is only computed on stripped CDUs ; tell
        # the caller instead of silently ignoring the request
        warnings.warn('attach_len=True requires strip_cdus=True: '
                      'attachment length will not be computed')

    # rows are accumulated in lists of dicts, converted to DataFrames
    # once the whole game has been read
    df_turns = []  # turns
    df_segs = []  # segments: EDUs, EEUs
    df_dlgs = []  # dialogues
    df_schms = []  # schemas: CDUs
    df_schm_mbrs = []  # schema members
    df_disc_rels = []  # discourse relations
    df_acts = []  # dialogue acts
    df_res = []  # resources
    df_pref = []  # preferences
    df_unit_rels = []  # relations from the "units" stage (anaphora)

    # features a dialogue is allowed to carry
    expected_dlg_features = {'Dice_rolling', 'Gets', 'Trades'}

    print(game_folder)  # DEBUG
    game_upfolder, game_name = os.path.split(game_folder)
    game_corpus = StacReader(game_upfolder).slurp(doc_glob=game_name)
    # give integer indices to segments, and EDUs in particular ; the
    # counters run across all the documents of the game
    seg_idx = 0
    eeu_idx = 0
    edu_idx = 0
    for doc_key, doc_val in sorted(game_corpus.items()):
        doc = doc_key.doc
        subdoc = doc_key.subdoc
        stage = doc_key.stage
        annotator = doc_key.annotator
        # skip docs not from a selected annotator
        if ((sel_annotator == 'metal'
             and annotator not in ('BRONZE', 'SILVER', 'GOLD'))
                or (sel_annotator != 'metal' and annotator != sel_annotator)):
            continue

        # UNITS
        for anno in sorted(doc_val.units, key=lambda x: x.span):
            # attributes common to all units
            unit_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type, span, text
                'type': anno.type,
                'span_beg': anno.span.char_start,
                'span_end': anno.span.char_end,
                'text': doc_val.text(span=anno.span),
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional?
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }

            # fields specific to each type of unit
            if is_paragraph(anno):
                # paragraph: ignore? one per turn
                pass
            elif is_turn(anno):
                unit_dict.update({
                    # features
                    'timestamp': anno.features['Timestamp'],
                    'comments': anno.features['Comments'],
                    'developments': anno.features['Developments'],
                    'turn_id': anno.features['Identifier'],
                    'emitter': anno.features['Emitter'],
                    'resources': anno.features['Resources'],
                })
                # turns are only collected from the 'discourse' stage
                if stage == 'discourse':
                    df_turns.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_edu(anno):
                # segment: EDU or EEU
                if stage == 'discourse':
                    if anno.features:
                        raise ValueError(
                            'Wow, a discourse segment has *features*')
                    # assign index among segments, across the whole doc
                    unit_dict['seg_idx'] = seg_idx
                    seg_idx += 1
                    if anno.type == 'NonplayerSegment':  # EEU
                        unit_dict['eeu_idx'] = eeu_idx
                        eeu_idx += 1
                    else:  # EDU
                        unit_dict['edu_idx'] = edu_idx
                        edu_idx += 1
                    df_segs.append(unit_dict)
                elif stage == 'units':
                    # each entry (should) correspond to an entry in df_segs
                    act_dict = {
                        'global_id': anno.identifier(),  # foreign key
                        'surface_act': anno.features['Surface_act'],
                        'addressee': anno.features['Addressee'],
                    }
                    assert (sorted(
                        anno.features.keys()) == ['Addressee', 'Surface_act'])
                    df_acts.append(act_dict)
                if thorough and stage in ('units', 'unannotated'):
                    # maybe metadata in 'units' has changed? eg. last
                    # modification date, last modifier
                    pass  # FIXME check existence (exact duplicate)
            elif is_dialogue(anno):
                unexpected_feats = set(anno.features) - expected_dlg_features
                if not unexpected_feats:
                    unit_dict.update({
                        # features
                        'gets': anno.features.get('Gets', None),
                        'trades': anno.features.get('Trades', None),
                        'dice_rolls': anno.features.get('Dice_rolling', None),
                    })
                else:
                    # the dialogue is kept, but none of its features is
                    # stored
                    warn_msg = 'Dialogue {}: unexpected features {}'.format(
                        anno.identifier(),
                        ', '.join(sorted(unexpected_feats)))
                    warnings.warn(warn_msg)

                # dialogues are only collected from the 'discourse' stage
                if stage == 'discourse':
                    df_dlgs.append(unit_dict)
                elif thorough:
                    pass  # FIXME check existence (exact duplicate)
            elif is_resource(anno):
                unit_dict.update({
                    # features
                    'status': anno.features['Status'],
                    'kind': anno.features['Kind'],
                    'correctness': anno.features['Correctness'],
                    'quantity': anno.features['Quantity'],
                })
                assert (sorted(anno.features.keys()) == [
                    'Correctness', 'Kind', 'Quantity', 'Status'
                ])
                df_res.append(unit_dict)
            elif is_preference(anno):
                if anno.features:
                    print(anno.__dict__)
                    raise ValueError('Preference with features {}'.format(
                        anno.features))
                df_pref.append(unit_dict)
            else:
                print(anno.__dict__)
                raise ValueError('what unit is this?')

        # SCHEMAS
        for anno in doc_val.schemas:
            # in 'discourse': CDUs ;
            # in 'units': combinations of resources (OR, AND)
            schm_dict = {
                # identification
                'global_id': anno.identifier(),
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
                # optional? metadata
                'last_modifier': anno.metadata.get('lastModifier', None),
                'last_modif_date': anno.metadata.get('lastModificationDate',
                                                     None),
            }
            # assumption: no feature, except 'Operator' in 'units' and
            # 'default' in 'discourse' ; features of schemas from other
            # stages are ignored
            if anno.features and stage in ('units', 'discourse'):
                # NB: build a list of the feature names, because
                # comparing the result of keys() to a list is always
                # False in python 3 (keys() returns a view)
                feat_names = list(anno.features)
                if stage == 'units' and feat_names == ['Operator']:
                    schm_dict['operator'] = anno.features['Operator']
                elif stage == 'discourse' and feat_names == ['default']:
                    # tolerate 'default': 'default' for the moment, but
                    # should probably cleaned out
                    schm_dict['default'] = anno.features['default']
                else:
                    print(anno.origin)
                    print(anno.__dict__)
                    print(anno.features)
                    raise ValueError(
                        '{}: schema with *features*'.format(stage))
            df_schms.append(schm_dict)
            # associate to this schema each of its members ; assumptions:
            # - members should be units or schemas (no relation)
            if anno.relations:
                raise ValueError('Wow, a schema with *relation members*')
            for member in anno.members:
                member_dict = {
                    'member_id': member.identifier(),
                    'schema_id': anno.identifier(),
                }
                df_schm_mbrs.append(member_dict)
            # TODO post-verification: check that all members do exist
            # (should be useless as stac-check should catch it)

        # RELATIONS
        # * rewrite endpoints of relations if strip_cdus
        if strip_cdus:
            endpts = dict()  # map relation ids to (src_id, tgt_id)
            dgr = Graph.from_doc(game_corpus, doc_key)
            # strip CDUs on a copy of the graph
            dgraph = copy.deepcopy(dgr)
            dgraph.strip_cdus(sloppy=True, mode='head')
            for edge in dgraph.relations():
                # NOTE(review): this trap on a hard-coded annotation id
                # looks like a debugging leftover ; confirm then remove
                if "asoubeille_1414085458642" in edge:
                    print('Wop', edge)
                    raise ValueError('gni')
                links = dgraph.links(edge)
                # get the identifiers of the relation and its endpoints
                # to replace CDU ids with segment indices
                anno_rel = dgraph.annotation(edge)
                # as of 2017-06-24, anno_rel has no origin (why?) at
                # this point
                anno_rel.origin = doc_key  # temporary(?) fix
                #
                anno_src = dgraph.annotation(links[0])
                anno_tgt = dgraph.annotation(links[1])
                gid_rel = anno_rel.identifier()
                if gid_rel.endswith('_0'):
                    # strip_cdus appends an integer to each copy of
                    # the relation ; with mode="head", we only expect
                    # one such copy per relation so "_0" should be a
                    # sufficient match, which we can cut off for the
                    # mapping
                    gid_rel = gid_rel[:-2]
                gid_src = anno_src.identifier()
                gid_tgt = anno_tgt.identifier()
                endpts[gid_rel] = (gid_src, gid_tgt)
        # * process relations
        for anno in doc_val.relations:
            # attributes common to all(?) types of annotations
            # * global ids of the relation and its endpoints
            gid_rel = anno.identifier()
            gid_src = anno.source.identifier()
            gid_tgt = anno.target.identifier()
            # * build dict
            rel_dict = {
                # identification
                'global_id': gid_rel,
                'doc': doc,
                'subdoc': subdoc,
                'stage': stage,
                'annotator': annotator,
                # type
                'type': anno.type,
                # metadata
                'last_modifier': anno.metadata['lastModifier'],
                'last_modif_date': anno.metadata['lastModificationDate'],
                'creation_date': anno.metadata['creation-date'],
                'author': anno.metadata['author'],
            }
            # attributes specific to relations
            if 'Argument_scope' not in anno.features:
                # required feature
                w_msg = '{}: relation {} has no Argument_scope'.format(
                    str(doc_key), anno.identifier())
                warnings.warn(w_msg)
            # if strip_cdus, replace endpoints of *discourse* relations
            # with segment ids
            if strip_cdus and is_relation_instance(anno):
                gid_src, gid_tgt = endpts[gid_rel]

            rel_dict.update({
                # features
                'arg_scope': anno.features.get('Argument_scope', None),  # req
                'comments': anno.features.get('Comments', None),  # opt
                # endpoints
                'source': gid_src,
                'target': gid_tgt,
            })
            if stage == 'discourse':
                df_disc_rels.append(rel_dict)
            elif stage == 'units':
                df_unit_rels.append(rel_dict)
            else:
                raise ValueError(
                    "relation from stage not in {'units', 'discourse'}")

    # create dataframes
    df_turns = pd.DataFrame(df_turns, columns=TURN_COLS)
    df_dlgs = pd.DataFrame(df_dlgs, columns=DLG_COLS)
    df_segs = pd.DataFrame(df_segs, columns=SEG_COLS)
    df_acts = pd.DataFrame(df_acts, columns=ACT_COLS)
    df_schms = pd.DataFrame(df_schms, columns=SCHM_COLS)
    df_schm_mbrs = pd.DataFrame(df_schm_mbrs, columns=SCHM_MBRS_COLS)
    df_disc_rels = pd.DataFrame(df_disc_rels, columns=REL_COLS)
    df_unit_rels = pd.DataFrame(df_unit_rels, columns=REL_COLS)
    df_res = pd.DataFrame(df_res, columns=RES_COLS)
    df_pref = pd.DataFrame(df_pref, columns=PREF_COLS)

    # add columns computed from other dataframes
    # * for segments: retrieve the turn_id and the char positions of the
    # beg and end of the segment in the turn text
    def get_seg_turn_cols(seg):
        """Helper to retrieve turn info for a segment (EDU, EEU)."""
        seg_beg = seg['span_beg']
        seg_end = seg['span_end']
        # turns of the same (doc, subdoc) whose span includes the segment
        cand_turns = df_turns[(df_turns['span_beg'] <= seg_beg)
                              & (seg_end <= df_turns['span_end'])
                              & (seg['doc'] == df_turns['doc'])
                              & (seg['subdoc'] == df_turns['subdoc'])]
        # NB: cand_turns should contain a unique turn ; item() raises a
        # ValueError otherwise
        turn_beg = cand_turns['span_beg'].item()
        # compute the beg and end (char) positions of the segment in the turn
        # so we can match between the situated and linguistic versions when
        # the segmentation has changed ; the position is computed from the
        # spans rather than by searching for the text of the segment in the
        # text of the turn, which would return the first occurrence even
        # if the segment is a later one
        turn_span_beg = seg_beg - turn_beg
        turn_span_end = turn_span_beg + len(seg['text'])
        turn_dict = {
            'turn_id': cand_turns['turn_id'].item(),
            'turn_span_beg': turn_span_beg,
            'turn_span_end': turn_span_end,
        }
        return pd.Series(turn_dict)

    seg_turn_cols = df_segs.apply(get_seg_turn_cols, axis=1)
    df_segs = pd.concat([df_segs, seg_turn_cols], axis=1)
    # * length of attachments
    # 2017-06-29 restricted to *discourse* relations, for the time being
    if strip_cdus and attach_len:
        df_disc_rels = compute_rel_attributes(df_segs, df_disc_rels)

    return (df_turns, df_dlgs, df_segs, df_acts, df_schms, df_schm_mbrs,
            df_disc_rels, df_res, df_pref, df_unit_rels)
Ejemplo n.º 11
0
def _fix_dialogue_boundaries(tcache, doc_ling, doc_situ):
    """Do the job.

    All the dialogues of `doc_situ` are removed and replaced with:
    (a) one dialogue per group of `doc_ling` dialogues that overlap a
    common game turn, whose span is extended to the boundaries of the
    game turns it overlaps, and (b) one new dialogue for each game turn
    that overlaps no dialogue from `doc_ling`.

    Parameters
    ----------
    tcache: TimestampCache
        Timestamp cache to generate unit identifiers for new dialogues.
    doc_ling: GlozzDocument
        Linguistic version of the game.
    doc_situ: GlozzDocument
        Situated version of the game. Its list of units is modified in
        place.

    Returns
    -------
    doc_situ: GlozzDocument
        Fixed version of doc_situ (the same object that was passed in).

    Raises
    ------
    ValueError
        If the first or last turn of a dialogue from `doc_ling` cannot
        be found among the turns of `doc_situ`.
    IndexError
        If a new dialogue needs to be created for an unmatched game
        turn, but `doc_situ` initially contains no dialogue to use as a
        template.
    """
    doc_key = doc_situ.origin

    # 1. get the identifier of the first and last turn of each game turn
    # in _situ: these turns and those in between must end up in the same
    # dialogue
    turns_situ = sorted((x for x in doc_situ.units if is_turn(x)),
                        key=lambda x: x.span)
    turns_situ_tid = np.array([x.features['Identifier'] for x in turns_situ])
    turns_situ_beg = np.array([x.span.char_start for x in turns_situ])
    turns_situ_end = np.array([x.span.char_end for x in turns_situ])
    # * locate game turns (index of first and last turn)
    gturn_idc = game_turns(doc_situ, turns_situ, gen=3)
    gturn_idc_beg = np.array(gturn_idc)
    # a game turn ends right before the next one begins ; the last one
    # ends on the last turn
    gturn_idc_end = np.array(
        [i - 1 for i in gturn_idc[1:]] + [len(turns_situ) - 1])

    # 2. get the identifier of the first and last turn of each dialogue in
    # _ling: these turns and those in between must end up in the same
    # dialogue
    turns_ling = sorted((x for x in doc_ling.units if is_turn(x)),
                        key=lambda x: x.span)
    # DIRTY special processing for pilot02_01
    if doc_key.doc == 'pilot02' and doc_key.subdoc == '01':
        # ignore turns 26-27 that were moved down from _01 to _02
        turns_ling = turns_ling[:-2]
    turns_ling_tid = np.array([x.features['Identifier'] for x in turns_ling])
    turns_ling_beg = np.array([x.span.char_start for x in turns_ling])
    turns_ling_end = np.array([x.span.char_end for x in turns_ling])
    # align dialogue spans with turn spans
    dlgs_ling = sorted((x for x in doc_ling.units if is_dialogue(x)),
                       key=lambda x: x.span)
    # DIRTY
    if doc_key.doc == 'pilot02' and doc_key.subdoc == '01':
        # turns 26-27 are in the last dialogue in _01, in _ling
        dlgs_ling = dlgs_ling[:-1]
    dlgs_ling_beg = np.array([x.span.char_start for x in dlgs_ling])
    dlgs_ling_end = np.array([x.span.char_end for x in dlgs_ling])
    # index of the first turn that begins at or after the beginning of
    # the dialogue, resp. of the last turn that ends at or before its end
    dlgs_ling_ti_beg = np.searchsorted(turns_ling_beg, dlgs_ling_beg)
    dlgs_ling_ti_end = np.searchsorted(turns_ling_end, dlgs_ling_end,
                                       side='right') - 1
    # ... and finally
    dlgs_ling_tid_beg = turns_ling_tid[dlgs_ling_ti_beg]
    dlgs_ling_tid_end = turns_ling_tid[dlgs_ling_ti_end]

    # 3. map _ling dialogues to _situ game turns
    # * locate the first and last turn of each _ling dialogue in the
    # list of turns in _situ
    # NB: we don't need indices in the list of turns from _ling anymore
    # hence it is safe to overwrite dlgs_ling_ti_{beg,end}
    # the list of turn identifiers is built once for all the lookups
    turns_situ_tid_list = list(turns_situ_tid)
    dlgs_ling_ti_beg = np.array(
        [turns_situ_tid_list.index(x) for x in dlgs_ling_tid_beg])
    dlgs_ling_ti_end = np.array(
        [turns_situ_tid_list.index(x) for x in dlgs_ling_tid_end])
    # * align the beginning (resp. end) indices of game turns and _ling
    # dialogues
    dlg2gturn_beg = (np.searchsorted(gturn_idc_beg, dlgs_ling_ti_beg,
                                     side='right') - 1)
    dlg2gturn_end = np.searchsorted(gturn_idc_end, dlgs_ling_ti_end)
    # * turn indices of the adjusted beginning and end of the _ling
    # dialogues
    # initialize along the boundaries of game turns
    dlg_ling_situ_abeg = [gturn_idc_beg[i] for i in dlg2gturn_beg]
    dlg_ling_situ_aend = [gturn_idc_end[i] for i in dlg2gturn_end]

    # 4. make dialogue boundaries coincide with game turn boundaries,
    # which occasionally implies merging dialogues from _ling

    # * compute a partition on dialogues such that any pair of
    # dialogues overlapping a given game turn are in the same
    # class ; there is no class if there is no dialogue in _ling
    dlg2grp = [0] if dlgs_ling else []
    for gturn_end_cur, gturn_beg_nxt in zip(dlg2gturn_end[:-1],
                                            dlg2gturn_beg[1:]):
        if gturn_beg_nxt <= gturn_end_cur:
            # two _ling dialogues overlap a single game turn:
            # put in the same class (to merge dialogues)
            dlg2grp.append(dlg2grp[-1])
        else:
            dlg2grp.append(dlg2grp[-1] + 1)

    # remove all dialogues from the units in doc_situ,
    # they will be replaced with (hopefully) clean ones
    dlgs_situ = sorted((x for x in doc_situ.units if is_dialogue(x)),
                       key=lambda x: x.span)
    for dlg_situ in dlgs_situ:
        doc_situ.units.remove(dlg_situ)

    # create one dialogue for each class of dialogues ; classes are
    # numbered in increasing order in dlg2grp, so groupby puts all the
    # dialogues of a class in the same group
    for _, grp in itertools.groupby(enumerate(dlg2grp),
                                    key=lambda x: x[1]):
        dlg_idc_merged = [x[0] for x in grp]
        # adjust boundaries of the first dialogue of the group
        # index of first and last dialogues
        di_beg = dlg_idc_merged[0]
        di_end = dlg_idc_merged[-1]
        # index of first and last turns of these dialogues
        ti_beg = dlg_ling_situ_abeg[di_beg]
        ti_end = dlg_ling_situ_aend[di_end]
        # create dialogue, use the 1st _ling dialogue as basis then
        # customize
        dlg0 = dlgs_ling[di_beg]
        new_dlg = copy.deepcopy(dlg0)
        new_dlg.origin = doc_key
        new_dlg.span.char_start = turns_situ_beg[ti_beg]
        new_dlg.span.char_end = turns_situ_end[ti_end]
        dlgs_ling_merged = [dlgs_ling[i] for i in dlg_idc_merged]
        for feat in ['Trades', 'Gets', 'Dice_rolling']:
            new_dlg.features[feat] = _concatenate_features(
                dlgs_ling_merged, feat)
        # add the new dialogue to doc_situ
        doc_situ.units.append(new_dlg)

    # create a new dialogue for each unmatched (non-overlapping) game
    # turn
    # * indices of the game turns that overlap a dialogue from _ling
    gturns_matched = set()
    for x_beg, x_end in zip(dlg2gturn_beg, dlg2gturn_end):
        gturns_matched.update(range(x_beg, x_end + 1))
    # * one new dialogue for each of the other game turns
    for i, (gturn_idx_beg, gturn_idx_end) in enumerate(zip(
            gturn_idc_beg, gturn_idc_end)):
        if i not in gturns_matched:
            new_dlg_span = Span(turns_situ_beg[gturn_idx_beg],
                                turns_situ_end[gturn_idx_end])
            # UGLY this works just like split_dialogue:
            # create a new dialogue by copying an existing dialogue,
            # re-assign it an annotation id and span using a timestamp
            # cache, then erase all features
            new_dlg = copy.deepcopy(dlgs_situ[0])
            _set(tcache, new_dlg_span, new_dlg)
            new_dlg.features = {}
            # ... "et voila": add this dialogue to the document
            doc_situ.units.append(new_dlg)

    # TODO restore dialogue features from the game events?
    return doc_situ