Ejemplo n.º 1
0
def cdu_feats(anno):
    """Get CDU features that are not immediate.

    The CDU is walked recursively: every member that is itself a CDU is
    expanded in turn, so counts cover the whole embedded structure.

    Parameters
    ----------
    anno: Schema
        The schema that codes this CDU in the glozz format

    Returns
    -------
    res: dict(string, int)
        Features on this CDU, currently
        'members' (number of immediate members of this CDU),
        'members_cdu' (number of CDUs immediately embedded in this CDU),
        'spanned_cdus' (total number of CDUs recursively embedded in this
        CDU),
        'spanned_edus' (total number of EDU members met while recursively
        expanding this CDU),
        'depth' (maximal degree of CDU nesting in this CDU, 0 if it
        embeds no CDU).

    Raises
    ------
    ValueError
        If a member, at any nesting level, is neither an EDU nor a CDU.
    """
    nb_members = len(anno.members)
    nb_cdus_imm = sum(1 for m in anno.members if is_cdu(m))

    nb_edus_tot = 0
    nb_cdus_tot = 0
    max_lvl = 0

    # explicit stack of (nesting level, CDU) pairs still to be expanded;
    # the CDU itself sits at level 0, its direct members at level 1
    cdus_to_expand = [(0, anno)]
    while cdus_to_expand:
        lvl, cur_cdu = cdus_to_expand.pop()
        mem_lvl = lvl + 1
        for member in cur_cdu.members:
            if is_edu(member):
                nb_edus_tot += 1
            elif is_cdu(member):
                nb_cdus_tot += 1
                max_lvl = max(max_lvl, mem_lvl)
                cdus_to_expand.append((mem_lvl, member))
            else:
                raise ValueError('Unexpected type for a CDU member')

    # TODO new features:
    # * nb_gaps: CDUs spans can be discontinuous
    # * gap_max_len: max len of a gap (in #EDUs)
    # * over_nb_turns: nb of turns this CDU (partly) spans over
    # * over_nb_tstars: nb of tstars this CDU (partly) spans over

    res = {
        'members': nb_members,
        'members_cdu': nb_cdus_imm,
        'spanned_cdus': nb_cdus_tot,
        'spanned_edus': nb_edus_tot,
        'depth': max_lvl,
    }

    return res
Ejemplo n.º 2
0
def cdu_feats(anno):
    """Get CDU features that are not immediate.

    The CDU is walked recursively: every member that is itself a CDU is
    expanded in turn, so counts cover the whole embedded structure.

    Parameters
    ----------
    anno: Schema
        The schema that codes this CDU in the glozz format

    Returns
    -------
    res: dict(string, int)
        Features on this CDU, currently
        'members' (number of immediate members of this CDU),
        'members_cdu' (number of CDUs immediately embedded in this CDU),
        'spanned_cdus' (total number of CDUs recursively embedded in this
        CDU),
        'spanned_edus' (total number of EDU members met while recursively
        expanding this CDU),
        'depth' (maximal degree of CDU nesting in this CDU, 0 if it
        embeds no CDU).

    Raises
    ------
    ValueError
        If a member, at any nesting level, is neither an EDU nor a CDU.
    """
    nb_members = len(anno.members)
    nb_cdus_imm = sum(1 for m in anno.members if is_cdu(m))

    nb_edus_tot = 0
    nb_cdus_tot = 0
    max_lvl = 0

    # explicit stack of (nesting level, CDU) pairs still to be expanded;
    # the CDU itself sits at level 0, its direct members at level 1
    cdus_to_expand = [(0, anno)]
    while cdus_to_expand:
        lvl, cur_cdu = cdus_to_expand.pop()
        mem_lvl = lvl + 1
        for member in cur_cdu.members:
            if is_edu(member):
                nb_edus_tot += 1
            elif is_cdu(member):
                nb_cdus_tot += 1
                max_lvl = max(max_lvl, mem_lvl)
                cdus_to_expand.append((mem_lvl, member))
            else:
                raise ValueError('Unexpected type for a CDU member')

    # TODO new features:
    # * nb_gaps: CDUs spans can be discontinuous
    # * gap_max_len: max len of a gap (in #EDUs)
    # * over_nb_turns: nb of turns this CDU (partly) spans over
    # * over_nb_tstars: nb of tstars this CDU (partly) spans over

    res = {
        'members': nb_members,
        'members_cdu': nb_cdus_imm,
        'spanned_cdus': nb_cdus_tot,
        'spanned_edus': nb_edus_tot,
        'depth': max_lvl,
    }

    return res
Ejemplo n.º 3
0
def _rel_endpoint(du):
    """Return (type label, EDUs sorted by span) for an endpoint, or None.

    None is returned when the endpoint is neither a CDU nor an EDU.
    """
    if is_cdu(du):
        return 'CDU', sorted(du.terminals(), key=lambda e: e.span)
    if is_edu(du):
        return 'EDU', [du]
    return None


def _print_edus(doc, edus):
    """Print one tab-indented debug line with the span and text of EDUs."""
    print('\t' + ', '.join('[{}] {}'.format(str(e.span), doc.text(e.span))
                           for e in edus))


def rel_feats(doc, ctx, anno, debug=False):
    """Get features for relations.

    Parameters
    ----------
    doc:
        Surrounding document; its `units`, `text()` and `origin` are used.
    ctx:
        Mapping from each EDU to a context object exposing `tstar`.
    anno:
        The relation instance; its `source`, `target` and `type` are used.
    debug: boolean, optional
        If True, print a description of problematic relations (endpoints
        sharing EDUs, interwoven endpoints).

    Returns
    -------
    res: dict
        Features for this relation:
        'src_type' and 'tgt_type' ('EDU' or 'CDU'),
        'direction' ('right' if source precedes target, 'left' if target
        precedes source, 'interwoven' otherwise),
        'edu_dist' (number of EDUs strictly between the endpoints, plus
        one; always 1 for interwoven endpoints),
        'tstar_dist' (0 if source and target share a turn-star, otherwise
        the number of skipped turn-stars plus one).
        The dict is empty if an endpoint is not a discourse unit, if an
        endpoint has no EDU, or if the endpoints share an EDU.
    """
    src = _rel_endpoint(anno.source)
    if src is None:
        # covered by stac-check ("non-DU endpoints")
        return {}
    tgt = _rel_endpoint(anno.target)
    if tgt is None:
        # covered by stac-check ("non-DU endpoints")
        return {}
    src_type, src_edus = src
    tgt_type, tgt_edus = tgt

    # an endpoint without any EDU cannot be positioned in the document;
    # bail out like the other malformed cases instead of failing on [-1]
    if not src_edus or not tgt_edus:
        return {}

    # get all EDUs from document, sorted by their span
    doc_edus = sorted((u for u in doc.units if is_edu(u)),
                      key=lambda u: u.span)
    # TODO doc_tstars = ...

    # get the index of the EDUs of src and tgt in the document
    src_idc = [doc_edus.index(e) for e in src_edus]
    tgt_idc = [doc_edus.index(e) for e in tgt_edus]
    src_idc_set = set(src_idc)
    tgt_idc_set = set(tgt_idc)

    # error case covered at least partially by stac-check, either
    # as "bizarre relation instance" or as "CDU punctures"
    if src_idc_set & tgt_idc_set:
        if debug:
            print('* {}: {} {}'.format(doc.origin, 'messed up', anno.type))
            _print_edus(doc, src_edus)
            _print_edus(doc, tgt_edus)
        return {}

    # src ... tgt
    if src_idc[-1] < tgt_idc[0]:
        direction = 'right'
        interv_edus = doc_edus[(src_idc[-1] + 1):tgt_idc[0]]
    # tgt ... src
    elif tgt_idc[-1] < src_idc[0]:
        direction = 'left'
        interv_edus = doc_edus[(tgt_idc[-1] + 1):src_idc[0]]
    # tgt and src are interwoven
    else:
        direction = 'interwoven'  # FIXME
        interv_edus = []
        if debug:
            # EDUs lying inside the joint extent of src and tgt but
            # belonging to neither; only needed for the debug output
            src_tgt_idc = src_idc_set | tgt_idc_set
            idx_min = min(src_tgt_idc)
            idx_max = max(src_tgt_idc)
            gap_edus = [
                e for i, e in enumerate(doc_edus)
                if idx_min < i < idx_max and i not in src_tgt_idc
            ]
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _print_edus(doc, src_edus)
            _print_edus(doc, tgt_edus)
            _print_edus(doc, gap_edus)
    edu_dist = len(interv_edus) + 1

    # turn-stars distance
    src_tstars = set(ctx[e].tstar for e in src_edus)
    tgt_tstars = set(ctx[e].tstar for e in tgt_edus)
    interv_tstars = set(ctx[e].tstar for e in interv_edus)
    # turn-stars from the interval that don't overlap with src nor tgt
    skipped_tstars = interv_tstars - src_tstars - tgt_tstars
    # we define:
    # * tstar_dist = 0  if (part of) src and tgt belong to the same tstar
    # * tstar_dist = len(skipped_tstars) + 1 otherwise
    tstar_dist = 0 if src_tstars & tgt_tstars else len(skipped_tstars) + 1

    res = {
        'src_type': src_type,
        'tgt_type': tgt_type,
        'direction': direction,
        'edu_dist': edu_dist,
        'tstar_dist': tstar_dist,
    }

    return res
Ejemplo n.º 4
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each supported annotation yields one row made of columns common to
    all annotations ('anno_id', 'doc', 'subdoc', 'stage', 'annotator',
    'type') plus the type-specific features; on a name clash the
    type-specific feature wins.

    Parameters
    ----------
    corpus: dict
        Mapping from file identifiers (exposing `doc`, `subdoc`, `stage`
        and `annotator`) to documents (exposing `annotations()`).

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'; kinds
        with no annotation are left out.
    """
    # annotation types that are deliberately left out of the DataFrames
    ignored_types = ['paragraph', 'Resource', 'Anaphora',
                     'Several_resources', 'Preference']
    rows = {
        anno_type: []
        for anno_type in ['edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel']
    }

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            row = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                row_kind = 'edu'
                feats = edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                row_kind = 'cdu'
                feats = cdu_feats(anno)
            elif is_relation_instance(anno):
                row_kind = 'rel'
                feats = rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                row_kind = 'dialogue'
                feats = dlg_feats(anno)
            elif is_turn(anno):
                row_kind = 'turn'
                feats = turn_feats(anno)
            elif is_turn_star(anno):
                row_kind = 'tstar'
                feats = tstar_feats(anno)
            elif anno.type in ignored_types:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge with update() rather than items() + items(): dict
            # views cannot be concatenated with + on Python 3
            row.update(feats)
            rows[row_kind].append(row)

    res = {
        anno_type: pd.DataFrame(data=row_list)
        for anno_type, row_list in rows.items() if row_list
    }

    return res
Ejemplo n.º 5
0
def _rel_endpoint(du):
    """Return (type label, EDUs sorted by span) for an endpoint, or None.

    None is returned when the endpoint is neither a CDU nor an EDU.
    """
    if is_cdu(du):
        return 'CDU', sorted(du.terminals(), key=lambda e: e.span)
    if is_edu(du):
        return 'EDU', [du]
    return None


def _print_edus(doc, edus):
    """Print one tab-indented debug line with the span and text of EDUs."""
    print('\t' + ', '.join('[{}] {}'.format(str(e.span), doc.text(e.span))
                           for e in edus))


def rel_feats(doc, ctx, anno, debug=False):
    """Get features for relations.

    Parameters
    ----------
    doc : GlozzDocument
        Surrounding document; its `units`, `text()` and `origin` are used.

    ctx :
        Mapping from each EDU to a context object exposing `tstar`.

    anno :
        The relation instance; its `source`, `target` and `type` are used.

    debug : boolean, optional
        If True, print a description of problematic relations (endpoints
        sharing EDUs, interwoven endpoints).

    Returns
    -------
    res : dict
        Features for this relation:
        'src_type' and 'tgt_type' ('EDU' or 'CDU'),
        'direction' ('right' if source precedes target, 'left' if target
        precedes source, 'interwoven' otherwise),
        'edu_dist' (number of EDUs strictly between the endpoints, plus
        one; always 1 for interwoven endpoints),
        'tstar_dist' (0 if source and target share a turn-star, otherwise
        the number of skipped turn-stars plus one).
        The dict is empty if an endpoint is not a discourse unit, if an
        endpoint has no EDU, or if the endpoints share an EDU.
    """
    src = _rel_endpoint(anno.source)
    if src is None:
        # covered by stac-check ("non-DU endpoints")
        return {}
    tgt = _rel_endpoint(anno.target)
    if tgt is None:
        # covered by stac-check ("non-DU endpoints")
        return {}
    src_type, src_edus = src
    tgt_type, tgt_edus = tgt

    # an endpoint without any EDU cannot be positioned in the document;
    # bail out like the other malformed cases instead of failing on [-1]
    if not src_edus or not tgt_edus:
        return {}

    # get all EDUs from document, sorted by their span
    doc_edus = sorted((u for u in doc.units if is_edu(u)),
                      key=lambda u: u.span)
    # TODO doc_tstars = ...

    # get the index of the EDUs of src and tgt in the document
    src_idc = [doc_edus.index(e) for e in src_edus]
    tgt_idc = [doc_edus.index(e) for e in tgt_edus]
    src_idc_set = set(src_idc)
    tgt_idc_set = set(tgt_idc)

    # error case covered at least partially by stac-check, either
    # as "bizarre relation instance" or as "CDU punctures"
    if src_idc_set & tgt_idc_set:
        if debug:
            print('* {}: {} {}'.format(doc.origin, 'messed up', anno.type))
            _print_edus(doc, src_edus)
            _print_edus(doc, tgt_edus)
        return {}

    # src ... tgt
    if src_idc[-1] < tgt_idc[0]:
        direction = 'right'
        interv_edus = doc_edus[(src_idc[-1] + 1):tgt_idc[0]]
    # tgt ... src
    elif tgt_idc[-1] < src_idc[0]:
        direction = 'left'
        interv_edus = doc_edus[(tgt_idc[-1] + 1):src_idc[0]]
    # tgt and src are interwoven
    else:
        direction = 'interwoven'  # FIXME
        interv_edus = []
        if debug:
            # EDUs lying inside the joint extent of src and tgt but
            # belonging to neither; only needed for the debug output
            src_tgt_idc = src_idc_set | tgt_idc_set
            idx_min = min(src_tgt_idc)
            idx_max = max(src_tgt_idc)
            gap_edus = [e for i, e in enumerate(doc_edus)
                        if idx_min < i < idx_max and
                        i not in src_tgt_idc]
            print('* {}: {} {}'.format(doc.origin, direction, anno.type))
            _print_edus(doc, src_edus)
            _print_edus(doc, tgt_edus)
            _print_edus(doc, gap_edus)
    edu_dist = len(interv_edus) + 1

    # turn-stars distance
    src_tstars = set(ctx[e].tstar for e in src_edus)
    tgt_tstars = set(ctx[e].tstar for e in tgt_edus)
    interv_tstars = set(ctx[e].tstar for e in interv_edus)
    # turn-stars from the interval that don't overlap with src nor tgt
    skipped_tstars = interv_tstars - src_tstars - tgt_tstars
    # we define:
    # * tstar_dist = 0  if (part of) src and tgt belong to the same tstar
    # * tstar_dist = len(skipped_tstars) + 1 otherwise
    tstar_dist = 0 if src_tstars & tgt_tstars else len(skipped_tstars) + 1

    res = {
        'src_type': src_type,
        'tgt_type': tgt_type,
        'direction': direction,
        'edu_dist': edu_dist,
        'tstar_dist': tstar_dist,
    }

    return res
Ejemplo n.º 6
0
def create_dfs(corpus):
    """Create pandas DataFrames for the corpus.

    Each supported annotation yields one row made of columns common to
    all annotations ('anno_id', 'doc', 'subdoc', 'stage', 'annotator',
    'type') plus the type-specific features; on a name clash the
    type-specific feature wins.

    Parameters
    ----------
    corpus: dict
        Mapping from file identifiers (exposing `doc`, `subdoc`, `stage`
        and `annotator`) to documents (exposing `annotations()`).

    Returns
    -------
    res: dict(string, DataFrame)
        A DataFrame for each kind of structure present in the corpus,
        among 'edu', 'turn', 'tstar', 'dialogue', 'cdu', 'rel'; kinds
        with no annotation are left out.
    """
    # annotation types that are deliberately left out of the DataFrames
    ignored_types = ['paragraph',
                     'Resource', 'Anaphora',
                     'Several_resources', 'Preference']
    rows = {anno_type: []
            for anno_type in ['edu', 'turn', 'tstar', 'dialogue',
                              'cdu', 'rel']}

    for file_id, doc in corpus.items():
        # common stuff: get general info (doc, subdoc, annotator)
        doc_name = file_id.doc
        subdoc_name = file_id.subdoc
        stage = file_id.stage
        annotator = file_id.annotator
        # context: yerk
        ctx = Context.for_edus(doc)
        # doc.annotations() := doc.units + doc.relations + doc.schemas
        for anno in doc.annotations():
            row = {
                'anno_id': anno.identifier(),
                'doc': doc_name,
                'subdoc': subdoc_name,
                'stage': stage,
                'annotator': annotator,
                'type': anno.type,  # ? maybe not
            }
            if is_edu(anno):
                row_kind = 'edu'
                feats = edu_feats(doc, ctx, anno)
            elif is_cdu(anno):
                row_kind = 'cdu'
                feats = cdu_feats(anno)
            elif is_relation_instance(anno):
                row_kind = 'rel'
                feats = rel_feats(doc, ctx, anno)
            elif is_dialogue(anno):
                row_kind = 'dialogue'
                feats = dlg_feats(anno)
            elif is_turn(anno):
                row_kind = 'turn'
                feats = turn_feats(anno)
            elif is_turn_star(anno):
                row_kind = 'tstar'
                feats = tstar_feats(anno)
            elif anno.type in ignored_types:
                # each paragraph (normally) corresponds to a Turn
                # so just ignore them ;
                # the situation is less clear-cut for 'Resource',
                # 'Anaphora', 'Several_resources'
                continue
            else:
                err_msg = 'Unsupported annotation: {}'.format(anno)
                # raise ValueError(err_msg)
                print('W: {}'.format(err_msg))
                continue
            # merge with update() rather than items() + items(): dict
            # views cannot be concatenated with + on Python 3
            row.update(feats)
            rows[row_kind].append(row)

    res = {anno_type: pd.DataFrame(data=row_list)
           for anno_type, row_list in rows.items()
           if row_list}

    return res