Example #1
def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """
    Given a document and edu ids, create a relation
    instance betweenthem

    """
    span = RelSpan(local_id_parent, local_id_child)
    label = label
    annotator = 'stacparser'
    date = tstamp.next()
    rel_id = stac_glozz.anno_id_from_tuple((annotator, date))
    features = {}
    metadata = {}
    metadata['author'] = annotator
    metadata['creation-date'] = str(date)
    return Relation(rel_id=rel_id,
                    span=span,
                    rtype=label,
                    features=features,
                    metadata=metadata)
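
A minimal usage sketch (not part of the original source): it assumes the module-level imports used by mk_relation (RelSpan, Relation, stac_glozz) and substitutes a tiny illustrative timestamp object for the real timestamp cache; the EDU ids are made up.

class _FakeTimestamp(object):
    """Illustrative stand-in for the timestamp cache expected by mk_relation."""
    def __init__(self, start=0):
        self._counter = start

    def next(self):
        self._counter += 1
        return self._counter

rel = mk_relation(_FakeTimestamp(), 'stac_edu_12', 'stac_edu_13', 'Elaboration')
# rel.type is 'Elaboration'; rel.metadata records the 'stacparser' author
# and a creation date drawn from the timestamp object.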
Example #2
File: graph.py Project: tjane/educe
    def strip_cdus(self, sloppy=False, mode='head'):
        """ Delete all CDUs in this graph.
            Links involving a CDU will point to/from the elements
            of this CDU.
            Non-head modes may add new edges to the graph.

            Parameters
            ----------
            sloppy : boolean, default=False
                See `cdu_head`.

            mode : string, default='head'
                Strategy for replacing edges involving CDUs.
                `head` will relocate the edge on the recursive head of the
                CDU (see `recursive_cdu_heads`).
                `broadcast` will distribute the edge over all EDUs belonging
                to the CDU. A copy of the edge will be created for each of
                them. If the edge's source and target are both distributed,
                a new copy will be created for each combination of EDUs.
                `custom` (or any other string) will distribute or relocate on
                the head depending on the relation label.
        """

        # Set of labels for which the source node should be distributed
        LEFT_DIST = frozenset(
            ('Acknowledgement', 'Explanation', 'Comment', 'Continuation',
             'Narration', 'Contrast', 'Parallel', 'Background'))

        # Set of labels for which the target node should be distributed
        RIGHT_DIST = frozenset(
            ('Result', 'Continuation', 'Narration', 'Comment', 'Contrast',
             'Parallel', 'Background', 'Elaboration'))

        # Warning: heads.keys() are hyperedges
        heads = self.recursive_cdu_heads(sloppy)

        def distrib_candidates(links, label):
            """ Return a pair of list of nodes to be attached,
                depending on the edge label.
            """
            src_node, tgt_node = links

            def candidates(node, distributive):
                if not self.is_cdu(node):
                    return [node]
                if (mode != 'head'
                        and (mode == 'broadcast' or label in distributive)):
                    # Either distribute over all components...
                    # (always do in broadcast mode)
                    nodes = edu_components(node)
                else:
                    # ... or link to the CDU recursive head only
                    # (always do in head mode)
                    nodes = [heads[self.mirror(node)]]
                return nodes

            return (candidates(src_node, LEFT_DIST),
                    candidates(tgt_node, RIGHT_DIST))

        def edu_components(node):
            """ Returns a list of all EDUs contained by a node. """
            if not self.is_cdu(node):
                return [node]
            return [
                snode for snode in self.cdu_members(node, deep=True)
                if self.is_edu(snode)
            ]

        # Convert all edges in order
        for old_edge in self.relations():
            links = self.links(old_edge)
            # Verify the edge is well-formed
            assert len(links) == 2
            if not any(self.is_cdu(l) for l in links):
                # No CDU to strip: skip
                continue

            old_attrs = self.edge_attributes(old_edge)
            old_anno = self.annotation(old_edge)
            src_nodes, tgt_nodes = distrib_candidates(links, old_anno.type)
            # Remove the old edge
            self.del_edge(old_edge)
            self.doc.relations.remove(old_anno)
            # Build a new edge for all new combinations
            for i, (n_src, n_tgt) in enumerate(
                    itertools.product(src_nodes, tgt_nodes)):
                if n_src == n_tgt:
                    print("WARNING: something is pointing to its own CDU : " +
                          str(n_src))
                    continue
                # First, build a new Relation for the annotation layer
                n_src_anno = self.annotation(n_src)
                n_tgt_anno = self.annotation(n_tgt)
                new_anno = Relation(
                    '{0}_{1}'.format(old_anno._anno_id, i),
                    RelSpan(n_src_anno._anno_id, n_tgt_anno._anno_id),
                    old_anno.type, dict())
                new_anno.source = n_src_anno
                new_anno.target = n_tgt_anno
                self.doc.relations.append(new_anno)
                # Second, build a new graph edge
                new_edge = '{0}_{1}'.format(old_edge, i)
                new_attrs = dict(old_attrs)
                new_attrs['annotation'] = new_anno
                self.add_edge(new_edge)
                self.add_edge_attributes(new_edge, new_attrs.items())
                self.link(n_src, new_edge)
                self.link(n_tgt, new_edge)

        # Now all the CDUs are edge-orphaned, remove them from the graph
        for e_cdu in self.cdus():
            self.del_node(self.mirror(e_cdu))
            self.del_edge(e_cdu)
        # Same for annotation-level CDUs
        self.doc.schemas = [s for s in self.doc.schemas if not stac.is_cdu(s)]
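
A hedged usage sketch (not from the original file): it assumes a corpus mapping document keys to annotated STAC documents and the usual educe hypergraph construction via Graph.from_doc; the variable names are illustrative.

# Assumption: `corpus` and `doc_key` come from an educe STAC corpus reader,
# and Graph.from_doc builds the discourse hypergraph for that document.
g = Graph.from_doc(corpus, doc_key)
g.strip_cdus(sloppy=True, mode='head')  # relocate CDU edges onto recursive heads
assert not g.cdus()                     # no CDU hyperedges remain
# With mode='broadcast', each edge touching a CDU is instead copied onto
# every EDU of that CDU, one copy per source/target combination.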
Example #3
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Unannotated document, the starting point of the annotation
        process
    anno_doc : GlozzDocument
        Document to filter
    verbose : int
        Verbosity level

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.
    """
    anno_map = dict()
    cautious_map = dict()
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [
            x for x in unanno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [
            x for x in anno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        dup_items = [(elt_a, elt_b) for elt_a, elt_b in itertools.combinations(
            sorted(a_edus, key=lambda x:
                   (x.local_id() in u_ids, x.local_id())), 2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(), eps=1)
                         and elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        pw_conflicts = [(elt_a, elt_b)
                        for elt_a, elt_b in itertools.combinations(
                            sorted(a_edus,
                                   key=lambda x:
                                   (x.type in DIALOGUE_ACTS, x.local_id())), 2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(
            anno_map.get(x, x) for rel in anno_doc.relations
            for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)

            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b)
                 or elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid,
                                     sch_units,
                                     sch_relas,
                                     sch_schms,
                                     sch_stype,
                                     sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n  {}'.format(
                            new_cdu, elt_b,
                            '\n  '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n  {}'.format(
                            elt_b, '\n  '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(
                        err_msg.format(', '.join(str(y) for y in sorted_a),
                                       elt_b))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b
                            and cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids
                 and approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in set(edu_splits.keys())]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join('  {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                                for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    anno_map_id = {x.local_id(): y.local_id() for x, y in anno_map.items()}
    cautious_map_id = {
        x.local_id(): y.local_id()
        for x, y in cautious_map.items()
    }
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [
        x for x in anno_doc.units
        if (not is_edu(x) or x.local_id() not in anno_map_id)
    ]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    objects = {
        x.local_id(): x
        for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                 anno_doc.schemas)
    }
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type

    # * rewrite the support of schemas
    for sch in anno_doc.schemas:
        # sch.id = sch.id
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.type = sch.type
        # sch.features = sch.features
        # sch.metadata = sch.metadata
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
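
A hedged usage sketch (not part of the original source): `unanno_doc` and `anno_doc` stand for the unannotated and annotated Glozz versions of the same document, loaded beforehand with whatever reader the surrounding project uses.

anno_doc = infer_resegmentation(unanno_doc, anno_doc, verbose=1)
# Merged or split segments are now mapped back onto their original EDUs,
# the inferred CDUs have been appended to anno_doc.schemas, and relation
# and schema supports point at the surviving annotations.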
Example #4
def read_node(node, context=None):
    def get_one(name, default, ctx=None):
        f = lambda n: read_node(n, ctx)
        return on_single_element(node, default, f, name)

    def get_all(name):
        return list(map(read_node, node.findall(name)))

    if node.tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif node.tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif node.tag == 'feature':
        attr = node.attrib['name']
        val = node.text.strip() if node.text else None
        return (attr, val)

    # TODO throw exception if we see more than one instance of a key
    elif node.tag == 'featureSet':
        return dict(get_all('feature'))

    elif node.tag == 'metadata' and context == 'annotations':
        return node.attrib['corpusHashcode']

    elif node.tag == 'metadata':
        return dict([(t.tag, t.text.strip()) for t in node])

    elif node.tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif node.tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException("Was expecting exactly 2 terms, but got %d" %
                                 len(terms))
        else:
            return RelSpan(terms[0], terms[1])

    elif node.tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif node.tag == 'relation':
        rel_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif node.tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id,
                      units,
                      rels,
                      schemas,
                      anno_type,
                      fs,
                      metadata=metadata)

    elif node.tag == 'singlePosition':
        return int(node.attrib['index'])

    elif node.tag == 'start' or node.tag == 'end':
        return get_one('singlePosition', None)

    elif node.tag in [
            'term', 'embedded-unit', 'embedded-relation', 'embedded-schema'
    ]:
        return node.attrib['id']

    elif node.tag == 'type':
        return node.text.strip()

    elif node.tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)
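
A hedged usage sketch (not in the original source): read_node is meant to be applied to the root of a Glozz XML document, whose top-level tag is 'annotations'; the file name below is purely illustrative.

import xml.etree.ElementTree as ET

tree = ET.parse('example_discourse.aa')
hashcode, units, relations, schemas = read_node(tree.getroot())
# `units`, `relations` and `schemas` are lists of Unit, Relation and
# Schema annotations; `hashcode` is the corpus hashcode, or None if absent.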
Example #5
    def __init__(self, id, start, end):
        Relation.__init__(self, id, RelSpan(start, end), '', {})
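
A hedged sketch of how such a constructor is used; the enclosing class is not shown above, so FakeRelInst below is a placeholder name. The class simply wraps two annotation ids in a RelSpan with an empty relation type and no features.

class FakeRelInst(Relation):
    """Placeholder name for the Relation subclass whose __init__ is shown above."""
    def __init__(self, id, start, end):
        Relation.__init__(self, id, RelSpan(start, end), '', {})

rel = FakeRelInst('rel_0', 'edu_1', 'edu_2')  # rel.span links the two ids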
Example #6
    def _mk_doc(self):
        """ Create an educe.annotation.Document from this graph """
        def start(name):
            return ord(name) - ord('a')

        def glozz_id(name):
            return 'du_' + str(start(name))

        def is_edu(name):
            return name not in self.cdus

        anno_units = list()
        anno_cdus = list()
        anno_rels = list()

        for du_name, speaker_set in self.speakers.items():
            # EDU loop
            if not is_edu(du_name):
                continue

            du_start, du_glozz_id = start(du_name), glozz_id(du_name)
            x_edu = Unit(du_glozz_id, Span(du_start, du_start + 1), 'Segment',
                         dict())
            speaker = list(speaker_set)[0]
            turn = Unit('t' + du_glozz_id, Span(du_start, du_start + 1),
                        'Turn', {
                            'Identifier': du_start,
                            'Emitter': speaker
                        })

            self.anno_map[du_name] = x_edu
            anno_units.append(x_edu)
            anno_units.append(turn)

        for du_name, sub_names in self.cdus.items():
            x_cdu = Schema(
                glozz_id(du_name),
                set(glozz_id(x) for x in sub_names if is_edu(x)), set(),
                set(glozz_id(x) for x in sub_names if not is_edu(x)),
                'Complex_discourse_unit', dict())
            self.anno_map[du_name] = x_cdu
            anno_cdus.append(x_cdu)

        rel_count = 0
        for src_name in self.down:
            for tgt_name, rel_tag in self.down[src_name]:
                rel_glozz_id = 'rel_' + str(rel_count)
                rel_count += 1
                if rel_tag == 'S':
                    rel_name = 'Q-Elab'
                elif rel_tag == 'C':
                    rel_name = 'Contrast'
                else:
                    raise ValueError('Unknown tag {0}'.format(rel_tag))

                rel = Relation(rel_glozz_id,
                               RelSpan(glozz_id(src_name), glozz_id(tgt_name)),
                               rel_name, dict())
                self.anno_map[(src_name, tgt_name)] = rel
                anno_rels.append(rel)

        dialogue = Unit(
            'dialogue_0',
            Span(0, max(u.text_span().char_end for u in anno_units)),
            'Dialogue', {})
        anno_units.append(dialogue)

        doc = Document(anno_units, anno_rels, anno_cdus,
                       string.ascii_lowercase)
        return doc
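
A hedged usage note (not from the original file): _mk_doc is a private helper of a small fake-graph builder class, so it would normally be reached through an instance of that class; the name `builder` is illustrative.

doc = builder._mk_doc()
edus = [u for u in doc.units if u.type == 'Segment']  # one EDU per lowercase DU name
cdus = doc.schemas                                    # Complex_discourse_unit schemas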