Example #1
def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """
    Given a document and edu ids, create a relation
    instance betweenthem

    """
    span = RelSpan(local_id_parent, local_id_child)
    label = label
    annotator = 'stacparser'
    date = tstamp.next()
    rel_id = stac_glozz.anno_id_from_tuple((annotator, date))
    features = {}
    metadata = {}
    metadata['author'] = annotator
    metadata['creation-date'] = str(date)
    return Relation(rel_id=rel_id,
                    span=span,
                    rtype=label,
                    features=features,
                    metadata=metadata)
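
A minimal usage sketch (not part of the original source): it assumes the module-level imports used by mk_relation (RelSpan, Relation, stac_glozz) and substitutes a tiny illustrative timestamp object for the real timestamp cache; the EDU ids are made up.

class _FakeTimestamp(object):
    """Illustrative stand-in for the timestamp cache expected by mk_relation."""
    def __init__(self, start=0):
        self._counter = start

    def next(self):
        self._counter += 1
        return self._counter

rel = mk_relation(_FakeTimestamp(), 'stac_edu_12', 'stac_edu_13', 'Elaboration')
# rel.type is 'Elaboration'; rel.metadata records the 'stacparser' author
# and a creation date drawn from the timestamp object.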
Example #2
File: graph.py Project: tjane/educe
    def strip_cdus(self, sloppy=False, mode='head'):
        """ Delete all CDUs in this graph.
            Links involving a CDU will point to/from the elements
            of this CDU.
            Non-head modes may add new edges to the graph.

            Parameters
            ----------
            sloppy : boolean, default=False
                See `cdu_head`.

            mode : string, default='head'
                Strategy for replacing edges involving CDUs.
                `head` will relocate the edge on the recursive head of the
                CDU (see `recursive_cdu_heads`).
                `broadcast` will distribute the edge over all EDUs belonging
                to the CDU. A copy of the edge will be created for each of
                them. If the edge's source and target are both distributed,
                a new copy will be created for each combination of EDUs.
                `custom` (or any other string) will distribute or relocate on
                the head depending on the relation label.
        """

        # Set of labels for which the source node should be distributed
        LEFT_DIST = frozenset(
            ('Acknowledgement', 'Explanation', 'Comment', 'Continuation',
             'Narration', 'Contrast', 'Parallel', 'Background'))

        # Set of labels for which the target node should be distributed
        RIGHT_DIST = frozenset(
            ('Result', 'Continuation', 'Narration', 'Comment', 'Contrast',
             'Parallel', 'Background', 'Elaboration'))

        # Warning: heads.keys() are hyperedges
        heads = self.recursive_cdu_heads(sloppy)

        def distrib_candidates(links, label):
            """ Return a pair of list of nodes to be attached,
                depending on the edge label.
            """
            src_node, tgt_node = links

            def candidates(node, distributive):
                if not self.is_cdu(node):
                    return [node]
                if (mode != 'head'
                        and (mode == 'broadcast' or label in distributive)):
                    # Either distribute over all components...
                    # (always do in broadcast mode)
                    nodes = edu_components(node)
                else:
                    # ... or link to the CDU recursive head only
                    # (always do in head mode)
                    nodes = [heads[self.mirror(node)]]
                return nodes

            return (candidates(src_node, LEFT_DIST),
                    candidates(tgt_node, RIGHT_DIST))

        def edu_components(node):
            """ Returns a list of all EDUs contained by a node. """
            if not self.is_cdu(node):
                return [node]
            return [
                snode for snode in self.cdu_members(node, deep=True)
                if self.is_edu(snode)
            ]

        # Convert all edges in order
        for old_edge in self.relations():
            links = self.links(old_edge)
            # Verify the edge is well-formed
            assert len(links) == 2
            if not any(self.is_cdu(l) for l in links):
                # No CDU to strip: skip
                continue

            old_attrs = self.edge_attributes(old_edge)
            old_anno = self.annotation(old_edge)
            src_nodes, tgt_nodes = distrib_candidates(links, old_anno.type)
            # Remove the old edge
            self.del_edge(old_edge)
            self.doc.relations.remove(old_anno)
            # Build a new edge for all new combinations
            for i, (n_src, n_tgt) in enumerate(
                    itertools.product(src_nodes, tgt_nodes)):
                if n_src == n_tgt:
                    print("WARNING: something is pointing to its own CDU : " +
                          str(n_src))
                    continue
                # First, build a new Relation for the annotation layer
                n_src_anno = self.annotation(n_src)
                n_tgt_anno = self.annotation(n_tgt)
                new_anno = Relation(
                    '{0}_{1}'.format(old_anno._anno_id, i),
                    RelSpan(n_src_anno._anno_id, n_tgt_anno._anno_id),
                    old_anno.type, dict())
                new_anno.source = n_src_anno
                new_anno.target = n_tgt_anno
                self.doc.relations.append(new_anno)
                # Second, build a new graph edge
                new_edge = '{0}_{1}'.format(old_edge, i)
                new_attrs = dict(old_attrs)
                new_attrs['annotation'] = new_anno
                self.add_edge(new_edge)
                self.add_edge_attributes(new_edge, new_attrs.items())
                self.link(n_src, new_edge)
                self.link(n_tgt, new_edge)

        # Now all the CDUs are edge-orphaned, remove them from the graph
        for e_cdu in self.cdus():
            self.del_node(self.mirror(e_cdu))
            self.del_edge(e_cdu)
        # Same for annotation-level CDUs
        self.doc.schemas = [s for s in self.doc.schemas if not stac.is_cdu(s)]
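
A hedged usage sketch (not from the original file): it assumes a corpus mapping document keys to annotated STAC documents and the usual educe hypergraph construction via Graph.from_doc; the variable names are illustrative.

# Assumption: `corpus` and `doc_key` come from an educe STAC corpus reader,
# and Graph.from_doc builds the discourse hypergraph for that document.
g = Graph.from_doc(corpus, doc_key)
g.strip_cdus(sloppy=True, mode='head')  # relocate CDU edges onto recursive heads
assert not g.cdus()                     # no CDU hyperedges remain
# With mode='broadcast', each edge touching a CDU is instead copied onto
# every EDU of that CDU, one copy per source/target combination.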
Example #3
def infer_resegmentation(unanno_doc, anno_doc, verbose=0):
    """Infer resegmentation of EDUs.

    Parameters
    ----------
    unanno_doc : GlozzDocument
        Unannotated document, the starting point of the annotation
        process
    anno_doc : GlozzDocument
        Document to filter
    verbose : int
        Verbosity level

    Returns
    -------
    anno_doc : GlozzDocument
        Filtered document, where the support of relations and schemas
        has been rewritten.
    """
    anno_map = dict()
    cautious_map = dict()
    new_cdus = []

    turns = [x for x in unanno_doc.units if is_turn(x)]
    for turn in turns:
        # `unannotated` was the starting point for the annotation process
        u_edus = [
            x for x in unanno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        u_ids = set(x.local_id() for x in u_edus)

        # `annotated` is the result of the annotation process
        # find conflicts, as pair-wise overlaps between annotations
        # from `annotated`
        a_edus = [
            x for x in anno_doc.units
            if is_edu(x) and turn.span.encloses(x.span)
        ]
        # 1. map new segments to their original equivalent, backporting
        # dialogue act annotation
        dup_items = [(elt_a, elt_b) for elt_a, elt_b in itertools.combinations(
            sorted(a_edus, key=lambda x:
                   (x.local_id() in u_ids, x.local_id())), 2)
                     if (span_eq(elt_a.text_span(), elt_b.text_span(), eps=1)
                         and elt_b.local_id() in u_ids)]
        anno_map.update(dup_items)
        # backport dialogue act annotation to original segment
        for elt_a, elt_b in dup_items:
            if elt_a.type in DIALOGUE_ACTS:
                # backport annotation to original segment elt_b
                elt_b.type = elt_a.type
                elt_b.features = elt_a.features
                for k in ['lastModifier', 'lastModificationDate']:
                    elt_b.metadata[k] = elt_a.metadata[k]
        # (locally) update the list of EDUs in anno_doc, so conflicts
        # are not computed on trivially mapped segments
        a_edus = [x for x in a_edus if x not in anno_map]

        # 2. list conflicts, then whitelist them progressively
        # NB: we sort EDUs in reverse using their local_ids, so that
        # conflict pairs are of the form (stac*, skar*) ; this is
        # admittedly a cheap, ad-hoc, trick to simulate an ordering
        # such that annotations already present in unannotated < annotations
        # introduced in annotated
        pw_conflicts = [(elt_a, elt_b)
                        for elt_a, elt_b in itertools.combinations(
                            sorted(a_edus,
                                   key=lambda x:
                                   (x.type in DIALOGUE_ACTS, x.local_id())), 2)
                        if elt_a.overlaps(elt_b)]

        # * Two cases are very close: EDU merges, and CDUs
        rels_support = set(
            anno_map.get(x, x) for rel in anno_doc.relations
            for x in [rel.source, rel.target])
        edu_merges = []  # list of (list of elt_a, elt_b)
        cdu_guess = []  # list of (list of elt_a, elt_b)
        for elt_b, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[1]):
            sorted_a = sorted((y[0] for y in pairs),
                              key=lambda z: z.text_span())
            span_seq_a = Span(sorted_a[0].text_span().char_start,
                              sorted_a[-1].text_span().char_end)

            # we approximately check that the sequence of EDUs elts_a
            # fully covers the span of elt_b, from start to end, with
            # no overlap or that the whole sequence is enclosed in
            # the annotation from `annotated` (this happens when some but
            # not all of the merged EDUs have been deleted)
            if ((approximate_cover(sorted_a, elt_b)
                 or elt_b.text_span().encloses(span_seq_a))):
                # then, it is either an EDU merge or a CDU ;
                # if any element of the sequence supports a relation,
                # we take this as indicating a CDU
                if any(y in rels_support for y in sorted_a):
                    # broadcast type, features, metadata to the segments
                    for elt_a in sorted_a:
                        elt_a.type = _SPLIT_PREFIX + elt_b.type
                        elt_a.features = elt_b.features
                        for k in ['lastModifier', 'lastModificationDate']:
                            elt_a.metadata[k] = elt_b.metadata[k]
                    # transform elt_b into a CDU
                    sch_relid = elt_b.local_id()
                    sch_units = set(y.local_id() for y in sorted_a)
                    sch_relas = set()
                    sch_schms = set()
                    sch_stype = 'Complex_discourse_unit'
                    sch_feats = {}
                    sch_metad = elt_b.metadata
                    new_cdu = Schema(sch_relid,
                                     sch_units,
                                     sch_relas,
                                     sch_schms,
                                     sch_stype,
                                     sch_feats,
                                     metadata=sch_metad)
                    new_cdus.append(new_cdu)
                    # map former (bad) segment to its proper CDU version
                    anno_map[elt_b] = new_cdu
                    cdu_guess.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('CDU {}\nwas {}, from\n  {}'.format(
                            new_cdu, elt_b,
                            '\n  '.join(str(z) for z in sorted_a)))
                elif all(elt_a.local_id() in u_ids for elt_a in sorted_a):
                    edu_merges.append((sorted_a, elt_b))
                    if verbose > 1:
                        print('EDU merge {} from\n  {}'.format(
                            elt_b, '\n  '.join(str(z) for z in sorted_a)))
                else:
                    err_msg = 'Weird approximate cover:\n{}\n{}'
                    raise ValueError(
                        err_msg.format(', '.join(str(y) for y in sorted_a),
                                       elt_b))
        # map each of the segments to its CDU, so these pairs can be
        # removed from the list of conflicts later
        cdu_map = dict()
        for elts_a, elt_b in cdu_guess:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            cdu_map.update(map_items)
            cautious_map.update(map_items)
        # map each of the merged segments to the new, bigger EDU + mark
        for elts_a, elt_b in edu_merges:
            map_items = [(elt_a, elt_b) for elt_a in elts_a]
            anno_map.update(map_items)
            cautious_map.update(map_items)
        # update list of conflicts: remove pairs that contain a segment
        # and its merged EDU, or a segment and its enclosing CDU
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if (anno_map.get(elt_a, elt_a) != elt_b
                            and cdu_map.get(elt_a, elt_a) != elt_b)]

        # * EDU splits
        edu_splits = dict()  # elt_a -> list of elt_b
        for elt_a, pairs in itertools.groupby(pw_conflicts,
                                              key=lambda x: x[0]):
            sorted_b = sorted((y[1] for y in pairs), key=lambda z: z.span)
            # we approximately check that the sequence of new EDUs
            # fully covers the span of elt_a, from start to end, with
            # no overlap
            if ((elt_a.local_id() in u_ids
                 and approximate_cover(sorted_b, elt_a))):
                edu_splits[elt_a] = sorted_b
        pw_conflicts = [(elt_a, elt_b) for elt_a, elt_b in pw_conflicts
                        if elt_a not in set(edu_splits.keys())]
        # map the split segment to the first of the resulting EDUs + mark
        for elt_a, elts_b in edu_splits.items():
            map_items = [(elt_a, elts_b[0])]
            anno_map.update(map_items)
            cautious_map.update(map_items)

        if verbose:
            if pw_conflicts:
                print('Conflict:')
                print('\n'.join('  {}\t<>\t{}'.format(str(elt_a), str(elt_b))
                                for elt_a, elt_b in pw_conflicts))

    # update anno_doc using the computed mapping
    anno_map_id = {x.local_id(): y.local_id() for x, y in anno_map.items()}
    cautious_map_id = {
        x.local_id(): y.local_id()
        for x, y in cautious_map.items()
    }
    # * forget mapped units and segments rewritten as CDUs
    anno_doc.units = [
        x for x in anno_doc.units
        if (not is_edu(x) or x.local_id() not in anno_map_id)
    ]
    # * add the new CDUs to the list of schemas
    anno_doc.schemas.extend(new_cdus)

    # rewrite the support of relations and schemas
    objects = {
        x.local_id(): x
        for x in itertools.chain(anno_doc.units, anno_doc.relations,
                                 anno_doc.schemas)
    }
    # * rewrite the support of relations
    for rel in anno_doc.relations:
        src = anno_map_id.get(rel.span.t1, rel.span.t1)
        tgt = anno_map_id.get(rel.span.t2, rel.span.t2)
        # update relation span, source, target
        rel.span = RelSpan(src, tgt)
        rel.source = objects[src]
        rel.target = objects[tgt]
        # if necessary, mark relation type for review
        if src in cautious_map_id or tgt in cautious_map_id:
            rel.type = _SPLIT_PREFIX + rel.type

    # * rewrite the support of schemas
    for sch in anno_doc.schemas:
        # sch.id = sch.id
        sch.units = set(anno_map_id.get(x, x) for x in sch.units)
        sch.relations = set(anno_map_id.get(x, x) for x in sch.relations)
        sch.schemas = set(anno_map_id.get(x, x) for x in sch.schemas)
        sch.type = sch.type
        # sch.features = sch.features
        # sch.metadata = sch.metadata
        sch.span = sch.units | sch.relations | sch.schemas
        sch.fleshout(objects)

    return anno_doc
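
A hedged usage sketch (not part of the original source): `unanno_doc` and `anno_doc` stand for the unannotated and annotated Glozz versions of the same document, loaded beforehand with whatever reader the surrounding project uses.

anno_doc = infer_resegmentation(unanno_doc, anno_doc, verbose=1)
# Merged or split segments are now mapped back onto their original EDUs,
# the inferred CDUs have been appended to anno_doc.schemas, and relation
# and schema supports point at the surviving annotations.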
Example #4
def read_node(node, context=None):
    def get_one(name, default, ctx=None):
        f = lambda n: read_node(n, ctx)
        return on_single_element(node, default, f, name)

    def get_all(name):
        return list(map(read_node, node.findall(name)))

    if node.tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif node.tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif node.tag == 'feature':
        attr = node.attrib['name']
        val = node.text.strip() if node.text else None
        return (attr, val)

    # TODO throw exception if we see more than one instance of a key
    elif node.tag == 'featureSet':
        return dict(get_all('feature'))

    elif node.tag == 'metadata' and context == 'annotations':
        return node.attrib['corpusHashcode']

    elif node.tag == 'metadata':
        return dict([(t.tag, t.text.strip()) for t in node])

    elif node.tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif node.tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException("Was expecting exactly 2 terms, but got %d" %
                                 len(terms))
        else:
            return RelSpan(terms[0], terms[1])

    elif node.tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif node.tag == 'relation':
        rel_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif node.tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id,
                      units,
                      rels,
                      schemas,
                      anno_type,
                      fs,
                      metadata=metadata)

    elif node.tag == 'singlePosition':
        return int(node.attrib['index'])

    elif node.tag == 'start' or node.tag == 'end':
        return get_one('singlePosition', None)

    elif node.tag in [
            'term', 'embedded-unit', 'embedded-relation', 'embedded-schema'
    ]:
        return node.attrib['id']

    elif node.tag == 'type':
        return node.text.strip()

    elif node.tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)
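
A hedged usage sketch (not in the original source): read_node is meant to be applied to the root of a Glozz XML document, whose top-level tag is 'annotations'; the file name below is purely illustrative.

import xml.etree.ElementTree as ET

tree = ET.parse('example_discourse.aa')
hashcode, units, relations, schemas = read_node(tree.getroot())
# `units`, `relations` and `schemas` are lists of Unit, Relation and
# Schema annotations; `hashcode` is the corpus hashcode, or None if absent.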
Example #5
    def __init__(self, id, start, end):
        Relation.__init__(self, id, RelSpan(start, end), '', {})
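
A hedged sketch of how such a constructor is used; the enclosing class is not shown above, so FakeRelInst below is a placeholder name. The class simply wraps two annotation ids in a RelSpan with an empty relation type and no features.

class FakeRelInst(Relation):
    """Placeholder name for the Relation subclass whose __init__ is shown above."""
    def __init__(self, id, start, end):
        Relation.__init__(self, id, RelSpan(start, end), '', {})

rel = FakeRelInst('rel_0', 'edu_1', 'edu_2')  # rel.span links the two ids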
Example #6
    def _mk_doc(self):
        """ Create an educe.annotation.Document from this graph """
        def start(name):
            return ord(name) - ord('a')

        def glozz_id(name):
            return 'du_' + str(start(name))

        def is_edu(name):
            return name not in self.cdus

        anno_units = list()
        anno_cdus = list()
        anno_rels = list()

        for du_name, speaker_set in self.speakers.items():
            # EDU loop
            if not is_edu(du_name):
                continue

            du_start, du_glozz_id = start(du_name), glozz_id(du_name)
            x_edu = Unit(du_glozz_id, Span(du_start, du_start + 1), 'Segment',
                         dict())
            speaker = list(speaker_set)[0]
            turn = Unit('t' + du_glozz_id, Span(du_start, du_start + 1),
                        'Turn', {
                            'Identifier': du_start,
                            'Emitter': speaker
                        })

            self.anno_map[du_name] = x_edu
            anno_units.append(x_edu)
            anno_units.append(turn)

        for du_name, sub_names in self.cdus.items():
            x_cdu = Schema(
                glozz_id(du_name),
                set(glozz_id(x) for x in sub_names if is_edu(x)), set(),
                set(glozz_id(x) for x in sub_names if not is_edu(x)),
                'Complex_discourse_unit', dict())
            self.anno_map[du_name] = x_cdu
            anno_cdus.append(x_cdu)

        rel_count = 0
        for src_name in self.down:
            for tgt_name, rel_tag in self.down[src_name]:
                rel_glozz_id = 'rel_' + str(rel_count)
                rel_count += 1
                if rel_tag == 'S':
                    rel_name = 'Q-Elab'
                elif rel_tag == 'C':
                    rel_name = 'Contrast'
                else:
                    raise ValueError('Unknown tag {0}'.format(rel_tag))

                rel = Relation(rel_glozz_id,
                               RelSpan(glozz_id(src_name), glozz_id(tgt_name)),
                               rel_name, dict())
                self.anno_map[(src_name, tgt_name)] = rel
                anno_rels.append(rel)

        dialogue = Unit(
            'dialogue_0',
            Span(0, max(u.text_span().char_end for u in anno_units)),
            'Dialogue', {})
        anno_units.append(dialogue)

        doc = Document(anno_units, anno_rels, anno_cdus,
                       string.ascii_lowercase)
        return doc
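
A hedged usage note (not from the original file): _mk_doc is a private helper of a small fake-graph builder class, so it would normally be reached through an instance of that class; the name `builder` is illustrative.

doc = builder._mk_doc()
edus = [u for u in doc.units if u.type == 'Segment']  # one EDU per lowercase DU name
cdus = doc.schemas                                    # Complex_discourse_unit schemas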