Exemple #1
0
def mk_relation(tstamp, local_id_parent, local_id_child, label):
    """
    Build a relation instance going from a parent unit to a child unit.

    The relation is attributed to the 'stacparser' annotator.  The next
    value obtained from `tstamp` is used both as its creation date and,
    paired with the annotator name, as the input from which its
    identifier is derived.

    Parameters
    ----------
    tstamp : object providing a `next()` method
        Source of creation dates.
    local_id_parent, local_id_child
        Identifiers of the two units the relation connects.
    label
        Relation type, stored as the `rtype` of the result.

    Returns
    -------
    Relation
        A new relation with empty features.
    """
    author = 'stacparser'
    endpoints = RelSpan(local_id_parent, local_id_child)
    created = tstamp.next()
    return Relation(
        rel_id=stac_glozz.anno_id_from_tuple((author, created)),
        span=endpoints,
        rtype=label,
        features={},
        metadata={
            'author': author,
            'creation-date': str(created),
        })
Exemple #2
0
    def strip_cdus(self, sloppy=False, mode='head'):
        """ Delete all CDUs in this graph.

            Every relation with a CDU as one of its endpoints is replaced
            by one or more relations attached to elements of that CDU.
            Non-head modes may add new edges to the graph.

            Parameters
            ----------
            sloppy: boolean, default=False
                See `cdu_head`.

            mode: string, default='head'
                Strategy for replacing edges involving CDUs.
                `head` will relocate the edge on the recursive head of the
                CDU (see `recursive_cdu_heads`).
                `broadcast` will distribute the edge over all EDUs belonging
                to the CDU. A copy of the edge will be created for each of
                them. If the edge's source and target are both distributed,
                a new copy will be created for each combination of EDUs.
                `custom` (or any other string) will distribute or relocate on
                the head depending on the relation label.
        """
        # Labels whose source endpoint is distributed in custom mode
        source_distributed = frozenset([
            'Acknowledgement', 'Explanation', 'Comment', 'Continuation',
            'Narration', 'Contrast', 'Parallel', 'Background',
        ])
        # Labels whose target endpoint is distributed in custom mode
        target_distributed = frozenset([
            'Result', 'Continuation', 'Narration', 'Comment', 'Contrast',
            'Parallel', 'Background', 'Elaboration',
        ])

        # Careful: this mapping is keyed by hyperedges, hence the calls
        # to self.mirror() when looking a node up
        head_of = self.recursive_cdu_heads(sloppy)

        def edus_inside(node):
            """ List every EDU found (at any depth) inside a node. """
            if self.is_cdu(node):
                return [member
                        for member in self.cdu_members(node, deep=True)
                        if self.is_edu(member)]
            return [node]

        def substitutes(node, label, distributed_labels):
            """ List the nodes that should stand in for `node` as an
                endpoint of a relation carrying `label`.
            """
            if not self.is_cdu(node):
                return [node]
            if mode == 'head':
                # head mode never distributes
                return [head_of[self.mirror(node)]]
            if mode == 'broadcast' or label in distributed_labels:
                # broadcast mode always distributes
                return edus_inside(node)
            return [head_of[self.mirror(node)]]

        def rewire(edge, endpoints):
            """ Replace `edge` by copies whose endpoints are not CDUs. """
            attrs = self.edge_attributes(edge)
            anno = self.annotation(edge)
            src_node, tgt_node = endpoints
            sources = substitutes(src_node, anno.type, source_distributed)
            targets = substitutes(tgt_node, anno.type, target_distributed)

            # The original edge goes away, in the graph and in the document
            self.del_edge(edge)
            self.doc.relations.remove(anno)

            # The index also counts the pairs that get skipped, so the
            # suffixes of the copies are not necessarily consecutive
            pairs = itertools.product(sources, targets)
            for rank, (new_src, new_tgt) in enumerate(pairs):
                if new_src == new_tgt:
                    print("WARNING: something is pointing to its own CDU : " +
                          str(new_src))
                    continue

                # Annotation layer: a fresh Relation with empty features
                src_anno = self.annotation(new_src)
                tgt_anno = self.annotation(new_tgt)
                copy_id = '{0}_{1}'.format(anno._anno_id, rank)
                copy_span = RelSpan(src_anno._anno_id, tgt_anno._anno_id)
                copy_anno = Relation(copy_id, copy_span, anno.type, {})
                copy_anno.source = src_anno
                copy_anno.target = tgt_anno
                self.doc.relations.append(copy_anno)

                # Graph layer: same attributes, pointing at the new Relation
                copy_edge = '{0}_{1}'.format(edge, rank)
                copy_attrs = dict(attrs)
                copy_attrs['annotation'] = copy_anno
                self.add_edge(copy_edge)
                self.add_edge_attributes(copy_edge, copy_attrs.items())
                self.link(new_src, copy_edge)
                self.link(new_tgt, copy_edge)

        # Go through the relations in order, rewiring those touching a CDU
        for rel_edge in self.relations():
            endpoints = self.links(rel_edge)
            # A relation must have exactly two endpoints
            assert len(endpoints) == 2
            if all(not self.is_cdu(endpoint) for endpoint in endpoints):
                continue
            rewire(rel_edge, endpoints)

        # No relation refers to a CDU any longer: drop them from the graph
        for cdu_edge in self.cdus():
            self.del_node(self.mirror(cdu_edge))
            self.del_edge(cdu_edge)
        # ... and from the annotation layer
        self.doc.schemas = [schema for schema in self.doc.schemas
                            if not stac.is_cdu(schema)]
Exemple #3
0
def read_node(node, context=None):
    """Convert a Glozz XML element into the matching Python value.

    The conversion is driven by the tag of `node` and, for tags that are
    read differently depending on where they occur ('metadata',
    'positioning'), by `context`.

    Parameters
    ----------
    node : xml.etree.ElementTree.Element (or compatible)
        Element to convert.
    context : string or None
        Tag of the construct the element is being read for: one of
        'annotations', 'unit', 'relation' or 'schema' where it matters.

    Returns
    -------
    The value depends on the tag:

    * 'annotations': tuple (hashcode or None, units, relations, schemas)
    * 'characterisation': tuple (type, feature dictionary)
    * 'feature': tuple (name, stripped text or None)
    * 'featureSet', 'metadata': dictionary
    * 'metadata' in the 'annotations' context: the corpus hashcode
    * 'positioning': a Span, a RelSpan or a triple of frozensets
    * 'relation', 'schema', 'unit': Relation, Schema, Unit
    * 'singlePosition': integer index
    * 'term' and 'embedded-*': the referenced identifier
    * 'type': stripped text
    * anything else: None

    Raises
    ------
    GlozzException
        If a relation positioning does not have exactly two terms.
    """
    def get_one(name, default, ctx=None):
        """Read the child called `name` in context `ctx`, handing the
        lookup and the use of `default` over to `on_single_element`.
        """
        def read_child(child):
            return read_node(child, ctx)
        return on_single_element(node, default, read_child, name)

    def get_all(name):
        """Read every direct child called `name`, without context."""
        return [read_node(child) for child in node.findall(name)]

    if node.tag == 'annotations':
        hashcode = get_one('metadata', '', 'annotations')
        # Compare by value: identity of equal strings is an
        # implementation detail and must not be relied upon
        if hashcode == '':
            hashcode = None
        units = get_all('unit')
        rels = get_all('relation')
        schemas = get_all('schema')
        return (hashcode, units, rels, schemas)

    elif node.tag == 'characterisation':
        fs = get_one('featureSet', {})
        unit_type = get_one('type', None)
        return (unit_type, fs)

    elif node.tag == 'feature':
        attr = node.attrib['name']
        # A feature without text has no value
        val = node.text.strip() if node.text else None
        return (attr, val)

    # TODO throw exception if we see more than one instance of a key
    elif node.tag == 'featureSet':
        return dict(get_all('feature'))

    elif node.tag == 'metadata' and context == 'annotations':
        # Document-level metadata only carries the corpus hashcode
        return node.attrib['corpusHashcode']

    elif node.tag == 'metadata':
        return dict((child.tag, child.text.strip()) for child in node)

    elif node.tag == 'positioning' and context == 'unit':
        start = get_one('start', None)
        end = get_one('end', None)
        return Span(start, end)

    elif node.tag == 'positioning' and context == 'relation':
        terms = get_all('term')
        if len(terms) != 2:
            raise GlozzException("Was expecting exactly 2 terms, but got %d" %
                                 len(terms))
        return RelSpan(terms[0], terms[1])

    elif node.tag == 'positioning' and context == 'schema':
        units = frozenset(get_all('embedded-unit'))
        relations = frozenset(get_all('embedded-relation'))
        schemas = frozenset(get_all('embedded-schema'))
        return units, relations, schemas

    elif node.tag == 'relation':
        rel_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'relation')
        metadata = get_one('metadata', {})
        return Relation(rel_id, span, unit_type, fs, metadata=metadata)

    elif node.tag == 'schema':
        anno_id = node.attrib['id']
        (anno_type, fs) = get_one('characterisation', None)
        units, rels, schemas = get_one('positioning', None, 'schema')
        metadata = get_one('metadata', {})
        return Schema(anno_id,
                      units,
                      rels,
                      schemas,
                      anno_type,
                      fs,
                      metadata=metadata)

    elif node.tag == 'singlePosition':
        return int(node.attrib['index'])

    elif node.tag == 'start' or node.tag == 'end':
        return get_one('singlePosition', None)

    elif node.tag in [
            'term', 'embedded-unit', 'embedded-relation', 'embedded-schema'
    ]:
        return node.attrib['id']

    elif node.tag == 'type':
        return node.text.strip()

    elif node.tag == 'unit':
        unit_id = node.attrib['id']
        (unit_type, fs) = get_one('characterisation', None)
        span = get_one('positioning', None, 'unit')
        metadata = get_one('metadata', {})
        return Unit(unit_id, span, unit_type, fs, metadata=metadata)

    # Unrecognised tags (and 'positioning' outside a known context)
    # yield no value
    return None
Exemple #4
0
 def __init__(self, id, start, end):
     """Set up a relation with identifier `id` going from `start` to
     `end`, with an empty string as its type and no features.
     """
     endpoints = RelSpan(start, end)
     Relation.__init__(self, id, endpoints, '', {})
Exemple #5
0
    def strip_cdus(self, sloppy=False, mode='head'):
        """Remove every CDU from this graph.

        Relations that had a CDU as an endpoint end up attached to
        elements of that CDU instead.
        Non-head modes may add new edges to the graph.

        Parameters
        ----------
        sloppy : boolean, default=False
            See `cdu_head`.

        mode : string, default='head'
            Strategy for replacing edges involving CDUs.
            `head` will relocate the edge on the recursive head of the
            CDU (see `recursive_cdu_heads`).
            `broadcast` will distribute the edge over all EDUs belonging
            to the CDU. A copy of the edge will be created for each of
            them. If the edge's source and target are both distributed,
            a new copy will be created for each combination of EDUs.
            `custom` (or any other string) will distribute or relocate on
            the head depending on the relation label.
        """
        # Relation labels that spread over the members of a source CDU
        spread_source = frozenset([
            'Acknowledgement', 'Explanation', 'Comment', 'Continuation',
            'Narration', 'Contrast', 'Parallel', 'Background'])

        # Relation labels that spread over the members of a target CDU
        spread_target = frozenset([
            'Result', 'Continuation', 'Narration', 'Comment',
            'Contrast', 'Parallel', 'Background', 'Elaboration'])

        # Beware: the keys of this mapping are hyperedges
        cdu_heads = self.recursive_cdu_heads(sloppy=sloppy)

        def member_edus(node):
            """All the EDUs a node contains, at any depth."""
            if self.is_cdu(node):
                return [sub for sub in self.cdu_members(node, deep=True)
                        if self.is_edu(sub)]
            return [node]

        def replacements(node, label, spreading_labels):
            """Nodes taking over from `node` as the endpoint of a
            relation labelled `label`.
            """
            if not self.is_cdu(node):
                return [node]

            if mode == 'broadcast':
                # always spread over the members
                spread = True
            elif mode == 'head':
                # always fall back on the recursive head
                spread = False
            else:
                # let the relation label decide
                spread = label in spreading_labels

            if spread:
                return member_edus(node)
            return [cdu_heads[self.mirror(node)]]

        # Process the relations in order
        for rel_edge in self.relations():
            endpoints = self.links(rel_edge)
            # Sanity check: a relation has two endpoints
            assert len(endpoints) == 2
            if all(not self.is_cdu(x) for x in endpoints):
                # Nothing to strip on this one
                continue

            saved_attrs = self.edge_attributes(rel_edge)
            rel_anno = self.annotation(rel_edge)
            label = rel_anno.type
            src_node, tgt_node = endpoints
            new_sources = replacements(src_node, label, spread_source)
            new_targets = replacements(tgt_node, label, spread_target)

            # Discard the relation as it stands
            self.del_edge(rel_edge)
            self.doc.relations.remove(rel_anno)

            # One replacement per (source, target) combination; skipped
            # combinations still use up an index
            combos = itertools.product(new_sources, new_targets)
            for idx, (src, tgt) in enumerate(combos):
                if src == tgt:
                    # FIXME this ought to be reported along with the
                    # errors of educe.stac.sanity.checks.graph
                    # (probably through squawk())
                    print("WARNING: something is pointing to its own CDU : " +
                          str(src))
                    continue

                # Annotation layer first: a new, featureless Relation
                src_anno = self.annotation(src)
                tgt_anno = self.annotation(tgt)
                replacement = Relation(
                    '{0}_{1}'.format(rel_anno._anno_id, idx),
                    RelSpan(src_anno._anno_id, tgt_anno._anno_id),
                    label,
                    {})
                replacement.source = src_anno
                replacement.target = tgt_anno
                self.doc.relations.append(replacement)

                # Then the graph layer: an edge carrying the old
                # attributes, apart from the annotation it points to
                edge_name = '{0}_{1}'.format(rel_edge, idx)
                edge_attrs = dict(saved_attrs)
                edge_attrs['annotation'] = replacement
                self.add_edge(edge_name)
                self.add_edge_attributes(edge_name, edge_attrs.items())
                self.link(src, edge_name)
                self.link(tgt, edge_name)

        # The CDUs no longer take part in any relation: drop them from
        # the graph...
        for cdu_edge in self.cdus():
            self.del_node(self.mirror(cdu_edge))
            self.del_edge(cdu_edge)
        # ... and drop their annotations from the document
        self.doc.schemas = [anno for anno in self.doc.schemas
                            if not stac.is_cdu(anno)]
Exemple #6
0
    def _mk_doc(self):
        """ Build the educe.annotation.Document described by this graph.

        Discourse units are named by single letters: the unit called
        'a' sits at offset 0, 'b' at offset 1, and so on, each EDU
        covering one character of `string.ascii_lowercase`, which
        serves as the text of the document.  As a side effect,
        `self.anno_map` is filled with the annotation created for each
        unit name and each (source, target) pair of names.
        """
        # Relation labels standing behind the one-letter tags
        label_for_tag = {'S': 'Q-Elab', 'C': 'Contrast'}

        def offset(name):
            return ord(name) - ord('a')

        def unit_id(name):
            return 'du_' + str(offset(name))

        def is_cdu(name):
            return name in self.cdus

        units = []
        schemas = []
        relations = []

        # EDUs: one segment and one single-EDU turn apiece
        for name, speakers in self.speakers.items():
            if is_cdu(name):
                continue

            begin = offset(name)
            ident = unit_id(name)
            segment = Unit(ident, Span(begin, begin + 1), 'Segment', {})
            emitter = list(speakers)[0]
            turn_features = {'Identifier': begin, 'Emitter': emitter}
            turn = Unit('t' + ident, Span(begin, begin + 1), 'Turn',
                        turn_features)

            self.anno_map[name] = segment
            units.append(segment)
            units.append(turn)

        # CDUs: schemas keeping member EDUs apart from member CDUs
        for name, members in self.cdus.items():
            edu_ids = set(unit_id(m) for m in members if not is_cdu(m))
            cdu_ids = set(unit_id(m) for m in members if is_cdu(m))
            schema = Schema(unit_id(name), edu_ids, set(), cdu_ids,
                            'Complex_discourse_unit', {})
            self.anno_map[name] = schema
            schemas.append(schema)

        # Relations, numbered in the order they are met
        for src_name, outgoing in self.down.items():
            for tgt_name, tag in outgoing:
                ident = 'rel_' + str(len(relations))
                if tag not in label_for_tag:
                    raise ValueError('Unknown tag {0}'.format(tag))

                span = RelSpan(unit_id(src_name), unit_id(tgt_name))
                relation = Relation(ident, span, label_for_tag[tag], {})
                self.anno_map[(src_name, tgt_name)] = relation
                relations.append(relation)

        # A single dialogue reaching up to the end of the last unit
        last_char = max(unit.text_span().char_end for unit in units)
        units.append(Unit('dialogue_0', Span(0, last_char), 'Dialogue', {}))

        return Document(units, relations, schemas, string.ascii_lowercase)
Exemple #7
0
 def __init__(self, id, start, end):
     """Initialise the relation `id`, which runs from `start` to `end`
     and carries neither a type (empty string) nor any feature.
     """
     span = RelSpan(start, end)
     Relation.__init__(self, id, span, '', {})