Exemple #1
0
def get_subordinate_clauses(tiger_docgraph):
    """
    Collect the non-token nodes that a sentence node (``tiger:cat`` is
    ``S``) points to via an edge selected for the ``tiger:label`` values
    ``MO``, ``RC`` and ``SB``.

    Parameters
    ----------
    tiger_docgraph : DiscourseDocumentGraph or TigerDocumentGraph
        document graph from which subordinate clauses will be extracted

    Returns
    -------
    subord_clause_nodes : list(str)
        target node IDs of all matching edges, in the order in which
        ``dg.select_edges_by_attribute`` yields the edges
    """
    candidate_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label', value=['MO', 'RC', 'SB'])

    # keep the edge target if the edge source is an 'S' node and the
    # target itself is not a token
    return [
        child_id
        for parent_id, child_id in candidate_edges
        if tiger_docgraph.node[parent_id].get('tiger:cat') == 'S'
        and not dg.istoken(tiger_docgraph, child_id)
    ]
Exemple #2
0
def get_subordinate_clauses(tiger_docgraph):
    """
    Return the IDs of all non-token nodes that are the target of an edge
    selected for the ``tiger:label`` values ``MO``, ``RC`` and ``SB`` and
    whose edge source has the ``tiger:cat`` value ``S``.

    Parameters
    ----------
    tiger_docgraph : DiscourseDocumentGraph or TigerDocumentGraph
        document graph from which subordinate clauses will be extracted

    Returns
    -------
    subord_clause_nodes : list(str)
        edge target node IDs, one entry per matching edge
    """
    labelled_edges = dg.select_edges_by_attribute(
        tiger_docgraph, attribute='tiger:label', value=['MO', 'RC', 'SB'])

    def is_clause_edge(source_id, target_id):
        """True if the source is an 'S' node and the target is no token."""
        source_cat = tiger_docgraph.node[source_id].get('tiger:cat')
        return source_cat == 'S' and not dg.istoken(tiger_docgraph, target_id)

    return [target_id for source_id, target_id in labelled_edges
            if is_clause_edge(source_id, target_id)]
Exemple #3
0
    def __repair_unconnected_nodes(self):
        """
        Connect every node reported by ``get_unconnected_nodes`` to the
        sentence root with a ``dominance_relation`` edge.

        According to the original documentation, the unconnected nodes are
        token nodes that either represent a punctuation mark or belong to a
        headline 'sentence' without full syntax structure annotation.

        If the current root node is itself a token, a new ``<ns>:VROOT``
        node is added first and becomes the root.
        """
        # collected before a virtual root may be added below
        dangling_ids = get_unconnected_nodes(self)

        root_is_token = dg.istoken(self, self.root)
        if root_is_token:
            # The sentence has no hierarchical structure (its root is a
            # terminal / token node), so a virtual root takes its place.
            # NOTE(review): these layer names are hard-coded to 'tiger',
            # while the edge layers below are derived from self.ns --
            # confirm that this is intended.
            vroot_layers = {'tiger', 'tiger:syntax', 'tiger:sentence',
                            'tiger:sentence:root'}
            self.root = self.ns + ':VROOT'
            self.add_node(self.root, layers=vroot_layers)

        for dangling_id in dangling_ids:
            # a fresh set per edge, so that edges never share one object
            edge_layers = {self.ns, self.ns + ':sentence',
                           self.ns + ':unconnected'}
            self.add_edge(self.root, dangling_id, layers=edge_layers,
                          edge_type=EdgeTypes.dominance_relation)
Exemple #4
0
    def __gen_struct_anno_files(self, top_level_layer):
        """
        Build the struct annotation document for one top level layer.

        A struct annotation file holds the attributes of non-token
        (struct) nodes, e.g. the type of a syntactic category (NP, VP
        etc.). Every non-token node of the layer gets one ``multiFeat``
        element with one ``feat`` child per node attribute that is not
        listed in ``IGNORED_NODE_ATTRIBS``.

        The resulting tree is stored in ``self.files`` and its DTD type in
        ``self.file2dtd``, both under the generated PAULA ID, which is
        also returned.

        See also: __gen_hierarchy_file()
        """
        paula_id = '{0}.{1}.{2}_{3}_struct'.format(
            top_level_layer, self.corpus_name, self.name, top_level_layer)
        E, tree = gen_paula_etree(paula_id)

        # the features refer to nodes of the layer's hierarchy file
        hierarchy_file = self.paulamap['hierarchy'][top_level_layer] + '.xml'
        feat_list = E('multiFeatList', {XMLBASE: hierarchy_file})

        for node_id in select_nodes_by_layer(self.dg, top_level_layer):
            if istoken(self.dg, node_id):
                continue  # only non-token nodes are annotated here

            multi_feat = E('multiFeat', {XLINKHREF: '#{0}'.format(node_id)})
            attrs = self.dg.node[node_id]
            for attr_name, attr_value in attrs.items():
                if attr_name in IGNORED_NODE_ATTRIBS:
                    continue
                multi_feat.append(
                    E('feat', {'name': attr_name, 'value': attr_value}))

            if self.human_readable:
                # adds node label as a <!--comment-->
                multi_feat.append(Comment(attrs.get('label')))
            feat_list.append(multi_feat)

        tree.append(feat_list)
        self.files[paula_id] = tree
        self.file2dtd[paula_id] = PaulaDTDs.multifeat
        return paula_id
Exemple #5
0
 def __gen_node_href(self, layer, node_id):
     """
     Build the full xlink:href (``<paula id>.xml#<node id>``) for any
     node of the docgraph (token node, structure node etc.).

     Token nodes are resolved against ``self.paulamap['tokenization']``,
     all other nodes against ``self.paulamap['hierarchy'][layer]``. This
     only works AFTER the corresponding PAULA files have been created
     and their names are registered in ``self.paulamap``.
     """
     base_paula_id = (self.paulamap['tokenization']
                      if istoken(self.dg, node_id)
                      else self.paulamap['hierarchy'][layer])
     return '{0}.xml#{1}'.format(base_paula_id, node_id)
def node2bracket(docgraph, node_id, child_str=''):
    """
    Render one docgraph node as a PTB-style bracketed string.

    A token node becomes ``(POS token child_str)``, any other node becomes
    ``(label child_str)``. Empty parts are left out together with the
    space that would separate them.
    """
    attrs = docgraph.node[node_id]

    if not istoken(docgraph, node_id):
        label = attrs.get('label', '')
        # a separating space is only needed between two non-empty parts
        separator = ' ' if (label and child_str) else ''
        return u"({label}{space}{child})".format(
            label=label, space=separator, child=child_str)

    pos = attrs.get(docgraph.ns + ':pos', '')
    token = attrs[docgraph.ns + ':token']
    return u"({pos}{space1}{token}{space2}{child})".format(
        pos=pos, space1=' ' if pos else '', token=token,
        space2=' ' if child_str else '', child=child_str)
Exemple #7
0
def node2freqt(docgraph, node_id, child_str='', include_pos=False,
               escape_func=FREQT_ESCAPE_FUNC):
    """
    Render one docgraph node as a FREQT string.

    A non-token node becomes ``(label child_str)`` without a separating
    space; the node ID is used if the node has no ``label``. A token node
    becomes ``(token child_str)`` or, if ``include_pos`` is set,
    ``(POS(token) child_str)``, again without separating spaces. Labels,
    tokens and POS tags are passed through ``escape_func``.
    """
    attrs = docgraph.node[node_id]

    if not istoken(docgraph, node_id):
        label = escape_func(attrs.get('label', node_id))
        return u"({label}{child})".format(label=label, child=child_str)

    token = escape_func(attrs[docgraph.ns + ':token'])
    if not include_pos:
        return u"({token}{child})".format(token=token, child=child_str)

    pos = escape_func(attrs.get(docgraph.ns + ':pos', ''))
    return u"({pos}({token}){child})".format(
        pos=pos, token=token, child=child_str)
Exemple #8
0
def node2bracket(docgraph, node_id, child_str=''):
    """
    Convert a docgraph node into a PTB-style string.

    Token nodes are rendered from their ``<ns>:pos`` (optional) and
    ``<ns>:token`` attributes, all other nodes from their ``label``
    (optional). ``child_str`` is appended inside the brackets; a space is
    only inserted between parts that are both non-empty.
    """
    node_attrs = docgraph.node[node_id]

    if istoken(docgraph, node_id):
        pos_tag = node_attrs.get(docgraph.ns + ':pos', '')
        word = node_attrs[docgraph.ns + ':token']
        after_pos = ' ' if pos_tag else ''
        after_word = ' ' if child_str else ''
        return u"({pos}{space1}{token}{space2}{child})".format(
            pos=pos_tag, space1=after_pos, token=word,
            space2=after_word, child=child_str)

    # node is not a token
    category = node_attrs.get('label', '')
    gap = ' ' if category and child_str else ''
    return u"({label}{space}{child})".format(
        label=category, space=gap, child=child_str)
Exemple #9
0
def traverse_dependencies_up(docgraph, node_id, node_attr=None):
    """
    Starting from the given node, follow ingoing edges up towards the root
    element of the sentence and yield the requested node attribute of the
    nodes visited along the way.

    Only the first ingoing edge of a node is followed. The traversal
    continues as long as the node reached is a token (according to
    ``istoken``). Nodes whose attribute value is missing or falsy are
    visited but yield nothing. If a node has no ingoing edge at all, the
    generator simply ends instead of raising an ``IndexError``.

    Parameters
    ----------
    docgraph : document graph
        graph providing ``in_edges()``, ``node`` and ``lemma_attr``
    node_id : str
        ID of the node to start from (the node itself is not reported)
    node_attr : str or None
        name of the node attribute to yield; ``docgraph.lemma_attr`` is
        used if this is None (or otherwise falsy)

    Yields
    ------
    attrib_value
        the attribute value of each visited node that has a truthy one,
        ordered from the closest ancestor upwards
    """
    traverse_attr = node_attr if node_attr else docgraph.lemma_attr

    # There should be only one ingoing edge, but we're in a multidigraph.
    # next(iter(...)) works for a list of edges as well as for a
    # non-subscriptable edge view, and copes with "no ingoing edge".
    first_in_edge = next(iter(docgraph.in_edges(node_id)), None)
    if first_in_edge is None:
        return
    source = first_in_edge[0]

    attrib_value = docgraph.node[source].get(traverse_attr)
    if attrib_value:
        yield attrib_value

    if istoken(docgraph, source):
        for ancestor_value in traverse_dependencies_up(docgraph, source,
                                                       traverse_attr):
            yield ancestor_value
Exemple #10
0
def node2freqt(docgraph,
               node_id,
               child_str='',
               include_pos=False,
               escape_func=FREQT_ESCAPE_FUNC):
    """
    Convert a docgraph node into a FREQT string.

    Token nodes are rendered as ``(token child_str)`` or, with
    ``include_pos``, as ``(POS(token) child_str)``; all other nodes as
    ``(label child_str)``, using the node ID if there is no ``label``.
    No separating spaces are inserted, and every label, token and POS tag
    is passed through ``escape_func`` first.
    """
    node_attrs = docgraph.node[node_id]
    is_token = istoken(docgraph, node_id)

    if is_token:
        word = escape_func(node_attrs[docgraph.ns + ':token'])
        if include_pos:
            tag = escape_func(node_attrs.get(docgraph.ns + ':pos', ''))
            template = u"({pos}({token}){child})"
            return template.format(pos=tag, token=word, child=child_str)
        template = u"({token}{child})"
        return template.format(token=word, child=child_str)

    # node is not a token
    category = escape_func(node_attrs.get('label', node_id))
    template = u"({label}{child})"
    return template.format(label=category, child=child_str)
Exemple #11
0
    def __repair_unconnected_nodes(self):
        """
        Add a ``dominance_relation`` edge from the sentence root node to
        each node returned by ``get_unconnected_nodes``.

        The original documentation describes these nodes as token nodes
        that either represent a punctuation mark or are part of a headline
        'sentence' that has no full syntax structure annotation.

        When the root node itself is a token, a virtual root node
        (``<ns>:VROOT``) is created and used as the new root.
        """
        # determined before the root is possibly replaced below
        orphan_ids = get_unconnected_nodes(self)

        if dg.istoken(self, self.root):
            # The root is a terminal / token node, i.e. the sentence has
            # no hierarchical structure. A virtual root compensates for it.
            # NOTE(review): the layers of the virtual root are hard-coded
            # to 'tiger' instead of being built from self.ns like the edge
            # layers below -- confirm that this is intended.
            self.root = self.ns + ':VROOT'
            self.add_node(
                self.root,
                layers={'tiger', 'tiger:syntax', 'tiger:sentence',
                        'tiger:sentence:root'})

        for orphan_id in orphan_ids:
            # the layer set is rebuilt for every edge on purpose, so that
            # no two edges share the same set object
            self.add_edge(
                self.root, orphan_id,
                layers={self.ns,
                        self.ns + ':sentence',
                        self.ns + ':unconnected'},
                edge_type=EdgeTypes.dominance_relation)
Exemple #12
0
def get_rst_relations(docgraph):
    """
    returns a dictionary with RST relation root node IDs (str, e.g. 'rst:23')
    as keys and dictionaries describing these RST relations as values.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains RST annotations

    Returns
    -------
    rst_relations : defaultdict(defaultdict(str))
        maps from an RST relation root node ID (str, e.g. 'rst:23') to a
        dictionary describing this RST relation.
        possible keys: 'tokens', 'nucleus', 'satellites', 'multinuc'
        A key is only set if the relation has a matching element; a root
        node with neither directly dominated tokens nor segment/group
        neighbors gets no entry at all.
        The key 'tokens' maps to the naturally sorted list of token node IDs
        which are direct neighbors of the relation root node.
        The key 'nucleus' maps to a
        (node ID (str), result of ``get_span`` for that node) pair
        describing the neighbor reached via a spanning relation. If there
        are several such neighbors, the last one wins.
        The key 'satellites' maps to a list of
        (node ID (str), RST relation name (str), result of ``get_span``)
        triples; each one describes a satellite (edge attribute
        'rst:rel_type' is 'rst').
        The key 'multinuc' maps to a list of
        (node ID (str), RST relation name (str), result of ``get_span``)
        triples; each one describes a nucleus of a multinuclear relation
        (edge attribute 'rst:rel_type' is 'multinuc').
        ``get_span`` presumably returns the token node IDs spanned by the
        node -- confirm against its definition.

    Raises
    ------
    NotImplementedError
        if an edge to a segment/group neighbor is neither a spanning
        relation nor of 'rst:rel_type' 'rst' or 'multinuc'
    """
    rst_relations = defaultdict(lambda: defaultdict(str))

    # only the root node ID of each relation is needed here
    for dom_node, _relname, _toks in get_rst_relation_root_nodes(docgraph):
        neighbors = list(
            select_neighbors_by_layer(docgraph, dom_node,
                                      layer={'rst:segment', 'rst:group'}))
        directly_dominated_tokens = sorted(
            (node for node in docgraph.neighbors(dom_node)
             if istoken(docgraph, node)),
            key=natural_sort_key)
        if directly_dominated_tokens:
            rst_relations[dom_node]['tokens'] = directly_dominated_tokens

        for neighbor in neighbors:
            # multidigraph: two nodes can be linked by several edges
            for edge_attrs in docgraph[dom_node][neighbor].values():
                if edge_attrs['edge_type'] == EdgeTypes.spanning_relation:
                    # a span always signifies the nucleus of a relation
                    # there can be only one
                    rst_relations[dom_node]['nucleus'] = (
                        neighbor, get_span(docgraph, neighbor))
                elif edge_attrs['rst:rel_type'] == 'rst':
                    # a segment/group nucleus can dominate multiple
                    # satellites (in different RST relations)
                    satellite = (neighbor, edge_attrs['rst:rel_name'],
                                 get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'satellites', []).append(satellite)
                elif edge_attrs['rst:rel_type'] == 'multinuc':
                    nucleus = (neighbor, edge_attrs['rst:rel_name'],
                               get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'multinuc', []).append(nucleus)
                else:
                    raise NotImplementedError("unknown type of RST segment domination")
    return rst_relations
Exemple #13
0
    def __gen_hierarchy_file(self, layer):
        """
        Build the hierarchy document (a <structList>) for one layer.

        Hierarchical structures are used to create hierarchically nested
        annotation graphs (e.g. consists-of relationships or dominance
        edges in syntax trees, RST). Each hierarchical node (e.g. an NP)
        becomes a <struct> element that holds one <rel> element per
        dominated element (e.g. tokens, other <struct> elements).
        Dominance edges starting at ``<layer>:root_node`` are left out;
        spanning edges are included if they point to a token.

        NOTE: The types/labels of these hierarchical nodes and edges
        aren't stored in this file, but in feat/multiFeat files
        referencing this one! See: __gen_struct_anno_files() and
        __gen_rel_anno_file()).

        There will be one hierarchy file for each top level layer. The
        generated PAULA ID is registered in ``self.paulamap['hierarchy']``,
        the tree in ``self.files`` and its DTD type in ``self.file2dtd``;
        the PAULA ID is also returned.

        TODO: check, if we can omit hierarchy files for layers that don't
              contain dominance edges
        """
        paula_id = '{0}.{1}.{2}_{3}'.format(
            layer, self.corpus_name, self.name, layer)
        self.paulamap['hierarchy'][layer] = paula_id
        E, tree = gen_paula_etree(paula_id)

        dominance_edges = select_edges_by(
            self.dg, layer=layer, edge_type=EdgeTypes.dominance_relation,
            data=True)
        span_edges = select_edges_by(
            self.dg, layer=layer, edge_type=EdgeTypes.spanning_relation,
            data=True)

        # parent node ID -> {child node ID -> edge attributes}
        children_of = defaultdict(dict)
        layer_root_id = layer + ':root_node'
        for parent_id, child_id, attrs in dominance_edges:
            if parent_id == layer_root_id:
                continue
            children_of[parent_id][child_id] = attrs

        # in PAULA XML, token spans are also part of the hierarchy
        for parent_id, child_id, attrs in span_edges:
            if istoken(self.dg, child_id):
                children_of[parent_id][child_id] = attrs

        # NOTE: we don't add a base file here, because the nodes could be
        # tokens or structural nodes
        struct_list = E('structList', {'type': layer})
        for parent_id, children in children_of.items():
            struct = E('struct', {'id': str(parent_id)})
            if self.human_readable:
                struct.append(Comment(self.dg.node[parent_id].get('label')))

            for child_id, attrs in children.items():
                if istoken(self.dg, child_id):
                    # tokens live in the tokenization file
                    href = '{0}.xml#{1}'.format(
                        self.paulamap['tokenization'], child_id)
                else:
                    # other nodes are <struct> elements of this very file
                    href = '#{0}'.format(child_id)

                rel_attribs = {
                    'id': 'rel_{0}_{1}'.format(parent_id, child_id),
                    'type': attrs['edge_type'],
                    XLINKHREF: href,
                }
                struct.append(E('rel', rel_attribs))
                if self.human_readable:
                    struct.append(
                        Comment(self.dg.node[child_id].get('label')))
            struct_list.append(struct)

        tree.append(struct_list)
        self.files[paula_id] = tree
        self.file2dtd[paula_id] = PaulaDTDs.struct
        return paula_id