def get_rst_relation_root_nodes(docgraph, data=True, rst_namespace='rst'):
    """
    yield all nodes that dominate one or more RST relations in the given
    document graph (in no particular order).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains RST annotations
    data : bool
        If True (default), yields (node ID, relation name, list of tokens)
        tuples. If False, yields just node IDs.
    rst_namespace : str
        The namespace that the RST annotations use (default: rst)

    Yields
    ------
    relations : str or (str, str, list of str) tuples
        If data=False, this will just yield node IDs of the nodes that
        directly dominate an RST relation. If data=True, this yields
        tuples of the form: (node ID, relation name, list of tokens that
        this relation spans).
    """
    relname_key = '{0}:rel_name'.format(rst_namespace)
    for node_id, node_attrs in docgraph.nodes_iter(data=True):
        # nodes without a relation name attribute don't dominate a relation;
        # 'span' edges are structural and don't count as relations either
        if relname_key not in node_attrs:
            continue
        rel_name = node_attrs[relname_key]
        if rel_name == 'span':
            continue
        if data:
            yield (node_id, rel_name, get_span(docgraph, node_id))
        else:
            yield node_id
def get_conano_units(docgraph, data=True, conano_namespace='conano'):
    """
    yield all Conano units that occur in the given document graph,
    sorted by their unit ID. int(ernal) and ext(ernal) count as distinct
    units.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    data : bool
        If True (default), yields (node ID, list of tokens) tuples.
        If False, yields just unit IDs.
    conano_namespace : str
        The namespace that the Conano annotations use (default: conano)

    Yields
    ------
    units : str or (str, list of str) tuples
        If data=False, this will just yield the node IDs of the Conano
        units. If data=True, this yields (node ID, list of token node IDs
        that the unit spans) tuples.
    """
    unit_layer = conano_namespace + ':unit'
    sorted_unit_ids = sorted(select_nodes_by_layer(docgraph, unit_layer),
                             key=natural_sort_key)
    for unit_id in sorted_unit_ids:
        if data:
            yield (unit_id, get_span(docgraph, unit_id))
        else:
            yield unit_id
def __add_annotation_tier(self, docgraph, body, annotation_layer):
    """
    Add a span-based annotation layer as a <tier> to the Exmaralda <body>.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the annotation spans will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    annotation_layer : str
        the name of a layer, e.g. 'tiger', 'tiger:token' or 'mmax:sentence'
    """
    # the tier category is the last segment of the layer name,
    # e.g. 'mmax:sentence' -> 'sentence'
    category = annotation_layer.split(':')[-1]
    tier = self.E('tier',
                  {'id': "TIE{}".format(self.tier_count),
                   'category': category,
                   'type': "t",
                   'display-name': "[{}]".format(annotation_layer)})
    self.tier_count += 1

    for anno_node_id in select_nodes_by_layer(docgraph, annotation_layer):
        spanned_tokens = get_span(docgraph, anno_node_id)
        if not spanned_tokens:
            continue  # nodes without a token span produce no <event>
        first_id, last_id = self.__span2event(spanned_tokens)
        label = docgraph.node[anno_node_id].get('label', '')
        tier.append(self.E('event',
                           {'start': "T{}".format(first_id),
                            'end': "T{}".format(last_id)},
                           label))
    body.append(tier)
def get_conano_units(docgraph, data=True, conano_namespace='conano'):
    """
    yield all Conano units that occur in the given document graph,
    sorted by their unit ID. int(ernal) and ext(ernal) count as distinct
    units.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    data : bool
        If True (default), yields (node ID, list of tokens) tuples.
        If False, yields just unit IDs.
    conano_namespace : str
        The namespace that the Conano annotations use (default: conano)

    Yields
    ------
    units : str or (str, list of str) tuples
        If data=False, this will just yield the node IDs of the Conano
        units. If data=True, this yields (node ID, list of token node IDs
        that the unit spans) tuples.
    """
    unit_nodes = select_nodes_by_layer(docgraph, conano_namespace + ':unit')
    if data:
        for unit_id in sorted(unit_nodes, key=natural_sort_key):
            yield (unit_id, get_span(docgraph, unit_id))
    else:
        for unit_id in sorted(unit_nodes, key=natural_sort_key):
            yield unit_id
def __add_annotation_tier(self, docgraph, body, annotation_layer):
    """
    Add a span-based annotation layer as a <tier> to the Exmaralda <body>.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the annotation spans will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    annotation_layer : str
        the name of a layer, e.g. 'tiger', 'tiger:token' or 'mmax:sentence'
    """
    tier_attribs = {
        'id': "TIE{}".format(self.tier_count),
        # last segment of the layer name, e.g. 'mmax:sentence' -> 'sentence'
        'category': annotation_layer.split(':')[-1],
        'type': "t",
        'display-name': "[{}]".format(annotation_layer),
    }
    tier = self.E('tier', tier_attribs)
    self.tier_count += 1

    for node_id in select_nodes_by_layer(docgraph, annotation_layer):
        span = get_span(docgraph, node_id)
        if span:  # only nodes with a token span become <event> elements
            start, end = self.__span2event(span)
            event_attribs = {'start': "T{}".format(start),
                             'end': "T{}".format(end)}
            label = docgraph.node[node_id].get('label', '')
            tier.append(self.E('event', event_attribs, label))
    body.append(tier)
def get_connective(docgraph, unit_id):
    """
    Return the lowercased string of the connective used in the given
    Conano unit.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    unit_id : str
        a Conano unit ID of the form '<index>:<type>', e.g. '4:int'

    Returns
    -------
    connective : str
        the space-joined, lowercased tokens of the unit's connective
    """
    # keep the 2-element unpacking: a malformed unit_id raises ValueError
    unit_index, _unit_type = unit_id.split(':')
    connective_id = '{0}:connective'.format(unit_index)
    lowered_tokens = [docgraph.get_token(tok_id).lower()
                      for tok_id in get_span(docgraph, connective_id)]
    return ' '.join(lowered_tokens)
def get_connective(docgraph, unit_id):
    """
    Return the lowercased string of the connective used in the given
    Conano unit.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    unit_id : str
        a Conano unit ID of the form '<index>:<type>', e.g. '4:int'

    Returns
    -------
    connective : str
        the space-joined, lowercased tokens of the unit's connective
    """
    # keep the 2-element unpacking: a malformed unit_id raises ValueError
    unit_index, _unit_type = unit_id.split(':')
    token_ids = get_span(docgraph, unit_index + ':connective')
    words = (docgraph.get_token(tok_id).lower() for tok_id in token_ids)
    return ' '.join(words)
def test_tiger_sentence_spans():
    """
    convert a TigerXML sentence (without a secondary edge) into a
    ``TigerSentenceGraph`` and check, if the syntax nodes cover the right
    tokens / string spans.

    Token node IDs follow the pattern 's389_1' .. 's389_11'; syntax node
    IDs use the 's389_5xx' range.
    """
    maz_13125_s389 = etree.fromstring(SENTENCE_WITHOUT_SECEDGE)
    tsg = TigerSentenceGraph(maz_13125_s389)

    # the root element should cover the complete sentence
    assert get_span(tsg, 's389_503') == [
        's389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6',
        's389_7', 's389_8', 's389_9', 's389_10', 's389_11']
    assert get_text(tsg, 's389_503') == \
        u"Was man nicht durch Augenschein nachprüfen kann , ist manipulierbar ."
    assert dg.is_continuous(tsg, 's389_503')

    # a subordinated ('SB') clause
    assert get_span(tsg, 's389_502') == [
        's389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6',
        's389_7']
    assert get_text(tsg, 's389_502') == \
        u"Was man nicht durch Augenschein nachprüfen kann"
    assert dg.is_continuous(tsg, 's389_502')

    # a discontinuously annotated VP ('OC', i.e. a clausal object):
    # it skips tokens 2-3, so is_continuous must be False
    assert get_span(tsg, 's389_501') == [
        's389_1', 's389_4', 's389_5', 's389_6']
    assert get_text(tsg, 's389_501') == \
        u"Was durch Augenschein nachprüfen"
    assert not dg.is_continuous(tsg, 's389_501')

    # a PP modifier ('MO')
    assert get_span(tsg, 's389_500') == ['s389_4', 's389_5']
    assert get_text(tsg, 's389_500') == \
        u"durch Augenschein"
    assert dg.is_continuous(tsg, 's389_500')
def __add_coreference_chain_tiers(self, docgraph, body, min_chain_length=3):
    """
    Add one Exmaralda <tier> per coreference chain to the <body> element.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the chains will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    min_chain_length : int
        don't add tiers for chains with less than N elements (default: 3)

    TODO: this method assumes that each pointing relation chains signifies
    a coreference chain.
    """
    E = self.E
    for i, chain in enumerate(get_pointing_chains(docgraph)):
        chain_length = len(chain)
        # Check the chain length *before* building the tier: the previous
        # version created the <tier> element and incremented
        # self.tier_count even for chains it then skipped, leaking unused
        # TIE ids (gaps in the numbering) and building dead elements.
        if chain_length < min_chain_length:
            continue  # ignore short chains
        chain_tier = E('tier',
                       {'id': "TIE{}".format(self.tier_count),
                        'category': "chain", 'type': "t",
                        'display-name': "[coref-chain-{}]".format(i)})
        self.tier_count += 1
        for j, node_id in enumerate(chain):
            span_node_ids = get_span(docgraph, node_id)
            if span_node_ids:  # markables without a token span get no event
                start_id, end_id = self.__span2event(span_node_ids)
                # event label encodes the markable's position in its
                # chain, e.g. "chain_0: 3/5"
                element_str = "chain_{0}: {1}/{2}".format(
                    i, chain_length - j, chain_length)
                chain_tier.append(
                    E('event', {'start': "T{}".format(start_id),
                                'end': "T{}".format(end_id)}, element_str))
        body.append(chain_tier)
def __build_markable_token_mapper(self, coreference_layer=None,
                                  markable_layer=None):
    """
    Creates mappings from tokens to the markable spans they belong to
    and the coreference chains these markables are part of.

    Returns
    -------
    tok2markables : dict (str -> set of str)
        Maps from a token (node ID) to all the markables (node IDs)
        it is part of.
    markable2toks : dict (str -> list of str)
        Maps from a markable (node ID) to all the tokens (node IDs)
        that belong to it.
    markable2chains : dict (str -> list of int)
        Maps from a markable (node ID) to all the chains (chain ID) it
        belongs to.
    """
    tok2markables = defaultdict(set)
    markable2toks = defaultdict(list)
    markable2chains = defaultdict(list)

    chains = get_pointing_chains(self.docgraph, layer=coreference_layer)
    for chain_id, chain in enumerate(chains):
        for markable in chain:
            markable2chains[markable].append(chain_id)

    # singleton chain IDs start right after the last real chain ID
    next_singleton_id = len(chains)

    # markable2toks/tok2markables shall contain all markables, not only
    # those which are part of a coreference chain
    for markable in select_nodes_by_layer(self.docgraph, markable_layer):
        token_ids = get_span(self.docgraph, markable)
        markable2toks[markable] = token_ids
        for token_id in token_ids:
            tok2markables[token_id].add(markable)
        # singletons each represent their own chain (with only one element)
        if markable not in markable2chains:
            markable2chains[markable] = [next_singleton_id]
            next_singleton_id += 1

    return tok2markables, markable2toks, markable2chains
def __add_coreference_chain_tiers(self, docgraph, body, min_chain_length=3):
    """
    Add one Exmaralda <tier> per coreference chain to the <body> element.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the chains will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    min_chain_length : int
        don't add tiers for chains with less than N elements (default: 3)

    TODO: this method assumes that each pointing relation chains signifies
    a coreference chain.
    """
    E = self.E
    for i, chain in enumerate(get_pointing_chains(docgraph)):
        chain_length = len(chain)
        # Fix: filter short chains *before* creating the tier. Previously
        # the <tier> was built and self.tier_count incremented even for
        # skipped chains, which left gaps in the TIE id numbering and
        # created elements that were never appended.
        if chain_length < min_chain_length:
            continue  # ignore short chains
        chain_tier = E('tier',
                       {'id': "TIE{}".format(self.tier_count),
                        'category': "chain", 'type': "t",
                        'display-name': "[coref-chain-{}]".format(i)})
        self.tier_count += 1
        for j, node_id in enumerate(chain):
            span_node_ids = get_span(docgraph, node_id)
            if span_node_ids:  # markables without a token span get no event
                start_id, end_id = self.__span2event(span_node_ids)
                # event label encodes the markable's position in its
                # chain, e.g. "chain_0: 3/5"
                element_str = "chain_{0}: {1}/{2}".format(
                    i, chain_length - j, chain_length)
                chain_tier.append(
                    E('event', {'start': "T{}".format(start_id),
                                'end': "T{}".format(end_id)}, element_str))
        body.append(chain_tier)
def test_get_span():
    """get spans from an sentence graph with dominance, spanning and
    pointing relations, but without self-loops"""
    sg1 = make_sentencegraph1()
    # the graph contains a cycle (via a pointing relation), so it is not a DAG
    assert is_directed_acyclic_graph(sg1) is False
    assert len(sg1) == 15
    # token nodes only "span" themselves
    for i in range(8):
        assert dg.get_span(sg1, i) == [i]
    # the sentence covers all tokens, except for the markers ',' and '.'
    assert dg.get_span(sg1, 'S') == [0, 1, 3, 4, 5, 6]
    assert dg.get_span(sg1, 'NP1') == [0]
    assert dg.get_span(sg1, 'VP1') == [1]
    assert dg.get_span(sg1, 'SBAR') == [3, 4, 5, 6]
    assert dg.get_span(sg1, 'NP2') == [3]
    assert dg.get_span(sg1, 'VP2') == [4, 5, 6]
    # the debug parameter should 'raise' a warning (since the graph is
    # cyclic), but the result must be the same)
    assert dg.get_span(sg1, 'S', debug=True) == [0, 1, 3, 4, 5, 6]
    # get_span() must be robust against self-loops
    sg1.add_edge('SBAR', 'SBAR', layers={sg1.ns+':selfloop'},
                 edge_type=dg.EdgeTypes.dominance_relation)
    assert dg.get_span(sg1, 'S') == [0, 1, 3, 4, 5, 6]
    assert dg.get_span(sg1, 'SBAR') == [3, 4, 5, 6]
    assert dg.get_span(sg1, 'SBAR', debug=True) == [3, 4, 5, 6]
    # get_span() won't be able to recover from a dominance relation
    # (non self)-loop
    sg1.add_edge('NP1', 'S', layers={sg1.ns+':loop'},
                 edge_type=dg.EdgeTypes.dominance_relation)
    with pytest.raises(RuntimeError) as excinfo:
        assert dg.get_span(sg1, 'S')
def test_tiger_sentence_with_secedge_spans():
    """
    convert a TigerXML sentence (with a secondary edge) into a
    ``TigerSentenceGraph`` and check, if the syntax nodes cover the right
    tokens / string spans.

    Token node IDs follow the pattern 's367_1' .. 's367_20'; syntax node
    IDs use the 's367_5xx' range.
    """
    maz_12976_s367 = etree.fromstring(SENTENCE_WITH_SECEDGE)
    tsg_secedge = TigerSentenceGraph(maz_12976_s367)

    # sentence root
    assert get_span(tsg_secedge, 's367_508') == [
        's367_1', 's367_2', 's367_3', 's367_4', 's367_5', 's367_6',
        's367_7', 's367_8', 's367_9', 's367_10', 's367_11', 's367_12',
        's367_13', 's367_14', 's367_15', 's367_16', 's367_17', 's367_18',
        's367_19', 's367_20']
    assert get_text(tsg_secedge, 's367_508') == \
        u"Es kann nicht sein , dass die Basis gewissermaßen die Moral pachtet und ihn die realpolitische Schmutzarbeit machen lässt ."
    assert dg.is_continuous(tsg_secedge, 's367_508')

    # discontinuous NP (skips tokens 2-5, so is_continuous must be False)
    assert get_span(tsg_secedge, 's367_507') == [
        's367_1', 's367_6', 's367_7', 's367_8', 's367_9', 's367_10',
        's367_11', 's367_12', 's367_13', 's367_14', 's367_15', 's367_16',
        's367_17', 's367_18', 's367_19']
    assert get_text(tsg_secedge, 's367_507') == \
        u"Es dass die Basis gewissermaßen die Moral pachtet und ihn die realpolitische Schmutzarbeit machen lässt"
    assert not dg.is_continuous(tsg_secedge, 's367_507')

    # a coordinated sentence ('CS')
    assert get_span(tsg_secedge, 's367_506') == [
        's367_6', 's367_7', 's367_8', 's367_9', 's367_10', 's367_11',
        's367_12', 's367_13', 's367_14', 's367_15', 's367_16', 's367_17',
        's367_18', 's367_19']
    assert get_text(tsg_secedge, 's367_506') == \
        u"dass die Basis gewissermaßen die Moral pachtet und ihn die realpolitische Schmutzarbeit machen lässt"
    assert dg.is_continuous(tsg_secedge, 's367_506')

    # a conjunct sentence ('CJ') with an ingoing secondary edge
    assert get_span(tsg_secedge, 's367_503') == [
        's367_6', 's367_7', 's367_8', 's367_9', 's367_10', 's367_11',
        's367_12']
    assert get_text(tsg_secedge, 's367_503') == \
        u"dass die Basis gewissermaßen die Moral pachtet"
    assert dg.is_continuous(tsg_secedge, 's367_503')

    assert get_span(tsg_secedge, 's367_500') == ['s367_7', 's367_8']
    assert get_text(tsg_secedge, 's367_500') == u"die Basis"
    assert dg.is_continuous(tsg_secedge, 's367_500')

    assert get_span(tsg_secedge, 's367_501') == ['s367_10', 's367_11']
    assert get_text(tsg_secedge, 's367_501') == u"die Moral"
    assert dg.is_continuous(tsg_secedge, 's367_501')

    # a conjunct sentence ('CJ') with an ingoing secondary edge
    assert get_span(tsg_secedge, 's367_505') == [
        's367_14', 's367_15', 's367_16', 's367_17', 's367_18', 's367_19']
    assert get_text(tsg_secedge, 's367_505') == \
        u"ihn die realpolitische Schmutzarbeit machen lässt"
    assert dg.is_continuous(tsg_secedge, 's367_505')

    # a clausal object ('OC') VP
    assert get_span(tsg_secedge, 's367_504') == [
        's367_15', 's367_16', 's367_17', 's367_18']
    assert get_text(tsg_secedge, 's367_504') == \
        u"die realpolitische Schmutzarbeit machen"
    assert dg.is_continuous(tsg_secedge, 's367_504')

    assert get_span(tsg_secedge, 's367_502') == [
        's367_15', 's367_16', 's367_17']
    assert get_text(tsg_secedge, 's367_502') == \
        u"die realpolitische Schmutzarbeit"
    assert dg.is_continuous(tsg_secedge, 's367_502')
def get_rst_relations(docgraph):
    """
    returns a dictionary with RST relation root node IDs (str, e.g.
    'rst:23') as keys and dictionaries describing these RST relations as
    values.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains RST annotations

    Returns
    -------
    rst_relations : defaultdict(str)
        possible keys: 'tokens', 'nucleus', 'satellites', 'multinuc'
        maps from an RST relation root node ID (str, e.g. 'rst:23') to a
        dictionary describing this RST relation.
        The key 'tokens' maps to a list of token (node IDs) which the
        relation spans.
        If the dictionary contains the key 'multinuc', the relation is
        multinuclear and the keys 'nucleus' and 'satellites' contain
        nothing.
        The key 'multinuc' maps to a list of (node ID (str), RST reltype
        (str), list of token node IDs) triples; each one describes a
        nucleus.
        The key 'nucleus' maps to a list of token (node IDs) which the
        relation spans.
        The key 'satellites' maps to a list of (node ID (str), RST
        reltype (str), list of token node IDs) triples; each one
        describes a satellite.
    """
    rst_relations = defaultdict(lambda: defaultdict(str))

    for dom_node, relname, toks in get_rst_relation_root_nodes(docgraph):
        neighbors = list(select_neighbors_by_layer(
            docgraph, dom_node, layer={'rst:segment', 'rst:group'}))
        # tokens dominated by the relation root itself (i.e. not via a
        # segment/group node)
        directly_dominated_tokens = sorted(
            [node for node in docgraph.neighbors(dom_node)
             if istoken(docgraph, node)],
            key=natural_sort_key)
        if directly_dominated_tokens:
            rst_relations[dom_node]['tokens'] = directly_dominated_tokens

        for neighbor in neighbors:
            for edge in docgraph[dom_node][neighbor]:  # multidigraph
                edge_attrs = docgraph[dom_node][neighbor][edge]
                if edge_attrs['edge_type'] == EdgeTypes.spanning_relation:
                    # a span always signifies the nucleus of a relation;
                    # there can be only one
                    rst_relations[dom_node]['nucleus'] = \
                        (neighbor, get_span(docgraph, neighbor))
                elif edge_attrs['rst:rel_type'] == 'rst':
                    # a segment/group nucleus can dominate multiple
                    # satellites (in different RST relations)
                    satellite = (neighbor, edge_attrs['rst:rel_name'],
                                 get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'satellites', []).append(satellite)
                elif edge_attrs['rst:rel_type'] == 'multinuc':
                    # NOTE: the previous version also kept a
                    # multinuc_nuc_count counter here, but never read it;
                    # that dead code has been removed.
                    nucleus = (neighbor, edge_attrs['rst:rel_name'],
                               get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'multinuc', []).append(nucleus)
                else:
                    raise NotImplementedError("unknown type of RST segment domination")
    return rst_relations