Esempio n. 1
0
def test_multiedge_keyincrement():
    """Edge keys are incremented automatically for parallel edges.

    Keys distinguish multiple edges between the same pair of nodes. The
    same auto-increment behaviour is expected from both ``add_edge()`` and
    ``add_edges_from()``, which share the relevant code.
    """
    src, target = 'a', 'b'

    # --- add_edge() ---
    single_add_graph = dg.DiscourseDocumentGraph()
    # the first edge gets an explicit key
    single_add_graph.add_edge(src, target, layers={'love'}, key=1)
    assert len(single_add_graph.edge[src][target]) == 1
    # the second edge has no key; it must not collide with the key in use
    single_add_graph.add_edge(src, target, layers={'hate'})
    parallel_edges = single_add_graph.edge[src][target]
    assert len(parallel_edges) == 2
    assert 'love' in parallel_edges[1]['layers']
    assert 'hate' in parallel_edges[2]['layers']

    # --- add_edges_from() must behave the same way ---
    bulk_add_graph = dg.DiscourseDocumentGraph()
    keyed_edge = (src, target, 1, {'layers': {'love'}})
    bulk_add_graph.add_edges_from([keyed_edge])
    assert len(bulk_add_graph.edge[src][target]) == 1
    unkeyed_edge = (src, target, {'layers': {'hate'}})
    bulk_add_graph.add_edges_from([unkeyed_edge])
    parallel_edges = bulk_add_graph.edge[src][target]
    assert len(parallel_edges) == 2
    assert 'love' in parallel_edges[1]['layers']
    assert 'hate' in parallel_edges[2]['layers']
Esempio n. 2
0
def test_select_nodes_by_layer():
    """Nodes can be selected by the layer(s) they belong to."""
    docgraph = dg.DiscourseDocumentGraph(namespace='test')
    # a fresh graph holds exactly one node; four tokens make it five
    assert len(docgraph) == 1
    add_tokens(docgraph, ['The', 'dog', 'barks', '.'])
    assert len(docgraph) == 5

    # without a layer argument, nothing is filtered out
    unfiltered_ids = list(dg.select_nodes_by_layer(docgraph))
    unfiltered_nodes = list(dg.select_nodes_by_layer(docgraph, data=True))
    assert len(unfiltered_ids) == 5
    assert len(unfiltered_nodes) == 5

    # every node created so far is expected in the 'test' layer
    test_layer_ids = list(dg.select_nodes_by_layer(docgraph, 'test'))
    test_layer_nodes = list(
        dg.select_nodes_by_layer(docgraph, 'test', data=True))
    assert len(docgraph) == 5
    assert len(test_layer_ids) == 5
    assert len(test_layer_nodes) == 5

    # add one node each to two further layers
    docgraph.add_node(10, layers={'foo'})
    docgraph.add_node(11, layers={'bar'})

    # selecting by a set of layers
    two_layer_ids = list(
        dg.select_nodes_by_layer(docgraph, layer={'test', 'foo'}))
    two_layer_nodes = list(
        dg.select_nodes_by_layer(docgraph, layer={'test', 'foo'}, data=True))
    assert len(two_layer_ids) == 6
    assert len(two_layer_nodes) == 6

    three_layer_ids = list(
        dg.select_nodes_by_layer(docgraph, layer={'test', 'foo', 'bar'}))
    assert len(three_layer_ids) == 7

    # with data=True, each result is a (node ID, attribute dict) pair
    nodes_with_data = unfiltered_nodes + test_layer_nodes + two_layer_nodes
    for node_id, attrs in nodes_with_data:
        assert isinstance(node_id, (str, int))
        assert isinstance(attrs, dict)
Esempio n. 3
0
def test_is_continuous():
    """A discontinuous span of tokens must be recognised as such.

    The node ``lower`` spans the two non-adjacent tokens 1 and 4, while
    ``upper`` spans tokens 2 and 3 and dominates ``lower``.

            root
              |
         ---upper---
         |    |    |
    -----|--lower--|----
    |    |         |   |
    1    2         3   4
    """
    token_ids = ['1', '2', '3', '4']
    docgraph = dg.DiscourseDocumentGraph()
    for token_id in token_ids:
        docgraph.add_node(
            token_id, attr_dict={'discoursegraph:token': token_id})

    dominance_edges = [(docgraph.root, 'upper'), ('upper', 'lower')]
    for parent, child in dominance_edges:
        docgraph.add_edge(
            parent, child, edge_type=dg.EdgeTypes.dominance_relation)

    spanning_edges = [
        ('lower', '1'), ('upper', '2'), ('upper', '3'), ('lower', '4')]
    for span_node, token_id in spanning_edges:
        docgraph.add_edge(
            span_node, token_id, edge_type=dg.EdgeTypes.spanning_relation)

    # the token list defines the linear order of the tokens
    docgraph.tokens = list(token_ids)

    for continuous_node in (docgraph.root, 'upper'):
        assert dg.is_continuous(docgraph, continuous_node)
    assert not dg.is_continuous(docgraph, 'lower')
    # a single token is continuous
    for token_id in token_ids:
        assert dg.is_continuous(docgraph, token_id)
Esempio n. 4
0
 def test_docgraph_name_namespace(self):
     """A docgraph can be created with a custom name and namespace."""
     doc_name, namespace = 'mydoc', 'weird'
     docgraph = dg.DiscourseDocumentGraph(name=doc_name, namespace=namespace)
     assert isinstance(docgraph, dg.DiscourseDocumentGraph)
     assert docgraph.name == doc_name
     # the only node of the new graph is the root node, whose ID is
     # prefixed with the namespace
     expected_nodes = [namespace + ':root_node']
     assert docgraph.nodes() == expected_nodes
Esempio n. 5
0
def test_create_token_mapping():
    """Token mapping fails if two docgraphs are tokenized differently.

    The two graphs differ in their third token ('ein' vs. 'kein'), so
    ``create_token_mapping`` must raise a ``ValueError`` whose message
    depends on the ``verbose`` flag.
    """
    graph_a = dg.DiscourseDocumentGraph(name='first')
    add_tokens(graph_a, ['Ich', 'bin', 'ein', 'Berliner', '.'])

    graph_b = dg.DiscourseDocumentGraph(name='second')
    add_tokens(graph_b, ['Ich', 'bin', 'kein', 'Berliner', '.'])

    # terse message: names only the two mismatching tokens
    with pytest.raises(ValueError) as terse_error:
        create_token_mapping(graph_a, graph_b, verbose=False)
    terse_message = str(terse_error.value)
    assert 'Tokenization mismatch' in terse_message
    assert 'kein != ein' in terse_message

    # verbose message: shows both token sequences with the mismatch marked
    with pytest.raises(ValueError) as verbose_error:
        create_token_mapping(graph_a, graph_b, verbose=True)
    verbose_message = str(verbose_error.value)
    assert 'Tokenization mismatch' in verbose_message
    assert 'Ich bin [[ein]] Berliner .' in verbose_message
    assert 'Ich bin [[kein]] Berliner .' in verbose_message
Esempio n. 6
0
def test_select_edges_by_attribute():
    """Edges can be filtered by attribute name and attribute value(s)."""
    dominance = dg.EdgeTypes.dominance_relation
    precedence = dg.EdgeTypes.precedence_relation

    # build a small graph: three tokens, each dominated by the root node,
    # plus a precedence edge between each pair of neighbouring tokens
    docgraph = dg.DiscourseDocumentGraph(
        name='example.tok', namespace='tokenized')
    add_tokens(docgraph, ['He', 'sleeps', '.'])
    for token_id in docgraph.tokens:
        docgraph.add_edge(docgraph.root, token_id, edge_type=dominance)
    for left in (0, 1):
        docgraph.add_edge(left, left + 1, edge_type=precedence)

    # adding the edges must not have created any additional nodes
    assert len(docgraph) == 4

    # no attribute given: all edges are returned
    unfiltered_ids = list(dg.select_edges_by_attribute(docgraph))
    unfiltered_edges = list(
        dg.select_edges_by_attribute(docgraph, data=True))
    assert len(docgraph.edges()) == 5
    assert len(unfiltered_ids) == 5
    assert len(unfiltered_edges) == 5

    # with data=True, each result is a (source, target, attributes) triple
    for source_id, target_id, edge_attrs in unfiltered_edges:
        assert isinstance(source_id, (str, int))
        assert isinstance(target_id, (str, int))
        assert isinstance(edge_attrs, dict)

    # attribute only: every edge that has an edge_type at all
    typed_edges = list(
        dg.select_edges_by_attribute(docgraph, attribute='edge_type'))
    assert len(typed_edges) == 5

    # attribute and a single value: dominance edges only
    dominance_edges = list(dg.select_edges_by_attribute(
        docgraph, attribute='edge_type', value=dominance))
    assert len(dominance_edges) == 3

    # attribute and a list of values: dominance or precedence edges
    dominance_or_precedence = list(dg.select_edges_by_attribute(
        docgraph, attribute='edge_type', value=[dominance, precedence]))
    assert len(dominance_or_precedence) == 5
Esempio n. 7
0
    def test_node2bracket(self):
        """A docgraph node is rendered in PTB-style bracket notation.

        Covers nodes without attributes, nodes with a label, token nodes
        with and without a POS tag, and token nodes that also carry a label.
        """
        docgraph = dg.DiscourseDocumentGraph()
        token_key = docgraph.ns + ':token'
        pos_key = docgraph.ns + ':pos'

        def bracket(node_id, **kwargs):
            """Render ``node_id`` of the test graph as a bracket string."""
            return node2bracket(docgraph, node_id=node_id, **kwargs)

        # node without any attributes
        docgraph.add_node(5)
        assert bracket(5) == u'()'
        assert bracket(5, child_str='()') == u'(())'

        # node with a label only
        docgraph.add_node(4, attr_dict={'label': 'S'})
        assert bracket(4) == u'(S)'
        assert bracket(4, child_str='') == u'(S)'
        assert bracket(4, child_str='(NP Ernst)') == u'(S (NP Ernst))'

        # token node without a POS tag
        docgraph.add_node(3, attr_dict={token_key: 'Horst'})
        assert bracket(3) == u'(Horst)'
        assert bracket(3, child_str='()') == u'(Horst ())'

        # token node with a POS tag
        docgraph.add_node(2, attr_dict={token_key: 'Horst', pos_key: 'N'})
        assert bracket(2) == u'(N Horst)'
        assert bracket(
            2, child_str='(N Schneider)') == u'(N Horst (N Schneider))'

        # a token node that also has a label: the output contains the
        # token string, not the label
        tagged_token_attrs = {
            token_key: u'Björn',
            pos_key: 'NE',
            'label': u'Horst',
        }
        docgraph.add_node(1, attr_dict=tagged_token_attrs)
        assert bracket(1) == u'(NE Björn)'
        assert bracket(
            1, child_str='(N Schneider)') == u'(NE Björn (N Schneider))'

        # same as above, but without a POS tag
        docgraph.add_node(
            6, attr_dict={token_key: u'Björn', 'label': u'Horst'})
        assert bracket(6) == u'(Björn)'
        assert bracket(6, child_str='(Schneider)') == u'(Björn (Schneider))'
Esempio n. 8
0
def make_sentencegraph1():
    """Build a one-sentence docgraph with syntax, coreference and precedence.

    The sentence is "Guido died , he was only 54 ." The graph is cyclic
    because of the coreference relation (a pointing relation) from
    'he' back to 'Guido'.

    Returns the populated ``DiscourseDocumentGraph``.
    """
    docgraph = dg.DiscourseDocumentGraph()
    # token positions:
    #   0        1       2    3     4      5       6     7
    sentence = ['Guido', 'died', ',', 'he', 'was', 'only', '54', '.']
    add_tokens(docgraph, sentence)

    syntax_layer = docgraph.ns + ':syntax'
    coreference_layer = docgraph.ns + ':coreference'
    precedence_layer = docgraph.ns + ':precedence'

    # syntax structure: category nodes first ...
    for category_node in ('S', 'NP1', 'VP1', 'SBAR', 'NP2', 'VP2'):
        docgraph.add_node(category_node, layers={syntax_layer})

    # ... then the dominance relations between them ...
    dominance_pairs = [
        (docgraph.root, 'S'),
        ('S', 'NP1'),
        ('S', 'VP1'),
        ('S', 'SBAR'),
        ('SBAR', 'NP2'),
        ('SBAR', 'VP2'),
    ]
    for parent, child in dominance_pairs:
        docgraph.add_edge(parent, child, layers={syntax_layer},
                          edge_type=dg.EdgeTypes.dominance_relation)

    # ... and finally the spanning relations from categories to tokens
    spanning_pairs = [
        ('NP1', 0),
        ('VP1', 1),
        ('NP2', 3),
        ('VP2', 4),
        ('VP2', 5),
        ('VP2', 6),
    ]
    for category_node, token_id in spanning_pairs:
        docgraph.add_edge(category_node, token_id,
                          edge_type=dg.EdgeTypes.spanning_relation)

    # coreference: he -> Guido
    docgraph.add_edge(3, 0, layers={coreference_layer},
                      edge_type=dg.EdgeTypes.pointing_relation)

    # precedence: each token points to its successor
    # NOTE(review): these edges are typed as pointing relations, although
    # dg.EdgeTypes.precedence_relation also exists -- confirm this is
    # intended.
    for token_id in range(7):
        docgraph.add_edge(token_id, token_id + 1, layers={precedence_layer},
                          edge_type=dg.EdgeTypes.pointing_relation)
    return docgraph
Esempio n. 9
0
    def test_merge_graphs(self):
        """Merge a very simple token graph into an empty docgraph."""
        # source graph: three tokens, each dominated by the root node
        source_graph = dg.DiscourseDocumentGraph(
            name='example.tok', namespace='tokenized')
        add_tokens(source_graph, ['He', 'sleeps', '.'])
        for token_id in source_graph.tokens:
            source_graph.add_edge(source_graph.root, token_id,
                                  edge_type=dg.EdgeTypes.dominance_relation)
        assert len(source_graph) == 4
        assert len(source_graph.edges()) == 3

        # before the merge, the target graph has no name, no tokens and
        # only a single node
        target_graph = self.docgraph
        assert target_graph.name == ''
        assert target_graph.tokens == []
        assert len(target_graph) == 1

        target_graph.merge_graphs(source_graph)

        # afterwards it carries the name, tokens and node count of the
        # source graph
        assert target_graph.name == 'example.tok'
        assert len(target_graph.tokens) == 3
        assert len(target_graph) == 4
        # NOTE(review): this re-checks the *source* graph; the edge count of
        # the merged graph is not verified here -- confirm whether
        # self.docgraph.edges() was meant.
        assert len(source_graph.edges()) == 3
Esempio n. 10
0
    def test_docgraph2freqt_escaped(self):
        """FREQT export honours the POS-tag and bracket-escaping options.

        First, root nodes whose IDs contain brackets are converted with
        ``node2freqt``, with and without escaping. Afterwards, the sentence
        "I am (un)certain ." is exported with ``docgraph2freqt`` for every
        combination of ``include_pos`` and escaping.
        """
        def no_escape(x):
            """Return the string unchanged, i.e. don't escape brackets."""
            return x

        # (root node ID, expected escaped string, expected unescaped string)
        root_cases = [
            ('TEXT', '(TEXT)', '(TEXT)'),
            ('(TEXT)', '(-LRB-TEXT-RRB-)', '((TEXT))'),
            ('TE(X)T', '(TE-LRB-X-RRB-T)', '(TE(X)T)'),
        ]
        for root_id, escaped, unescaped in root_cases:
            root_graph = dg.DiscourseDocumentGraph(root=root_id)
            assert escaped == node2freqt(
                root_graph, root_graph.root, escape_func=FREQT_ESCAPE_FUNC)
            assert unescaped == node2freqt(
                root_graph, root_graph.root, escape_func=no_escape)

        # sentence: I am (un)certain .
        docgraph = dg.DiscourseDocumentGraph(root='ROOT')
        ns = docgraph.ns
        dominance = dg.EdgeTypes.dominance_relation

        # syntax nodes use their category as both node ID and label
        syntax_nodes = [
            (category, {'label': category, 'layers': {ns + ':syntax'}})
            for category in ('S', 'NP', 'VP', 'ADJP')
        ]

        # token nodes are named token1 ... token4
        tagged_words = [
            ('I', 'PRP'),
            ('am', 'VBP'),
            ('(un)certain', 'JJ'),
            ('.', '$('),
        ]
        token_nodes = [
            ('token' + str(position), {
                ns + ':token': word,
                ns + ':pos': pos_tag,
                'layers': {ns + ':token'}
            })
            for position, (word, pos_tag) in enumerate(tagged_words, 1)
        ]

        # (parent, child) pairs; all of them are dominance relations
        dominance_pairs = [
            ('ROOT', 'S'),
            ('S', 'NP'),
            ('S', 'VP'),
            ('NP', 'token1'),
            ('VP', 'token2'),
            ('VP', 'ADJP'),
            ('ADJP', 'token3'),
            ('S', 'token4'),
        ]
        dominance_edges = [
            (parent, child, {'edge_type': dominance})
            for parent, child in dominance_pairs
        ]

        docgraph.add_nodes_from(syntax_nodes + token_nodes)
        docgraph.add_edges_from(dominance_edges)
        docgraph.tokens = [token_id for token_id, _attrs in token_nodes]

        # (include_pos, escape function, expected FREQT string)
        export_cases = [
            # without POS; don't escape brackets
            (False, no_escape,
             u"(ROOT(S(NP(I))(VP(am)(ADJP((un)certain)))(.)))"),
            # without POS; escape brackets
            (False, FREQT_ESCAPE_FUNC,
             u"(ROOT(S(NP(I))(VP(am)(ADJP(-LRB-un-RRB-certain)))(.)))"),
            # with POS; don't escape brackets
            (True, no_escape,
             u"(ROOT(S(NP(PRP(I)))(VP(VBP(am))(ADJP(JJ((un)certain))))($((.))))"),
            # with POS; escape brackets
            (True, FREQT_ESCAPE_FUNC,
             u"(ROOT(S(NP(PRP(I)))(VP(VBP(am))(ADJP(JJ(-LRB-un-RRB-certain))))($-LRB-(.))))"),
        ]
        for include_pos, escape_func, expected in export_cases:
            assert expected == docgraph2freqt(
                docgraph,
                docgraph.root,
                include_pos=include_pos,
                escape_func=escape_func)
Esempio n. 11
0
    def setup_class(cls):
        """Build a small docgraph shared by the FREQT export tests.

        The graph is rooted in 'TEXT' and holds a syntax tree over the
        sentence "I saw a girl with a telescope ."; it is stored as
        ``cls.docgraph``.
        """
        docgraph = cls.docgraph = dg.DiscourseDocumentGraph(root='TEXT')
        ns = docgraph.ns
        dominance = dg.EdgeTypes.dominance_relation

        # (node ID, category label) of the syntax nodes
        categories = [
            ('S', 'S'),
            ('NP1', 'NP'),
            ('VP', 'VP'),
            ('NP2', 'NP'),
            ('PP', 'PP'),
            ('NP3', 'NP'),
        ]
        syntax_nodes = [
            (node_id, {'label': label, 'layers': {ns + ':syntax'}})
            for node_id, label in categories
        ]

        # (token string, POS tag); token nodes are named token1 ... token8
        tagged_words = [
            ('I', 'PRON'),
            ('saw', 'VVFIN'),
            ('a', 'DET'),
            ('girl', 'N'),
            ('with', 'PREP'),
            ('a', 'DET'),
            ('telescope', 'N'),
            ('.', 'PUNCT'),
        ]
        token_nodes = [
            ('token' + str(position), {
                ns + ':token': word,
                ns + ':pos': pos_tag,
                'layers': {ns + ':token'}
            })
            for position, (word, pos_tag) in enumerate(tagged_words, 1)
        ]

        # (parent, child) pairs; all of them are dominance relations
        dominance_pairs = [
            ('TEXT', 'S'),
            ('S', 'NP1'),
            ('S', 'VP'),
            ('S', 'token8'),
            ('NP1', 'token1'),
            ('VP', 'token2'),
            ('VP', 'NP2'),
            ('VP', 'PP'),
            ('NP2', 'token3'),
            ('NP2', 'token4'),
            ('PP', 'token5'),
            ('PP', 'NP3'),
            ('NP3', 'token6'),
            ('NP3', 'token7'),
        ]
        dominance_edges = [
            (parent, child, {'edge_type': dominance})
            for parent, child in dominance_pairs
        ]

        docgraph.add_nodes_from(syntax_nodes + token_nodes)
        docgraph.add_edges_from(dominance_edges)
        docgraph.tokens = [token_id for token_id, _attrs in token_nodes]
Esempio n. 12
0
 def setup(self):
     """Create an empty DiscourseDocumentGraph and store it on the instance."""
     empty_graph = dg.DiscourseDocumentGraph()
     self.docgraph = empty_graph
     assert isinstance(empty_graph, dg.DiscourseDocumentGraph)
     # without arguments, the graph has an empty name ...
     assert empty_graph.name == ''
     # ... and holds nothing but the root node of the default namespace
     assert empty_graph.nodes() == ['discoursegraph:root_node']