def test_multiedge_keyincrement():
    """test, if keys are automatically incremented when adding multiple
    edges between two nodes. This tests redundant code common to
    add_edge() and add_edges_from().
    """
    # add_edge(): first edge with an explicit key. keys are used in
    # multigraphs to distinguish between multiple edges between the
    # same pair of nodes
    graph_a = dg.DiscourseDocumentGraph()
    graph_a.add_edge('a', 'b', layers={'love'}, key=1)
    assert len(graph_a.edge['a']['b']) == 1
    # a second edge between the same nodes must get an auto-incremented
    # key, since key 1 is already taken
    graph_a.add_edge('a', 'b', layers={'hate'})

    # add_edges_from() must show the same auto-increment behaviour
    graph_b = dg.DiscourseDocumentGraph()
    graph_b.add_edges_from([('a', 'b', 1, {'layers': {'love'}})])
    assert len(graph_b.edge['a']['b']) == 1
    graph_b.add_edges_from([('a', 'b', {'layers': {'hate'}})])

    for graph in (graph_a, graph_b):
        assert len(graph.edge['a']['b']) == 2
        assert 'love' in graph.edge['a']['b'][1]['layers']
        assert 'hate' in graph.edge['a']['b'][2]['layers']
def test_select_nodes_by_layer():
    """Are nodes correctly filtered based on their layer?"""
    ddg = dg.DiscourseDocumentGraph(namespace='test')
    assert len(ddg) == 1  # only the root node so far
    add_tokens(ddg, ['The', 'dog', 'barks', '.'])
    assert len(ddg) == 5

    # without a layer argument, no node is filtered out
    unfiltered_ids = list(dg.select_nodes_by_layer(ddg))
    unfiltered_nodes = list(dg.select_nodes_by_layer(ddg, data=True))
    assert len(unfiltered_ids) == len(unfiltered_nodes) == 5

    # all nodes created so far belong to the 'test' layer
    test_ids = list(dg.select_nodes_by_layer(ddg, 'test'))
    test_nodes = list(dg.select_nodes_by_layer(ddg, 'test', data=True))
    assert len(ddg) == len(test_ids) == len(test_nodes) == 5

    ddg.add_node(10, layers={'foo'})
    ddg.add_node(11, layers={'bar'})

    # a set of layers selects nodes belonging to any of those layers
    test_foo_ids = list(dg.select_nodes_by_layer(ddg, layer={'test', 'foo'}))
    test_foo_nodes = list(dg.select_nodes_by_layer(
        ddg, layer={'test', 'foo'}, data=True))
    assert len(test_foo_ids) == len(test_foo_nodes) == 6

    test_foobar_ids = list(dg.select_nodes_by_layer(
        ddg, layer={'test', 'foo', 'bar'}))
    assert len(test_foobar_ids) == 7

    # with data=True, (node_id, attribute dict) tuples are returned
    for nodelist in (unfiltered_nodes, test_nodes, test_foo_nodes):
        for node_id, attr_dict in nodelist:
            assert isinstance(node_id, (str, int))
            assert isinstance(attr_dict, dict)
def test_is_continuous():
    """tests, if a discontinuous span of tokens is recognised as such.

    The node ``lower`` spans the two non-adjacent tokens 1 and 4.

            root
              |
         ---upper---
         |    |    |
    -----|--lower--|----
    |    |         |   |
    1    2         3   4
    """
    docgraph = dg.DiscourseDocumentGraph()
    for tok in ('1', '2', '3', '4'):
        docgraph.add_node(tok, attr_dict={'discoursegraph:token': tok})
    # dominance: root -> upper -> lower
    for src, target in [(docgraph.root, 'upper'), ('upper', 'lower')]:
        docgraph.add_edge(src, target,
                          edge_type=dg.EdgeTypes.dominance_relation)
    # 'lower' spans the non-adjacent tokens 1 and 4
    for src, target in [('lower', '1'), ('upper', '2'), ('upper', '3'),
                        ('lower', '4')]:
        docgraph.add_edge(src, target,
                          edge_type=dg.EdgeTypes.spanning_relation)
    # determine order of the tokens
    docgraph.tokens = ['1', '2', '3', '4']

    assert dg.is_continuous(docgraph, docgraph.root)
    assert dg.is_continuous(docgraph, 'upper')
    assert not dg.is_continuous(docgraph, 'lower')
    # a single token is trivially continuous
    for tok in ('1', '2', '3', '4'):
        assert dg.is_continuous(docgraph, tok)
def test_docgraph_name_namespace(self):
    """create a docgraph with a user-defined name and namespace"""
    ddg = dg.DiscourseDocumentGraph(name='mydoc', namespace='weird')
    assert isinstance(ddg, dg.DiscourseDocumentGraph)
    assert ddg.name == 'mydoc'
    # the root node id is prefixed with the user-defined namespace
    assert ddg.nodes() == ['weird:root_node']
def test_create_token_mapping():
    """check if two docgraphs cover the same text with the same tokenization"""
    # merging must fail when tokens aren't identical ('ein' vs. 'kein')
    graph1 = dg.DiscourseDocumentGraph(name='first')
    graph2 = dg.DiscourseDocumentGraph(name='second')
    add_tokens(graph1, ['Ich', 'bin', 'ein', 'Berliner', '.'])
    add_tokens(graph2, ['Ich', 'bin', 'kein', 'Berliner', '.'])

    # terse error message: only the mismatched tokens are named
    with pytest.raises(ValueError) as excinfo:
        create_token_mapping(graph1, graph2, verbose=False)
    assert 'Tokenization mismatch' in str(excinfo.value)
    assert 'kein != ein' in str(excinfo.value)

    # verbose error message: the mismatch is marked in both sentences
    with pytest.raises(ValueError) as excinfo:
        create_token_mapping(graph1, graph2, verbose=True)
    assert 'Tokenization mismatch' in str(excinfo.value)
    assert 'Ich bin [[ein]] Berliner .' in str(excinfo.value)
    assert 'Ich bin [[kein]] Berliner .' in str(excinfo.value)
def test_select_edges_by_attribute():
    """test if edges can be filtered for attributes/values"""
    # build a graph with 3 tokens, all dominated by the root node,
    # plus precedence relations between consecutive tokens
    graph = dg.DiscourseDocumentGraph(
        name='example.tok', namespace='tokenized')
    add_tokens(graph, ['He', 'sleeps', '.'])
    for token_node in graph.tokens:
        graph.add_edge(graph.root, token_node,
                       edge_type=dg.EdgeTypes.dominance_relation)
    for src, target in [(0, 1), (1, 2)]:
        graph.add_edge(src, target,
                       edge_type=dg.EdgeTypes.precedence_relation)
    assert len(graph) == 4

    # without an attribute argument, every edge matches
    all_edge_ids = list(dg.select_edges_by_attribute(graph))
    all_edges = list(dg.select_edges_by_attribute(graph, data=True))
    assert len(graph.edges()) == len(all_edge_ids) == len(all_edges) == 5

    # data=True yields (source, target, attribute dict) triples
    for src, target, attrs in all_edges:
        assert isinstance(src, (str, int))
        assert isinstance(target, (str, int))
        assert isinstance(attrs, dict)

    # attribute without a value: any edge carrying the attribute matches
    edges_with_edgetype = list(dg.select_edges_by_attribute(
        graph, attribute='edge_type'))
    assert len(edges_with_edgetype) == 5

    # single value: only the dominance relations
    dominance_edge_ids = list(dg.select_edges_by_attribute(
        graph, attribute='edge_type',
        value=dg.EdgeTypes.dominance_relation))
    assert len(dominance_edge_ids) == 3

    # list of values: dominance OR precedence relations
    dominance_or_precendence = list(dg.select_edges_by_attribute(
        graph, attribute='edge_type',
        value=[dg.EdgeTypes.dominance_relation,
               dg.EdgeTypes.precedence_relation]))
    assert len(dominance_or_precendence) == 5
def test_node2bracket(self):
    """A docgraph node can be converted into PTB-style bracket notation."""
    ddg = dg.DiscourseDocumentGraph()
    ns = ddg.ns

    # a node without any attributes yields an empty bracket pair
    ddg.add_node(5)
    assert node2bracket(ddg, node_id=5) == u'()'
    assert node2bracket(ddg, node_id=5, child_str='()') == u'(())'

    # a node with only a label
    ddg.add_node(4, attr_dict={'label': 'S'})
    assert node2bracket(ddg, node_id=4) == u'(S)'
    assert node2bracket(ddg, node_id=4, child_str='') == u'(S)'
    assert node2bracket(ddg, node_id=4,
                        child_str='(NP Ernst)') == u'(S (NP Ernst))'

    # a token without a POS attribute
    ddg.add_node(3, attr_dict={ns + ':token': 'Horst'})
    assert node2bracket(ddg, node_id=3) == u'(Horst)'
    assert node2bracket(ddg, node_id=3, child_str='()') == u'(Horst ())'

    # a token with a POS attribute
    ddg.add_node(2, attr_dict={ns + ':token': 'Horst', ns + ':pos': 'N'})
    assert node2bracket(ddg, node_id=2) == u'(N Horst)'
    assert node2bracket(
        ddg, node_id=2,
        child_str='(N Schneider)') == u'(N Horst (N Schneider))'

    # if node is a token and has a label attribute, the output contains
    # the token attrib, not the label
    ddg.add_node(1, attr_dict={
        ns + ':token': u'Björn',
        ns + ':pos': 'NE',
        'label': u'Horst'
    })
    assert node2bracket(ddg, node_id=1) == u'(NE Björn)'
    assert node2bracket(
        ddg, node_id=1,
        child_str='(N Schneider)') == u'(NE Björn (N Schneider))'

    # same, but without a POS attribute
    ddg.add_node(6, attr_dict={ns + ':token': u'Björn', 'label': u'Horst'})
    assert node2bracket(ddg, node_id=6) == u'(Björn)'
    assert node2bracket(ddg, node_id=6,
                        child_str='(Schneider)') == u'(Björn (Schneider))'
def make_sentencegraph1():
    """return a docgraph containing one sentence with syntax and coreference
    annotation, as well as precedence relations. The graph is cyclic because
    of a coreference relation (pointing relation).
    """
    docgraph = dg.DiscourseDocumentGraph()
    # tokens: 0 1 2 3 4 5 6 7
    add_tokens(docgraph,
               ['Guido', 'died', ',', 'he', 'was', 'only', '54', '.'])

    # syntax category nodes
    for cat_node in ('S', 'NP1', 'VP1', 'SBAR', 'NP2', 'VP2'):
        docgraph.add_node(cat_node, layers={docgraph.ns + ':syntax'})

    # dominance relations (root down to the clause nodes)
    for src, target in [(docgraph.root, 'S'), ('S', 'NP1'), ('S', 'VP1'),
                        ('S', 'SBAR'), ('SBAR', 'NP2'), ('SBAR', 'VP2')]:
        docgraph.add_edge(src, target, layers={docgraph.ns + ':syntax'},
                          edge_type=dg.EdgeTypes.dominance_relation)

    # spanning relations (category nodes down to the tokens)
    for src, target in [('NP1', 0), ('VP1', 1), ('NP2', 3), ('VP2', 4),
                        ('VP2', 5), ('VP2', 6)]:
        docgraph.add_edge(src, target,
                          edge_type=dg.EdgeTypes.spanning_relation)

    # coreference: he -> Guido (this pointing relation makes the graph cyclic)
    docgraph.add_edge(3, 0, layers={docgraph.ns + ':coreference'},
                      edge_type=dg.EdgeTypes.pointing_relation)

    # precedence relations between adjacent tokens
    for src in range(7):
        docgraph.add_edge(src, src + 1,
                          layers={docgraph.ns + ':precedence'},
                          edge_type=dg.EdgeTypes.pointing_relation)
    return docgraph
def test_merge_graphs(self):
    """merge a very simple graph into an empty graph"""
    # build a simple graph with 3 tokens, all dominated by the root node
    token_graph = dg.DiscourseDocumentGraph(
        name='example.tok', namespace='tokenized')
    add_tokens(token_graph, ['He', 'sleeps', '.'])
    for token_node in token_graph.tokens:
        token_graph.add_edge(token_graph.root, token_node,
                             edge_type=dg.EdgeTypes.dominance_relation)
    assert len(token_graph) == 4
    assert len(token_graph.edges()) == 3

    # the target graph is still empty (root node only)
    assert self.docgraph.name == ''
    assert self.docgraph.tokens == []
    assert len(self.docgraph) == 1

    self.docgraph.merge_graphs(token_graph)

    # name, tokens and nodes were merged in; the source graph is unchanged
    assert self.docgraph.name == 'example.tok'
    assert len(self.docgraph.tokens) == 3
    assert len(self.docgraph) == 4
    assert len(token_graph.edges()) == 3
def test_docgraph2freqt_escaped(self):
    """Convert a docgraph into a FREQT string, with/out POS tags and escaping."""
    # single-node graphs: the root label itself may contain brackets,
    # which FREQT_ESCAPE_FUNC must rewrite to -LRB-/-RRB-
    for root_label, escaped, unescaped in [
            ('TEXT', '(TEXT)', '(TEXT)'),
            ('(TEXT)', '(-LRB-TEXT-RRB-)', '((TEXT))'),
            ('TE(X)T', '(TE-LRB-X-RRB-T)', '(TE(X)T)')]:
        graph = dg.DiscourseDocumentGraph(root=root_label)
        assert escaped == node2freqt(graph, graph.root,
                                     escape_func=FREQT_ESCAPE_FUNC)
        assert unescaped == node2freqt(graph, graph.root,
                                       escape_func=lambda x: x)

    # sentence: I am (un)certain .
    docgraph = dg.DiscourseDocumentGraph(root='ROOT')
    ns = docgraph.ns

    def syntax_node(label):
        # attribute dict for a category node in the syntax layer
        return {'label': label, 'layers': {ns + ':syntax'}}

    def token_node(word, pos):
        # attribute dict for a terminal node with token and POS attributes
        return {ns + ':token': word, ns + ':pos': pos,
                'layers': {ns + ':token'}}

    docgraph.add_nodes_from([
        ('S', syntax_node('S')),
        ('NP', syntax_node('NP')),
        ('VP', syntax_node('VP')),
        ('ADJP', syntax_node('ADJP')),
        ('token1', token_node('I', 'PRP')),
        ('token2', token_node('am', 'VBP')),
        ('token3', token_node('(un)certain', 'JJ')),
        ('token4', token_node('.', '$(')),
    ])
    docgraph.add_edges_from(
        (src, target, {'edge_type': dg.EdgeTypes.dominance_relation})
        for src, target in [('ROOT', 'S'), ('S', 'NP'), ('S', 'VP'),
                            ('NP', 'token1'), ('VP', 'token2'),
                            ('VP', 'ADJP'), ('ADJP', 'token3'),
                            ('S', 'token4')])
    docgraph.tokens = ['token%d' % tok_id for tok_id in range(1, 5)]

    # expected FREQT output, keyed by (include_pos, escape brackets)
    expected = {
        (False, False):
            u"(ROOT(S(NP(I))(VP(am)(ADJP((un)certain)))(.)))",
        (False, True):
            u"(ROOT(S(NP(I))(VP(am)(ADJP(-LRB-un-RRB-certain)))(.)))",
        (True, False):
            u"(ROOT(S(NP(PRP(I)))(VP(VBP(am))(ADJP(JJ((un)certain))))($((.))))",
        (True, True):
            u"(ROOT(S(NP(PRP(I)))(VP(VBP(am))(ADJP(JJ(-LRB-un-RRB-certain))))($-LRB-(.))))",
    }
    for (include_pos, escape), freqt_str in expected.items():
        escape_func = FREQT_ESCAPE_FUNC if escape else (lambda x: x)
        assert freqt_str == docgraph2freqt(
            docgraph, docgraph.root, include_pos=include_pos,
            escape_func=escape_func)
def setup_class(cls):
    """generate a simple docgraph for testing the FREQT export"""
    # sentence: I saw a girl with a telescope . (PP-attachment example)
    cls.docgraph = dg.DiscourseDocumentGraph(root='TEXT')
    ns = cls.docgraph.ns

    def syntax_node(label):
        # attribute dict for a category node in the syntax layer
        return {'label': label, 'layers': {ns + ':syntax'}}

    def token_node(word, pos):
        # attribute dict for a terminal node with token and POS attributes
        return {ns + ':token': word, ns + ':pos': pos,
                'layers': {ns + ':token'}}

    cls.docgraph.add_nodes_from([
        ('S', syntax_node('S')),
        ('NP1', syntax_node('NP')),
        ('VP', syntax_node('VP')),
        ('NP2', syntax_node('NP')),
        ('PP', syntax_node('PP')),
        ('NP3', syntax_node('NP')),
        ('token1', token_node('I', 'PRON')),
        ('token2', token_node('saw', 'VVFIN')),
        ('token3', token_node('a', 'DET')),
        ('token4', token_node('girl', 'N')),
        ('token5', token_node('with', 'PREP')),
        ('token6', token_node('a', 'DET')),
        ('token7', token_node('telescope', 'N')),
        ('token8', token_node('.', 'PUNCT')),
    ])
    # every edge in this tree is a dominance relation
    cls.docgraph.add_edges_from(
        (src, target, {'edge_type': dg.EdgeTypes.dominance_relation})
        for src, target in [('TEXT', 'S'), ('S', 'NP1'), ('S', 'VP'),
                            ('S', 'token8'), ('NP1', 'token1'),
                            ('VP', 'token2'), ('VP', 'NP2'), ('VP', 'PP'),
                            ('NP2', 'token3'), ('NP2', 'token4'),
                            ('PP', 'token5'), ('PP', 'NP3'),
                            ('NP3', 'token6'), ('NP3', 'token7')])
    cls.docgraph.tokens = ['token%d' % tok_id for tok_id in range(1, 9)]
def setup(self):
    """create an empty DiscourseDocumentGraph"""
    self.docgraph = dg.DiscourseDocumentGraph()
    assert isinstance(self.docgraph, dg.DiscourseDocumentGraph)
    # a fresh graph has an empty name and contains only the root node,
    # whose id is prefixed with the default namespace
    assert self.docgraph.name == ''
    assert self.docgraph.nodes() == ['discoursegraph:root_node']