def _check_punctuation_verylow(tree, expected_labels):
    """Apply transform.punctuation_verylow to tree and verify the outcome.

    The words of the terminals at index 0 and 7 are overwritten with "("
    and ")" before the transformation runs. expected_labels is the list of
    node labels expected from a preorder walk of the transformed tree.
    """
    leaves = trees.terminals(tree)
    first_leaf, last_leaf = leaves[0], leaves[-1]
    siblings_before = first_leaf.parent.children
    last_parent_before = last_leaf.parent
    first_leaf.data['word'] = "("
    leaves[7].data['word'] = ")"
    tree = transform.punctuation_verylow(tree)
    siblings_after = first_leaf.parent.children
    last_parent_after = last_leaf.parent
    # the previous parent of the last terminal must be the returned node
    assert last_parent_before == tree
    # the children below the first terminal's parent are left alone
    assert siblings_before == siblings_after
    # the last terminal now shares its parent with the terminal before it
    assert last_parent_after == leaves[-2].parent
    actual_labels = [node.data['label'] for node in trees.preorder(tree)]
    assert expected_labels == actual_labels


def test_punctuation_verylow(discont_tree, cont_tree):
    """transform.punctuation_verylow on the discontinuous and continuous tree
    """
    _check_punctuation_verylow(
        discont_tree,
        [u'VROOT', u'S', u'VP', u'SBAR', u'VP', u'WP', u'VB', u'?',
         u'IN', u'NP', u'NNP', u'VB', u'NNP', u'VB', u'NNP'])
    _check_punctuation_verylow(
        cont_tree,
        [u'VROOT', u'S', u'WP', u'VB', u'NNP', u'VP', u'VB', u'NNP',
         u'SBAR', u'IN', u'NP', u'NNP', u'VP', u'VB', u'?'])
def test_ptb_delete_traces(cont_tree):
    """transform.ptb_delete_traces

    Relabels the second-to-last terminal as "-NONE-" and expects the
    transformed tree to have exactly one terminal fewer.
    """
    leaves_before = trees.terminals(cont_tree)
    trace_leaf = leaves_before[-2]
    trace_leaf.data['label'] = "-NONE-"
    cont_tree = transform.ptb_delete_traces(cont_tree)
    leaves_after = trees.terminals(cont_tree)
    assert len(leaves_after) == len(leaves_before) - 1
def _check_delete_first_terminal(tree):
    """Delete the first terminal of tree and verify the remaining structure.

    Checks the value returned by trees.delete_terminal, the remaining
    words, and that the node count dropped by exactly one.
    """
    node_count_before = len(list(trees.preorder(tree)))
    victim = trees.terminals(tree)[0]
    victim_parent = victim.parent
    returned = trees.delete_terminal(tree, victim)
    words_after = [leaf.data['word'] for leaf in trees.terminals(tree)]
    node_count_after = len(list(trees.preorder(tree)))
    # the call hands back the former parent of the removed terminal
    assert returned == victim_parent
    assert words_after == testdata.WORDS[1:]
    assert node_count_after == node_count_before - 1


def test_delete_terminal(discont_tree, cont_tree):
    """trees.delete_terminal on the discontinuous and the continuous tree
    """
    _check_delete_first_terminal(discont_tree)
    _check_delete_first_terminal(cont_tree)
def test_dominance(discont_tree, cont_tree):
    """trees.dominance

    Compares the labels yielded by trees.dominance for the first terminal
    of each tree against the reference lists in testdata.
    """
    def first_terminal_dominance(tree):
        # labels of the nodes yielded by trees.dominance for the first leaf
        first_leaf = trees.terminals(tree)[0]
        return [node.data['label'] for node in trees.dominance(first_leaf)]

    discont_labels = first_terminal_dominance(discont_tree)
    cont_labels = first_terminal_dominance(cont_tree)
    assert discont_labels == testdata.DISCONT_DOM_FIRST
    assert cont_labels == testdata.CONT_DOM_FIRST
def test_discont_general(discont_tree):
    """General tests concerning discontinuous trees.

    Nodes, labels and terminals are collected from the untouched tree; the
    reordering checks run on the head-marked, binarized tree.
    """
    # snapshot taken before any transformation is applied
    all_nodes = list(trees.preorder(discont_tree))
    node_labels = [node.data['label'] for node in all_nodes]
    ordered_leaves = trees.terminals(discont_tree)
    ordered_words = [leaf.data['word'] for leaf in ordered_leaves]
    unordered_leaves = trees.unordered_terminals(discont_tree)
    unordered_words = [leaf.data['word'] for leaf in unordered_leaves]

    marked = transform.negra_mark_heads(discont_tree)
    binarized = transform.binarize(marked)
    left_nums = [node.data['num']
                 for node in treeanalysis.disco_order(binarized, 'left')]
    rightd_nums = [node.data['num']
                   for node in treeanalysis.disco_order(binarized, 'rightd')]

    assert left_nums == testdata.DISCONT_LEFT_REORDER
    assert rightd_nums == testdata.DISCONT_RIGHTD_REORDER
    assert all('num' in leaf.data for leaf in ordered_leaves)
    assert all(leaf in unordered_leaves for leaf in ordered_leaves)
    assert len(ordered_leaves) == 9
    assert len(unordered_leaves) == 9
    assert len(all_nodes) == 15
    assert node_labels == testdata.DISCONT_LABELS_PREORDER
    assert ordered_words == testdata.WORDS
    assert set(unordered_words) == set(testdata.WORDS)
def _check_punctuation_root(tree, index, child_count):
    """Turn the terminal at index into a comma and run punctuation_root.

    child_count is the number of children expected below the terminal's
    parent before the transformation; afterwards that parent must have one
    child fewer and the terminal must hang below the returned node.
    """
    leaf = trees.terminals(tree)[index]
    leaf.data['word'] = ","
    parent_before = leaf.parent
    assert len(trees.children(parent_before)) == child_count
    tree = transform.punctuation_root(tree)
    assert len(trees.children(parent_before)) == child_count - 1
    assert leaf.parent == tree


def test_punctuation_root(discont_tree, cont_tree):
    """transform.punctuation_root on the discontinuous and continuous tree
    """
    _check_punctuation_root(discont_tree, 0, 2)
    _check_punctuation_root(cont_tree, 3, 3)
def test_lca(discont_tree, cont_tree):
    """trees.lca

    For both trees, the result of trees.lca for the first two terminals
    ("Who", "did") must be the first child of the root, labeled "S".
    """
    discont_leaves = trees.terminals(discont_tree)
    cont_leaves = trees.terminals(cont_tree)
    discont_top = trees.children(discont_tree)
    cont_top = trees.children(cont_tree)
    discont_lca = trees.lca(discont_leaves[0], discont_leaves[1])
    cont_lca = trees.lca(cont_leaves[0], cont_leaves[1])

    # sanity checks on the fixture content
    for leaves in (discont_leaves, cont_leaves):
        assert leaves[0].data['word'] == 'Who'
    for leaves in (discont_leaves, cont_leaves):
        assert leaves[1].data['word'] == 'did'
    for top in (discont_top, cont_top):
        assert top[0].data['label'] == 'S'

    assert discont_top[0] == discont_lca
    assert cont_top[0] == cont_lca
def _check_punctuation_symetrify(tree):
    """Insert punctuation into tree and verify transform.punctuation_symetrify.

    Three terminals (two quotes and a comma) are inserted from a temporary
    terminal file. After root attachment, symetrification and export
    numbering, both quotes must sit below the node numbered 504 and the
    comma below the node numbered 503.
    """
    # The context manager closes (and thereby deletes) the temporary file
    # deterministically instead of leaving that to garbage collection.
    with tempfile.NamedTemporaryFile(mode='w') as temp:
        # columns are presumably tree id, position, word, POS tag -- the
        # format is defined by transform.insert_terminals, verify there
        temp.write('1\t3\t"\t$(\n')
        temp.write('1\t5\t"\t$(\n')
        temp.write('1\t8\t,\t$,\n')
        # make sure the content is on disk before it is read back by name
        temp.flush()
        params = {'terminalfile': temp.name, 'quiet': True}
        old_terms = trees.terminals(tree)
        tree = transform.insert_terminals(tree, **params)
        new_terms = trees.terminals(tree)
    assert len(old_terms) == len(new_terms) - 3
    tree = transform.root_attach(tree)
    tree = transform.punctuation_symetrify(tree)
    treeoutput.compute_export_numbering(tree)
    assert new_terms[2].parent.data['num'] == 504
    assert new_terms[4].parent.data['num'] == 504
    assert new_terms[7].parent.data['num'] == 503


def test_punctuation_symetrify(discont_tree, cont_tree):
    """transform.punctuation_symetrify on the discontinuous and continuous tree
    """
    _check_punctuation_symetrify(discont_tree)
    _check_punctuation_symetrify(cont_tree)
def test_boyd(discont_tree):
    """See transform.boyd_split

    The tree is root-attached and head-marked before the split; labels and
    words of the result are compared against the testdata references.
    """
    split_tree = transform.boyd_split(
        transform.negra_mark_heads(
            transform.root_attach(discont_tree)))
    node_labels = [node.data['label'] for node in trees.preorder(split_tree)]
    ordered_words = [leaf.data['word']
                     for leaf in trees.terminals(split_tree)]
    unordered_words = [leaf.data['word']
                       for leaf in trees.unordered_terminals(split_tree)]
    assert node_labels == testdata.DISCONT_LABELSBOYD_PREORDER
    assert ordered_words == testdata.WORDS
    assert set(unordered_words) == set(testdata.WORDS)
def test_root_attach(discont_tree):
    """See transform.root_attach

    Checks labels and words after root attachment, and that calling
    transform.boyd_split on the resulting tree raises ValueError.
    """
    attached = transform.root_attach(discont_tree)
    node_labels = [node.data['label'] for node in trees.preorder(attached)]
    ordered_words = [leaf.data['word']
                     for leaf in trees.terminals(attached)]
    unordered_words = [leaf.data['word']
                       for leaf in trees.unordered_terminals(attached)]
    assert node_labels == testdata.DISCONT_LABELS_PREORDER
    assert ordered_words == testdata.WORDS
    assert set(unordered_words) == set(testdata.WORDS)
    with pytest.raises(ValueError):
        transform.boyd_split(attached)
def test_cont_general(cont_tree):
    """General tests concerning continuous trees.

    Verifies terminal and node counts, the preorder labels and the words
    of the fixture tree against the testdata references.
    """
    ordered_leaves = trees.terminals(cont_tree)
    unordered_leaves = trees.unordered_terminals(cont_tree)
    all_nodes = list(trees.preorder(cont_tree))
    node_labels = [node.data['label'] for node in all_nodes]
    ordered_words = [leaf.data['word'] for leaf in ordered_leaves]
    unordered_words = [leaf.data['word'] for leaf in unordered_leaves]

    assert all('num' in leaf.data for leaf in ordered_leaves)
    assert all(leaf in unordered_leaves for leaf in ordered_leaves)
    assert len(ordered_leaves) == 9
    assert len(unordered_leaves) == 9
    assert len(all_nodes) == 15
    assert node_labels == testdata.CONT_LABELS_PREORDER
    assert ordered_words == testdata.WORDS
    assert set(unordered_words) == set(testdata.WORDS)
def cont_tree(request):
    """Load continuous tree samples

    ``request.param`` is read as a triple: a reader callable, the sample
    text to parse, and a dict of keyword options for that reader. The text
    is written to a temporary file, the reader is pointed at it, and the
    first tree it yields is returned. The temporary file is removed again
    when the fixture is torn down.
    """
    # delete=False: the file has to survive the ``with`` block because the
    # reader opens it by name afterwards
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as temp:
        tempfile_name = temp.name
        temp.write(request.param[1])
        temp.flush()

    def fin():
        os.remove(tempfile_name)

    # Register the cleanup before parsing, so the file is also removed when
    # reading the tree fails.
    request.addfinalizer(fin)

    request.param[2]['quiet'] = True
    reader = request.param[0](tempfile_name, 'utf8', **request.param[2])
    tree = next(reader)
    # 'fix' POS tags for brackets_emptypos mode
    terms = trees.terminals(tree)
    if all(term.data['label'] == trees.DEFAULT_LABEL for term in terms):
        for term, pos in zip(terms, testdata.POS):
            term.data['label'] = pos
    return tree
def _check_insert_terminals(tree):
    """Exercise transform.insert_terminals on tree.

    First a terminal file that lists position 6 twice is expected to be
    rejected with ValueError. Then a file without the duplicate is applied
    and the resulting words and POS tags are compared against the testdata
    references with "Test1" and "Test2" spliced in.
    """
    # Context managers close (and thereby delete) the temporary files
    # deterministically instead of leaving that to garbage collection.
    with tempfile.NamedTemporaryFile(mode='w') as temp:
        # columns are presumably tree id, position, word, POS tag -- the
        # format is defined by transform.insert_terminals, verify there
        temp.write('1\t0\tTest1\tPosTest1\n')
        temp.write('1\t2\tTest1\tPosTest1\n')
        temp.write('1\t6\tTest2\tPosTest2\n')
        temp.write('1\t6\tTest2\tPosTest2\n')
        temp.write('1\t100\tTest2\tPosTest2\n')
        # make sure the content is on disk before it is read back by name
        temp.flush()
        params = {'terminalfile': temp.name, 'quiet': True}
        with pytest.raises(ValueError):
            transform.insert_terminals(tree, **params)

    with tempfile.NamedTemporaryFile(mode='w') as temp:
        temp.write('1\t0\tTest1\tPosTest1\n')
        temp.write('1\t2\tTest1\tPosTest1\n')
        temp.write('1\t6\tTest2\tPosTest2\n')
        temp.write('1\t100\tTest2\tPosTest2\n')
        temp.flush()
        params = {'terminalfile': temp.name, 'quiet': True}
        old_terms = trees.terminals(tree)
        tree = transform.insert_terminals(tree, **params)
        new_terms = trees.terminals(tree)

    # only two of the four listed terminals end up in the tree
    assert len(old_terms) == len(new_terms) - 2

    gold_words = list(testdata.WORDS)
    out_words = [term.data['word'] for term in new_terms]
    gold_words[1:1] = ['Test1']
    gold_words[5:5] = ['Test2']
    assert gold_words == out_words

    gold_pos = list(testdata.POS)
    out_pos = [term.data['label'] for term in new_terms]
    gold_pos[1:1] = ['PosTest1']
    gold_pos[5:5] = ['PosTest2']
    assert gold_pos == out_pos


def test_insert_terminal(discont_tree, cont_tree):
    """transform.insert_terminals on the discontinuous and continuous tree
    """
    _check_insert_terminals(discont_tree)
    _check_insert_terminals(cont_tree)