def setUp(self):
    """Build the Dutch and English fixtures shared by the tests.

    For each language this stores an extractor, the etree-parsed document,
    and the results of ``parse_alignment_trees`` and
    ``parse_translation_trees`` for the same ``ep-00-12-15.xml`` file.
    """
    here = os.path.dirname(__file__)

    # Dutch document, with English as the translation language.
    nl_path = os.path.join(here, 'data/europarl/nl/ep-00-12-15.xml')
    self.nl_extractor = EuroparlExtractor('nl', ['en'])
    self.nl_tree = etree.parse(nl_path)
    self.nl_alignmenttrees = self.nl_extractor.parse_alignment_trees(nl_path)
    self.nl_translationtrees = self.nl_extractor.parse_translation_trees(nl_path)

    # English document, with Dutch as the translation language.
    en_path = os.path.join(here, 'data/europarl/en/ep-00-12-15.xml')
    self.en_extractor = EuroparlExtractor('en', ['nl'])
    self.en_tree = etree.parse(en_path)
    self.en_alignmenttrees = self.en_extractor.parse_alignment_trees(en_path)
    self.en_translationtrees = self.en_extractor.parse_translation_trees(en_path)
def test_is_lexically_bound(self):
    # Exercises is_lexically_bound with mock auxiliary/participle tokens
    # (plain dicts keyed by the configured lemma attribute) for Dutch,
    # French and English.
    extractor = EuroparlExtractor('en', ['de', 'es', 'fr', 'nl'])
    lemma_attr = extractor.config.get('all', 'lemma_attr')

    # (language, auxiliary lemma, participle lemma, expected truthiness)
    cases = [
        ('nl', 'zijn', 'botsen', True),
        ('nl', 'zijn', 'hebben', False),
        ('fr', u'être', 'regarder', False),
        ('fr', u'être', 'revenir', True),
        ('en', 'have', 'collided', True),
    ]

    for language, aux_lemma, perfect_lemma, expected in cases:
        mock_aux_verb = {lemma_attr: aux_lemma}
        mock_perfect = {lemma_attr: perfect_lemma}
        outcome = extractor.is_lexically_bound(language, mock_aux_verb, mock_perfect)
        # Keep truthiness checks (not equality) so the assertions stay as
        # permissive as assertTrue/assertFalse.
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)
class TestEuroparlExtractor(unittest.TestCase):
    """Tests for EuroparlExtractor using the ep-00-12-15 nl/en fixture files."""

    def setUp(self):
        """Build the Dutch and English extractors and parsed trees used by the tests."""
        here = os.path.dirname(__file__)

        # Dutch document, with English as the translation language.
        nl_path = os.path.join(here, 'data/europarl/nl/ep-00-12-15.xml')
        self.nl_extractor = EuroparlExtractor('nl', ['en'])
        self.nl_tree = etree.parse(nl_path)
        self.nl_alignmenttrees = self.nl_extractor.parse_alignment_trees(nl_path)
        self.nl_translationtrees = self.nl_extractor.parse_translation_trees(nl_path)

        # English document, with Dutch as the translation language.
        en_path = os.path.join(here, 'data/europarl/en/ep-00-12-15.xml')
        self.en_extractor = EuroparlExtractor('en', ['nl'])
        self.en_tree = etree.parse(en_path)
        self.en_alignmenttrees = self.en_extractor.parse_alignment_trees(en_path)
        self.en_translationtrees = self.en_extractor.parse_translation_trees(en_path)

    def _fetch_line(self, extractor, tree, language, number):
        # Helper: fetch a line, check the returned XML carries the requested
        # sentence id, and hand back the third element of the result for
        # further assertions.
        xml_sentence, _, pp = extractor.get_line_by_number(tree, language, number)
        self.assertEqual(etree.fromstring(xml_sentence).get('id'), number)
        return pp

    def test_init(self):
        # The Dutch extractor exposes its configuration and auxiliary list.
        config = self.nl_extractor.config
        self.assertEqual(config.get('nl', 'perfect_tag'), 'verbpapa')
        self.assertIn('dunken', self.nl_extractor.aux_be_list['nl'])

    def test_get_translated_lines(self):
        # Each case: (line number, expected source lines,
        #             expected target lines, expected alignment label)
        nl_to_en = [
            ('17', ['17'], ['11'], '1 => 1'),
            ('18', ['18', '19'], ['12'], '2 => 1'),
            ('57', ['57'], ['46', '47'], '1 => 2'),
            ('9', ['9'], [], ''),  # no target lines and an empty label
        ]
        for number, want_from, want_to, want_align in nl_to_en:
            from_lines, to_lines, align = self.nl_extractor.get_translated_lines(
                self.nl_alignmenttrees, 'nl', 'en', number)
            self.assertEqual(from_lines, want_from)
            self.assertEqual(to_lines, want_to)
            self.assertEqual(align, want_align)

        en_to_nl = [
            ('19', ['19'], ['27'], '1 => 1'),
            ('8', ['8'], ['13', '14'], '1 => 2'),
            ('234', ['234', '235'], ['290'], '2 => 1'),
        ]
        for number, want_from, want_to, want_align in en_to_nl:
            from_lines, to_lines, align = self.en_extractor.get_translated_lines(
                self.en_alignmenttrees, 'en', 'nl', number)
            self.assertEqual(from_lines, want_from)
            self.assertEqual(to_lines, want_to)
            self.assertEqual(align, want_align)

    def test_get_line_by_number(self):
        # Dutch line 4: also checks the sentence id and the verb ids.
        pp = self._fetch_line(self.nl_extractor, self.nl_tree, 'nl', '4')
        self.assertEqual(pp.get_sentence_id(), '4')
        self.assertEqual(pp.verbs(), ['is', 'aangebroken'])
        self.assertEqual(pp.verb_ids(), 'w4.9 w4.19')
        self.assertEqual(pp.words_between(), 9)

        # Dutch line 15.
        pp = self._fetch_line(self.nl_extractor, self.nl_tree, 'nl', '15')
        self.assertEqual(pp.verbs(), ['heeft', 'bemoeid'])
        self.assertEqual(pp.words_between(), 0)

        # English line 6, looked up through the Dutch extractor's
        # translation tree for 'en'.
        pp = self._fetch_line(self.nl_extractor, self.nl_translationtrees['en'], 'en', '6')
        self.assertEqual(pp.verbs(), ['has', 'said'])
        self.assertEqual(pp.words_between(), 1)

        # English line 89, looked up through the English extractor.
        pp = self._fetch_line(self.en_extractor, self.en_tree, 'en', '89')
        self.assertEqual(pp.verbs(), ['has', 'been', 'mentioned'])
        self.assertEqual(pp.words_between(), 1)

    def test_list_filenames(self):
        # Listing the Dutch fixture directory yields exactly the one XML file.
        nl_dir = os.path.join(os.path.dirname(__file__), 'data/europarl/nl')
        found = self.nl_extractor.list_filenames(nl_dir)
        basenames = [os.path.basename(path) for path in found]
        self.assertEqual(basenames, ['ep-00-12-15.xml'])