def atest_canonical(self):
    """Test WA (word-alignment) tag transfer for canonical examples.

    NOTE(review): the ``a`` prefix keeps this test out of unittest
    discovery, i.e. it is disabled — presumably because the original
    ``align_tags`` call here passed one fewer positional argument than
    the call in ``test_neste_examples``. The call below has been brought
    in line with that 8-argument form; rename to ``test_canonical`` to
    re-enable once verified against the current ``align_tags`` signature.

    Each example entry is::

        source_raw - The tagged source string
        src_tokens - List of source words
        tok_src    - char position of src_tokens in the untagged source
        tok_trg    - char position of the tokenized target
        src_tags   - Dict of markup tags, keyed by char offset
        trg_tokens - List of target words
        wrd_align  - Alignment between source and target token indices
        trg_tags   - Expected target tag dict (the truth)
    """
    examples = [
        # One tag around everything: <i>a b</i> --> <i>1 2</i>
        ["<i>a b</i>",
         ["a", "b"],
         [[0, 0], [2, 2]],
         [[0, 0], [2, 2]],
         {0: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          3: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]},
         ["1", "2"],
         [(0, 0), (1, 1)],
         {0: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          3: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]}],
        # One tag around the first word, same order:
        # <i>a</i> b --> <i>1</i> 2
        ["<i>a</i> b",
         ["a", "b"],
         [[0, 0], [2, 2]],
         [[0, 0], [2, 2]],
         {0: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          1: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]},
         ["1", "2"],
         [(0, 0), (1, 1)],
         {0: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          1: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]}],
        # One tag around the first word, order changes:
        # <i>a</i> b --> 1 <i>2</i>
        ["<i>a</i> b",
         ["a", "b"],
         [[0, 0], [2, 2]],
         [[0, 0], [2, 2]],
         {0: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          1: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]},
         ["1", "2"],
         [(0, 1), (1, 0)],
         {2: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          3: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]}],
        # Tag follows its word through a longer reordering:
        # <i>a</i> c d b --> 1 3 <i>2</i> 4
        ["<i>a</i> c d b",
         ["a", "c", "d", "b"],
         [[0, 0], [2, 2], [4, 4], [6, 6]],
         [[0, 0], [2, 2], [4, 4], [6, 6]],
         {0: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          1: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]},
         ["1", "3", "4", "2"],
         [(0, 2), (1, 3), (2, 0), (3, 1)],
         {4: [{'tag_type': 'open', 'close_tid': 1, 'text': u'<i>', 'tid': 0}],
          5: [{'tag_type': 'close', 'open_tid': 0, 'text': u'</i>', 'tid': 1}]}],
    ]
    for i, (text, s_token, tok_src, tok_trg, s_tags, t_token, wrd_align,
            truth) in enumerate(examples):
        with self.subTest(i=i):
            # tok_src offsets index the tag-stripped source, which for
            # every example above is exactly the tokens joined on spaces
            # (e.g. [[0, 0], [2, 2]] addresses "a b").
            plain_text = " ".join(s_token)
            t_tags = align_tags(text, plain_text, s_token, tok_src,
                                tok_trg, s_tags, t_token,
                                wrd_align=wrd_align)
            self.assertEqual(
                t_tags, truth,
                "\n%s\nTruth:\n%s\nGot:\n%s" % (text, truth, t_tags))
def test_neste_examples(self):
    """Round-trip tag transfer on nested-tag examples.

    Assumes the text always tokenizes on single spaces and that the
    target side uses the very same characters as the source (an
    identity "translation" whose words may merely be reordered), so the
    source token spans can be reused for the target.
    """
    # Each case: (tagged source, word alignment, expected tagged target).
    examples = [
        ("<a><i>a b</i></a>",
         [[0, 0], [1, 1]],
         "<a><i>a b</i></a>"),
        ("<a><i>a b</i></a>",
         [[0, 1], [1, 0]],
         "<a><i>b a</i></a>"),
        ("<a>c<i>a b</i></a>",
         [[0, 0], [1, 2], [2, 1]],
         "<a>c<i>b a</i></a>"),
        ("<a>c<i>a b</i> d</a>",
         [[0, 0], [1, 2], [2, 1], [3, 3]],
         "<a>c<i>b a</i> d</a>"),
        ("<a>c<i>a b</i> e d</a>",
         [[0, 0], [1, 2], [2, 1], [3, 4], [4, 3]],
         "<a>c<i>b a</i> d e</a>"),
        ("<a>a<i> b<b> c</b> d</i> e</a>",
         [[0, 0], [1, 1], [2, 2], [3, 3], [4, 4]],
         "<a>a<i> b<b> c</b> d</i> e</a>"),
        ("<a>a<i> b</i></a>",
         [[0, 0], [1, 1]],
         "<a>a<i> b</i></a>"),
    ]
    for case_no, (tagged_text, wrd_align, truth) in enumerate(examples):
        with self.subTest(i=case_no):
            # Strip the markup, then tokenize the plain source on spaces.
            text, s_tags = remove_tags(tagged_text)
            s_token = text.split(" ")
            tok_src = split_with_indices(text, " ")
            # Identical characters on the target side, so the target
            # token spans are the source spans.
            tok_trg = tok_src
            t_token = get_target_tokens(s_token, wrd_align)
            # Project the source tags onto the target via the alignment.
            t_tags = align_tags(tagged_text, text, s_token, tok_src,
                                tok_trg, s_tags, t_token,
                                wrd_align=wrd_align)
            # Rebuild the tagged target and compare with the expectation.
            rebuilt = insert_tags(" ".join(t_token), t_tags)
            self.assertEqual(
                rebuilt, truth,
                "\n%s\nTruth:\n%s\nGot:\n%s" % (
                    tagged_text, truth, rebuilt))