def test_project_invalid_span():
    """A zero-width source span (start == end) must raise ValueError."""
    source = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    target = [
        "Members",
        "Ġof",
        "Ġthe",
        "ĠHouse",
        "Ġcl",
        "apped",
        "Ġtheir",
        "Ġhands",
    ]
    # reference: tgt_token_index = [[0], [1], [2], [3], [4, 5], [6], [7]]
    aligner = TokenAligner(source, target)
    with pytest.raises(ValueError):
        aligner.project_span(0, 0)
def test_token_aligner_project_span_last_token_range_is_end_exclusive():
    """Projecting the final source token yields an end-exclusive target range."""
    src = ["abc", "def", "ghi", "jkl"]
    tgt = ["abc", "d", "ef", "ghi", "jkl"]
    aligner = TokenAligner(src, tgt)
    projected = aligner.project_span(3, 4)
    expected = np.array([4, 5])
    assert (projected == expected).all()
def test_token_aligner_project_span():
    """A single source token that splits into two target tokens projects to a width-2 range."""
    src = ["abc", "def", "ghi", "jkl"]
    tgt = ["abc", "d", "ef", "ghi", "jkl"]
    aligner = TokenAligner(src, tgt)
    projected = aligner.project_span(1, 2)
    expected = np.array([1, 3])
    assert (projected == expected).all()
def test_project_span_covering_whole_sequence():
    """Projecting the full source span covers every target token (end-exclusive)."""
    source = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    target = [
        "Members",
        "Ġof",
        "Ġthe",
        "ĠHouse",
        "Ġcl",
        "apped",
        "Ġtheir",
        "Ġhands",
    ]
    # reference: tgt_token_index = [[0], [1], [2], [3], [4, 5], [6], [7]]
    aligner = TokenAligner(source, target)
    assert (0, 8) == aligner.project_span(0, 7)