Example #1
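These examples exercise jiant's TokenAligner, which projects token indices and spans from one tokenization of a sentence onto another. Each test snippet assumes roughly the following preamble (a sketch: the exact import path is an assumption and varies between jiant versions); the dataset-reader methods further down additionally rely on pandas, nltk, gzip/json, and jiant's span-prediction example template.

import numpy as np
import pytest

# Assumed import path -- adjust to the jiant version in use.
from jiant.utils.retokenize import TokenAligner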
def test_token_aligner_project_single_token_index():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_token_idxs(1)
    m_expected = np.array([1, 2])
    assert (m == m_expected).all()
def test_token_aligner_project_to_empty_target_token_sequence():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = []
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens([1, 3])
    m_expected = np.array([])
    assert (m == m_expected).all()
def test_token_aligner_project_to_mismatched_token_sequence():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["qrs", "tuv", "wxy", "z"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens([1])
    m_expected = np.array([])
    assert (m == m_expected).all()
def test_token_aligner_project_span():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_span(1, 2)
    m_expected = np.array([1, 3])
    assert (m == m_expected).all()
def test_moses_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules",
        ".",
    ]
    tgt_token_index = [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9],
                       [10, 11]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_token_aligner_project_span_last_token_range_is_end_exclusive():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_span(3, 4)
    m_expected = np.array([4, 5])
    assert (m == m_expected).all()
def test_token_aligner_project_multiple_token_indices():
    source_tokens = ["abc", "def", "ghi", "jkl"]
    target_tokens = ["abc", "d", "ef", "ghi", "jkl"]
    ta = TokenAligner(source_tokens, target_tokens)
    m = ta.project_tokens([1, 3])
    m_expected = np.array([1, 2, 4])
    assert (m == m_expected).all()
def test_project_span_covering_whole_sequence():
    src_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_tokens = [
        "Members", "Ġof", "Ġthe", "ĠHouse", "Ġcl", "apped", "Ġtheir", "Ġhands"
    ]
    # reference: tgt_token_index = [[0], [1], [2], [3], [4, 5], [6], [7]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    assert (0, 8) == ta.project_span(0, 7)
def test_bytebpe_tok_idx_proj_4():
    src_tokens = ["What?"]
    tgt_tokens = ["What", "?"]
    tgt_token_index = [[0, 1]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_wpm_tok_idx_proj_1():
    src_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_token_index = [[0], [1], [2], [3], [4], [5], [6]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
Example #11
    def _create_examples(self, qa_file_path, set_type):
        wiki_df = pd.read_csv(self.path_dict["wiki_dict"],
                              sep="\t",
                              names=["sent_id", "text"])
        wiki_dict = {
            row.sent_id: row.text
            for row in wiki_df.itertuples(index=False)
        }

        data_df = pd.read_csv(
            qa_file_path,
            sep="\t",
            header=None,
            names=[
                "sent_id",
                "target_ids",
                "worker_id",
                "qa_index",
                "qa_word",
                "question",
                "answer",
                "response1",
                "response2",
            ],
        )
        data_df["sent"] = data_df["sent_id"].apply(wiki_dict.get)

        examples = []
        ptb_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()
        for i, row in enumerate(data_df.itertuples(index=False)):
            # Answer indices are a space-delimited list of numbers.
            # We simply take the min/max of the indices.
            answer_idxs = list(map(int, row.answer.split()))
            answer_token_start, answer_token_end = min(answer_idxs), max(
                answer_idxs)
            passage_ptb_tokens = row.sent.split()
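            # Detokenize the PTB tokens back into plain text, then re-split on
            # whitespace so PTB token indices can be aligned to the space-token string.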
            passage_space_tokens = ptb_detokenizer.detokenize(
                passage_ptb_tokens, convert_parentheses=True).split()
            passage_space_str = " ".join(passage_space_tokens)

            token_aligner = TokenAligner(source=passage_ptb_tokens,
                                         target=passage_space_tokens)
            answer_char_span = token_aligner.project_token_to_char_span(
                answer_token_start, answer_token_end, inclusive=True)
            answer_str = passage_space_str[
                answer_char_span[0]:answer_char_span[1] + 1]

            examples.append(
                span_pred_template.Example(
                    guid="%s-%s" % (set_type, i),
                    passage=passage_space_str,
                    question=row.question,
                    answer=answer_str,
                    answer_char_span=answer_char_span,
                ))

        return examples
def test_project_invalid_span():
    src_tokens = ["Members", "of", "the", "House", "clapped", "their", "hands"]
    tgt_tokens = [
        "Members", "Ġof", "Ġthe", "ĠHouse", "Ġcl", "apped", "Ġtheir", "Ġhands"
    ]
    # reference: tgt_token_index = [[0], [1], [2], [3], [4, 5], [6], [7]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    with pytest.raises(ValueError):
        ta.project_span(0, 0)
def test_sentencepiece_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "▁Mr",
        ".",
        "▁I",
        "m",
        "mel",
        "t",
        "▁chose",
        "▁to",
        "▁focus",
        "▁on",
        "▁the",
        "▁in",
        "comp",
        "re",
        "hen",
        "s",
        "ibility",
        "▁of",
        "▁accounting",
        "▁rules",
        ".",
    ]
    tgt_token_index = [
        [0, 1],
        [2, 3, 4, 5],
        [6],
        [7],
        [8],
        [9],
        [10],
        [11, 12, 13, 14, 15, 16],
        [17],
        [18],
        [19, 20],
    ]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_moses_tok_idx_proj_2():
    src_tokens = ["I", "look", "at", "Sarah's", "dog.", "It", "was", "cute.!"]
    tgt_tokens = [
        "I", "look", "at", "Sarah", "'s", "dog", ".", "It", "was", "cute",
        ".", "!"
    ]
    tgt_token_index = [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_wpm_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "Mr",
        ".",
        "I",
        "##mme",
        "##lt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "in",
        "##com",
        "##p",
        "##re",
        "##hen",
        "##si",
        "##bility",
        "of",
        "accounting",
        "rules",
        ".",
    ]
    tgt_token_index = [
        [0, 1],
        [2, 3, 4],
        [5],
        [6],
        [7],
        [8],
        [9],
        [10, 11, 12, 13, 14, 15, 16],
        [17],
        [18],
        [19, 20],
    ]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_wpm_tok_idx_proj_2():
    src_tokens = ["I", "look", "at", "Sarah's", "dog.", "It", "was", "cute.!"]
    tgt_tokens = [
        "I", "look", "at", "Sarah", "'", "s", "dog", ".", "It", "was", "cute",
        ".", "!"
    ]
    tgt_token_index = [[0], [1], [2], [3, 4, 5], [6, 7], [8], [9],
                       [10, 11, 12]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
Example #17
    def tokenize(self, tokenizer):

        passage_tokens = tokenizer.tokenize(self.passage)
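        # Project the answer's character span in the raw passage onto the
        # tokenizer's token indices (source = passage string, target = its tokens).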
        token_aligner = TokenAligner(source=self.passage, target=passage_tokens)
        answer_token_span = token_aligner.project_char_to_token_span(
            self.answer_char_span[0], self.answer_char_span[1], inclusive=True
        )

        return TokenizedExample(
            guid=self.guid,
            passage=passage_tokens,
            question=tokenizer.tokenize(self.question),
            answer_str=self.answer,
            passage_str=self.passage,
            answer_token_span=answer_token_span,
            token_idx_to_char_idx_map=token_aligner.source_char_idx_to_target_token_idx.T,
        )
def test_bpe_tok_idx_proj_2():
    src_tokens = ["I", "look", "at", "Sarah's", "dog.", "It", "was", "cute.!"]
    tgt_tokens = [
        "i</w>",
        "look</w>",
        "at</w>",
        "sarah</w>",
        "'s</w>",
        "dog</w>",
        ".</w>",
        "it</w>",
        "was</w>",
        "cute</w>",
        ".</w>",
        "!</w>",
    ]
    tgt_token_index = [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_bpe_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "mr.</w>",
        "im",
        "melt</w>",
        "chose</w>",
        "to</w>",
        "focus</w>",
        "on</w>",
        "the</w>",
        "in",
        "comprehen",
        "si",
        "bility</w>",
        "of</w>",
        "accounting</w>",
        "rules</w>",
        ".</w>",
    ]
    tgt_token_index = [[0], [1, 2], [3], [4], [5], [6], [7], [8, 9, 10, 11],
                       [12], [13], [14, 15]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
def test_bytebpe_tok_idx_proj_3():
    src_tokens = [
        "Mr.",
        "Immelt",
        "chose",
        "to",
        "focus",
        "on",
        "the",
        "incomprehensibility",
        "of",
        "accounting",
        "rules.",
    ]
    tgt_tokens = [
        "Mr",
        ".",
        "ĠImm",
        "elt",
        "Ġchose",
        "Ġto",
        "Ġfocus",
        "Ġon",
        "Ġthe",
        "Ġincomp",
        "rehens",
        "ibility",
        "Ġof",
        "Ġaccounting",
        "Ġrules",
        ".",
    ]
    tgt_token_index = [[0, 1], [2, 3], [4], [5], [6], [7], [8], [9, 10, 11],
                       [12], [13], [14, 15]]
    ta = TokenAligner(src_tokens, tgt_tokens)
    for src_token_idx in range(len(src_tokens)):
        projected_tgt_tok_idx = ta.project_tokens(src_token_idx)
        assert (tgt_token_index[src_token_idx] == projected_tgt_tok_idx).all()
Example #21
def test_private_project_token_span():
    mat = np.eye(5, dtype=int)
    mat[0][0] = 0
    mat[3][3] = 0
    assert TokenAligner._project_span(mat, 1, 3, inclusive=True) == (1, 2)
    assert TokenAligner._project_span(mat, 1, 3, inclusive=False) == (1, 3)
    assert TokenAligner._project_span(mat, 1, 2, inclusive=True) == (1, 2)
    assert TokenAligner._project_span(mat, 1, 2, inclusive=False) == (1, 2)
    assert TokenAligner._project_span(mat, 1, 4, inclusive=True) == (1, 4)
    assert TokenAligner._project_span(mat, 1, 4, inclusive=False) == (1, 3)
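The test above exercises the private span-projection helper directly. A minimal sketch of how the alignment matrix appears to be read, inferred from the asserted values rather than from documented API:

import numpy as np

# Rows index source tokens, columns index target tokens; mat[i, j] == 1
# aligns source token i with target token j. With rows 0 and 3 zeroed out,
# source tokens 0 and 3 have no target alignment, so projecting the
# inclusive source span (1, 3) collects only target indices {1, 2}.
mat = np.eye(5, dtype=int)
mat[0][0] = 0
mat[3][3] = 0
aligned = {int(j) for i in (1, 2, 3) for j in np.nonzero(mat[i])[0]}
assert aligned == {1, 2}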
Example #22
    def _create_examples(self, file_path, set_type):

        with gzip.open(file_path) as f:
            lines = f.read().splitlines()

        examples = []
        ptb_detokenizer = nltk.tokenize.treebank.TreebankWordDetokenizer()

        for line in lines:
            datum = json.loads(line)
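            # Restructure the raw QA-SRL record: keep the sentence tokens and, for
            # each verb, the valid answer spans per question (spans made inclusive).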
            datum = {
                "sentence_tokens":
                datum["sentenceTokens"],
                "entries": [{
                    "verb": verb_entry["verbInflectedForms"]["stem"],
                    "verb_idx": verb_idx,
                    "questions": {
                        question:
                        [[{
                            "tokens": datum["sentenceTokens"][span[0]:span[1]],
                            "span": (span[0], span[1] - 1),
                        } for span in answer_judgment["spans"]]
                         for answer_judgment in q_data["answerJudgments"]
                         if answer_judgment["isValid"]]
                        for question, q_data in
                        verb_entry["questionLabels"].items()
                    },
                } for verb_idx, verb_entry in datum["verbEntries"].items()],
            }

            passage_ptb_tokens = datum["sentence_tokens"]
            passage_space_tokens = ptb_detokenizer.detokenize(
                passage_ptb_tokens, convert_parentheses=True).split()
            passage_space_str = " ".join(passage_space_tokens)

            token_aligner = TokenAligner(source=passage_ptb_tokens,
                                         target=passage_space_tokens)

            for entry in datum["entries"]:
                for question, answer_list in entry["questions"].items():
                    for answer in answer_list:
                        for answer_span in answer:
                            try:
                                answer_char_span = token_aligner.project_token_to_char_span(
                                    answer_span["span"][0],
                                    answer_span["span"][1],
                                    inclusive=True)
                            except ValueError:
                                continue
                            answer_str = passage_space_str[
                                answer_char_span[0]:answer_char_span[1] + 1]

                            examples.append(
                                span_pred_template.Example(
                                    guid="%s-%s" % (set_type, len(examples)),
                                    passage=passage_space_str,
                                    question=question,
                                    answer=answer_str,
                                    answer_char_span=answer_char_span,
                                ))

        return examples
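Taken together, the methods exercised in these examples support a minimal usage sketch like the one below; the import path is an assumption, and the expected values in the comments follow the behavior asserted in the tests above.

# Assumed import path -- adjust to the jiant version in use.
from jiant.utils.retokenize import TokenAligner

source_tokens = ["Sarah's", "dog."]
target_tokens = ["Sarah", "'s", "dog", "."]
ta = TokenAligner(source_tokens, target_tokens)

print(ta.project_tokens(0))    # source token 0 -> target token indices [0, 1]
print(ta.project_span(0, 1))   # end-exclusive source span -> target span (0, 2)

# Inclusive character span of source tokens 0..1 within " ".join(target_tokens).
print(ta.project_token_to_char_span(0, 1, inclusive=True))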