Example no. 1
    def test_bytebpe(self):
        self.tokens = [
            [
                "ĠMembers", "Ġof", "Ġthe", "ĠHouse", "Ġcl", "apped", "Ġtheir",
                "Ġhands"
            ],
            [
                "ĠI", "Ġlook", "Ġat", "ĠSarah", "'s", "Ġdog", ".", "ĠIt",
                "Ġwas", "Ġcute", ".", "!"
            ],
            [
                "ĠMr",
                ".",
                "ĠImm",
                "elt",
                "Ġchose",
                "Ġto",
                "Ġfocus",
                "Ġon",
                "Ġthe",
                "Ġincomp",
                "rehens",
                "ibility",
                "Ġof",
                "Ġaccounting",
                "Ġrules",
                ".",
            ],
            ["ĠWhat", "?"],
        ]
        self.token_index_tgt = [
            [[0], [1], [2], [3], [4, 5], [6], [7]],
            [[0], [1], [2], [3, 4], [5], [6, 7], [8], [9, 10, 11]],
            [[0], [1, 2, 3], [4], [5], [6], [7], [8], [9, 10, 11], [12], [13],
             [14, 15]],
            [[0, 1]],
        ]
        self.span_index_tgt = [
            [(0, 4), (6, 8)],
            [(0, 1), (3, 6)],
            [(0, 4), (8, 16), (8, 12), (9, 16)],
            [(0, 2)],
        ]

        aligner_fn = retokenize.get_aligner_fn("roberta-base")
        tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
        tas, tokens = list(tas), list(tokens)
        token_index_tgt = [[
            ta.project_tokens(idxs).tolist() for idxs in token_idxs
        ] for ta, token_idxs in zip(tas, self.token_index_src)]
        span_index_tgt = [[
            ta.project_span(start, end) for (start, end) in span_idxs
        ] for ta, span_idxs in zip(tas, self.span_index_src)]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
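The tests in this suite read self.text, self.token_index_src, and self.span_index_src from a setUp fixture that is not shown in these snippets. The following is a reconstruction inferred from the expected outputs; it is consistent with all five tokenizer tests below, but it is not necessarily the verbatim upstream fixture.

    def setUp(self):
        # Source sentences; the source tokenization is a plain whitespace
        # split of each string.
        self.text = [
            "Members of the House clapped their hands",
            "I look at Sarah's dog. It was cute.!",
            "Mr. Immelt chose to focus on the incomprehensibility of accounting rules.",
            "What?",
        ]
        # One singleton index list per whitespace token (7, 8, 11, and 1
        # source tokens, respectively).
        self.token_index_src = [[[i] for i in range(n)] for n in (7, 8, 11, 1)]
        # (start, end) spans over the source tokens, end-exclusive.
        self.span_index_src = [
            [(0, 4), (5, 7)],
            [(0, 1), (3, 5)],
            [(0, 2), (6, 11), (6, 8), (7, 11)],
            [(0, 1)],
        ]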
Example no. 2
def get_tags(text, current_tags, tokenizer_name, tag_dict):
    aligner_fn = get_aligner_fn(tokenizer_name)
    assert len(text) == len(current_tags)
    # Tokenizer-introduced pieces get a sentinel tag one past the real tags.
    introduced_tokenizer_tag = len(tag_dict)
    token_aligner, aligned_text = aligner_fn(" ".join(text))
    aligned_tags = [introduced_tokenizer_tag for _ in aligned_text]
    # Only the first aligned piece of each source token keeps the real tag.
    for text_idx, text_tag in enumerate(current_tags):
        aligned_idx = token_aligner.project_tokens(text_idx)[0]
        aligned_tags[aligned_idx] = tag_dict[text_tag]
    str_tags = [str(s) for s in aligned_tags]
    return " ".join(str_tags)
Example no. 3
def retokenize_record(record, tokenizer_name):
    """Retokenize an edge probing example. Modifies in-place."""
    text = record["text"]
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    ta, new_tokens = aligner_fn(text)
    record["text"] = " ".join(new_tokens)
    for target in record["targets"]:
        if "span1" in target:
            target["span1"] = list(map(int, ta.project_span(*target["span1"])))
        if "span2" in target:
            target["span2"] = list(map(int, ta.project_span(*target["span2"])))
    return record
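A minimal usage sketch for retokenize_record, assuming the edge-probing record schema implied by the function body (span1/span2 as [start, end) token spans over the whitespace-tokenized text); the label value is illustrative.

record = {
    "text": "Members of the House clapped their hands",
    "targets": [{"span1": [0, 4], "span2": [5, 7], "label": "coref"}],
}
retokenize_record(record, "bert-base-cased")
# record["text"] now holds the space-joined subword tokens, and span1
# and span2 have been projected into subword coordinates.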
Example no. 4
    def test_moses(self):
        self.tokens = [
            ["Members", "of", "the", "House", "clapped", "their", "hands"],
            [
                "I", "look", "at", "Sarah", "'s", "dog", ".", "It", "was",
                "cute", ".", "!"
            ],
            [
                "Mr.",
                "Immelt",
                "chose",
                "to",
                "focus",
                "on",
                "the",
                "incomprehensibility",
                "of",
                "accounting",
                "rules",
                ".",
            ],
            ["What", "?"],
        ]
        self.token_index_tgt = [
            [[0], [1], [2], [3], [4], [5], [6]],
            [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
            [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10, 11]],
            [[0, 1]],
        ]
        self.span_index_tgt = [
            [(0, 4), (5, 7)],
            [(0, 1), (3, 7)],
            [(0, 2), (6, 12), (6, 8), (7, 12)],
            [(0, 2)],
        ]

        aligner_fn = retokenize.get_aligner_fn("transfo-xl-wt103")
        token_aligners, tokens = zip(*(aligner_fn(sent) for sent in self.text))
        token_aligners, tokens = list(token_aligners), list(tokens)
        token_index_tgt = [[
            token_aligner.project_tokens(idxs).tolist() for idxs in token_idxs
        ] for token_aligner, token_idxs in zip(token_aligners, self.token_index_src)]
        span_index_tgt = [[
            token_aligner.project_span(start, end) for (start, end) in span_idxs
        ] for token_aligner, span_idxs in zip(token_aligners, self.span_index_src)]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
Example no. 5
def get_tags(text, current_tags, tokenizer_name, tag_dict):
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    assert len(text) == len(current_tags)
    res_tags = []
    introduced_tokenizer_tag = len(tag_dict)
    for i in range(len(text)):
        token = text[i]
        _, new_toks = aligner_fn(token)
        # Following the BERT paper's wordpiece convention, only the first
        # piece of each word keeps the original tag; every further piece
        # gets the sentinel tag for tokenizer-introduced tokens.
        res_tags.append(tag_dict[current_tags[i]])
        for _ in new_toks[1:]:
            res_tags.append(introduced_tokenizer_tag)
    _, aligned_text = aligner_fn(" ".join(text))
    assert len(aligned_text) == len(res_tags)
    str_tags = [str(s) for s in res_tags]
    return " ".join(str_tags)
Example no. 6
    def test_wpm(self):
        self.tokens = [
            ["Members", "of", "the", "House", "clapped", "their", "hands"],
            [
                "I", "look", "at", "Sarah", "'", "s", "dog", ".", "It", "was",
                "cute", ".", "!"
            ],
            [
                "Mr",
                ".",
                "I",
                "##mme",
                "##lt",
                "chose",
                "to",
                "focus",
                "on",
                "the",
                "in",
                "##com",
                "##p",
                "##re",
                "##hen",
                "##si",
                "##bility",
                "of",
                "accounting",
                "rules",
                ".",
            ],
            ["What", "?"],
        ]
        self.token_index_tgt = [
            [[0], [1], [2], [3], [4], [5], [6]],
            [[0], [1], [2], [3, 4, 5], [6, 7], [8], [9], [10, 11, 12]],
            [
                [0, 1],
                [2, 3, 4],
                [5],
                [6],
                [7],
                [8],
                [9],
                [10, 11, 12, 13, 14, 15, 16],
                [17],
                [18],
                [19, 20],
            ],
            [[0, 1]],
        ]
        self.span_index_tgt = [
            [(0, 4), (5, 7)],
            [(0, 1), (3, 8)],
            [(0, 5), (9, 21), (9, 17), (10, 21)],
            [(0, 2)],
        ]

        aligner_fn = retokenize.get_aligner_fn("bert-base-cased")
        tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
        tas, tokens = list(tas), list(tokens)
        token_index_tgt = [[
            ta.project_tokens(idxs).tolist() for idxs in token_idxs
        ] for ta, token_idxs in zip(tas, self.token_index_src)]
        span_index_tgt = [[
            ta.project_span(start, end) for (start, end) in span_idxs
        ] for ta, span_idxs in zip(tas, self.span_index_src)]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
Example no. 7
    def test_sentencepiece(self):
        self.tokens = [
            [
                "▁Members", "▁of", "▁the", "▁House", "▁clapped", "▁their",
                "▁hands"
            ],
            [
                "▁I",
                "▁look",
                "▁at",
                "▁Sarah",
                "'",
                "s",
                "▁dog",
                ".",
                "▁It",
                "▁was",
                "▁cute",
                ".",
                "!",
            ],
            [
                "▁Mr",
                ".",
                "▁I",
                "m",
                "mel",
                "t",
                "▁chose",
                "▁to",
                "▁focus",
                "▁on",
                "▁the",
                "▁in",
                "comp",
                "re",
                "hen",
                "s",
                "ibility",
                "▁of",
                "▁accounting",
                "▁rules",
                ".",
            ],
            ["▁What", "?"],
        ]
        self.token_index_tgt = [
            [[0], [1], [2], [3], [4], [5], [6]],
            [[0], [1], [2], [3, 4, 5], [6, 7], [8], [9], [10, 11, 12]],
            [
                [0, 1],
                [2, 3, 4, 5],
                [6],
                [7],
                [8],
                [9],
                [10],
                [11, 12, 13, 14, 15, 16],
                [17],
                [18],
                [19, 20],
            ],
            [[0, 1]],
        ]
        self.span_index_tgt = [
            [(0, 4), (5, 7)],
            [(0, 1), (3, 8)],
            [(0, 6), (10, 21), (10, 17), (11, 21)],
            [(0, 2)],
        ]

        aligner_fn = retokenize.get_aligner_fn("xlnet-base-cased")
        tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
        tas, tokens = list(tas), list(tokens)
        token_index_tgt = [[
            ta.project_tokens(idxs).tolist() for idxs in token_idxs
        ] for ta, token_idxs in zip(tas, self.token_index_src)]
        span_index_tgt = [[
            ta.project_span(start, end) for (start, end) in span_idxs
        ] for ta, span_idxs in zip(tas, self.span_index_src)]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
Example no. 8
    def test_bpe(self):
        self.tokens = [
            [
                "members</w>",
                "of</w>",
                "the</w>",
                "house</w>",
                "clapped</w>",
                "their</w>",
                "hands</w>",
            ],
            [
                "i</w>",
                "look</w>",
                "at</w>",
                "sarah</w>",
                "'s</w>",
                "dog</w>",
                ".</w>",
                "it</w>",
                "was</w>",
                "cute</w>",
                ".</w>",
                "!</w>",
            ],
            [
                "mr.</w>",
                "im",
                "melt</w>",
                "chose</w>",
                "to</w>",
                "focus</w>",
                "on</w>",
                "the</w>",
                "in",
                "comprehen",
                "si",
                "bility</w>",
                "of</w>",
                "accounting</w>",
                "rules</w>",
                ".</w>",
            ],
            ["what</w>", "?</w>"],
        ]
        self.token_index_tgt = [
            [[0], [1], [2], [3], [4], [5], [6]],
            [[0], [1], [2], [3, 4], [5, 6], [7], [8], [9, 10, 11]],
            [[0], [1, 2], [3], [4], [5], [6], [7], [8, 9, 10, 11], [12], [13],
             [14, 15]],
            [[0, 1]],
        ]
        self.span_index_tgt = [
            [(0, 4), (5, 7)],
            [(0, 1), (3, 7)],
            [(0, 3), (7, 16), (7, 12), (8, 16)],
            [(0, 2)],
        ]

        aligner_fn = retokenize.get_aligner_fn("openai-gpt")
        tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
        tas, tokens = list(tas), list(tokens)
        token_index_tgt = [[
            ta.project_tokens(idxs).tolist() for idxs in token_idxs
        ] for ta, token_idxs in zip(tas, self.token_index_src)]
        span_index_tgt = [[
            ta.project_span(start, end) for (start, end) in span_idxs
        ] for ta, span_idxs in zip(tas, self.span_index_src)]
        assert self.tokens == tokens
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt
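The five tokenizer tests above share this driver code verbatim. A hypothetical helper that would factor it out is sketched below (the method name is an assumption, not part of the original suite); each test would then end with, e.g., self._run_aligner_test("openai-gpt").

    def _run_aligner_test(self, tokenizer_name):
        # Align every source sentence, then project the source token
        # indices and spans into the target tokenization.
        aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
        tas, tokens = zip(*(aligner_fn(sent) for sent in self.text))
        token_index_tgt = [
            [ta.project_tokens(idxs).tolist() for idxs in token_idxs]
            for ta, token_idxs in zip(tas, self.token_index_src)
        ]
        span_index_tgt = [
            [ta.project_span(start, end) for (start, end) in span_idxs]
            for ta, span_idxs in zip(tas, self.span_index_src)
        ]
        assert self.tokens == list(tokens)
        assert self.token_index_tgt == token_index_tgt
        assert self.span_index_tgt == span_index_tgt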
Example no. 9
def realign_spans(record, tokenizer_name):
    """
    Builds the indices alignment while also tokenizing the input
    piece by piece.

    Parameters
    -----------------------
        record: dict with the below fields
            text: str
            targets: list of dictionaries
                label: bool
                span1_index: int, start index of first span
                span1_text: str, text of first span
                span2_index: int, start index of second span
                span2_text: str, text of second span
        tokenizer_name: str

    Returns
    ------------------------
        record: dict with the below fields:
            text: str in tokenized form
            targets: dictionary with the below fields
                -label: bool
                -span_1: (int, int) of token indices
                -span1_text: str, the string
                -span2: (int, int) of token indices
                -span2_text: str, the string
    """

    # find span indices and text
    text = record["text"].split()
    span1 = record["targets"][0]["span1_index"]
    span1_text = record["targets"][0]["span1_text"]
    span2 = record["targets"][0]["span2_index"]
    span2_text = record["targets"][0]["span2_text"]

    # construct end spans given span text space-tokenized length
    span1 = [span1, span1 + len(span1_text.strip().split())]
    span2 = [span2, span2 + len(span2_text.strip().split())]
    indices = [span1, span2]

    sorted_indices = sorted(indices, key=lambda x: x[0])
    current_tokenization = []
    span_mapping = {}

    # align the first span (in sorted order) to the tokenized text
    aligner_fn = retokenize.get_aligner_fn(tokenizer_name)
    _, new_tokens = aligner_fn(" ".join(text[: sorted_indices[0][0]]))
    current_tokenization.extend(new_tokens)
    new_span1start = len(current_tokenization)
    _, span_tokens = aligner_fn(" ".join(text[sorted_indices[0][0] : sorted_indices[0][1]]))
    current_tokenization.extend(span_tokens)
    new_span1end = len(current_tokenization)
    span_mapping[sorted_indices[0][0]] = [new_span1start, new_span1end]

    # re-index the second span (in sorted order)
    _, new_tokens = aligner_fn(" ".join(text[sorted_indices[0][1] : sorted_indices[1][0]]))
    current_tokenization.extend(new_tokens)
    new_span2start = len(current_tokenization)
    _, span_tokens = aligner_fn(" ".join(text[sorted_indices[1][0] : sorted_indices[1][1]]))
    current_tokenization.extend(span_tokens)
    new_span2end = len(current_tokenization)
    span_mapping[sorted_indices[1][0]] = [new_span2start, new_span2end]

    # save back into record
    _, all_text = aligner_fn(" ".join(text))
    record["targets"][0]["span1"] = span_mapping[record["targets"][0]["span1_index"]]
    record["targets"][0]["span2"] = span_mapping[record["targets"][0]["span2_index"]]
    record["text"] = " ".join(all_text)
    return record
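A minimal usage sketch for realign_spans, with a hypothetical record matching the docstring's input schema; the span indices count whitespace tokens of the original text.

record = {
    "text": "Members of the House clapped their hands",
    "targets": [{
        "label": True,
        "span1_index": 0, "span1_text": "Members of the House",
        "span2_index": 5, "span2_text": "their hands",
    }],
}
record = realign_spans(record, "bert-base-cased")
# record["text"] is now the retokenized string, and span1/span2 hold
# [start, end) indices into those tokens.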