Example #1
    def test_has_expected_type_and_methods(self):
        tokenizer = Tokenizer(BPE.empty())
        assert type(tokenizer) == Tokenizer
        assert callable(tokenizer.num_special_tokens_to_add)
        assert callable(tokenizer.get_vocab)
        assert callable(tokenizer.get_vocab_size)
        assert callable(tokenizer.enable_truncation)
        assert callable(tokenizer.no_truncation)
        assert callable(tokenizer.enable_padding)
        assert callable(tokenizer.no_padding)
        assert callable(tokenizer.normalize)
        assert callable(tokenizer.encode)
        assert callable(tokenizer.encode_batch)
        assert callable(tokenizer.decode)
        assert callable(tokenizer.decode_batch)
        assert callable(tokenizer.token_to_id)
        assert callable(tokenizer.id_to_token)
        assert callable(tokenizer.add_tokens)
        assert callable(tokenizer.add_special_tokens)
        assert callable(tokenizer.train)
        assert callable(tokenizer.post_process)
        assert isinstance(tokenizer.model, Model)
        assert tokenizer.normalizer is None
        assert tokenizer.pre_tokenizer is None
        assert tokenizer.post_processor is None
        assert tokenizer.decoder is None
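The test excerpts in this collection omit their imports. A plausible import header for them, assuming one of the early huggingface/tokenizers releases that still expose BPE.empty() and BPE.from_files():

# Assumed imports for the test snippets below; the module layout matches the
# early tokenizers releases these examples were written against.
from tokenizers import AddedToken, Tokenizer
from tokenizers.models import BPE, Model
from tokenizers.normalizers import BertNormalizer, Lowercase, Sequence, Strip
from tokenizers.processors import BertProcessing, RobertaProcessing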
Example #2
    def test_normalize(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.normalizer = Lowercase()

        output = tokenizer.normalize("My Name Is John")
        assert output == "my name is john"
Example #3
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: Optional[str] = "<unk>",
                 suffix: Optional[str] = "</w>",
                 dropout: Optional[float] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Example #4
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: str = "<unk>",
                 replacement: str = "▁",
                 add_prefix_space: bool = True,
                 dropout: Optional[float] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.add_special_tokens([unk_token])

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(
            replacement=replacement, add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.Metaspace.new(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
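A hypothetical usage sketch for the wrapper whose constructor is shown above. The class name SentencePieceBPETokenizer is inferred from the "model": "SentencePieceBPE" entry and, like the file paths, is an assumption rather than part of the excerpt:

# Hypothetical usage; the class name and file paths are assumptions.
tokenizer = SentencePieceBPETokenizer(vocab_file="vocab.json", merges_file="merges.txt")
encoding = tokenizer.encode("my name is john")
print(encoding.tokens)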
Example #5
    def test_strip_accents(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = BertNormalizer(
            strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
        )

        output = tokenizer.normalize("Héllò")
        assert output == "Hello"
Example #6
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(
                    vocab_file,
                    merges_file,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                ))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(
            trim_offsets=trim_offsets)

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)
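A hypothetical training sketch for this wrapper. The class name ByteLevelBPETokenizer is inferred from the "model": "ByteLevelBPE" entry, and the train() call assumes the usual (files, vocab_size, min_frequency) helper these implementation classes provide:

# Hypothetical usage; the class name and the exact train() signature are assumptions.
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)
tokenizer.train(files=["corpus.txt"], vocab_size=30000, min_frequency=2)
print(tokenizer.encode("my name is john").tokens)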
Example #7
    def test_processing(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_special_tokens(["<s>", "</s>"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
        assert output.ids == [0, 2, 3, 1, 1, 6, 1]
Example #8
    def test_processing(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
        assert output.ids == [1, 2, 3, 0, 6, 0]
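        # For this pair, output.type_ids would be [0, 0, 0, 0, 1, 1] (sequence A vs. sequence B).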
Example #9
    def test_add_tokens(self):
        tokenizer = Tokenizer(BPE.empty())
        added = tokenizer.add_tokens(["my", "name", "is", "john"])
        assert added == 4

        added = tokenizer.add_tokens(
            [AddedToken("the"),
             AddedToken("quick", rstrip=True)])
        assert added == 2
Example #10
    def test_get_vocab_size(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab's size with added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=True)
        assert size == 5

        # Can retrieve vocab's size without added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=False)
        assert size == 0
Example #11
    def test_get_vocab(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab with added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=True)
        assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

        # Can retrieve vocab without added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=False)
        assert vocab == {}
Example #12
    def test_decode(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can decode single sequences
        output = tokenizer.decode([0, 1, 2, 3])
        assert output == "my name is john"

        # Can decode batch
        output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
        assert output == ["my name is john", "pair"]
Example #13
    def test_add_special_tokens(self):
        tokenizer = Tokenizer(BPE.empty())

        # Can add special tokens as `str`
        added = tokenizer.add_special_tokens(["my", "name", "is", "john"])
        assert added == 4

        # Can add special tokens as `AddedToken`
        added = tokenizer.add_special_tokens(
            [AddedToken("the"),
             AddedToken("quick", rstrip=True)])
        assert added == 2
Example #14
    def test_truncation(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
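        # (With the default 'longest_first' strategy, the longer sequence is trimmed until the pair fits.)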
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "pair"]
Example #15
    def test_post_process(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(max_length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
Example #16
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: Optional[str] = "<unk>",
        suffix: Optional[str] = "</w>",
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        # OpenAI normalization is the same as Bert
        normalizers += [BertNormalizer()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Example #17
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False,
                 do_lowercase: bool = False,
                 unicode_normalizer: Optional[str] = None,
                 continuing_subword_prefix: Optional[str] = None,
                 end_of_word_suffix: Optional[str] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(
                    vocab_file,
                    merges_file,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or ""))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if do_lowercase:
            normalizers += [Lowercase.new()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence.new(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
Example #18
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
Example #19
    def test_padding(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default (no max_length) padding targets the longest sequence in a batch,
        # so encoding a single sequence is left unchanged
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified max length otherwise
        tokenizer.enable_padding(max_length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]
Example #20
    def test_encode(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode a single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        assert type(output.words) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(
            ["my name is john", ("my name is john", "pair")])
        assert len(output) == 2
Example #21
    def test_instantiate(self, roberta_files):
        assert isinstance(BPE.empty(), Model)
        assert isinstance(
            BPE.from_files(roberta_files["vocab"], roberta_files["merges"]),
            Model)
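The roberta_files argument above is a pytest fixture providing paths to a RoBERTa-style vocab/merges pair. A minimal sketch of the fixture's shape, with placeholder paths (the body is an assumption, not taken from the real test suite):

import pytest

@pytest.fixture
def roberta_files():
    # Hypothetical sketch: returns paths to an existing RoBERTa-style
    # vocab.json / merges.txt pair; the paths here are placeholders.
    return {
        "vocab": "data/roberta-base-vocab.json",
        "merges": "data/roberta-base-merges.txt",
    }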
Example #22
    def test_full_strip(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Strip(left=True, right=True)

        output = tokenizer.normalize("  hello  ")
        assert output == "hello"
Example #23
    def test_lowercase(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Lowercase()

        output = tokenizer.normalize("HELLO")
        assert output == "hello"
Example #24
    def test_can_make_sequences(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.normalizer = Sequence([Lowercase(), Strip()])

        output = tokenizer.normalize("  HELLO  ")
        assert output == "hello"