Example #1
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: Optional[str] = "<unk>",
                 suffix: Optional[str] = "</w>",
                 dropout: Optional[float] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
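
A minimal usage sketch for the wrapper above, assuming it is exposed as CharBPETokenizer (the class name and file names are assumptions; the parameters dict above only records the model as "BPE"):

    # Hypothetical usage: instantiate from existing files and round-trip a sentence
    from tokenizers import CharBPETokenizer

    tokenizer = CharBPETokenizer("vocab.json", "merges.txt")
    output = tokenizer.encode("Hello, world!")
    print(output.tokens)                 # subword tokens, each word ending in "</w>"
    print(tokenizer.decode(output.ids))  # the BPEDecoder strips the suffix again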
Example #2
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: str = "<unk>",
                 replacement: str = "▁",
                 add_prefix_space: bool = True,
                 dropout: Optional[float] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.add_special_tokens([unk_token])

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(
            replacement=replacement, add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.Metaspace.new(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
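
The same pattern, assuming this wrapper is exposed as SentencePieceBPETokenizer (the name is an assumption suggested by "SentencePieceBPE" in the parameters dict):

    # Hypothetical usage
    from tokenizers import SentencePieceBPETokenizer

    tok = SentencePieceBPETokenizer("vocab.json", "merges.txt")
    enc = tok.encode("Hello world")
    print(enc.tokens)           # pieces carry the "▁" word-boundary marker
    print(tok.decode(enc.ids))  # the Metaspace decoder turns "▁" back into spaces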
Example #3
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(
                    vocab_file,
                    merges_file,
                    dropout=dropout,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or "",
                ))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if lowercase:
            normalizers += [Lowercase()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(
            trim_offsets=trim_offsets)

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
            "lowercase": lowercase,
            "dropout": dropout,
            "unicode_normalizer": unicode_normalizer,
            "continuing_subword_prefix": continuing_subword_prefix,
            "end_of_word_suffix": end_of_word_suffix,
            "trim_offsets": trim_offsets,
        }

        super().__init__(tokenizer, parameters)
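
A usage sketch, assuming this is the __init__ of ByteLevelBPETokenizer (the name matches the "ByteLevelBPE" entry in parameters but is an assumption here); Example #4 below exercises the trim_offsets behaviour configured in this constructor:

    # Hypothetical usage
    from tokenizers import ByteLevelBPETokenizer

    tok = ByteLevelBPETokenizer("vocab.json", "merges.txt", lowercase=True)
    enc = tok.encode("My name is John")
    print(enc.tokens)  # byte-level pieces; a leading space is encoded as "Ġ"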
Example #4
    def test_processing(self, roberta_files):
        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)

        # Keeps original offsets
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]

        # Trims offsets when activated
        tokenizer.post_processor = ByteLevel(trim_offsets=True)
        output = tokenizer.encode("My name is John")
        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
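
The "Ġ" in these tokens is the byte-level encoding of the space in front of each word. Without post-processing the offsets still cover that space ("Ġname" spans (2, 7)), while with trim_offsets=True the leading whitespace is excluded and the same token spans (3, 7).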
Example #5
    def slow_train():
        tokenizer, trainer = TestQuicktour.get_tokenizer_trainer()

        # START train
        files = [
            f"data/wikitext-103-raw/wiki.{split}.raw"
            for split in ["test", "train", "valid"]
        ]
        tokenizer.train(trainer, files)
        # END train
        # START reload_model
        files = tokenizer.model.save("data", "wiki")
        tokenizer.model = BPE.from_files(*files, unk_token="[UNK]")
        # END reload_model
        # START save
        tokenizer.save("data/tokenizer-wiki.json")
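
Once saved, the whole pipeline (model, normalizer, pre-tokenizer, post-processor) can be restored from the JSON file in a single call:

    # Reload the serialized tokenizer
    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("data/tokenizer-wiki.json")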
Example #6
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: Optional[str] = "<unk>",
        suffix: Optional[str] = "</w>",
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        # OpenAI normalization is the same as Bert
        normalizers += [BertNormalizer()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = BertPreTokenizer()
        tokenizer.decoder = BPEDecoder(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
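
Apart from the normalization stack, this is the same construction as Example #1: an optional Unicode normalizer plus a BertNormalizer (OpenAI's normalization matches Bert's, as the comment notes) replace the fixed NFKC + Lowercase sequence, and a BertPreTokenizer stands in for the Whitespace pre-tokenizer; model, decoder and parameters are unchanged.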
Example #7
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False,
                 do_lowercase: bool = False,
                 unicode_normalizer: Optional[str] = None,
                 continuing_subword_prefix: Optional[str] = None,
                 end_of_word_suffix: Optional[str] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(
                    vocab_file,
                    merges_file,
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or ""))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if do_lowercase:
            normalizers += [Lowercase.new()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence.new(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
Example #8
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
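
This is a stripped-down variant of Example #7: NFKC normalization is always applied rather than configurable, and the lowercasing, Unicode-normalizer and subword prefix/suffix options are dropped, so no dropout or affix arguments are forwarded to BPE.from_files.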
Example #9
    def test_encode_add_special_tokens(self, roberta_files):
        tokenizer = Tokenizer(
            BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.add_special_tokens(["<s>", "</s>"])

        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
        tokenizer.post_processor = RobertaProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        # Can encode with special tokens
        output_with_specials = tokenizer.encode("My name is John",
                                                add_special_tokens=True)
        assert output_with_specials.tokens == [
            "<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"
        ]

        # Can encode without special tokens
        output_without_specials = tokenizer.encode("My name is John",
                                                   add_special_tokens=False)
        assert output_without_specials.tokens == [
            "ĠMy", "Ġname", "Ġis", "ĠJohn"
        ]
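
RobertaProcessing takes the separator and class-token pairs as (token, id) tuples and wraps a single sequence as <s> ... </s>; passing add_special_tokens=False to encode skips this post-processing step, which is exactly what the second assertion checks.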
Example #10
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!
""".split("\n")

if args.type == "gpt2":
    print("Running GPT-2 tokenizer")
    tok_p = GPT2Tokenizer.from_pretrained('gpt2')

    # Create a Tokenizer using BPE
    tok_r = Tokenizer(BPE.from_files(args.vocab, args.merges))
    # Use ByteLevel PreTokenizer
    tok_r.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    # Use ByteLevel Decoder
    tok_r.decoder = decoders.ByteLevel()
elif args.type == "bert":
    print("Running Bert tokenizer")
    tok_p = BertTokenizer.from_pretrained(args.vocab)

    tok_r = Tokenizer(
        WordPiece.from_files(args.vocab,
                             unk_token="[UNK]",
                             max_input_chars_per_word=100))
    tok_r.normalizer = BertNormalizer(
        clean_text=True,
        handle_chinese_chars=True,
Example #11
    def test_instantiate(self, roberta_files):
        assert isinstance(BPE.empty(), Model)
        assert isinstance(
            BPE.from_files(roberta_files["vocab"], roberta_files["merges"]),
            Model)