Example #1
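This snippet is the constructor of a character-level BPE tokenizer, apparently written against the HuggingFace tokenizers library's early 0.x Python API (hence the .new() constructors): it loads a BPE model from a vocab/merges pair or starts empty, normalizes text with NFKC plus lowercasing, splits on whitespace, and decodes with a BPEDecoder that re-attaches the </w> end-of-word suffix. It assumes imports along the lines of: from tokenizers import Tokenizer, pre_tokenizers, decoders; from tokenizers.models import BPE; from tokenizers.normalizers import Sequence, NFKC, Lowercase.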
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: Optional[str] = "<unk>",
                 suffix: Optional[str] = "</w>",
                 dropout: Optional[float] = None):
        # Load an existing vocab/merges pair when both are provided;
        # otherwise start from an empty BPE model that can be trained later.
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Normalize with NFKC and lowercase, pre-tokenize on whitespace,
        # and strip the end-of-word suffix back out when decoding.
        tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
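A minimal usage sketch, assuming the enclosing class is exposed as CharBPETokenizer and that a trained vocab/merges pair exists on disk (both the class name and the file paths are assumptions, not shown in the snippet):

    # Hypothetical: class name and file paths assumed for illustration.
    tokenizer = CharBPETokenizer(vocab_file="vocab.json",
                                 merges_file="merges.txt")
    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens)  # lowercased subwords, word-final pieces ending in "</w>"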
Example #2
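This second snippet is the byte-level counterpart (GPT-2 style "ByteLevelBPE"): the normalizer is assembled dynamically from the do_lowercase and unicode_normalizer arguments, while pre-tokenization and decoding use the paired ByteLevel components. On top of the imports listed for Example #1, it assumes unicode_normalizer_from_str from tokenizers.normalizers, which maps a name such as "nfkc" to the corresponding normalizer.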
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 add_prefix_space: bool = False,
                 do_lowercase: bool = False,
                 unicode_normalizer: Optional[str] = None,
                 continuing_subword_prefix: Optional[str] = None,
                 end_of_word_suffix: Optional[str] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(
                    vocab_file,
                    merges_file,
                    # The underlying model expects strings, so fall back to ""
                    # when no prefix/suffix is configured.
                    continuing_subword_prefix=continuing_subword_prefix or "",
                    end_of_word_suffix=end_of_word_suffix or ""))
        else:
            tokenizer = Tokenizer(BPE.empty())

        # Check for Unicode normalization first (before everything else)
        normalizers = []

        if unicode_normalizer:
            normalizers += [unicode_normalizer_from_str(unicode_normalizer)]

        if do_lowercase:
            normalizers += [Lowercase.new()]

        # Create the normalizer structure
        if len(normalizers) > 0:
            if len(normalizers) > 1:
                tokenizer.normalizer = Sequence.new(normalizers)
            else:
                tokenizer.normalizer = normalizers[0]

        # Byte-level pre-tokenization is paired with the byte-level decoder
        # so the byte-to-character mapping is reversed on decode.
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
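As above, a hedged usage sketch, assuming the enclosing class is exposed as ByteLevelBPETokenizer (class name and file paths are assumptions):

    # Hypothetical: class name and file paths assumed for illustration.
    tokenizer = ByteLevelBPETokenizer(vocab_file="vocab.json",
                                      merges_file="merges.txt",
                                      do_lowercase=True)
    encoding = tokenizer.encode("Hello, world!")
    print(encoding.tokens)  # byte-level pieces; a leading space shows up as the "Ġ" marker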