Code example #1
 def __init__(self):
     self.tokenizer = Tokenizer(BPE())
     self.tokenizer.normalizer = Sequence([
         NFKC()
     ])
     self.tokenizer.pre_tokenizer = ByteLevel()
     self.tokenizer.decoder = ByteLevelDecoder()
Code example #2
File: sentencepiece_bpe.py  Project: wmcai/tokenizers
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: str = "<unk>",
                 replacement: str = "▁",
                 add_prefix_space: bool = True,
                 dropout: Optional[float] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.add_special_tokens([unk_token])

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(
            replacement=replacement, add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.Metaspace.new(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Code example #3
    def __init__(
        self,
        path_src,
        path_tgt,
        path_tokenizer,
        path_root: Optional[str] = '',
    ):
        self.path_src = path_root + path_src
        self.path_tgt = path_root + path_tgt
        self.len = 0
        self.max_len = 512

        self.tokenizer = Tokenizer(
            BPE(
                path_root + path_tokenizer + 'vocab.json',
                path_root + path_tokenizer + 'merges.txt',
            ))
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

        with open(self.path_src, 'r+') as f:
            lines_src = f.readlines()

        with open(self.path_tgt, 'r+') as f:
            lines_tgt = f.readlines()

        self.len = len(lines_src)
        self.example = list(zip(lines_src, lines_tgt))
Code example #4
def get_tokenizer(args):

    tokenizer = Tokenizer(models.BPE())
    tokenizer.normalizer = Sequence(
        [NFKC(), Replace('\r', ''),
         Replace('\n', ' ')])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    if os.path.isdir(args.tokenizer_dir):
        vocab_fn = os.path.join(args.tokenizer_dir, 'vocab.json')
        merge_fn = os.path.join(args.tokenizer_dir, 'merges.txt')
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
    else:
        os.makedirs(args.tokenizer_dir)
        trainer = trainers.BpeTrainer(
            vocab_size=args.vocab_size,
            special_tokens=["[UNK]", "[PAD]", "[BOS]", "[EOS]"])
        files = [
            os.path.join(args.data_dir, split)
            for split in ['train.json', 'val.json', 'test.json']
        ]
        tokenizer.train(files=files, trainer=trainer)
        tokenizer.model.save(args.tokenizer_dir)

    return tokenizer
Code example #5
    def __init__(self,
                 vocab_size=25000,
                 min_freq=5,
                 lang="en",
                 files=[None, None]) -> None:
        """

        Args:
            vocab_size: (int)
            min_freq: minimum frequency
            lang: (str) language code, e.g. "en"
            files: (List[str]) ["vocab.json", "merge.txt"]
        """
        super(BPETokenizer, self).__init__()

        self.tokenizer = Tokenizer(BPE(files[0], files[1]))

        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())

        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()
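The snippet above only shows the constructor, so here is a minimal usage sketch; the corpus path is a placeholder, and driving the wrapped tokenizer and trainer directly through the attributes set in __init__ is an assumption about how the class is meant to be used:

# Hypothetical usage; "corpus.txt" stands in for real training files.
bpe = BPETokenizer(vocab_size=25000, min_freq=5)
bpe.tokenizer.train(files=["corpus.txt"], trainer=bpe.trainer)  # train the underlying tokenizers.Tokenizer
print(bpe.tokenizer.encode("hello world").tokens)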
Code example #6
File: train_tokenizer.py  Project: yingnengd/gpt-neox
def train_tokenizer(input_dir: str,
                    save_path: str,
                    tokenizer_type: str = "BPE",
                    vocab_size: int = 52000):
    """
    Trains a tokenizer on all the json files in `input_dir` and saves it to `save_path`

    :param input_dir: input directory containing jsonl files
    :param save_path: path to save tokenizer to
    :param tokenizer_type: type of tokenizer to train.
    :param vocab_size: int, size of tokenizer's vocab
    :return:
    """

    if tokenizer_type == "BPE":
        model = models.BPE()
    else:
        raise NotImplementedError(
            f'Tokenizer type {tokenizer_type} not implemented')
    tokenizer = Tokenizer(model)

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    tokenizer.normalizer = NFKC()

    # And then train
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size, special_tokens=["<|endoftext|>", "<|padding|>"])
    tokenizer.train_from_iterator(json_iterator(input_dir), trainer)

    # And Save it
    tokenizer.save(save_path, pretty=True)
    print(f'Tokenizer saved at {save_path}')
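As a usage sketch (directory and file names are placeholders), the function can be called and the resulting JSON file reloaded with the standard tokenizers API:

# Hypothetical paths; input_dir must contain the jsonl files consumed by json_iterator.
train_tokenizer("data/", "tokenizer.json", tokenizer_type="BPE", vocab_size=52000)

from tokenizers import Tokenizer
tok = Tokenizer.from_file("tokenizer.json")  # reload the file written by tokenizer.save()
print(tok.encode("Hello world!").tokens)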
Code example #7
    def __init__(
        self,
        vocab: Optional[str] = None,
        replacement: str = "▁",
        add_prefix_space: bool = True,
    ):
        if vocab is not None:
            tokenizer = Tokenizer(Unigram(vocab))
        else:
            tokenizer = Tokenizer(Unigram())

        tokenizer.normalizer = NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
            pre_tokenizers.WhitespaceSplit(),
            pre_tokenizers.Metaspace(replacement=replacement,
                                     add_prefix_space=add_prefix_space),
        ])
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceUnigram",
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
Code example #8
File: sentencepiece_bpe.py  Project: bhieu79/KC-4.0
    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int,
                                                                int]]]] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
    ):
        if vocab is not None and merges is not None:
            tokenizer = Tokenizer(
                BPE(vocab, merges, dropout=dropout, unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE())

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        tokenizer.normalizer = NFKC()
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement, add_prefix_space=add_prefix_space)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Code example #9
File: bpe.py  Project: zhongqian71400/tokenizers
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: Optional[str] = "<unk>",
                 suffix: Optional[str] = "</w>",
                 dropout: Optional[float] = None):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE.from_files(vocab_file,
                               merges_file,
                               dropout=dropout,
                               unk_token=unk_token,
                               end_of_word_suffix=suffix))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

        parameters = {
            "model": "BPE",
            "unk_token": unk_token,
            "suffix": suffix,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Code example #10
def main() -> None:
    args = parse_args()

    special_tokens = list(SPECIAL_TOKENS)

    if args.reserved < len(special_tokens):
        raise AssertionError(
            f"number of reserved tokens should be at least {len(special_tokens)} (the number of special tokens)")
    for i in range(len(special_tokens), args.reserved):
        special_tokens.append(f"[unused{i:03d}]")

    all_filenames = get_all_filenames(args.input)
    # "C:\Users\demianmedich\data\wiki\20191120.en\pp_cased/"

    tokenizer = Tokenizer(get_model(args.model))
    tokenizer.normalizer = normalizers.Sequence([
        NFKC(), StripAccents(), Lowercase()
    ])

    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=args.vocab_size,
        special_tokens=special_tokens)
    tokenizer.train(trainer, all_filenames)

    model_files = tokenizer.model.save()

    sys.exit(0)
Code example #11
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        no_consecutive_space: bool = True,
        dropout: Optional[float] = None,
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        separate_numbers: bool = True,
        strip_accents: bool = True,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
        special_chars: str = SPECIAL_CHARS,
        zh_norm: bool = True,
    ):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(
                BPE(vocab_file,
                    merges_file,
                    dropout=dropout,
                    unk_token=unk_token))
        else:
            tokenizer = Tokenizer(BPE())

        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])

        tokenizer.normalizer = Sequence([
            NFKC(),
            BertNormalizer(clean_text=clean_text,
                           handle_chinese_chars=handle_chinese_chars,
                           separate_numbers=separate_numbers,
                           strip_accents=strip_accents,
                           lowercase=lowercase,
                           special_chars=special_chars,
                           zh_norm=zh_norm)
        ])
        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
            no_consecutive_space=no_consecutive_space)
        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
            no_consecutive_space=no_consecutive_space)

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "no_consecutive_space": no_consecutive_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
Code example #12
File: train_tokenizer.py  Project: shawwn/lm
def setup_tokenizer(_):
    # Initialize a tokenizer
    tokenizer = Tokenizer(models.BPE())

    # Customize pre-tokenization and decoding
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
    tokenizer.decoder = decoders.ByteLevel()
    tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
    normalizers = [NFKC()]
    tokenizer.normalizer = Sequence(normalizers)
    return tokenizer
Code example #13
    def load_or_train_tokenizer(file_paths, tokenizer_mode_path):
        '''
        Tries to load a saved text tokenizer.
        If there is none, trains a new tokenizer and saves it.
        '''

        if not os.path.exists(tokenizer_mode_path):
            print('Tokenizer model not found, training one')

            from tokenizers.models import BPE
            from tokenizers import Tokenizer
            from tokenizers.decoders import ByteLevel as ByteLevelDecoder
            from tokenizers.normalizers import NFKC, Sequence
            from tokenizers.pre_tokenizers import ByteLevel
            from tokenizers.trainers import BpeTrainer

            tokenizer = Tokenizer(BPE())
            tokenizer.normalizer = Sequence([
                NFKC()
            ])
            tokenizer.pre_tokenizer = ByteLevel()
            tokenizer.decoder = ByteLevelDecoder()

            trainer = BpeTrainer(
                vocab_size=50000,
                show_progress=True,
                initial_alphabet=ByteLevel.alphabet(),
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>"
                ]
            )
            tokenizer.train(file_paths, trainer)

            if not os.path.exists(tokenizer_mode_path):
                os.makedirs(tokenizer_mode_path)
            tokenizer.model.save(tokenizer_mode_path, None)

        print('Loading trained tokenizer model')

        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_mode_path)
        tokenizer.add_special_tokens({
            'eos_token': '</s>',
            'bos_token': '<s>',
            'unk_token': '<unk>',
            'pad_token': '<pad>',
            'mask_token': '<mask>'
        })

        return tokenizer
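A usage sketch for the helper above; the paths are placeholders, and GPT2Tokenizer is assumed to be imported from the transformers package at module level:

# Hypothetical paths; the tokenizer directory is created and populated on the first call.
tokenizer = load_or_train_tokenizer(["data/train.txt"], "models/bpe-tokenizer")
ids = tokenizer.encode("Hello world!")
print(tokenizer.decode(ids))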
Code example #14
    def get_tokenizer(self, tokenizer_dir):

        tokenizer = Tokenizer(models.BPE())
        tokenizer.normalizer = Sequence(
            [NFKC(), Replace('\r', ''),
             Replace('\n', ' ')])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
        tokenizer.decoder = decoders.ByteLevel()

        vocab_fn = os.path.join(tokenizer_dir, 'vocab.json')
        merge_fn = os.path.join(tokenizer_dir, 'merges.txt')
        tokenizer.model = models.BPE.from_file(vocab_fn, merge_fn)
        tokenizer.add_special_tokens(['[UNK]', '[PAD]', '[BOS]', '[EOS]'])
        return tokenizer
Code example #15
File: tokenizer.py  Project: peternara/pororo-nlp
    def __init__(
        self,
        vocab: Union[str, List],
        merges: Union[str, None],
        unk_token: str = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        normalize: bool = True,
    ):
        if merges:
            n_model = "BPE"
            tokenizer = Tokenizer(
                BPE(
                    vocab,  # type: ignore
                    merges,
                    unk_token=unk_token,
                    fuse_unk=True,
                ))
        else:
            n_model = "Unigram"
            tokenizer = Tokenizer(Unigram(vocab, 1))  # type: ignore

        if normalize:
            tokenizer.normalizer = NFKC()

        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        parameters = {
            "model": f"SentencePiece{n_model}",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }
        super().__init__(tokenizer, parameters)
Code example #16
    def __init__(self,
                 vocab_file: Optional[str]=None,
                 merges_file: Optional[str]=None,
                 add_prefix_space: bool=False):
        if vocab_file is not None and merges_file is not None:
            tokenizer = Tokenizer(BPE.from_files(vocab_file, merges_file))
        else:
            tokenizer = Tokenizer(BPE.empty())

        tokenizer.normalizer = NFKC.new()
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel.new()

        parameters = {
            "model": "ByteLevelBPE",
            "add_prefix_space": add_prefix_space,
        }

        super().__init__(tokenizer, parameters)
Code example #17
def main(args):
    if args.do_train:
        # Initialize a tokenizer
        files = get_smi_files(args.training_files)
        print("Training BPE tokenizer using the following files:{}".format(
            files))
        tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
        tokenizer.enable_padding(pad_id=args.vocab_size + 2,
                                 pad_token="<pad>",
                                 length=args.pad_len)
        tokenizer.enable_truncation(max_length=args.pad_len,
                                    strategy='only_first')
        tokenizer.normalizer = Sequence([NFKC()])
        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=False)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
        # Train the tokenizer
        trainer = trainers.BpeTrainer(show_progress=True,
                                      vocab_size=args.vocab_size,
                                      min_frequency=args.min_frequency)
        tokenizer.train(files, trainer=trainer)
        tokenizer.add_tokens(["<start>", "<end>"])
        tokenizer.save(os.path.join('tokenizers', args.tokenizer_name),
                       pretty=True)
        print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

    if args.do_test:
        # Test the tokenizer
        tokenizer = Tokenizer.from_file(
            os.path.join('tokenizers', args.tokenizer_name))
        print("Testing with SMILES String: {}".format(args.test_string))
        encoding = tokenizer.encode(args.test_string)
        print("Encoded string: {}".format(encoding.tokens))
        print(encoding.ids)
        decoded = tokenizer.decode(encoding.ids)
        print("Decoded string: {}".format(decoded))
Code example #18
 def prepare_tokenizer(self):
     tokenizer = Tokenizer(WordPiece(unk_token='[UNK]'))
     tokenizer.pre_tokenizer = Whitespace()
     tokenizer.normalizer = Sequence([Lowercase(), NFKC()])
     return tokenizer
Code example #19
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

path_data = "../../ml-datasets/wmt14/tokenizer/"

path_train_src = "../../ml-datasets/wmt14/train.en"
path_train_tgt = "../../ml-datasets/wmt14/train.de"

tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([
    NFKC(),
    Lowercase()
])

tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=25000, show_progress=True, initial_alphabet=ByteLevel.alphabet(),
                     min_frequency=2, special_tokens=["<pad>", "<s>", "</s>", "<unk>", "<mask>", ])
tokenizer.train(trainer, [path_train_src, path_train_tgt])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.model.save(path_data)
Code example #20
 def prepare_tokenizer(self):
     tokenizer = Tokenizer(Unigram())
     tokenizer.normalizer = Sequence([Lowercase(), NFKC()])
     return tokenizer
Code example #21
File: train_tokenizer.py  Project: 7exe/gpt-neo
                  for s in g:
                      f.write(s)
                      f.write("\n\n")
          elif args.file_type == 'txt':
              shutil.copyfile(str(arch), str(fp))

  data_files = glob(str(out_path / "*.txt"))
  data_files = random.sample(data_files, int(0.2 * len(data_files)))

  assert len(data_files) > 0, 'No data files found'

  # Initialize a tokenizer
  tokenizer = Tokenizer(models.BPE())

  # Customize pre-tokenization and decoding
  tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=True)
  tokenizer.decoder = decoders.ByteLevel()
  tokenizer.post_processor = processors.ByteLevel(trim_offsets=True)
  tokenizer.normalizer = NFKC()

  # And then train
  trainer = trainers.BpeTrainer(vocab_size=args.vocab_size, min_frequency=2, special_tokens=["<|endoftext|>", "<|padding|>"])
  tokenizer.train(trainer, data_files)

  # And Save it
  tokenizer_path = out_path / "byte-level-bpe.tokenizer.json"
  tokenizer.save(str(tokenizer_path), pretty=True)

  print(f'tokenizer saved at {str(tokenizer_path)}')
  return tokenizer_path
Code example #22
 def prepare_tokenizer(self):
     tokenizer = Tokenizer(BPE(unk_token="<unk>"))
     tokenizer.normalizer = Sequence([Lowercase(), NFKC()])
     tokenizer.pre_tokenizer = ByteLevelPreTokenizer()
     tokenizer.decoder = ByteLevelDecoder()
     return tokenizer
Code example #23
# This snippet walks through the overall pipeline shared by various well-known tokenization algorithms.
# Everything described below can be replaced by the ByteLevelBPETokenizer class.

from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase, NFKC, Sequence
from tokenizers.pre_tokenizers import ByteLevel

# First we create an empty Byte-Pair Encoding model (i.e. not trained model)
tokenizer = Tokenizer(BPE())

# Then we enable lower-casing and unicode-normalization
# The Sequence normalizer allows us to combine multiple Normalizers that will be
# executed in order.
tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

# Our tokenizer also needs a pre-tokenizer responsible for converting the input
# to a ByteLevel representation.
tokenizer.pre_tokenizer = ByteLevel()

# And finally, let's plug a decoder so we can recover from a tokenized input
# to the original one
tokenizer.decoder = ByteLevelDecoder()

from tokenizers.trainers import BpeTrainer

# We initialize our trainer, giving it the details about the vocabulary we want
# to generate
trainer = BpeTrainer(vocab_size=25000,
                     show_progress=True)
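A minimal continuation sketch, assuming a placeholder corpus path, that trains the tokenizer and checks an encode/decode round trip through the pipeline configured above:

# Hypothetical corpus path; any plain-text files work here.
tokenizer.train(files=["path/to/corpus.txt"], trainer=trainer)

encoding = tokenizer.encode("A sample sentence to round-trip.")
print(encoding.tokens)
print(tokenizer.decode(encoding.ids))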
Code example #24
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 unk_token: Union[str, AddedToken] = "<unk>",
                 sep_token: Union[str, AddedToken] = "</s>",
                 cls_token: Union[str, AddedToken] = "<s>",
                 nl_token: Union[str, AddedToken] = "<nl>",
                 pad_token: Union[str, AddedToken] = "<pad>",
                 mask_token: Union[str, AddedToken] = "<mask>",
                 clean_text: bool = True,
                 handle_chinese_chars: bool = True,
                 separate_numbers: bool = True,
                 strip_accents: bool = True,
                 lowercase: bool = True,
                 wordpieces_prefix: str = "##",
                 special_chars: str = SPECIAL_CHARS,
                 zh_norm: bool = True,
                 handle_simpl: bool = True,
                 do_postprocess: bool = False):

        if vocab_file is not None:
            tokenizer = Tokenizer(
                WordPiece(vocab_file, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece())

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(nl_token)) is not None:
            tokenizer.add_special_tokens([str(nl_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        tokenizer.normalizer = Sequence([
            NFKC(),
            BertNormalizer(clean_text=clean_text,
                           handle_chinese_chars=handle_chinese_chars,
                           separate_numbers=separate_numbers,
                           strip_accents=strip_accents,
                           lowercase=lowercase,
                           special_chars=special_chars,
                           zh_norm=zh_norm,
                           handle_simpl=handle_simpl)
        ])
        tokenizer.pre_tokenizer = BertPreTokenizer()

        if vocab_file is not None and do_postprocess:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")
            tokenizer.post_processor = BertProcessing(
                (str(sep_token), sep_token_id), (str(cls_token), cls_token_id))

        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "nl_token": nl_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "separate_numbers": separate_numbers,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "special_chars": special_chars,
            "zh_norm": zh_norm,
            "handle_simpl": handle_simpl,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)
Code example #25
    def __init__(
        self,
        vocab: Union[str, List],
        merges: List[Tuple[str, str]],
        bos_token: str = "<s>",
        eos_token: str = "</s>",
        sep_token: str = "</s>",
        cls_token: str = "<s>",
        pad_token: str = "<pad>",
        unk_token: str = "<unk>",
        replacement: str = "▁",
        add_prefix_space: bool = True,
        dropout: Optional[float] = None,
        normalize: bool = True,
    ):
        bpe = BPE(
            vocab=vocab,
            merges=merges,
            unk_token=unk_token,
            fuse_unk=True,
        )

        tokenizer = Tokenizer(bpe)

        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        tokenizer.decoder = decoders.Metaspace(
            replacement=replacement,
            add_prefix_space=add_prefix_space,
        )

        if normalize:
            tokenizer.normalizer = NFKC()

        parameters = {
            "model": "SentencePieceBPE",
            "unk_token": unk_token,
            "replacement": replacement,
            "add_prefix_space": add_prefix_space,
            "dropout": dropout,
        }

        super().__init__(tokenizer, parameters)
        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False)
        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False)
        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False)
        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False)
        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False)
        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False)

        self.add_special_tokens([
            bos_token,
            eos_token,
            sep_token,
            cls_token,
            unk_token,
            pad_token,
        ])