def get_recurrent_tokenizer(vocab,
                            max_context_tokens,
                            unk_token,
                            pad_token,
                            device="cpu"):
    """
    Build the question/context tokenizer pair used by recurrent-based models.

    Both tokenizers are word-level models over ``vocab`` sharing the same
    normalization, pre-tokenization and right-padding configuration. Only
    the context tokenizer additionally truncates to ``max_context_tokens``.
    The pair is returned wrapped in a ``RecurrentSquadTokenizer``.
    """

    def _word_level_tokenizer():
        # Word-level model -> strip accents / lowercase / trim ->
        # split on whitespace and punctuation -> pad on the right.
        tok = Tokenizer(WordLevel(vocab, unk_token=unk_token))
        tok.normalizer = Sequence([StripAccents(), Lowercase(), Strip()])
        tok.pre_tokenizer = PreSequence([Whitespace(), Punctuation()])
        tok.enable_padding(
            direction="right",
            pad_id=vocab[pad_token],
            pad_type_id=1,
            pad_token=pad_token,
        )
        return tok

    question_tokenizer = _word_level_tokenizer()

    context_tokenizer = _word_level_tokenizer()
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(
        question_tokenizer, context_tokenizer, device=device)
Exemple #2
0
    def __init__(self,
                 vocab_size=25000,
                 min_freq=5,
                 lang="en",
                 files=None) -> None:
        """
        Create a byte-level BPE tokenizer together with its trainer.

        Args:
            vocab_size: (int) vocabulary size handed to the BPE trainer
            min_freq: minimum frequency handed to the BPE trainer
            lang: language tag, stored on the instance as ``self.lang``
            files: (List[str]) ["vocab.json", "merge.txt"]; when omitted,
                ``None`` is passed for both the vocab and the merges file
        """
        super(BPETokenizer, self).__init__()

        # A list literal as default would be shared between calls, so the
        # "no files" default is materialised per call instead.
        if files is None:
            files = [None, None]

        self.tokenizer = Tokenizer(BPE(files[0], files[1]))

        self.lang = lang
        self.trainer = BpeTrainer(vocab_size=vocab_size,
                                  min_frequency=min_freq,
                                  special_tokens=["[PAD]", "[SEP]"],
                                  initial_alphabet=ByteLevel.alphabet())

        # https://huggingface.co/docs/tokenizers/python/latest/components.html#normalizers
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])
        # https://huggingface.co/docs/tokenizers/python/latest/components.html#pre-tokenizers
        self.tokenizer.pre_tokenizer = ByteLevel()
        self.tokenizer.decoder = ByteLevelDecoder()
def main() -> None:
    """Train a WordPiece tokenizer on the input files and save its model.

    The special-token list starts from ``SPECIAL_TOKENS`` and is padded with
    ``[unusedNNN]`` placeholders until it holds ``args.reserved`` entries.

    Raises:
        AssertionError: if ``args.reserved`` is smaller than the number of
            entries in ``SPECIAL_TOKENS``.
    """
    args = parse_args()

    special_tokens = list(SPECIAL_TOKENS)

    if args.reserved < len(special_tokens):
        raise AssertionError(
            f"number of reserved tokens ({args.reserved}) should be at least "
            f"the number of special tokens ({len(special_tokens)})")
    # Fill the remaining reserved slots with numbered placeholder tokens.
    for i in range(len(special_tokens), args.reserved):
        special_tokens.append(f"[unused{i:03d}]")

    all_filenames = get_all_filenames(args.input)

    tokenizer = Tokenizer(get_model(args.model))
    tokenizer.normalizer = normalizers.Sequence([
        NFKC(), StripAccents(), Lowercase()
    ])

    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=args.vocab_size,
        special_tokens=special_tokens)
    tokenizer.train(trainer, all_filenames)

    # NOTE(review): save() is called without a target directory - confirm
    # that the installed tokenizers version accepts this.
    tokenizer.model.save()

    sys.exit(0)
Exemple #4
0
 def __init__(
     self,
     load_from: str = None,
     vocab_size: int = 10000,
     max_example_len: int = 128,
     batch_size: int = 16,
     num_stopwords: int = 250,
     mask_output_len: int = 4,
 ):
     """Initialise empty lookup tables, settings and an untrained BPE tokenizer.

     The keyword arguments are stored on the instance unchanged. When
     ``load_from`` is truthy, ``self._load(load_from)`` is invoked after
     everything else has been set up.
     """
     # Settings, stored verbatim.
     self.vocab_size = vocab_size
     self.max_example_len = max_example_len
     self.batch_size = batch_size
     self.num_stopwords = num_stopwords
     self.mask_output_len = mask_output_len

     # Lookup tables start out empty.
     # NOTE(review): token_rev is annotated Dict[str, int] while char_rev is
     # Dict[int, str] - confirm which direction token_rev really maps.
     self.char_dict: Dict[str, int] = {}
     self.char_rev: Dict[int, str] = {}
     self.token_dict: Dict[str, int] = {}
     self.token_rev: Dict[str, int] = {}

     # BPE tokenizer with whitespace splitting and NFD/lowercase/accent
     # stripping; it has not been fitted yet.
     self.tokenizer_fit = False
     bpe_tokenizer = self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     bpe_tokenizer.pre_tokenizer = Whitespace()
     bpe_tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
     self.tok_trainer = BpeTrainer(
         special_tokens=["[UNK]", "[MASK]"],
         vocab_size=self.vocab_size,
     )

     if load_from:
         self._load(load_from)
Exemple #5
0
    def test_normalize(self):
        """A Lowercase normalizer set on the tokenizer lowercases the input."""
        bpe_tokenizer = Tokenizer(BPE())
        bpe_tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        bpe_tokenizer.normalizer = Lowercase()

        normalized = bpe_tokenizer.normalize("My Name Is John")
        assert normalized == "my name is john"
Exemple #6
0
    def __init__(self,
                 vocab_file: Optional[str] = None,
                 merges_file: Optional[str] = None,
                 unk_token: Optional[str] = "<unk>",
                 suffix: Optional[str] = "</w>",
                 dropout: Optional[float] = None):
        """Build a BPE tokenizer with NFKC + lowercase normalization.

        The model is loaded from ``vocab_file`` / ``merges_file`` when both
        are given; otherwise an empty BPE model is used. The configured
        tokenizer and a dict describing the settings are handed to the
        parent constructor.
        """
        # Both files are required to load a model; anything less means empty.
        if vocab_file is None or merges_file is None:
            bpe_model = BPE.empty()
        else:
            bpe_model = BPE.from_files(vocab_file,
                                       merges_file,
                                       dropout=dropout,
                                       unk_token=unk_token,
                                       end_of_word_suffix=suffix)
        tokenizer = Tokenizer(bpe_model)

        tokenizer.normalizer = Sequence.new([NFKC.new(), Lowercase.new()])
        tokenizer.pre_tokenizer = pre_tokenizers.Whitespace.new()
        tokenizer.decoder = decoders.BPEDecoder.new(suffix=suffix)

        parameters = dict(
            model="BPE",
            unk_token=unk_token,
            suffix=suffix,
            dropout=dropout,
        )

        super().__init__(tokenizer, parameters)
Exemple #7
0
def train_tokenizer(langs, dataset, vocab_size):
    """Train a byte-level BPE tokenizer on the given list of languages.

    Every language gets a reserved special token ``[LANG]`` (LANG being the
    language tag). These follow the five base special tokens in the
    trainer's list, i.e. positions 5, 6, ..., len(langs) + 4.

    Returns the trained tokenizer with a ``[CLS] ... [SEP]`` template
    post-processor attached.
    """
    # Base special tokens first, then one tag per language.
    base_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]']
    lang_tokens = ['[' + lang + ']' for lang in langs]

    # Byte-pair encoding model with byte-level pre-tokenization/decoding.
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))
    trainer = BpeTrainer(
        special_tokens=base_tokens + lang_tokens,
        vocab_size=vocab_size,
    )
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # Train from the multilingual iterator.
    tokenizer.train_from_iterator(
        _MultilingualIterator(dataset, langs), trainer)

    # Start/end tokens are wired up after training so their ids exist.
    cls_entry = ("[CLS]", tokenizer.token_to_id("[CLS]"))
    sep_entry = ("[SEP]", tokenizer.token_to_id("[SEP]"))
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[cls_entry, sep_entry],
    )
    return tokenizer
Exemple #8
0
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.

    A freshly trained tokenizer is also written to
    ``./daily_dialog/tokenizer.json`` (the directory is created if needed).

    :param tokenizer_location: Json containing information about the tokenizer.
    :return: a tokenizer with padding enabled
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location)
        tokenizer.enable_padding()
        return tokenizer

    import os  # local import: only the training path touches the filesystem

    dataset_train = datasets.load_dataset("daily_dialog", split="train")
    # One training string per dialogue: its utterances joined by the separator token.
    utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train]

    trainer = WordPieceTrainer(
        vocab_size=2048,
        # Materialise the dict view: the trainer expects a real list here.
        special_tokens=list(token_utils.special_tokens.values()),
    )

    custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"]))
    custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    custom_tokenizer.pre_tokenizer = Whitespace()
    custom_tokenizer.train_from_iterator(utterances, trainer)
    custom_tokenizer.enable_padding()

    # Persist the trained tokenizer; make sure the target directory exists first.
    location = './daily_dialog/'
    os.makedirs(location, exist_ok=True)
    custom_tokenizer.save(location + "tokenizer.json")

    return custom_tokenizer
Exemple #9
0
    def __init__(
        self,
        path_src,
        path_tgt,
        path_tokenizer,
        path_root: Optional[str] = '',
    ):
        """Load a BPE tokenizer and the paired source/target lines.

        Args:
            path_src: path of the source-side text file, relative to ``path_root``
            path_tgt: path of the target-side text file, relative to ``path_root``
            path_tokenizer: prefix under which ``vocab.json`` and
                ``merges.txt`` are found, relative to ``path_root``
            path_root: prefix prepended to every path; ``None`` is treated
                like the empty string
        """
        # The annotation allows None, but None cannot be concatenated with a str.
        path_root = path_root or ''

        self.path_src = path_root + path_src
        self.path_tgt = path_root + path_tgt
        self.len = 0
        self.max_len = 512

        self.tokenizer = Tokenizer(
            BPE(
                path_root + path_tokenizer + 'vocab.json',
                path_root + path_tokenizer + 'merges.txt',
            ))
        self.tokenizer.normalizer = Sequence([NFKC(), Lowercase()])

        # The files are only read, so open them read-only; 'r+' would
        # needlessly require write permission.
        with open(self.path_src, 'r') as f:
            lines_src = f.readlines()

        with open(self.path_tgt, 'r') as f:
            lines_tgt = f.readlines()

        # The length follows the source file; zip() stops at the shorter file.
        self.len = len(lines_src)
        self.example = list(zip(lines_src, lines_tgt))
Exemple #10
0
def train():
    """Train a BERT-style WordPiece tokenizer and save it to Tokenizer/tokenizer.json.

    The corpus is every ``*.txt`` file under
    ``$DATA_ROOT`` + ``MimicIII/Encounters/Text/``.

    Source: https://huggingface.co/docs/tokenizers/pipeline
    """

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    # Ids 1 and 2 match the positions of [CLS] and [SEP] in the trainer's
    # special_tokens list below.
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    # exist_ok lets the function be re-run; os.mkdir would raise
    # FileExistsError the second time.
    os.makedirs('./Tokenizer', exist_ok=True)
    bert_tokenizer.save("Tokenizer/tokenizer.json")
    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        merges: Optional[Union[str, Dict[Tuple[int, int], Tuple[int,
                                                                int]]]] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
        trim_offsets: bool = False,
    ):
        """Assemble a byte-level BPE tokenizer.

        A BPE model is built from ``vocab`` and ``merges`` when both are
        given, otherwise a default ``BPE()`` is used. Optional unicode
        normalization and lowercasing are applied in that order; byte-level
        pre-tokenizer, decoder and post-processor are always installed.
        """
        if vocab is None or merges is None:
            bpe_model = BPE()
        else:
            bpe_model = BPE(
                vocab,
                merges,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            )
        tokenizer = Tokenizer(bpe_model)

        # Unicode normalization comes before everything else.
        steps = []
        if unicode_normalizer:
            steps.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            steps.append(Lowercase())

        # A single step is installed directly; several are wrapped in a Sequence.
        if len(steps) > 1:
            tokenizer.normalizer = Sequence(steps)
        elif steps:
            tokenizer.normalizer = steps[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.ByteLevel(
            trim_offsets=trim_offsets)

        parameters = dict(
            model="ByteLevelBPE",
            add_prefix_space=add_prefix_space,
            lowercase=lowercase,
            dropout=dropout,
            unicode_normalizer=unicode_normalizer,
            continuing_subword_prefix=continuing_subword_prefix,
            end_of_word_suffix=end_of_word_suffix,
            trim_offsets=trim_offsets,
        )

        super().__init__(tokenizer, parameters)
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):
        """Build a word-level tokenizer from ``vocab_file``.

        Args:
            vocab_file: vocabulary handed to ``WordLevel``
            delimiter: split character; when falsy, whitespace splitting is used
            lowercase: when truthy, add a lowercasing normalizer
            unk_token: unknown token handed to ``WordLevel``
            eos_token: token used on both sides of the post-processor when
                ``add_double_eos`` is set
            add_eos: only recorded in the parameters passed to the parent
            add_double_eos: when truthy, install a ``BertProcessing``
                post-processor built from ``eos_token``
            normalization: optional unicode normalizer name

        Raises:
            ValueError: if the model cannot be built from ``vocab_file``;
                the underlying error is attached as ``__cause__``.
        """

        try:
            tokenizer = WordLevel(vocab_file, unk_token=unk_token)
            tokenizer = Tokenizer(tokenizer)
        except Exception as err:
            # Chain the original error so the real cause stays visible.
            raise ValueError(
                "Unable to parse file {}. Unknown format. "
                "If you tried to load a model saved through TransfoXLTokenizer, "
                "please note they are not compatible.".format(vocab_file)) from err

        # Create the correct normalization path
        normalizer = []

        # Include unicode normalization
        if normalization:
            normalizer += [unicode_normalizer_from_str(normalization)]

        # Include case normalization
        if lowercase:
            normalizer += [Lowercase()]

        # Strip normalizer at the end
        normalizer += [Strip(left=True, right=True)]

        # Strip is always present, so the list is never empty here.
        tokenizer.normalizer = Sequence(
            normalizer) if len(normalizer) > 1 else normalizer[0]

        # Setup the splitter
        tokenizer.pre_tokenizer = CharDelimiterSplit(
            delimiter) if delimiter else WhitespaceSplit()

        if add_double_eos:
            tokenizer.post_processor = BertProcessing(
                (eos_token, tokenizer.token_to_id(eos_token)),
                (eos_token, tokenizer.token_to_id(eos_token)))

        parameters = {
            "model": "TransfoXLModel",
            "add_eos": add_eos,
            "add_double_eos": add_double_eos,
            "unk_token": unk_token,
            "eos_token": eos_token,
            "delimiter": delimiter,
            "lowercase": lowercase,
        }

        super().__init__(tokenizer, parameters)
Exemple #13
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        merges_file: Optional[str] = None,
        add_prefix_space: bool = False,
        lowercase: bool = False,
        dropout: Optional[float] = None,
        unicode_normalizer: Optional[str] = None,
        continuing_subword_prefix: Optional[str] = None,
        end_of_word_suffix: Optional[str] = None,
    ):
        """Assemble a byte-level BPE tokenizer from optional vocab/merges files.

        Without both files an empty BPE model is used. Optional unicode
        normalization and lowercasing are applied in that order; the
        byte-level pre-tokenizer and decoder are always installed.
        """
        if vocab_file is None or merges_file is None:
            bpe_model = BPE.empty()
        else:
            bpe_model = BPE.from_files(
                vocab_file,
                merges_file,
                dropout=dropout,
                continuing_subword_prefix=continuing_subword_prefix or "",
                end_of_word_suffix=end_of_word_suffix or "",
            )
        tokenizer = Tokenizer(bpe_model)

        # Unicode normalization goes first, lowercasing second.
        steps = []
        if unicode_normalizer:
            steps.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            steps.append(Lowercase())

        # One step is used as-is; several are chained in a Sequence.
        if len(steps) > 1:
            tokenizer.normalizer = Sequence(steps)
        elif steps:
            tokenizer.normalizer = steps[0]

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()

        parameters = dict(
            model="ByteLevelBPE",
            add_prefix_space=add_prefix_space,
            lowercase=lowercase,
            dropout=dropout,
            unicode_normalizer=unicode_normalizer,
            continuing_subword_prefix=continuing_subword_prefix,
            end_of_word_suffix=end_of_word_suffix,
        )

        super().__init__(tokenizer, parameters)
Exemple #14
0
    def normalizer(self, proto):
        """Return the normalizer Sequence derived from the original tokenizer.

        Quote replacement always comes first, followed by optional accent
        stripping (NFKD + StripAccents), optional lowercasing, and finally
        the precompiled charsmap taken from ``proto.normalizer_spec``.
        """
        steps = [Replace("``", '"'), Replace("''", '"')]

        if not self.original_tokenizer.keep_accents:
            steps += [NFKD(), StripAccents()]

        if self.original_tokenizer.do_lower_case:
            steps += [Lowercase()]

        steps.append(Precompiled(proto.normalizer_spec.precompiled_charsmap))
        return Sequence(steps)
def tokenizer_pipeline():
    """
    Build the untrained tokenization pipeline for the Cebuano corpus.

    - Byte pair encoding (BPE) model
    - NFD -> accent stripping -> lowercasing normalization
    - byte-level pre-tokenizer and decoder
    """
    bpe_tokenizer = Tokenizer(BPE())

    normalization_steps = [NFD(), StripAccents(), Lowercase()]
    bpe_tokenizer.normalizer = Sequence(normalization_steps)

    bpe_tokenizer.pre_tokenizer = ByteLevel()
    bpe_tokenizer.decoder = ByteLevelDecoder()
    return bpe_tokenizer
Exemple #16
0
    def __init__(
        self,
        vocab_file: Optional[str] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        lowercase: bool = False,
        unicode_normalizer: Optional[str] = None,
    ):
        """Assemble a word-level tokenizer split on whitespace.

        The vocabulary is taken from ``vocab_file`` when given. The unk, pad
        and mask tokens are registered as special tokens only if the
        vocabulary already contains them. Optional unicode normalization and
        lowercasing are applied in that order.
        """
        if vocab_file is None:
            word_level = WordLevel(unk_token=unk_token)
        else:
            logging.info(f"Initiating tokenizer at {vocab_file}")
            word_level = WordLevel(vocab=vocab_file, unk_token=unk_token)
        tokenizer = Tokenizer(word_level)

        # Let the tokenizer know about special tokens if they are part of the vocab
        for special in (unk_token, pad_token, mask_token):
            if tokenizer.token_to_id(str(special)) is not None:
                tokenizer.add_special_tokens([str(special)])

        # Unicode normalization goes first, lowercasing second.
        steps = []
        if unicode_normalizer:
            steps.append(unicode_normalizer_from_str(unicode_normalizer))
        if lowercase:
            steps.append(Lowercase())

        # One step is used as-is; several are chained in a Sequence.
        if len(steps) > 1:
            tokenizer.normalizer = Sequence(steps)
        elif steps:
            tokenizer.normalizer = steps[0]

        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

        parameters = dict(
            model="WordLevel",
            unk_token=unk_token,
            pad_token=pad_token,
            mask_token=mask_token,
            lowercase=lowercase,
            unicode_normalizer=unicode_normalizer,
        )

        super().__init__(tokenizer, parameters)
Exemple #17
0
    def __init__(
        self,
        vocab_file,
        delimiter,
        lowercase,
        unk_token,
        eos_token,
        add_eos=False,
        add_double_eos=False,
        normalization: Optional[str] = None,
    ):
        """Build a word-level tokenizer from ``vocab_file``.

        Optional unicode normalization and lowercasing are applied in that
        order. Text is split on ``delimiter`` when it is truthy, on
        whitespace otherwise. With ``add_double_eos`` a ``BertProcessing``
        post-processor built from ``eos_token`` is installed.
        """

        tokenizer = Tokenizer(
            WordLevel.from_files(vocab_file, unk_token=unk_token))

        # Collect normalization steps: unicode first, then case.
        steps = []
        if normalization:
            steps.append(unicode_normalizer_from_str(normalization))
        if lowercase:
            steps.append(Lowercase())

        # One step is used as-is; several are chained in a Sequence.
        if len(steps) > 1:
            tokenizer.normalizer = Sequence(steps)
        elif steps:
            tokenizer.normalizer = steps[0]

        # Setup the splitter
        if delimiter:
            tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter)
        else:
            tokenizer.pre_tokenizer = WhitespaceSplit()

        if add_double_eos:
            tokenizer.post_processor = BertProcessing(
                (eos_token, tokenizer.token_to_id(eos_token)),
                (eos_token, tokenizer.token_to_id(eos_token)))

        parameters = dict(
            model="TransfoXLModel",
            add_eos=add_eos,
            add_double_eos=add_double_eos,
            unk_token=unk_token,
            eos_token=eos_token,
            delimiter=delimiter,
            lowercase=lowercase,
        )

        super().__init__(tokenizer, parameters)
Exemple #18
0
 def _prepare_pipeline(self):
     """Configure normalizer, pre-tokenizer, post-processor and padding on ``self.tokenizer``.

     The [CLS]/[SEP] ids are hard-coded to 1 and 2, while the pad id is
     looked up as the position of "[PAD]" in the class's SPECIAL_TOKENS.
     """
     tok = self.tokenizer

     tok.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
     tok.pre_tokenizer = Whitespace()

     template_tokens = [("[CLS]", 1), ("[SEP]", 2)]
     tok.post_processor = TemplateProcessing(
         single="[CLS] $A [SEP]",
         pair="[CLS] $A [SEP] $B:1 [SEP]:1",
         special_tokens=template_tokens,
     )

     pad_index = self.__class__.SPECIAL_TOKENS.index("[PAD]")
     tok.enable_padding(pad_id=pad_index, pad_token="[PAD]")
Exemple #19
0
def main(args):
    """Train a WordPiece tokenizer on files matching a glob pattern and save it.

    Reads ``args.text_raw_files_pattern`` (glob of training files) and
    ``args.output_dir`` (created if missing). The model files are written
    with the ``bert-tokenizer-kr`` prefix, reloaded with ``[UNK]`` as the
    unknown token, and the full tokenizer is then saved as
    ``bert-tokenizer-kr.json``.
    """
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.trainers import WordPieceTrainer

    model_name = "bert-tokenizer-kr"

    # No post-processor is attached here: no [CLS]/[SEP] template is applied.
    bert_tokenizer = Tokenizer(WordPiece())
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()

    # Note that "[SEP]" is not among the special tokens.
    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]", "[MASK]"],
    )
    training_files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(trainer, training_files)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir, model_name)
    # Reload the saved model so that it carries an explicit unknown token.
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    json_path = os.path.join(args.output_dir, "bert-tokenizer-kr.json")
    bert_tokenizer.save(json_path)
Exemple #20
0
    def configure_tokenizers(self, padding, truncation, max_length, lower):
        """Create and configure ``self.tokenizer``.

        ``padding`` must be one of True, "longest", "max_length", False or
        "do_not_pad"; only "max_length" pads to the fixed ``max_length``.
        ``truncation`` enables truncation to ``max_length`` and ``lower``
        adds lowercasing to the normalizers.

        Raises:
            ValueError: if ``padding`` is not one of the accepted values.
        """
        # Settings: validate the padding mode before building anything.
        if padding not in {True, "longest", "max_length", False, "do_not_pad"}:
            raise ValueError("Unknown padding type")
        pad_length = max_length if padding == "max_length" else None

        # SRC tokenizer
        normalizer_steps = [NFD(), Strip()]
        if lower:
            normalizer_steps.append(Lowercase())

        tok = self.tokenizer = Tokenizer(tok_model())  # unk_token=... not working
        tok.add_special_tokens(self.special_tokens)
        tok.pre_tokenizer = pre_tokenizers.Sequence([WhitespaceSplit()])
        tok.normalizer = normalizers.Sequence(normalizer_steps)
        tok.decoder = tok_decoder()

        # Template wrapping each sequence in the sos/eos tokens.
        sos, eos = self.SOS_WORD, self.EOS_WORD
        tok.post_processor = TemplateProcessing(
            single=f"{sos} $A {eos}",
            pair=f"{sos} $A {eos} {sos} $B {eos}",
            special_tokens=[
                (sos, tok.token_to_id(sos)),
                (eos, tok.token_to_id(eos)),
            ],
        )

        if padding:
            tok.enable_padding(
                pad_id=tok.token_to_id(self.PAD_WORD),
                pad_token=self.PAD_WORD,
                length=pad_length,
            )
        if truncation:
            tok.enable_truncation(max_length,
                                  stride=0,
                                  strategy='longest_first')
Exemple #21
0
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    """Train a BERT-style WordPiece tokenizer on ``sentences``.

    The tokenizer is saved to ``serialize_path`` when that path is
    non-empty, and returned in either case.
    """
    # Ids 1 and 2 match the positions of [CLS] and [SEP] in this list.
    special_tokens = ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

    wordpiece_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    wordpiece_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    wordpiece_tokenizer.pre_tokenizer = Whitespace()
    wordpiece_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )

    wordpiece_trainer = WordPieceTrainer(
        vocab_size=vocab_size, special_tokens=special_tokens)
    wordpiece_tokenizer.train_from_iterator(sentences, trainer=wordpiece_trainer)

    if serialize_path:
        wordpiece_tokenizer.save(serialize_path)
    return wordpiece_tokenizer
Exemple #22
0
def train_wordpiece_bert():
    """
    Train a BERT-style WordPiece tokenizer on the wikitext-103 raw splits.

    The test, train and valid files under ``DIR_DATA/wikitext-103`` are used
    for training; the result is saved as ``bert_wiki.json`` under
    ``DIR_TOKENIZERS`` and returned.

    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    # Ids 1 and 2 match the positions of [CLS] and [SEP] in the trainer's
    # special_tokens list.
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
    )
    bert_tokenizer.decoder = decoders.WordPiece()

    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

    files = []
    for split in ("test", "train", "valid"):
        files.append(
            DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % split)

    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
Exemple #23
0
    def __init__(self, tokenizer: PreTrainedTokenizerFast, cased: bool,
                 target_vocab_size: int):
        """
        Args:
            tokenizer: A Rust-based 🤗 Tokenizer
            cased: If False, ignore uppercases in corpus
            target_vocab_size: Size of augmented vocabulary

        Raises:
            ValueError: If :obj:`target_vocab_size` is smaller than or equal to the existing vocabulary size of :obj:`tokenizer`
            RuntimeError: If :obj:`tokenizer` uses an unsupported tokenization model
        """
        # The augmented vocabulary must be strictly larger than the current one.
        if target_vocab_size <= tokenizer.vocab_size:
            raise ValueError(
                "Ensure that `target_vocab_size` is larger than tokenizer's vocab size."
            )
        self.tokenizer = tokenizer
        self.cased = cased
        self.target_vocab_size = target_vocab_size
        self.model_cls: Type[
            BaseTokenizer] = tokenizer.backend_tokenizer.model.__class__

        # Instantiate a fresh rust tokenizer of the same model class
        rust_tokenizer = Tokenizer(self.model_cls())
        if not cased:
            rust_tokenizer.normalizer = Lowercase()
        rust_tokenizer.pre_tokenizer = Whitespace()
        self.rust_tokenizer = rust_tokenizer

        # Instantiate the appropriate Trainer based on `self.model_cls` (i.e. BPE, WordPiece, etc)
        trainer_cls = self.supported_trainers.get(self.model_cls)
        if trainer_cls is None:
            raise RuntimeError(f"{self.model_cls} is not supported")
        self.trainer = trainer_cls(
            vocab_size=self.target_vocab_size,
            special_tokens=list(self.tokenizer.special_tokens_map.values()),
        )
Exemple #24
0
    def test_can_make_sequences(self):
        """A Sequence of Lowercase and Strip set on a tokenizer applies both."""
        bpe_tokenizer = Tokenizer(BPE.empty())
        bpe_tokenizer.normalizer = Sequence([Lowercase(), Strip()])

        normalized = bpe_tokenizer.normalize("  HELLO  ")
        assert normalized == "hello"
Exemple #25
0
# Names are the unique mentions plus concepts, minus the stop words.
names = unique([mentions + list(concepts)], verbose=False) - stop_words
print(f"Unique names: {len(names)}\n")

# Map every name to its space-joined word form.
name_words = {n: " ".join(split_to_words(n)) for n in names}

# One word-form per line; this file is the BPE training corpus.
with open(f"{proc_path}/names.txt", "w") as f:
    f.write("\n".join(name_words.values()))

# Lowercasing byte-level BPE tokenizer.
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([Lowercase()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

trainer = BpeTrainer(vocab_size=int(vocab_size), show_progress=True)
tokenizer.train(trainer, [f"{proc_path}/names.txt"])

print(f"Trained vocab size: {tokenizer.get_vocab_size()}")

# Write the model files, then reload the model from them.
tokenizer.model.save(proc_path)
tokenizer.model = BPE.from_file(
    f'{proc_path}/vocab.json', f'{proc_path}/merges.txt')

# Keep the learned vocabulary available as a plain dict.
with open(f"{proc_path}/vocab.json", "r") as f:
    bpe_vocab = json.load(f)
Exemple #26
0
    def test_lowercase(self):
        """Lowercase.normalize_str lowercases an all-caps string."""
        lowercased = Lowercase().normalize_str("HELLO")
        assert lowercased == "hello"
Exemple #27
0
 def test_instantiate(self):
     """Lowercase is a Normalizer and keeps its type through a pickle round-trip."""
     fresh = Lowercase()
     assert isinstance(fresh, Normalizer)
     assert isinstance(fresh, Lowercase)

     restored = pickle.loads(pickle.dumps(Lowercase()))
     assert isinstance(restored, Lowercase)
Exemple #28
0
    def test_can_make_sequences(self):
        """A Sequence of Lowercase and Strip lowercases and trims a string."""
        lower_then_strip = Sequence([Lowercase(), Strip()])

        normalized = lower_then_strip.normalize_str("  HELLO  ")
        assert normalized == "hello"
Exemple #29
0
    def test_lowercase(self):
        """A Lowercase normalizer set on a tokenizer lowercases the input."""
        bpe_tokenizer = Tokenizer(BPE.empty())
        bpe_tokenizer.normalizer = Lowercase()

        normalized = bpe_tokenizer.normalize("HELLO")
        assert normalized == "hello"
from tokenizers.models import BPE, WordPiece, Unigram
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace, Digits, Sequence
from tokenizers.trainers import BpeTrainer, WordPieceTrainer, UnigramTrainer

# Input parquet file and the directory the trained tokenizers are saved to.
TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data: only the item_name column is read, de-duplicated into a list
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer: whitespace + digit pre-tokenization, lowercased input
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer: same pre-tokenization and normalization as above.
# Note that `tokenizer` and `trainer` are rebound here; the training and
# saving of this tokenizer are not part of the visible snippet.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000)