Example #1
0
    def converted(self) -> Tokenizer:
        ot = self.original_tokenizer
        vocab = ot.encoder
        merges = list(ot.bpe_ranks.keys())

        tokenizer = Tokenizer(
            BPE(
                vocab=vocab,
                merges=merges,
                dropout=None,
                continuing_subword_prefix="",
                end_of_word_suffix="",
                fuse_unk=False,
            ))

        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
            add_prefix_space=ot.add_prefix_space)
        tokenizer.decoder = decoders.ByteLevel()
        tokenizer.post_processor = processors.TemplateProcessing(
            single="[CLS]:0 $A:0 [SEP]:0",
            pair="[CLS]:0 $A:0 [SEP]:0 $B:0 [SEP]:0",
            special_tokens=[
                ("[CLS]",
                 self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
                ("[SEP]",
                 self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
            ],
        )

        return tokenizer
 def post_processor(self):
     eos = self.original_tokenizer.eos_token
     special_tokens = [
         (eos, self.original_tokenizer.eos_token_id),
     ]
     return processors.TemplateProcessing(single=["$A", eos],
                                          pair=["$A", "$B", eos],
                                          special_tokens=special_tokens)
Example #3
0
 def post_processor(self):
     return processors.TemplateProcessing(
         single=["$A", "</s>"],
         pair=["$A", "</s>", "$B", "</s>"],
         special_tokens=[
             ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
         ],
     )
Example #4
0
 def post_processor(self):
     return processors.TemplateProcessing(
         single="$A:0 <sep>:0 <cls>:2",
         pair="$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2",
         special_tokens=[
             ("<sep>", self.original_tokenizer.convert_tokens_to_ids("<sep>")),
             ("<cls>", self.original_tokenizer.convert_tokens_to_ids("<cls>")),
         ],
     )
Example #5
0
 def post_processor(self):
     return processors.TemplateProcessing(
         single="[CLS]:0 $A:0 [SEP]:0",
         pair="[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1",
         special_tokens=[
             ("[CLS]", self.original_tokenizer.convert_tokens_to_ids("[CLS]")),
             ("[SEP]", self.original_tokenizer.convert_tokens_to_ids("[SEP]")),
         ],
     )
Example #6
0
 def post_processor(self):
     return processors.TemplateProcessing(
         single="<s> $A </s>",
         pair="<s> $A </s> </s> $B </s>",
         special_tokens=[
             ("<s>", self.original_tokenizer.convert_tokens_to_ids("<s>")),
             ("</s>", self.original_tokenizer.convert_tokens_to_ids("</s>")),
         ],
     )
Example #7
0
 def set_tgt_lang_special_tokens(self, lang):
     self.cur_lang_code = self.convert_tokens_to_ids(lang)
     self.prefix_tokens = []
     self.suffix_tokens = [self.EOS, self.cur_lang_code]
     prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
     suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
     self._tokenizer.post_processor = processors.TemplateProcessing(
         single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
         pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
         special_tokens=list(
             zip(prefix_tokens_str + suffix_tokens_str,
                 self.prefix_tokens + self.suffix_tokens)),
     )
    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. Prefix [tgt_lang_code], suffix =[eos]."""
        self.cur_lang_code = self.convert_tokens_to_ids(lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )
    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, cur_lang_code]."""
        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)

        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
        )
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        # # Let the tokenizer know about special tokens if they are part of the vocab
        # if tokenizer.token_to_id(str(self.original_tokenizer.unk_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.unk_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.sep_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.sep_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.cls_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.cls_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.pad_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.pad_token)])
        # if tokenizer.token_to_id(str(self.original_tokenizer.mask_token)) is not None:
        #     tokenizer.add_special_tokens([str(self.original_tokenizer.mask_token)])

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=
            f"{cls}:2 $A:0 {sep}:0",  # token_type_id is 2 for Funnel transformer
            pair=f"{cls}:2 $A:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        question = str(self.original_tokenizer.question_token)
        dot = "."
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id
        question_token_id = self.original_tokenizer.question_token_id
        dot_token_id = self.original_tokenizer.convert_tokens_to_ids(".")

        if self.original_tokenizer.padding_side == "right":
            pair = f"{cls}:0 $A:0 {question} {dot} {sep}:0 $B:1 {sep}:1"
        else:
            pair = f"{cls}:0 $A:0 {sep}:0 $B:1 {question} {dot} {sep}:1"

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            pair=pair,
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
                (question, question_token_id),
                (dot, dot_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
Example #12
0
def train_tokenizer() -> Tuple[tokenizers.Tokenizer, Generator, int]:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="test")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000, special_tokens=["<unk>", "[SEP]", "<s>",
                                              "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                for sentence in sent_tokenizer(record['text']):
                    yield sentence

    data = tf.data.Dataset.from_generator(generator,
                                          output_signature=(tf.TensorSpec(
                                              shape=(None), dtype=tf.string)))
    data = data.map(tf.strings.strip,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return tokenizer, data
Example #13
0
def train_tokenizer() -> tokenizers.Tokenizer:
    tokenizer = tokenizers.Tokenizer(models.WordPiece(unk_token="<unk>"))
    tokenizer.decoder = decoders.WordPiece()

    tokenizer.normalizer = normalizers.Sequence([
        normalizers.NFD(),  # NFD unicode normalizer
        normalizers.Lowercase(),
        normalizers.StripAccents()
    ])
    tokenizer.pre_tokenizer = tokenizers.pre_tokenizers.Sequence([
        pre_tokenizers.Whitespace(),
        pre_tokenizers.Digits(individual_digits=False)
    ])
    tokenizer.post_processor = processors.TemplateProcessing(
        single="$A </s>",
        pair="$A </s> [SEP] <s> $B:1",
        special_tokens=[("[SEP]", 1), ("<s>", 2), ("</s>", 3)])

    # dataset = datasets.load_dataset("wikitext", "wikitext-103-raw-v1", split="train+test+validation")
    dataset = datasets.load_dataset("wikitext",
                                    "wikitext-103-raw-v1",
                                    split="validation")

    def batch_iterator(batch_size=1000):
        for i in range(0, len(dataset), batch_size):
            yield dataset[i:i + batch_size]["text"]

    tokenizer.train_from_iterator(
        batch_iterator(),
        trainer=trainers.WordPieceTrainer(
            vocab_size=10000, special_tokens=["<unk>", "[SEP]", "<s>",
                                              "</s>"]))

    def generator():
        for record in dataset:
            if record['text'].strip() != '':
                yield record['text']

    return tokenizer, generator
    def converted(self) -> Tokenizer:
        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        tokenize_chinese_chars = False
        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            tokenize_chinese_chars = self.original_tokenizer.basic_tokenizer.tokenize_chinese_chars
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=tokenize_chinese_chars,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            pair=
            f"{cls}:0 $A:0 {sep}:0 {sep}:0 $B:1 {sep}:1",  # MPNet uses two [SEP] tokens
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
Example #15
0
    def converted(self) -> Tokenizer:
        from .models.roformer.tokenization_utils import JiebaPreTokenizer

        vocab = self.original_tokenizer.vocab
        tokenizer = Tokenizer(
            WordPiece(vocab, unk_token=str(self.original_tokenizer.unk_token)))

        strip_accents = False
        do_lower_case = False
        if hasattr(self.original_tokenizer, "basic_tokenizer"):
            strip_accents = self.original_tokenizer.basic_tokenizer.strip_accents
            do_lower_case = self.original_tokenizer.basic_tokenizer.do_lower_case

        tokenizer.normalizer = normalizers.BertNormalizer(
            clean_text=True,
            handle_chinese_chars=False,
            strip_accents=strip_accents,
            lowercase=do_lower_case,
        )
        tokenizer.pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
            JiebaPreTokenizer(vocab))

        cls = str(self.original_tokenizer.cls_token)
        sep = str(self.original_tokenizer.sep_token)
        cls_token_id = self.original_tokenizer.cls_token_id
        sep_token_id = self.original_tokenizer.sep_token_id

        tokenizer.post_processor = processors.TemplateProcessing(
            single=f"{cls}:0 $A:0 {sep}:0",
            pair=f"{cls}:0 $A:0 {sep}:0 $B:1 {sep}:1",
            special_tokens=[
                (cls, cls_token_id),
                (sep, sep_token_id),
            ],
        )
        tokenizer.decoder = decoders.WordPiece(prefix="##")

        return tokenizer
Example #16
0
def train_custom_tokenizer(dataset,
                           token_model,
                           tknzr_file,
                           vocab_size,
                           vocab=None,
                           pretrain_fast=False,
                           max_input_chars_per_word=None,
                           eos_token=None,
                           bos_token=None,
                           pad_token=None,
                           mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using HuggingFace library. The pipeline seems to be:

        - Model           : algorithm that tokenizes, it is a mandatory
                            component. There are only 4 models implemented
                            (BPE, Unigram, WordLevel, WordPiece)
        - Normalizer      : some preprocessing that could happen before, but
                            doesn't necessarily have to
        - Pre-Tokenizer   : splitting the input according to some rules
        - Post-Processing : needing to add some tokens/input after (mostly seems
                            to be eos, bos tokens)
        - Decoder         : certain previous pipeline steps need to be reversed
                            for proper decoding
        - Trainer         : The corresponding training algorithm for the model

    Note : Some pre-processing might need to happen beforehand in previous
            functions (might be easier using pandas before)

    Input
        token_model (str)        : algorithm to use for tokenization
        dataset (class)          : a python iterator that goes through the data
                                    to be used for training
        token_dir (str)          : directory with tokenizers
        vocab_size (int)         : size of the vocabulary to use
        tokenFilename (str)     : filename of particular token we want to
                                    train. Will overwrite previously save files.
        vocab (list of str)      : models other than BPE can use non-mandatory
                                    vocab as input
        max_input_chars_per_word : used for WordPiece

    Output
        tokenizer                : huggingFace Tokenizer object, our fully
                                    trainer tokenizer

    """
    special_token_lst = [
        pad_token, bos_token, eos_token, mask_token, unk_token
    ]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token,
                          vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in%s' \
                    % VALID_TOKENIZATIONS
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size,
                          show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence(
        [fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
        [fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        #  pair=bos_token+" $A "+eos_token" $B:1 "+eos_token+":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer,
                                  iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning : overwriting previously save tokenizer with\
                        same filename ( {tknzr_file} ).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token

    return tokenizer