def get_recurrent_tokenizer(vocab,
                            max_context_tokens,
                            unk_token,
                            pad_token,
                            device="cpu"):
    """
    Return a tokenizer to be used with recurrent-based models
    """
    question_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    question_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    question_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    question_tokenizer.enable_padding(direction="right",
                                      pad_id=vocab[pad_token],
                                      pad_type_id=1,
                                      pad_token=pad_token)

    context_tokenizer = Tokenizer(WordLevel(vocab, unk_token=unk_token))
    context_tokenizer.normalizer = Sequence(
        [StripAccents(), Lowercase(), Strip()])
    context_tokenizer.pre_tokenizer = PreSequence(
        [Whitespace(), Punctuation()])
    context_tokenizer.enable_padding(
        direction="right",
        pad_id=vocab[pad_token],
        pad_type_id=1,
        pad_token=pad_token,
    )
    context_tokenizer.enable_truncation(max_context_tokens)

    return RecurrentSquadTokenizer(question_tokenizer,
                                   context_tokenizer,
                                   device=device)
Exemple #2
0
 def __init__(
     self,
     load_from: str = None,
     vocab_size: int = 10000,
     max_example_len: int = 128,
     batch_size: int = 16,
     num_stopwords: int = 250,
     mask_output_len: int = 4,
 ):
     self.char_dict: Dict[str, int] = {}
     self.char_rev: Dict[int, str] = {}
     self.token_dict: Dict[str, int] = {}
     self.token_rev: Dict[str, int] = {}
     self.vocab_size = vocab_size
     self.max_example_len = max_example_len
     self.batch_size = batch_size
     self.num_stopwords = num_stopwords
     self.mask_output_len = mask_output_len
     self.tokenizer_fit = False
     self.tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
     self.tokenizer.pre_tokenizer = Whitespace()
     self.tokenizer.normalizer = Sequence(
         [NFD(), Lowercase(), StripAccents()])
     self.tok_trainer = BpeTrainer(special_tokens=["[UNK]", "[MASK]"],
                                   vocab_size=self.vocab_size)
     if load_from:
         self._load(load_from)
Exemple #3
0
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided
    :param tokenizer_location: Json containing information about the tokenizer.
    :return:
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location, )
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train", )
        utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train]

        trainer = WordPieceTrainer(
            vocab_size = 2048, 
            special_tokens = token_utils.special_tokens.values()
        )

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"], ))
        custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer, )
        custom_tokenizer.enable_padding()

        # Write every dialogue to file
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
Exemple #4
0
def train_tokenizer(langs, dataset, vocab_size):
    """Train a tokenizer on given list of languages.
    Reserves a special token for each language which is
    [LANG] where LANG is the language tag. These are assigned
    to tokens 5, 6, ..., len(langs) + 4.
    """

    # Byte-pair encoding
    tokenizer = Tokenizer(BPE(unk_token='[UNK]'))

    # trainer
    lang_tokens = ['[' + lang + ']' for lang in langs]
    special_tokens = ['[MASK]', '[CLS]', '[SEP]', '[PAD]', '[UNK]'] + lang_tokens
    trainer = BpeTrainer(
        special_tokens=special_tokens,
        vocab_size=vocab_size)

    # normalise and pre tokenize
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), StripAccents()])
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
    tokenizer.decoder = decoders.ByteLevel()

    # create iterator and train
    iterator = _MultilingualIterator(dataset, langs)
    tokenizer.train_from_iterator(iterator, trainer)

    # post process start/end tokens
    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", tokenizer.token_to_id("[CLS]")),
            ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ], )
    return tokenizer
Exemple #5
0
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.mkdir('./Tokenizer')
    bert_tokenizer.save("Tokenizer/tokenizer.json")
def main() -> None:
    args = parse_args()

    special_tokens = list(SPECIAL_TOKENS)

    if args.reserved < len(special_tokens):
        raise AssertionError(
            f"number of reserved tokens should be more than number of f{len(special_tokens)}")
    for i in range(len(special_tokens), args.reserved):
        special_tokens.append(f"[unused{i:03d}]")

    all_filenames = get_all_filenames(args.input)
    # "C:\Users\demianmedich\data\wiki\20191120.en\pp_cased/"

    tokenizer = Tokenizer(get_model(args.model))
    tokenizer.normalizer = normalizers.Sequence([
        NFKC(), StripAccents(), Lowercase()
    ])

    tokenizer.pre_tokenizer = Whitespace()

    trainer = WordPieceTrainer(
        vocab_size=args.vocab_size,
        special_tokens=special_tokens)
    tokenizer.train(trainer, all_filenames)

    model_files = tokenizer.model.save()

    sys.exit(0)
Exemple #7
0
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    #print(df.head())
    #print(df.query_text.head())
    #print(df.query_text.to_list())
    #exit(0)
    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    #print(df.query_text.to_list())
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))
    #bert_tokenizer.save_model(directory=data_dir,name='tokenizer')
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    #df = df.groupby('session_id').agg({'query_text':' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)
    with open(token_file) as token_f:
        jdata = json.load(token_f)
        with open(vocab_file, "w") as fd:
            for k in jdata['model']['vocab'].keys():
                print(k, file=fd)
Exemple #8
0
    def normalizer(self, proto):
        normalizers = [Replace("``", '"'), Replace("''", '"')]
        if not self.original_tokenizer.keep_accents:
            normalizers.append(NFKD())
            normalizers.append(StripAccents())
        if self.original_tokenizer.do_lower_case:
            normalizers.append(Lowercase())

        precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
        normalizers.append(Precompiled(precompiled_charsmap))
        return Sequence(normalizers)
def tokenizer_pipeline():
    """
    specific pipeline for Cebuano Corpus tokenization 
    - Uses a Byte pair encoding (BPE) tokenizer
    """
    tokenizer = Tokenizer(BPE())

    # string normalization
    tokenizer.normalizer = Sequence([NFD(), StripAccents(), Lowercase()])
    tokenizer.pre_tokenizer = ByteLevel()
    tokenizer.decoder = ByteLevelDecoder()
    return tokenizer
Exemple #10
0
 def _prepare_pipeline(self):
     self.tokenizer.normalizer = normalizers.Sequence(
         [NFD(), Lowercase(), StripAccents()])
     self.tokenizer.pre_tokenizer = Whitespace()
     self.tokenizer.post_processor = TemplateProcessing(
         single="[CLS] $A [SEP]",
         pair="[CLS] $A [SEP] $B:1 [SEP]:1",
         special_tokens=[
             ("[CLS]", 1),
             ("[SEP]", 2),
         ],
     )
     self.tokenizer.enable_padding(
         pad_id=self.__class__.SPECIAL_TOKENS.index("[PAD]"),
         pad_token="[PAD]")
Exemple #11
0
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers

    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]",
                        "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(trainer, files)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir,
                                            "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir,
                                     "bert-tokenizer-kr.json"))
Exemple #12
0
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
Exemple #13
0
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer