Example #1
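
The listing picks up mid-script: whatever defines words and decoder_tokenizer_path is not shown. Judging from the calls below, it presumably relies on imports along these lines (an assumption, not part of the original):

from tokenizers import BertWordPieceTokenizer
from tokenizers.normalizers import BertNormalizer, Replace, Sequence
from tokenizers.pre_tokenizers import Whitespace
from transformers import PreTrainedTokenizerFast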

    # Start vocabulary with all standard special tokens. (PAD=0!)
    vocab = {}
    for special_token in ["[PAD]", "[CLS]", "[SEP]", "[UNK]", "[MASK]", "[BOS]", "[EOS]"]:
        vocab[special_token] = len(vocab)
    # Add other words - if not already present.
    for w in words:
        if w not in vocab:
            vocab[w] = len(vocab)
    print(vocab)

    # New tokenizer.
    init_tokenizer = BertWordPieceTokenizer(vocab=vocab) 
    init_tokenizer.normalizer = Sequence([Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
    init_tokenizer.pre_tokenizer = Whitespace()
    #init_tokenizer.pad_token_id = vocab["[PAD]"]
    #print("Created tokenizer: ", init_tokenizer)

    # Save the created tokenizer.
    init_tokenizer.save(decoder_tokenizer_path)
    print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]', 'cls_token': '[CLS]', 'sep_token': '[SEP]',
    'unk_token': '[UNK]', 'mask_token': '[MASK]', 'bos_token': '[BOS]', 'eos_token': '[EOS]'
    })

print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-"*50)
for k, v in tokenizer.get_vocab().items():
    print(k, v)
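
A quick round trip over a made-up string can confirm that the reloaded tokenizer and its special tokens behave as expected (illustrative only, not part of the original script):

# Sanity check on an arbitrary sample string (hypothetical input).
sample = "( a b )"
ids = tokenizer(sample)["input_ids"]
print(ids, "->", tokenizer.convert_ids_to_tokens(ids))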

Example #2

class MosesPreTokenizer:
    # Only pre_tokenize is shown in this listing; a possible constructor is
    # sketched after the training script below.
    def pre_tokenize(self, text):
        return self.moses_tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.do_lowercase else text))


if __name__ == '__main__':
    lang = 'fr'
    clean_text = False
    handle_chinese_chars = True
    strip_accents = False
    lowercase = True
    vocab_size = 30000
    min_frequency = 2
    spt = ["<s>", "<pad>", "</s>", "<unk>", "<mask>", "[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]"]
    if lang == "fr":
        train_data = "../.data/wmt19_de_fr/train.fr"
    elif lang == "en":
        train_data = "../.data/wmt19_en_de/train.en"
    else:
        raise ValueError("Undefined language {}".format(lang))

    tokenizer = BertWordPieceTokenizer(clean_text=clean_text, lowercase=lowercase,
                                       handle_chinese_chars=handle_chinese_chars, strip_accents=strip_accents)
    tokenizer.pre_tokenizer = MosesPreTokenizer(lang, lowercase)

    # Customize training
    print("Starting to train ...")
    tokenizer.train(files=train_data, vocab_size=vocab_size, show_progress=True, min_frequency=min_frequency, special_tokens=spt)
    # Save files to disk
    tokenizer.save_model(".", "moses-pre-tokenized-wmt-uncased-{}".format(lang))
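
Only the pre_tokenize method of MosesPreTokenizer survives in the listing above. A constructor consistent with MosesPreTokenizer(lang, lowercase) and the attributes the method touches could look like the sketch below, assuming sacremoses provides the Moses tokenizer and punctuation normalizer (an assumption; the original class may differ). Note also that recent versions of the tokenizers library expect a custom pre-tokenizer to be wrapped with tokenizers.pre_tokenizers.PreTokenizer.custom(...) and to operate on a PreTokenizedString, so the direct assignment tokenizer.pre_tokenizer = MosesPreTokenizer(lang, lowercase) may need adapting depending on the installed version.

from sacremoses import MosesPunctNormalizer, MosesTokenizer

class MosesPreTokenizer:
    # Hypothetical reconstruction: only pre_tokenize appears in the original listing.
    def __init__(self, lang, do_lowercase):
        self.moses_tokenizer = MosesTokenizer(lang=lang)  # Moses word tokenizer
        self.mpn = MosesPunctNormalizer(lang=lang)        # Moses punctuation normalizer
        self.do_lowercase = do_lowercase

    def pre_tokenize(self, text):
        # Normalize punctuation, optionally lowercase, then split with Moses.
        return self.moses_tokenizer.tokenize(
            self.mpn.normalize(text.lower() if self.do_lowercase else text))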