def from_corpus(cls, corpus, corpus_save_path, tokenizer_save_path, tokenizer_name,
                vocab_size, min_frequency, strip_accents, clean_text, lowercase):
    with open(corpus_save_path, 'wb') as f:
        f.write('\n'.join(corpus).encode())

    tokenizer = BertWordPieceTokenizer(
        strip_accents=strip_accents,
        clean_text=clean_text,
        lowercase=lowercase,
    )
    tokenizer.train(
        [corpus_save_path],
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        wordpieces_prefix="##",
    )

    if os.path.exists(tokenizer_save_path):
        shutil.rmtree(tokenizer_save_path)
    os.mkdir(tokenizer_save_path)
    tokenizer.save_model(tokenizer_save_path, tokenizer_name)

    vocab_path = os.path.join(tokenizer_save_path, f'{tokenizer_name}-vocab.txt')
    return cls(vocab_path, strip_accents, clean_text, lowercase)
def train_tokenizer(captions):
    print('Create training file...')
    # Flatten the nested caption lists into a single list of samples
    train_samples = [sample for samples in captions for sample in samples]
    with open('train_tokenizer.txt', 'a') as f:
        for sample in train_samples:
            f.write(sample)

    # init
    bwpt = BertWordPieceTokenizer(vocab_file=None,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  cls_token='[CLS]',
                                  clean_text=True,
                                  handle_chinese_chars=True,
                                  strip_accents=True,
                                  lowercase=True,
                                  wordpieces_prefix='##')

    print('Tokenizer training...')
    bwpt.train(files=['train_tokenizer.txt'],
               vocab_size=30000,
               min_frequency=5,
               limit_alphabet=1000,
               special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])
    bwpt.save('.', 'captions')

    # initialization of a trained tokenizer
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    print('Tokenizer is ready to use...')
    return tokenizer
def _wordpiece(self):
    tokenizer = BertWordPieceTokenizer(
        vocab=self.conf.vocab,
        unk_token=self.conf.unk_token,
        sep_token=self.conf.sep_token,
        cls_token=self.conf.cls_token,
        pad_token=self.conf.pad_token,
        mask_token=self.conf.mask_token,
        clean_text=self.conf.clean_text,
        handle_chinese_chars=self.conf.handle_chinese_chars,
        strip_accents=self.conf.strip_accents,
        lowercase=self.conf.lowercase,
        wordpieces_prefix=self.conf.wordpieces_prefix,
    )
    tokenizer.train(
        files=self.files,
        vocab_size=self.conf.vocab_size,
        min_frequency=self.conf.min_frequency,
        limit_alphabet=self.conf.limit_alphabet,
        initial_alphabet=self.conf.initial_alphabet,
        special_tokens=self.conf.word_piece_special_tokens,
        wordpieces_prefix=self.conf.wordpieces_prefix,
    )
    return tokenizer
def train_bert_tokenizer(dataset_base_path: str, target_path: str, tokenizer_name: str,
                         files_pattern: str = '**/*', vocab_size: int = 30000,
                         lower_case: bool = False):
    """
    Trains a BERT WordPiece Tokenizer based on data located in dataset_base_path.
    By default it reads all files in dataset_base_path. One can specify `files_pattern`
    for filtering. The files generated by the tokenizer will be saved under the
    <target_path>/<tokenizer_name> namespace.
    """
    files = [
        str(f) for f in Path(dataset_base_path).glob(files_pattern)
        if os.path.isfile(f)
    ]
    logger.info(f'Found {len(files)} files to use for training.')
    logger.debug(f'Files are: {files}')

    tokenizer_args = {
        'lowercase': lower_case,
        'strip_accents': False,
    }
    wordpiece_tokenizer = BertWordPieceTokenizer(**tokenizer_args)
    wordpiece_tokenizer.train(files=files, vocab_size=vocab_size)

    save_out = wordpiece_tokenizer.save(target_path, tokenizer_name)
    logger.info(f'Training finished. Result is in {save_out}')
def train_bert():
    # https://huggingface.co/transformers/_modules/transformers/tokenization_bert.html
    files = [
        "Corpora/CS_V0_normalized_sent_per_line.txt",
        "Corpora/AsoSoft_Large_sent_per_line.txt",
        "Corpora/KTC_all_cleaned.txt",
        "Corpora/Lyrics_all_cleaned.txt",
        "Corpora/Tanztil_ku_normalized.txt"
    ]
    vocab_size = 50000

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       handle_chinese_chars=False,
                                       strip_accents=True,
                                       lowercase=False)
    # And then train
    tokenizer.train(
        files,
        vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )
    tokenizer.save('./', 'ckb-wordpiece_%s' % str(vocab_size))
def main():
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train(files=[configs.data.raw], vocab_size=52_000, min_frequency=5)
    tokenizer.save_model(configs.data.path)
    print(f"save to {configs.data.path}")
def train_tokenizer(files: List[str], tokenizer_name: str, base_path: str, vocab_size: int,
                    lowercase: bool = False, strip_accents: bool = False):
    tokenizer = BertWordPieceTokenizer(lowercase=lowercase, strip_accents=strip_accents)
    tokenizer_path = os.path.join(base_path, tokenizer_name)
    os.makedirs(tokenizer_path, exist_ok=True)

    initial_alphabet = get_bert_initial_alphabet()
    tokenizer.train(files, special_tokens=initial_alphabet, vocab_size=vocab_size)
    tokenizer.save(tokenizer_path)

    # Creating a default config for the tokenizer
    config = {'do_lower_case': lowercase, 'strip_accents': strip_accents}
    config_file_path = os.path.join(tokenizer_path, 'tokenizer_config.json')
    with open(config_file_path, 'w+') as config_file:
        json.dump(config, config_file)
def main():
    random.seed(1)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-filelist-path", required=True,
                        help="Location of pre-training text files.")
    args = parser.parse_args()

    paths = []
    with open(args.corpus_filelist_path) as f:
        for line in f:
            line = line.strip()
            if line:
                paths.append(line)

    random.shuffle(paths)
    print(f'Nrof files: {len(paths)}')
    paths = paths[:100_000]
    print(f'Nrof filtered files: {len(paths)}')

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(lowercase=False)

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=40_000,
        min_frequency=4,
    )

    # Save files to disk
    tokenizer.save_model(".", "vocab.txt")
def train_wordpiece_tokenizer(self) -> None:
    wordpiece_tokenizer = BertWordPieceTokenizer()
    wordpiece_tokenizer.train(
        files=["./train.txt", "./test.txt"],
        vocab_size=10000,
    )
    wordpiece_tokenizer.save_model("nlpbook/wordpiece")
def convert_to_ratt(self, ratt_dir, do_lower=True, max_sequence_length=128, data_type="train"):
    if not os.path.exists(ratt_dir):
        os.mkdir(ratt_dir)

    # Build dictionary
    text_list, label_list = self._read_csv(self.raw_data_file)

    # Token vocab
    token_vocab_name = "ratt"
    vocab_file = os.path.join(ratt_dir, token_vocab_name + "-vocab.txt")
    if not os.path.isfile(vocab_file):
        tokenizer = BertWordPieceTokenizer(lowercase=do_lower)
        tokenizer.train(files=[self.raw_data_file], vocab_size=8192)
        tokenizer.save_model(ratt_dir, token_vocab_name)
    else:
        tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file, lowercase=do_lower)

    # Label vocab
    label_vocab_file = os.path.join(ratt_dir, "label_dict.txt")
    if not os.path.isfile(label_vocab_file):
        labels = set(label_list)
        label_map = {str(l): i for i, l in enumerate(labels)}
        with open(label_vocab_file, "w", encoding="utf-8") as fout:
            for l in labels:
                fout.write("%s\n" % l)
    else:
        label_map = {}
        with open(label_vocab_file, encoding="utf-8") as fin:
            for i, line in enumerate(fin):
                label_map[line.rstrip()] = i

    if data_type not in ["train", "dev", "test"]:
        data_types = ["train", "dev", "test"]
    else:
        data_types = [data_type]

    for data_type in data_types:
        logging.info("Converting %s.." % eval("self.raw_%s_file" % data_type))
        text_list, label_list = self._read_csv(
            eval("self.raw_%s_file" % data_type))
        outputs = tokenizer.encode_batch(text_list, add_special_tokens=True)
        input_ids = [output.ids for output in outputs]
        padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, padding="post", maxlen=max_sequence_length, truncating="post")
        label_ids = [label_map[str(label)] for label in label_list]
        save_file = os.path.join(ratt_dir, data_type + ".npz")
        np.savez(save_file, inputs=padded_inputs, targets=label_ids)
def generate_custom_vocab(self):
    try:
        tokenizer = None

        # root dir path check and generate
        if not os.path.isdir(self.vocab_root_dir):
            os.makedirs(self.vocab_root_dir, exist_ok=True)

        # generate models directory
        self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
        os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

        user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]',
                                '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
        unused_token_num = 200
        unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
        user_defined_symbols = user_defined_symbols + unused_list

        if self.tokenizer_type == 'word':
            # if lowercase is False, the strip_accents option must also be set to False
            tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                               lowercase=True,
                                               clean_text=True,
                                               handle_chinese_chars=True,
                                               wordpieces_prefix="##")
        # when 'base' is selected the bert-base-uncased tokenizer is used instead... close function

        # training vocab start
        corpus_file = [self.corpus_path]
        vocab_size = 32000
        limit_alphabet = 6000
        min_frequency = 3

        tokenizer.train(files=corpus_file,
                        vocab_size=vocab_size,
                        special_tokens=user_defined_symbols,
                        min_frequency=min_frequency,      # minimum occurrence count for a word, 3
                        limit_alphabet=limit_alphabet,    # must be commented out when training a ByteLevelBPETokenizer
                        show_progress=True)

        self.setPrint('Custom Tokenizer Training is completed')

        # Korean test sentence: "Phone calls do not work properly."
        sentence = '전화 통화가 정상적으로 안됨.'
        output = tokenizer.encode(sentence)
        self.setPrint('Tokenizer test sentence: {}'.format(sentence))
        self.setPrint('Tokenizer output\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                      format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

        # save tokenizer
        tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)
    except:
        self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0], sys.exc_info()[1],
                                                       sys.exc_info()[2].tb_lineno))
def create_vocab(file_path, output_path, least_freq=2):
    tokenizer = BertWordPieceTokenizer(clean_text=False, strip_accents=False, lowercase=True)
    files = [file_path]
    tokenizer.train(files,
                    vocab_size=1000,
                    min_frequency=least_freq,
                    show_progress=True,
                    special_tokens=['[PAD]', '[UNK]', '[SOS]', '[EOS]'],
                    limit_alphabet=1000,
                    wordpieces_prefix="##")
    tokenizer.save(output_path)
    print(f"Vocabulary created at location {output_path}")
def train_tokenizer(data_file_paths, vocab_size):
    t = BertWordPieceTokenizer(handle_chinese_chars=False)
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    wordpieces_prefix = "##"
    t.train(
        files=data_file_paths,
        vocab_size=vocab_size,
        min_frequency=10,
        show_progress=True,
        special_tokens=special_tokens,
        limit_alphabet=1000,
        wordpieces_prefix=wordpieces_prefix,
    )
    return t
def train_tokenizer(
    corpus: Union[str, List[str]],
    vocab_size: int = 30519,
    overwrite: bool = True,
    lowercase: bool = True,
    save_vocab: bool = False,
    dst: Optional[str] = None,
    in_domain_vocab: str = VOCAB_CACHE_PREFIX,
) -> BertWordPieceTokenizer:
    """Train a WordPiece tokenizer from scratch.

    Arguments:
        corpus {Union[str, List[str]]} -- In-domain corpus / corpora

    Keyword Arguments:
        vocab_size {int} -- Size of trained vocabulary (default: 30519)
        lowercase {bool} -- If True, perform lowercasing (default: True)
        save_vocab {bool} -- If True, save vocab to `in_domain_vocab` (default: False)
        in_domain_vocab {str} -- Path to save trained tokenizer vocabulary
            (default: {'in-domain-vocab.txt'})

    Returns:
        A BertWordPieceTokenizer trained on in-domain corpora.
    """
    if not isinstance(corpus, list):
        corpus = [corpus]

    # Load cached vocab if possible
    if not overwrite:
        cached_vocab = Path(dst) / (VOCAB_CACHE_PREFIX + '-vocab.txt')
        if cached_vocab.exists():
            logger.info(f'Loading cached vocabulary at {cached_vocab}')
            return BertWordPieceTokenizer(str(cached_vocab))
        else:
            logger.info(f'Cached vocabulary not found at {cached_vocab}')

    # Train tokenizer
    logger.info('Training new WordPiece tokenizer on in-domain corpora')
    tokenizer = BertWordPieceTokenizer(lowercase=lowercase)
    tokenizer.train(corpus, vocab_size=vocab_size)
    if save_vocab:
        tokenizer.save('.' if dst is None else dst, in_domain_vocab)
        logger.info('Saved in-domain vocabulary to '
                    f'{Path(dst) / (in_domain_vocab + "-vocab.txt")}')
    return tokenizer
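# A minimal usage sketch for train_tokenizer above; the corpus path and the
# destination directory are hypothetical, and VOCAB_CACHE_PREFIX / logger are
# assumed to be defined at module level as in the original source.
import os

os.makedirs('vocab_cache', exist_ok=True)    # old-style save() expects an existing directory
in_domain_tokenizer = train_tokenizer(
    corpus='data/in_domain_corpus.txt',      # hypothetical corpus file
    vocab_size=30519,
    lowercase=True,
    save_vocab=True,
    dst='vocab_cache',                       # hypothetical output directory
)
print(in_domain_tokenizer.encode('example sentence').tokens)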
class BertWordPiece:
    def __init__(self, clean_text: bool, strip_accents: bool, lowercase: bool):
        self.clean = clean_text
        self.strip = strip_accents
        self.lower = lowercase
        self.tokenizer = BertWordPieceTokenizer(
            clean_text=self.clean,
            strip_accents=self.strip,
            lowercase=self.lower,
            handle_chinese_chars=True
        )

    def train(self, files, vocab_size, min_frequency, limit_alphabet):
        self.trainer = self.tokenizer.train(
            files,
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=True,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
            limit_alphabet=limit_alphabet,
            wordpieces_prefix="##",
        )

    def save(self, path, filename):
        self.tokenizer.save(path, filename)
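# A minimal usage sketch for the BertWordPiece wrapper above; the training file
# and output name are hypothetical.
wp = BertWordPiece(clean_text=True, strip_accents=True, lowercase=True)
wp.train(files=['corpus.txt'],       # hypothetical training file
         vocab_size=30000,
         min_frequency=2,
         limit_alphabet=1000)
wp.save('.', 'my-wordpiece')         # old-style save(directory, name) API, as used by the class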
def gen_tokenizer(self, min_frequency=6, limit_alphabet=150):
    '''Create a WordPiece tokenizer from the parsed data'''
    # Store the flattened text in a temporary file
    f = tempfile.NamedTemporaryFile()
    text = self.flatten()
    f.write(text.encode("utf8"))

    # Create the tokenizer
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train([f.name], min_frequency=min_frequency, limit_alphabet=limit_alphabet)
    f.close()
    return tokenizer
def main(args):
    print(args)
    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )
        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )
        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        # Korean test sentence: "I use a WordPiece tokenizer. I want to test whether it performs well."
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
def build_vocab(args):
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    special_tokens += ["[unused{}]".format(idx) for idx in range(args.unused_size)]

    if args.tokenizer_model == "mecab_wordpiece":
        mecab_wordpiece_notag_trainer = KoNLPyBertWordPieceTrainer(
            Mecab(), use_tag=False
        )
        mecab_wordpiece_notag_trainer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )
        mecab_wordpiece_notag_trainer.save_model(
            "./data/vocab/mecab_normalize_{}".format(args.vocab_size), "notag"
        )
    elif args.tokenizer_model == "wordpiece":
        tokenizer = BertWordPieceTokenizer(
            vocab=None,
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
            wordpieces_prefix="##",
        )
        tokenizer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )
        tokenizer.save_model("./data/vocab/wordpiece")
    else:
        logger.info("tokenizer model : wordpiece / mecab_wordpiece")
        sys.exit(1)
def get_vocabulary(infile: Text, vocabsize: int, outfolder: Text):
    # get special token maps and config
    autotok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    autotok.save_pretrained(outfolder)
    os.remove(os.path.join(outfolder, "vocab.txt"))

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False, clean_text=False)

    # Then train it!
    tokenizer.train([infile], vocab_size=vocabsize, limit_alphabet=int(1e9))

    # And finally save it somewhere
    tokenizer.save(outfolder, "vocab")
    os.rename(os.path.join(outfolder, "vocab-vocab.txt"),
              os.path.join(outfolder, "vocab.txt"))
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer
        Using the Hugging Face tokenization library for the same
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()
        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """
        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # Hugging Face only accepts a Temp File with sentences for Training Tokenizer
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                [f.write(i + "\n") for i in sentences]
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))
            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
def tokenize(inputPath, outputPath):
    paths = [str(x) for x in Path(inputPath).glob("*.ns")]
    print(paths)

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(vocab_file=None,
                                       clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=False,
                                       lowercase=False,
                                       wordpieces_prefix="##")

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=50000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )

    tokenizer.save(outputPath)
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    # Lowercasing is controlled via the constructor, not a post-hoc attribute
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize,
                    min_frequency=min_freq, special_tokens=special_tokens)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
    )
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location)
def load_from_files_bert_tokenizer(path_to_files=None, vocab_size=30000):
    """
    Adapted from:
    https://github.com/huggingface/tokenizers/tree/master/bindings/python/examples
    If used frequently, save the model to avoid reloading (see example above)
    """
    if path_to_files is None:
        path_to_files = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "sample_files"
        )
    # parse more complex patterns if used
    files = glob.glob(path_to_files)

    # Create the tokenizer
    tokenizer = BertWordPieceTokenizer(
        strip_accents=True,
        # following arguments are all same as default, listed for clarity
        clean_text=True,
        handle_chinese_chars=True,
        lowercase=True,
    )

    # And finally train
    tokenizer.train(
        files,
        # following arguments are all same as default, listed for clarity
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )
    return tokenizer
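# A minimal sketch of the save-and-reload step the docstring above refers to;
# the glob pattern and output directory are hypothetical.
import os
from tokenizers import BertWordPieceTokenizer

tokenizer = load_from_files_bert_tokenizer("sample_files/*.txt")
os.makedirs("bert_tokenizer_out", exist_ok=True)   # save_model() expects an existing directory
tokenizer.save_model("bert_tokenizer_out")         # writes bert_tokenizer_out/vocab.txt
reloaded = BertWordPieceTokenizer("bert_tokenizer_out/vocab.txt")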
def main(language):
    # Initialize an empty BERT tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )

    cleaned_dir = BASE_DIR / "data/wikiextracted" / language / "cleaned"

    # prepare text files to train vocab on them
    # use only one subdir
    # files = [str(file_path) for file_path in cleaned_dir.glob("AA/wiki_*")]
    # use all wiki articles (in the given language)
    files = [str(file_path) for file_path in cleaned_dir.glob("**/wiki_*")]

    # train BERT tokenizer
    tokenizer.train(
        files,
        # vocab_size=100,  # default value is 30000
        min_frequency=MIN_FREQ,
        show_progress=True,
        special_tokens=SPEC_TOKENS,
        limit_alphabet=SIZE_OF_ALPHABET,  # default value is 1000
        wordpieces_prefix="##"
    )

    # save the vocab
    os.makedirs(str(BASE_DIR / "data/tokenizer" / language), exist_ok=True)
    tokenizer.save(str(BASE_DIR / "data/tokenizer" / language / "vocab"))

    # save the alphabet
    vocab = json.loads(read_vocab(language))['model']['vocab']
    alphabet = prepare_alphabet(vocab)
    write_alphabet_to_file(alphabet, language)
# CharBPETokenizer: The original BPE
# ByteLevelBPETokenizer: The byte level version of the BPE
# SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
# BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

DATAFILE = '../data/pg16457.txt'
MODELDIR = 'models'

input_text = 'This is a test'

# Training the tokenizers
print("========= CharBPETokenizer ==========")
# CharBPETokenizer
tokenizer = CharBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)
tokenizer.save(MODELDIR, 'char_bpe')
output = tokenizer.encode(input_text)
print(output.tokens)
# ['T', 'his</w>', 'is</w>', 'a</w>', 't', 'est</w>']

print("========= ByteLevelBPETokenizer ==========")
# ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)
tokenizer.save(MODELDIR, 'byte_bpe')
output = tokenizer.encode(input_text)
print(output.tokens)
# ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est']
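# The header comments above list four tokenizer classes but the snippet only
# exercises two of them. A sketch of the remaining two, following the same
# old-style train / save(directory, name) API used above; the exact output
# tokens depend on the training file, so none are shown here.
from tokenizers import SentencePieceBPETokenizer, BertWordPieceTokenizer

print("========= SentencePieceBPETokenizer ==========")
tokenizer = SentencePieceBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)
tokenizer.save(MODELDIR, 'sentencepiece_bpe')
print(tokenizer.encode(input_text).tokens)

print("========= BertWordPieceTokenizer ==========")
tokenizer = BertWordPieceTokenizer()
tokenizer.train([DATAFILE], vocab_size=500)
tokenizer.save(MODELDIR, 'bert_wordpiece')
print(tokenizer.encode(input_text).tokens)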
os.makedirs('./vocab', exist_ok=True)

train_files = [
    f"./inputs/pretrain/{f}" for f in os.listdir('./inputs/pretrain')
]

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

tokenizer.train(
    files=train_files,
    vocab_size=32000,
    min_frequency=2,
    special_tokens=special_tokens,
    limit_alphabet=500,
    wordpieces_prefix="##",
)

tokenizer.save_model('./vocab')

vocab_file = f'./vocab/{os.listdir("./vocab")[0]}'
print(vocab_file)

# Count the lines in the saved vocab file, i.e. the final vocabulary size
with open(vocab_file) as f:
    for vocab_size, _ in enumerate(f, 1):
        pass
print(f'Vocab size: {vocab_size}')
# dev_corpus_file = './mimicdata/bio-mimic3/dev_50.csv'
# test_corpus_file = './mimicdata/bio-mimic3/test_50.csv'

train_corpus_file = './mimicdata/mimic3/train_full.csv'
dev_corpus_file = './mimicdata/mimic3/dev_full.csv'
test_corpus_file = './mimicdata/mimic3/test_full.csv'

limit_alphabet = 100
vocab_size = 100000

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,  # Must be False if cased model
    lowercase=True,
    wordpieces_prefix="##",
)

tokenizer.train(
    files=[train_corpus_file, dev_corpus_file, test_corpus_file],
    limit_alphabet=limit_alphabet,
    vocab_size=vocab_size,
    min_frequency=1,
)

# tokenizer.save("./tokenizers", "bert-tiny-mimic3-50-{}-limit-{}".format(limit_alphabet, vocab_size))
tokenizer.save(
    "./tokenizers",
    "bert-tiny-mimic3-full-{}-limit-{}".format(limit_alphabet, vocab_size))
def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
    """
    Train a new tokenizer on `train_files`.

    Args:
        - train_files: List of files to be used when training the tokenizer.
        - tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.
        - output_dir (optional): The directory where model files will be saved. If not given,
          self.args['output_dir'] will be used.
        - use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

    Returns:
        None
    """
    if not self.args["vocab_size"]:
        raise AttributeError(
            "Cannot train a new tokenizer as vocab_size is not specified in args dict. "
            "Either provide a tokenizer or specify vocab_size."
        )

    if not isinstance(train_files, list):
        train_files = [train_files]

    if not output_dir:
        output_dir = self.args["output_dir"]

    if self.args["model_type"] in ["bert", "electra"]:
        tokenizer = BertWordPieceTokenizer()
        self.args["special_tokens"] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        self.args["wordpieces_prefix"] = "##"

        tokenizer.train(
            files=train_files,
            vocab_size=self.args["vocab_size"],
            min_frequency=self.args["min_frequency"],
            special_tokens=self.args["special_tokens"],
            wordpieces_prefix="##",
        )
    else:
        tokenizer = ByteLevelBPETokenizer()

        tokenizer.train(
            files=train_files,
            vocab_size=self.args["vocab_size"],
            min_frequency=self.args["min_frequency"],
            special_tokens=self.args["special_tokens"],
        )

    os.makedirs(output_dir, exist_ok=True)

    tokenizer.save(output_dir)
    logger.info(" Training of {} tokenizer complete. Saved to {}.".format(tokenizer_name, output_dir))

    _, _, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
    tokenizer = tokenizer_class.from_pretrained(output_dir)

    if use_trained_tokenizer:
        self.tokenizer = tokenizer
        self.args["tokenizer_name"] = output_dir
        try:
            if self.args["model_type"] == "electra":
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        except AttributeError:
            pass
from pathlib import Path
from tokenizers import BertWordPieceTokenizer

# paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]
paths = ['../../data/jw300.en-tw.tw', '../../data/asante_twi_bible.txt']

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training
# And then train
tokenizer.train(
    paths,
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save files to disk
tokenizer.save("abena-base-v2-akuapem-twi-cased")
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer()
tokenizer.train(["sp_data/mono/all.en-fr"], vocab_size=60000)

# with open("sp_data/mono/all.en-fr") as r, open("sp_data/mono/all.en-fr.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")

# with open("sp_data/para/dev/newstest2013-ref.en") as r, open("sp_data/para/dev/newstest2013-ref.en.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")
#
# with open("sp_data/para/dev/newstest2013-ref.fr") as r, open("sp_data/para/dev/newstest2013-ref.fr.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))
#         w.write("\n")
#
# with open("sp_data/para/dev/newstest2014-fren-src.en") as r, open("sp_data/para/dev/newstest2014-fren-src.en.wordpiece", "w") as w:
#     lines = r.readlines()
#     for line in lines:
#         encoded = tokenizer.encode(line[:-1])
#         w.write(" ".join(encoded.tokens))