def main():
    tokenizer = BertWordPieceTokenizer()
    tokenizer.train(files=[configs.data.raw], vocab_size=52_000, min_frequency=5)
    tokenizer.save_model(configs.data.path)
    print(f"saved to {configs.data.path}")
def main():
    random.seed(1)

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--corpus-filelist-path", required=True,
                        help="Location of pre-training text files.")
    args = parser.parse_args()

    paths = []
    with open(args.corpus_filelist_path) as f:
        for line in f:
            line = line.strip()
            if line:
                paths.append(line)

    random.shuffle(paths)
    print(f'Nrof files: {len(paths)}')
    paths = paths[:100_000]
    print(f'Nrof filtered files: {len(paths)}')

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(lowercase=False)

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=40_000,
        min_frequency=4,
    )

    # Save files to disk; note the second argument is a filename *prefix*,
    # so this writes "vocab.txt-vocab.txt" in the current directory
    tokenizer.save_model(".", "vocab.txt")
def from_corpus(cls, corpus, corpus_save_path, tokenizer_save_path, tokenizer_name,
                vocab_size, min_frequency, strip_accents, clean_text, lowercase):
    with open(corpus_save_path, 'wb') as f:
        f.write('\n'.join(corpus).encode())

    tokenizer = BertWordPieceTokenizer(
        strip_accents=strip_accents,
        clean_text=clean_text,
        lowercase=lowercase,
    )
    tokenizer.train(
        [corpus_save_path],
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        show_progress=True,
        special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
        wordpieces_prefix="##",
    )

    if os.path.exists(tokenizer_save_path):
        shutil.rmtree(tokenizer_save_path)
    os.mkdir(tokenizer_save_path)
    tokenizer.save_model(tokenizer_save_path, tokenizer_name)

    vocab_path = os.path.join(tokenizer_save_path, f'{tokenizer_name}-vocab.txt')
    return cls(vocab_path, strip_accents, clean_text, lowercase)
def train_wordpiece_tokenizer(self) -> None:
    wordpiece_tokenizer = BertWordPieceTokenizer()
    wordpiece_tokenizer.train(
        files=["./train.txt", "./test.txt"],
        vocab_size=10000,
    )
    wordpiece_tokenizer.save_model("nlpbook/wordpiece")
def convert_to_ratt(self, ratt_dir, do_lower=True, max_sequence_length=128, data_type="train"):
    if not os.path.exists(ratt_dir):
        os.mkdir(ratt_dir)

    # Build dictionary
    text_list, label_list = self._read_csv(self.raw_data_file)

    # Token vocab
    token_vocab_name = "ratt"
    vocab_file = os.path.join(ratt_dir, token_vocab_name + "-vocab.txt")
    if not os.path.isfile(vocab_file):
        tokenizer = BertWordPieceTokenizer(lowercase=do_lower)
        tokenizer.train(files=[self.raw_data_file], vocab_size=8192)
        tokenizer.save_model(ratt_dir, token_vocab_name)
    else:
        tokenizer = BertWordPieceTokenizer(vocab_file=vocab_file, lowercase=do_lower)

    # Label vocab
    label_vocab_file = os.path.join(ratt_dir, "label_dict.txt")
    if not os.path.isfile(label_vocab_file):
        labels = set(label_list)
        label_map = {str(l): i for i, l in enumerate(labels)}
        with open(label_vocab_file, "w", encoding="utf-8") as fout:
            for l in labels:
                fout.write("%s\n" % l)
    else:
        label_map = {}
        with open(label_vocab_file, encoding="utf-8") as fin:
            for i, line in enumerate(fin):
                label_map[line.rstrip()] = i

    if data_type not in ["train", "dev", "test"]:
        data_types = ["train", "dev", "test"]
    else:
        data_types = [data_type]

    for data_type in data_types:
        logging.info("Converting %s.." % eval("self.raw_%s_file" % data_type))
        text_list, label_list = self._read_csv(eval("self.raw_%s_file" % data_type))
        outputs = tokenizer.encode_batch(text_list, add_special_tokens=True)
        input_ids = [output.ids for output in outputs]
        padded_inputs = tf.keras.preprocessing.sequence.pad_sequences(
            input_ids, padding="post", maxlen=max_sequence_length, truncating="post")
        label_ids = [label_map[str(label)] for label in label_list]
        save_file = os.path.join(ratt_dir, data_type + ".npz")
        np.savez(save_file, inputs=padded_inputs, targets=label_ids)
def generate_custom_vocab(self):
    try:
        tokenizer = None

        # root dir path check and generate
        if not os.path.isdir(self.vocab_root_dir):
            os.makedirs(self.vocab_root_dir, exist_ok=True)

        # generate models directory
        self.vocab_dir = '/BERT_TRAINING_VOCAB_' + self.getCurrent_time()[2] + '/'
        os.makedirs(self.vocab_root_dir + self.vocab_dir, exist_ok=True)

        user_defined_symbols = ['[BOS]', '[EOS]', '[UNK]', '[UNK1]', '[UNK2]', '[UNK3]', '[UNK4]',
                                '[UNK5]', '[UNK6]', '[UNK7]', '[UNK8]', '[UNK9]']
        unused_token_num = 200
        unused_list = ['[unused{}]'.format(n) for n in range(unused_token_num)]
        user_defined_symbols = user_defined_symbols + unused_list

        if self.tokenizer_type == 'word':
            # if lowercase is False, strip_accents must also be set to False
            tokenizer = BertWordPieceTokenizer(strip_accents=False,
                                               lowercase=True,
                                               clean_text=True,
                                               handle_chinese_chars=True,
                                               wordpieces_prefix="##")
        # when 'base' is selected, the bert-base-uncased tokenizer is used instead,
        # so this function simply returns without training

        # start training the vocab
        corpus_file = [self.corpus_path]
        vocab_size = 32000
        limit_alphabet = 6000
        min_frequency = 3

        tokenizer.train(files=corpus_file,
                        vocab_size=vocab_size,
                        special_tokens=user_defined_symbols,
                        min_frequency=min_frequency,  # minimum occurrence count for a word, 3
                        limit_alphabet=limit_alphabet,  # must be commented out when training a ByteLevelBPETokenizer
                        show_progress=True)

        self.setPrint('Custom Tokenizer training is completed')

        sentence = '전화 통화가 정상적으로 안됨.'  # Korean test sentence: "Phone calls do not work properly."
        output = tokenizer.encode(sentence)
        self.setPrint('Tokenizer test sentence: {}'.format(sentence))
        self.setPrint('Tokenizer analysis result\n=>idx: {}\n=>tokens: {}\n=>offset: {}\n=>decode: {}\n'.
                      format(output.ids, output.tokens, output.offsets, tokenizer.decode(output.ids)))

        # save tokenizer
        tokenizer.save_model(self.vocab_root_dir + self.vocab_dir)
    except:
        self.setPrint('Error: {}. {}, line: {}'.format(sys.exc_info()[0], sys.exc_info()[1],
                                                       sys.exc_info()[2].tb_lineno))
def build_vocab(args):
    special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
    special_tokens += ["[unused{}]".format(idx) for idx in range(args.unused_size)]

    if args.tokenizer_model == "mecab_wordpiece":
        mecab_wordpiece_notag_trainer = KoNLPyBertWordPieceTrainer(Mecab(), use_tag=False)
        mecab_wordpiece_notag_trainer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )
        mecab_wordpiece_notag_trainer.save_model(
            "./data/vocab/mecab_normalize_{}".format(args.vocab_size), "notag"
        )
    elif args.tokenizer_model == "wordpiece":
        tokenizer = BertWordPieceTokenizer(
            vocab=None,
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=False,
            lowercase=False,
            wordpieces_prefix="##",
        )
        tokenizer.train(
            files=[args.corpus],
            vocab_size=args.vocab_size,
            min_frequency=args.min_frequency,
            limit_alphabet=args.limit_alphabet,
            special_tokens=special_tokens,
        )
        tokenizer.save_model("./data/vocab/wordpiece")
    else:
        logger.info("tokenizer model : wordpiece / mecab_wordpiece")
        sys.exit(1)
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    # Lowercasing must be disabled in the constructor; assigning do_lower_case
    # after construction has no effect on the underlying Rust tokenizer.
    tokenizer = BertWordPieceTokenizer(lowercase=False)
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename], vocab_size=vocabsize,
                    min_frequency=min_freq, special_tokens=special_tokens)

    tokenizer._tokenizer.post_processor = BertProcessing(
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
    )
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save_model(save_location)
def train_tokenizer(file_iterator):
    # Initialize an empty tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=True,
        strip_accents=True,
        lowercase=True,
    )

    # And then train
    tokenizer.train_from_iterator(
        file_iterator,
        vocab_size=1000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )

    # Save the files
    tokenizer.save_model(args.out, args.name)
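The function above assumes `file_iterator` is supplied by the caller; `train_from_iterator` consumes an iterator of raw strings rather than file paths. A minimal sketch of such an iterator, assuming a hypothetical newline-delimited corpus at corpus.txt (the path is illustrative, not from the original script):

def line_iterator(path="corpus.txt"):  # placeholder path, not from the original code
    # Yield one non-empty, stripped line at a time so the corpus is streamed
    # instead of being loaded into memory all at once.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield line

# Possible usage with the train_tokenizer() above:
# train_tokenizer(line_iterator())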
def main(args):
    print(args)

    if args['train']:
        tokenizer = BertWordPieceTokenizer(
            clean_text=True,
            handle_chinese_chars=True,
            strip_accents=True,  # Must be False if cased model
            lowercase=True,
            wordpieces_prefix="##"
        )
        tokenizer.train(
            files=['/data2/BERT/data/naver_news/news_3_preprocessed/naver_news.txt'],
            limit_alphabet=6000,
            vocab_size=32000
        )
        print(tokenizer.save_model("../BertWordPieceTokenizer_32000"))

    elif args['test']:
        # Korean test sentence: "I use a WordPiece tokenizer. I want to test whether it performs well."
        test_str = '나는 워드피스 토크나이저를 써요. 성능이 좋은지 테스트 해보려 합니다.'

        print("=========== tokenizer ===========")
        tokenizer = BertWordPieceTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str.ids)
        decoded_str = tokenizer.decode(encoded_str.ids)
        print(decoded_str)

        print("=========== BertTokenizer ===========")
        tokenizer = BertTokenizer("../BertWordPieceTokenizer_32000/vocab.txt")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)

        print("=========== BertTokenizer2 ===========")
        tokenizer = BertTokenizer.from_pretrained("../BertWordPieceTokenizer_32000")
        print(tokenizer)
        encoded_str = tokenizer.encode(test_str)
        print('encoding: ', encoded_str)
        decoded_str = tokenizer.decode(encoded_str)
        print(decoded_str)
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=False,
    strip_accents=False,  # Must be False if cased model
    lowercase=True,
    wordpieces_prefix="##")

path = '../data'

tokenizer.train(files=[
    '../data/wiki/train.txt',
    '../data/news/train.txt',
    '../data/twitter/train.txt',
    '../data/books/train.txt'
], vocab_size=30000)

tokenizer.save_model(".", "bert")
parser.add_argument('--txtfolder', type=str, help='the folder containing the txt files')
args = parser.parse_args()

paths = [str(x) for x in Path(str(args.txtfolder)).glob("**/*.txt")]

# Initialize a lm_model tokenizer
tokenizer = BertWordPieceTokenizer()
# trainer = BpeTrainer(vocab_size=VOCAB_SIZE, show_progress=True, initial_alphabet=ByteLevel.alphabet())
# tokenizer.train(trainer, paths)

# Customize training
'''
tokenizer._tokenizer.post_processor = BertProcessing(("[CLS]", tokenizer.token_to_id("[CLS]")),
                                                     ("[SEP]", tokenizer.token_to_id("[SEP]")),
                                                     )
'''
tokenizer.train(files=paths,
                vocab_size=VOCAB_SIZE,
                min_frequency=2,
                special_tokens=[
                    "[PAD]",
                    "[UNK]",
                    "[CLS]",
                    "[SEP]",
                    "[MASK]",
                ])

print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.save_model('./lm_model')
print('tokenizer saved; the WordPiece model is written as vocab.txt')
from tokenizers import BertWordPieceTokenizer
from glob import glob

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=False,
)

files = glob('splitted/*')

tokenizer.train(
    files,
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

tokenizer.save_model('./', 'bert-standard')
import argparse
import os
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process code into pre-trained tokenizer.")
    parser.add_argument("--vocab_size", type=int, default=8000)
    parser.add_argument("--text_file_path", type=str)
    parser.add_argument("--out_path", type=str)
    parser.add_argument("--out_name", type=str)
    args = parser.parse_args()

    paths = [str(x) for x in Path(args.text_file_path).glob("**/*")]

    tokenizer = BertWordPieceTokenizer(clean_text=True, lowercase=False, strip_accents=True)
    tokenizer.train(
        files=paths,
        vocab_size=args.vocab_size,
        min_frequency=2,
        special_tokens=[
            "<s>", "</s>", "<cls>", "<pad>", "<unk>", "<mask>",
            "[CLS]", "[SEP]", "[MASK]", "[EOL]", "[URL]", "[PAD]", "[UNK]"
        ],
    )
    tokenizer.save_model(args.out_path, args.out_name)
    tokenizer.save(os.path.join(args.out_path, "vocab.json"))
# use this when training a WordPiece tokenizer from scratch
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

paths = ['../../data/jw300.en-tw.tw', '../../data/asante_twi_bible.txt']  # dataset location

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training
tokenizer.train(
    paths,
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save tokenizer to disk - make sure these directories exist
tokenizer.save_model("distilabena-base-v2-akuapem-twi-cased")  # akuapem
from pathlib import Path

# from tokenizers import ByteLevelBPETokenizer
from tokenizers import BertWordPieceTokenizer

paths = ['../../results_file_clean.txt']

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=50_000, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

tokenizer.save_model('.')
wp_tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,
)

wp_tokenizer.train(
    files='/opt/ml/code/KBOBERT/KBOBERT_Data.txt',
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    wordpieces_prefix="##")

wp_tokenizer.save_model('./')

tokenizer = BertTokenizerFast(
    vocab_file="/opt/ml/code/KBOBERT/vocab.txt",
    max_len=512,
    do_lower_case=False,
)
tokenizer.add_special_tokens({'mask_token': '[MASK]'})

# https://huggingface.co/transformers/model_doc/bert.html#bertconfig
config = BertConfig(vocab_size=32000,
                    hidden_size=256,
                    num_hidden_layers=6,
                    num_attention_heads=4,
def preprocessing(args):

    start_time = time.time()

    print('Start preprocessing!')

    #===================================#
    #=============Data Load=============#
    #===================================#

    # 1) Comment data open
    train = pd.read_csv(os.path.join(args.data_path, 'train.hate.csv'))
    valid = pd.read_csv(os.path.join(args.data_path, 'dev.hate.csv'))
    test = pd.read_csv(os.path.join(args.data_path, 'test.hate.no_label.csv'))

    # 2) Title data open
    with open(os.path.join(args.data_path, 'train.news_title.txt'), 'r') as f:
        train_title = [x.replace('\n', '') for x in f.readlines()]
    with open(os.path.join(args.data_path, 'dev.news_title.txt'), 'r') as f:
        valid_title = [x.replace('\n', '') for x in f.readlines()]
    with open(os.path.join(args.data_path, 'test.news_title.txt'), 'r') as f:
        test_title = [x.replace('\n', '') for x in f.readlines()]

    # 3) Unlabeled data open
    if args.unlabeled_data_processing:
        unlabel_title = pd.read_csv(os.path.join(args.data_path, 'unlabeled_comments.news_title.txt'),
                                    names=['title'])
        unlabel_comments = pd.read_csv(os.path.join(args.data_path, 'unlabeled_comments.txt'),
                                       names=['comments'])

    # 4) Path setting
    if not os.path.exists(args.preprocess_path):
        os.mkdir(args.preprocess_path)

    #===================================#
    #=============Tokenizer=============#
    #===================================#

    print('Tokenizer setting...')

    # 1) Tokenizer open
    if args.custom_training_tokenizer:
        tokenizer = BertWordPieceTokenizer(lowercase=False)
        with open(os.path.join(args.preprocess_path, 'unlabeld.txt'), 'w') as f:
            for i in range(len(unlabel_title)):
                f.write(unlabel_title['title'].tolist()[i])
                f.write('\n')
                f.write(unlabel_comments['comments'].tolist()[i])
                f.write('\n')
        tokenizer.train(
            [os.path.join(args.preprocess_path, 'unlabeld.txt')],
            vocab_size=args.vocab_size,
            limit_alphabet=args.limit_alphabet)
        tokenizer.save_model(args.preprocess_path)
    else:
        tokenizer = BertTokenizer.from_pretrained('beomi/kcbert-base')

    #===================================#
    #=============Cleansing=============#
    #===================================#

    print('Cleansing...')

    # 1) Regular expression compile
    emojis = ''.join(emoji.UNICODE_EMOJI.keys())
    pattern = re.compile(f'[^ .,?!/@$%~%·∼()\x00-\x7Fㄱ-힣{emojis}]+')
    url_pattern = re.compile(
        r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)'
    )

    # 2) Definition clean
    def clean(x):
        x = pattern.sub(' ', x)
        x = url_pattern.sub('', x)
        x = x.strip()
        x = repeat_normalize(x, num_repeats=2)
        return x

    def encoding_text(list_x, tokenizer):
        encoded_text_list = list_x.map(lambda x: tokenizer.encode(
            clean(str(x)), max_length=args.max_len, truncation=True))
        return encoded_text_list

    # 3) Preprocess comments
    train['comments'] = encoding_text(train['comments'], tokenizer)
    valid['comments'] = encoding_text(valid['comments'], tokenizer)
    test['comments'] = encoding_text(test['comments'], tokenizer)

    # 4) Title parsing
    train['title'] = encoding_text(pd.Series(train_title), tokenizer)
    valid['title'] = encoding_text(pd.Series(valid_title), tokenizer)
    test['title'] = encoding_text(pd.Series(test_title), tokenizer)

    # 5) Unlabeled data parsing
    if args.unlabeled_data_processing:
        unlabel_title = encoding_text(unlabel_title['title'], tokenizer)
        unlabel_comments = encoding_text(unlabel_comments['comments'], tokenizer)

    #===================================#
    #==========Label processing=========#
    #===================================#

    print('Label processing...')

    train.replace({'label': {'none': 0, 'offensive': 1, 'hate': 2}}, inplace=True)
    valid.replace({'label': {'none': 0, 'offensive': 1, 'hate': 2}}, inplace=True)

    #===================================#
    #==============Saving===============#
    #===================================#

    # 1) Print status
    print('Parsed sentence save setting...')
    max_train_len = max([len(x) for x in train['comments']])
    max_valid_len = max([len(x) for x in valid['comments']])
    max_test_len = max([len(x) for x in test['comments']])
    max_train_title_len = max([len(x) for x in train['title']])
    max_valid_title_len = max([len(x) for x in valid['title']])
    max_test_title_len = max([len(x) for x in test['title']])
    if args.unlabeled_data_processing:
        max_unlabel_title_len = max([len(x) for x in unlabel_title])
        max_unlabel_comments_len = max([len(x) for x in unlabel_comments])

    print(f'Train data max length => title: {max_train_len} | comment: {max_train_title_len}', end=' | ')
    print(f'total: {max_train_len + max_train_title_len}')
    print(f'Valid data max length => title: {max_valid_len} | comment: {max_valid_title_len}', end=' | ')
    print(f'total: {max_valid_len + max_valid_title_len}')
    print(f'Test data max length => title: {max_test_len} | comment: {max_test_title_len}', end=' | ')
    print(f'total: {max_test_len + max_test_title_len}')
    if args.unlabeled_data_processing:
        print(f'Unlabel data max length => title: {max_unlabel_title_len} | comment: {max_unlabel_comments_len}',
              end=' | ')
        print(f'total: {max_unlabel_title_len + max_unlabel_comments_len}')

    # 2) Training pickle saving
    with open(os.path.join(args.preprocess_path, 'processed.pkl'), 'wb') as f:
        pickle.dump(
            {
                'train_comment_indices': train['comments'].tolist(),
                'valid_comment_indices': valid['comments'].tolist(),
                'train_title_indices': train['title'].tolist(),
                'valid_title_indices': valid['title'].tolist(),
                'train_label': train['label'].tolist(),
                'valid_label': valid['label'].tolist()
            }, f)

    # 3) Test pickle saving
    with open(os.path.join(args.preprocess_path, 'test_processed.pkl'), 'wb') as f:
        pickle.dump(
            {
                'test_comment_indices': test['comments'].tolist(),
                'test_title_indices': test['title'].tolist(),
            }, f)

    # 4) Unlabeled pickle saving
    if args.unlabeled_data_processing:
        with open(os.path.join(args.preprocess_path, 'unlabeled_processed.pkl'), 'wb') as f:
            pickle.dump(
                {
                    'unlabel_title': unlabel_title,
                    'unlabel_comments': unlabel_comments,
                }, f)

    print(f'Done! ; {round((time.time()-start_time)/60, 3)}min spent')
def pre_tokenize(self, text):
    return self.moses_tokenizer.tokenize(
        self.mpn.normalize(text.lower() if self.do_lowercase else text))


if __name__ == '__main__':
    lang = 'fr'
    clean_text = False
    handle_chinese_chars = True
    strip_accents = False
    lowercase = True
    vocab_size = 30000
    min_frequency = 2
    spt = ["<s>", "<pad>", "</s>", "<unk>", "<mask>",
           "[UNK]", "[SEP]", "[CLS]", "[PAD]", "[MASK]"]

    if lang == "fr":
        train_data = "../.data/wmt19_de_fr/train.fr"
    elif lang == "en":
        train_data = "../.data/wmt19_en_de/train.en"
    else:
        raise ValueError("Undefined language {}".format(lang))

    tokenizer = BertWordPieceTokenizer(clean_text=clean_text,
                                       lowercase=lowercase,
                                       handle_chinese_chars=handle_chinese_chars,
                                       strip_accents=strip_accents)
    tokenizer.pre_tokenizer = MosesPreTokenizer(lang, lowercase)

    # Customize training
    print("Starting to train ...")
    tokenizer.train(files=train_data,
                    vocab_size=vocab_size,
                    show_progress=True,
                    min_frequency=min_frequency,
                    special_tokens=spt)

    # Save files to disk
    tokenizer.save_model(".", "moses-pre-tokenized-wmt-uncased-{}".format(lang))
def tokenizer(self) -> Union[BaseTokenizer, CountVectorizer, TfidfVectorizer]:
    pkl_path = os.path.join(self.tokenizer_path, "model.pkl")
    if self._tokenizer is not None:
        return self._tokenizer

    ### get pickled tokenizer
    if os.path.exists(pkl_path) and not self.retrain_tokenizer:
        with open(pkl_path, 'rb') as f:
            tokenizer = pickle.load(f)
    ### train new tokenizer
    else:
        self.retrain_tokenizer = False
        if self.algorithm == 'bert':
            from tokenizers import BertWordPieceTokenizer
            tokenizer = BertWordPieceTokenizer(
                vocab_file=None if self._init_vocabulary is None else
                os.path.join(self.cache_path, "bert_vocab.txt"))
            tokenizer.enable_truncation(max_length=self.max_length)
            tokenizer.enable_padding(length=self.max_length)
            # train the tokenizer
            if self._init_vocabulary is None:
                path = os.path.join(self.cache_path, 'train.txt')
                with open(path, 'w') as f:
                    for i in chain(self.train_text, self.valid_text, self.test_text):
                        if len(i) == 0:
                            continue
                        f.write(i + "\n" if i[-1] != "\n" else i)
                tokenizer.train(files=path,
                                vocab_size=self.vocab_size,
                                min_frequency=self.min_frequency,
                                limit_alphabet=self.limit_alphabet,
                                show_progress=True)
            tokenizer.save_model(self.tokenizer_path)
        elif self.algorithm in ('count', 'tf', 'tfidf'):
            if self.algorithm == 'count':
                tokenizer = CountVectorizer(input='content',
                                            ngram_range=self.ngram_range,
                                            min_df=self.min_frequency,
                                            max_df=self.max_frequency,
                                            max_features=self.vocab_size,
                                            vocabulary=self._init_vocabulary,
                                            tokenizer=_simple_tokenizer,
                                            stop_words='english')
            elif self.algorithm in ('tf', 'tfidf'):
                tokenizer = TfidfVectorizer(input='content',
                                            ngram_range=self.ngram_range,
                                            min_df=self.min_frequency,
                                            max_df=self.max_frequency,
                                            max_features=self.vocab_size,
                                            stop_words='english',
                                            vocabulary=self._init_vocabulary,
                                            tokenizer=_simple_tokenizer,
                                            use_idf=False if self.algorithm == 'tf' else True)
            tokenizer.fit((_simple_preprocess(i) for i in chain(
                self.train_text, self.valid_text, self.test_text)))
        else:
            raise NotImplementedError
        # save the pickled model
        with open(pkl_path, "wb") as f:
            pickle.dump(tokenizer, f)

    ### assign and return
    self._tokenizer = tokenizer
    return self._tokenizer
import os
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

save_dir = "vocab"
paths = [str(x) for x in Path("/home/phmay/data/nlp/corpus/ready/").glob("*.txt")]
print(paths)

vocab_size = 32_767  # 2^15-1
min_frequency = 2

os.makedirs(save_dir, exist_ok=True)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
for i in range(767 - 5):
    special_tokens.append('[unused{}]'.format(i))

# https://github.com/huggingface/tokenizers/blob/04fb9e4ebe785a6b2fd428766853eb27ee894645/bindings/python/tokenizers/implementations/bert_wordpiece.py#L11
tokenizer = BertWordPieceTokenizer(strip_accents=False)

tokenizer.train(
    files=paths,
    vocab_size=vocab_size,
    min_frequency=min_frequency,
    special_tokens=special_tokens,
)

tokenizer.save_model(save_dir)
tokenizer.save(save_dir + "/tokenizer.json")
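The 762 [unusedN] entries plus the five standard specials reserve 767 slots, leaving an even 32,000 regular WordPiece entries out of the 32,767 total. A hedged sketch of checking the result after the run above, assuming it completed and wrote vocab/vocab.txt:

# Sketch of a post-hoc check; the path and the expected id layout are assumptions, not taken from the script above.
from tokenizers import BertWordPieceTokenizer

check = BertWordPieceTokenizer("vocab/vocab.txt", strip_accents=False)
print(check.get_vocab_size())          # expected: 32767 (= 2**15 - 1)
print(check.token_to_id("[PAD]"))      # special tokens are expected at the lowest ids
print(check.token_to_id("[unused0]"))  # reserved slots can later be repurposed without resizing embeddings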
type=str, help="The name of the output vocab files") args = parser.parse_args() files = glob.glob(args.files) if not files: print(f"File does not exist: {args.files}") exit(1) # Initialize an empty tokenizer tokenizer = BertWordPieceTokenizer( clean_text=True, handle_chinese_chars=True, strip_accents=True, lowercase=True, ) # And then train tokenizer.train( files, vocab_size=10000, min_frequency=2, show_progress=True, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], limit_alphabet=1000, wordpieces_prefix="##", ) # Save the files tokenizer.save_model(args.out, args.name)
def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
    """
    Train a new tokenizer on `train_files`.

    Args:
        train_files: List of files to be used when training the tokenizer.
        tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.
        output_dir (optional): The directory where model files will be saved. If not given, self.args.output_dir will be used.
        use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

    Returns:
        None
    """
    if not self.args.vocab_size:
        raise AttributeError(
            "Cannot train a new tokenizer as vocab_size is not specified in args dict. "
            "Either provide a tokenizer or specify vocab_size."
        )

    if not isinstance(train_files, list):
        train_files = [train_files]

    if not output_dir:
        output_dir = self.args.output_dir

    if self.args.model_type in ["bert", "electra"]:
        tokenizer = BertWordPieceTokenizer(
            clean_text=self.args.clean_text,
            handle_chinese_chars=self.args.handle_chinese_chars,
            strip_accents=self.args.strip_accents,
            lowercase=self.args.do_lower_case,
        )
        self.args.special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        self.args.wordpieces_prefix = "##"

        tokenizer.train(
            files=train_files,
            vocab_size=self.args.vocab_size,
            min_frequency=self.args.min_frequency,
            special_tokens=self.args.special_tokens,
            wordpieces_prefix="##",
        )
    else:
        tokenizer = ByteLevelBPETokenizer(lowercase=self.args.do_lower_case)

        tokenizer.train(
            files=train_files,
            vocab_size=self.args.vocab_size,
            min_frequency=self.args.min_frequency,
            special_tokens=self.args.special_tokens,
        )

    os.makedirs(output_dir, exist_ok=True)

    tokenizer.save_model(output_dir)
    logger.info(" Training of {} tokenizer complete. Saved to {}.".format(tokenizer_name, output_dir))

    _, _, tokenizer_class = MODEL_CLASSES[self.args.model_type]
    tokenizer = tokenizer_class.from_pretrained(output_dir)

    if use_trained_tokenizer:
        self.tokenizer = tokenizer
        self.args.tokenizer_name = output_dir
        try:
            if self.args.model_type == "electra":
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        except AttributeError:
            pass
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)

special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]

tokenizer.train(
    files=train_files,
    vocab_size=32000,
    min_frequency=2,
    special_tokens=special_tokens,
    limit_alphabet=500,
    wordpieces_prefix="##",
)

tokenizer.save_model('./vocab')

vocab_file = f'./vocab/{os.listdir("./vocab")[0]}'
print(vocab_file)

# count the lines of vocab.txt to get the actual vocabulary size
with open(vocab_file) as f:
    for vocab_size, _ in enumerate(f, 1):
        pass
print(f'Vocab size: {vocab_size}')

ELECTRA_SMALL_DEFAULT = {
    'generator_config': {
        "attention_probs_dropout_prob": 0.1,
        "embedding_size": 128,
        "hidden_act": "gelu",
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='train wordpiece tokenizer')
    parser.add_argument("--input_file", type=str, default="")
    parser.add_argument("--vocab_size", type=int, default=40000)
    parser.add_argument("--limit_alphabet", type=int,
                        default=1000)  # 30000 for chinese and similar languages
    parser.add_argument("--output_path", type=str, default="")
    args = parser.parse_args()

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=True,
                                       lowercase=False)

    tokenizer.train(
        files=[args.input_file],
        vocab_size=args.vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=args.limit_alphabet,
        wordpieces_prefix="##",
    )

    # Save files to disk
    tokenizer.save_model(args.output_path)
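Most of the snippets above stop at save_model(), which writes a plain vocab.txt. A hedged sketch of the usual next step, loading that vocabulary into transformers for downstream pre-training or fine-tuning; the path and casing flag below are placeholders and should match whatever the training script actually used:

# Sketch only; "vocab_dir/vocab.txt" is a placeholder for the directory passed to save_model().
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast(vocab_file="vocab_dir/vocab.txt", do_lower_case=False)
print(tokenizer.tokenize("a quick round-trip check"))
print(tokenizer("a quick round-trip check")["input_ids"])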