def train(args):
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=1000)
    tokenizer.save("src/dev_scripts/tokenizer.json")
class HuggingFaceTokenizer:
    def __init__(self, cache_dir, max_length=None, vocab_size=400):
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.cache_dir = cache_dir
        self.name = "%d-%s" % (vocab_size, max_length)
        self.tokenizer = None
        vocab = os.path.join(self.cache_dir, self.name + '-vocab.json')
        merges = os.path.join(self.cache_dir, self.name + '-merges.txt')
        if os.path.exists(vocab) and os.path.exists(merges):
            self.tokenizer = CharBPETokenizer(vocab, merges, lowercase=True)
            print('Using cached HuggingFaceTokenizer')

    def build(self, texts):
        if self.tokenizer is not None:
            return
        tmp_file = tempfile.NamedTemporaryFile()
        with open(tmp_file.name, "w") as f:
            f.write(' '.join(texts).lower())
        self.tokenizer = CharBPETokenizer(lowercase=True)
        self.tokenizer.train(
            [tmp_file.name],
            vocab_size=self.vocab_size,
            special_tokens=[
                NUL_token,
                PAD_token,
                BOS_token,
                UNK_token,
            ],
        )
        os.makedirs(self.cache_dir, exist_ok=True)
        self.tokenizer.save(self.cache_dir, self.name)

    def encode(self, text):
        token_ids = self.tokenizer.encode(text.lower()).ids
        token_ids = token_ids[:self.max_length]
        return token_ids

    def decode(self, tokens, skip_special_tokens=True):
        # NOTE: the custom special tokens are not skipped even with
        # skip_special_tokens=True; filtering the ids manually, e.g.
        # [token for token in tokens if token > 3], is a workaround.
        text = self.tokenizer.decode(
            tokens,
            skip_special_tokens=skip_special_tokens,
        )
        return text

    def decode_plus(self, token_batch):
        sentences = []
        for tokens in token_batch:
            sentences.append(self.decode(tokens))
        return sentences
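A minimal usage sketch for the class above. The NUL_token/PAD_token/BOS_token/UNK_token constants are not defined in the original, so the values below are assumptions, as is the cache directory; the class itself relies on the legacy two-argument save(directory, name) API.

# Hypothetical values: the original does not define these constants.
NUL_token, PAD_token, BOS_token, UNK_token = "<nul>", "<pad>", "<bos>", "<unk>"

tok = HuggingFaceTokenizer(cache_dir="cache/bpe", max_length=32, vocab_size=400)
tok.build(["hello world", "hello tokenizers"])  # trains, or reuses the cache
ids = tok.encode("Hello world")                 # lowercased, truncated to 32 ids
print(tok.decode(ids))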
def train():
    """Train a lowercase BPE tokenizer on the THYME corpus."""
    base = os.environ['DATA_ROOT']
    corpus_path = base + 'Thyme/Text/train+dev+test/*'
    files = glob.glob(corpus_path)
    tokenizer = CharBPETokenizer(lowercase=True)
    tokenizer.train(
        files=files,
        vocab_size=10000,
        min_frequency=3,
        show_progress=True,
    )
    tokenizer.save('.', name='thyme-tokenizer')
def train_subword_tokenizer(size, special_tokens, path):
    """Train a subword (BPE) tokenizer for subword encoding.

    ref: https://github.com/huggingface/tokenizers

    Args:
        size: target vocabulary size.
        special_tokens: list of special tokens; only the first three are used.
        path: path of the training corpus directory.
    """
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [path + "/corpus_all.txt"],
        vocab_size=size,
        min_frequency=2,
        show_progress=True,
        special_tokens=special_tokens[:3] + ["<unk>"],
    )
    tokenizer.save(path, "bpe")
def get_data():
    transcript_folder = os.path.join('data', 'transcripts')
    summary_folder = os.path.join('data', 'summary')
    train_files, train_result_files, test_files, test_result_files = \
        get_dataset_files(transcript_folder, summary_folder)
    train_data, train_results, test_data, test_results = \
        get_dataset(train_files, train_result_files, test_files, test_result_files)

    tokenizer = CharBPETokenizer()
    all_files = np.concatenate([train_files, train_result_files,
                                test_files, test_result_files])
    tokenizer.train(list(all_files))

    train_data = tokenize_data(tokenizer, train_data)
    test_data = tokenize_data(tokenizer, test_data)
    return train_data, train_results, test_data, test_results
def create_tokenizer_imbd(data_path, file_name, vocab_size):
    # df = pd.read_csv(os.path.join(data_path, file_name))
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [os.path.join(data_path, file_name)],
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])
    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))
    tokenizer.save(data_path, "tokenizer")
def create_tokenizer(data_path, vocab_size):
    # Train on up to 20 of the "uncased_chunk" files found in data_path
    chunk_files = [f for f in os.listdir(data_path) if "uncased_chunk" in f][:20]
    tokenizer = CharBPETokenizer()
    tokenizer.train(
        [os.path.join(data_path, f) for f in chunk_files],
        vocab_size=vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[CLS]", "[PAD]", "[MASK]", "[UNK]", "[SEP]"])
    print("[CLS]: {}, [PAD]: {}, [MASK]: {}, [UNK]: {}, [SEP]: {}".format(
        str(tokenizer.token_to_id("[CLS]")),
        str(tokenizer.token_to_id("[PAD]")),
        str(tokenizer.token_to_id("[MASK]")),
        str(tokenizer.token_to_id("[UNK]")),
        str(tokenizer.token_to_id("[SEP]"))))
    tokenizer.save(data_path, "tokenizer")
def _cbpe(self):
    tokenizer = CharBPETokenizer(
        vocab=self.conf.vocab,
        merges=self.conf.merges,
        unk_token=self.conf.cbpe_unk_token,
        suffix=self.conf.suffix,
        dropout=self.conf.dropout,
        lowercase=self.conf.lowercase,
        unicode_normalizer=self.conf.unicode_normalizer,
        bert_normalizer=self.conf.bert_normalizer,
        split_on_whitespace_only=self.conf.split_on_whitespace_only,
    )
    tokenizer.train(
        files=self.files,
        vocab_size=self.conf.vocab_size,
        min_frequency=self.conf.min_frequency,
        special_tokens=self.conf.special_tokens,
        limit_alphabet=self.conf.limit_alphabet,
        initial_alphabet=self.conf.initial_alphabet,
        suffix=self.conf.cbpe_train_suffix,
    )
    return tokenizer
class BPETokenizer:
    def __init__(self, text_list, vocab_size, lazy=False):
        if not lazy:
            self.tokenizer = CharBPETokenizer()
            self.tokenizer.train(text_list, vocab_size=vocab_size,
                                 special_tokens=[PAD, BOS, EOS, "<unk>"])
            self.tokenizer.add_special_tokens([PAD, BOS, EOS])
        else:
            self.tokenizer = None

    def tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(t) for t in tokens]

    def ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(i) for i in ids]

    def encode(self, text):
        encodes = self.tokenizer.encode(text)
        return encodes.ids

    def decode(self, ids, skip_special=True):
        return self.tokenizer.decode(ids, skip_special_tokens=skip_special)

    def save(self, path, file_name):
        self.tokenizer.save(path, file_name)

    @classmethod
    def load(cls, vocab, merges):
        tkz = cls(None, None, lazy=True)
        tkz.tokenizer = CharBPETokenizer(vocab, merges)
        tkz.tokenizer.add_special_tokens([PAD, BOS, EOS])
        return tkz

    def __len__(self):
        return self.tokenizer.get_vocab_size()
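A quick sketch of how BPETokenizer might be exercised. The PAD/BOS/EOS values, the corpus file name, and the legacy save(path, name) file layout are assumptions, not part of the original:

# Hypothetical constants and file names, for illustration only.
PAD, BOS, EOS = "<pad>", "<s>", "</s>"

bpe = BPETokenizer(["corpus.txt"], vocab_size=8000)  # trains on the listed files
ids = bpe.encode("a small test sentence")
print(bpe.decode(ids))

bpe.save(".", "demo")  # legacy API: writes demo-vocab.json and demo-merges.txt
bpe2 = BPETokenizer.load("demo-vocab.json", "demo-merges.txt")
print(len(bpe2))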
import json
import argparse

from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers, processors
from tokenizers import CharBPETokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--corpus", help="Path to text training corpus",
                    default="/home/benet/IRI/How2Sign/metadata/metadata.txt")
parser.add_argument("--saveto", help="Path where to save the model",
                    default="steps/tokenizer.json")
parser.add_argument("--size", help="Number of tokens / vocabulary size",
                    type=int, default=1000)

if __name__ == '__main__':
    args = parser.parse_args()
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=args.size)
    tokenizer.save(args.saveto)
    return bleu_score(outputs, targets)

# function to save model parameters
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

# function to load pre-saved model parameters
def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

# Common tokenizer: trained jointly on all four corpora
tokenizer = CharBPETokenizer()
tokenizer.train(["english_fr.txt", "english_lt.txt", "french.txt", "lithuanian.txt"])

# Data loading
english_lt = open("english_lt.txt", encoding="utf-8").read().split("\n")
lithuanian = open("lithuanian.txt", encoding="utf-8").read().split("\n")
english_fr = open("english_fr.txt", encoding="utf-8").read().split("\n")
french = open("french.txt", encoding="utf-8").read().split("\n")

# Create dataframes
raw_data_child = {
    "English": [line for line in english_lt],
    "Lithuanian": [line for line in lithuanian],
}
raw_data_parent = {
    "English": [line for line in english_fr],
    "French": [line for line in french],
}
class EngGerNewstest(Dataset):
    """
    The newstest 2014 dataset used for testing
    """
    def __init__(self, data_folder, rank=0, val_set=False, world_size=1,
                 seed=0, eng_to_ger=True, vocab_size=37000, MASK="<MASK>",
                 START="<START>", STOP="<STOP>", exp_name="",
                 max_context=None, batch_size=128, val_size=30000, **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `newstest2014.en`
            and a `newstest2014.de` file
        eng_to_ger: bool
            if true, the x values are returned as english ids and the y
            values are german ids. If false, then vice-versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "newstest2014.en")
        self.de_path = os.path.join(data_folder, "newstest2014.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0:
            print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0:
            print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0:
            print("Making english idxs")
        self.en_max_len = 0
        self.en_idxs = []
        self.en_lens = []
        with open(self.en_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.en_tokenizer.encode(l)
                    ids = [self.en_start_idx] + list(output.ids) \
                          + [self.en_stop_idx]
                    self.en_idxs.append(ids)
                    self.en_lens.append(len(ids))
                    if len(ids) > self.en_max_len:
                        self.en_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.en_mask_idx for i in range(self.en_max_len)]
        l = 0
        if rank == 0:
            print("Padding english idxs")
        for i in tqdm(range(len(self.en_idxs))):
            diff = self.en_max_len - len(self.en_idxs[i])
            self.en_idxs[i] = self.en_idxs[i] + mask[:diff]

        # Get German sentence lists
        if rank == 0:
            print("Making german idxs")
        self.de_max_len = 0
        self.de_idxs = []
        self.de_lens = []
        with open(self.de_path, 'r') as f:
            for i, l in tqdm(enumerate(f.readlines())):
                l = l.strip()
                if len(l) > 0:
                    output = self.de_tokenizer.encode(l)
                    ids = [self.de_start_idx] + list(output.ids) \
                          + [self.de_stop_idx]
                    self.de_idxs.append(ids)
                    self.de_lens.append(len(ids))
                    if len(ids) > self.de_max_len:
                        self.de_max_len = len(ids)
                if exp_name == "test" and i > 100:
                    break
        mask = [self.de_mask_idx for i in range(self.de_max_len)]
        if rank == 0:
            print("Padding german idxs")
        for i in tqdm(range(len(self.de_idxs))):
            diff = self.de_max_len - len(self.de_idxs[i])
            self.de_idxs[i] = self.de_idxs[i] + mask[:diff]

        if rank == 0:
            print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len
            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len
            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    # def __getitem__(self, i, l=None):
    #     if l is None:
    #         l = self.X_lens[int(i)]
    #     idxs = np.zeros(1)
    #     margin = 5
    #     while idxs.sum() < 25 and margin < 400:
    #         min_l = l - margin
    #         max_l = l + margin
    #         idxs = (self.X_lens > min_l) & (self.X_lens < max_l)
    #         margin += 5
    #     max_l = min(np.max(self.X_lens[idxs]), self.max_context)
    #     if max_l < 50: batch_size = self.batch_size
    #     elif max_l < 70: batch_size = self.batch_size // 2
    #     elif max_l < 100: batch_size = self.batch_size // 4
    #     elif max_l < 120: batch_size = self.batch_size // 8
    #     elif max_l < 140: batch_size = self.batch_size // 16
    #     elif max_l < 160: batch_size = self.batch_size // 32
    #     else: batch_size = self.batch_size // 64
    #     batch_size = max(16, batch_size)
    #     perm = np.random.permutation(idxs.sum())[:batch_size]
    #     max_l = np.max(self.X_lens[idxs][perm])
    #     x = np.asarray(self.X[idxs][perm, :max_l])
    #     max_l = np.max(self.Y_lens[idxs][perm])
    #     y = np.asarray(self.Y[idxs][perm, :max_l])
    #     return torch.LongTensor(x), torch.LongTensor(y)

    def __getitem__(self, idx):
        return torch.LongTensor(self.X[idx]), torch.LongTensor(self.Y[idx])

    def get_largest_batch(self, size_num):
        # Map a size bucket to a representative sequence length.
        # NOTE: this relies on the two-argument __getitem__(i, l) variant
        # that is commented out above; the active __getitem__ takes no
        # length hint.
        lengths = {1: 25, 2: 400, 3: 130, 4: 75, 5: 44, 6: 94, 7: 200, 8: 300}
        l = lengths.get(size_num, 10)
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.Y_tokenizer.decode(idxs)
class EngGerDataset(Dataset):
    """
    Can be english to german or german to english.
    """
    def __init__(self, data_folder, rank=0, val_set=False, world_size=1,
                 seed=0, eng_to_ger=True, vocab_size=37000, MASK="<MASK>",
                 START="<START>", STOP="<STOP>", exp_name="",
                 max_context=None, batch_size=128, val_size=30000, **kwargs):
        """
        rank: int
            the rank in the distributed training
        val_set: bool
            if true, this dataset is created as the validation set
        world_size: int
            the number of processes if using distributed training
        seed: int
            random seed
        data_folder: str
            the path to the folder that should contain a `train.en` and
            a `train.de` file
        eng_to_ger: bool
            if true, the x values are returned as english ids and the y
            values are german ids. If false, then vice-versa
        vocab_size: int
            the number of encodings for the byte-pair encoding scheme
        MASK: str
            the mask token
        START: str
            the start token
        STOP: str
            the stop token
        exp_name: str
            name of the experiment
        max_context: int
            the maximum sequence length
        val_size: int
            the number of samples to be set aside for validation
        """
        self.rank = rank
        print("rank:", self.rank)
        self.world_size = world_size
        self.val_set = val_set
        self.val_size = val_size
        self.batch_size = batch_size
        self.data_folder = os.path.expanduser(data_folder)
        self.en_path = os.path.join(data_folder, "train.en")
        self.de_path = os.path.join(data_folder, "train.de")
        self.eng_to_ger = eng_to_ger
        self.vocab_size = vocab_size
        self.MASK = MASK
        self.START = START
        self.STOP = STOP
        self.max_context = max_context
        self.en_tok_path = os.path.join(self.data_folder, "en_tokenizer")
        self.de_tok_path = os.path.join(self.data_folder, "de_tokenizer")
        self.en_arr_path = os.path.join(self.data_folder, "en_bcolz")
        self.de_arr_path = os.path.join(self.data_folder, "de_bcolz")
        self.en_lens_path = os.path.join(self.data_folder, "en_bcolz_lens")
        self.de_lens_path = os.path.join(self.data_folder, "de_bcolz_lens")

        # Train tokenizers
        if rank == 0:
            print("Tokenizing english..")
        self.en_tokenizer = CharBPETokenizer()
        if os.path.exists(self.en_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.en_tok_path)
            self.en_tokenizer = ml_utils.datas.load_tokenizer(
                self.en_tokenizer, self.en_tok_path)
        else:
            self.en_tokenizer.train([self.en_path], vocab_size=vocab_size)
            os.mkdir(self.en_tok_path)
            self.en_tokenizer.save_model(self.en_tok_path)
        self.en_tokenizer.add_special_tokens([self.MASK])
        self.en_tokenizer.add_tokens([self.START])
        self.en_tokenizer.add_tokens([self.STOP])
        self.en_mask_idx = self.en_tokenizer.token_to_id(self.MASK)
        self.en_start_idx = self.en_tokenizer.token_to_id(self.START)
        self.en_stop_idx = self.en_tokenizer.token_to_id(self.STOP)

        if rank == 0:
            print("Tokenizing german..")
        self.de_tokenizer = CharBPETokenizer()
        if os.path.exists(self.de_tok_path):
            # Load trained tokenizer
            if rank == 0:
                print("loading from pretrained tokenizer", self.de_tok_path)
            self.de_tokenizer = ml_utils.datas.load_tokenizer(
                self.de_tokenizer, self.de_tok_path)
        else:
            self.de_tokenizer.train([self.de_path], vocab_size=vocab_size)
            os.mkdir(self.de_tok_path)
            self.de_tokenizer.save_model(self.de_tok_path)
        self.de_tokenizer.add_special_tokens([self.MASK])
        self.de_tokenizer.add_tokens([self.START])
        self.de_tokenizer.add_tokens([self.STOP])
        self.de_mask_idx = self.de_tokenizer.token_to_id(self.MASK)
        self.de_start_idx = self.de_tokenizer.token_to_id(self.START)
        self.de_stop_idx = self.de_tokenizer.token_to_id(self.STOP)

        # Get English sentence lists
        if rank == 0:
            print("Making english idxs")
        if os.path.exists(self.en_arr_path):
            if rank == 0:
                print("loading from bcolz", self.en_arr_path)
            self.en_idxs = bcolz.carray(rootdir=self.en_arr_path)
            self.en_lens = bcolz.carray(rootdir=self.en_lens_path)
            self.en_max_len = self.en_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.en_idxs = self.en_idxs[:1000]
                self.en_lens = self.en_lens[:1000]
            if self.world_size > 1:
                with temp_seed(seed - rank):
                    sample_perm = np.random.permutation(len(self.en_idxs))
                if not self.val_set:
                    n_samps = (len(self.en_idxs) - self.val_size)
                    n_samps = n_samps // self.world_size
                    indices = sample_perm[rank * n_samps:(rank + 1) * n_samps]
                else:
                    indices = sample_perm[-self.val_size:]
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                    starttime = time.time()
                    self.en_idxs = self.en_idxs[indices]
                    self.en_lens = self.en_lens[indices]
                    if rank == 0:
                        print("duration:", time.time() - starttime)
                except Exception:
                    temp_idxs = []
                    temp_lens = []
                    if rank == 0:
                        print("Collecting data")
                        rnge = tqdm(indices)
                    else:
                        rnge = indices
                    for i in rnge:
                        temp_idxs.append(self.en_idxs[i])
                        temp_lens.append(self.en_lens[i])
                    self.en_idxs = np.asarray(temp_idxs)
                    self.en_lens = np.asarray(temp_lens)
                    if rank == 0:
                        print("duration:", time.time() - starttime)
        elif world_size == 1:
            self.en_max_len = 0
            self.en_idxs = []
            self.en_lens = []
            with open(self.en_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.en_tokenizer.encode(l)
                        ids = [self.en_start_idx] + list(output.ids) \
                              + [self.en_stop_idx]
                        self.en_idxs.append(ids)
                        self.en_lens.append(len(ids))
                        if len(ids) > self.en_max_len:
                            self.en_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.en_mask_idx for i in range(self.en_max_len)]
            l = 0
            if rank == 0:
                print("Padding english idxs")
            for i in tqdm(range(len(self.en_idxs))):
                diff = self.en_max_len - len(self.en_idxs[i])
                self.en_idxs[i] = self.en_idxs[i] + mask[:diff]
            if rank == 0:
                print("Saving to bcolz")
            self.en_idxs = bcolz.carray(self.en_idxs,
                                        rootdir=self.en_arr_path,
                                        dtype="int32")
            self.en_idxs.flush()
            self.en_lens = bcolz.carray(self.en_lens,
                                        rootdir=self.en_lens_path,
                                        dtype="int32")
            self.en_lens.flush()
        else:
            print("Make dataset without using multi-processing!!")
            assert False
        if max_context is not None and self.en_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.en_max_len,
                      "to", self.max_context)
            self.en_max_len = self.max_context

        # Get German sentence lists
        if rank == 0:
            print("Making german idxs")
        if os.path.exists(self.de_arr_path):
            if rank == 0:
                print("loading from bcolz", self.de_arr_path)
            self.de_idxs = bcolz.carray(rootdir=self.de_arr_path)
            self.de_lens = bcolz.carray(rootdir=self.de_lens_path)
            self.de_max_len = self.de_idxs.shape[-1]
            if exp_name == "test":
                self.val_size = 250
                self.de_idxs = self.de_idxs[:1000]
                self.de_lens = self.de_lens[:1000]
            if self.world_size > 1:
                try:
                    if rank == 0:
                        print("splitting dataset.. ", end="")
                    starttime = time.time()
                    self.de_idxs = self.de_idxs[indices]
                    self.de_lens = self.de_lens[indices]
                    if rank == 0:
                        print("duration:", time.time() - starttime)
                except Exception:
                    temp_idxs = []
                    temp_lens = []
                    try:
                        if rank == 0:
                            print("Collecting data")
                        for i in rnge:
                            temp_idxs.append(self.de_idxs[i])
                            temp_lens.append(self.de_lens[i])
                    except Exception as e:
                        print("Likely error caused by bcolz existing "
                              "for en but not de data")
                        print(e)
                        assert False
                    self.de_idxs = np.asarray(temp_idxs)
                    self.de_lens = np.asarray(temp_lens)
                    if rank == 0:
                        print("duration:", time.time() - starttime)
        else:
            self.de_max_len = 0
            self.de_idxs = []
            self.de_lens = []
            with open(self.de_path, 'r') as f:
                for i, l in tqdm(enumerate(f.readlines())):
                    l = l.strip()
                    if len(l) > 0:
                        output = self.de_tokenizer.encode(l)
                        ids = [self.de_start_idx] + list(output.ids) \
                              + [self.de_stop_idx]
                        self.de_idxs.append(ids)
                        self.de_lens.append(len(ids))
                        if len(ids) > self.de_max_len:
                            self.de_max_len = len(ids)
                    if exp_name == "test" and i > 100:
                        break
            mask = [self.de_mask_idx for i in range(self.de_max_len)]
            if rank == 0:
                print("Padding german idxs")
            for i in tqdm(range(len(self.de_idxs))):
                diff = self.de_max_len - len(self.de_idxs[i])
                self.de_idxs[i] = self.de_idxs[i] + mask[:diff]
            if rank == 0:
                print("Saving to bcolz")
            self.de_idxs = bcolz.carray(self.de_idxs,
                                        rootdir=self.de_arr_path,
                                        dtype="int32")
            self.de_idxs.flush()
            self.de_lens = bcolz.carray(self.de_lens,
                                        rootdir=self.de_lens_path,
                                        dtype="int32")
            self.de_lens.flush()
        if max_context is not None and self.de_max_len > max_context:
            if rank == 0:
                print("Truncating context from", self.de_max_len,
                      "to", self.max_context)
            self.de_max_len = self.max_context

        if rank == 0:
            print("Converting to numpy arrays")
        if self.eng_to_ger:
            self.X = np.asarray(self.en_idxs)
            self.X_lens = np.asarray(self.en_lens)
            self.X_tokenizer = self.en_tokenizer
            self.X_mask_idx = self.en_mask_idx
            self.X_start_idx = self.en_start_idx
            self.X_stop_idx = self.en_stop_idx
            self.X_max_len = self.en_max_len
            self.Y = np.asarray(self.de_idxs)
            self.Y_lens = np.asarray(self.de_lens)
            self.Y_tokenizer = self.de_tokenizer
            self.Y_mask_idx = self.de_mask_idx
            self.Y_start_idx = self.de_start_idx
            self.Y_stop_idx = self.de_stop_idx
            self.Y_max_len = self.de_max_len
        else:
            self.X = np.asarray(self.de_idxs)
            self.X_lens = np.asarray(self.de_lens)
            self.X_tokenizer = self.de_tokenizer
            self.X_mask_idx = self.de_mask_idx
            self.X_start_idx = self.de_start_idx
            self.X_stop_idx = self.de_stop_idx
            self.X_max_len = self.de_max_len
            self.Y = np.asarray(self.en_idxs)
            self.Y_lens = np.asarray(self.en_lens)
            self.Y_tokenizer = self.en_tokenizer
            self.Y_mask_idx = self.en_mask_idx
            self.Y_start_idx = self.en_start_idx
            self.Y_stop_idx = self.en_stop_idx
            self.Y_max_len = self.en_max_len

    def __len__(self):
        return len(self.en_idxs)

    def __getitem__(self, i, l=None):
        if l is None:
            l = self.X_lens[int(i)]
        # Collect a batch of sequences whose lengths are close to l
        idxs = np.zeros(1)
        margin = 5
        while idxs.sum() < 25 and margin < 400:
            min_l = l - margin
            max_l = l + margin
            idxs = (self.X_lens > min_l) & (self.X_lens < max_l)
            margin += 5
        max_l = min(np.max(self.X_lens[idxs]), self.max_context)
        # Shrink the batch size as the sequence length grows
        if max_l < 50:
            batch_size = self.batch_size
        elif max_l < 70:
            batch_size = self.batch_size // 2
        elif max_l < 100:
            batch_size = self.batch_size // 4
        elif max_l < 120:
            batch_size = self.batch_size // 8
        elif max_l < 140:
            batch_size = self.batch_size // 16
        elif max_l < 160:
            batch_size = self.batch_size // 32
        else:
            batch_size = self.batch_size // 64
        batch_size = max(16, batch_size)
        perm = np.random.permutation(idxs.sum())[:batch_size]
        max_l = np.max(self.X_lens[idxs][perm])
        x = np.asarray(self.X[idxs][perm, :max_l])
        max_l = np.max(self.Y_lens[idxs][perm])
        y = np.asarray(self.Y[idxs][perm, :max_l])
        return torch.LongTensor(x), torch.LongTensor(y)

    def get_largest_batch(self, size_num):
        # Map a size bucket to a representative sequence length,
        # then build a batch around it.
        lengths = {1: 25, 2: 400, 3: 130, 4: 75, 5: 44, 6: 94, 7: 200, 8: 300}
        l = lengths.get(size_num, 10)
        return self.__getitem__(0, l)

    def X_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.X_tokenizer.decode(idxs)

    def Y_idxs2tokens(self, idxs):
        """
        idxs: LongTensor (N,)
            converts an array of tokens to a sentence
        """
        return self.Y_tokenizer.decode(idxs)
from tokenizers import CharBPETokenizer

tokenizer_code = CharBPETokenizer()
tokenizer_doc = CharBPETokenizer()

# tokenizer.train([list of files to learn tokenizer from], vocab_size)
# change file locations if the training files are elsewhere
tokenizer_code.train([
    "ncs_preprocessed_data/train-ncs/code.original_subtoken",
    "ncs_preprocessed_data/dev/code.original_subtoken"
], vocab_size=100000)
tokenizer_doc.train([
    "ncs_preprocessed_data/train-ncs/javadoc.original",
    "ncs_preprocessed_data/dev/javadoc.original"
], vocab_size=100000)

print(tokenizer_code.get_vocab())
print(tokenizer_doc.get_vocab())

# use the trained tokenizer_code to encode and write into output_file
file_dir = "data/ncs_preprocessed_data/train-CoDesc/"
src_file_name = "code.original_subtoken"
tgt_file_name = "code.bpe"

# truncate any existing output file, then reopen it for appending
output_file = open(file_dir + "/" + tgt_file_name, "w")
output_file.close()
output_file = open(file_dir + "/" + tgt_file_name, "a")
with open(file_dir + "/" + src_file_name, 'r') as file:
    # ('../youtube-speech-text/preprocessed_english_meta.csv', 'Normalized Transcription')
]

if not os.path.exists('raw_corpus.txt'):
    with open('raw_corpus.txt', 'w') as f:
        for csv_filename, col_name in caption_texts:
            texts = list(pd.read_csv(csv_filename)[col_name])
            for t in texts:
                t = t.replace('<eos>', '')
                f.write(t + '\n')

tokenizer = CharBPETokenizer(lowercase=True)
tokenizer.train(
    ["raw_corpus.txt"],
    vocab_size=1000,
    min_frequency=2,
    special_tokens=[
        "<blank>",
        "<bos>",
        "<unk>",
    ],
)
# os.makedirs('./BPE-1000', exist_ok=True)
tokenizer.save('./BPE-1000', '')

tokenizer = CharBPETokenizer('./BPE-1000/-vocab.json', './BPE-1000/-merges.txt')
# with open('.test.pkl', 'w') as f:
#     pickle.dump(tokenizer, f)

tokenizer = HuggingFaceTokenizer()
print(
files = glob.glob(args.files)
if not files:
    logger.info(f'File does not exist: {args.files}')
    exit(1)

# Initialize an empty tokenizer (no constructor arguments needed here)
tokenizer = CharBPETokenizer()

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<unk>'],
    suffix='</w>',
    limit_alphabet=args.limit_alphabet,
)

# Save the files
tokenizer.save(args.out, args.name)

# Restore the model from the learned vocab/merges
tokenizer = CharBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
)

# Test encoding
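The script stops at the `# Test encoding` comment; a plausible continuation is sketched below (the sample sentence is an assumption):

# Sample sentence is illustrative only.
output = tokenizer.encode("This is a test sentence.")
print(output.tokens)
print(output.ids)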
def train(args):
    tokenizer = CharBPETokenizer()
    tokenizer.train([args.corpus], vocab_size=args.size)
    tokenizer.save(args.output_file)
from tokenizers import CharBPETokenizer
import os
import torch
from transformers import RobertaConfig

paths = ['data/train.txt']

# Initialize a tokenizer
tokenizer = CharBPETokenizer(split_on_whitespace_only=True)

# Customize training
tokenizer.train(files=paths, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])

os.makedirs('./tokenizer/Charbpetokenizer', exist_ok=True)
tokenizer.save_model('./tokenizer/Charbpetokenizer')

tokenizer = CharBPETokenizer(
    "./tokenizer/Charbpetokenizer/vocab.json",
    "./tokenizer/Charbpetokenizer/merges.txt",
)

config = RobertaConfig(
    vocab_size=12531,
    max_position_embeddings=130,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)
# coding:utf-8
from tokenizers import CharBPETokenizer
from pathlib import Path

# Initialize a tokenizer
tokenizer = CharBPETokenizer()

# Then train it!
tokenizer.train(["./data/wiki_sunyang.txt"])

# And you can use it
encoded = tokenizer.encode(
    "In 2012, Sun became the first Chinese man to win an Olympic gold medal in swimming."
)
# print(encoded.tokens)

# And finally save it somewhere
saved_path = Path("./saved_tokenizer/wiki_sunyang")
saved_path.mkdir(exist_ok=True, parents=True)
tokenizer.save(str(saved_path))
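For completeness, a reload sketch continuing the script above; the vocab.json/merges.txt file names assume what the legacy save(directory) call writes, which varies by tokenizers version:

# Assumed layout: vocab.json and merges.txt inside saved_path.
reloaded = CharBPETokenizer(
    str(saved_path / "vocab.json"),
    str(saved_path / "merges.txt"),
)
print(reloaded.decode(encoded.ids))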
def main():
    batch_size = 4
    vocab_size = 16384
    max_source_length = 1024
    max_target_length = 1024
    num_workers = 3

    dataset = nlp.load_dataset("iwslt2017.py", "nl-en")

    # Train tokenizer
    tokenizer_filename = "tokenizer.json"
    if os.path.exists(tokenizer_filename):
        tokenizer = Tokenizer.from_file(tokenizer_filename)
    else:
        data_filename = "whole_data.txt"
        with open(data_filename, "w") as f:
            for item in dataset["train"]:
                f.write(item["source"] + "\n")
                f.write(item["target"] + "\n\n")
        tokenizer = CharBPETokenizer()
        tokenizer.train([data_filename], vocab_size=vocab_size)
        pad_token = AddedToken("[PAD]", lstrip=False, rstrip=False)
        tokenizer.add_tokens([pad_token])
        tokenizer.save(tokenizer_filename)
    tokenizer.pad_token_id = vocab_size

    # Loaders
    train_dataset = Seq2SeqDataset(tokenizer, dataset["train"],
                                   max_source_length, max_target_length)
    val_dataset = Seq2SeqDataset(tokenizer, dataset["validation"],
                                 max_source_length, max_target_length)
    train_loader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
        collate_fn=train_dataset.collate_fn,
        num_workers=num_workers,
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=batch_size,
        collate_fn=val_dataset.collate_fn,
        num_workers=num_workers,
    )

    # Train model
    config = BartConfig(
        vocab_size=vocab_size + 1,  # +1 for the added [PAD] token
        d_model=1024,
        encoder_ffn_dim=1024,
        encoder_layers=6,
        encoder_attention_heads=4,
        decoder_ffn_dim=1024,
        decoder_layers=6,
        decoder_attention_heads=4,
    )
    model = BartForConditionalGeneration(config)
    translator = Translate(model, tokenizer)
    trainer = pl.Trainer(gpus=1)
    trainer.fit(translator, train_loader, val_loader)
from tokenizers import CharBPETokenizer
import json
import tqdm

if __name__ == "__main__":
    # Initialize a tokenizer
    tokenizer = CharBPETokenizer()

    # Then train it!
    tokenizer.train(
        [
            "data\\train.txt",
            "D:/数据/wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw",
            "D:/数据/webtext2019zh/web_text_raw.txt"
        ],
        vocab_size=30000,
        min_frequency=2,
        special_tokens=['<UNK>', '<BOS>', '<EOS>', '<PAD>', '<CLS>', '<SEP>'])

    # Now, let's use it:
    encoded = tokenizer.encode("I can feel the magic, can you?")

    # And finally save it somewhere
    tokenizer.save("./", "bpe.tokenizer.json")
# function to save model parameters
def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)

# function to load pre-saved model parameters
def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

# Separate tokenizers training: one joint and one per language
tokenizer = CharBPETokenizer()
tokenizer.train(["english_lt.txt", "lithuanian.txt"])

eng_tokenizer = CharBPETokenizer()
eng_tokenizer.train(["english_lt.txt"])

lt_tokenizer = CharBPETokenizer()
lt_tokenizer.train(["lithuanian.txt"])

# Data loading
english_txt = open("english_lt.txt", encoding="utf-8").read().split("\n")
lithuanian_txt = open("lithuanian.txt", encoding="utf-8").read().split("\n")

# Create dataframe
raw_data = {
    "English": [line for line in english_txt],
    "Lithuanian": [line for line in lithuanian_txt],
}
df = pd.DataFrame(raw_data, columns=["English", "Lithuanian"])