def _create_train_files_and_regenerate_vocab():
    """Shard ``train.txt`` and train a byte-level BPE vocabulary on the shards.

    Each step runs only when the preceding shell command reported success
    (``r.ok``):

    1. ``split`` cuts ``train.txt`` into 1,000,000-line pieces.
    2. ``td/`` is recreated from scratch and six pieces are moved into it.
    3. A ``ByteLevelBPETokenizer`` is trained on ``td/*.txt`` and saved into
       a freshly recreated ``codeBERT/`` directory.

    Returns:
        None
    """
    print("pass")
    r = run("split -l1000000 train.txt --verbose")
    if not r.ok:
        return
    print("Train splits generated")

    # Start from an empty shard directory.
    try:
        shutil.rmtree("td")
    except FileNotFoundError:
        pass
    os.mkdir("td")

    # The moves are independent commands, so chain them with `&&` rather than
    # `|`: a pipe wires unrelated stdout/stdin together and reports only the
    # exit status of the last command, hiding earlier failures.
    # NOTE(review): the target name "xbb.txt" for piece "xab" is kept as-is.
    r = run(
        "mv xaa td/xaa.txt && mv xab td/xbb.txt && mv xac td/xac.txt && "
        "mv xad td/xad.txt && mv xae td/xae.txt && mv xaf td/xaf.txt"
    )
    if not r.ok:
        return

    paths = [str(x) for x in Path(".").glob("td/*.txt")]
    tokenizer = ByteLevelBPETokenizer()
    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

    # Recreate the output directory before saving.
    try:
        shutil.rmtree("codeBERT")
    except FileNotFoundError:
        pass
    os.mkdir("codeBERT")
    tokenizer.save("codeBERT")
def create_tokenizer(args):
    """Train a byte-level BPE tokenizer on ``args.file`` and store it.

    Reads ``args.file`` (training corpus), ``args.vocab_size`` and
    ``args.store_files`` (output location). After the tokenizer is saved, a
    ``tokenizer_config.json`` holding ``{"max_len": 512}`` is written under
    the same location.
    """
    # Directory for storing
    directory = args.store_files
    reserved = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=[args.file],
        vocab_size=args.vocab_size,
        min_frequency=2,
        special_tokens=reserved,
    )

    # Save files to disk
    bpe.save(directory)

    config_file = "{}/tokenizer_config.json".format(directory)
    with open(config_file, 'w') as fp:
        json.dump({"max_len": 512}, fp)
def build_tokenizer(data_path, save_path):
    r"""
    Train a byte-level BPE tokenizer on a corpus and write it to disk.

    Args:
        data_path (:obj:`str`): Path to the data corpus, forwarded as
            ``files`` to the trainer.
        save_path (:obj:`str`): Destination handed to ``tokenizer.save``.
    """
    reserved = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=data_path,
        vocab_size=52000,
        min_frequency=2,
        special_tokens=reserved,
    )
    bpe.save(save_path)
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a byte-level BPE tokenizer on text file(s), wrapping the
    tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer ("" means the
        current directory)
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    :param serialize: If True, write a single ``aitextgen.tokenizer.json``
        file; otherwise write the vocab/merges pair via ``save_model``.
    :raises AssertionError: if ``files`` is neither a str nor a list, or
        ``added_tokens`` is not a list.
    """
    # Local import so the block does not depend on a module-level `import os`.
    import os

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    # The shared `[]` default is never mutated here, so it is safe to keep.
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    # Reserve room for the tokens appended after training so the final
    # vocabulary ends up at `vocab_size`.
    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        # Honor save_path here as well: previously the file always landed in
        # the current directory, contradicting the message logged above.
        # os.path.join("", name) == name, so the default is unchanged.
        tokenizer.save(os.path.join(save_path, f"{PREFIX}.tokenizer.json"))
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
def train_tokenizer(self,
                    train_files,
                    tokenizer_name=None,
                    output_dir=None,
                    use_trained_tokenizer=True):
    """
    Train a fresh byte-level BPE tokenizer on `train_files` and save it.

    Args:
    - train_files: A file, or list of files, used to train the tokenizer.
    - tokenizer_name: Only used in the completion log message.
    - output_dir (optional): Directory the tokenizer files are written to.
      Falls back to self.args['output_dir'] when falsy.
    - use_trained_tokenizer (optional): When true, install the reloaded
      tokenizer on `self`, record its location in self.args and try to
      resize the model's token embeddings to match.

    Returns: None
    """
    files = train_files if isinstance(train_files, list) else [train_files]
    target_dir = output_dir or self.args["output_dir"]

    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=files,
        vocab_size=self.args["vocab_size"],
        min_frequency=self.args["min_frequency"],
        special_tokens=self.args["special_tokens"],
    )

    os.makedirs(target_dir, exist_ok=True)
    bpe.save(target_dir)
    logger.info(" Training of {} tokenizer complete. Saved to {}.".format(
        tokenizer_name, target_dir))

    # Reload through the tokenizer class registered for the model type.
    _, _, tokenizer_cls = MODEL_CLASSES[self.args["model_type"]]
    reloaded = tokenizer_cls.from_pretrained(target_dir)

    if not use_trained_tokenizer:
        return

    self.tokenizer = reloaded
    self.args["tokenizer_name"] = target_dir
    try:
        # Unwrap a `.module` wrapper if the model has one.
        resizable = getattr(self.model, "module", self.model)
        resizable.resize_token_embeddings(len(self.tokenizer))
    except AttributeError:
        # Best effort: no model (or no resize support) available.
        pass
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    prefix: str = "aitextgen",
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = True,
    trim_offsets: bool = True,
) -> None:
    """
    Trains a byte-level BPE tokenizer on text file(s), wrapping the
    tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param prefix: File name prefix of the final tokenizer
    :param save_path: Where to save the final tokenizer ("" means the
        current directory)
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    :param serialize: If True, write a single ``<prefix>.tokenizer.json``
        file; otherwise write the vocab/merges pair via ``save_model``.
    :param trim_offsets: Forwarded to ``ByteLevelBPETokenizer``.
    :raises AssertionError: if ``files`` is neither a str nor a list, or
        ``added_tokens`` is not a list.
    """
    # Local import so the block does not depend on a module-level `import os`.
    import os

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    # The shared `[]` default is only read (concatenated), never mutated.
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout,
                                      trim_offsets=trim_offsets)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size,
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token] + added_tokens,
    )

    if serialize:
        # Honor save_path in this branch too; it used to be ignored, so the
        # JSON file always landed in the current directory.
        # os.path.join("", name) == name, so the default is unchanged.
        tokenizer.save(os.path.join(save_path, f"{prefix}.tokenizer.json"))
    else:
        tokenizer.save_model(save_path, prefix)
def save_sentense_piece_model():
    """Train a byte-level BPE tokenizer on every ``.txt`` under ``./data/``.

    The discovered paths are printed, the tokenizer is trained with a
    32000-entry vocabulary, and the result is saved to the current
    directory under the name ``ko``.
    """
    corpus_files = list(map(str, Path("./data/").glob("**/*.txt")))
    print(corpus_files)

    reserved = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=corpus_files,
        vocab_size=32000,
        min_frequency=2,
        special_tokens=reserved,
    )
    bpe.save(".", "ko")
def save_sentense_piece_model():
    """Train and save Korean, then English, byte-level BPE vocabularies.

    The same tokenizer object is trained first on the Korean files and then
    on the English files; after each training run it is saved to
    ``./create_spm`` under the language code.
    """
    corpora = (
        ("ko", ['./data/korean-english-park.dev.ko',
                './data/korean-english-park.train.ko']),
        ("en", ['./data/korean-english-park.dev.en',
                './data/korean-english-park.train.en']),
    )
    reserved = ["<pad>", "<bos>", "<eos>", "<unk>", "<mask>"]

    bpe = ByteLevelBPETokenizer()
    for lang_code, lang_files in corpora:
        bpe.train(
            files=lang_files,
            vocab_size=32000,
            min_frequency=2,
            special_tokens=reserved,
        )
        bpe.save("./create_spm", lang_code)
def tokenize(filename, vocab_size):
    """Train a byte-level BPE tokenizer on ``filename`` and return it.

    The only special token is ``<|endoftext|>``. The trained tokenizer is
    also saved to the location named by ``corpus``.
    """
    reserved = ['<|endoftext|>']
    bpe = ByteLevelBPETokenizer()
    bpe.train(
        files=filename,
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=reserved,
    )
    # NOTE(review): `corpus` is not a parameter or local of this function;
    # presumably a module-level name defined elsewhere — confirm.
    bpe.save(corpus)
    return bpe
def main(args):
    """Train a byte-level BPE tokenizer and export it in three files.

    Reads ``args.input`` (colon-separated training file paths),
    ``args.vocab_size``, ``args.min_freq`` and ``args.name``. Writes:

    * ``<name>.json``        - the full serialized tokenizer,
    * ``<name>-vocab.json``  - the model vocabulary,
    * ``<name>-merges.txt``  - one merge rule per line.
    """
    paths = args.input.split(":")

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=args.vocab_size,
        min_frequency=args.min_freq,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>"],
    )

    # Save files to disk
    tokenizer.save("{}.json".format(args.name), pretty=True)
    model_spec = json.loads(tokenizer.to_str())["model"]

    # Byte-level BPE symbols are non-ASCII, so never rely on the platform's
    # default encoding for these text files.
    with open("{}-vocab.json".format(args.name), "w", encoding="utf-8") as fp:
        json.dump(model_spec["vocab"], fp, indent=4)

    # Depending on the tokenizers version a merge is serialized either as
    # the string "a b" or as the pair ["a", "b"]; accept both.
    merge_lines = [
        merge if isinstance(merge, str) else " ".join(merge)
        for merge in model_spec["merges"]
    ]
    with open("{}-merges.txt".format(args.name), "w", encoding="utf-8") as fp:
        fp.write("\n".join(merge_lines))
def get_french_vocab(model_name):
    """Train a 20000-entry byte-level BPE vocabulary on the French corpus.

    The corpus is expected at ``Datasets/corpora/fr/text`` three levels
    above the current working directory, organised as sub-directories of
    text files. A sample sentence is encoded and printed, then the
    tokenizer is saved to ``model_name``.
    """
    root = Path(os.getcwd()).parent.parent.parent
    fr_corpus_path = os.path.join(root, "Datasets/corpora/fr/text")

    # Every file inside every sub-directory of the corpus root.
    files = [
        os.path.join(fr_corpus_path, sub_dir, file_name)
        for sub_dir in os.listdir(fr_corpus_path)
        for file_name in os.listdir(os.path.join(fr_corpus_path, sub_dir))
    ]

    bpe = ByteLevelBPETokenizer(add_prefix_space=True)
    bpe.pre_tokenizer = Whitespace()
    bpe.train(
        files,
        vocab_size=20000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["<sos>", "<pad>", "<eos>", "<unk>"],
    )

    sample = bpe.encode("c'est la meilleure des phrases françaises")
    print(sample.tokens)
    bpe.save(model_name)
else: print(files) gpt2_tok.train( files=files, vocab_size=args.vocab_size, show_progress=True, special_tokens=["<|endoftext|>", "<s>", "<pad>", "</s>"], ) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) gpt2_tok.save( os.path.join(args.output_dir,"tokenizer.json"), pretty=True ) # FIX Access is denied. (os error 5) gpt2_tok.save_model(args.output_dir, args.output_file_name) # tokenizer = GPT2TokenizerFast( # vocab_file=os.path.join(args.output_dir, args.output_file_name) + "-vocab.json", # merges_file=os.path.join(args.output_dir, args.output_file_name) # + "-merges.txt", # add_prefix_space=True, # ) # tokenizer.add_special_tokens( # { # "eos_token": "<|endoftext|>", # "bos_token": "<|endoftext|>", # "unk_token": "<|endoftext|>",
# For every extracted paper text, keep only the part that precedes the
# abstract heading, then train a small lowercase BPE tokenizer on the results.
for file_path in tqdm.tqdm(
        glob.glob("../../data/txts/*.txt") +
        glob.glob("../../data/txts2/*.txt")):
    paper_name = file_path.split("/")[-1].replace(".pdf.txt", "")
    out_path = f"../../data/pre_abstract_txts/{paper_name}.txt"

    # Skip papers that were already processed.
    if os.path.exists(out_path):
        continue

    with open(file_path) as file:
        text = file.read()

    abstract = None
    # Matches of the pattern inside the title itself must be skipped in the
    # body text before the real abstract heading is reached.
    mention_count = len(abstract_re.findall(papers[paper_name]["title"]))
    if mention_count > 0:
        mentions = list(abstract_re.finditer(text))
        # Index `mention_count` only exists when there are strictly more
        # matches than title mentions (`>=` raised IndexError on equality).
        if len(mentions) > mention_count:
            abstract = mentions[mention_count]
    else:
        abstract = abstract_re.search(text)

    if abstract is None:
        continue

    with open(out_path, "w") as file:
        file.write(text[:abstract.start()])

files = glob.glob("../../data/pre_abstract_txts/*.txt")

tokenizer = ByteLevelBPETokenizer(lowercase=True)
tokenizer.train(files, vocab_size=2500, special_tokens=["[PAD]"])
tokenizer.save("tokenizer")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Train a byte-level BPE tokenizer on data/src-train.txt for RoBERTa."""
import os

from tokenizers import ByteLevelBPETokenizer

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
vocab_size = 50265
path = 'data/src-train.txt'

# Use the named constant so the configured size and the trained size cannot
# drift apart (the call used to repeat the literal).
tokenizer.train(files=path,
                vocab_size=vocab_size,
                min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Save files to disk
directory = "models/roberta"
# exist_ok avoids the check-then-create race of exists() + makedirs().
os.makedirs(directory, exist_ok=True)
tokenizer.save(directory)
from tokenizers import ByteLevelBPETokenizer path = "roberta_test/train.txt" #plwiki # Initialize a tokenizer tokenizer = ByteLevelBPETokenizer() # Customize training tokenizer.train(files=path, vocab_size=50265, min_frequency=5, special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]) tokenizer.save("roberta_test/tokenizer.json") import json config = { "architectures": [ "RobertaForMaskedLM" ], "attention_probs_dropout_prob": 0.1, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 768, "initializer_range": 0.02, "intermediate_size": 3072, "layer_norm_eps": 1e-05, "max_position_embeddings": 514, "model_type": "roberta",
alphabet = pre_tokenizers.ByteLevel.alphabet()  # 256 chars
# logging formats lazily with `msg % args`; without a %s placeholder the
# extra argument triggers a formatting error instead of being printed.
logger.info(
    'Initial alphabet for ByteLevel BPE as defined in pre_tokenizers.ByteLevel.alphabet(): %s',
    alphabet)

# And then train
tokenizer.train(
    files,
    vocab_size=args.vocab_size,
    min_frequency=2,
    show_progress=True,
    special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'],
)

# Save the files
tokenizer.save(args.out, args.name)

# Restoring model from learned vocab/merges
tokenizer = ByteLevelBPETokenizer(
    join(args.out, '{}-vocab.json'.format(args.name)),
    join(args.out, '{}-merges.txt'.format(args.name)),
    add_prefix_space=True,
)

# Test encoding
logger.info(
    'Tokens and their ids from ByteLevelBPETokenizer with GFP protein sequence: \n MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT'
)
encoded = tokenizer.encode('MSKGEE LFTGVVPILVELDGDVNGHKFSVSGEGEG DAT',
                           pad_to_max_length=True)
logger.info(encoded.tokens)
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

clean_wiki_text = "/home/rohola/codes/persian_transformer/clean_wiki_text_txt"

# "**/*" matches directories as well as files; only regular files can be
# handed to the trainer, so filter the directories out.
paths = [
    str(x) for x in Path(clean_wiki_text).glob("**/*") if x.is_file()
]

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths,
                vocab_size=52000,
                min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save("models/faberto", "faberto")
# Command-line options: where the corpus lives, how many files to use,
# where to save, the vocabulary size and the special "control code" tokens.
parser.add_argument('--train_path', type=str)
parser.add_argument('--n_files', type=int)
parser.add_argument('--save_path', type=str)
parser.add_argument('--vocab_size', type=int)
parser.add_argument('--control_codes', nargs='+', default=['<|endoftext|>'])
args = parser.parse_args()

# A directory contributes all of its entries; anything else is one file.
paths = (glob.glob(os.path.join(args.train_path, '*'))
         if os.path.isdir(args.train_path) else [args.train_path])
# Keep at most n_files entries (a None limit keeps everything).
paths = paths[:args.n_files]

tok = ByteLevelBPETokenizer()
tok.train(
    files=paths,
    vocab_size=args.vocab_size,
    special_tokens=args.control_codes,
)
tok.save(args.save_path)

# Companion config file written under the save path.
tokenizer_config = {"max_len": 1024}
with open(os.path.join(args.save_path, "tokenizer_config.json"), 'w') as fp:
    json.dump(tokenizer_config, fp)
# Collect every HTML file of both classes. Each name is joined to the
# directory `walk` is currently visiting (not to the top folder) so files in
# nested sub-directories get a valid path.
for (dirpath, _, names) in walk(labeledDataFolder + "/legitimate_htmls"):
    files.extend([dirpath + "/" + name for name in names])
for (dirpath, _, names) in walk(labeledDataFolder + "/phishing_htmls"):
    files.extend([dirpath + "/" + name for name in names])
print("Total number of html files: %d\n" % len(files))

# Writing data, one html file per line. This is the format the tokenizer expects
print("Writing html data into a single file...")
count = 0
# Context managers guarantee both the output file and every input file are
# closed, even if reading one of the inputs fails.
with open("tokenizer/htmlCodePerLine.txt", "w") as output:
    for count, file in enumerate(files, start=1):
        print("Files processed: %d, Total files: %d" % (count, len(files)))
        with io.open(file, "r", errors="ignore") as handle:
            fileData = handle.read()
        # Flatten the document onto a single line.
        fileData = fileData.replace("\n", " ")
        output.write(fileData + "\n")

# Starting tokenization
print("\nStarting tokenization with BPE")
tokenizer = ByteLevelBPETokenizer()
tokenizer.train("tokenizer/htmlCodePerLine.txt",
                min_frequency=minFrequency,
                vocab_size=vocabSize)
print(
    "Vocabulary size is: %d\nNOTE: Sometimes, the vocab size might not be equal to the input 'vocab_size'\n"
    % (tokenizer.get_vocab_size()))
tokenizer.save("tokenizer", "tokenizer.tok")
print("Tokenizer files have been saved in 'tokenizer' directory...")
from transformers import GPT2Tokenizer

if __name__ == '__main__':
    # Train a byte-level BPE tokenizer on WikiText-103, save it, then reload
    # it as a GPT2Tokenizer and print the ids of a short sample.

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
    HOME = os.environ['HOME']
    data_file = HOME + '/data/wikitext-103-raw/wiki.train.raw'
    # data_file ='/tmp/wikitext-2-raw/wiki.train.raw'
    tokenizer.train(files=[data_file],
                    vocab_size=20_000,
                    min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer_name = "Tokenizer"
    # exist_ok lets the script be re-run without failing on the directory.
    os.makedirs(tokenizer_name, exist_ok=True)
    tokenizer.save(tokenizer_name)

    # Only the first 100 characters are needed, so do not load the whole
    # corpus into memory.
    with open(data_file, encoding="utf-8") as f:
        text = f.read(100)

    # Reload from the directory that was just written.
    tok = GPT2Tokenizer.from_pretrained(tokenizer_name)
    x = tok.convert_tokens_to_ids(tok.tokenize(text))
    y = tok.build_inputs_with_special_tokens(x)
    print(x)
    print(y)
# train tokenizer _pretty_print("Training tokenizer") bpe_tokenizer.train( [input_path, input_path_val], vocab_size=vocab_size, min_frequency=min_freq, special_tokens=[ "<s>", "<pad>", "</s>", "<unk>", "<mask>", "<sep>", "<cls>" ], ) # save tokenizer tok_path = os.path.join(output_path, "tokenizer") tok_path_file = os.path.join(tok_path, "vocab.json") os.makedirs(tok_path, exist_ok=True) # bpe_tokenizer.save_model(tok_path) bpe_tokenizer.save(tok_path_file, True) # load tokenizer with Roberta configuration bpe_tokenizer = PreTrainedTokenizerFast( tokenizer_file=tok_path_file, max_length=max_len, lowercase=True, unk_token="<unk>", sep_token="<sep>", pad_token="<pad>", cls_token="<cls>", mask_token="<mask>", bos_token="<s>", eos_token="</s>", ) # bpe_tokenizer = FunnelTokenizerFast(
#! pip install tokenizers
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Every .txt file below ./data/ (searched recursively) is training input.
paths = list(map(str, Path("./data/").glob("**/*.txt")))

# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Save files to disk
tokenizer.save("litberta")
def architecture_search(process_id):
    """Random search over tokenizer size, LSTM sizes and casing.

    For each randomly ordered combination a byte-level BPE tokenizer is
    trained, an ``LSTMTagger`` is trained with early stopping on test
    accuracy, and whenever a run beats the best accuracy seen so far its
    weights and tokenizer are saved under ``checkpoints/<process_id+1>/`` and
    ``tokenizer/<process_id+1>/``.

    Args:
        process_id: Used (plus one) as the name of the per-process output
            directories.

    NOTE(review): SOURCE is whitespace-collapsed, so the nesting shown here
    is reconstructed from the statement order — confirm against the original.
    """
    # Both calls raise if the directory already exists (no exist_ok).
    os.makedirs(f"checkpoints/{process_id+1}")
    os.makedirs(f"tokenizer/{process_id+1}")
    files = glob.glob("../../data/pre_abstract_txts/*.txt")
    # Search space.
    tok_sizes = list(range(100, 2000, 100))
    hidden_sizes = list(range(12, 300, 12))
    emb_sizes = list(range(10, 250, 10))
    cased = [True, False]
    batch_size = 1
    results = {}
    choices = list(itertools.product(tok_sizes, hidden_sizes, emb_sizes,
                                     cased))
    random.shuffle(choices)
    best_acc = -np.inf
    while len(choices) > 0:
        # `cased` is rebound from the list of options to this run's value.
        tok_size, hidden_size, emb_size, cased = choices.pop()
        print(tok_size, hidden_size, emb_size, cased)
        # NOTE(review): the value named `cased` is passed as `lowercase` —
        # confirm the intended polarity.
        tokenizer = ByteLevelBPETokenizer(lowercase=cased)
        tokenizer.train(files, vocab_size=tok_size, special_tokens=["[PAD]"])
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        dataset = TextDataset(data_dir="../../data/pre_abstract_txts",
                              labels_dir="../../data/pre_abstract_labels",
                              device=device,
                              tokenizer=tokenizer,
                              batch_size=batch_size)
        test_dataset = TextDataset(
            data_dir="../../data/pre_abstract_txts",
            labels_dir="../../data/pre_abstract_labels_test",
            device=device,
            tokenizer=tokenizer,
            batch_size=batch_size)
        model = LSTMTagger(vocab_size=tokenizer.get_vocab_size(),
                           embedding_dim=emb_size,
                           lstm_dim=hidden_size,
                           dropout=0,
                           n_classes=len(dataset.classes)).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters())
        # optimizer = torch.optim.SGD(model.parameters(), momentum=0.9, nesterov=True, lr=v)
        epoch = 0
        # Remaining number of non-improving epochs tolerated before stopping.
        n = 3
        test_acc = -np.inf
        log_interval = 10  # all n batches
        # Snapshot of the best weights seen so far for this configuration.
        weights = copy.deepcopy(model.state_dict())
        while True:
            dataset.shuffle()
            epoch += 1
            model.train()
            total_loss = 0.
            pbar = tqdm.tqdm(enumerate(dataset), desc=f"epoch {epoch}")
            for i, (x, y) in pbar:
                # reset gradients
                optimizer.zero_grad()
                # feed forward batch
                output = model(x)
                # calculate loss
                loss = criterion(output.transpose(1, 2), y)
                # back propagate loss
                loss.backward()
                # norm and clip gradients
                # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
                optimizer.step()
                pbar.set_description(
                    f'epoch {epoch} | batch {i + 1:d}/{len(dataset)} | loss {loss.item():.2f}'
                )
            model.eval()
            # a counts compared label positions, c counts correct ones.
            a, c = 0, 0
            with torch.no_grad():
                t_loss = 0
                for i, (x, y) in enumerate(test_dataset):
                    output = model(x)
                    loss = criterion(output.transpose(1, 2), y)
                    t_loss += loss.item()
                    for p, t in zip(torch.argmax(output, -1), y):
                        for pi, ti in zip(p, t):
                            a += 1
                            if pi == ti:
                                c += 1
            acc = c / a
            # Early stopping: tolerate a non-improving epoch while n > 0,
            # then stop at the next one.
            if acc <= test_acc and n > 0:
                n -= 1
                continue
            elif acc <= test_acc:
                break
            print(t_loss, acc)
            # Improvement: remember these weights and the new best accuracy.
            weights = copy.deepcopy(model.state_dict())
            test_acc = acc
        # NOTE(review): this stores the accuracy of the final (non-improving)
        # epoch, not `test_acc` — confirm that is intended.
        results[(tok_size, hidden_size, emb_size, cased)] = acc
        # Show the ten best configurations recorded so far.
        print(
            list(
                sorted([(k, v) for k, v in results.items()],
                       key=lambda y: y[1],
                       reverse=True))[:10])
        print(best_acc, test_acc)
        if test_acc > best_acc:
            best_acc = test_acc
            dir_path = f"tokenizer/{process_id+1}/lstm-tagger-{best_acc:.6f}"
            # Same (rounded) accuracy already saved: skip writing again.
            if os.path.exists(dir_path):
                continue
            torch.save(
                weights,
                f"checkpoints/{process_id+1}/lstm-tagger-{best_acc:.6f}.pt")
            os.makedirs(dir_path)
            tokenizer.save(dir_path)
def main():
    '''
    Parse the command line, build the tokenizer and GPT-2 model, then run
    completion evaluation and/or training depending on ``--mode``.

    NOTE(review): SOURCE is whitespace-collapsed, so the nesting shown here
    is reconstructed from the statement order; confirm against the original.

    Example invocation:

    python -m ipdb run_gpt2.py \
    --data-path /path/to/americanlit/ \
    --output-dir path/to/checkpoint/ \
    --eval-split valid \
    --train-n-steps 20000 \
    --validate-every 1000 \
    --sequence-tune-rate 0.0 \
    --mode train \
    --model-name from_scratch \
    --batch-size 32 --seqlen 80 --gradient-accumulation-steps 4
    '''#with this bsz, seqlen, fits to bm gpus
    parser = argparse.ArgumentParser(description='openGPT-2 analysis')

    #debug menu
    parser.add_argument('--debug',
                        action='store_true',
                        help='use dbg1000.jsonl for faster programming')

    #training options
    #--> consider redefining FT...
    parser.add_argument('--mode',
                        choices=[
                            'train', 'FT', 'eval-singletoken',
                            'eval-completion', 'eval-both'
                        ],
                        default='eval-singletoken')
    parser.add_argument(
        '--input-mode',
        choices=['CLM', 'relFT'],
        default='CLM',
        help=
        'determine whether or not to put specials amongst sentences (CLM => do not / relFT => do)'
    )
    parser.add_argument('--data-path',
                        default='../jsonlpath/DBG',
                        help='path/to/jsonl/files')

    parser.add_argument('--eval-split', choices=['train', 'valid', 'test'])
    parser.add_argument(
        '--model-name',
        choices=['from_scratch', 'gpt2', 'gpt2-medium', 'gpt2-large'],
        default='gpt2')
    parser.add_argument('--model-load-dir', type=str, default=None)
    parser.add_argument('--seed', type=int, default=777)
    #parser.add_argument('--data-base', type=str)

    parser.add_argument('--batch-size', type=int, default=32)
    parser.add_argument("--max-steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                            steps to perform. Override num_train_epochs.")
    parser.add_argument('--num-train-epochs', type=int, default=1)
    parser.add_argument('--gradient-accumulation-steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                            performing a backward/update pass.")
    parser.add_argument('--seqlen', type=int, default=120)
    parser.add_argument(
        '--tolerate_offset',
        type=int,
        default=20,
        help=
        'when training with TPLoss, length to be additionally tolerated to args.seqlen.'
    )

    #training is done upto this step. regardless of args.max_steps or args.num_train_epochs
    parser.add_argument('--train-n-steps', type=int, default=-1)  #10000)

    parser.add_argument('--seqlen-singletoken', type=int, default=1024)
    parser.add_argument('--seqlen-completion', type=int, default=300)
    # need to unify both and use only one
    parser.add_argument('--seqlen-train', type=int, default=300)

    parser.add_argument(
        "--output-dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    # eval-completion
    parser.add_argument('--prefix-length', type=int, default=50)
    parser.add_argument('--continuation-length', type=int, default=100)
    parser.add_argument('--top-k', type=int, default=1)
    parser.add_argument('--top-p', type=float, default=0.0)

    # custom training
    parser.add_argument('--sequence-tune-rate', type=float, default=0.5)

    parser.add_argument('--report-metrics-every', type=int, default=10)
    parser.add_argument('--save-every', type=int, default=1000)
    parser.add_argument('--sequence-ngram-n', type=int, default=4)

    parser.add_argument('--validate-every', type=int, default=10000)

    # training loop
    parser.add_argument("--adam-epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    # NOTE(review): --max-grad-norm is parsed but not read anywhere in this
    # function.
    parser.add_argument('--max-grad-norm', type=int, default=1)
    parser.add_argument('--learning-rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup-steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr-schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight-decay', type=float, default=0.01)
    parser.add_argument('--lm-coef', type=float, default=0.9)
    parser.add_argument('--num-workers', type=int, default=0)

    args = parser.parse_args()
    print(args)

    # Seed every random number generator used below with the same value.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    ## file below prep'd by flatten.py using amerlit jsonl splits (which are all post processed)
    ## root / 'flattened_amerlit.txt'
    # NOTE(review): `tokenizer` (and, as reconstructed, `d_root`) is only
    # assigned for modes 'FT' / 'train'; the uses further down would hit
    # unbound names in the eval modes — confirm the original nesting.
    if args.mode == 'FT':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    elif args.mode == 'train':  # train tokenizer based on corpus
        d_root = Path(args.data_path)

        vocab_path = d_root / 'vocab.json'
        rawtxt_path = d_root / 'flattened_amerlit.txt'  # this is obtained by running "python 4_flatten4vocab.py @ dataroot"
        merge_path = d_root / 'merges.txt'

        # Train and save a BPE vocabulary only when the vocab/merges pair is
        # not already present in the data directory.
        if not (vocab_path.exists()
                and merge_path.exists()):  #check if vocab file exists
            vocabgenerator = ByteLevelBPETokenizer()
            vocabgenerator.train(str(rawtxt_path),
                                 vocab_size=50_000,
                                 min_frequency=2)
            vocabgenerator.save(
                str(d_root)
            )  # vocabgenerator is also tokenizer but not from transformers
            del vocabgenerator
        tokenizer = GPT2Tokenizer(vocab_path, merge_path, errors='replace')

    # add CLS to the vocab
    # see example here: https://huggingface.co/transformers/model_doc/gpt2.html#transformers.GPT2DoubleHeadsModel.forward
    tokenizer = init_special_tokens(tokenizer)

    dataset_paths = {
        'train': d_root / 'train.jsonl',
        'valid': d_root / 'val.jsonl',
        'test': d_root / 'test.jsonl',
    }  # keep this for later code compatibility albeit it looks crappy

    # Model selection: explicit checkpoint dir > fresh GPT2Config > named
    # pretrained model.
    if args.model_load_dir:
        model = GPT2LMHeadModel.from_pretrained(args.model_load_dir)
    elif args.model_name == 'from_scratch':
        config = GPT2Config()
        config.architectures = ["GPT2LMHeadModel"]
        model = GPT2LMHeadModel(config)

        #mp = GPT2LMHeadModel.from_pretrained('gpt2')
        #pretrained config vs GPT2Config has only difference
        # "architectures": ['GPT2LMHeadModel']
    else:
        model = GPT2LMHeadModel.from_pretrained(args.model_name)

    # Match the embedding matrix to the (possibly extended) tokenizer.
    model.resize_token_embeddings(len(tokenizer))
    model.config.output_hidden_states = True  # make them return output hidden
    model.to(device)

    # The string literal below is disabled code kept as a no-op expression.
    '''if args.mode == 'eval-singletoken' or args.mode == 'eval-both':
        eval_singletoken(model, args, dataset_paths)
    '''
    if args.mode == 'eval-completion' or args.mode == 'eval-both':
        datasets = get_datasets(dataset_paths, max_len=args.seqlen_completion)
        eval_sampler = SequentialSampler(datasets[args.eval_split])
        eval_dataloader = DataLoader(datasets[args.eval_split],
                                     sampler=eval_sampler,
                                     batch_size=1)

        model.eval()

        with torch.no_grad():
            all_text_completions = []

            bpe_ngram_metrics = Metrics(pad=-1)
            word_ngram_metrics = Metrics(pad=-1)

            for i, batch in tqdm(enumerate(eval_dataloader),
                                 desc="Evaluating",
                                 total=len(eval_dataloader)):
                # NOTE(review): .cuda() ignores the `device` chosen above and
                # would fail on a CPU-only machine.
                input_sequence = batch[0].cuda()
                # Skip sequences shorter than the required prefix.
                if input_sequence.size(1) < args.prefix_length:
                    continue

                # Predict the completions.
                batch = batch_input_sequence_by_prefix_length(
                    input_sequence, args.prefix_length)
                bpe_completions, _ = sample_sequence(model, batch,
                                                     args.prefix_length,
                                                     args.continuation_length,
                                                     args.top_k, args.top_p)
                bpe_completions = bpe_completions.tolist()

                # Extract continuations from the predicted completions.
                bpe_continuations = []
                text_continuations = []
                for bpe_completion in bpe_completions:
                    bpe_continuations.append(
                        bpe_completion[args.prefix_length:])
                    text_continuations.append(
                        get_text_continuation(bpe_completion, tokenizer,
                                              args))
                    all_text_completions.append(
                        tokenizer.decode(bpe_completion))

                # Only keep continuations with at least one 4-gram
                # (A short continuation may occur due to predicted whitespace, then tokenizing, despite being
                # normal length in BPE tokens).
                text_continuations = [
                    c for c in text_continuations if len(c) > 3
                ]

                # Update metrics with this batch of continuations.
                bpe_ngram_metrics.update(bpe_continuations)
                word_ngram_metrics.update(text_continuations)

                # Save the (possibly intermediate) metrics.
                save_completion_metrics(bpe_metrics=bpe_ngram_metrics.report(
                    'bpe_%s' % args.eval_split),
                                        word_metrics=word_ngram_metrics.report(
                                            'word_%s' % args.eval_split),
                                        text_completions=all_text_completions,
                                        config=model.config.to_dict(),
                                        args=args)

    if args.mode == 'train':
        # Directory that receives the best-validation checkpoint.
        if not os.path.exists(os.path.join(args.output_dir, 'best')):
            os.makedirs(os.path.join(args.output_dir, 'best'))

        token_loss = mle_loss
        if args.debug:
            train_seq_dataloader = get_dataloaders(args,
                                                   tokenizer,
                                                   spl='dbg1000')
            #for batch in train_seq_dataloader:
            #print(batch.pre_tru.shape)
            #print(batch.pre_fals) # None
            #set_trace()
        else:  # debugging mode
            # NOTE(review): this branch loads the full 'train' split; the
            # "debugging mode" remark above looks misplaced.
            train_seq_dataloader = get_dataloaders(args,
                                                   tokenizer,
                                                   spl='train')

        # Setup optimizer

        # one of both need to be specified for training
        # args.num_train_epochs  /   args.max_steps
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (args.batch_size * len(
                train_seq_dataloader) // args.gradient_accumulation_steps) + 1

            #if performing gradient accumulation, steps won't update.
            #this means actual epochs training multiplied directly by "gradient_accumulation_steps"

        else:
            t_total = len(
                train_seq_dataloader
            ) // args.gradient_accumulation_steps * args.num_train_epochs  #if not specified,

        # Two parameter groups: weight decay for everything except biases
        # and LayerNorm parameters.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            args.weight_decay
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    args.warmup_steps,
                                                    t_total)

        total_steps = 0
        best_ppl = 1e20
        for _ in trange(args.num_train_epochs, desc="Epoch"):
            logging_outputs = []
            epoch_loss = 0
            epoch_steps = 0
            tqdm_bar = tqdm(train_seq_dataloader,
                            desc="Training",
                            total=t_total if args.train_n_steps <= 1 else
                            args.train_n_steps)
            for step, batch in enumerate(tqdm_bar):
                # NOTE(review): gradients are zeroed and stepped on every
                # batch, so --gradient-accumulation-steps only affects the
                # t_total / epoch arithmetic above.
                optimizer.zero_grad()

                # Sequence loss
                # Chosen with probability --sequence-tune-rate per batch.
                if torch.rand(1).item() < args.sequence_tune_rate:
                    if batch[0].size(1) < args.prefix_length:
                        continue
                    loss, batch_metrics = ul_seq(model, batch, args)

                # Token loss
                else:
                    loss, batch_metrics = token_loss(
                        model, batch,
                        args)  # == mleloss(model, batch, args)

                loss.backward()
                optimizer.step()
                scheduler.step()
                epoch_loss += loss.item()
                epoch_steps += 1
                total_steps += 1
                tqdm_bar.desc = f"Training loss: {(epoch_loss/epoch_steps):.2f} lr: {scheduler.get_lr()[0]:.2f}"  # get_last_lr in pytorch 1.4.0
                #tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(epoch_loss/epoch_steps, scheduler.get_lr()[0]) # scheduler.get_last_lr() is for 1.4.0

                logging_outputs.append(batch_metrics)

                # Periodically aggregate and print the collected metrics.
                if epoch_steps % args.report_metrics_every == 0:
                    logging_average = CrossEntropyCriterionWCustomMetrics.aggregate_logging_outputs(
                        logging_outputs)
                    temp = SequencePenaltyCriterion.aggregate_logging_outputs(
                        logging_outputs)
                    for k, v in temp.items():
                        logging_average[k] = v
                    # NOTE(review): assumes the aggregated loss is in base 2
                    # — confirm.
                    logging_average['ppl'] = 2**logging_average['loss']
                    print(logging_average)
                    logging_outputs = []

                # NOTE(review): this leaves only the inner batch loop; the
                # epoch loop continues.
                if step == args.train_n_steps:
                    break  # here train_n_steps

                # Periodic checkpoint: weights, config and vocabulary.
                if epoch_steps % args.save_every == 0:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     WEIGHTS_NAME)
                    output_config_file = os.path.join(args.output_dir,
                                                      CONFIG_NAME)
                    torch.save(model_to_save.state_dict(), output_model_file)
                    model_to_save.config.to_json_file(output_config_file)
                    tokenizer.save_vocabulary(args.output_dir)

                # Periodic validation; keep the checkpoint with the lowest
                # validation perplexity under <output_dir>/best.
                if total_steps % args.validate_every == 0:
                    print("Validating...")
                    validation_outputs = eval_singletoken(
                        model, args, dataset_paths, train_iter=total_steps)
                    if validation_outputs['ppl'] < best_ppl:
                        best_ppl = validation_outputs['ppl']
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, 'best', WEIGHTS_NAME)
                        output_config_file = os.path.join(
                            args.output_dir, 'best', CONFIG_NAME)
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)
                        model_to_save.config.to_json_file(output_config_file)
                        tokenizer.save_vocabulary(
                            os.path.join(args.output_dir, 'best'))
                        save_singletoken_metrics(validation_outputs,
                                                 model.config.to_dict(),
                                                 args,
                                                 train_iter=total_steps,
                                                 best=True)
def save_sentense_piece_model(cfg):
    """
    Train a byte-level BPE tokenizer and write it to disk.

    The training files are obtained from ``get_file_path_list(cfg)``. The
    trained tokenizer is saved into ``cfg.path_sentence_piece`` using the
    file-name prefix ``"ko"``.

    :param cfg: configuration object; must expose ``path_sentence_piece`` and
        whatever ``get_file_path_list`` reads from it.
    """
    paths = get_file_path_list(cfg)
    special_tokens = ["<pad>", "<bos>", "<eos>", "<sep>", "<unk>", "<mask>"]

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=paths,
        vocab_size=32000,
        min_frequency=2,
        special_tokens=special_tokens,
    )

    # Newer `tokenizers` releases expose the (directory, prefix) writer as
    # `save_model`; there `save` expects (path, pretty) and would reject the
    # "ko" argument. Older releases only have the two-argument `save`.
    if hasattr(tokenizer, "save_model"):
        tokenizer.save_model(cfg.path_sentence_piece, "ko")
    else:
        tokenizer.save(cfg.path_sentence_piece, "ko")
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        """
        Load a pretrained byte-level BPE tokenizer from vocab and merges files.

        :param opt: options. Reads ``bpe_add_prefix_space``, ``dict_loaded``,
            ``dict_file``, ``bpe_vocab`` and ``bpe_merge``; when a dictionary
            was loaded and matching ``-merges.txt`` / ``-vocab.json`` files
            exist next to it, ``bpe_merge`` / ``bpe_vocab`` are overwritten
            in ``opt`` to point at them.
        :param shared: forwarded unchanged to the parent constructor.
        :raises ImportError: if the ``tokenizers`` package is not installed.
        :raises ValueError: if vocabulary filtering is requested, or if
            ``bpe_vocab`` / ``bpe_merge`` are missing from ``opt``.
        :raises IOError: if the vocab or merges path is empty or not a file.
        """
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True

        if opt.get('dict_loaded'):
            # Prefer the BPE files stored alongside the loaded dictionary.
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'

        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            ) from err

        # NOTE(review): lower / maxtokens / minfreq are presumably set by
        # BPEHelper.__init__ -- confirm against the parent class.
        if self.lower:
            warn_once(
                'Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary with using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).')
        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def helper_encode(self, text: str) -> List[str]:
        """
        Tokenize a text string into a list of BPE tokens.

        :param text: text to tokenize
        :return tokens: list of token strings produced by the tokenizer
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(self, tokens: List[str], token_ids: List[int],
                      delimiter: str) -> str:
        """
        Decode list of tokens into text string.

        Only ``token_ids`` is used; ``tokens`` and ``delimiter`` are accepted
        but ignored by this implementation.

        :param tokens: list of tokens
        :param token_ids: list of token ids
        :param delimiter: string delimiter for tokens
        :return text: decoded text
        """
        return self.tokenizer.decode(token_ids)

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.

        :param dict_agent: dictionary agent providing the null/start/end/unk
            tokens and receiving every BPE token via ``add_token``.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        # Skip the trailing ids, one per special token.
        # NOTE(review): this assumes every special token was newly appended
        # at the end of the vocabulary -- confirm for vocabularies that
        # already contain any of them.
        bpe_vocab_size = self.tokenizer.get_vocab_size() - len(special_tokens)
        for i in range(bpe_vocab_size):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name: directory to save.
        :param file_name: file to save.
        """
        # Newer `tokenizers` releases expose the (directory, prefix) writer as
        # `save_model`; there `save` expects (path, pretty). Older releases
        # only have the two-argument `save`.
        if hasattr(self.tokenizer, 'save_model'):
            self.tokenizer.save_model(dir_name, file_name)
        else:
            self.tokenizer.save(dir_name, file_name)
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Training corpus. Other corpus selections, kept for reference:
#   ['data/train_sentences.txt']
#   [str(x) for x in Path("./data/").glob("train_subset_*.txt")]
paths = ['data/train/t5.txt']

# Make sure the output directory exists before training starts, so the save
# step at the end cannot fail on a missing folder.
Path("models").mkdir(parents=True, exist_ok=True)

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=25_000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Newer `tokenizers` releases expose the (directory, prefix) writer as
# `save_model`; there `save` expects (path, pretty). Older releases only have
# the two-argument `save`.
if hasattr(tokenizer, "save_model"):
    tokenizer.save_model("models", "KariBERTa-small")
else:
    tokenizer.save("models", "KariBERTa-small")
from pathlib import Path

from tokenizers import ByteLevelBPETokenizer

# Every .txt file below the corpus directory is used for training.
paths = [str(x) for x in Path("./data/eo/data/").glob("**/*.txt")]
if not paths:
    # An empty file list would otherwise train on nothing without any
    # obvious error.
    raise FileNotFoundError(
        "No .txt training files found under ./data/eo/data/")

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(
    files=paths,
    vocab_size=52_000,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Newer `tokenizers` releases expose the (directory, prefix) writer as
# `save_model`; there `save` expects (path, pretty). Older releases only have
# the two-argument `save`.
if hasattr(tokenizer, "save_model"):
    tokenizer.save_model(".", "esperberto")
else:
    tokenizer.save(".", "esperberto")
tokenizer = ByteLevelBPETokenizer() # Customize training tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[ "<s>", "<pad>", "</s>", "<unk>", "<mask>", ]) # Save files to disk tokenizer.save(".", "rubinberto") tokenizer = ByteLevelBPETokenizer( "rubinberto-vocab.json", "rubinberto-merges.txt", ) tokenizer._tokenizer.post_processor = BertProcessing( ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) print( tokenizer.encode( "А можно вспоминать не о событиях, а, например, о чувствах, испытываемых нами за «отчетный период»."
"<pad>", "<SEP>", "<UNK>", "<MASK>", ]) print('en completed') # Customize training ta_tokenizer.train(files=new_ta_path, vocab_size=8300, min_frequency=2, special_tokens=[ "<CLS>", "<pad>", "<SEP>", "<UNK>", "<MASK>", ]) print('ta completed') en_tokenizer.save(en_tokenizer_path) ta_tokenizer.save(ta_tokenizer_path) en_tokenizer = Tokenizer.from_file(en_tokenizer_path) ta_tokenizer = Tokenizer.from_file(ta_tokenizer_path) tamil_text = 'அதனை நிரூபிப்பதுபோல் இருக்குமாம் படம்' english_text = 'This movie will prove that' id_1 = ta_tokenizer.encode(tamil_text) assert (ta_tokenizer.decode( id_1.ids) == tamil_text), 'mismatch in tamil tokenizer encoding' id_2 = en_tokenizer.encode(english_text) assert (en_tokenizer.decode( id_2.ids) == english_text), 'mismatch in english tokenizer encoding'