def initialize_model():
    config = get_config()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("device", device)

    ''' Create dataloaders '''
    train_dataset = SplitReshapeTrainDataset(config['complex_sentences_file'],
                                             config['simple_sentences_file'])
    train_data, val_data = torch.utils.data.random_split(
        train_dataset,
        [round(config["train_data_percentage"] * len(train_dataset)),
         round(config["val_data_percentage"] * len(train_dataset))])
    train_dataloader = DataLoader(train_data, batch_size=config["batch_size"],
                                  num_workers=config["num_of_workers"], pin_memory=True)
    val_dataloader = DataLoader(val_data, batch_size=config["batch_size"],
                                num_workers=config["num_of_workers"], pin_memory=True)

    ''' Create tokenizer '''
    tokenizer = ByteLevelBPETokenizer(
        "./data/english_tokenizer-vocab.json",
        "./data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )

    ''' Create model '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    model.train()
    trainer = model.to(device)

    ''' Create optimizer '''
    loss_fun = nn.CrossEntropyLoss(ignore_index=config['src_pad_idx'])
    optimizer = optim.Adam(trainer.parameters(), lr=config["learning_rate"])
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, 10)
    writer = SummaryWriter()

    return config, train_dataloader, val_dataloader, trainer, loss_fun, optimizer, writer, device, scheduler, tokenizer
def _fit_tokenizer(
    path_to_text_file: Union[str, List[str]],
    tokenizer: ByteLevelBPETokenizer,
    vocabulary_size: int,
) -> None:
    tokenizer.train(
        path_to_text_file,
        vocabulary_size,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
def initialize_model():
    config = get_config()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # device = torch.device('cpu')
    print("device", device)

    '''Create tokenizer'''
    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])

    ''' Create model '''
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)

    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)

    return config, model, tokenizer, device
def __init__(self, tokenizer: PreTrainedTokenizer, args, file_path: str, block_size=512):
    assert os.path.isfile(file_path)
    # Here, we do not cache the features, operating under the assumption
    # that we will soon use fast multithreaded tokenizers from the
    # `tokenizers` repo everywhere =)
    logger.info("Creating features from dataset file at %s", file_path)

    with open(file_path, encoding="utf-8") as f:
        lines = [
            line
            for line in f.read().splitlines()
            if (len(line) > 0 and not line.isspace())
        ]

    tokenizer = ByteLevelBPETokenizer(
        f"{args['tokenizer_name']}/vocab.json",
        f"{args['tokenizer_name']}/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=block_size)

    self.examples = [t.ids for t in tokenizer.encode_batch(lines)]
def test_language_model_dataset_fit_tokenizer_should_call_the_train_method_of_bpe_tokenizer():
    # Given
    language_modeling_dataset = LanguageModelingDataset(1, 1)
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train = MagicMock()
    language_modeling_dataset.set_tokenizer(tokenizer)

    # When
    language_modeling_dataset._fit_tokenizer(FAKE_PATH_FOR_TEST, tokenizer, 300)

    # Then
    tokenizer.train.assert_called_with(
        FAKE_PATH_FOR_TEST,
        300,
        special_tokens=[EOD_TOKEN, PAD_TOKEN, SOS_TOKEN, UNK_TOKEN],
    )
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
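# Hedged usage sketch (not from the original file): it assumes the
# KariBERTa-tiny vocab/merges files referenced above exist on disk, and just
# shows the encoding returned by the helper.
norwegian_tokenizer = create_norwegian_tokenizer()
encoding = norwegian_tokenizer.encode("Dette er en test.")
print(encoding.tokens)  # wrapped in <s> ... </s> by the BertProcessing post-processor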
def pretrain_tokenization(self):
    paths = [str(x) for x in Path("handler/datadir/").glob("*-train.txt")]
    print(paths)

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    # Note: save(directory, name) is the pre-0.8 `tokenizers` API for writing
    # vocab/merges files; newer releases use save_model() for that and reserve
    # save(path) for a single tokenizer.json.
    tokenizer.save(".", "danbert-small")
def main():
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=get_file(),
                    vocab_size=config.VOCAB_SIZE,
                    min_frequency=config.MIN_FREQUENCY,
                    special_tokens=config.SPECIAL_TOKENS)
    tokenizer.save_model(config.TOKENIZER_PATH)
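# Not part of the original script: a minimal smoke-test sketch assuming that
# save_model() above wrote vocab.json and merges.txt into config.TOKENIZER_PATH.
# The helper name _smoke_test_tokenizer is hypothetical.
from tokenizers.implementations import ByteLevelBPETokenizer

def _smoke_test_tokenizer(tokenizer_dir: str) -> None:
    # Reload the trained byte-level BPE files and encode one sample sentence.
    tokenizer = ByteLevelBPETokenizer(
        f"{tokenizer_dir}/vocab.json",
        f"{tokenizer_dir}/merges.txt",
    )
    encoding = tokenizer.encode("A quick sanity check of the trained tokenizer.")
    print(encoding.tokens)
    print(encoding.ids)

# Example: _smoke_test_tokenizer(config.TOKENIZER_PATH)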
def create_token_masker(tokenizer: ByteLevelBPETokenizer):
    special_tokens = [
        START_TOKEN, PAD_TOKEN, STOP_TOKEN, UNKNOWN_TOKEN, MASK_TOKEN
    ]
    special_token_ids = {
        tokenizer.token_to_id(token) for token in special_tokens
    }

    def get_special_tokens_mask(token_ids):
        # 1 for positions holding a special token, 0 for regular tokens.
        return [
            1 if token_id in special_token_ids else 0
            for token_id in token_ids
        ]

    return get_special_tokens_mask
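# Hypothetical usage sketch, not in the original module: it assumes a `tokenizer`
# built from the same vocabulary as the special-token constants above. The mask
# function iterates over plain integer ids, so pass encoding.ids directly.
get_special_tokens_mask = create_token_masker(tokenizer)
encoding = tokenizer.encode("Mark only the special tokens.")
print(get_special_tokens_mask(encoding.ids))  # 1 at special-token positions, 0 elsewhere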
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./esperberto-vocab.json",
        "./esperberto-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    self.examples = []
    src_files = Path("./")
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./roberta-lm/vocab.json",
        "./roberta-lm/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.

    self.examples = []
    src_files = Path("./data/montecristo/").glob("**/*.txt")
    for src_file in src_files:
        print("🔥", src_file)
        lines = src_file.read_text(encoding="utf-8").splitlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def __init__(self, file_path: str = None, tokenizer_path: str = None):
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path + "/vocab.json",
        tokenizer_path + "/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    self.examples = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()
        lines = [
            line for line in lines
            if (len(line) > 0 and not line.isspace())
        ]
    self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def train_tok(txt_dir, tokenizer_dir):
    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=txt_dir, vocab_size=52_000, min_frequency=2,
                    special_tokens=[
                        "<s>",
                        "<pad>",
                        "</s>",
                        "<unk>",
                        "<mask>",
                    ])

    tokenizer.save_model(tokenizer_dir)
def __init__(self, evaluate=False):
    tokenizer = ByteLevelBPETokenizer(
        "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-vocab.json",
        "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.

    self.examples = []
    if evaluate:
        src_files = ["/home/zheng/sde/data/valid.txt"]
    else:
        src_files = ["/home/zheng/sde/data/test.txt"]
    for src_file in src_files:
        print(src_file)
        with open(src_file, 'r') as f:
            lines = f.readlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
import os

from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import BertConfig
from transformers import BertTokenizer
from transformers import BertForMaskedLM
from transformers import LineByLineTextDataset
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

PATH = os.getcwd()
SAVE_MODEL = os.getcwd()

tokenizer = ByteLevelBPETokenizer()
tokenizer.train(files="kant.txt", vocab_size=52_000, min_frequency=2,
                special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.save_model(SAVE_MODEL)

tokenizer = ByteLevelBPETokenizer(
    SAVE_MODEL + "/vocab.json",
    SAVE_MODEL + "/merges.txt",
)
tokenizer.enable_truncation(max_length=512)
print(tokenizer.encode("For it is in reality vain to profess"))

config = BertConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
from tokenizers.implementations import ByteLevelBPETokenizer
from transformers import LineByLineTextDataset

# config = RobertaConfig(
#     vocab_size=52_000,
#     max_position_embeddings=514,
#     num_attention_heads=12,
#     num_hidden_layers=6,
#     type_vocab_size=1,
# )

tokenizer = ByteLevelBPETokenizer(
    "./models/nepali_BERT_tokenizer_L-vocab.json",
    "./models/nepali_BERT_tokenizer_L-merges.txt",
)

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="../Data/nepali_corpus/sanitized_data/sentences/master_sentence_list.txt",  # this path is from step 1
    block_size=128,
)

# Data collator that builds masked-language-model batches during training
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                mlm=True,
                                                mlm_probability=0.15)

# Set up training arguments
        input_path_val,
        output_path,
        vocab_size=30_000,
        min_freq=2,
        max_len=256,
        block_size=64,
        mlm_probability=0.15,
        num_attention_heads=6,
        num_hidden_layers=3,
        epochs=5,
        batch_size=30,
        val_batch_size=60,
        eval_steps=50,
        **kwargs):
    # instantiate tokenizer
    bpe_tokenizer = ByteLevelBPETokenizer()

    # train tokenizer
    _pretty_print("Training tokenizer")
    bpe_tokenizer.train([input_path, input_path_val],
                        vocab_size=vocab_size,
                        min_frequency=min_freq,
                        special_tokens=[
                            "<s>",
                            "<pad>",
                            "</s>",
                            "<unk>",
                            "<mask>",
                        ])

    # save tokenizer
    tok_path = os.path.join(output_path, "tokenizer")
    os.makedirs(tok_path, exist_ok=True)
from tokenizers.implementations import ByteLevelBPETokenizer

from language_model.tokenization.trainer import ByteLevelBPETokenizerTrainer

task = ByteLevelBPETokenizerTrainer(
    source_folder_path="data/ukr/data/wiki_oscar_data/",
    tokenizer=ByteLevelBPETokenizer(),
    vocab_size=52000,
    min_frequency=5,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)
with open('data/start.txt', 'r') as myfile:
    data = myfile.read()

segmenter.segment('I am Batman i live in gotham')

# =============================================================================
# Huggingface tokenizer
# =============================================================================
if False:
    from tokenizers.implementations import ByteLevelBPETokenizer
    from tokenizers.processors import BertProcessing
    from pathlib import Path

    tokenizer = ByteLevelBPETokenizer(
        "data/german_old.json",
        "data/german_old.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # print(tokenizer.encode(sen_out[0]))
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import torch

tokenizer = ByteLevelBPETokenizer(
    "./EsperBERTo/vocab.json",
    "./EsperBERTo/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

from transformers_weighted_head.src.transformers import RobertaConfig

WEIGHT_HEADS = True
config = RobertaConfig(
    weight_heads=WEIGHT_HEADS,
    vocab_size=52_000,
    max_position_embeddings=514,
    hidden_size=480,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

from transformers_weighted_head.src.transformers import RobertaTokenizerFast
# -*- coding:utf-8 -*-
import os
from argparse import ArgumentParser

from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing, RobertaProcessing

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("--token_path", type=str, nargs='?', required=True, help="")
    args = parser.parse_args()
    inputpath = args.token_path

    tokenizer = ByteLevelBPETokenizer(os.path.join(inputpath, "vocab.json"),
                                      os.path.join(inputpath, "merges.txt"),
                                      add_prefix_space=True,
                                      trim_offsets=True,
                                      lowercase=True,
                                      unicode_normalizer="nfkc")
    tokenizer._tokenizer.post_processor = RobertaProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
        trim_offsets=True,
        add_prefix_space=True)
    tokenizer.enable_truncation(max_length=512)

    tokens = tokenizer.encode("I am Julien\nI am from China.").tokens
    print([x.encode('utf-8') for x in tokens])
                        default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--output-data-dir', type=str,
                        default=os.environ['SM_OUTPUT_DATA_DIR'])
    parser.add_argument('--data-dir', type=str,
                        default=os.environ['SM_CHANNEL_TRAINING'])

    args = parser.parse_args()

    paths = [str(x) for x in Path(args.data_dir).glob("**/*.txt")]
    print("data files")
    print(paths)

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()

    # Customize training
    tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

    # Need to save it to model dir for inference
    tokenizer.save(args.model_dir)

    tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                      os.path.join(args.model_dir, "merges.txt"))
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
def main():
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--path_to_train_data", type=str, required=True)
    argument_parser.add_argument("--path_to_eval_data", type=str, required=False, default=None)
    argument_parser.add_argument("--n_epochs", type=int, required=False, default=3)
    argument_parser.add_argument("--batch_size", type=int, required=False, default=32)
    argument_parser.add_argument("--bptt", type=int, required=False, default=64)
    argument_parser.add_argument("--lr", type=float, required=False, default=0.0001)
    argument_parser.add_argument("--vocabulary_size", type=int, required=False, default=20000)
    argument_parser.add_argument("--embedding_dimension", type=int, required=False, default=300)
    argument_parser.add_argument("--hidden_units_for_lstm", type=int, required=False, default=256)
    argument_parser.add_argument("--num_of_lstm_layer", type=int, required=False, default=1)
    argument_parser.add_argument("--n_decoder_blocks", type=int, required=False, default=5)
    arguments = argument_parser.parse_args()

    train_language_modeling_dataset = LanguageModelingDataset(arguments.batch_size, arguments.bptt)
    train_language_modeling_dataset.set_tokenizer(ByteLevelBPETokenizer())
    train_language_modeling_dataset.fit(arguments.path_to_train_data,
                                        vocabulary_size=arguments.vocabulary_size)
    train_language_modeling_dataloader = LanguageModelingDataLoader(
        arguments.bptt,
        train_language_modeling_dataset.transform(arguments.path_to_train_data, return_target=True),
    )

    model = LSTMModel(
        arguments.vocabulary_size,
        arguments.embedding_dimension,
        arguments.hidden_units_for_lstm,
        arguments.n_decoder_blocks,
        arguments.num_of_lstm_layer,
    )

    logger = TensorboardLogger()
    trainer = Trainer(arguments.batch_size)
    trainer.set_logger(logger)

    if arguments.path_to_eval_data:
        eval_language_modeling_dataloader = LanguageModelingDataLoader(
            arguments.bptt,
            train_language_modeling_dataset.transform(arguments.path_to_eval_data, return_target=True),
        )
        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            eval_language_modeling_dataloader,
            arguments.n_epochs,
        )
    else:
        trainer.train(
            model,
            train_language_modeling_dataloader,
            CrossEntropyLoss(),
            Adam(model.parameters(), arguments.lr),
            None,
            arguments.n_epochs,
        )

    logger.log_params(vars(arguments), trainer.losses)
    saver = Saver(logger.log_dir())
    saver.save_preprocessor_and_model(train_language_modeling_dataset, model)
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

tokenizer = ByteLevelBPETokenizer(
    "./bert-tokenizer/vocab.json",
    "./bert-tokenizer/merges.txt",
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

### Testing
tokenizer.encode('TOBB ETU NLP&IR team')
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
import os.path
import sys

ROOT_DIRECTORY = os.path.join(os.path.dirname(__file__), '..')
sys.path.append(ROOT_DIRECTORY)

tokenizer = ByteLevelBPETokenizer(
    os.path.join(ROOT_DIRECTORY, "models/en_cycl_tokenizer/vocab.json"),
    os.path.join(ROOT_DIRECTORY, "models/en_cycl_tokenizer/merges.txt"),
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

token_to_encode = "(() ((#$isa #$McCoyTyner-Musician #$Individual)))"
encoded_token = tokenizer.encode(token_to_encode)
print(encoded_token)
print(encoded_token.tokens)

token_to_encode = "Pair of scissors is marketed as office product."
encoded_token = tokenizer.encode(token_to_encode)
print(encoded_token)
print(encoded_token.tokens)
    return example


# Check that PyTorch sees the GPU
print("CUDA:", torch.cuda.is_available())

corpus_length = 6_993_330  # run `wc -l` on the corpus to check the number of lines
vocab_size = 150_000

# Dataset files
# --------------------------------------------------
paths = [str(x) for x in Path("./").glob("**/corpus.txt")]

# Byte-level tokenizer
# --------------------------------------------------
# Initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()

# Customize training
tokenizer.train(files=paths, vocab_size=vocab_size, min_frequency=2,
                special_tokens=[
                    "<s>",
                    "<pad>",
                    "</s>",
                    "<unk>",
                    "<mask>",
                ])

# Save files to disk
tokenizer.save_model("BR_BERTo")

# Test
tokenizer = ByteLevelBPETokenizer(
    "./BR_BERTo/vocab.json",
    "./BR_BERTo/merges.txt",
)
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing

path = "./model"

tokenizer = ByteLevelBPETokenizer(
    "{path}/vocab.json".format(path=path),
    "{path}/merges.txt".format(path=path),
)
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

tokens = tokenizer.encode("Dette er første testen.").tokens
print(tokens)