def __init__(self, model_path: str = None) -> None:
    """Requires the BertTokenizer from pytorch_transformers."""
    # pip install pytorch_transformers
    super().__init__()
    import os
    import torch
    from pytorch_transformers import BertTokenizer, cached_path
    from training.transformer_utils.model import TransformerWithClfHeadAndAdapters
    try:
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.config = torch.load(cached_path(os.path.join(model_path, "model_training_args.bin")))
        self.model = TransformerWithClfHeadAndAdapters(self.config["config"],
                                                       self.config["config_ft"]).to(self.device)
        state_dict = torch.load(cached_path(os.path.join(model_path, "model_weights.pth")),
                                map_location=self.device)
        self.model.load_state_dict(state_dict)
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
    except Exception as exc:  # avoid a bare except so the original error is chained
        raise Exception(
            "Requires a valid transformer model file ({0}/model_weights.pth) "
            "and its config file ({0}/model_training_args.bin).".format(model_path)) from exc
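# A usage sketch for the constructor above (the enclosing class name and its scoring API are
# not shown in this snippet, so both are assumptions):
clf = TransformerSentimentClassifier(model_path="./models/transformer")  # hypothetical class name
ids = clf.tokenizer.convert_tokens_to_ids(clf.tokenizer.tokenize("great product, fast delivery"))
# `ids` can then be turned into a tensor on clf.device and fed to clf.model for a prediction.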
def load_pretrained_model(args):
    """Download the pre-trained model and its config."""
    state_dict = torch.load(cached_path(os.path.join(args.model_checkpoint, "model_checkpoint.pth")),
                            map_location='cpu')
    config = torch.load(cached_path(os.path.join(args.model_checkpoint, "model_training_args.bin")))
    # Initialize model: Transformer base + classifier head
    model = TransformerWithClfHeadAndAdapters(config=config, fine_tuning_config=args).to(args.device)
    incompatible_keys = model.load_state_dict(state_dict, strict=False)
    print(f"Parameters discarded from the pretrained model: {incompatible_keys.unexpected_keys}")
    print(f"Parameters added in the model: {incompatible_keys.missing_keys}")

    if args.adapters_dim > 0:
        # Freeze everything except the embeddings, the classification head and the adapter modules
        for name, param in model.named_parameters():
            if ('embeddings' not in name and 'classification' not in name
                    and 'adapters_1' not in name and 'adapters_2' not in name):
                param.detach_()
                param.requires_grad = False
            else:
                param.requires_grad = True

    full_parameters = sum(p.numel() for p in model.parameters())
    trained_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"\nWe will train {trained_parameters:,} parameters out of {full_parameters:,}"
          f" (i.e. {100 * trained_parameters / full_parameters:.1f}% of the full parameters)")
    return model, state_dict, config
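# Minimal call sketch (hypothetical checkpoint path; the real fine-tuning config object
# passed as `args` carries more fields than shown here):
from argparse import Namespace

args = Namespace(model_checkpoint="./pretrained_checkpoint", device="cpu", adapters_dim=32)
model, state_dict, config = load_pretrained_model(args)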
def get_dataset(tokenizer, dataset_path, dataset_cache):
    """Get the tokenized PERSONACHAT dataset from S3 or cache."""
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        torch.save(dataset, dataset_cache)
    return dataset
def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
    """Get personalities from PERSONACHAT."""
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice versa
    if os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        personachat = torch.load(dataset_cache)
    else:
        logger.info("Download PERSONACHAT dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            personachat = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        personachat = tokenize(personachat)
        torch.save(personachat, dataset_cache)

    logger.info("Filter personalities")
    personalities = []
    for dataset in personachat.values():
        for dialog in dataset:
            personalities.append(dialog["personality"])

    logger.info("Gathered {} personalities".format(len(personalities)))
    return personalities
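# Typical follow-up (a hedged sketch: the random choice and decode step are how interactive
# chat scripts usually consume this list, not part of the snippet above):
import random
from itertools import chain

personalities = get_dataset_personalities(tokenizer, dataset_path=None, dataset_cache='./dataset_cache')
personality = random.choice(personalities)
logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))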
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
    """
    Instantiate a PreTrainedBertModel from a pre-trained model file.
    Download and cache the pre-trained model file if needed.
    """
    if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
        vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
    else:
        vocab_file = pretrained_model_name_or_path
    if os.path.isdir(vocab_file):
        vocab_file = os.path.join(vocab_file, VOCAB_NAME)
    # redirect to the cache, if necessary
    try:
        resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
    except EnvironmentError:
        return None
    if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
        # if we're using a pretrained model, ensure the tokenizer won't index sequences
        # longer than the number of positional embeddings
        max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
        kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
    # Instantiate tokenizer.
    tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
    return tokenizer
def download_pretrained_model():
    """Download and extract the finetuned model from S3."""
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    # logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir
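# Usage sketch (mirrors how interact-style scripts fall back to the S3 archive when no local
# checkpoint is supplied; the surrounding `args` object is an assumption):
if args.model_checkpoint == "":
    args.model_checkpoint = download_pretrained_model()  # temp dir holding the extracted weights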
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
    """
    Instantiate a PreTrainedBertModel from a pre-trained model file.
    Download and cache the pre-trained model file if needed.
    """
    if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
        vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
    else:
        vocab_file = pretrained_model_name_or_path
    if os.path.isdir(vocab_file):
        vocab_file = os.path.join(vocab_file, VOCAB_NAME)
    # redirect to the cache, if necessary
    try:
        resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
    except EnvironmentError:
        logger.error(
            "Model name '{}' was not found in model name list ({}). "
            "We assumed '{}' was a path or url but couldn't find any file "
            "associated to this path or url.".format(
                pretrained_model_name_or_path,
                ", ".join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                vocab_file,
            )
        )
        return None
    if resolved_vocab_file == vocab_file:
        logger.info("loading vocabulary file {}".format(vocab_file))
    else:
        logger.info("loading vocabulary file {} from cache at {}".format(vocab_file, resolved_vocab_file))
    if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
        # if we're using a pretrained model, ensure the tokenizer won't index sequences
        # longer than the number of positional embeddings
        max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
        kwargs["max_len"] = min(kwargs.get("max_len", int(1e12)), max_len)
    # Instantiate tokenizer.
    tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
    return tokenizer
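# Typical calls for the resolver above (assuming the classmethod belongs to BertTokenizer, as
# the shortcut maps suggest; the directory variant expects the file named by VOCAB_NAME):
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokenizer = BertTokenizer.from_pretrained('./my_finetuned_model')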
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    """Get PERSONACHAT from S3."""
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # To avoid using GPT cache for GPT-2 and vice versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                tokenize.dict_key_calls += 1
                logger.debug('Encoding {}. obj.keys() = {}, len(obj.items()) = {}'.format(
                    tokenize.dict_key_calls, obj.keys(), len(obj.items())))
                return dict((n, tokenize(o)) for n, o in obj.items())
            min_samples_for_multiprocessing = 100
            if len(obj) > min_samples_for_multiprocessing:
                logger.debug('    Encoding VERY LONG list of len(obj) = {}'.format(len(obj)))
                logger.debug('    Encoding list with multiprocessing...')
                # functools.partial does not work because the tokenizer has to be handed
                # recursively together with obj to worker_tokenize again. As a workaround for
                # not knowing how to combine the splat operator for a possible dict output
                # with **kwargs input, the list_args pattern is used instead.
                with mp.Pool(processes=mp.cpu_count() - 1) as pool:
                    results = pool.map(func=worker_tokenize, iterable=[[o, tokenizer] for o in obj])
                return results
            logger.debug('    Encoding list of len(obj) = {}'.format(len(obj)))
            return list(tokenize(o) for o in obj)

        tokenize.dict_key_calls = 0
        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset
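# The pool.map call above relies on worker_tokenize, which is not defined in this snippet.
# A plausible sketch consistent with the "list_args" workaround described in the comment
# (each work item is a [obj, tokenizer] pair):
def worker_tokenize(list_args):
    obj, tokenizer = list_args
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, worker_tokenize([o, tokenizer])) for n, o in obj.items())
    return [worker_tokenize([o, tokenizer]) for o in obj]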
def get_dataset(tokenizer, dataset_path, dataset_cache=None, task=None, return_cachepath=False):
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__ + '_' + task  # To avoid using GPT cache for GPT-2 and vice versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        dataset_file = cached_path(dataset_path)
        with open(dataset_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                # Keep 'id' and 'turn_id' fields as-is, tokenize everything else
                return dict((n, o) if (n == 'id' or n == 'turn_id') else (n, tokenize(o))
                            for n, o in obj.items())
            if isinstance(obj, int):
                return obj
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)

    if return_cachepath:
        return dataset, dataset_cache
    return dataset
def download_dataset(self, dataset_path):
    # Download PERSONACHAT into dataset_path
    msg = "Downloading personachat from S3 into {}"
    logger.info(msg.format(dataset_path))
    return cached_path(PERSONACHAT_URL, dataset_path)
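# Usage sketch (hypothetical owner object and cache dir): resolves PERSONACHAT_URL through
# the given cache directory and returns the local path of the downloaded file.
local_file = agent.download_dataset('/tmp/personachat_cache')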
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_special_tokens({'cls_token': '<CLS>', 'sep_token': '<SEP>',
                                  'pad_token': '<PAD>', 'eos_token': '<EOS>'})
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    special_tokens_ids = [tokenizer.convert_tokens_to_ids(special_token)
                          for special_token in ['<PAD>', '<CLS>', '<SEP>', '<EOS>']]
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)
    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object."""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()   # step the optimizer before the scheduler (PyTorch >= 1.1 order)
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids, lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
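# The evaluation loop above calls an `accuracy` helper that is not defined in this snippet.
# A minimal sketch matching its usage (argmax over the multiple-choice logits, count matches):
import numpy as np

def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)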
def move_cached(name, cache_dir, out_path):
    cached_vocab = pt.cached_path(name, cache_dir=cache_dir)
    logger.info("Moving cached vocab {} to {}".format(cached_vocab, out_path))
    os.rename(cached_vocab, out_path)
    os.remove(cached_vocab + '.json')  # also drop the sidecar metadata file written by cached_path
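# Example call (hypothetical URL and paths; `name` must be a URL or file path that
# pt.cached_path can resolve, not a bare model shortcut):
move_cached('https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt',
            cache_dir='/tmp/pt_cache', out_path='./vocab.txt')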
def train(distributed=False,
          local_rank=-1,
          lr=6.25e-5,
          dataset_path='../data/personachat_self_original.json',
          dataset_cache=cached_path('../data/personachat_self_original.json'),
          model_checkpoint='gpt2',
          num_candidates=2,
          max_history=5,
          train_batch_size=2,
          valid_batch_size=2,
          gradient_accumulation_steps=8,
          lm_coef=1.0,
          mc_coef=1.0,
          max_norm=1.0,
          n_epochs=10,
          personality_permutations=1,
          eval_before_start=False,
          device='cuda' if torch.cuda.is_available() else 'cpu',
          fp16=''):
    '''
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="", help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="openai-gpt", help="Path, url or short name of the model")
    parser.add_argument("--num_candidates", type=int, default=2, help="Number of candidates for training")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous exchanges to keep in history")
    parser.add_argument("--train_batch_size", type=int, default=4, help="Batch size for training")
    parser.add_argument("--valid_batch_size", type=int, default=4, help="Batch size for validation")
    parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Accumulate gradients on several steps")
    parser.add_argument("--lr", type=float, default=6.25e-5, help="Learning rate")
    parser.add_argument("--lm_coef", type=float, default=1.0, help="LM loss coefficient")
    parser.add_argument("--mc_coef", type=float, default=1.0, help="Multiple-choice loss coefficient")
    parser.add_argument("--max_norm", type=float, default=1.0, help="Clipping gradient norm")
    parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs")
    parser.add_argument("--personality_permutations", type=int, default=1, help="Number of permutations of personality sentences")
    parser.add_argument("--eval_before_start", action='store_true', help="If true start with a first evaluation before training")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)")
    parser.add_argument("--fp16", type=str, default="", help="Set to O0, O1, O2 or O3 for fp16 training (see apex documentation)")
    parser.add_argument("--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)")
    args = parser.parse_args()

    # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process.
    # logger.info => log main process only, logger.warning => log all processes
    logging.basicConfig(level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning("Running process %d", args.local_rank)  # This is a logger.warning: it will be printed by all distributed processes
    logger.info("Arguments: %s", pformat(args))
    '''
    args = None

    # Initialize distributed training if needed
    distributed = (local_rank != -1)
    if distributed:
        torch.cuda.set_device(local_rank)
        device = torch.device("cuda", local_rank)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')

    # logger.info("Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning")
    print(f'{datetime.now()}: Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning')
    model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    # We will use 5 special tokens:
    # - <bos> to indicate the start of the sequence
    # - <eos> to indicate the end of the sequence
    # - <speaker1> to indicate the beginning and the tokens of an utterance from the user
    # - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
    # - <pad> as a padding token to build batches of sequences
    special_tokens = {'bos_token': '<bos>', 'eos_token': '<eos>',
                      'additional_special_tokens': ['<speaker1>', '<speaker2>'],
                      'pad_token': '<pad>'}
    # We can add these special tokens to the vocabulary and the embeddings of the model:
    tokenizer.add_special_tokens(special_tokens)
    # model.config.num_special_tokens = len(special_tokens)
    model.resize_token_embeddings(len(tokenizer))
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    # Prepare model for FP16 and distributed training if needed (order is important: distributed should come last)
    if fp16:
        from apex import amp  # Apex is only required if we use fp16 training
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16)
    if distributed:
        model = DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)

    # logger.info("Prepare datasets")
    print(f'{datetime.now()}: prepare datasets')
    train_loader, val_loader, train_sampler, valid_sampler = get_data_loaders(tokenizer)

    # Training function and trainer
    def update(engine, batch):
        model.train()
        batch = tuple(input_tensor.to(device) for input_tensor in batch)
        lm_loss, mc_loss, _, _, _ = model(*batch)
        loss = (lm_loss * lm_coef + mc_loss * mc_coef) / gradient_accumulation_steps
        if fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_norm)
        else:
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        if engine.state.iteration % gradient_accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()

    trainer = Engine(update)

    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(input_tensor.to(device) for input_tensor in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids = batch
            # logger.info(tokenizer.decode(input_ids[0, -1, :].tolist()))
            print(f'{datetime.now()}: {tokenizer.decode(input_ids[0, -1, :].tolist())}')
            model_outputs = model(input_ids, mc_token_ids, token_type_ids=token_type_ids)
            lm_logits, mc_logits = model_outputs[0], model_outputs[1]  # So we can also use GPT2 outputs
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, mc_logits), (lm_labels_flat_shifted, mc_labels)

    evaluator = Engine(inference)

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, lr), (n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0])),
               "accuracy": Accuracy(output_transform=lambda x: (x[0][1], x[1][1]))}
    # metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args),
    #                 "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model,
    # configuration and tokenizer before we start to train
    if local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        tb_logger = TensorboardLogger(log_dir='../logs')
        # tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED)
        # tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()), another_engine=trainer), event_name=Events.EPOCH_COMPLETED)

        # checkpoint_handler = ModelCheckpoint(tb_logger.writer.log_dir, 'checkpoint', save_interval=1, n_saved=3)
        # trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler,
        #                           {'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        # torch.save(args, tb_logger.writer.log_dir + '/model_training_args.bin')
        # getattr(model, 'module', model).config.to_json_file(os.path.join(tb_logger.writer.log_dir, CONFIG_NAME))
        # tokenizer.save_vocabulary(tb_logger.writer.log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=n_epochs)
    # On the main process: close tensorboard logger and rename the last checkpoint
    # (for easy re-loading with the OpenAIGPTModel.from_pretrained method)
    # if local_rank in [-1, 0] and n_epochs > 0:
    #     os.rename(checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME))
    #     # TODO: PR in ignite to have better access to saved file paths (cleaner)
    #     tb_logger.close()


# In[14]:

train(dataset_path='../data/ql_dataset.json',
      dataset_cache=cached_path('../data/ql_dataset.json'),
      n_epochs=100,
      train_batch_size=10,
      valid_batch_size=10)


# In[22]:

def top_filtering(logits, top_k=0, top_p=0.0, threshold=-float('Inf'), filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k, top-p (nucleus) and/or threshold filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k: <=0: no filtering, >0: keep only top k tokens with highest probability.
            top_p: <=0.0: no filtering, >0.0: keep only a subset S of candidates, where S is the smallest subset
                whose total probability mass is greater than or equal to the threshold top_p.
                In practice, we select the highest probability tokens whose cumulative probability mass exceeds
                the threshold top_p.
            threshold: a minimal threshold to keep logits
    """
    # Body restored from the standard implementation this notebook follows (the original
    # snippet was truncated right after the docstring).
    assert logits.dim() == 1  # only works for batch size 1
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # Sort tokens and compute cumulative probabilities
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask right to also keep the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    indices_to_remove = logits < threshold
    logits[indices_to_remove] = filter_value
    return logits
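# A minimal sampling sketch (not from the original notebook; the temperature and top_p values
# are arbitrary choices, and `model`/`tokenizer` are the GPT-2 objects prepared in train()):
with torch.no_grad():
    input_ids = torch.tensor([tokenizer.encode("hello there")])
    lm_logits = model(input_ids)[0]                # [batch, seq, vocab]
    logits = lm_logits[0, -1, :] / 0.7             # temperature
    logits = top_filtering(logits, top_p=0.9)      # nucleus filtering
    probs = torch.softmax(logits, dim=-1)
    next_token = torch.multinomial(probs, num_samples=1)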
# start training and run until the final epoch finishes
train()

# data to predict, from Shopee
new_test_df = pd.read_csv('../input/test.csv')
new_test_df['review'] = new_test_df['review'].apply(removeNumbersAndPunctuations)
new_test_df['review'] = new_test_df['review'].apply(removeSpaces)
new_test_df['review'] = new_test_df['review'].apply(lowerWords)
new_test_df['review'] = new_test_df['review'].apply(removeStopWords)

# path to where you saved the model .pth and .bin generated by the function above
model_path = "./tmp_data"

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
config = torch.load(cached_path(os.path.join(model_path, "model_training_args.bin")))
model = TransformerWithClfHeadAndAdapters(config["config"],
                                          config["config_ft"]).to(device)
state_dict = torch.load(cached_path(os.path.join(model_path, "model_weights.pth")),
                        map_location=device)
model.load_state_dict(state_dict)  # Load model state dict
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)  # Load tokenizer
clf_token = tokenizer.vocab['[CLS]']  # classifier token
pad_token = tokenizer.vocab['[PAD]']  # pad token
max_length = config['config'].num_max_positions  # Max length from trained model
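# A hedged inference sketch for the setup above, assuming the model's forward signature takes
# a [seq_len, batch] tensor plus clf_tokens_mask and padding_mask (as in the NAACL-2019
# transfer-learning tutorial this model class appears to come from); illustration only:
def predict_label(text):
    tok = tokenizer.tokenize(text)
    ids = tokenizer.convert_tokens_to_ids(tok[:max_length - 1]) + [clf_token]  # clf token last
    tensor = torch.tensor(ids, dtype=torch.long).to(device).unsqueeze(1)       # [S, 1]
    model.eval()
    with torch.no_grad():
        logits = model(tensor,
                       clf_tokens_mask=(tensor == clf_token),
                       padding_mask=(tensor == pad_token))
    return logits.argmax(dim=-1).item()

new_test_df['rating'] = new_test_df['review'].apply(predict_label)  # hypothetical output column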