def _load_datasets(self):
    """Build the train/val/test datasets and the optional length prior.

    Reads ``max_seq_len`` (mandatory) and ``dataset`` (falls back to
    ``"penntreebank"``) from ``self.model_params``, resolves the dataset
    class through ``TaskLanguageModeling.get_dataset_class`` and stores the
    three splits on ``self``. ``self.length_prior`` is filled from the
    dataset class's ``get_length_prior`` when that attribute exists and is
    ``None`` otherwise.
    """
    self.max_seq_len = get_param_val(self.model_params,
                                     "max_seq_len",
                                     allow_default=False)
    dataset_name = get_param_val(self.model_params,
                                 "dataset",
                                 default_val="penntreebank")
    self.dataset_class = TaskLanguageModeling.get_dataset_class(dataset_name)
    print("Loading dataset %s..." % dataset_name)

    # One instance per split; the split is selected by a boolean keyword.
    build_split = self.dataset_class
    seq_len = self.max_seq_len
    self.train_dataset = build_split(max_seq_len=seq_len, train=True)
    self.val_dataset = build_split(max_seq_len=seq_len, val=True)
    self.test_dataset = build_split(max_seq_len=seq_len, test=True)

    # Not every dataset class offers a length prior.
    if not hasattr(self.dataset_class, "get_length_prior"):
        self.length_prior = None
    else:
        self.length_prior = self.dataset_class.get_length_prior(
            max_seq_len=self.max_seq_len)
def _create_model(self, model_params):
    """Instantiate the model selected by the ``model_name`` parameter.

    The dataset name and the model name are looked up in
    ``self.model_params`` (not in the ``model_params`` argument); the
    argument itself is forwarded to the model constructor.

    Args:
        model_params: Parameter dictionary handed to the model constructor.

    Returns:
        The constructed model: ``LSTMModel`` for ``"RNN"``,
        ``CNFLanguageModeling`` for ``"CNF"`` (the default) or ``DFModel``
        for ``"DAF"`` / ``"DBF"``.

    Raises:
        ValueError: If ``model_name`` is none of the supported names.
            Previously this case surfaced as an ``UnboundLocalError`` on
            the ``return`` statement.
    """
    dataset_name = get_param_val(self.model_params,
                                 "dataset",
                                 default_val="penntreebank")
    dataset_class = TaskLanguageModeling.get_dataset_class(dataset_name)
    vocab_dict = dataset_class.get_vocabulary()
    vocab_torchtext = dataset_class.get_torchtext_vocab()

    model_name = get_param_val(self.model_params,
                               "model_name",
                               default_val="CNF")
    if model_name == "RNN":
        model = LSTMModel(num_classes=len(vocab_dict),
                          vocab=vocab_torchtext,
                          model_params=model_params)
    elif model_name == "CNF":
        model = CNFLanguageModeling(model_params=model_params,
                                    vocab_size=len(vocab_dict),
                                    vocab=vocab_torchtext,
                                    dataset_class=dataset_class)
    elif model_name in ["DAF", "DBF"]:
        model = DFModel(num_classes=len(vocab_dict),
                        batch_size=self.batch_size,
                        model_params=model_params,
                        model_name=model_name)
    else:
        # Fail with a message that names the offending value instead of
        # falling through to an unbound local.
        raise ValueError(
            "Unknown model name \"%s\". Expected one of: RNN, CNF, DAF, DBF."
            % str(model_name))
    return model
def __init__(self,
             num_classes,
             batch_size=64,
             hidden_size=8,
             num_flows=1,
             temperature=0.1,
             max_seq_len=-1,
             model_params=None,
             model_name="DAF"):
    """Build a stack of discrete flows plus a random base distribution.

    Args:
        num_classes: Vocabulary size; stored as ``self.vocab_size``.
        batch_size: Leading entry of the shape list handed to ``MADE``
            (only used for ``"DAF"``).
        hidden_size: Fallback for ``model_params["discrete_flow"]["nh"]``.
        num_flows: Fallback for ``model_params["discrete_flow"]["num_flows"]``.
        temperature: Fallback for
            ``model_params["discrete_flow"]["temperature"]``.
        max_seq_len: Fallback for ``model_params["max_seq_len"]``.
        model_params: Optional parameter dictionary. When ``None`` the
            keyword arguments above are used as-is (previously ``None``
            raised a ``TypeError`` because it was subscripted directly).
        model_name: ``"DAF"`` for autoregressive or ``"DBF"`` for bipartite
            flows.

    Raises:
        ValueError: If a flow has to be built for an unsupported
            ``model_name`` (previously an ``UnboundLocalError``).
    """
    super().__init__()
    if model_params is not None:
        flow_params = model_params["discrete_flow"]
        hidden_size = get_param_val(flow_params, "nh", hidden_size)
        num_flows = get_param_val(flow_params, "num_flows", num_flows)
        temperature = get_param_val(flow_params, "temperature", temperature)
        max_seq_len = get_param_val(model_params, "max_seq_len", max_seq_len)

    self.num_flows = num_flows
    self.hidden_size = hidden_size
    self.temperature = temperature
    self.vocab_size = num_classes
    self.model_name = model_name

    flows = []
    for i in range(self.num_flows):
        if model_name == "DAF":
            layer = MADE(
                [batch_size, max_seq_len, self.vocab_size],
                self.vocab_size,
                [self.hidden_size, self.hidden_size, self.hidden_size])
            disc_layer = DiscreteAutoregressiveFlow(layer, temperature,
                                                    self.vocab_size)
        elif model_name == "DBF":
            vector_length = self.vocab_size * max_seq_len
            # The bipartite flow is given an identity function as its layer.
            layer = lambda inputs, **kwargs: inputs
            # i % 2 flips the parity of the masking. It splits the vector
            # in half and alternates each flow between changing the first
            # half or the second.
            disc_layer = DiscreteBipartiteFlow(layer, i % 2, temperature,
                                               self.vocab_size,
                                               vector_length)
        else:
            raise ValueError(
                "Unknown discrete flow model name \"%s\". "
                "Expected \"DAF\" or \"DBF\"." % str(model_name))
        flows.append(disc_layer)

    self.flows = nn.ModuleList(flows)

    # Random base probability distribution. It is created here as a plain
    # tensor with requires_grad=True (it is not wrapped in nn.Parameter in
    # this constructor).
    self.base_log_probs = torch.randn(
        max_seq_len,
        self.vocab_size).clone().detach().requires_grad_(True)
def __init__(self,
             model,
             model_params,
             load_data=True,
             debug=False,
             batch_size=64):
    """Set up the language-modeling task.

    Delegates the common setup to the parent constructor under the name
    ``"TaskLanguageModeling"``, then creates the prior distribution from
    the mandatory ``prior_distribution`` parameters, the beta scheduler
    from ``self.model_params["beta"]`` and an empty summary dictionary.
    """
    super().__init__(model,
                     model_params,
                     load_data=load_data,
                     debug=debug,
                     batch_size=batch_size,
                     name="TaskLanguageModeling")

    # The prior configuration has no default; a missing entry is reported
    # by get_param_val with the given error location.
    prior_cfg = get_param_val(self.model_params,
                              "prior_distribution",
                              allow_default=False,
                              error_location="TaskLanguageModeling - init")
    self.prior_distribution = create_prior_distribution(prior_cfg)

    beta_cfg = self.model_params["beta"]
    self.beta_scheduler = create_scheduler(beta_cfg, "beta")

    # Buffers for values collected between summary writes.
    self.summary_dict = {"log_prob": [], "ldj": [], "z": [], "beta": 0}
def evaluate_model(self, checkpoint_model=None):
    """Evaluate the best saved model on the test set and export the metrics.

    The most recent checkpoint is loaded first; its ``best_save_dict``
    entry names the checkpoint file of the best model, which is then
    loaded into ``self.model``. The stored validation metrics are printed,
    ``self.task.test()`` is run, and the test metrics are printed and
    written to ``eval_metrics.json`` inside ``self.checkpoint_path``.

    Robustness fixes over the previous version:
      * a checkpoint without a recorded best file (``"file": None``) no
        longer crashes in ``os.path.isfile`` / ``str.split``;
      * a stored path without a ``"checkpoints"`` component no longer
        raises ``ValueError`` from ``list.index``;
      * ``detailed_metrics`` being ``None`` is treated as "no metrics".

    Args:
        checkpoint_model: Not used by this method.
    """
    # Load the "best" model by first loading the most recent one and
    # determining the "best" model from its bookkeeping dictionary.
    checkpoint_dict = self.load_recent_model()
    best_save_dict = get_param_val(checkpoint_dict,
                                   "best_save_dict", {
                                       "file": None,
                                       "metric": -1,
                                       "detailed_metrics": dict()
                                   },
                                   warning_if_default=True)
    best_save_iter = best_save_dict["file"]

    if best_save_iter is None:
        print("[!] WARNING: No best model file is recorded in the "
              "checkpoint. Evaluating the currently loaded model.")
    else:
        if not os.path.isfile(best_save_iter):
            # The stored path may not resolve from the current working
            # directory; retry with the part starting at "checkpoints".
            splits = best_save_iter.split("/")
            if "checkpoints" in splits:
                checkpoint_index = splits.index("checkpoints")
                best_save_iter = "/".join(splits[checkpoint_index:])
        if not os.path.isfile(best_save_iter):
            print(
                "[!] WARNING: Tried to load best model \"%s\", but file does not exist"
                % (best_save_iter))
        else:
            load_model(best_save_iter, model=self.model)

    # Print saved information of performance on validation set
    print("\n" + "-" * 100 + "\n")
    print("Best evaluation iteration", best_save_iter)
    print("Best evaluation metric", best_save_dict["metric"])
    print("Detailed metrics")
    saved_metrics = best_save_dict["detailed_metrics"] or {}
    for metric_name, metric_val in saved_metrics.items():
        print("-> %s: %s" % (metric_name, str(metric_val)))
    print("\n" + "-" * 100 + "\n")

    # Test model
    self.task.checkpoint_path = self.checkpoint_path
    eval_metric, detailed_metrics = self.task.test()

    # Print test results
    out_dict = {}
    print("Evaluation metric", eval_metric)
    print("Detailed metrics")
    for metric_name, metric_val in detailed_metrics.items():
        print("-> %s: %s" % (metric_name, str(metric_val)))
        # Tensors are not JSON serializable, so store their string form.
        out_dict[metric_name] = str(metric_val) if isinstance(
            metric_val, torch.Tensor) else metric_val
    print("\n" + "-" * 100 + "\n")

    # Save test results externally
    with open(os.path.join(self.checkpoint_path, "eval_metrics.json"),
              "w") as f:
        json.dump(out_dict, f, indent=4)
def create_prior_distribution(distribution_params):
    """Create the prior distribution described by ``distribution_params``.

    The ``distribution_type`` entry (default: ``PriorDistribution.LOGISTIC``)
    selects the class. Every entry of ``distribution_params`` whose value is
    not ``None`` is forwarded to the constructor as a keyword argument.
    An unknown type prints an error and terminates the process with exit
    code 1.
    """
    distribution_type = get_param_val(distribution_params,
                                      "distribution_type",
                                      PriorDistribution.LOGISTIC)

    # Drop unset entries so the constructors fall back to their defaults.
    input_params = {}
    for key, val in distribution_params.items():
        if val is not None:
            input_params[key] = val

    known_distributions = (
        (PriorDistribution.GAUSSIAN, GaussianDistribution),
        (PriorDistribution.LOGISTIC, LogisticDistribution),
    )
    for known_type, distribution_class in known_distributions:
        if known_type == distribution_type:
            return distribution_class(**input_params)

    print("[!] ERROR: Unknown distribution type %s" % str(distribution_type))
    sys.exit(1)
def __init__(self,
             model_params,
             optimizer_params,
             batch_size,
             checkpoint_path,
             debug=False,
             name_prefix="",
             multi_gpu=False):
    """Create model, task, optimizer and checkpoint handling for training.

    The steps run in a fixed order: store the configuration, build the
    model via ``self._create_model``, optionally wrap it for multi-GPU
    use, move it to the device returned by ``get_device()``, build the task
    via ``self._create_task``, then set up the optimizer and the checkpoint
    directory.
    """
    self.batch_size = batch_size
    model_name = get_param_val(model_params, "model_name", default_val="CNF")
    # Strip possible spaces from the prefix; the resulting name is used for
    # creating the default checkpoint path.
    self.name_prefix = name_prefix.strip() + model_name
    self.model_params = model_params
    self.optimizer_params = optimizer_params

    ## Load model
    self.model = self._create_model(model_params)
    if multi_gpu:
        # Wrapping only happens when more than one GPU is present.
        num_gpus = torch.cuda.device_count()
        if num_gpus not in (0, 1):
            print("Preparing to use %i GPUs..." % (num_gpus))
            self.model = WrappedDataParallel(self.model)
        elif num_gpus == 0:
            print(
                "[#] WARNING: Multi-GPU training failed because no GPU was detected. Continuing with single GPU..."
            )
        else:
            print(
                "[#] WARNING: Multi-GPU training failed because only a single GPU is available. Continuing with single GPU..."
            )
    target_device = get_device()
    self.model = self.model.to(target_device)

    ## Load task
    self.task = self._create_task(model_params, debug=debug)

    ## Load optimizer and checkpoints
    self._create_optimizer(model_params, optimizer_params)
    self._prepare_checkpoint(checkpoint_path)
def create_scheduler(scheduler_params, param_name=None):
    """Create a parameter scheduler from its configuration dictionary.

    All six ``scheduler_*`` entries are read without a default, whatever
    the scheduler type, so each of them has to be present in
    ``scheduler_params``. Supported types are ``"constant"``, ``"linear"``,
    ``"sigmoid"`` and ``"exponential"``; any other value prints an error
    and terminates the process with exit code 1.
    """

    def required(key):
        # Every scheduler entry is mandatory.
        return get_param_val(scheduler_params, key, allow_default=False)

    sched_type = required("scheduler_type")
    end_val = required("scheduler_end_val")
    start_val = required("scheduler_start_val")
    stepsize = required("scheduler_step_size")
    logit = required("scheduler_logit")
    delay = required("scheduler_delay")

    if sched_type == "constant":
        return ConstantScheduler(const_val=end_val, param_name=param_name)

    # Keyword arguments shared by all non-constant schedulers.
    ramp_kwargs = {
        "start_val": start_val,
        "end_val": end_val,
        "stepsize": stepsize,
        "delay": delay,
        "param_name": param_name,
    }
    if sched_type == "linear":
        return LinearScheduler(**ramp_kwargs)
    if sched_type == "sigmoid":
        return SigmoidScheduler(logit_factor=logit, **ramp_kwargs)
    if sched_type == "exponential":
        return ExponentialScheduler(logit_factor=logit, **ramp_kwargs)

    print("[!] ERROR: Unknown scheduler type \"%s\"" % str(sched_type))
    sys.exit(1)
def train_model(self,
                max_iterations=1e6,
                loss_freq=50,
                eval_freq=2000,
                save_freq=1e5,
                max_gradient_norm=0.25,
                no_model_checkpoints=False):
    """Run the training loop with periodic logging, evaluation and saving.

    Training resumes from the most recent checkpoint if one exists. After
    the loop, the best checkpoint is reloaded and re-evaluated as a sanity
    check, the test set is evaluated, and the results are written to
    ``results.txt`` in ``self.checkpoint_path``. A ``KeyboardInterrupt``
    stops training early and saves the current state.

    Bug fix: gradient clipping used to receive the *generator* returned
    once by ``self.model.parameters()``. The first clipping call exhausted
    it, so every later iteration clipped nothing. A fresh iterator is now
    requested on every step.

    Args:
        max_iterations: Upper bound (exclusive) of the iteration counter.
        loss_freq: Number of iterations between training-loss log entries.
        eval_freq: Number of iterations between validation runs.
        save_freq: Number of iterations between unconditional checkpoints.
        max_gradient_norm: Maximum gradient norm passed to
            ``torch.nn.utils.clip_grad_norm_``.
        no_model_checkpoints: If True, no checkpoint files are written.
    """
    # Setup dictionary to save evaluation details in
    checkpoint_dict = self.load_recent_model()
    # Iteration to start from
    start_iter = get_param_val(checkpoint_dict,
                               "iteration",
                               0,
                               warning_if_default=False)
    # Dictionary containing validation performances over time
    evaluation_dict = get_param_val(checkpoint_dict,
                                    "evaluation_dict",
                                    dict(),
                                    warning_if_default=False)
    best_save_dict = get_param_val(checkpoint_dict,
                                   "best_save_dict", {
                                       "file": None,
                                       "metric": 1e6,
                                       "detailed_metrics": None,
                                       "test": None
                                   },
                                   warning_if_default=False)
    best_save_iter = best_save_dict["file"]
    last_save = None if start_iter == 0 else self.get_checkpoint_filename(
        start_iter)
    if last_save is not None and not os.path.isfile(last_save):
        print(
            "[!] WARNING: Could not find last checkpoint file specified as "
            + last_save)
        last_save = None
    test_NLL = None  # Possible test performance determined in the end of the training

    # Initialize tensorboard writer
    writer = SummaryWriter(self.checkpoint_path)

    # Function for saving model. Add here in the dictionary necessary
    # parameters that should be saved
    def save_train_model(iteration, only_weights=True):
        if no_model_checkpoints:
            return
        checkpoint_dict = {
            "iteration": iteration,
            "best_save_dict": best_save_dict,
            "evaluation_dict": evaluation_dict
        }
        self.save_model(iteration,
                        checkpoint_dict,
                        save_optimizer=not only_weights)

    # Function to export the current results to a txt file
    def export_result_txt():
        if best_save_iter is not None:
            with open(os.path.join(self.checkpoint_path, "results.txt"),
                      "w") as f:
                f.write("Best validation performance: %s\n" %
                        (str(best_save_dict["metric"])))
                # The iteration number is parsed from the checkpoint file
                # name: the part after the last "_" and before the first ".".
                f.write(
                    "Best iteration: %i\n" %
                    int(str(best_save_iter).split("_")[-1].split(".")[0]))
                f.write("Best checkpoint: %s\n" % str(best_save_iter))
                f.write("Detailed metrics\n")
                for metric_name, metric_val in best_save_dict[
                        "detailed_metrics"].items():
                    f.write("-> %s: %s\n" % (metric_name, str(metric_val)))
                if "test" in best_save_dict and best_save_dict[
                        "test"] is not None:
                    f.write("Test - Detailed metrics\n")
                    for metric_name, metric_val in best_save_dict[
                            "test"].items():
                        f.write("[TEST] -> %s: %s\n" %
                                (metric_name, str(metric_val)))
                f.write("\n")

    # "Trackers" are moving averages. We use them to log the loss and time
    # needed per training iteration
    time_per_step = Tracker()
    train_losses = Tracker()

    # Try-catch if user terminates
    try:
        index_iter = -1
        self.model.eval()
        self.task.initialize()
        print("=" * 50 + "\nStarting training...\n" + "=" * 50)
        self.model.train()

        print("Performing initial evaluation...")
        self.model.eval()
        eval_NLL, detailed_scores = self.task.eval(initial_eval=True)
        self.model.train()
        write_dict_to_tensorboard(writer,
                                  detailed_scores,
                                  base_name="eval",
                                  iteration=start_iter)

        for index_iter in range(start_iter, int(max_iterations)):

            # Training step
            start_time = time.time()
            loss = self.task.train_step(iteration=index_iter)
            self.optimizer.zero_grad()
            loss.backward()
            # Request a fresh parameter iterator on every step; a generator
            # stored once before the loop would be empty after its first use.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           max_gradient_norm)
            if self.model.model_name in ["DAF", "DBF"]:
                # The discrete flow models keep their base distribution in
                # a separate tensor, which is clipped on its own.
                torch.nn.utils.clip_grad_norm_(self.model.base_log_probs,
                                               max_gradient_norm)
            self.optimizer.step()
            # The learning rate is only decayed while above the minimum.
            if self.optimizer.param_groups[0]['lr'] > self.lr_minimum:
                self.lr_scheduler.step()
            end_time = time.time()

            time_per_step.add(end_time - start_time)
            train_losses.add(loss.item())

            # Statement for detecting NaN values
            if torch.isnan(loss).item():
                print("[!] ERROR: Loss is NaN!" + str(loss.item()))
            for name, param in self.model.named_parameters():
                if param.requires_grad:
                    if torch.isnan(param).sum() > 0:
                        print(
                            "[!] ERROR: Parameter %s has %s NaN values!\n" %
                            (name, str(torch.isnan(param).sum())) +
                            "Grad values NaN: %s.\n" %
                            (str(torch.isnan(param.grad).sum())
                             if param.grad is not None else "no gradients") +
                            "Grad values avg: %s.\n" %
                            (str(param.grad.abs().mean())
                             if param.grad is not None else "no gradients") +
                            "Last loss: %s" % (str(loss)))

            # Printing current loss etc. for debugging
            if (index_iter + 1) % loss_freq == 0:
                loss_avg = train_losses.get_mean(reset=True)
                bpd_avg = self.task.loss_to_bpd(loss_avg)
                train_time_avg = time_per_step.get_mean(reset=True)
                max_memory = torch.cuda.max_memory_allocated(
                    device=get_device()
                ) / 1.0e9 if torch.cuda.is_available() else -1
                print(
                    "Training iteration %i|%i (%4.2fs). Loss: %6.5f, Bpd: %6.4f [Mem: %4.2fGB]"
                    % (index_iter + 1, max_iterations, train_time_avg,
                       loss_avg, bpd_avg, max_memory))
                writer.add_scalar("train/loss", loss_avg, index_iter + 1)
                writer.add_scalar("train/bpd", bpd_avg, index_iter + 1)
                writer.add_scalar("train/learning_rate",
                                  self.optimizer.param_groups[0]['lr'],
                                  index_iter + 1)
                writer.add_scalar("train/training_time", train_time_avg,
                                  index_iter + 1)
                self.task.add_summary(writer,
                                      index_iter + 1,
                                      checkpoint_path=self.checkpoint_path)

            # Performing evaluation every "eval_freq" steps
            if (index_iter + 1) % eval_freq == 0:
                self.model.eval()
                eval_NLL, detailed_scores = self.task.eval()
                self.model.train()
                write_dict_to_tensorboard(writer,
                                          detailed_scores,
                                          base_name="eval",
                                          iteration=index_iter + 1)

                # If model performed better on validation than any other
                # iteration so far => save it and eventually replace old model
                if eval_NLL < best_save_dict["metric"]:
                    best_save_iter = self.get_checkpoint_filename(
                        index_iter + 1)
                    best_save_dict["metric"] = eval_NLL
                    best_save_dict["detailed_metrics"] = detailed_scores
                    if not os.path.isfile(best_save_iter):
                        print("Saving model at iteration " +
                              str(index_iter + 1))
                        if best_save_dict[
                                "file"] is not None and os.path.isfile(
                                    best_save_dict["file"]):
                            print("Removing checkpoint %s..." %
                                  best_save_dict["file"])
                            os.remove(best_save_dict["file"])
                        if last_save is not None and os.path.isfile(
                                last_save):
                            print("Removing checkpoint %s..." % last_save)
                            os.remove(last_save)
                        best_save_dict["file"] = best_save_iter
                        last_save = best_save_iter
                        save_train_model(index_iter + 1)
                    self.task.export_best_results(self.checkpoint_path,
                                                  index_iter + 1)
                    export_result_txt()
                evaluation_dict[index_iter + 1] = best_save_dict["metric"]

            # Independent of evaluation, the model is saved every
            # "save_freq" steps. This prevents loss of information if model
            # does not improve for a while
            if (index_iter + 1) % save_freq == 0 and not os.path.isfile(
                    self.get_checkpoint_filename(index_iter + 1)):
                save_train_model(index_iter + 1)
                if last_save is not None and os.path.isfile(
                        last_save) and last_save != best_save_iter:
                    print("Removing checkpoint %s..." % last_save)
                    os.remove(last_save)
                last_save = self.get_checkpoint_filename(index_iter + 1)
        ## End training loop

        # Before testing, load best model and check whether its validation
        # performance is in the right range (to prevent major loading issues)
        if not no_model_checkpoints and best_save_iter is not None:
            load_model(best_save_iter,
                       model=self.model,
                       optimizer=self.optimizer,
                       lr_scheduler=self.lr_scheduler)
            eval_NLL, detailed_scores = self.task.eval()
            if eval_NLL != best_save_dict["metric"]:
                if abs(eval_NLL - best_save_dict["metric"]) > 1e-1:
                    print(
                        "[!] WARNING: new evaluation significantly differs from saved one (%s vs %s)! Probably a mistake in the saving/loading part..."
                        % (str(eval_NLL), str(best_save_dict["metric"])))
                else:
                    print(
                        "[!] WARNING: new evaluation sligthly differs from saved one (%s vs %s)."
                        % (str(eval_NLL), str(best_save_dict["metric"])))
        else:
            print("Using last model as no models were saved...")

        # Testing the trained model
        test_NLL, detailed_scores = self.task.test()
        print("=" * 50 + "\nTest performance: %lf" % (test_NLL))
        detailed_scores["original_NLL"] = test_NLL
        best_save_dict["test"] = detailed_scores
        self.task.finalize_summary(writer, max_iterations,
                                   self.checkpoint_path)

    # If user terminates training early, replace last model saved per
    # "save_freq" steps by current one
    except KeyboardInterrupt:
        if index_iter > 0:
            print(
                "User keyboard interrupt detected. Saving model at step %i..."
                % (index_iter))
            save_train_model(index_iter + 1)
        else:
            print(
                "User keyboard interrupt detected before starting to train."
            )
        # Remove the previous periodic checkpoint unless it is referenced
        # by one of the values in the best-model dictionary.
        if last_save is not None and os.path.isfile(last_save) and not any(
            [val == last_save for _, val in best_save_dict.items()]):
            os.remove(last_save)

    export_result_txt()

    writer.close()
def __init__(self,
             num_classes,
             hidden_size=64,
             num_layers=2,
             embedding_dim=32,
             dp_rate=0.0,
             input_dp_rate=0.0,
             max_seq_len=-1,
             vocab=None,
             model_params=None):
    """Build the LSTM language model.

    When ``model_params`` is given, its ``coupling_*`` and ``max_seq_len``
    entries override the corresponding keyword arguments, and
    ``embedding_dim`` is set to a quarter of the hidden size. If ``vocab``
    carries pre-trained vectors, the embedding layer takes its size and
    initial weights from them. A time embedding is concatenated to the
    token embedding unless ``input_dp_rate`` is 1.0 or larger.
    """
    super().__init__()
    self.model_name = "RNN"

    if model_params is not None:

        def override(key, fallback):
            # The configured value wins over the keyword argument.
            return get_param_val(model_params, key, fallback)

        hidden_size = override("coupling_hidden_size", hidden_size)
        embedding_dim = hidden_size // 4
        num_layers = override("coupling_hidden_layers", num_layers)
        dp_rate = override("coupling_dropout", dp_rate)
        input_dp_rate = override("coupling_input_dropout", input_dp_rate)
        max_seq_len = override("max_seq_len", max_seq_len)

    self.num_layers = num_layers
    self.hidden_size = hidden_size
    # Not equal to embedding_dim, is needed for making sampling equal to flows
    self.embed_dim = 1

    # Token embeddings: copied from the vocabulary vectors when available,
    # freshly initialised otherwise.
    has_pretrained_vectors = vocab is not None and vocab.vectors is not None
    if not has_pretrained_vectors:
        self.embeddings = nn.Embedding(num_embeddings=num_classes,
                                       embedding_dim=embedding_dim)
        self.vocab_size = num_classes
    else:
        pretrained = vocab.vectors
        embedding_dim = pretrained.shape[1]
        self.embeddings = nn.Embedding(num_embeddings=len(vocab),
                                       embedding_dim=embedding_dim)
        self.embeddings.weight.data.copy_(pretrained)
        self.vocab_size = len(vocab)

    # Optional time embedding that is concatenated to the token embedding.
    if input_dp_rate < 1.0:
        time_embed_dim = embedding_dim // 4
        time_embed = nn.Linear(2 * max_seq_len, time_embed_dim)
        self.max_seq_len = max_seq_len
        self.time_concat = TimeConcat(time_embed=time_embed,
                                      input_dp_rate=input_dp_rate)
    else:
        self.time_concat = None
        time_embed_dim = 0

    lstm_input_size = embedding_dim + time_embed_dim
    self.lstm = nn.LSTM(input_size=lstm_input_size,
                        hidden_size=hidden_size,
                        num_layers=num_layers,
                        batch_first=True,
                        bidirectional=False)
    self.init_state = nn.Parameter(torch.zeros(num_layers, 1, hidden_size))

    # Output head that ends in log-probabilities over the classes.
    bottleneck = hidden_size // 2
    head_layers = [
        nn.Linear(hidden_size, bottleneck),
        nn.GELU(),
        nn.Dropout(dp_rate),
        nn.Linear(bottleneck, num_classes),
        nn.LogSoftmax(dim=-1),
    ]
    self.output_layer = nn.Sequential(*head_layers)