Example no. 1
    def _load_datasets(self):
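        """Load the train/validation/test splits of the configured dataset
        and, if available, a prior over sequence lengths."""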
        self.max_seq_len = get_param_val(self.model_params,
                                         "max_seq_len",
                                         allow_default=False)

        dataset_name = get_param_val(self.model_params,
                                     "dataset",
                                     default_val="penntreebank")
        self.dataset_class = TaskLanguageModeling.get_dataset_class(
            dataset_name)
        print("Loading dataset %s..." % dataset_name)

        self.train_dataset = self.dataset_class(max_seq_len=self.max_seq_len,
                                                train=True)
        self.val_dataset = self.dataset_class(max_seq_len=self.max_seq_len,
                                              val=True)
        self.test_dataset = self.dataset_class(max_seq_len=self.max_seq_len,
                                               test=True)

        # Optional prior over sequence lengths, if the dataset provides one
        if hasattr(self.dataset_class, "get_length_prior"):
            self.length_prior = self.dataset_class.get_length_prior(
                max_seq_len=self.max_seq_len)
        else:
            self.length_prior = None
Example no. 2
    def _create_model(self, model_params):
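        """Instantiate the language model selected via "model_name": an LSTM
        baseline (RNN), a Categorical Normalizing Flow (CNF), or a discrete
        autoregressive/bipartite flow (DAF/DBF)."""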
        dataset_name = get_param_val(self.model_params,
                                     "dataset",
                                     default_val="penntreebank")
        dataset_class = TaskLanguageModeling.get_dataset_class(dataset_name)
        vocab_dict = dataset_class.get_vocabulary()
        vocab_torchtext = dataset_class.get_torchtext_vocab()

        model_name = get_param_val(self.model_params,
                                   "model_name",
                                   default_val="CNF")
        if model_name == "RNN":
            model = LSTMModel(num_classes=len(vocab_dict),
                              vocab=vocab_torchtext,
                              model_params=model_params)
        elif model_name == "CNF":
            model = CNFLanguageModeling(model_params=model_params,
                                        vocab_size=len(vocab_dict),
                                        vocab=vocab_torchtext,
                                        dataset_class=dataset_class)
        elif model_name in ["DAF", "DBF"]:
            model = DFModel(num_classes=len(vocab_dict),
                            batch_size=self.batch_size,
                            model_params=model_params,
                            model_name=model_name)
        else:
            print("[!] ERROR: Unknown model name \"%s\"" % str(model_name))
            sys.exit(1)
        return model
Example no. 3
    def __init__(self,
                 num_classes,
                 batch_size=64,
                 hidden_size=8,
                 num_flows=1,
                 temperature=0.1,
                 max_seq_len=-1,
                 model_params=None,
                 model_name="DAF"):
        super().__init__()
        hidden_size = get_param_val(model_params["discrete_flow"], "nh",
                                    hidden_size)
        num_flows = get_param_val(model_params["discrete_flow"], "num_flows",
                                  num_flows)
        temperature = get_param_val(model_params["discrete_flow"],
                                    "temperature", temperature)
        max_seq_len = get_param_val(model_params, "max_seq_len", max_seq_len)

        self.num_flows = num_flows
        self.hidden_size = hidden_size
        self.temperature = temperature
        self.vocab_size = num_classes
        self.model_name = model_name

        flows = []
        for i in range(self.num_flows):
            if model_name == "DAF":
                layer = MADE(
                    [batch_size, max_seq_len, self.vocab_size],
                    self.vocab_size,
                    [self.hidden_size, self.hidden_size, self.hidden_size])
                disc_layer = DiscreteAutoregressiveFlow(
                    layer, temperature, self.vocab_size)

            elif model_name == "DBF":
                vector_length = self.vocab_size * max_seq_len
                layer = lambda inputs, **kwargs: inputs
                disc_layer = DiscreteBipartiteFlow(layer, i % 2, temperature,
                                                   self.vocab_size,
                                                   vector_length)
                # i%2 flips the parity of the masking. It splits the vector in half and alternates
                # each flow between changing the first half or the second.

            flows.append(disc_layer)

        self.flows = nn.ModuleList(flows)

        # Learnable logits of the random categorical base distribution
        self.base_log_probs = torch.randn(max_seq_len,
                                          self.vocab_size,
                                          requires_grad=True)
Example no. 4
    def __init__(self,
                 model,
                 model_params,
                 load_data=True,
                 debug=False,
                 batch_size=64):
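        """Language modeling task: sets up the prior distribution, the beta
        scheduler, and a summary dictionary for logging."""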
        super().__init__(model,
                         model_params,
                         load_data=load_data,
                         debug=debug,
                         batch_size=batch_size,
                         name="TaskLanguageModeling")

        prior_dist_params = get_param_val(
            self.model_params,
            "prior_distribution",
            allow_default=False,
            error_location="TaskLanguageModeling - init")
        self.prior_distribution = create_prior_distribution(prior_dist_params)

        self.beta_scheduler = create_scheduler(self.model_params["beta"],
                                               "beta")

        self.summary_dict = {
            "log_prob": list(),
            "ldj": list(),
            "z": list(),
            "beta": 0
        }
Example no. 5
    def evaluate_model(self, checkpoint_model=None):
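        """Load the best checkpoint found during training, print its saved
        validation metrics, run the test set, and export the results to
        "eval_metrics.json"."""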
        ## Function for evaluation/testing of a model

        # Load the "best" model by first loading the most recent one and determining the "best" model
        checkpoint_dict = self.load_recent_model()
        best_save_dict = get_param_val(checkpoint_dict,
                                       "best_save_dict", {
                                           "file": None,
                                           "metric": -1,
                                           "detailed_metrics": dict()
                                       },
                                       warning_if_default=True)  #
        best_save_iter = best_save_dict["file"]
        if not os.path.isfile(best_save_iter):
            splits = best_save_iter.split("/")
            checkpoint_index = splits.index("checkpoints")
            best_save_iter = "/".join(splits[checkpoint_index:])
        if not os.path.isfile(best_save_iter):
            print(
                "[!] WARNING: Tried to load best model \"%s\", but file does not exist"
                % (best_save_iter))
        else:
            load_model(best_save_iter, model=self.model)

        # Print saved information of performance on validation set
        print("\n" + "-" * 100 + "\n")
        print("Best evaluation iteration", best_save_iter)
        print("Best evaluation metric", best_save_dict["metric"])
        print("Detailed metrics")
        for metric_name, metric_val in best_save_dict[
                "detailed_metrics"].items():
            print("-> %s: %s" % (metric_name, str(metric_val)))
        print("\n" + "-" * 100 + "\n")

        # Test model
        self.task.checkpoint_path = self.checkpoint_path
        eval_metric, detailed_metrics = self.task.test()

        # Print test results
        out_dict = {}
        print("Evaluation metric", eval_metric)
        print("Detailed metrics")
        for metric_name, metric_val in detailed_metrics.items():
            print("-> %s: %s" % (metric_name, str(metric_val)))
            out_dict[metric_name] = str(metric_val) if isinstance(
                metric_val, torch.Tensor) else metric_val
        print("\n" + "-" * 100 + "\n")

        # Save test results externally
        with open(os.path.join(self.checkpoint_path, "eval_metrics.json"),
                  "w") as f:
            json.dump(out_dict, f, indent=4)
Example no. 6
def create_prior_distribution(distribution_params):
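    """Create the prior distribution (Gaussian or logistic) specified in the
    given parameter dictionary; all non-None entries are forwarded to the
    distribution's constructor."""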
    distribution_type = get_param_val(distribution_params, "distribution_type",
                                      PriorDistribution.LOGISTIC)
    input_params = {
        key: val
        for key, val in distribution_params.items() if val is not None
    }

    if PriorDistribution.GAUSSIAN == distribution_type:
        return GaussianDistribution(**input_params)
    elif PriorDistribution.LOGISTIC == distribution_type:
        return LogisticDistribution(**input_params)
    else:
        print("[!] ERROR: Unknown distribution type %s" %
              str(distribution_type))
        sys.exit(1)
Example no. 7
    def __init__(self,
                 model_params,
                 optimizer_params,
                 batch_size,
                 checkpoint_path,
                 debug=False,
                 name_prefix="",
                 multi_gpu=False):
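        """Build the model, task, optimizer, and checkpoint directory;
        optionally wrap the model in WrappedDataParallel for multi-GPU
        training."""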
        self.batch_size = batch_size
        model_name = get_param_val(model_params,
                                   "model_name",
                                   default_val="CNF")
        # Strip possible spaces; the name is used for the default checkpoint path
        self.name_prefix = name_prefix.strip() + model_name
        self.model_params = model_params
        self.optimizer_params = optimizer_params
        ## Load model
        self.model = self._create_model(model_params)
        if multi_gpu:  # Testing for multi-gpu if selected
            num_gpus = torch.cuda.device_count()
            if num_gpus == 0:
                print(
                    "[#] WARNING: Multi-GPU training requested, but no GPU was detected. Continuing on CPU..."
                )
            elif num_gpus == 1:
                print(
                    "[#] WARNING: Multi-GPU training requested, but only a single GPU is available. Continuing with single GPU..."
                )
            else:
                print("Preparing to use %i GPUs..." % (num_gpus))
                self.model = WrappedDataParallel(self.model)

        self.model = self.model.to(get_device())
        ## Load task
        self.task = self._create_task(model_params, debug=debug)
        ## Load optimizer and checkpoints
        self._create_optimizer(model_params, optimizer_params)
        self._prepare_checkpoint(checkpoint_path)
Example no. 8
def create_scheduler(scheduler_params, param_name=None):
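    """Create a parameter scheduler (constant, linear, sigmoid, or
    exponential) from the given parameter dictionary."""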
    sched_type = get_param_val(scheduler_params,
                               "scheduler_type",
                               allow_default=False)
    end_val = get_param_val(scheduler_params,
                            "scheduler_end_val",
                            allow_default=False)
    start_val = get_param_val(scheduler_params,
                              "scheduler_start_val",
                              allow_default=False)
    stepsize = get_param_val(scheduler_params,
                             "scheduler_step_size",
                             allow_default=False)
    logit = get_param_val(scheduler_params,
                          "scheduler_logit",
                          allow_default=False)
    delay = get_param_val(scheduler_params,
                          "scheduler_delay",
                          allow_default=False)

    if sched_type == "constant":
        return ConstantScheduler(const_val=end_val, param_name=param_name)
    elif sched_type == "linear":
        return LinearScheduler(start_val=start_val,
                               end_val=end_val,
                               stepsize=stepsize,
                               delay=delay,
                               param_name=param_name)
    elif sched_type == "sigmoid":
        return SigmoidScheduler(start_val=start_val,
                                end_val=end_val,
                                logit_factor=logit,
                                stepsize=stepsize,
                                delay=delay,
                                param_name=param_name)
    elif sched_type == "exponential":
        return ExponentialScheduler(start_val=start_val,
                                    end_val=end_val,
                                    logit_factor=logit,
                                    stepsize=stepsize,
                                    delay=delay,
                                    param_name=param_name)
    else:
        print("[!] ERROR: Unknown scheduler type \"%s\"" % str(sched_type))
        sys.exit(1)
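
A minimal usage sketch (not from the source): the key names below mirror the
get_param_val lookups above, all of which are required (allow_default=False);
the concrete values are illustrative assumptions only.

    # Hypothetical parameter dictionary for a sigmoid beta schedule;
    # the numeric values are placeholders, not taken from the repository.
    beta_params = {
        "scheduler_type": "sigmoid",
        "scheduler_start_val": 0.0,
        "scheduler_end_val": 1.0,
        "scheduler_logit": 2.0,
        "scheduler_step_size": 5000,
        "scheduler_delay": 0,
    }
    beta_scheduler = create_scheduler(beta_params, param_name="beta")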
Example no. 9
    def train_model(self,
                    max_iterations=1e6,
                    loss_freq=50,
                    eval_freq=2000,
                    save_freq=1e5,
                    max_gradient_norm=0.25,
                    no_model_checkpoints=False):
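        """Main training loop: optimization steps with gradient clipping,
        periodic evaluation and checkpointing of the best model, and a final
        test run on the best checkpoint."""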

        parameters_to_optimize = self.model.parameters()

        # Setup dictionary to save evaluation details in
        checkpoint_dict = self.load_recent_model()
        start_iter = get_param_val(
            checkpoint_dict, "iteration", 0,
            warning_if_default=False)  # Iteration to start from
        evaluation_dict = get_param_val(
            checkpoint_dict,
            "evaluation_dict",
            dict(),
            warning_if_default=False
        )  # Dictionary containing validation performances over time
        best_save_dict = get_param_val(checkpoint_dict,
                                       "best_save_dict", {
                                           "file": None,
                                           "metric": 1e6,
                                           "detailed_metrics": None,
                                           "test": None
                                       },
                                       warning_if_default=False)
        best_save_iter = best_save_dict["file"]
        last_save = None if start_iter == 0 else self.get_checkpoint_filename(
            start_iter)
        if last_save is not None and not os.path.isfile(last_save):
            print(
                "[!] WARNING: Could not find last checkpoint file specified as "
                + last_save)
            last_save = None
        test_NLL = None  # Possible test performance, determined at the end of training

        # Initialize tensorboard writer
        writer = SummaryWriter(self.checkpoint_path)

        # Function for saving the model. Add to this dictionary any parameters that should be saved
        def save_train_model(iteration, only_weights=True):
            if no_model_checkpoints:
                return
            checkpoint_dict = {
                "iteration": iteration,
                "best_save_dict": best_save_dict,
                "evaluation_dict": evaluation_dict
            }
            self.save_model(iteration,
                            checkpoint_dict,
                            save_optimizer=not only_weights)

        # Function to export the current results to a txt file
        def export_result_txt():
            if best_save_iter is not None:
                with open(os.path.join(self.checkpoint_path, "results.txt"),
                          "w") as f:
                    f.write("Best validation performance: %s\n" %
                            (str(best_save_dict["metric"])))
                    f.write(
                        "Best iteration: %i\n" %
                        int(str(best_save_iter).split("_")[-1].split(".")[0]))
                    f.write("Best checkpoint: %s\n" % str(best_save_iter))
                    f.write("Detailed metrics\n")
                    for metric_name, metric_val in best_save_dict[
                            "detailed_metrics"].items():
                        f.write("-> %s: %s\n" % (metric_name, str(metric_val)))
                    if "test" in best_save_dict and best_save_dict[
                            "test"] is not None:
                        f.write("Test - Detailed metrics\n")
                        for metric_name, metric_val in best_save_dict[
                                "test"].items():
                            f.write("[TEST] -> %s: %s\n" %
                                    (metric_name, str(metric_val)))
                    f.write("\n")

        # "Trackers" are moving averages. We use them to log the loss and time needed per training iteration
        time_per_step = Tracker()
        train_losses = Tracker()

        # Try-except block to handle early termination by the user
        try:
            index_iter = -1
            self.model.eval()
            self.task.initialize()
            print("=" * 50 + "\nStarting training...\n" + "=" * 50)
            self.model.train()

            print("Performing initial evaluation...")
            self.model.eval()
            eval_NLL, detailed_scores = self.task.eval(initial_eval=True)
            self.model.train()
            write_dict_to_tensorboard(writer,
                                      detailed_scores,
                                      base_name="eval",
                                      iteration=start_iter)

            for index_iter in range(start_iter, int(max_iterations)):

                # Training step
                start_time = time.time()
                loss = self.task.train_step(iteration=index_iter)
                self.optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(parameters_to_optimize,
                                               max_gradient_norm)
                if self.model.model_name in ["DAF", "DBF"]:
                    torch.nn.utils.clip_grad_norm_(self.model.base_log_probs,
                                                   max_gradient_norm)
                self.optimizer.step()
                if self.optimizer.param_groups[0]['lr'] > self.lr_minimum:
                    self.lr_scheduler.step()
                end_time = time.time()

                time_per_step.add(end_time - start_time)
                train_losses.add(loss.item())

                # Statement for detecting NaN values
                if torch.isnan(loss).item():
                    print("[!] ERROR: Loss is NaN!" + str(loss.item()))
                for name, param in self.model.named_parameters():
                    if param.requires_grad:
                        if torch.isnan(param).sum() > 0:
                            print("[!] ERROR: Parameter %s has %s NaN values!\n" % (name, str(torch.isnan(param).sum())) + \
                               "Grad values NaN: %s.\n" % (str(torch.isnan(param.grad).sum()) if param.grad is not None else "no gradients") + \
                               "Grad values avg: %s.\n" % (str(param.grad.abs().mean()) if param.grad is not None else "no gradients") + \
                               "Last loss: %s" % (str(loss)))

                # Printing current loss etc. for debugging
                if (index_iter + 1) % loss_freq == 0:
                    loss_avg = train_losses.get_mean(reset=True)
                    bpd_avg = self.task.loss_to_bpd(loss_avg)
                    train_time_avg = time_per_step.get_mean(reset=True)
                    max_memory = torch.cuda.max_memory_allocated(
                        device=get_device(
                        )) / 1.0e9 if torch.cuda.is_available() else -1
                    print(
                        "Training iteration %i|%i (%4.2fs). Loss: %6.5f, Bpd: %6.4f [Mem: %4.2fGB]"
                        % (index_iter + 1, max_iterations, train_time_avg,
                           loss_avg, bpd_avg, max_memory))
                    writer.add_scalar("train/loss", loss_avg, index_iter + 1)
                    writer.add_scalar("train/bpd", bpd_avg, index_iter + 1)
                    writer.add_scalar("train/learning_rate",
                                      self.optimizer.param_groups[0]['lr'],
                                      index_iter + 1)
                    writer.add_scalar("train/training_time", train_time_avg,
                                      index_iter + 1)

                    self.task.add_summary(writer,
                                          index_iter + 1,
                                          checkpoint_path=self.checkpoint_path)

                # Performing evaluation every "eval_freq" steps
                if (index_iter + 1) % eval_freq == 0:
                    self.model.eval()
                    eval_NLL, detailed_scores = self.task.eval()
                    self.model.train()

                    write_dict_to_tensorboard(writer,
                                              detailed_scores,
                                              base_name="eval",
                                              iteration=index_iter + 1)

                    # If the model performed better on validation than at any other iteration so far => save it and possibly remove the previous checkpoints
                    if eval_NLL < best_save_dict["metric"]:
                        best_save_iter = self.get_checkpoint_filename(
                            index_iter + 1)
                        best_save_dict["metric"] = eval_NLL
                        best_save_dict["detailed_metrics"] = detailed_scores
                        if not os.path.isfile(best_save_iter):
                            print("Saving model at iteration " +
                                  str(index_iter + 1))
                            if best_save_dict[
                                    "file"] is not None and os.path.isfile(
                                        best_save_dict["file"]):
                                print("Removing checkpoint %s..." %
                                      best_save_dict["file"])
                                os.remove(best_save_dict["file"])
                            if last_save is not None and os.path.isfile(
                                    last_save):
                                print("Removing checkpoint %s..." % last_save)
                                os.remove(last_save)
                            best_save_dict["file"] = best_save_iter
                            last_save = best_save_iter
                            save_train_model(index_iter + 1)
                        self.task.export_best_results(self.checkpoint_path,
                                                      index_iter + 1)
                        export_result_txt()
                    evaluation_dict[index_iter + 1] = best_save_dict["metric"]

                # Independent of evaluation, the model is saved every "save_freq" steps. This prevents loss of progress if the model does not improve for a while
                if (index_iter + 1) % save_freq == 0 and not os.path.isfile(
                        self.get_checkpoint_filename(index_iter + 1)):
                    save_train_model(index_iter + 1)
                    if last_save is not None and os.path.isfile(
                            last_save) and last_save != best_save_iter:
                        print("Removing checkpoint %s..." % last_save)
                        os.remove(last_save)
                    last_save = self.get_checkpoint_filename(index_iter + 1)
            ## End training loop

            # Before testing, load best model and check whether its validation performance is in the right range (to prevent major loading issues)
            if not no_model_checkpoints and best_save_iter is not None:
                load_model(best_save_iter,
                           model=self.model,
                           optimizer=self.optimizer,
                           lr_scheduler=self.lr_scheduler)
                eval_NLL, detailed_scores = self.task.eval()
                if eval_NLL != best_save_dict["metric"]:
                    if abs(eval_NLL - best_save_dict["metric"]) > 1e-1:
                        print(
                            "[!] WARNING: new evaluation significantly differs from saved one (%s vs %s)! Probably a mistake in the saving/loading part..."
                            % (str(eval_NLL), str(best_save_dict["metric"])))
                    else:
                        print(
                            "[!] WARNING: new evaluation sligthly differs from saved one (%s vs %s)."
                            % (str(eval_NLL), str(best_save_dict["metric"])))
            else:
                print("Using last model as no models were saved...")

            # Testing the trained model
            test_NLL, detailed_scores = self.task.test()
            print("=" * 50 + "\nTest performance: %lf" % (test_NLL))
            detailed_scores["original_NLL"] = test_NLL
            best_save_dict["test"] = detailed_scores
            self.task.finalize_summary(writer, max_iterations,
                                       self.checkpoint_path)

        # If the user terminates training early, save the current model in place of the last periodic checkpoint
        except KeyboardInterrupt:
            if index_iter > 0:
                print(
                    "User keyboard interrupt detected. Saving model at step %i..."
                    % (index_iter))
                save_train_model(index_iter + 1)
            else:
                print(
                    "User keyboard interrupt detected before starting to train."
                )
            if last_save is not None and os.path.isfile(last_save) and not any(
                [val == last_save for _, val in best_save_dict.items()]):
                os.remove(last_save)

        export_result_txt()

        writer.close()
Example no. 10
    def __init__(self,
                 num_classes,
                 hidden_size=64,
                 num_layers=2,
                 embedding_dim=32,
                 dp_rate=0.0,
                 input_dp_rate=0.0,
                 max_seq_len=-1,
                 vocab=None,
                 model_params=None):
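        """LSTM baseline for language modeling, optionally initialized with
        pretrained embeddings from a torchtext vocabulary."""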
        super().__init__()
        self.model_name = "RNN"
        if model_params is not None:
            hidden_size = get_param_val(model_params, "coupling_hidden_size",
                                        hidden_size)
            embedding_dim = hidden_size // 4
            num_layers = get_param_val(model_params, "coupling_hidden_layers",
                                       num_layers)
            dp_rate = get_param_val(model_params, "coupling_dropout", dp_rate)
            input_dp_rate = get_param_val(model_params,
                                          "coupling_input_dropout",
                                          input_dp_rate)
            max_seq_len = get_param_val(model_params, "max_seq_len",
                                        max_seq_len)

        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embed_dim = 1  # Not embedding_dim; needed so that sampling matches the flow models

        if vocab is not None and vocab.vectors is not None:
            embedding_dim = vocab.vectors.shape[1]
            self.embeddings = nn.Embedding(num_embeddings=len(vocab),
                                           embedding_dim=embedding_dim)
            self.embeddings.weight.data.copy_(vocab.vectors)
            self.vocab_size = len(vocab)
        else:
            self.embeddings = nn.Embedding(num_embeddings=num_classes,
                                           embedding_dim=embedding_dim)
            self.vocab_size = num_classes

        if input_dp_rate < 1.0:
            time_embed_dim = embedding_dim // 4
            time_embed = nn.Linear(2 * max_seq_len, time_embed_dim)
            self.max_seq_len = max_seq_len
            self.time_concat = TimeConcat(time_embed=time_embed,
                                          input_dp_rate=input_dp_rate)
        else:
            self.time_concat = None
            time_embed_dim = 0
        self.lstm = nn.LSTM(input_size=embedding_dim + time_embed_dim,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            bidirectional=False)

        self.init_state = nn.Parameter(torch.zeros(num_layers, 1, hidden_size))

        self.output_layer = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2), nn.GELU(),
            nn.Dropout(dp_rate), nn.Linear(hidden_size // 2, num_classes),
            nn.LogSoftmax(dim=-1))