Example #1
    def _get_synth_dir(hparams: ExtendedHParams,
                       use_model_name: bool = True,
                       epoch: int = None,
                       step: int = None) -> os.PathLike:
        if hparams.has_value("synth_dir"):
            save_dir = hparams.synth_dir
        else:
            if hparams.has_value("out_dir"):
                save_dir = [hparams.out_dir]
            else:
                save_dir = [os.path.curdir]

            if use_model_name and hparams.has_value("model_name"):
                save_dir.append(hparams.model_name)

            save_dir.append(Synthesiser.SYNTH_SUB_DIR)

            if epoch is not None:
                save_dir.append("e" + str(epoch))
            elif step is not None:
                save_dir.append("s" + str(step))

            save_dir = os.path.join(*save_dir)

        makedirs_safe(save_dir)
        logging.info("Selected {} as synthesis directory.".format(save_dir))
        return save_dir
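The method above assembles the synthesis output directory from optional hparams entries. Below is a minimal, self-contained sketch of the same path-assembly pattern; a plain dict stands in for ExtendedHParams, os.makedirs(exist_ok=True) replaces makedirs_safe, and the "synth" sub-directory name is only a stand-in for Synthesiser.SYNTH_SUB_DIR.

import os


def build_synth_dir(hparams: dict, model_name: str = None,
                    epoch: int = None, step: int = None) -> str:
    # An explicit synth_dir wins; otherwise compose out_dir/model_name/synth/eN.
    if hparams.get("synth_dir"):
        save_dir = hparams["synth_dir"]
    else:
        parts = [hparams.get("out_dir", os.path.curdir)]
        if model_name:
            parts.append(model_name)
        parts.append("synth")  # Stand-in for Synthesiser.SYNTH_SUB_DIR.
        if epoch is not None:
            parts.append("e" + str(epoch))
        elif step is not None:
            parts.append("s" + str(step))
        save_dir = os.path.join(*parts)
    os.makedirs(save_dir, exist_ok=True)
    return save_dir


print(build_synth_dir({"out_dir": "experiments"}, model_name="my_model",
                      epoch=10))  # experiments/my_model/synth/e10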
Example #2
    def __init__(self,
                 id_list: List[str],
                 datareaders: List,
                 hparams: ExtendedHParams,
                 is_train_set: bool = False,
                 is_val_set: bool = False,
                 is_test_set: bool = False):
        super().__init__()

        assert hparams.has_value("windowed_feature_names"), \
            "Use hparams.windowed_feature_names to define the features to " \
            "apply the windowing to. Those features have to match in length."

        self.id_list = id_list
        if is_train_set:
            self.batch_size = hparams.batch_size_train
        elif is_val_set:
            self.batch_size = hparams.batch_size_val
        elif is_test_set:
            self.batch_size = hparams.batch_size_test

        self.windowed_feature_names = hparams.windowed_feature_names
        self.window_size = hparams.get("window_size", 500)
        assert self.window_size > 1
        self.step_size = hparams.get("step_size", 50)
        self.mem_copy = hparams.get("windower_mem_copy", False)
        self.allow_shorter_sequences = hparams.get("allow_shorter_sequences",
                                                   True)
        self.random_offset = hparams.get("windower_random_offset", True)

        self.dataset = PyTorchDatareadersDataset(id_list, datareaders, hparams)
        self.length = None
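The windower above slices fixed-size windows out of variable-length sequences using window_size, step_size and an optional random offset. The following self-contained sketch shows only that windowing arithmetic on a NumPy array; it is not the dataset implementation, and the numbers are illustrative.

import numpy as np


def window_starts(seq_len, window_size=500, step_size=50,
                  allow_shorter_sequences=True, offset=0):
    # Sequences shorter than one window yield a single (shorter) window,
    # or nothing at all, depending on allow_shorter_sequences.
    if seq_len < window_size:
        return [0] if allow_shorter_sequences else []
    last_start = seq_len - window_size
    return list(range(offset, last_start + 1, step_size))


features = np.random.rand(1234, 60)  # frames x feature dimension
starts = window_starts(len(features))
windows = [features[s:s + 500] for s in starts]
print(len(windows), windows[0].shape)  # 15 windows of shape (500, 60)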
Example #3
    def synth(self,
              hparams: ExtendedHParams,
              ids_input: Union[str, List[str], Tuple[str, ...], os.PathLike],
              post_processing_mapping: Dict[str, str] = None,
              plotter_configs: List[DataPlotter.Config] = None):

        if post_processing_mapping is None:
            post_processing_mapping = {
                "pred_acoustic_features": "cmp_features",
                "acoustic_features": "cmp_features"
            }

        if plotter_configs is None:
            plotter_configs = AcousticModelTrainer._get_legacy_plotter_configs(
                hparams)

        if not hparams.has_value("synth_feature_names"):
            hparams = copy.deepcopy(hparams)
            hparams.add_hparam("synth_feature_names",
                               ["pred_acoustic_features"])

        return super().synth(hparams=hparams,
                             ids_input=ids_input,
                             post_processing_mapping=post_processing_mapping,
                             plotter_configs=plotter_configs)
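The override above only fills in defaults before delegating to the parent synth. The pattern worth noting is deep-copying hparams before adding a value, so the caller's object stays untouched. A minimal sketch of that pattern with a plain dict standing in for ExtendedHParams:

import copy


def with_default(hparams: dict, key: str, default):
    # Copy before mutating so the caller's container is left unchanged.
    if hparams.get(key) is None:
        hparams = copy.deepcopy(hparams)
        hparams[key] = default
    return hparams


original = {"model_name": "acoustic"}
patched = with_default(original, "synth_feature_names",
                       ["pred_acoustic_features"])
print("synth_feature_names" in original)  # False, original is unchanged
print(patched["synth_feature_names"])     # ['pred_acoustic_features']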
Example #4
    def init(self,
             hparams: ExtendedHParams,
             data_reader_configs: List[DataReader.Config] = None,
             model_config=None,
             loss_configs: List[NamedLoss.Config] = None) -> None:

        if model_config is None and hparams.has_value("model_type"):
            model_config = NamedForwardWrapper.Config(
                wrapped_model_config=rnn_dyn.convert_legacy_to_config(
                    (hparams.num_questions, ), hparams),
                input_names="questions",
                batch_first=hparams.batch_first,
                name="AcousticModel",
                output_names="pred_acoustic_features")

        if loss_configs is None:
            loss_configs = [
                NamedLoss.Config(name="MSELoss_acoustic_features",
                                 type_="MSELoss",
                                 seq_mask="acoustic_features_mask",
                                 input_names=[
                                     "acoustic_features",
                                     "pred_acoustic_features"
                                 ],
                                 batch_first=hparams.batch_first)
            ]

        super().init(data_reader_configs=data_reader_configs,
                     hparams=hparams,
                     model_config=model_config,
                     loss_configs=loss_configs)
        self.logger.info("AcousticModelTrainer ready.")
    def process_dataloader(self,
                           dataloader: DataLoader,
                           hparams: ExtendedHParams,
                           total_epoch: int,
                           total_steps: int,
                           current_epoch: int = None,
                           training: bool = True):
        if hparams.use_gpu:
            assert (hparams.num_gpus <= torch.cuda.device_count()), \
                "Specified number of GPUs is incorrect."

        try:
            from torch.utils.tensorboard import SummaryWriter

            if hparams.has_value("tensorboard_dir"):
                tensorboard_dir = hparams.tensorboard_dir
            else:
                tensorboard_dir = os.path.join(hparams.out_dir,
                                               hparams.model_name,
                                               "tensorboard")
            tb_writer = SummaryWriter(log_dir=tensorboard_dir)
        except ImportError:
            tb_writer = None

        model = self.model
        if training:
            model.train()
            msg = "{}: Train with {} on ".format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                self.optimiser)
            if hparams.use_gpu:
                msg += str(torch.cuda.device_count()) + " GPU(s)."
            else:
                msg += "1 CPU."
            self.logger.info(msg)
        else:
            if self.ema is not None:
                self.logger.info("Using averaged model for validation.")
                model = self.ema.model
            model.eval()
            self.logger.info("{}: Compute loss of validation set.".format(
                datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

        if hparams.log_memory_consumption:
            self.logger.info('CPU: {:.0f} MB, GPU: {} MB'.format(
                resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e3,
                str(get_gpu_memory_map()) if hparams.use_gpu else "-"))

        # Multi-GPU support.
        if hparams.num_gpus > 1:
            model = DataParallel(model, dim=0 if hparams.batch_first else 1)
            # Make the init_hidden method directly accessible.
            model.init_hidden = model.module.init_hidden

        # Log the loss roughly hparams.logging_batch_index_perc times per epoch.
        logging_batch_index = (len(dataloader)
                               // hparams.logging_batch_index_perc) + 1

        total_losses = dict()

        # for params in reversed(list(self.model.parameters())):
        #         params.retain_grad()

        for batch_index, batch in enumerate(dataloader):

            if hparams.use_gpu:
                batch = self._batch_to_gpu(batch, hparams.dataset_load_async)

            data_dict, lengths = batch
            batch_size = len(next(iter(lengths.values())))
            model.init_hidden(batch_size)

            # Compute the maximum lengths up front because DataParallel splits
            # seq_lengths_input and pads each subset to its own maximum length;
            # combining the multi-GPU outputs would then fail with a size mismatch.
            # https://pytorch.org/docs/stable/notes/faq.html#pack-rnn-unpack-with-data-parallelism
            max_lengths = dict()
            for key in data_dict.keys():
                if key in lengths:
                    l_max = max(lengths[key])
                    if hparams.use_gpu and hparams.num_gpus > 1:
                        l_max = l_max.repeat(hparams.num_gpus)
                    max_lengths[key] = l_max

            if training:
                model(data_dict, lengths, max_lengths)
            else:
                with torch.no_grad():
                    model(data_dict, lengths, max_lengths)

            losses = {}
            for loss_fn in self.losses:
                loss_ = loss_fn(data_dict, lengths, total_steps)
                for loss_name, l in loss_.items():
                    if torch.isnan(l):
                        raise ValueError("Found NaN in {} loss.".format(loss_name))
                    if not hparams.replace_inf_grads_by_zero and torch.isinf(l):
                        raise ValueError("Found +/-Inf in {} loss.".format(loss_name))
                    if loss_name in losses:
                        raise KeyError("Loss with name {} defined twice.".format(loss_name))
                    losses[loss_name] = l
            backprop_loss = self.get_summed_losses_subset(
                loss_names=hparams.backprop_loss_names, losses=losses)
            if hparams.backprop_loss_names is None \
                    and hparams.scheduler_loss_names is None:
                scheduler_loss = backprop_loss.detach()
            else:
                scheduler_loss = self.get_summed_losses_subset(
                    loss_names=hparams.scheduler_loss_names, losses=losses).detach()

            if training:
                self.optimiser.zero_grad()
                backprop_loss.backward(retain_graph=hparams.backward_retain_graph)
                total_steps += 1

                # for params in reversed(list(self.model.parameters())):
                #     nan_or_inf |= torch.isnan(params.grad).any()
                #     nan_or_inf |= (params.grad == float("inf")).any()
                #     nan_or_inf |= (params.grad == -float("inf")).any()
                #     if nan_or_inf:
                #         raise ValueError("Found NaN/Inf in {}.".format(params))
                #         pdb.set_trace()

                if hparams.replace_inf_grads_by_zero:
                    self._replace_inf_grads_by_zero()

                if hparams.grad_clip_norm_type is not None:
                    # Rescaling the whole gradient vector adds only a small bias.
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   hparams.grad_clip_max_norm,
                                                   hparams.grad_clip_norm_type)
                if hparams.grad_clip_thresh is not None:
                    # Clamping each gradient element adds a larger bias.
                    torch.nn.utils.clip_grad_value_(self.model.parameters(),
                                                    hparams.grad_clip_thresh)

                self.optimiser.step()

                # Update exponential moving average.
                if self.ema:
                    self.ema.update_params(model)

                current_iter = self._get_current_iteration(
                    batch_index=batch_index, current_epoch=current_epoch,
                    dataloader_length=len(dataloader), hparams=hparams,
                    total_epoch=total_epoch)
                self.run_scheduler(hparams=hparams, loss=scheduler_loss,
                                   current_iter=current_iter)

            # Logging current error.
            if batch_index % logging_batch_index == 0:
                log_message = "Train " if training else "Test "
                log_message += "mini batch [{:{front_pad}d}/{}]".format(
                    batch_index + 1, len(dataloader),
                    front_pad=len(str(len(dataloader))))
                log_message += "\tLoss: "
                log_message += " ".join(["{}: {:.3f}".format(key, loss) for
                                         key, loss in losses.items()])
                if hparams.log_memory_consumption:
                    log_message += "\tCPU: {:.0f} MB, ".format(
                        resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1e3)
                    if hparams.use_gpu:
                        log_message += "GPU: {} MB".format(
                            str(get_gpu_memory_map()))

                self.logger.info(log_message)

            losses = {k: l.detach() for k, l in losses.items()}
            for key, loss in losses.items():
                if key not in total_losses:
                    total_losses[key] = loss
                else:
                    total_losses[key] += loss

            if tb_writer is not None:
                tb_writer.add_scalars("Train loss", losses, total_steps)

            del data_dict, lengths, max_lengths, losses, backprop_loss, scheduler_loss

        total_losses = {key: value / len(dataloader) for key, value in total_losses.items()}

        if not training:
            if tb_writer is not None:
                tb_writer.add_scalars("Validation loss", total_losses, total_steps)

            self.logger.info(
                'Validation set: Total loss: {}\nAverage loss:\n\t{}\n'.format(
                    sum(total_losses.values()),
                    "\n\t".join(["{}: {:.3f}".format(key, loss)
                                 for key, loss in total_losses.items()])))

            fn_log_per_test = getattr(self.model, "log_per_test", None)
            if callable(fn_log_per_test):
                fn_log_per_test()

        np_total_losses = {key: loss.cpu().numpy() for key, loss in total_losses.items()}
        del total_losses

        return np_total_losses
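The training step above optionally clips gradients by norm or by value before the optimiser step. Below is an isolated, runnable sketch of those two calls on a toy model; the clipping thresholds are illustrative, not values taken from the hparams defaults.

import torch

model = torch.nn.Linear(10, 2)
optimiser = torch.optim.Adam(model.parameters(), lr=1e-3)

loss = model(torch.randn(8, 10)).pow(2).mean()
optimiser.zero_grad()
loss.backward()

# Rescale the whole gradient vector if its norm exceeds max_norm ...
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2)
# ... or clamp every gradient element to [-clip_value, clip_value].
torch.nn.utils.clip_grad_value_(model.parameters(), clip_value=0.5)

optimiser.step()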
    def load_checkpoint(self, hparams: ExtendedHParams,
                        model_path: Union[str, os.PathLike], epoch: int = None,
                        ignore_layers: bool = True, load_optimiser: bool = True,
                        load_scheduler: bool = True, step: int = None,
                        verbose: bool = True, load_best_model: bool = False):
        """
        Load a trainer and model from a checkpoint.

        :param hparams: Hyper-parameter container
        :type hparams: ExtendedHParams
        :param model_path: Path to folder with save files of the checkpoint (config.json, params_*, trainer_*)
        :type model_path: String or Path
        :param epoch: Epoch of the checkpoint to load, use -1 to load best model, defaults to None
        :type epoch: int, optional
        :param ignore_layers: Whether to ignore layers specified in hparams, defaults to True
        :type ignore_layers: bool, optional
        :param load_optimiser: Whether to load the optimiser state, defaults to True
        :type load_optimiser: bool, optional
        :param load_scheduler: Whether to load the scheduler state, defaults to True
        :type load_scheduler: bool, optional
        :param step: Step of the checkpoint to load, use -1 to load best model, defaults to None
        :type step: int, optional
        :param verbose: Additional logging of checkpoint creation time, defaults to True
        :type verbose: bool, optional
        :param load_best_model: If true, epoch and step are ignored and the best model is loaded, defaults to False
        :type load_best_model: bool, optional
        :return: (best_loss, epoch, step) tuple of loaded checkpoint
        :rtype: Tuple[float, int, int]
        """
        assert load_best_model or step is None or epoch is None, \
            "Only epoch ({}) OR step ({}) can be not None".format(epoch, step)

        if load_best_model or epoch == -1 or step == -1:
            suffix = "_best"
        elif hparams.load_newest_checkpoint:
            assert step is None and epoch is None, \
                "epoch ({}) and step ({}) need to be None when loading newest "\
                "model.".format(epoch, step)

            file_list = glob.glob(os.path.join(model_path, "params_*"))
            if len(file_list) == 0:
                raise FileNotFoundError("No newest checkpoint found in {}."
                                        .format(model_path))
            elif len(file_list) == 1:
                latest_params = file_list[0]
            else:
                file_list = [f for f in file_list if os.path.basename(f) not in
                             ["params_e0", "params_s0"]]  # Ignore initial state
                latest_params = max(file_list, key=os.path.getctime)
            suffix = "_" + os.path.basename(latest_params).split('_')[1]
        else:
            assert load_best_model or step is not None or epoch is not None, \
                "Either step or epoch is required. Use -1 in one of them to " \
                "load the best model."
            if step is not None:
                suffix = "_s{}".format(step)
            else:
                suffix = "_e{}".format(epoch)
        params_path = os.path.join(model_path, "params" + suffix)

        if verbose:
            mod_time = local_modification_time(params_path)
            message = "Load model state dict from {} (last modified {})".format(
                params_path, mod_time)
            if ignore_layers and hparams.ignore_layers is not None \
                    and len(hparams.ignore_layers) > 0:
                message += " ignoring {}".format(hparams.ignore_layers)
            self.logger.info(message)

        checkpoint = torch.load(params_path, map_location=lambda storage,
                                loc: storage)
        try:
            params = checkpoint["params"]
        except KeyError:
            params = checkpoint["model_state_dict"]  # Legacy support

        best_loss = np.inf
        epoch = checkpoint["epoch"]
        step = checkpoint["step"] if "step" in checkpoint else None
        self.logger.info("Load {}{}".format(
            "epoch {}, ".format(epoch) if epoch is not None else "",
            "step {}".format(step) if step is not None else ""))

        if self.model is None:
            with open(os.path.join(model_path, "config.json"), "r") as f:
                json_str = f.read()
            config_json = jsonpickle.decode(json_str)
            self.model = config_json.create_model()

        if hparams.has_value("layer_map") and len(hparams.layer_map) > 0:
            params = self._map_layer_names(params, hparams.layer_map, verbose)

        if ignore_layers:
            params = self._remove_ignored_layers(params, self.model, hparams)
        missing_keys, unexpected_keys = self.model.load_state_dict(
            params, strict=not hparams.allow_missing_layers)
        if verbose:
            if len(missing_keys) > 0:
                self.logger.warning("Did not load: {}".format(
                    ", ".join(missing_keys)))
            if len(unexpected_keys) > 0:
                self.logger.warning("Found unexpected keys: {}".format(
                    ", ".join(unexpected_keys)))

        if load_optimiser:
            opt_params_path = os.path.join(model_path, "optimiser" + suffix)
            checkpoint = torch.load(opt_params_path, map_location=lambda storage,
                                    loc: storage)
            if "best_loss" in checkpoint and (not ignore_layers
                                              or hparams.ignore_layers is None
                                              or len(hparams.ignore_layers) == 0):
                best_loss = checkpoint["best_loss"]
            opt_params = checkpoint["params"]
            # if opt_params is not None:
            self._load_optimiser(opt_params, hparams)

            if load_scheduler:
                scheduler_params_path = os.path.join(model_path,
                                                     "scheduler" + suffix)
                if os.path.isfile(scheduler_params_path):
                    checkpoint = torch.load(scheduler_params_path,
                                            map_location=lambda storage,
                                            loc: storage)
                    scheduler_params = checkpoint["params"]
                    self._load_scheduler(
                        scheduler_params,
                        epoch if epoch is not None else checkpoint['epoch'],
                        step if step is not None else checkpoint['step'],
                        hparams)

        if hparams.use_gpu:
            if hasattr(self.model, "set_gpu_flag") \
                    and callable(self.model.set_gpu_flag):
                self.model.set_gpu_flag(hparams.use_gpu)
            self.model = self.model.cuda()

            if self.optimiser is not None:
                self._optimiser_to_gpu()

        return best_loss, epoch, step
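The newest-checkpoint branch above globs the params_* files, drops the initial state and picks the most recently created file to derive the suffix. A self-contained sketch of that selection logic using dummy files in a temporary directory:

import glob
import os
import tempfile
import time

model_path = tempfile.mkdtemp()
for name in ("params_e0", "params_e5", "params_e10"):
    open(os.path.join(model_path, name), "w").close()
    time.sleep(0.01)  # Ensure distinct creation times.

file_list = glob.glob(os.path.join(model_path, "params_*"))
# Ignore the initial state, then pick the newest remaining checkpoint.
file_list = [f for f in file_list if os.path.basename(f) not in
             ("params_e0", "params_s0")]
latest_params = max(file_list, key=os.path.getctime)
suffix = "_" + os.path.basename(latest_params).split("_")[1]
print(latest_params, suffix)  # .../params_e10 _e10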