def __init__(self, model, args):
        """ Initialize an optimizer over model.parameters()
        """
        # check validity of model
        if not hasattr(model, "parameters"):
            nii_warn.f_print("model is not torch.nn", "error")
            nii_warn.f_die("Error in creating OptimizerWrapper")

        # set optimizer type
        self.op_flag = args.optimizer
        self.lr = args.lr

        # create optimizer
        if self.op_flag == "Adam":
            self.optimizer = torch_optim.Adam(model.parameters(), lr=self.lr)
        elif self.op_flag == "RMSprop":
            self.optimizer = torch_optim.RMSprop(model.parameters(),
                                                 lr=self.lr)
        else:
            nii_warn.f_print("%s not availabel" % (self.op_flag), "error")
            nii_warn.f_die("Please change optimizer")

        # number of epochs
        self.epochs = args.epochs
        self.no_best_epochs = args.no_best_epochs

        return
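
# A minimal usage sketch (not from the original toolkit): the wrapper above only
# needs an object exposing .optimizer, .lr, .epochs and .no_best_epochs, so a
# plain argparse.Namespace is enough for illustration.
import argparse
import torch
import torch.optim as torch_optim

toy_model = torch.nn.Linear(10, 2)
toy_args = argparse.Namespace(optimizer="Adam", lr=1e-3,
                              epochs=50, no_best_epochs=5)
# what the "Adam" branch above boils down to:
toy_optimizer = torch_optim.Adam(toy_model.parameters(), lr=toy_args.lr)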
Example #2
def f_model_show(pt_model):
    """ 
    f_model_show(pt_model)
    Args: pt_model, a Pytorch model
    
    Print the information of the model
    """
    print(pt_model)
    num = sum(p.numel() for p in pt_model.parameters() if p.requires_grad)
    nii_display.f_print("Parameter number: {:d}".format(num), "normal")
    return
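
# A quick sketch of the same parameter count on a toy model (plain PyTorch,
# not part of the toolkit):
import torch.nn as nn
toy = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
num = sum(p.numel() for p in toy.parameters() if p.requires_grad)
print("Parameter number: {:d}".format(num))   # (4*8 + 8) + (8*2 + 2) = 58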
Example #3
    def f_parse(self):
        """ f_parse
        parse the configuration file
        """
        if self.m_config_path is not None:
            tmp_config = configparser.ConfigParser()
            tmp_config.read(self.m_config_path)
            return tmp_config
        else:
            nii_display.f_print("No config file provided", 'error')
            return None

    def _get_loss_for_learning_stopping(self, epoch_idx):
        # compute the average loss values
        if epoch_idx > self.cur_epoch:
            nii_display.f_print("To find loss for future epochs", 'error')
            nii_display.f_die("Op_process_monitor: error")
        if epoch_idx < 0:
            nii_display.f_print("To find loss for NULL epoch", 'error')
            nii_display.f_die("Op_process_monitor: error")
        loss_this = np.sum(self.loss_mat[epoch_idx, :, :], axis=0)
        # compute only part of the loss for early stopping when necessary
        loss_this = np.sum(loss_this * self.loss_flag)
        return loss_this
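
# Sketch of the reduction above using plain numpy (shapes are assumptions):
# loss_mat is [num_epoch, num_seq, num_loss] and loss_flag marks which loss
# components count towards early stopping.
import numpy as np
loss_mat = np.ones([3, 4, 2])            # 3 epochs, 4 sequences, 2 loss terms
loss_flag = np.array([1.0, 0.0])         # only the first loss term is used
loss_this = np.sum(loss_mat[0, :, :], axis=0)    # per-loss sum over sequences
print(np.sum(loss_this * loss_flag))             # -> 4.0
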
def f_inference_wrapper(args, pt_model, device, \
                        test_dataset_wrapper, checkpoint):
    """
    """
    test_data_loader = test_dataset_wrapper.get_loader()
    test_seq_num = test_dataset_wrapper.get_seq_num()
    test_dataset_wrapper.print_info()

    # print the network
    pt_model.to(device, dtype=nii_dconf.d_dtype)
    print(pt_model)

    cp_names = CheckPointKey()
    if type(checkpoint) is dict and cp_names.state_dict in checkpoint:
        pt_model.load_state_dict(checkpoint[cp_names.state_dict])
    else:
        pt_model.load_state_dict(checkpoint)

    pt_model.eval()
    with torch.no_grad():
        for _, (data_in, data_tar, data_info, idx_orig) in \
            enumerate(test_data_loader):

            # send data to device
            data_in = data_in.to(device)
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)

            # compute output
            start_time = time.time()
            if args.model_forward_with_target:
                # if model.forward requires (input, target) as arguments
                # for example, for auto-encoder
                data_gen = pt_model(data_in, data_tar)
            else:
                data_gen = pt_model(data_in)
            data_gen = pt_model.denormalize_output(data_gen)
            time_cost = time.time() - start_time
            # average time for each sequence when batchsize > 1
            time_cost = time_cost / len(data_info)

            # save output (in case batchsize > 1, )
            data_gen_np = data_gen.to("cpu").numpy()
            for idx, seq_info in enumerate(data_info):
                _ = nii_op_display_tk.print_gen_info(seq_info, time_cost)
                test_dataset_wrapper.putitem(data_gen_np[idx:idx+1],\
                                             args.output_dir, \
                                             seq_info)
    #
    nii_display.f_print("Generated data to %s" % (args.output_dir))
    # done
    return
    def load_state_dic(self, state_dic):
        """ resume training, load the information
        """
        try:
            if self.seq_num != state_dic['seq_num']:
                nii_display.f_print("Number of samples are different \
                from previous training", 'error')
                nii_display.f_print("Please make sure that you are \
                using the same training/development sets as before.", "error")
                nii_display.f_print("Or\nPlease add --")
                nii_display.f_print("ignore_training_history_in_trained_model")
                nii_display.f_die(" to avoid loading training history")

            if self.epoch_num == state_dic['epoch_num']:
                self.loss_mat = state_dic['loss_mat']
                self.time_mat = state_dic['time_mat']
            else:
                # if training epoch is increased, resize the shape
                tmp_loss_mat = state_dic['loss_mat']
                self.loss_mat = np.resize(
                    self.loss_mat, 
                    [self.epoch_num, self.seq_num, tmp_loss_mat.shape[2]])
                self.loss_mat[0:tmp_loss_mat.shape[0]] = tmp_loss_mat
                self.time_mat[0:tmp_loss_mat.shape[0]] = state_dic['time_mat']

            self.seq_num = state_dic['seq_num']
            # since the saved cur_epoch has been finished
            self.cur_epoch = state_dic['cur_epoch'] + 1
            self.best_error = state_dic['best_error']
            self.best_epoch = state_dic['best_epoch']
            self.loss_flag = state_dic['loss_flag']
            self.seq_names = {}
        except KeyError:
            nii_display.f_die("Invalid op_process_monitor state_dic")
Example #7
    def f_check_file_list(self):
        """ f_check_file_list():
            Check the file list after initialization.
            Make sure that every file in file_list appears in every 
            input/output feature directory. 
            If not, build a file_list in which every file is available
            in every input/output directory
        """
        if not isinstance(self.m_file_list, list):
            nii_warn.f_print("Read file list from directories")
            self.m_file_list = None

        # get an initial file list if none was provided
        if self.m_file_list is None:
            self.m_file_list = nii_list_tools.listdir_with_ext(
                self.m_input_dirs[0], self.m_input_exts[0])

        # check the list of files exist in all input/output directories
        for tmp_d, tmp_e in zip(self.m_input_dirs[1:], \
                                self.m_input_exts[1:]):
            tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e)
            self.m_file_list = nii_list_tools.common_members(
                tmp_list, self.m_file_list)

        if len(self.m_file_list) < 1:
            nii_warn.f_print("No input features after scannning", 'error')
            nii_warn.f_print("Please check input config", 'error')
            nii_warn.f_print("Please check feature directory", 'error')

        # check output files if necessary
        if self.m_output_dirs:
            for tmp_d, tmp_e in zip(self.m_output_dirs, \
                                    self.m_output_exts):
                tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e)
                self.m_file_list = nii_list_tools.common_members(
                    tmp_list, self.m_file_list)

            if len(self.m_file_list) < 1:
                nii_warn.f_print("No output data found", 'error')
                nii_warn.f_die("Please check outpupt config")
        else:
            #nii_warn.f_print("Not loading output features")
            pass
        
        # done
        return
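
# Hypothetical sketch of the intersection step above: common_members is
# assumed to behave like a set intersection, keeping only names present in
# every feature directory.
list_a = ["utt1", "utt2", "utt3"]        # names found for feature 1
list_b = ["utt2", "utt3", "utt4"]        # names found for feature 2
common = sorted(set(list_a) & set(list_b))
print(common)                            # -> ['utt2', 'utt3']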
Example #8
    def f_putitem(self, output_data, save_dir, data_infor_str):
        """ 
        """
        # Change the dimension to (length, dim)
        if output_data.ndim == 3 and output_data.shape[0] == 1:
            # When input data is (batchsize=1, length, dim)
            output_data = output_data[0]
        elif output_data.ndim == 2 and output_data.shape[0] == 1:
            # When input data is (batchsize=1, length)
            output_data = np.expand_dims(output_data[0], -1)
        else:
            nii_warn.f_print("Output data format not supported.", "error")
            nii_warn.f_print("Format is not (batch, len, dim)", "error")
            nii_warn.f_die("Please use batch_size = 1 in generation")

        # Save output
        if output_data.shape[1] != self.m_output_all_dim:
            nii_warn.f_print("Output data dim != expected dim", "error")
            nii_warn.f_print("Output:%d" % (output_data.shape[1]), \
                             "error")
            nii_warn.f_print("Expected:%d" % (self.m_output_all_dim), \
                             "error")
            nii_warn.f_die("Please check configuration")
        
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir)
            except OSError:
                nii_warn.f_die("Cannot carete {}".format(save_dir))

        # read the sentence information
        tmp_seq_info = nii_seqinfo.SeqInfo()
        tmp_seq_info.parse_from_str(data_infor_str)

        # write the data
        file_name = tmp_seq_info.seq_tag()
        s_dim = 0
        e_dim = 0
        for t_ext, t_dim in zip(self.m_output_exts, self.m_output_dims):
            e_dim = s_dim + t_dim
            file_path = nii_str_tk.f_realpath(save_dir, file_name, t_ext)
            self.f_write_data(output_data[:, s_dim:e_dim], file_path)
            # move on to the slice of the next output feature
            s_dim = e_dim
        
        return
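
# Plain-numpy sketch of the per-feature slicing above (extensions and dims
# are made up): a (length, 5) matrix holding a 3-dim and a 2-dim feature is
# written out slice by slice.
import numpy as np
output_data = np.zeros([100, 5])
s_dim = 0
for t_ext, t_dim in zip([".mgc", ".f0"], [3, 2]):
    e_dim = s_dim + t_dim
    chunk = output_data[:, s_dim:e_dim]     # (100, 3), then (100, 2)
    s_dim = e_dim
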
def f_model_show(pt_model, do_model_def_check=True, model_type=None):
    """ f_model_show(pt_model, do_model_check=True)
    Print the informaiton of the model

    Args: 
      pt_model, a Pytorch model
      do_model_def_check, bool, whether check model definition (default True)
      model_type: str or None (default None), what type of network

    Return:
      None
    """
    if do_model_def_check:
        f_model_check(pt_model, model_type)

    nii_display.f_print("Model infor:")
    print(pt_model)
    num = sum(p.numel() for p in pt_model.parameters() if p.requires_grad)
    nii_display.f_print("Parameter number: {:d}\n".format(num), "normal")
    return
Example #10
    def load_state_dic(self, state_dic):
        """ resume training, load the information
        """
        try:
            if self.seq_num != state_dic['seq_num']:
                nii_display.f_print(
                    "Number of samples are different \
                from previous training", 'error')
                nii_display.f_die("Please make sure resumed training are \
                using the same training/development sets as before")

            self.loss_mat = state_dic['loss_mat']
            self.time_mat = state_dic['time_mat']
            self.epoch_num = state_dic['epoch_num']
            self.seq_num = state_dic['seq_num']
            # since the saved cur_epoch has been finished
            self.cur_epoch = state_dic['cur_epoch'] + 1
            self.best_error = state_dic['best_error']
            self.best_epoch = state_dic['best_epoch']
            self.seq_names = {}
        except KeyError:
            nii_display.f_die("Invalid op_process_monitor state_dic")
Example #11
    def f_log_data_len(self, file_name, t_len, t_reso):
        """ f_log_data_len(file_name, t_len, t_reso):
        Log down the length of the data file.

        When comparing the different input/output features for the same
        file_name, only keep the shortest length
        """
        # the length of the sequence at the finest temporal resolution
        # For example, acoustic-feature -> waveform 16kHz,
        # if acoustic-feature is one frame per 5ms,
        #  tmp_len = acoustic feature frame length * (5 * 16)
        # where t_reso = 5*16 is the up-sampling rate of acoustic feature
        tmp_len = t_len * t_reso
        
        # save length when have not read the file
        if file_name not in self.m_data_length:
            self.m_data_length[file_name] = tmp_len

        # check length
        if t_len == 1:
            # if this is an utterance-level feature, it has only 1 frame
            pass
        elif self.f_valid_len(self.m_data_length[file_name], tmp_len, \
                            nii_dconf.data_seq_min_length):
            # if the difference in length is small
            if self.m_data_length[file_name] > tmp_len:
                self.m_data_length[file_name] = tmp_len
        else:
            nii_warn.f_print("Sequence length mismatch:", 'error')
            self.f_check_specific_data(file_name)
            nii_warn.f_print("Please the above features", 'error')
            nii_warn.f_die("Possible invalid data %s" % (file_name))

        # adjust the length so that, when reso is used,
        # the sequence length will be N * reso
        tmp = self.m_data_length[file_name]
        self.m_data_length[file_name] = self.f_adjust_len(tmp)
        return
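
# Worked example of the resolution comment above: for a 16 kHz waveform and
# an acoustic feature at one frame per 5 ms, t_reso = 5 * 16 = 80, so 200
# frames correspond to 200 * 80 = 16000 waveform samples (1 second).
t_len, t_reso = 200, 5 * 16
print(t_len * t_reso)       # -> 16000
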
def listdir_with_ext(file_dir, file_ext=None):
    """ 
    file_list = listdir_with_ext(file_dir, file_ext=None)
    Return a list of file names with the specified extension

    Args:
        file_dir: a file directory
        file_ext: string, specify the extension, e.g., txt, bin
    Return:
        file_list: a list of file_names
    """
    try:
    
        if file_ext is None:
            file_list = [os.path.splitext(x)[0] for x in os.listdir(file_dir) \
                        if not x.startswith('.')]
        else:
            file_list = [os.path.splitext(x)[0] for x in os.listdir(file_dir) \
                         if not x.startswith('.') and x.endswith(file_ext)]
        return file_list
    except OSError:
        nii_warn.f_print("Cannot access %s" % (file_dir), "error")
        return []
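
# Minimal usage sketch for the function above (temporary directory and file
# name are made up):
import os
import tempfile
tmp_dir = tempfile.mkdtemp()
open(os.path.join(tmp_dir, "utt0001.wav"), "w").close()
print(listdir_with_ext(tmp_dir, "wav"))     # -> ['utt0001']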
Example #13
    def __init__(self, optimizer, args):        
        
        # learning rate decay
        self.lr_decay = args.lr_decay_factor

        # lr scheduler type 
        # please check arg_parse.py for the number ID
        self.lr_scheduler_type = args.lr_scheduler_type
        
        # patience for ReduceLROnPlateau
        self.lr_patience = 5

        # step size for stepLR
        self.lr_stepLR_size = 10

        if self.lr_decay > 0:
            if self.lr_scheduler_type == 1:
                # StepLR
                self.lr_scheduler = torch.optim.lr_scheduler.StepLR(
                    optimizer=optimizer, step_size=self.lr_stepLR_size, 
                    gamma=self.lr_decay)
            else:
                # by default, ReduceLROnPlateau
                if args.no_best_epochs < 0:
                    self.lr_patience = 5
                    nii_warn.f_print("--no-best-epochs is set to 5 ")
                    nii_warn.f_print("for learning rate decaying")
                        
                self.lr_scheduler = torch_optim_steplr.ReduceLROnPlateau(
                    optimizer=optimizer, factor=self.lr_decay, 
                    patience=self.lr_patience)

            self.flag = True
        else:
            self.lr_scheduler = None
            self.flag = False
        return
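
# Minimal sketch of the default (ReduceLROnPlateau) branch above, written
# against plain PyTorch outside the wrapper:
import torch
toy = torch.nn.Linear(8, 1)
opt = torch.optim.Adam(toy.parameters(), lr=1e-3)
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, factor=0.5, patience=5)
val_loss = 1.0
sched.step(val_loss)        # called once per epoch with the monitored loss
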
    def __init__(self, model, args):
        """ Initialize an optimizer over model.parameters()
        """
        # check validity of model
        if not hasattr(model, "parameters"):
            nii_warn.f_print("model is not torch.nn", "error")
            nii_warn.f_die("Error in creating OptimizerWrapper")

        # set optimizer type
        self.op_flag = args.optimizer
        self.lr = args.lr
        self.l2_penalty = args.l2_penalty

        # grad clip norm is directly added in nn_manager
        self.grad_clip_norm = args.grad_clip_norm

        # create optimizer
        if self.op_flag == "Adam":
            if self.l2_penalty > 0:
                self.optimizer = torch_optim.Adam(model.parameters(),
                                                  lr=self.lr,
                                                  weight_decay=self.l2_penalty)
            else:
                self.optimizer = torch_optim.Adam(model.parameters(),
                                                  lr=self.lr)

        else:
            nii_warn.f_print("%s not availabel" % (self.op_flag), "error")
            nii_warn.f_die("Please change optimizer")

        # number of epochs
        self.epochs = args.epochs
        self.no_best_epochs = args.no_best_epochs

        # lr scheduler
        self.lr_scheduler = nii_lr_scheduler.LRScheduler(self.optimizer, args)
        return
Example #15
    def f_init_data_len_stats(self, data_path):
        """
        flag = f_init_data_len_stats(self, data_path)
        Check whether data length has been stored in data_path.
        If yes, load data_path and return False
        Else, return True
        """
        self.m_seq_info = []
        self.m_data_length = {}
        self.m_data_total_length = 0
        
        flag = True
        if os.path.isfile(data_path):
            # load data length from pre-stored *.dic
            dic_seq_infos = nii_io_tk.read_dic(self.m_data_len_path)
            for dic_seq_info in dic_seq_infos:
                seq_info = nii_seqinfo.SeqInfo()
                seq_info.load_from_dic(dic_seq_info)
                self.m_seq_info.append(seq_info)
                seq_tag = seq_info.seq_tag()
                if seq_tag not in self.m_data_length:
                    self.m_data_length[seq_tag] = seq_info.seq_length()
                else:
                    self.m_data_length[seq_tag] += seq_info.seq_length()
            self.m_data_total_length = self.f_sum_data_length()
            
            # check whether *.dic contains files in filelist
            if nii_list_tools.list_identical(self.m_file_list,\
                                             self.m_data_length.keys()):
                nii_warn.f_print("Read sequence info: %s" % (data_path))
                flag = False
            else:
                self.m_seq_info = []
                self.m_data_length = {}
                self.m_data_total_length = 0

        return flag
Example #16
    def f_init_mean_std(self, ms_input_path, ms_output_path):
        """ f_init_mean_std
        Initialize mean and std vectors for input and output
        """
        self.m_input_mean = np.zeros([self.m_input_all_dim])
        self.m_input_std = np.ones([self.m_input_all_dim])
        self.m_output_mean = np.zeros([self.m_output_all_dim])
        self.m_output_std = np.ones([self.m_output_all_dim])
        
        flag = True
        if not self.m_save_ms:
            # assume mean/std will be in the network
            flag = False

        if os.path.isfile(ms_input_path) and \
           os.path.isfile(ms_output_path):
            # load mean and std if exists
            ms_input = self.f_load_data(ms_input_path, 1)
            ms_output = self.f_load_data(ms_output_path, 1)
            
            if ms_input.shape[0] != (self.m_input_all_dim * 2) or \
               ms_output.shape[0] != (self.m_output_all_dim * 2):
                if ms_input.shape[0] != (self.m_input_all_dim * 2):
                    nii_warn.f_print("%s incompatible" % (ms_input_path),
                                     'warning')
                if ms_output.shape[0] != (self.m_output_all_dim * 2):
                    nii_warn.f_print("%s incompatible" % (ms_output_path),
                                     'warning')
                nii_warn.f_print("mean/std will be recomputed", 'warning')
            else:
                self.m_input_mean = ms_input[0:self.m_input_all_dim]
                self.m_input_std = ms_input[self.m_input_all_dim:]
                
                self.m_output_mean = ms_output[0:self.m_output_all_dim]
                self.m_output_std = ms_output[self.m_output_all_dim:]
                nii_warn.f_print("Load mean/std from %s and %s" % \
                                 (ms_input_path, ms_output_path))
                flag = False
        return flag
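
# Sketch of the mean/std file layout assumed above: a single vector of length
# 2 * dim with the mean first and the std second (plain numpy).
import numpy as np
dim = 3
ms_input = np.concatenate([np.zeros(dim), np.ones(dim)])   # [mean, std]
input_mean = ms_input[0:dim]
input_std = ms_input[dim:]
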
def f_loss_check(loss_module, model_type=None):
    """ f_loss_check(pt_model)
    Check whether the loss module contains all the necessary keywords 
    
    Args: 
    ----
      loss_module, a class
      model_type, a str or None
    Return:
    -------
    """
    nii_display.f_print("Loss check")
    
    if model_type in nii_nn_manage_conf.loss_method_keywords_bags:
        keywords_bag = nii_nn_manage_conf.loss_method_keywords_bags[model_type]
    else:
        keywords_bag = nii_nn_manage_conf.loss_method_keywords_default

    for tmpkey in keywords_bag.keys():
        flag_mandatory, mes = keywords_bag[tmpkey]

        # mandatory keywords
        if flag_mandatory:
            if not hasattr(loss_module, tmpkey):
                nii_display.f_print("Please implement %s (%s)" % (tmpkey, mes))
                nii_display.f_die("[Error]: found no %s in Loss" % (tmpkey))
            else:
                # no need to print other information here
                pass #print("[OK]: %s found" % (tmpkey))
        else:
            if not hasattr(loss_module, tmpkey):
                # no need to print other information here
                pass #print("[OK]: %s is ignored, %s" % (tmpkey, mes))
            else:
                print("[OK]: use %s, %s" % (tmpkey, mes))
        # done
    nii_display.f_print("Loss check done\n")
    return
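
# Hypothetical sketch of the keyword bag consumed above (the real bags live
# in nii_nn_manage_conf): a dict mapping an attribute name to
# (mandatory_flag, message).
keywords_bag = {
    "compute": (True, "loss_wrapper.compute(generated, target)"),
    "loss": (False, "optional per-model loss"),
}
class DummyLoss:
    def compute(self, generated, target):
        return 0.0
for key, (mandatory, mes) in keywords_bag.items():
    found = hasattr(DummyLoss(), key)
    print(key, "found" if found else ("MISSING" if mandatory else "ignored"))
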
def f_model_check(pt_model, model_type=None):
    """ f_model_check(pt_model)
    Check whether the model contains all the necessary keywords 
    
    Args: 
    ----
      pt_model: a Pytorch model
      model_type: str or None, a flag indicating the type of network

    Return:
    -------
    """
    nii_display.f_print("Model check:")
    if model_type in nii_nn_manage_conf.nn_model_keywords_bags:
        keywords_bag = nii_nn_manage_conf.nn_model_keywords_bags[model_type]
    else:
        keywords_bag = nii_nn_manage_conf.nn_model_keywords_default
    
    for tmpkey in keywords_bag.keys():
        flag_mandatory, mes = keywords_bag[tmpkey]

        # mandatory keywords
        if flag_mandatory:
            if not hasattr(pt_model, tmpkey):
                nii_display.f_print("Please implement %s (%s)" % (tmpkey, mes))
                nii_display.f_die("[Error]: found no %s in Model" % (tmpkey))
            else:
                print("[OK]: %s found" % (tmpkey))
        else:
            if not hasattr(pt_model, tmpkey):
                print("[OK]: %s is ignored, %s" % (tmpkey, mes))
            else:
                print("[OK]: use %s, %s" % (tmpkey, mes))
        # done
    nii_display.f_print("Model check done\n")
    return
Example #19
def f_inference_wrapper(args, pt_model, device, \
                        test_dataset_wrapper, checkpoint):
    """ Wrapper for inference
    """

    # prepare dataloader
    test_data_loader = test_dataset_wrapper.get_loader()
    test_seq_num = test_dataset_wrapper.get_seq_num()
    test_dataset_wrapper.print_info()

    # cuda device
    if torch.cuda.device_count() > 1 and args.multi_gpu_data_parallel:
        nii_display.f_print("DataParallel for inference is not implemented",
                            'warning')
    nii_display.f_print("\nUse single GPU: %s\n" % \
                        (torch.cuda.get_device_name(device)))

    # print the network
    pt_model.to(device, dtype=nii_dconf.d_dtype)
    nii_nn_tools.f_model_show(pt_model)

    # load trained model parameters from checkpoint
    cp_names = nii_nn_manage_conf.CheckPointKey()
    if type(checkpoint) is dict and cp_names.state_dict in checkpoint:
        pt_model.load_state_dict(checkpoint[cp_names.state_dict])
    else:
        pt_model.load_state_dict(checkpoint)

    # start generation
    nii_display.f_print("Start inference (generation):", 'highlight')

    pt_model.eval()
    with torch.no_grad():
        for _, (data_in, data_tar, data_info, idx_orig) in \
            enumerate(test_data_loader):

            # send data to device and convert data type
            data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)

            # compute output
            start_time = time.time()

            # in case the model defines inference function explicitly
            if hasattr(pt_model, "inference"):
                infer_func = pt_model.inference
            else:
                infer_func = pt_model.forward

            if args.model_forward_with_target:
                # if model.forward requires (input, target) as arguments
                # for example, for auto-encoder
                if args.model_forward_with_file_name:
                    data_gen = infer_func(data_in, data_tar, data_info)
                else:
                    data_gen = infer_func(data_in, data_tar)
            else:
                if args.model_forward_with_file_name:
                    data_gen = infer_func(data_in, data_info)
                else:
                    data_gen = infer_func(data_in)

            time_cost = time.time() - start_time
            # average time for each sequence when batchsize > 1
            time_cost = time_cost / len(data_info)

            if data_gen is None:
                nii_display.f_print("No output saved: %s" % (str(data_info)),\
                                    'warning')
                for idx, seq_info in enumerate(data_info):
                    _ = nii_op_display_tk.print_gen_info(seq_info, time_cost)
                continue
            else:
                try:
                    data_gen = pt_model.denormalize_output(data_gen)
                    data_gen_np = data_gen.to("cpu").numpy()
                except AttributeError:
                    mes = "Output data is not torch.tensor. Please check "
                    mes += "model.forward or model.inference"
                    nii_display.f_die(mes)

                # save output (in case batchsize > 1, )
                for idx, seq_info in enumerate(data_info):
                    _ = nii_op_display_tk.print_gen_info(seq_info, time_cost)
                    test_dataset_wrapper.putitem(data_gen_np[idx:idx+1],\
                                                 args.output_dir, \
                                                 seq_info)

        # done for
    # done with

    #
    nii_display.f_print("Generated data to %s" % (args.output_dir))

    # finish up if necessary
    if hasattr(pt_model, "finish_up_inference"):
        pt_model.finish_up_inference()

    # done
    return
Example #20
def f_train_wrapper(args, pt_model, loss_wrapper, device, \
                    optimizer_wrapper, \
                    train_dataset_wrapper, \
                    val_dataset_wrapper = None, \
                    checkpoint = None):
    """ 
    f_train_wrapper(args, pt_model, loss_wrapper, device, 
                    optimizer_wrapper,
                    train_dataset_wrapper, val_dataset_wrapper = None,
                    checkpoint = None):
      A wrapper to run the training process

    Args:
       args:         argument information given by argparse
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target) 
       device:       torch.device("cuda") or torch.device("cpu")

       optimizer_wrapper: 
           a wrapper over optimizer (defined in op_manager.py)
           optimizer_wrapper.optimizer is torch.optimizer
    
       train_dataset_wrapper: 
           a wrapper over training data set (data_io/default_data_io.py)
           train_dataset_wrapper.get_loader() returns torch.DataSetLoader
       
       val_dataset_wrapper: 
           a wrapper over validation data set (data_io/default_data_io.py)
           it can be None.
        
       checkpoint:
           a checkpoint that stores everything needed to resume training
    """

    nii_display.f_print_w_date("Start model training")

    ##############
    ## Preparation
    ##############

    # get the optimizer
    optimizer_wrapper.print_info()
    optimizer = optimizer_wrapper.optimizer
    lr_scheduler = optimizer_wrapper.lr_scheduler
    epoch_num = optimizer_wrapper.get_epoch_num()
    no_best_epoch_num = optimizer_wrapper.get_no_best_epoch_num()

    # get data loader for training set
    train_dataset_wrapper.print_info()
    train_data_loader = train_dataset_wrapper.get_loader()
    train_seq_num = train_dataset_wrapper.get_seq_num()

    # get the training process monitor
    monitor_trn = nii_monitor.Monitor(epoch_num, train_seq_num)

    # if validation data is provided, get data loader for val set
    if val_dataset_wrapper is not None:
        val_dataset_wrapper.print_info()
        val_data_loader = val_dataset_wrapper.get_loader()
        val_seq_num = val_dataset_wrapper.get_seq_num()
        monitor_val = nii_monitor.Monitor(epoch_num, val_seq_num)
    else:
        monitor_val = None

    # training log information
    train_log = ''

    # prepare for DataParallelism if available
    # pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    if torch.cuda.device_count() > 1 and args.multi_gpu_data_parallel:
        flag_multi_device = True
        nii_display.f_print("\nUse %d GPUs\n" % (torch.cuda.device_count()))
        # no way to call normtarget_f after pt_model is in DataParallel
        normtarget_f = pt_model.normalize_target
        pt_model = nn.DataParallel(pt_model)
    else:
        nii_display.f_print("\nUse single GPU: %s\n" % \
                            (torch.cuda.get_device_name(device)))
        flag_multi_device = False
        normtarget_f = None
    pt_model.to(device, dtype=nii_dconf.d_dtype)

    # print the network
    nii_nn_tools.f_model_show(pt_model)
    nii_nn_tools.f_loss_show(loss_wrapper)

    ###############################
    ## Resume training if necessary
    ###############################
    # resume training or initialize the model if necessary
    cp_names = nii_nn_manage_conf.CheckPointKey()
    if checkpoint is not None:
        if type(checkpoint) is dict:
            # checkpoint

            # load model parameter and optimizer state
            if cp_names.state_dict in checkpoint:
                # wrap the state_dic in f_state_dict_wrapper
                # in case the model is saved when DataParallel is on
                pt_model.load_state_dict(
                    nii_nn_tools.f_state_dict_wrapper(
                        checkpoint[cp_names.state_dict], flag_multi_device))

            # load optimizer state
            if cp_names.optimizer in checkpoint and \
               not args.ignore_optimizer_statistics_in_trained_model:
                optimizer.load_state_dict(checkpoint[cp_names.optimizer])

            # optionally, load training history
            if not args.ignore_training_history_in_trained_model:
                #nii_display.f_print("Load ")
                if cp_names.trnlog in checkpoint:
                    monitor_trn.load_state_dic(checkpoint[cp_names.trnlog])
                if cp_names.vallog in checkpoint and monitor_val:
                    monitor_val.load_state_dic(checkpoint[cp_names.vallog])
                if cp_names.info in checkpoint:
                    train_log = checkpoint[cp_names.info]
                if cp_names.lr_scheduler in checkpoint and \
                   checkpoint[cp_names.lr_scheduler] and lr_scheduler.f_valid():
                    lr_scheduler.f_load_state_dict(
                        checkpoint[cp_names.lr_scheduler])

                nii_display.f_print("Load check point, resume training")
            else:
                nii_display.f_print("Load pretrained model and optimizer")
        else:
            # only model status
            pt_model.load_state_dict(
                nii_nn_tools.f_state_dict_wrapper(checkpoint,
                                                  flag_multi_device))
            nii_display.f_print("Load pretrained model")

    ######################
    ### User defined setup
    ######################
    if hasattr(pt_model, "other_setups"):
        nii_display.f_print("Conduct User-defined setup")
        pt_model.other_setups()

    # This should be merged with other_setups
    if hasattr(pt_model, "g_pretrained_model_path") and \
       hasattr(pt_model, "g_pretrained_model_prefix"):
        nii_display.f_print("Load pret-rained models as part of this mode")
        nii_nn_tools.f_load_pretrained_model_partially(
            pt_model, pt_model.g_pretrained_model_path,
            pt_model.g_pretrained_model_prefix)

    ######################
    ### Start training
    ######################
    # other variables
    flag_early_stopped = False
    start_epoch = monitor_trn.get_epoch()
    epoch_num = monitor_trn.get_max_epoch()

    # print
    _ = nii_op_display_tk.print_log_head()
    nii_display.f_print_message(train_log, flush=True, end='')

    # loop over multiple epochs
    for epoch_idx in range(start_epoch, epoch_num):

        # training one epoch
        pt_model.train()
        # set validation flag if necessary
        if hasattr(pt_model, 'validation'):
            pt_model.validation = False
            mes = "Warning: model.validation is deprecated, "
            mes += "please use model.flag_validation"
            nii_display.f_print(mes, 'warning')
        if hasattr(pt_model, 'flag_validation'):
            pt_model.flag_validation = False

        f_run_one_epoch(args, pt_model, loss_wrapper, device, \
                        monitor_trn, train_data_loader, \
                        epoch_idx, optimizer, normtarget_f)
        time_trn = monitor_trn.get_time(epoch_idx)
        loss_trn = monitor_trn.get_loss(epoch_idx)

        # if necessary, do validation
        if val_dataset_wrapper is not None:
            # set eval() if necessary
            if args.eval_mode_for_validation:
                pt_model.eval()

            # set validation flag if necessary
            if hasattr(pt_model, 'validation'):
                pt_model.validation = True
                mes = "Warning: model.validation is deprecated, "
                mes += "please use model.flag_validation"
                nii_display.f_print(mes, 'warning')
            if hasattr(pt_model, 'flag_validation'):
                pt_model.flag_validation = True

            with torch.no_grad():
                f_run_one_epoch(args, pt_model, loss_wrapper, \
                                device, \
                                monitor_val, val_data_loader, \
                                epoch_idx, None, normtarget_f)
            time_val = monitor_val.get_time(epoch_idx)
            loss_val = monitor_val.get_loss(epoch_idx)

            # update lr rate scheduler if necessary
            if lr_scheduler.f_valid():
                lr_scheduler.f_step(loss_val)

        else:
            time_val, loss_val = 0, 0

        if val_dataset_wrapper is not None:
            flag_new_best = monitor_val.is_new_best()
        else:
            flag_new_best = True

        # print information
        train_log += nii_op_display_tk.print_train_info(
            epoch_idx, time_trn, loss_trn, time_val, loss_val, flag_new_best,
            optimizer_wrapper.get_lr_info())

        # save the best model
        if flag_new_best:
            tmp_best_name = nii_nn_tools.f_save_trained_name(args)
            torch.save(pt_model.state_dict(), tmp_best_name)

        # save intermediate model if necessary
        if not args.not_save_each_epoch:
            tmp_model_name = nii_nn_tools.f_save_epoch_name(args, epoch_idx)

            if monitor_val is not None:
                tmp_val_log = monitor_val.get_state_dic()
            else:
                tmp_val_log = None

            if lr_scheduler.f_valid():
                lr_scheduler_state = lr_scheduler.f_state_dict()
            else:
                lr_scheduler_state = None

            # save
            tmp_dic = {
                cp_names.state_dict: pt_model.state_dict(),
                cp_names.info: train_log,
                cp_names.optimizer: optimizer.state_dict(),
                cp_names.trnlog: monitor_trn.get_state_dic(),
                cp_names.vallog: tmp_val_log,
                cp_names.lr_scheduler: lr_scheduler_state
            }
            torch.save(tmp_dic, tmp_model_name)
            if args.verbose == 1:
                nii_display.f_eprint(str(datetime.datetime.now()))
                nii_display.f_eprint("Save {:s}".format(tmp_model_name),
                                     flush=True)

        # Early stopping
        #  note: if LR scheduler is used, early stopping will be
        #  disabled
        if lr_scheduler.f_allow_early_stopping() and \
           monitor_val is not None and \
           monitor_val.should_early_stop(no_best_epoch_num):
            flag_early_stopped = True
            break

    # loop done
    nii_op_display_tk.print_log_tail()
    if flag_early_stopped:
        nii_display.f_print("Training finished by early stopping")
    else:
        nii_display.f_print("Training finished")
    nii_display.f_print("Model is saved to", end='')
    nii_display.f_print("{}".format(nii_nn_tools.f_save_trained_name(args)))
    return
Example #21
def f_run_one_epoch(args,
                    pt_model, loss_wrapper, \
                    device, monitor,  \
                    data_loader, epoch_idx, optimizer = None, \
                    target_norm_method = None):
    """
    f_run_one_epoch: 
       run one epoch over the dataset (for the training or validation set)

    Args:
       args:         from argparse
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target) 
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_process_monitor.py
       data_loader:  pytorch DataLoader. 
       epoch_idx:    int, index of the current epoch
       optimizer:    torch optimizer or None
                     if None, back-propagation will be skipped
                     (for the development set)
       target_norm_method: method to normalize target data
                           (by default, use pt_model.normalize_target)
    """
    # timer
    start_time = time.time()

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        #############
        # prepare
        #############
        # idx_orig is the original idx in the dataset
        # which can be different from data_idx when shuffle = True
        #idx_orig = idx_orig.numpy()[0]
        #data_seq_info = data_info[0]

        # send data to device
        if optimizer is not None:
            optimizer.zero_grad()

        ############
        # compute output
        ############
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # special case when model.forward requires data_info
                data_gen = pt_model(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model(data_in)

        #####################
        # compute loss and do back propagate
        #####################

        # Two cases
        # 1. if loss is defined as pt_model.loss, then let the users do
        #    normalization inside the pt_mode.loss
        # 2. if loss_wrapper is defined as a class independent from model
        #    there is no way to normalize the data inside the loss_wrapper
        #    because the normalization weight is saved in pt_model

        if hasattr(pt_model, 'loss'):
            # case 1, pt_model.loss is available
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            else:
                data_tar = []

            loss_computed = pt_model.loss(data_gen, data_tar)
        else:
            # case 2, loss is defined independent of pt_model
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
                # there is no way to normalize the data inside loss
                # thus, do normalization here
                if target_norm_method is None:
                    normed_target = pt_model.normalize_target(data_tar)
                else:
                    normed_target = target_norm_method(data_tar)
            else:
                normed_target = []

            # return the loss from loss_wrapper
            # loss_computed may be [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
            #   which contain multiple loss and flags indicating whether
            #   the corresponding loss should be taken into consideration
            #   for early stopping
            # or
            # loss_computed may be simply a tensor loss
            loss_computed = loss_wrapper.compute(data_gen, normed_target)

        loss_values = [0]
        # To handle cases where there are multiple loss functions
        # when loss_computed is [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
        #   loss: sum of [loss_1, loss_2, ...], for backward()
        #   loss_values: [loss_1.item(), loss_2.item() ..], for logging
        #   loss_flags: [True/False, ...], for logging,
        #               whether loss_n is used for early stopping
        # when loss_computed is loss
        #   loss: loss
        #   loss_values: [loss.item()]
        #   loss_flags: [True]
        loss, loss_values, loss_flags = nii_nn_tools.f_process_loss(
            loss_computed)

        # Back-propagation using the summed loss
        if optimizer is not None:
            # backward propagation
            loss.backward()

            # apply gradient clip
            if args.grad_clip_norm > 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    pt_model.parameters(), args.grad_clip_norm)

            # update parameters
            optimizer.step()

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the batch; thus, just loss_value
            # rather than loss_value / batchsize
            monitor.log_loss(loss_values, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print info for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

        # Save intermediate model for every n mini-batches (optional).
        # Note that if we re-start training with this intermediate model,
        #  the data will start from the 1st sample, not the one where we stopped
        if args.save_model_every_n_minibatches > 0 \
           and (data_idx+1) % args.save_model_every_n_minibatches == 0 \
           and optimizer is not None and data_idx > 0:
            cp_names = nii_nn_manage_conf.CheckPointKey()
            tmp_model_name = nii_nn_tools.f_save_epoch_name(
                args, epoch_idx, '_{:05d}'.format(data_idx + 1))
            # save
            tmp_dic = {
                cp_names.state_dict: pt_model.state_dict(),
                cp_names.optimizer: optimizer.state_dict()
            }
            torch.save(tmp_dic, tmp_model_name)

    # loop done
    return
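
# Hedged sketch of what f_process_loss is described to return in the comments
# above (the real helper lives in nii_nn_tools; this is only an illustration):
import torch
def process_loss_sketch(loss_computed):
    if isinstance(loss_computed, torch.Tensor):
        return loss_computed, [loss_computed.item()], [True]
    losses, flags = loss_computed
    return sum(losses), [l.item() for l in losses], list(flags)

l1, l2 = torch.tensor(0.5), torch.tensor(1.5)
loss, vals, flags = process_loss_sketch([[l1, l2], [True, False]])
print(loss.item(), vals, flags)    # -> 2.0 [0.5, 1.5] [True, False]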
Example #22
def f_online_mean_std(data, mean_old, var_old, cnt_old):
    """ 
    mean, var, count = f_online_mean_std(data, mean, var, num_count)
    
    online algorithm to accumulate mean and var
    
    Args:
      data: input data as numpy.array, in shape [length, dimension]
    
      mean: mean to be updated, np.array [dimension]

      var: var to be updated, np.array [dimension]

      num_count: how many data rows have been processed before 
        this call.

    Return:
      mean: mean, np.array [dimension]
      var: var, np.array [dimension]
      count: accumulated data number, = num_count + data.shape[0]

    Ref. parallel algorithm                                                 
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance  
    """

    try:
        # how many time steps (number of rows) in this data
        cnt_this = data.shape[0]

        # if input data is empty, don't update
        if cnt_this == 0:
            return mean_old, var_old, cnt_old

        if data.ndim == 1:
            # single dimension data, 1d array
            mean_this = data.mean()
            var_this = data.var()
            dim = 1
        else:
            # multiple dimension data, 2d array
            mean_this = data.mean(axis=0)
            var_this = data.var(axis=0)
            dim = data.shape[1]

        # difference of accumulated mean and data mean
        diff_mean = mean_this - mean_old

        # new mean and var
        new_mean = np.zeros([dim], dtype=nii_dconf.h_dtype)
        new_var = np.zeros([dim], dtype=nii_dconf.h_dtype)

        # update count
        updated_count = cnt_old + cnt_this

        # update mean
        new_mean = mean_old + diff_mean * (float(cnt_this) /
                                           (cnt_old + cnt_this))
        # update var
        if cnt_old == 0:
            # if this is the first data
            if data.ndim == 1:
                # remember that var is array, not scalar
                new_var[0] = var_this
            else:
                new_var = var_this
        else:
            # not first data
            new_var = (
                var_old * (float(cnt_old) / updated_count) + var_this *
                (float(cnt_this) / updated_count) +
                (diff_mean * diff_mean /
                 (float(cnt_this) / cnt_old + float(cnt_old) / cnt_this + 2.0))
            )
        # done
        return new_mean, new_var, updated_count

    except ValueError:
        if data.ndim > 1:
            if data.shape[1] != mean_old.shape[0] or \
               data.shape[1] != var_old.shape[0]:
                nii_display.f_print("Dimension incompatible", "error")
                nii_display.f_die("Error in online mean var calculation")
        else:
            if mean_old.shape[0] != 1 or \
               var_old.shape[0] != 1:
                nii_display.f_print("Dimension incompatible", "error")
                nii_display.f_die("Error in online mean var calculation")
def main():
    """ main(): the default wrapper for training and inference process
    Please prepare config.py and model.py
    """
    # arguments initialization
    args = nii_arg_parse.f_args_parsed()

    #
    nii_warn.f_print_w_date("Start program", level='h')
    nii_warn.f_print("Load module: %s" % (args.module_config))
    nii_warn.f_print("Load module: %s" % (args.module_model))
    prj_conf = importlib.import_module(args.module_config)
    prj_model = importlib.import_module(args.module_model)

    # initialization
    nii_startup.set_random_seed(args.seed, args)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # prepare data io
    if not args.inference:
        params = {
            'batch_size': args.batch_size,
            'shuffle': args.shuffle,
            'num_workers': args.num_workers
        }

        # Load file list and create data loader
        trn_lst = nii_list_tool.read_list_from_text(prj_conf.trn_list)
        trn_set = nii_dset.NIIDataSetLoader(
            prj_conf.trn_set_name, \
            trn_lst,
            prj_conf.input_dirs, \
            prj_conf.input_exts, \
            prj_conf.input_dims, \
            prj_conf.input_reso, \
            prj_conf.input_norm, \
            prj_conf.output_dirs, \
            prj_conf.output_exts, \
            prj_conf.output_dims, \
            prj_conf.output_reso, \
            prj_conf.output_norm, \
            './',
            params = params,
            truncate_seq = prj_conf.truncate_seq,
            min_seq_len = prj_conf.minimum_len,
            save_mean_std = True,
            wav_samp_rate = prj_conf.wav_samp_rate)

        if prj_conf.val_list is not None:
            val_lst = nii_list_tool.read_list_from_text(prj_conf.val_list)
            val_set = nii_dset.NIIDataSetLoader(
                prj_conf.val_set_name,
                val_lst,
                prj_conf.input_dirs, \
                prj_conf.input_exts, \
                prj_conf.input_dims, \
                prj_conf.input_reso, \
                prj_conf.input_norm, \
                prj_conf.output_dirs, \
                prj_conf.output_exts, \
                prj_conf.output_dims, \
                prj_conf.output_reso, \
                prj_conf.output_norm, \
                './', \
                params = params,
                truncate_seq= prj_conf.truncate_seq,
                min_seq_len = prj_conf.minimum_len,
                save_mean_std = False,
                wav_samp_rate = prj_conf.wav_samp_rate)
        else:
            val_set = None

        # initialize the model and loss function
        model = prj_model.Model(trn_set.get_in_dim(), \
                                trn_set.get_out_dim(), \
                                args, trn_set.get_data_mean_std())
        loss_wrapper = prj_model.Loss(args)

        # initialize the optimizer
        optimizer_wrapper = nii_op_wrapper.OptimizerWrapper(model, args)

        # if necessary, resume training
        if args.trained_model == "":
            checkpoint = None
        else:
            checkpoint = torch.load(args.trained_model)

        # start training
        nii_nn_wrapper.f_train_wrapper(args, model, loss_wrapper, device,
                                       optimizer_wrapper, trn_set, val_set,
                                       checkpoint)
        # done for training

    else:

        # for inference

        # default, no truncating, no shuffling
        params = {
            'batch_size': args.batch_size,
            'shuffle': False,
            'num_workers': args.num_workers
        }

        if type(prj_conf.test_list) is list:
            t_lst = prj_conf.test_list
        else:
            t_lst = nii_list_tool.read_list_from_text(prj_conf.test_list)
        test_set = nii_dset.NIIDataSetLoader(
            prj_conf.test_set_name, \
            t_lst, \
            prj_conf.test_input_dirs,
            prj_conf.input_exts,
            prj_conf.input_dims,
            prj_conf.input_reso,
            prj_conf.input_norm,
            prj_conf.test_output_dirs,
            prj_conf.output_exts,
            prj_conf.output_dims,
            prj_conf.output_reso,
            prj_conf.output_norm,
            './',
            params = params,
            truncate_seq= None,
            min_seq_len = None,
            save_mean_std = False,
            wav_samp_rate = prj_conf.wav_samp_rate)

        # initialize model
        model = prj_model.Model(test_set.get_in_dim(), \
                                test_set.get_out_dim(), \
                                args)
        if args.trained_model == "":
            print("No model is loaded by ---trained-model for inference")
            print("By default, load %s%s" %
                  (args.save_trained_name, args.save_model_ext))
            checkpoint = torch.load(
                "%s%s" % (args.save_trained_name, args.save_model_ext))
        else:
            checkpoint = torch.load(args.trained_model)

        # do inference and output data
        nii_nn_wrapper.f_inference_wrapper(args, model, device, \
                                           test_set, checkpoint)
    # done
    return
Example #24
def f_run_one_epoch(args,
                    pt_model, loss_wrapper, \
                    device, monitor,  \
                    data_loader, epoch_idx, optimizer = None, \
                    target_norm_method = None):
    """
    f_run_one_epoch: 
       run one epoch over the dataset (for the training or validation set)

    Args:
       args:         from argparse
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target) 
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_process_monitor.py
       data_loader:  pytorch DataLoader. 
       epoch_idx:    int, index of the current epoch
       optimizer:    torch optimizer or None
                     if None, back-propagation will be skipped
                     (for the development set)
       target_norm_method: method to normalize target data
                           (by default, use pt_model.normalize_target)
    """
    # timer
    start_time = time.time()

    # loop over samples
    pbar = tqdm(data_loader)
    epoch_num = monitor.get_max_epoch()
    for data_idx, (data_in, data_tar, data_info, idx_orig) in enumerate(pbar):
        pbar.set_description("Epoch: {}/{}".format(epoch_idx, epoch_num))
        # idx_orig is the original idx in the dataset
        # which can be different from data_idx when shuffle = True
        #idx_orig = idx_orig.numpy()[0]
        #data_seq_info = data_info[0]

        # send data to device
        if optimizer is not None:
            optimizer.zero_grad()

        # compute
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # special case when model.forward requires data_info
                data_gen = pt_model(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model(data_in)

        # compute loss and do back propagate
        loss_vals = [0]
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            if target_norm_method is None:
                normed_target = pt_model.normalize_target(data_tar)
            else:
                normed_target = target_norm_method(data_tar)

            # return the loss from loss_wrapper
            # loss_computed may be [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
            #   which contain multiple loss and flags indicating whether
            #   the corresponding loss should be taken into consideration
            #   for early stopping
            # or
            # loss_computed may be simply a tensor loss
            loss_computed = loss_wrapper.compute(data_gen, normed_target)

            # To handle cases where there are multiple loss functions
            # (an illustrative sketch of this handling follows this function)
            # when loss_computed is [[loss_1, loss_2, ...],[flag_1, flag_2,..]]
            #   loss: sum of [loss_1, loss_2, ...], for backward()
            #   loss_vals: [loss_1.item(), loss_2.item() ..], for logging
            #   loss_flags: [True/False, ...], for logging,
            #               whether loss_n is used for early stopping
            # when loss_computed is a single tensor loss
            #   loss: loss
            #   loss_vals: [loss.item()]
            #   loss_flags: [True]
            loss, loss_vals, loss_flags = nii_nn_tools.f_process_loss(
                loss_computed)

            # Back-propagation using the summed loss
            if optimizer is not None:
                loss.backward()
                optimizer.step()

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the batch, thus, log loss_value
            # rather than loss_value / batchsize
            monitor.log_loss(loss_vals, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print info for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # loop done
    pbar.close()
    return
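# A minimal illustration (not the library implementation) of the two
# loss_computed formats that nii_nn_tools.f_process_loss is expected to
# handle, as described in the comments inside f_run_one_epoch above.
# The helper name below is hypothetical.
import torch

def _illustrate_process_loss(loss_computed):
    if isinstance(loss_computed, list):
        # case 1: [[loss_1, loss_2, ...], [flag_1, flag_2, ...]]
        losses, flags = loss_computed
        loss = sum(losses)                       # summed tensor for backward()
        loss_vals = [l.item() for l in losses]   # scalars for logging
        loss_flags = flags                       # which losses count for early stopping
    else:
        # case 2: a single tensor loss
        loss = loss_computed
        loss_vals = [loss_computed.item()]
        loss_flags = [True]
    return loss, loss_vals, loss_flags

# example: two losses, only the first is used for early stopping
_l1 = torch.tensor(0.5, requires_grad=True)
_l2 = torch.tensor(0.1, requires_grad=True)
print(_illustrate_process_loss([[_l1, _l2], [True, False]]))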
Example #25
    def __init__(self,
                 dataset_name, \
                 file_list, \
                 input_dirs, input_exts, input_dims, input_reso, \
                 input_norm, \
                 output_dirs, output_exts, output_dims, output_reso, \
                 output_norm, \
                 stats_path, \
                 data_format = '<f4', \
                 truncate_seq = None, \
                 min_seq_len = None, \
                 save_mean_std = True, \
                 wav_samp_rate = None):
        """
        Args:
            dataset_name: name of this data set
            file_list: a list of file name strings (without extension)
            input_dirs: a list of dirs from which each input feature is loaded
            input_exts: a list of input feature name extensions
            input_dims: a list of input feature dimensions
            input_reso: a list of input feature temporal resolutions
            input_norm: a list of bools, whether to normalize each input feature
            output_dirs: a list of dirs from which each output feature is loaded
            output_exts: a list of output feature name extensions
            output_dims: a list of output feature dimensions
            output_reso: a list of output feature temporal resolutions
            output_norm: a list of bools, whether to normalize each output feature
            stats_path: path to the directory that saves mean/std and
                        utterance length
            data_format: method to load the data
                    '<f4' (default): load data as float32, little-endian
                    'htk': load data as htk format
            truncate_seq: None or int, truncate sequences into chunks;
                          truncate_seq > 0 specifies the chunk length
            min_seq_len: None or int, minimum sequence length
            save_mean_std: bool, whether to save the computed mean/std
            wav_samp_rate: waveform sampling rate (needed when waveform
                           data is used as input or output)

        A usage sketch follows after this __init__.
        """
        # initialization
        self.m_set_name = dataset_name
        self.m_file_list = file_list
        self.m_input_dirs = input_dirs
        self.m_input_exts = input_exts
        self.m_input_dims = input_dims
        
        self.m_output_dirs = output_dirs
        self.m_output_exts = output_exts
        self.m_output_dims = output_dims

        if len(self.m_input_dirs) != len(self.m_input_exts) or \
           len(self.m_input_dirs) != len(self.m_input_dims):
            nii_warn.f_print("Input dirs, exts, dims, unequal length",
                             'error')
            nii_warn.f_print(str(self.m_input_dirs), 'error')
            nii_warn.f_print(str(self.m_input_exts), 'error')
            nii_warn.f_print(str(self.m_input_dims), 'error')
            nii_warn.f_die("Please check input dirs, exts, dims")

        if len(self.m_output_dims) != len(self.m_output_exts) or \
           (self.m_output_dirs and \
            len(self.m_output_dirs) != len(self.m_output_exts)):
            nii_warn.f_print("Output dirs, exts, dims, unequal length", \
                             'error')
            nii_warn.f_die("Please check output dirs, exts, dims")

        # fill in m_*_reso and m_*_norm
        def _tmp_f(list2, default_value, length):
            if list2 is None:
                return [default_value for x in range(length)]
            else:
                return list2
            
        self.m_input_reso = _tmp_f(input_reso, 1, len(input_dims))
        self.m_input_norm = _tmp_f(input_norm, True, len(input_dims))
        self.m_output_reso = _tmp_f(output_reso, 1, len(output_dims))
        self.m_output_norm = _tmp_f(output_norm, True, len(output_dims))
        if len(self.m_input_reso) != len(self.m_input_dims):
            nii_warn.f_die("Please check input_reso")
        if len(self.m_output_reso) != len(self.m_output_dims):
            nii_warn.f_die("Please check output_reso")
        if len(self.m_input_norm) != len(self.m_input_dims):
            nii_warn.f_die("Please check input_norm")
        if len(self.m_output_norm) != len(self.m_output_dims):
            nii_warn.f_die("Please check output_norm")
        
        # dimensions
        self.m_input_all_dim = sum(self.m_input_dims)
        self.m_output_all_dim = sum(self.m_output_dims)
        self.m_io_dim = self.m_input_all_dim + self.m_output_all_dim

        self.m_truncate_seq = truncate_seq
        self.m_min_seq_len = min_seq_len
        self.m_save_ms = save_mean_std

        # in case there is waveform data in input or output features 
        self.m_wav_sr = wav_samp_rate
            
        # sanity check on resolution configuration
        # currently, only input features can have different reso,
        # and the m_input_reso must be the same for all input features
        if any([x != self.m_input_reso[0] for x in self.m_input_reso]):
            nii_warn.f_print("input_reso: %s" % (str(self.m_input_reso)),\
                             'error')
            nii_warn.f_print("NIIDataSet not support", 'error', end='')
            nii_warn.f_die(" different input_reso")
        if any([x != 1 for x in self.m_output_reso]):
            nii_warn.f_print("NIIDataSet only supports", 'error', end='')
            nii_warn.f_die(" output_reso = [1, 1, ... 1]")
        self.m_single_reso = self.m_input_reso[0]
        
        # To make sure that target waveform length is exactly equal
        #  to the up-sampled sequence length
        # self.m_truncate_seq must be changed to be N * up_sample
        if self.m_truncate_seq is not None:
            # assume input resolution is the same
            self.m_truncate_seq = self.f_adjust_len(self.m_truncate_seq)

        # method to load/write raw data
        if data_format == '<f4':
            self.f_load_data = _data_reader
            self.f_length_data = _data_len_reader
            self.f_write_data = lambda x, y: _data_writer(x, y, \
                                                          self.m_wav_sr)
        else:
            nii_warn.f_print("Unsupported dtype %s" % (data_format))
            nii_warn.f_die("Only supports np.float32 <f4")
            
        # check the validity of data
        self.f_check_file_list()
        
        # log down statistics 
        #  1. length of each data utterance
        #  2. mean / std of each feature file
        def get_name(stats_path, set_name, file_name):
            tmp = set_name + '_' + file_name
            return os.path.join(stats_path, tmp)
        
        self.m_ms_input_path = get_name(stats_path, self.m_set_name, \
                                        nii_dconf.mean_std_i_file)
        self.m_ms_output_path = get_name(stats_path, self.m_set_name, \
                                         nii_dconf.mean_std_o_file)
        self.m_data_len_path = get_name(stats_path, self.m_set_name, \
                                        nii_dconf.data_len_file)
        
        # initialize data length and mean /std
        flag_cal_len = self.f_init_data_len_stats(self.m_data_len_path)
        flag_cal_mean_std = self.f_init_mean_std(self.m_ms_input_path,
                                                 self.m_ms_output_path)
            
        # if data information is not available, read it again from data
        if flag_cal_len or flag_cal_mean_std:
            self.f_calculate_stats(flag_cal_len, flag_cal_mean_std) 
            
        # check
        if self.__len__() < 1:
            nii_warn.f_print("Fail to load any data", "error")
            nii_warn.f_die("Please check configuration")
        # done
        return                
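# A hedged usage sketch for the __init__ above (assumed to belong to the
# NIIDataSet class named in the error messages). All paths, extensions,
# dimensions, and resolutions below are hypothetical examples: one 80-dim
# mel feature (one frame per 80 waveform samples) as input, raw waveform
# as output.
file_list = ['utt_0001', 'utt_0002']
train_set = NIIDataSet(
    'train_set', file_list,
    input_dirs=['data/mel'], input_exts=['.mel'], input_dims=[80],
    input_reso=[80], input_norm=[True],
    output_dirs=['data/wav'], output_exts=['.wav'], output_dims=[1],
    output_reso=[1], output_norm=[False],
    stats_path='data/stats',
    data_format='<f4',
    truncate_seq=16000, min_seq_len=1600,
    save_mean_std=True, wav_samp_rate=16000)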
def f_run_one_epoch_WGAN(
        args, pt_model_G, pt_model_D,
        loss_wrapper, \
        device, monitor,  \
        data_loader, epoch_idx,
        optimizer_G = None, optimizer_D = None, \
        target_norm_method = None):
    """
    f_run_one_epoch_WGAN: 
       similar to f_run_one_epoch_GAN, but for WGAN
    """
    # timer
    start_time = time.time()

    # number of critic updates per generator update (default 5)
    # note: argparse stores options with underscores, not hyphens
    if hasattr(args, "wgan_critic_num"):
        num_critic = args.wgan_critic_num
    else:
        num_critic = 5
    # clip value for the critic weights
    if hasattr(args, "wgan_clamp"):
        wgan_clamp = args.wgan_clamp
    else:
        wgan_clamp = 0.01

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        # zero the gradients
        if optimizer_G is not None:
            optimizer_G.zero_grad()
        if optimizer_D is not None:
            optimizer_D.zero_grad()

        # prepare data
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            if target_norm_method is None:
                normed_target = pt_model_G.normalize_target(data_tar)
            else:
                normed_target = target_norm_method(data_tar)
        else:
            nii_display.f_die("target data is required")

        # to device (we assume noise will be generated by the model itself)
        # here we only provide external condition
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)

        ############################
        # Update Discriminator
        ############################
        # train with real
        pt_model_D.zero_grad()
        d_out_real = pt_model_D(data_tar)
        errD_real = loss_wrapper.compute_gan_D_real(d_out_real)
        if optimizer_D is not None:
            errD_real.backward()
        d_out_real_mean = d_out_real.mean()

        # train with fake
        #  generate sample
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model_G(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model_G(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # special case when model.forward requires data_info
                data_gen = pt_model_G(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model_G(data_in)

        # data_gen.detach() is required
        # https://github.com/pytorch/examples/issues/116
        d_out_fake = pt_model_D(data_gen.detach())
        errD_fake = loss_wrapper.compute_gan_D_fake(d_out_fake)
        if optimizer_D is not None:
            errD_fake.backward()
        d_out_fake_mean = d_out_fake.mean()

        errD = errD_real + errD_fake
        if optimizer_D is not None:
            optimizer_D.step()

        # clip weights of discriminator
        for p in pt_model_D.parameters():
            p.data.clamp_(-wgan_clamp, wgan_clamp)

        ############################
        # Update Generator
        ############################
        pt_model_G.zero_grad()
        d_out_fake_for_G = pt_model_D(data_gen)
        errG_gan = loss_wrapper.compute_gan_G(d_out_fake_for_G)
        errG_aux = loss_wrapper.compute_aux(data_gen, data_tar)
        errG = errG_gan + errG_aux

        # only update the generator after num_critic critic iterations
        # (see the scheduling sketch after this function)
        if data_idx % num_critic == 0 and optimizer_G is not None:
            errG.backward()
            optimizer_G.step()

        d_out_fake_for_G_mean = d_out_fake_for_G.mean()

        # construct the loss for logging and early stopping
        # only use errG_aux for early-stopping
        loss_computed = [[
            errG_aux, errG_gan, errD_real, errD_fake, d_out_real_mean,
            d_out_fake_mean, d_out_fake_for_G_mean
        ], [True, False, False, False, False, False, False]]

        # to handle cases where there are multiple loss functions
        loss, loss_vals, loss_flags = nii_nn_tools.f_process_loss(
            loss_computed)

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the batch, thus, log loss_value
            # rather than loss_value / batchsize
            monitor.log_loss(loss_vals, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print info for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # loop done
    return
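# A standalone sketch of the two WGAN-specific steps used in the loop
# above: (1) clamping critic (discriminator) weights to
# [-wgan_clamp, +wgan_clamp] after each critic update, and (2) updating
# the generator only once every num_critic critic updates. Illustrative
# only; the helper names below are hypothetical.
def _update_generator_now(data_idx, num_critic=5):
    # the generator is updated when data_idx is a multiple of num_critic
    return data_idx % num_critic == 0

def _clip_critic_weights(critic, wgan_clamp=0.01):
    # weight clipping as in the original WGAN formulation
    for p in critic.parameters():
        p.data.clamp_(-wgan_clamp, wgan_clamp)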
def f_run_one_epoch(args,
                    pt_model, loss_wrapper, \
                    device, monitor,  \
                    data_loader, epoch_idx, optimizer = None):
    """
    f_run_one_epoch: 
       run one epoch over the dataset (for training or validation sets)

    Args:
       args:         from argparse
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over the loss function
                     loss_wrapper.compute(generated, target) 
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_process_monitor.py
       data_loader:  pytorch DataLoader
       epoch_idx:    int, index of the current epoch
       optimizer:    torch optimizer or None
                     if None, back-propagation will be skipped
                     (for the development set)
    """
    # timer
    start_time = time.time()

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        # idx_orig is the original idx in the dataset
        # which can be different from data_idx when shuffle = True
        #idx_orig = idx_orig.numpy()[0]
        #data_seq_info = data_info[0]

        # zero the gradients
        if optimizer is not None:
            optimizer.zero_grad()

        # send data to device and compute the model output
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                data_gen = pt_model(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but no data_tar is not loaded")
        else:
            # normal case for model.forward(input)
            data_gen = pt_model(data_in)

        # compute loss and do back-propagation
        loss_value = 0
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            normed_target = pt_model.normalize_target(data_tar)
            loss = loss_wrapper.compute(data_gen, normed_target)
            loss_value = loss.item()
            if optimizer is not None:
                loss.backward()
                optimizer.step()

        # log down process information
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            monitor.log_loss(loss_value / batchsize, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print info for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # loop done
    return
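# A minimal, self-contained illustration of the optimizer=None convention
# used by f_run_one_epoch above: the loss is still computed (for
# monitoring), but backward() and step() are skipped, as is done for the
# validation set. The helper below is hypothetical.
import torch

def _maybe_update(loss, optimizer=None):
    # loss: a scalar tensor; optimizer: torch optimizer or None
    if optimizer is not None:
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return loss.item()

_w = torch.nn.Parameter(torch.tensor(1.0))
_opt = torch.optim.SGD([_w], lr=0.1)
print(_maybe_update((_w - 2.0) ** 2, _opt))  # training step: _w is updated
print(_maybe_update((_w - 2.0) ** 2, None))  # validation: value only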
def f_train_wrapper_GAN(
        args, pt_model_G, pt_model_D, loss_wrapper, device, \
        optimizer_G_wrapper, optimizer_D_wrapper, \
        train_dataset_wrapper, \
        val_dataset_wrapper = None, \
        checkpoint_G = None, checkpoint_D = None):
    """ 
    f_train_wrapper_GAN(
       args, pt_model_G, pt_model_D, loss_wrapper, device, 
       optimizer_G_wrapper, optimizer_D_wrapper, 
       train_dataset_wrapper, val_dataset_wrapper = None,
       checkpoint_G = None, checkpoint_D = None):

      A wrapper to run the training process

    Args:
       args:         argument information given by argparse
       pt_model_G:   generator, pytorch model (torch.nn.Module)
       pt_model_D:   discriminator, pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over the loss functions
                     loss_wrapper.compute_gan_D_real(discriminator_output) 
                     loss_wrapper.compute_gan_D_fake(discriminator_output) 
                     loss_wrapper.compute_gan_G(discriminator_output)
                     loss_wrapper.compute_aux(generated, target)

       device:       torch.device("cuda") or torch.device("cpu")

       optimizer_G_wrapper: 
           an optimizer wrapper for the generator (defined in op_manager.py)
       optimizer_D_wrapper: 
           an optimizer wrapper for the discriminator (defined in op_manager.py)
       
       train_dataset_wrapper: 
           a wrapper over the training data set (data_io/default_data_io.py)
           train_dataset_wrapper.get_loader() returns a
           torch.utils.data.DataLoader
       
       val_dataset_wrapper: 
           a wrapper over the validation data set (data_io/default_data_io.py)
           it can be None.
       
       checkpoint_G:
           a checkpoint that stores everything needed to resume training of
           the generator (see the resume sketch after this function)

       checkpoint_D:
           a checkpoint that stores everything needed to resume training of
           the discriminator
    """

    nii_display.f_print_w_date("Start model training")

    # get the optimizer
    optimizer_G_wrapper.print_info()
    optimizer_D_wrapper.print_info()
    optimizer_G = optimizer_G_wrapper.optimizer
    optimizer_D = optimizer_D_wrapper.optimizer
    epoch_num = optimizer_G_wrapper.get_epoch_num()
    no_best_epoch_num = optimizer_G_wrapper.get_no_best_epoch_num()

    # get data loader for training set
    train_dataset_wrapper.print_info()
    train_data_loader = train_dataset_wrapper.get_loader()
    train_seq_num = train_dataset_wrapper.get_seq_num()

    # get the training process monitor
    monitor_trn = nii_monitor.Monitor(epoch_num, train_seq_num)

    # if validation data is provided, get data loader for val set
    if val_dataset_wrapper is not None:
        val_dataset_wrapper.print_info()
        val_data_loader = val_dataset_wrapper.get_loader()
        val_seq_num = val_dataset_wrapper.get_seq_num()
        monitor_val = nii_monitor.Monitor(epoch_num, val_seq_num)
    else:
        monitor_val = None

    # training log information
    train_log = ''
    model_tags = ["_G", "_D"]

    # prepare for DataParallism if available
    # pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    if torch.cuda.device_count() > 1 and args.multi_gpu_data_parallel:
        nii_display.f_die("data_parallel not implemented for GAN")
    else:
        nii_display.f_print("Use single GPU: %s" % \
                            (torch.cuda.get_device_name(device)))
        flag_multi_device = False
        normtarget_f = None
    pt_model_G.to(device, dtype=nii_dconf.d_dtype)
    pt_model_D.to(device, dtype=nii_dconf.d_dtype)

    # print the network
    nii_display.f_print("Setup generator")
    f_model_show(pt_model_G)
    nii_display.f_print("Setup discriminator")
    f_model_show(pt_model_D)

    # resume training or initialize the model if necessary
    cp_names = CheckPointKey()
    if checkpoint_G is not None or checkpoint_D is not None:
        for checkpoint, optimizer, pt_model, model_name in \
            zip([checkpoint_G, checkpoint_D], [optimizer_G, optimizer_D],
                [pt_model_G, pt_model_D], ["Generator", "Discriminator"]):
            nii_display.f_print("For %s" % (model_name))
            if type(checkpoint) is dict:
                # checkpoint
                # load model parameter and optimizer state
                if cp_names.state_dict in checkpoint:
                    # wrap the state_dic in f_state_dict_wrapper
                    # in case the model is saved when DataParallel is on
                    pt_model.load_state_dict(
                        nii_nn_tools.f_state_dict_wrapper(
                            checkpoint[cp_names.state_dict],
                            flag_multi_device))
                # load optimizer state
                if cp_names.optimizer in checkpoint:
                    optimizer.load_state_dict(checkpoint[cp_names.optimizer])
                # optionally, load training history
                if not args.ignore_training_history_in_trained_model:
                    #nii_display.f_print("Load ")
                    if cp_names.trnlog in checkpoint:
                        monitor_trn.load_state_dic(checkpoint[cp_names.trnlog])
                    if cp_names.vallog in checkpoint and monitor_val:
                        monitor_val.load_state_dic(checkpoint[cp_names.vallog])
                    if cp_names.info in checkpoint:
                        train_log = checkpoint[cp_names.info]
                    nii_display.f_print("Load check point, resume training")
                else:
                    nii_display.f_print("Load pretrained model and optimizer")
            elif checkpoint is not None:
                # only model status
                #pt_model.load_state_dict(checkpoint)
                pt_model.load_state_dict(
                    nii_nn_tools.f_state_dict_wrapper(checkpoint,
                                                      flag_multi_device))
                nii_display.f_print("Load pretrained model")
            else:
                nii_display.f_print("No pretrained model")

    # done for resume training

    # other variables
    flag_early_stopped = False
    start_epoch = monitor_trn.get_epoch()
    epoch_num = monitor_trn.get_max_epoch()

    if hasattr(loss_wrapper, "flag_wgan") and loss_wrapper.flag_wgan:
        f_wrapper_gan_one_epoch = f_run_one_epoch_WGAN
    else:
        f_wrapper_gan_one_epoch = f_run_one_epoch_GAN

    # print
    _ = nii_op_display_tk.print_log_head()
    nii_display.f_print_message(train_log, flush=True, end='')

    # loop over multiple epochs
    for epoch_idx in range(start_epoch, epoch_num):

        # training one epoch
        pt_model_D.train()
        pt_model_G.train()

        f_wrapper_gan_one_epoch(
            args, pt_model_G, pt_model_D,
            loss_wrapper, device, \
            monitor_trn, train_data_loader, \
            epoch_idx, optimizer_G, optimizer_D,
            normtarget_f)

        time_trn = monitor_trn.get_time(epoch_idx)
        loss_trn = monitor_trn.get_loss(epoch_idx)

        # if necessary, do validation
        if val_dataset_wrapper is not None:
            # set eval() if necessary
            if args.eval_mode_for_validation:
                pt_model_G.eval()
                pt_model_D.eval()
            with torch.no_grad():
                f_wrapper_gan_one_epoch(
                    args, pt_model_G, pt_model_D,
                    loss_wrapper, \
                    device, \
                    monitor_val, val_data_loader, \
                    epoch_idx, None, None, normtarget_f)
            time_val = monitor_val.get_time(epoch_idx)
            loss_val = monitor_val.get_loss(epoch_idx)
        else:
            time_val, loss_val = 0, 0

        if val_dataset_wrapper is not None:
            flag_new_best = monitor_val.is_new_best()
        else:
            flag_new_best = True

        # print information
        train_log += nii_op_display_tk.print_train_info(
            epoch_idx, time_trn, loss_trn, time_val, loss_val, flag_new_best)

        # save the best model
        if flag_new_best:
            for pt_model, model_tag in \
                zip([pt_model_G, pt_model_D], model_tags):
                tmp_best_name = f_save_trained_name_GAN(args, model_tag)
                torch.save(pt_model.state_dict(), tmp_best_name)

        # save intermediate model if necessary
        if not args.not_save_each_epoch:
            # save the discriminator and generator models
            for pt_model, optimizer, model_tag in \
                zip([pt_model_G, pt_model_D], [optimizer_G, optimizer_D],
                    model_tags):

                tmp_model_name = f_save_epoch_name_GAN(args, epoch_idx,
                                                       model_tag)
                if monitor_val is not None:
                    tmp_val_log = monitor_val.get_state_dic()
                else:
                    tmp_val_log = None
                # save
                tmp_dic = {
                    cp_names.state_dict: pt_model.state_dict(),
                    cp_names.info: train_log,
                    cp_names.optimizer: optimizer.state_dict(),
                    cp_names.trnlog: monitor_trn.get_state_dic(),
                    cp_names.vallog: tmp_val_log
                }
                torch.save(tmp_dic, tmp_model_name)
                if args.verbose == 1:
                    nii_display.f_eprint(str(datetime.datetime.now()))
                    nii_display.f_eprint("Save {:s}".format(tmp_model_name),
                                         flush=True)

        # early stopping
        if monitor_val is not None and \
           monitor_val.should_early_stop(no_best_epoch_num):
            flag_early_stopped = True
            break

    # loop done

    nii_op_display_tk.print_log_tail()
    if flag_early_stopped:
        nii_display.f_print("Training finished by early stopping")
    else:
        nii_display.f_print("Training finished")
    nii_display.f_print("Model is saved to", end='')
    for model_tag in model_tags:
        nii_display.f_print("{}".format(
            f_save_trained_name_GAN(args, model_tag)))
    return
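# A hedged sketch of resuming GAN training from the per-epoch checkpoints
# written by f_train_wrapper_GAN above. The dictionary keys come from
# CheckPointKey as used in the code; the file names below are hypothetical
# examples.
import torch

checkpoint_G = torch.load('epoch_005_G.pt', map_location='cpu')
checkpoint_D = torch.load('epoch_005_D.pt', map_location='cpu')
# each checkpoint is a dict holding the model state_dict, the optimizer
# state, the training/validation monitor logs, and the accumulated
# training log string, so passing them back resumes training:
# f_train_wrapper_GAN(args, pt_model_G, pt_model_D, loss_wrapper, device,
#                     optimizer_G_wrapper, optimizer_D_wrapper,
#                     train_dataset_wrapper, val_dataset_wrapper,
#                     checkpoint_G=checkpoint_G, checkpoint_D=checkpoint_D)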
def f_train_wrapper(args, pt_model, loss_wrapper, device, \
                    optimizer_wrapper, \
                    train_dataset_wrapper, \
                    val_dataset_wrapper = None, \
                    checkpoint = None):
    """ 
    f_train_wrapper(args, pt_model, loss_wrapper, device, 
                    optimizer_wrapper,
                    train_dataset_wrapper, val_dataset_wrapper = None,
                    checkpoint = None):
      A wrapper to run the training process

    Args:
       args:         argument information given by argparse
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over the loss function
                     loss_wrapper.compute(generated, target) 
       device:       torch.device("cuda") or torch.device("cpu")

       optimizer_wrapper: 
           a wrapper over the optimizer (defined in op_manager.py)
           optimizer_wrapper.optimizer is a torch.optim optimizer
    
       train_dataset_wrapper: 
           a wrapper over the training data set (data_io/default_data_io.py)
           train_dataset_wrapper.get_loader() returns a
           torch.utils.data.DataLoader
       
       val_dataset_wrapper: 
           a wrapper over the validation data set (data_io/default_data_io.py)
           it can be None.
       
       checkpoint:
           a checkpoint that stores everything needed to resume training
    """

    nii_display.f_print_w_date("Start model training")

    # get the optimizer
    optimizer_wrapper.print_info()
    optimizer = optimizer_wrapper.optimizer
    epoch_num = optimizer_wrapper.get_epoch_num()
    no_best_epoch_num = optimizer_wrapper.get_no_best_epoch_num()

    # get data loader for training set
    train_dataset_wrapper.print_info()
    train_data_loader = train_dataset_wrapper.get_loader()
    train_seq_num = train_dataset_wrapper.get_seq_num()

    # get the training process monitor
    monitor_trn = nii_monitor.Monitor(epoch_num, train_seq_num)

    # if validation data is provided, get data loader for val set
    if val_dataset_wrapper is not None:
        val_dataset_wrapper.print_info()
        val_data_loader = val_dataset_wrapper.get_loader()
        val_seq_num = val_dataset_wrapper.get_seq_num()
        monitor_val = nii_monitor.Monitor(epoch_num, val_seq_num)
    else:
        monitor_val = None

    # training log information
    train_log = ''

    # print the network
    pt_model.to(device, dtype=nii_dconf.d_dtype)
    f_model_show(pt_model)

    # resume training or initialize the model if necessary
    cp_names = CheckPointKey()
    if checkpoint is not None:
        if type(checkpoint) is dict:
            # checkpoint
            if cp_names.state_dict in checkpoint:
                pt_model.load_state_dict(checkpoint[cp_names.state_dict])
            if cp_names.optimizer in checkpoint:
                optimizer.load_state_dict(checkpoint[cp_names.optimizer])
            if cp_names.trnlog in checkpoint:
                monitor_trn.load_state_dic(checkpoint[cp_names.trnlog])
            if cp_names.vallog in checkpoint and monitor_val:
                monitor_val.load_state_dic(checkpoint[cp_names.vallog])
            if cp_names.info in checkpoint:
                train_log = checkpoint[cp_names.info]
            nii_display.f_print("Load check point and resume training")
        else:
            # only model status
            pt_model.load_state_dict(checkpoint)
            nii_display.f_print("Load pre-trained model")

    # other variables
    flag_early_stopped = False
    start_epoch = monitor_trn.get_epoch()
    epoch_num = monitor_trn.get_max_epoch()

    # print
    _ = nii_op_display_tk.print_log_head()
    nii_display.f_print_message(train_log, flush=True, end='')

    # loop over multiple epochs
    for epoch_idx in range(start_epoch, epoch_num):

        # training one epoch
        pt_model.train()
        f_run_one_epoch(args, pt_model, loss_wrapper, device, \
                        monitor_trn, train_data_loader, \
                        epoch_idx, optimizer)
        time_trn = monitor_trn.get_time(epoch_idx)
        loss_trn = monitor_trn.get_loss(epoch_idx)

        # if necessary, do validation
        if val_dataset_wrapper is not None:
            # set eval() if necessary
            if args.eval_mode_for_validation:
                pt_model.eval()
            with torch.no_grad():
                f_run_one_epoch(args, pt_model, loss_wrapper, \
                                device, \
                                monitor_val, val_data_loader, \
                                epoch_idx, None)
            time_val = monitor_val.get_time(epoch_idx)
            loss_val = monitor_val.get_loss(epoch_idx)
        else:
            time_val, loss_val = 0, 0

        if val_dataset_wrapper is not None:
            flag_new_best = monitor_val.is_new_best()
        else:
            flag_new_best = True

        # print information
        train_log += nii_op_display_tk.print_train_info(epoch_idx, \
                                                        time_trn, \
                                                        loss_trn, \
                                                        time_val, \
                                                        loss_val, \
                                                        flag_new_best)
        # save the best model
        if flag_new_best:
            tmp_best_name = f_save_trained_name(args)
            torch.save(pt_model.state_dict(), tmp_best_name)

        # save intermediate model if necessary
        if not args.not_save_each_epoch:
            tmp_model_name = f_save_epoch_name(args, epoch_idx)
            if monitor_val is not None:
                tmp_val_log = monitor_val.get_state_dic()
            else:
                tmp_val_log = None
            # save
            tmp_dic = {
                cp_names.state_dict: pt_model.state_dict(),
                cp_names.info: train_log,
                cp_names.optimizer: optimizer.state_dict(),
                cp_names.trnlog: monitor_trn.get_state_dic(),
                cp_names.vallog: tmp_val_log
            }
            torch.save(tmp_dic, tmp_model_name)
            if args.verbose == 1:
                nii_display.f_eprint(str(datetime.datetime.now()))
                nii_display.f_eprint("Save {:s}".format(tmp_model_name),
                                     flush=True)

        # early stopping
        if monitor_val is not None and \
           monitor_val.should_early_stop(no_best_epoch_num):
            flag_early_stopped = True
            break

    # loop done
    nii_op_display_tk.print_log_tail()
    if flag_early_stopped:
        nii_display.f_print("Training finished by early stopping")
    else:
        nii_display.f_print("Training finished")
    nii_display.f_print("Model is saved to", end='')
    nii_display.f_print("{}".format(f_save_trained_name(args)))
    return
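# An illustrative (non-library) sketch of the early-stopping rule that
# f_train_wrapper relies on via monitor_val.should_early_stop(): stop when
# the validation loss has not reached a new best value for
# no_best_epoch_num consecutive epochs. The helper below is hypothetical.
def _should_early_stop(val_losses, no_best_epoch_num):
    if not val_losses:
        return False
    best = min(val_losses)
    last_best_epoch = max(i for i, v in enumerate(val_losses) if v == best)
    return (len(val_losses) - 1 - last_best_epoch) >= no_best_epoch_num

# example: best loss at epoch 2, no improvement during the next 3 epochs
print(_should_early_stop([1.0, 0.8, 0.5, 0.6, 0.7, 0.9], 3))  # True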
def f_run_one_epoch_GAN(
        args, pt_model_G, pt_model_D,
        loss_wrapper, \
        device, monitor,  \
        data_loader, epoch_idx,
        optimizer_G = None, optimizer_D = None, \
        target_norm_method = None):
    """
    f_run_one_epoch_GAN: 
       run one epoch over the dataset (for training or validation sets)

    Args:
       args:         from argparse
       pt_model_G:   pytorch model (torch.nn.Module), generator
       pt_model_D:   pytorch model (torch.nn.Module), discriminator
       loss_wrapper: a wrapper over the GAN loss functions
                     (see the interface sketch after this function)
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_process_monitor.py
       data_loader:  pytorch DataLoader
       epoch_idx:    int, index of the current epoch
       optimizer_G:  torch optimizer or None, for the generator
       optimizer_D:  torch optimizer or None, for the discriminator
                     if None, back-propagation will be skipped
                     (for the development set)
       target_norm_method: method to normalize the target data
                           (by default, use pt_model_G.normalize_target)
    """
    # timer
    start_time = time.time()

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        # zero the gradients
        if optimizer_G is not None:
            optimizer_G.zero_grad()
        if optimizer_D is not None:
            optimizer_D.zero_grad()

        # prepare data
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            if target_norm_method is None:
                normed_target = pt_model_G.normalize_target(data_tar)
            else:
                normed_target = target_norm_method(data_tar)
        else:
            nii_display.f_die("target data is required")

        # to device (we assume noise will be generated by the model itself)
        # here we only provide external condition
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)

        ############################
        # Update Discriminator
        ############################
        # train with real
        pt_model_D.zero_grad()
        d_out_real = pt_model_D(data_tar)
        errD_real = loss_wrapper.compute_gan_D_real(d_out_real)
        if optimizer_D is not None:
            errD_real.backward()

        # this should be given by pt_model_D or loss wrapper
        #d_out_real_mean = d_out_real.mean()

        # train with fake
        #  generate sample
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model_G(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model_G(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # special case when model.forward requires data_info
                data_gen = pt_model_G(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model_G(data_in)

        # data_gen.detach() is required
        # https://github.com/pytorch/examples/issues/116
        d_out_fake = pt_model_D(data_gen.detach())
        errD_fake = loss_wrapper.compute_gan_D_fake(d_out_fake)
        if optimizer_D is not None:
            errD_fake.backward()

        errD = errD_real + errD_fake
        if optimizer_D is not None:
            optimizer_D.step()

        ############################
        # Update Generator
        ############################
        pt_model_G.zero_grad()
        d_out_fake_for_G = pt_model_D(data_gen)
        errG_gan = loss_wrapper.compute_gan_G(d_out_fake_for_G)

        # if defined, calculate the auxiliary loss
        if hasattr(loss_wrapper, "compute_aux"):
            errG_aux = loss_wrapper.compute_aux(data_gen, data_tar)
        else:
            errG_aux = torch.zeros_like(errG_gan)

        # if defined, calculate feat-matching loss
        if hasattr(loss_wrapper, "compute_feat_match"):
            errG_feat = loss_wrapper.compute_feat_match(
                d_out_real, d_out_fake_for_G)
        else:
            errG_feat = torch.zeros_like(errG_gan)

        # sum loss for generator
        errG = errG_gan + errG_aux + errG_feat

        if optimizer_G is not None:
            errG.backward()
            optimizer_G.step()

        # construct the loss for logging and early stopping
        # only use errG_aux for early-stopping
        loss_computed = [[errG_aux, errD_real, errD_fake, errG_gan, errG_feat],
                         [True, False, False, False, False]]

        # to handle cases where there are multiple loss functions
        _, loss_vals, loss_flags = nii_nn_tools.f_process_loss(loss_computed)

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the batch, thus, log loss_value
            # rather than loss_value / batchsize
            monitor.log_loss(loss_vals, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print info for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # loop done
    return
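# A hedged sketch of the interface a loss_wrapper is expected to expose for
# the GAN loops above. The class below is an illustrative LSGAN-style
# example, not the project's actual loss module; only the method names and
# the optional flag_wgan / compute_aux / compute_feat_match hooks mirror
# the code above.
import torch
import torch.nn as nn

class ExampleGANLoss:
    def __init__(self):
        self.flag_wgan = False      # True would select f_run_one_epoch_WGAN
        self.mse = nn.MSELoss()
        self.aux = nn.L1Loss()

    def compute_gan_D_real(self, d_out_real):
        # the discriminator should output 1 on real data
        return self.mse(d_out_real, torch.ones_like(d_out_real))

    def compute_gan_D_fake(self, d_out_fake):
        # the discriminator should output 0 on generated data
        return self.mse(d_out_fake, torch.zeros_like(d_out_fake))

    def compute_gan_G(self, d_out_fake):
        # the generator tries to make the discriminator output 1
        return self.mse(d_out_fake, torch.ones_like(d_out_fake))

    def compute_aux(self, generated, target):
        # optional auxiliary reconstruction loss
        return self.aux(generated, target)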