def __init__(self, model, args):
    """ Initialize an optimizer over model.parameters()

    Args:
        model: an object exposing parameters() (torch.nn.Module)
        args: argparse namespace with optimizer, lr, epochs,
              no_best_epochs
    """
    # an object without parameters() cannot be optimized
    if not hasattr(model, "parameters"):
        nii_warn.f_print("model is not torch.nn", "error")
        nii_warn.f_die("Error in creating OptimizerWrapper")

    # remember the requested optimizer type and learning rate
    self.op_flag = args.optimizer
    self.lr = args.lr

    # dispatch table of supported optimizer constructors
    builders = {"Adam": torch_optim.Adam,
                "RMSprop": torch_optim.RMSprop}
    if self.op_flag in builders:
        self.optimizer = builders[self.op_flag](model.parameters(),
                                                lr=self.lr)
    else:
        nii_warn.f_print("%s not availabel" % (self.op_flag), "error")
        nii_warn.f_die("Please change optimizer")

    # training schedule information kept for the caller
    self.epochs = args.epochs
    self.no_best_epochs = args.no_best_epochs
    return
def f_model_show(pt_model):
    """ f_model_show(pt_model)

    Print the model structure and the number of trainable parameters.

    Args:
      pt_model, a Pytorch model
    """
    print(pt_model)
    # count only parameters that receive gradients
    trainable = [p.numel() for p in pt_model.parameters()
                 if p.requires_grad]
    nii_display.f_print(
        "Parameter number: {:d}".format(sum(trainable)), "normal")
    return
def f_parse(self):
    """ f_parse
    Parse the configuration file at self.m_config_path.

    Returns:
        configparser.ConfigParser on success, None when no
        configuration path was provided.
    """
    # nothing to do when no configuration path was supplied
    if self.m_config_path is None:
        nii_display.f_print("No config file provided", 'error')
        return None

    parser = configparser.ConfigParser()
    parser.read(self.m_config_path)
    return parser
def _get_loss_for_learning_stopping(self, epoch_idx):
    """ Return the flag-weighted total loss of one epoch.

    Sums the per-sequence losses of epoch epoch_idx and keeps only
    the loss components whose flag in self.loss_flag is set, so that
    early stopping can be driven by a subset of the losses.
    """
    # reject indices outside [0, cur_epoch]
    if epoch_idx > self.cur_epoch:
        nii_display.f_print("To find loss for future epochs", 'error')
        nii_display.f_die("Op_process_monitor: error")
    if epoch_idx < 0:
        nii_display.f_print("To find loss for NULL epoch", 'error')
        nii_display.f_die("Op_process_monitor: error")
    # sum over sequences -> one value per loss component
    per_component = np.sum(self.loss_mat[epoch_idx, :, :], axis=0)
    # mask by loss_flag and reduce to a scalar
    return np.sum(per_component * self.loss_flag)
def f_inference_wrapper(args, pt_model, device, \
                        test_dataset_wrapper, checkpoint):
    """ f_inference_wrapper(args, pt_model, device,
                            test_dataset_wrapper, checkpoint)

    Run inference (generation) over the test set and save the
    generated data through test_dataset_wrapper.putitem().

    Args:
      args: argparse namespace (uses model_forward_with_target,
            output_dir)
      pt_model: Pytorch model
      device: torch.device
      test_dataset_wrapper: data-io wrapper providing get_loader(),
            get_seq_num(), print_info(), putitem()
      checkpoint: a state_dict, or a dict containing one under the
            CheckPointKey state_dict key
    """
    # prepare dataloader over the test set
    test_data_loader = test_dataset_wrapper.get_loader()
    test_seq_num = test_dataset_wrapper.get_seq_num()
    test_dataset_wrapper.print_info()

    # print the network
    pt_model.to(device, dtype=nii_dconf.d_dtype)
    print(pt_model)

    # load trained parameters (checkpoint may be a full dict or a
    # bare state_dict)
    cp_names = CheckPointKey()
    if type(checkpoint) is dict and cp_names.state_dict in checkpoint:
        pt_model.load_state_dict(checkpoint[cp_names.state_dict])
    else:
        pt_model.load_state_dict(checkpoint)

    pt_model.eval()
    with torch.no_grad():
        for _, (data_in, data_tar, data_info, idx_orig) in \
            enumerate(test_data_loader):

            # send data to device
            # NOTE(review): data_in is not cast to nii_dconf.d_dtype
            # here, unlike data_tar below — confirm this is intended
            data_in = data_in.to(device)
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)

            # compute output
            start_time = time.time()
            if args.model_forward_with_target:
                # if model.forward requires (input, target) as arguments
                # for example, for auto-encoder
                data_gen = pt_model(data_in, data_tar)
            else:
                data_gen = pt_model(data_in)
            # undo the output normalization before saving
            data_gen = pt_model.denormalize_output(data_gen)
            time_cost = time.time() - start_time
            # average time for each sequence when batchsize > 1
            time_cost = time_cost / len(data_info)

            # save output (in case batchsize > 1, )
            data_gen_np = data_gen.to("cpu").numpy()
            for idx, seq_info in enumerate(data_info):
                _ = nii_op_display_tk.print_gen_info(seq_info, time_cost)
                test_dataset_wrapper.putitem(data_gen_np[idx:idx+1],\
                                             args.output_dir, \
                                             seq_info)

    # nii_display.f_print("Generated data to %s" % (args.output_dir))
    # done
    return
def load_state_dic(self, state_dic):
    """ resume training, load the information

    Restores the monitor's loss/time logs and best-epoch bookkeeping
    from a saved state dict. If the new run allows more epochs than
    the saved one, the loss/time matrices are enlarged and the saved
    history is copied into the leading rows.
    """
    try:
        # the monitor and the checkpoint must describe the same data set
        if self.seq_num != state_dic['seq_num']:
            nii_display.f_print("Number of samples are different \
from previous training", 'error')
            nii_display.f_print("Please make sure that you are \
using the same training/development sets as before.", "error")
            nii_display.f_print("Or\nPlease add --")
            nii_display.f_print("ignore_training_history_in_trained_model")
            nii_display.f_die(" to avoid loading training history")

        if self.epoch_num == state_dic['epoch_num']:
            # same number of epochs: take the saved logs as they are
            self.loss_mat = state_dic['loss_mat']
            self.time_mat = state_dic['time_mat']
        else:
            # if training epoch is increased, resize the shape
            # and copy the saved history into the leading epochs
            tmp_loss_mat = state_dic['loss_mat']
            self.loss_mat = np.resize(
                self.loss_mat,
                [self.epoch_num, self.seq_num, tmp_loss_mat.shape[2]])
            self.loss_mat[0:tmp_loss_mat.shape[0]] = tmp_loss_mat
            self.time_mat[0:tmp_loss_mat.shape[0]] = state_dic['time_mat']

        self.seq_num = state_dic['seq_num']
        # since the saved cur_epoch has been finished, resume at +1
        self.cur_epoch = state_dic['cur_epoch'] + 1
        self.best_error = state_dic['best_error']
        self.best_epoch = state_dic['best_epoch']
        self.loss_flag = state_dic['loss_flag']
        self.seq_names = {}
    except KeyError:
        nii_display.f_die("Invalid op_process_monitor state_dic")
def f_check_file_list(self):
    """ f_check_file_list():
        Check the file list after initialization
        Make sure that the file in file_list appears in every
        input/output feature directory.
        If not, get a file_list in which every file is avaiable
        in every input/output directory
    """
    if not isinstance(self.m_file_list, list):
        nii_warn.f_print("Read file list from directories")
        # BUG FIX: the original assigned self.m_list = None, which left
        # a non-list m_file_list (e.g. a str) in place so the rebuild
        # below never triggered; reset m_file_list instead
        self.m_file_list = None

    # get a initial file list from the first input directory
    if self.m_file_list is None:
        self.m_file_list = nii_list_tools.listdir_with_ext(
            self.m_input_dirs[0], self.m_input_exts[0])

    # keep only files present in every remaining input directory
    for tmp_d, tmp_e in zip(self.m_input_dirs[1:], \
                            self.m_input_exts[1:]):
        tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e)
        self.m_file_list = nii_list_tools.common_members(
            tmp_list, self.m_file_list)

    if len(self.m_file_list) < 1:
        nii_warn.f_print("No input features after scannning", 'error')
        nii_warn.f_print("Please check input config", 'error')
        nii_warn.f_print("Please check feature directory", 'error')

    # check output files if necessary
    if self.m_output_dirs:
        for tmp_d, tmp_e in zip(self.m_output_dirs, \
                                self.m_output_exts):
            tmp_list = nii_list_tools.listdir_with_ext(tmp_d, tmp_e)
            self.m_file_list = nii_list_tools.common_members(
                tmp_list, self.m_file_list)
        if len(self.m_file_list) < 1:
            nii_warn.f_print("No output data found", 'error')
            nii_warn.f_die("Please check outpupt config")
    else:
        #nii_warn.f_print("Not loading output features")
        pass
    # done
    return
def f_putitem(self, output_data, save_dir, data_infor_str):
    """ f_putitem(output_data, save_dir, data_infor_str)

    Write one generated sequence to disk, splitting the concatenated
    feature dimensions into one file per configured output extension.

    Args:
      output_data: numpy array, (1, length, dim) or (1, length)
      save_dir: str, output directory (created if missing)
      data_infor_str: str, serialized SeqInfo describing the sequence
    """
    # Change the dimension to (length, dim)
    if output_data.ndim == 3 and output_data.shape[0] == 1:
        # When input data is (batchsize=1, length, dim)
        output_data = output_data[0]
    elif output_data.ndim == 2 and output_data.shape[0] == 1:
        # When input data is (batchsize=1, length)
        output_data = np.expand_dims(output_data[0], -1)
    else:
        nii_warn.f_print("Output data format not supported.", "error")
        nii_warn.f_print("Format is not (batch, len, dim)", "error")
        nii_warn.f_die("Please use batch_size = 1 in generation")

    # Save output
    if output_data.shape[1] != self.m_output_all_dim:
        nii_warn.f_print("Output data dim != expected dim", "error")
        nii_warn.f_print("Output:%d" % (output_data.shape[1]), \
                         "error")
        nii_warn.f_print("Expected:%d" % (self.m_output_all_dim), \
                         "error")
        nii_warn.f_die("Please check configuration")

    if not os.path.isdir(save_dir):
        try:
            os.mkdir(save_dir)
        except OSError:
            nii_warn.f_die("Cannot carete {}".format(save_dir))

    # read the sentence information
    tmp_seq_info = nii_seqinfo.SeqInfo()
    tmp_seq_info.parse_from_str(data_infor_str)

    # write the data, one dimension slice per output extension
    file_name = tmp_seq_info.seq_tag()
    s_dim = 0
    e_dim = 0
    for t_ext, t_dim in zip(self.m_output_exts, self.m_output_dims):
        e_dim = s_dim + t_dim
        file_path = nii_str_tk.f_realpath(save_dir, file_name, t_ext)
        self.f_write_data(output_data[:, s_dim:e_dim], file_path)
        # BUG FIX: advance the start index; the original never updated
        # s_dim, so every feature was written from column 0 with a
        # growing width instead of its own slice
        s_dim = e_dim
    return
def f_model_show(pt_model, do_model_def_check=True, model_type=None):
    """ f_model_show(pt_model, do_model_check=True)

    Print the informaiton of the model

    Args:
      pt_model, a Pytorch model
      do_model_def_check, bool, whether check model definition (default True)
      model_type: str or None (default None), what type of network

    Return:
      None
    """
    # optionally verify the model implements the required methods
    if do_model_def_check:
        f_model_check(pt_model, model_type)

    nii_display.f_print("Model infor:")
    print(pt_model)

    # count only parameters that receive gradients
    trainable = [p.numel() for p in pt_model.parameters()
                 if p.requires_grad]
    nii_display.f_print(
        "Parameter number: {:d}\n".format(sum(trainable)), "normal")
    return
def load_state_dic(self, state_dic):
    """ resume training, load the information

    Restores the monitor's logs and best-epoch bookkeeping from a
    saved state dict produced by get_state_dic().
    """
    try:
        # the checkpoint must describe the same data set
        if self.seq_num != state_dic['seq_num']:
            nii_display.f_print("Number of samples are different \
from previous training", 'error')
            nii_display.f_die("Please make sure resumed training are \
using the same training/development sets as before")

        # copy the logged matrices and counters verbatim
        for key in ['loss_mat', 'time_mat', 'epoch_num', 'seq_num']:
            setattr(self, key, state_dic[key])

        # since the saved cur_epoch has been finished, resume at +1
        self.cur_epoch = state_dic['cur_epoch'] + 1
        self.best_error = state_dic['best_error']
        self.best_epoch = state_dic['best_epoch']
        self.seq_names = {}
    except KeyError:
        nii_display.f_die("Invalid op_process_monitor state_dic")
def f_log_data_len(self, file_name, t_len, t_reso): """ f_log_data_len(file_name, t_len, t_reso): Log down the length of the data file. When comparing the different input/output features for the same file_name, only keep the shortest length """ # the length for the sequence with the fast tempoeral rate # For example, acoustic-feature -> waveform 16kHz, # if acoustic-feature is one frame per 5ms, # tmp_len = acoustic feature frame length * (5 * 16) # where t_reso = 5*16 is the up-sampling rate of acoustic feature tmp_len = t_len * t_reso # save length when have not read the file if file_name not in self.m_data_length: self.m_data_length[file_name] = tmp_len # check length if t_len == 1: # if this is an utterance-level feature, it has only 1 frame pass elif self.f_valid_len(self.m_data_length[file_name], tmp_len, \ nii_dconf.data_seq_min_length): # if the difference in length is small if self.m_data_length[file_name] > tmp_len: self.m_data_length[file_name] = tmp_len else: nii_warn.f_print("Sequence length mismatch:", 'error') self.f_check_specific_data(file_name) nii_warn.f_print("Please the above features", 'error') nii_warn.f_die("Possible invalid data %s" % (file_name)) # adjust the length so that, when reso is used, # the sequence length will be N * reso tmp = self.m_data_length[file_name] self.m_data_length[file_name] = self.f_adjust_len(tmp) return
def listdir_with_ext(file_dir, file_ext=None):
    """ file_list = lstdir_with_ext(file_dir, file_ext=None)

    Return a list of file names (extension stripped) found in a
    directory, optionally keeping only names ending with file_ext.
    Hidden files (leading '.') are always skipped.

    Args:
      file_dir: a file directory
      file_ext: string, specify the extention, e.g., txt, bin
    Return:
      file_list: a list of file_names
    """
    try:
        entries = os.listdir(file_dir)
    except OSError:
        # directory missing or unreadable -> warn and return empty
        nii_warn.f_print("Cannot access %s" % (file_dir), "error")
        return []

    kept = []
    for entry in entries:
        if entry.startswith('.'):
            continue
        if file_ext is not None and not entry.endswith(file_ext):
            continue
        kept.append(os.path.splitext(entry)[0])
    return kept
def __init__(self, optimizer, args):
    """ Initialize a learning-rate scheduler wrapper over optimizer.

    Args:
        optimizer: a torch optimizer instance
        args: argparse namespace providing lr_decay_factor,
              lr_scheduler_type, no_best_epochs
    """
    # learning rate decay
    self.lr_decay = args.lr_decay_factor
    # lr scheduler type
    # please check arg_parse.py for the number ID
    self.lr_scheduler_type = args.lr_scheduler_type
    # patentience for ReduceLROnPlateau
    self.lr_patience = 5
    # step size for stepLR
    self.lr_stepLR_size = 10

    if self.lr_decay > 0:
        if self.lr_scheduler_type == 1:
            # StepLR: decay lr by gamma every lr_stepLR_size epochs
            self.lr_scheduler = torch.optim.lr_scheduler.StepLR(
                optimizer=optimizer, step_size=self.lr_stepLR_size,
                gamma=self.lr_decay)
        else:
            # by default, ReduceLROnPlateau
            # NOTE(review): args.no_best_epochs is only tested here and
            # never assigned to lr_patience when it is >= 0, so the
            # patience always stays at the default 5 — confirm intent
            if args.no_best_epochs < 0:
                self.lr_patience = 5
                nii_warn.f_print("--no-best-epochs is set to 5 ")
                nii_warn.f_print("for learning rate decaying")
            # NOTE(review): torch_optim_steplr is presumably an alias of
            # torch.optim.lr_scheduler despite its name — verify import
            self.lr_scheduler = torch_optim_steplr.ReduceLROnPlateau(
                optimizer=optimizer, factor=self.lr_decay,
                patience=self.lr_patience)
        # flag marks that a scheduler is active
        self.flag = True
    else:
        self.lr_scheduler = None
        self.flag = False
    return
def __init__(self, model, args):
    """ Initialize an optimizer over model.parameters()

    Args:
        model: an object exposing parameters() (torch.nn.Module)
        args: argparse namespace with optimizer, lr, l2_penalty,
              grad_clip_norm, epochs, no_best_epochs
    """
    # an object without parameters() cannot be optimized
    if not hasattr(model, "parameters"):
        nii_warn.f_print("model is not torch.nn", "error")
        nii_warn.f_die("Error in creating OptimizerWrapper")

    # optimizer configuration
    self.op_flag = args.optimizer
    self.lr = args.lr
    self.l2_penalty = args.l2_penalty
    # grad clip norm is directly added in nn_manager
    self.grad_clip_norm = args.grad_clip_norm

    # build the underlying torch optimizer
    if self.op_flag == "Adam":
        # only pass weight_decay when a positive penalty is requested
        opt_kwargs = {'lr': self.lr}
        if self.l2_penalty > 0:
            opt_kwargs['weight_decay'] = self.l2_penalty
        self.optimizer = torch_optim.Adam(model.parameters(),
                                          **opt_kwargs)
    else:
        nii_warn.f_print("%s not availabel" % (self.op_flag), "error")
        nii_warn.f_die("Please change optimizer")

    # training schedule information kept for the caller
    self.epochs = args.epochs
    self.no_best_epochs = args.no_best_epochs

    # lr scheduler
    self.lr_scheduler = nii_lr_scheduler.LRScheduler(self.optimizer, args)
    return
def f_init_data_len_stats(self, data_path):
    """ flag = f_init_data_len_stats(self, data_path)

    Check whether data length has been stored in data_pat.
    If yes, load data_path and return False
    Else, return True
    """
    # reset accumulated length statistics
    self.m_seq_info = []
    self.m_data_length = {}
    self.m_data_total_length = 0

    flag = True
    if os.path.isfile(data_path):
        # load data length from pre-stored *.dic
        # NOTE(review): reads self.m_data_len_path rather than the
        # data_path argument checked above — presumably identical at
        # the call site; confirm before relying on data_path
        dic_seq_infos = nii_io_tk.read_dic(self.m_data_len_path)
        for dic_seq_info in dic_seq_infos:
            seq_info = nii_seqinfo.SeqInfo()
            seq_info.load_from_dic(dic_seq_info)
            self.m_seq_info.append(seq_info)
            seq_tag = seq_info.seq_tag()
            # accumulate length per file tag (a file may contribute
            # multiple segments)
            if seq_tag not in self.m_data_length:
                self.m_data_length[seq_tag] = seq_info.seq_length()
            else:
                self.m_data_length[seq_tag] += seq_info.seq_length()
        self.m_data_total_length = self.f_sum_data_length()

        # check whether *.dic contains files in filelist
        if nii_list_tools.list_identical(self.m_file_list,\
                                         self.m_data_length.keys()):
            nii_warn.f_print("Read sequence info: %s" % (data_path))
            flag = False
        else:
            # mismatch with the file list -> discard and recompute
            self.m_seq_info = []
            self.m_data_length = {}
            self.m_data_total_length = 0
    return flag
def f_init_mean_std(self, ms_input_path, ms_output_path):
    """ f_init_mean_std
    Initialzie mean and std vectors for input and output

    Args:
      ms_input_path: str, path to stored input mean/std vector
      ms_output_path: str, path to stored output mean/std vector
    Return:
      flag: bool, True if mean/std must be (re)computed,
            False if they were loaded (or live in the network)
    """
    # default: zero mean and unit std (i.e. no normalization)
    self.m_input_mean = np.zeros([self.m_input_all_dim])
    self.m_input_std = np.ones([self.m_input_all_dim])
    self.m_output_mean = np.zeros([self.m_output_all_dim])
    self.m_output_std = np.ones([self.m_output_all_dim])

    flag = True
    if not self.m_save_ms:
        # assume mean/std will be in the network
        flag = False

    if os.path.isfile(ms_input_path) and \
       os.path.isfile(ms_output_path):
        # load mean and std if exists
        ms_input = self.f_load_data(ms_input_path, 1)
        ms_output = self.f_load_data(ms_output_path, 1)

        # each file stores mean followed by std, hence dim * 2
        if ms_input.shape[0] != (self.m_input_all_dim * 2) or \
           ms_output.shape[0] != (self.m_output_all_dim * 2):
            if ms_input.shape[0] != (self.m_input_all_dim * 2):
                # BUG FIX: original referenced undefined name
                # m_input_path (NameError); parameter is ms_input_path
                nii_warn.f_print("%s incompatible" % (ms_input_path),
                                 'warning')
            if ms_output.shape[0] != (self.m_output_all_dim * 2):
                # BUG FIX: same for m_output_path -> ms_output_path
                nii_warn.f_print("%s incompatible" % (ms_output_path),
                                 'warning')
            nii_warn.f_print("mean/std will be recomputed", 'warning')
        else:
            # split each vector into its mean and std halves
            self.m_input_mean = ms_input[0:self.m_input_all_dim]
            self.m_input_std = ms_input[self.m_input_all_dim:]
            self.m_output_mean = ms_output[0:self.m_output_all_dim]
            self.m_output_std = ms_output[self.m_output_all_dim:]
            nii_warn.f_print("Load mean/std from %s and %s" % \
                             (ms_input_path, ms_output_path))
            flag = False
    return flag
def f_loss_check(loss_module, model_type=None):
    """ f_loss_check(pt_model)

    Check whether the loss module contains all the necessary keywords

    Args:
    ----
      loss_module, a class
      model_type, a str or None

    Return:
    -------
    """
    nii_display.f_print("Loss check")

    # pick the keyword bag for this model type, else the default one
    bags = nii_nn_manage_conf.loss_method_keywords_bags
    if model_type in bags:
        keywords_bag = bags[model_type]
    else:
        keywords_bag = nii_nn_manage_conf.loss_method_keywords_default

    for tmpkey, (flag_mandatory, mes) in keywords_bag.items():
        present = hasattr(loss_module, tmpkey)
        if flag_mandatory and not present:
            # a mandatory method is missing -> stop with a hint
            nii_display.f_print("Please implement %s (%s)" % (tmpkey, mes))
            nii_display.f_die("[Error]: found no %s in Loss" % (tmpkey))
        elif not flag_mandatory and present:
            # an optional hook is provided -> report its use
            print("[OK]: use %s, %s" % (tmpkey, mes))
        # mandatory+present and optional+absent need no output

    # done
    nii_display.f_print("Loss check done\n")
    return
def f_model_check(pt_model, model_type=None):
    """ f_model_check(pt_model)

    Check whether the model contains all the necessary keywords

    Args:
    ----
      pt_model: a Pytorch model
      model_type_flag: str or None, a flag indicating the type of network

    Return:
    -------
    """
    nii_display.f_print("Model check:")

    # pick the keyword bag for this model type, else the default one
    bags = nii_nn_manage_conf.nn_model_keywords_bags
    if model_type in bags:
        keywords_bag = bags[model_type]
    else:
        keywords_bag = nii_nn_manage_conf.nn_model_keywords_default

    for tmpkey, (flag_mandatory, mes) in keywords_bag.items():
        present = hasattr(pt_model, tmpkey)
        if flag_mandatory:
            if not present:
                # a mandatory method is missing -> stop with a hint
                nii_display.f_print(
                    "Please implement %s (%s)" % (tmpkey, mes))
                nii_display.f_die(
                    "[Error]: found no %s in Model" % (tmpkey))
            else:
                print("[OK]: %s found" % (tmpkey))
        else:
            if present:
                print("[OK]: use %s, %s" % (tmpkey, mes))
            else:
                print("[OK]: %s is ignored, %s" % (tmpkey, mes))

    # done
    nii_display.f_print("Model check done\n")
    return
def f_inference_wrapper(args, pt_model, device, \
                        test_dataset_wrapper, checkpoint):
    """ Wrapper for inference

    Loads trained parameters from checkpoint, runs the model (or its
    explicit inference() method) over the test set, and saves each
    generated sequence through test_dataset_wrapper.putitem().
    """
    # prepare dataloader
    test_data_loader = test_dataset_wrapper.get_loader()
    test_seq_num = test_dataset_wrapper.get_seq_num()
    test_dataset_wrapper.print_info()

    # cuda device
    if torch.cuda.device_count() > 1 and args.multi_gpu_data_parallel:
        nii_display.f_print(
            "DataParallel for inference is not implemented", 'warning')
    nii_display.f_print("\nUse single GPU: %s\n" % \
                        (torch.cuda.get_device_name(device)))

    # print the network
    pt_model.to(device, dtype=nii_dconf.d_dtype)
    nii_nn_tools.f_model_show(pt_model)

    # load trained model parameters from checkpoint
    # (checkpoint may be a full dict or a bare state_dict)
    cp_names = nii_nn_manage_conf.CheckPointKey()
    if type(checkpoint) is dict and cp_names.state_dict in checkpoint:
        pt_model.load_state_dict(checkpoint[cp_names.state_dict])
    else:
        pt_model.load_state_dict(checkpoint)

    # start generation
    nii_display.f_print("Start inference (generation):", 'highlight')
    pt_model.eval()
    with torch.no_grad():
        for _, (data_in, data_tar, data_info, idx_orig) in \
            enumerate(test_data_loader):

            # send data to device and convert data type
            data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)

            # compute output
            start_time = time.time()

            # in case the model defines inference function explicitly
            if hasattr(pt_model, "inference"):
                infer_func = pt_model.inference
            else:
                infer_func = pt_model.forward

            if args.model_forward_with_target:
                # if model.forward requires (input, target) as arguments
                # for example, for auto-encoder
                if args.model_forward_with_file_name:
                    data_gen = infer_func(data_in, data_tar, data_info)
                else:
                    data_gen = infer_func(data_in, data_tar)
            else:
                if args.model_forward_with_file_name:
                    data_gen = infer_func(data_in, data_info)
                else:
                    data_gen = infer_func(data_in)

            time_cost = time.time() - start_time
            # average time for each sequence when batchsize > 1
            time_cost = time_cost / len(data_info)

            if data_gen is None:
                # model chose to save nothing for this batch
                nii_display.f_print("No output saved: %s" % (str(data_info)),\
                                    'warning')
                for idx, seq_info in enumerate(data_info):
                    _ = nii_op_display_tk.print_gen_info(seq_info, time_cost)
                continue
            else:
                try:
                    # undo output normalization and move to cpu numpy
                    data_gen = pt_model.denormalize_output(data_gen)
                    data_gen_np = data_gen.to("cpu").numpy()
                except AttributeError:
                    mes = "Output data is not torch.tensor. Please check "
                    mes += "model.forward or model.inference"
                    nii_display.f_die(mes)

            # save output (in case batchsize > 1, )
            for idx, seq_info in enumerate(data_info):
                _ = nii_op_display_tk.print_gen_info(seq_info, time_cost)
                test_dataset_wrapper.putitem(data_gen_np[idx:idx+1],\
                                             args.output_dir, \
                                             seq_info)
        # done for
    # done with

    # nii_display.f_print("Generated data to %s" % (args.output_dir))
    # finish up if necessary
    if hasattr(pt_model, "finish_up_inference"):
        pt_model.finish_up_inference()

    # done
    return
def f_train_wrapper(args, pt_model, loss_wrapper, device, \
                    optimizer_wrapper, \
                    train_dataset_wrapper, \
                    val_dataset_wrapper = None, \
                    checkpoint = None):
    """
    f_train_wrapper(args, pt_model, loss_wrapper, device,
                    optimizer_wrapper
                    train_dataset_wrapper, val_dataset_wrapper = None,
                    check_point = None):
      A wrapper to run the training process

    Args:
       args:         argument information given by argpase
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target)
       device:       torch.device("cuda") or torch.device("cpu")

       optimizer_wrapper:
           a wrapper over optimizer (defined in op_manager.py)
           optimizer_wrapper.optimizer is torch.optimizer

       train_dataset_wrapper:
           a wrapper over training data set (data_io/default_data_io.py)
           train_dataset_wrapper.get_loader() returns torch.DataSetLoader

       val_dataset_wrapper:
           a wrapper over validation data set (data_io/default_data_io.py)
           it can None.

       check_point:
           a check_point that stores every thing to resume training
    """
    nii_display.f_print_w_date("Start model training")

    ##############
    ## Preparation
    ##############
    # get the optimizer
    optimizer_wrapper.print_info()
    optimizer = optimizer_wrapper.optimizer
    lr_scheduler = optimizer_wrapper.lr_scheduler
    epoch_num = optimizer_wrapper.get_epoch_num()
    no_best_epoch_num = optimizer_wrapper.get_no_best_epoch_num()

    # get data loader for training set
    train_dataset_wrapper.print_info()
    train_data_loader = train_dataset_wrapper.get_loader()
    train_seq_num = train_dataset_wrapper.get_seq_num()

    # get the training process monitor
    monitor_trn = nii_monitor.Monitor(epoch_num, train_seq_num)

    # if validation data is provided, get data loader for val set
    if val_dataset_wrapper is not None:
        val_dataset_wrapper.print_info()
        val_data_loader = val_dataset_wrapper.get_loader()
        val_seq_num = val_dataset_wrapper.get_seq_num()
        monitor_val = nii_monitor.Monitor(epoch_num, val_seq_num)
    else:
        monitor_val = None

    # training log information
    train_log = ''

    # prepare for DataParallism if available
    # pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    if torch.cuda.device_count() > 1 and args.multi_gpu_data_parallel:
        flag_multi_device = True
        nii_display.f_print("\nUse %d GPUs\n" % (torch.cuda.device_count()))
        # no way to call normtarget_f after pt_model is in DataParallel
        normtarget_f = pt_model.normalize_target
        pt_model = nn.DataParallel(pt_model)
    else:
        nii_display.f_print("\nUse single GPU: %s\n" % \
                            (torch.cuda.get_device_name(device)))
        flag_multi_device = False
        normtarget_f = None
    pt_model.to(device, dtype=nii_dconf.d_dtype)

    # print the network
    nii_nn_tools.f_model_show(pt_model)
    nii_nn_tools.f_loss_show(loss_wrapper)

    ###############################
    ## Resume training if necessary
    ###############################
    # resume training or initialize the model if necessary
    cp_names = nii_nn_manage_conf.CheckPointKey()
    if checkpoint is not None:
        if type(checkpoint) is dict:
            # checkpoint
            # load model parameter and optimizer state
            if cp_names.state_dict in checkpoint:
                # wrap the state_dic in f_state_dict_wrapper
                # in case the model is saved when DataParallel is on
                pt_model.load_state_dict(
                    nii_nn_tools.f_state_dict_wrapper(
                        checkpoint[cp_names.state_dict],
                        flag_multi_device))
            # load optimizer state
            if cp_names.optimizer in checkpoint and \
               not args.ignore_optimizer_statistics_in_trained_model:
                optimizer.load_state_dict(checkpoint[cp_names.optimizer])
            # optionally, load training history
            if not args.ignore_training_history_in_trained_model:
                #nii_display.f_print("Load ")
                if cp_names.trnlog in checkpoint:
                    monitor_trn.load_state_dic(
                        checkpoint[cp_names.trnlog])
                if cp_names.vallog in checkpoint and monitor_val:
                    monitor_val.load_state_dic(
                        checkpoint[cp_names.vallog])
                if cp_names.info in checkpoint:
                    train_log = checkpoint[cp_names.info]
                if cp_names.lr_scheduler in checkpoint and \
                   checkpoint[cp_names.lr_scheduler] and \
                   lr_scheduler.f_valid():
                    lr_scheduler.f_load_state_dict(
                        checkpoint[cp_names.lr_scheduler])
                nii_display.f_print("Load check point, resume training")
            else:
                nii_display.f_print("Load pretrained model and optimizer")
        else:
            # only model status
            pt_model.load_state_dict(
                nii_nn_tools.f_state_dict_wrapper(
                    checkpoint, flag_multi_device))
            nii_display.f_print("Load pretrained model")

    ######################
    ### User defined setup
    ######################
    if hasattr(pt_model, "other_setups"):
        nii_display.f_print("Conduct User-defined setup")
        pt_model.other_setups()
    # This should be merged with other_setups
    if hasattr(pt_model, "g_pretrained_model_path") and \
       hasattr(pt_model, "g_pretrained_model_prefix"):
        nii_display.f_print("Load pret-rained models as part of this mode")
        nii_nn_tools.f_load_pretrained_model_partially(
            pt_model, pt_model.g_pretrained_model_path,
            pt_model.g_pretrained_model_prefix)

    ######################
    ### Start training
    ######################
    # other variables
    flag_early_stopped = False
    start_epoch = monitor_trn.get_epoch()
    epoch_num = monitor_trn.get_max_epoch()

    # print
    _ = nii_op_display_tk.print_log_head()
    nii_display.f_print_message(train_log, flush=True, end='')

    # loop over multiple epochs
    for epoch_idx in range(start_epoch, epoch_num):

        # training one epoch
        pt_model.train()
        # set validation flag if necessary
        if hasattr(pt_model, 'validation'):
            pt_model.validation = False
            mes = "Warning: model.validation is deprecated, "
            mes += "please use model.flag_validation"
            nii_display.f_print(mes, 'warning')
        if hasattr(pt_model, 'flag_validation'):
            pt_model.flag_validation = False

        f_run_one_epoch(args, pt_model, loss_wrapper, device, \
                        monitor_trn, train_data_loader, \
                        epoch_idx, optimizer, normtarget_f)
        time_trn = monitor_trn.get_time(epoch_idx)
        loss_trn = monitor_trn.get_loss(epoch_idx)

        # if necessary, do validataion
        if val_dataset_wrapper is not None:
            # set eval() if necessary
            if args.eval_mode_for_validation:
                pt_model.eval()
            # set validation flag if necessary
            if hasattr(pt_model, 'validation'):
                pt_model.validation = True
                mes = "Warning: model.validation is deprecated, "
                mes += "please use model.flag_validation"
                nii_display.f_print(mes, 'warning')
            if hasattr(pt_model, 'flag_validation'):
                pt_model.flag_validation = True

            with torch.no_grad():
                f_run_one_epoch(args, pt_model, loss_wrapper, \
                                device, \
                                monitor_val, val_data_loader, \
                                epoch_idx, None, normtarget_f)
            time_val = monitor_val.get_time(epoch_idx)
            loss_val = monitor_val.get_loss(epoch_idx)

            # update lr rate scheduler if necessary
            if lr_scheduler.f_valid():
                lr_scheduler.f_step(loss_val)
        else:
            time_val, loss_val = 0, 0

        if val_dataset_wrapper is not None:
            flag_new_best = monitor_val.is_new_best()
        else:
            # without a validation set, every epoch counts as "best"
            flag_new_best = True

        # print information
        train_log += nii_op_display_tk.print_train_info(
            epoch_idx, time_trn, loss_trn, time_val, loss_val,
            flag_new_best, optimizer_wrapper.get_lr_info())

        # save the best model
        if flag_new_best:
            tmp_best_name = nii_nn_tools.f_save_trained_name(args)
            torch.save(pt_model.state_dict(), tmp_best_name)

        # save intermediate model if necessary
        if not args.not_save_each_epoch:
            tmp_model_name = nii_nn_tools.f_save_epoch_name(args, epoch_idx)

            if monitor_val is not None:
                tmp_val_log = monitor_val.get_state_dic()
            else:
                tmp_val_log = None

            if lr_scheduler.f_valid():
                lr_scheduler_state = lr_scheduler.f_state_dict()
            else:
                lr_scheduler_state = None

            # save
            tmp_dic = {
                cp_names.state_dict : pt_model.state_dict(),
                cp_names.info : train_log,
                cp_names.optimizer : optimizer.state_dict(),
                cp_names.trnlog : monitor_trn.get_state_dic(),
                cp_names.vallog : tmp_val_log,
                cp_names.lr_scheduler : lr_scheduler_state
            }
            torch.save(tmp_dic, tmp_model_name)
            if args.verbose == 1:
                nii_display.f_eprint(str(datetime.datetime.now()))
                nii_display.f_eprint("Save {:s}".format(tmp_model_name),
                                     flush=True)

        # Early stopping
        # note: if LR scheduler is used, early stopping will be
        # disabled
        if lr_scheduler.f_allow_early_stopping() and \
           monitor_val is not None and \
           monitor_val.should_early_stop(no_best_epoch_num):
            flag_early_stopped = True
            break

    # loop done
    nii_op_display_tk.print_log_tail()
    if flag_early_stopped:
        nii_display.f_print("Training finished by early stopping")
    else:
        nii_display.f_print("Training finished")
    nii_display.f_print("Model is saved to", end='')
    nii_display.f_print("{}".format(nii_nn_tools.f_save_trained_name(args)))
    return
def f_run_one_epoch(args, pt_model, loss_wrapper, \
                    device, monitor, \
                    data_loader, epoch_idx, optimizer = None, \
                    target_norm_method = None):
    """
    f_run_one_epoch:
       run one poech over the dataset (for training or validation sets)

    Args:
       args:         from argpase
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target)
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_procfess_monitor.py
       data_loader:  pytorch DataLoader.
       epoch_idx:    int, index of the current epoch
       optimizer:    torch optimizer or None
                     if None, the back propgation will be skipped
                     (for developlement set)
       target_norm_method: method to normalize target data
                           (by default, use pt_model.normalize_target)
    """
    # timer
    start_time = time.time()

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        #############
        # prepare
        #############
        # idx_orig is the original idx in the dataset
        # which can be different from data_idx when shuffle = True
        #idx_orig = idx_orig.numpy()[0]
        #data_seq_info = data_info[0]

        # send data to device
        if optimizer is not None:
            optimizer.zero_grad()

        ############
        # compute output
        ############
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device,
                                          dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # specifcal case when model.forward requires data_info
                data_gen = pt_model(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model(data_in)

        #####################
        # compute loss and do back propagate
        #####################
        # Two cases
        # 1. if loss is defined as pt_model.loss, then let the users do
        #    normalization inside the pt_mode.loss
        # 2. if loss_wrapper is defined as a class independent from model
        #    there is no way to normalize the data inside the loss_wrapper
        #    because the normalization weight is saved in pt_model
        if hasattr(pt_model, 'loss'):
            # case 1, pt_model.loss is available
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            else:
                data_tar = []
            loss_computed = pt_model.loss(data_gen, data_tar)
        else:
            # case 2, loss is defined independent of pt_model
            if isinstance(data_tar, torch.Tensor):
                data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
                # there is no way to normalize the data inside loss
                # thus, do normalization here
                if target_norm_method is None:
                    normed_target = pt_model.normalize_target(data_tar)
                else:
                    normed_target = target_norm_method(data_tar)
            else:
                normed_target = []

            # return the loss from loss_wrapper
            # loss_computed may be [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
            #   which contain multiple loss and flags indicating whether
            #   the corresponding loss should be taken into consideration
            #   for early stopping
            # or
            # loss_computed may be simply a tensor loss
            loss_computed = loss_wrapper.compute(data_gen, normed_target)

        loss_values = [0]
        # To handle cases where there are multiple loss functions
        # when loss_comptued is [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
        #   loss: sum of [loss_1, loss_2, ...], for backward()
        #   loss_values: [loss_1.item(), loss_2.item() ..], for logging
        #   loss_flags: [True/False, ...], for logging,
        #     whether loss_n is used for early stopping
        # when loss_computed is loss
        #   loss: loss
        #   los_vals: [loss.item()]
        #   loss_flags: [True]
        loss, loss_values, loss_flags = nii_nn_tools.f_process_loss(
            loss_computed)

        # Back-propgation using the summed loss
        if optimizer is not None:
            # backward propagation
            loss.backward()

            # apply gradient clip
            if args.grad_clip_norm > 0:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    pt_model.parameters(), args.grad_clip_norm)

            # update parameters
            optimizer.step()

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the the batch, thus, just loss_value
            # rather loss_value / batchsize
            monitor.log_loss(loss_values, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print infor for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

        # Save intermediate model for every n mini-batches (optional).
        # Note that if we re-start trainining with this intermediate model,
        # the data will start from the 1st sample, not the one where we
        # stopped
        if args.save_model_every_n_minibatches > 0 \
           and (data_idx+1) % args.save_model_every_n_minibatches == 0 \
           and optimizer is not None and data_idx > 0:
            cp_names = nii_nn_manage_conf.CheckPointKey()
            tmp_model_name = nii_nn_tools.f_save_epoch_name(
                args, epoch_idx, '_{:05d}'.format(data_idx + 1))
            # save
            tmp_dic = {
                cp_names.state_dict : pt_model.state_dict(),
                cp_names.optimizer : optimizer.state_dict()
            }
            torch.save(tmp_dic, tmp_model_name)

    # loop done
    return
def f_online_mean_std(data, mean_old, var_old, cnt_old):
    """ mean, var, count = f_online_mean_std(data, mean, var, num_count)
    online algorithm to accumulate mean and var

    Args:
      data: input data as numpy.array, in shape [length, dimension]
      mean: mean to be updated, np.array [dimension]
      var: var to be updated, np.array [dimension]
      num_count: how many data rows have been calculated before
        this calling.

    Return:
      mean: mean, np.array [dimension]
      var: var, np.array [dimension]
      count: accumulated data number, = num_count + data.shape[0]

    Ref. parallel algorithm
    https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    """
    try:
        # how many time steps (number of rows) in this data
        cnt_this = data.shape[0]

        # if input data is empty, don't update
        if cnt_this == 0:
            return mean_old, var_old, cnt_old

        if data.ndim == 1:
            # single dimension data, 1d array
            # mean/var over the whole array; accumulated stats have dim 1
            mean_this = data.mean()
            var_this = data.var()
            dim = 1
        else:
            # multiple dimension data, 2d array
            # per-column mean/var, stats have dim = number of columns
            mean_this = data.mean(axis=0)
            var_this = data.var(axis=0)
            dim = data.shape[1]

        # difference of accumulated mean and data mean
        diff_mean = mean_this - mean_old

        # new mean and var
        new_mean = np.zeros([dim], dtype=nii_dconf.h_dtype)
        new_var = np.zeros([dim], dtype=nii_dconf.h_dtype)

        # update count
        updated_count = cnt_old + cnt_this

        # update mean: mean_old + delta * n_b / (n_a + n_b)
        new_mean = mean_old + diff_mean * (float(cnt_this) /
                                           (cnt_old + cnt_this))

        # update var
        if cnt_old == 0:
            # if this is the first data
            if data.ndim == 1:
                # remember that var is array, not scalar
                new_var[0] = var_this
            else:
                new_var = var_this
        else:
            # not first data
            # parallel variance combination (Chan et al.):
            # var = var_a * n_a/n + var_b * n_b/n + delta^2 * n_a*n_b/n^2,
            # where n_a*n_b/n^2 == 1 / (n_b/n_a + n_a/n_b + 2)
            new_var = (var_old * (float(cnt_old) / updated_count)
                       + var_this * (float(cnt_this) / updated_count)
                       + (diff_mean * diff_mean
                          / (float(cnt_this) / cnt_old
                             + float(cnt_old) / cnt_this
                             + 2.0)))
        # done
        return new_mean, new_var, updated_count

    except ValueError:
        # broadcasting above raises ValueError when the incoming data
        # dimension does not match the accumulated statistics
        if data.ndim > 1:
            if data.shape[1] != mean_old.shape[0] or \
               data.shape[1] != var_old.shape[0]:
                nii_display.f_print("Dimension incompatible", "error")
                nii_display.f_die("Error in online mean var calculation")
        else:
            if mean_old.shape[0] != 1 or \
               var_old.shape[0] != 1:
                nii_display.f_print("Dimension incompatible", "error")
                nii_display.f_die("Error in online mean var calculation")
def main():
    """ main(): the default wrapper for training and inference process
    Please prepare config.py and model.py

    Loads the project-specific config/model modules named by the command
    line, builds data loaders, then dispatches to training
    (f_train_wrapper) or inference (f_inference_wrapper).
    """
    # arguments initialization
    args = nii_arg_parse.f_args_parsed()

    #
    nii_warn.f_print_w_date("Start program", level='h')
    nii_warn.f_print("Load module: %s" % (args.module_config))
    nii_warn.f_print("Load module: %s" % (args.module_model))
    prj_conf = importlib.import_module(args.module_config)
    prj_model = importlib.import_module(args.module_model)

    # initialization
    nii_startup.set_random_seed(args.seed, args)
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    # prepare data io
    if not args.inference:
        # ------ training branch ------
        params = {'batch_size': args.batch_size,
                  'shuffle': args.shuffle,
                  'num_workers': args.num_workers}

        # Load file list and create data loader
        trn_lst = nii_list_tool.read_list_from_text(prj_conf.trn_list)
        trn_set = nii_dset.NIIDataSetLoader(
            prj_conf.trn_set_name, \
            trn_lst,
            prj_conf.input_dirs, \
            prj_conf.input_exts, \
            prj_conf.input_dims, \
            prj_conf.input_reso, \
            prj_conf.input_norm, \
            prj_conf.output_dirs, \
            prj_conf.output_exts, \
            prj_conf.output_dims, \
            prj_conf.output_reso, \
            prj_conf.output_norm, \
            './',
            params = params,
            truncate_seq = prj_conf.truncate_seq,
            min_seq_len = prj_conf.minimum_len,
            save_mean_std = True,
            wav_samp_rate = prj_conf.wav_samp_rate)

        # optional validation set
        if prj_conf.val_list is not None:
            val_lst = nii_list_tool.read_list_from_text(prj_conf.val_list)
            val_set = nii_dset.NIIDataSetLoader(
                prj_conf.val_set_name,
                val_lst,
                prj_conf.input_dirs, \
                prj_conf.input_exts, \
                prj_conf.input_dims, \
                prj_conf.input_reso, \
                prj_conf.input_norm, \
                prj_conf.output_dirs, \
                prj_conf.output_exts, \
                prj_conf.output_dims, \
                prj_conf.output_reso, \
                prj_conf.output_norm, \
                './', \
                params = params,
                truncate_seq= prj_conf.truncate_seq,
                min_seq_len = prj_conf.minimum_len,
                save_mean_std = False,
                wav_samp_rate = prj_conf.wav_samp_rate)
        else:
            val_set = None

        # initialize the model and loss function
        model = prj_model.Model(trn_set.get_in_dim(), \
                                trn_set.get_out_dim(), \
                                args, trn_set.get_data_mean_std())
        loss_wrapper = prj_model.Loss(args)

        # initialize the optimizer
        optimizer_wrapper = nii_op_wrapper.OptimizerWrapper(model, args)

        # if necessary, resume training
        if args.trained_model == "":
            checkpoint = None
        else:
            checkpoint = torch.load(args.trained_model)

        # start training
        nii_nn_wrapper.f_train_wrapper(args, model,
                                       loss_wrapper, device,
                                       optimizer_wrapper,
                                       trn_set, val_set, checkpoint)
        # done for traing
    else:
        # ------ inference branch ------
        # default, no truncating, no shuffling
        params = {'batch_size': args.batch_size,
                  'shuffle': False,
                  'num_workers': args.num_workers}

        # test_list may already be a Python list, or a path to a text file
        if type(prj_conf.test_list) is list:
            t_lst = prj_conf.test_list
        else:
            t_lst = nii_list_tool.read_list_from_text(prj_conf.test_list)

        test_set = nii_dset.NIIDataSetLoader(
            prj_conf.test_set_name, \
            t_lst, \
            prj_conf.test_input_dirs,
            prj_conf.input_exts,
            prj_conf.input_dims,
            prj_conf.input_reso,
            prj_conf.input_norm,
            prj_conf.test_output_dirs,
            prj_conf.output_exts,
            prj_conf.output_dims,
            prj_conf.output_reso,
            prj_conf.output_norm,
            './',
            params = params,
            truncate_seq= None,
            min_seq_len = None,
            save_mean_std = False,
            wav_samp_rate = prj_conf.wav_samp_rate)

        # initialize model (no mean/std: normalization comes from checkpoint)
        model = prj_model.Model(test_set.get_in_dim(), \
                                test_set.get_out_dim(), \
                                args)

        if args.trained_model == "":
            # fall back to the default trained-model file name
            # (fixed message: the option is --trained-model, not ---trained-model)
            print("No model is loaded by --trained-model for inference")
            print("By default, load %s%s" % (args.save_trained_name,
                                             args.save_model_ext))
            checkpoint = torch.load(
                "%s%s" % (args.save_trained_name, args.save_model_ext))
        else:
            checkpoint = torch.load(args.trained_model)

        # do inference and output data
        nii_nn_wrapper.f_inference_wrapper(args, model, device, \
                                           test_set, checkpoint)
    # done
    return
def f_run_one_epoch(args, pt_model, loss_wrapper, \
                    device, monitor,  \
                    data_loader, epoch_idx, optimizer = None, \
                    target_norm_method = None):
    """
    f_run_one_epoch:
       run one poech over the dataset (for training or validation sets)

    Args:
       args:         from argpase
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target)
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_procfess_monitor.py
       data_loader:  pytorch DataLoader.
       epoch_idx:    int, index of the current epoch
       optimizer:    torch optimizer or None
                     if None, the back propgation will be skipped
                     (for developlement set)
       target_norm_method: method to normalize target data
                           (by default, use pt_model.normalize_target)
    """
    # timer
    start_time = time.time()

    # loop over samples
    pbar = tqdm(data_loader)
    epoch_num = monitor.get_max_epoch()
    for data_idx, (data_in, data_tar, data_info, idx_orig) in enumerate(pbar):
        pbar.set_description("Epoch: {}/{}".format(epoch_idx, epoch_num))

        # idx_orig is the original idx in the dataset
        # which can be different from data_idx when shuffle = True
        #idx_orig = idx_orig.numpy()[0]
        #data_seq_info = data_info[0]

        # send data to device
        if optimizer is not None:
            optimizer.zero_grad()

        # compute
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # specifcal case when model.forward requires data_info
                data_gen = pt_model(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model(data_in)

        # compute loss and do back propagate
        # initialize BOTH logging lists so monitor.log_loss below never
        # sees an unbound name when data_tar is not a tensor
        # (bug fix: loss_flags used to be defined only inside the branch)
        loss_vals = [0]
        loss_flags = [True]
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            if target_norm_method is None:
                normed_target = pt_model.normalize_target(data_tar)
            else:
                normed_target = target_norm_method(data_tar)

            # return the loss from loss_wrapper
            # loss_computed may be [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
            #   which contain multiple loss and flags indicating whether
            #   the corresponding loss should be taken into consideration
            #   for early stopping
            # or
            # loss_computed may be simply a tensor loss
            loss_computed = loss_wrapper.compute(data_gen, normed_target)

            # To handle cases where there are multiple loss functions
            # when loss_comptued is [[loss_1, loss_2, ...],[flag_1, flag_2,.]]
            #   loss: sum of [loss_1, loss_2, ...], for backward()
            #   loss_vals: [loss_1.item(), loss_2.item() ..], for logging
            #   loss_flags: [True/False, ...], for logging,
            #     whether loss_n is used for early stopping
            # when loss_computed is loss
            #   loss: loss
            #   loss_vals: [loss.item()]
            #   loss_flags: [True]
            loss, loss_vals, loss_flags = nii_nn_tools.f_process_loss(
                loss_computed)

            # Back-propgation using the summed loss
            if optimizer is not None:
                loss.backward()
                optimizer.step()

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the the batch, thus, just loss_value
            # rather loss_value / batchsize
            monitor.log_loss(loss_vals, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print infor for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # lopp done
    pbar.close()
    return
    def __init__(self, dataset_name, \
                 file_list, \
                 input_dirs, input_exts, input_dims, input_reso, \
                 input_norm, \
                 output_dirs, output_exts, output_dims, output_reso, \
                 output_norm, \
                 stats_path, \
                 data_format = '<f4', \
                 truncate_seq = None, \
                 min_seq_len = None, \
                 save_mean_std = True, \
                 wav_samp_rate = None):
        """
        Initialize the data set: validate the configuration, fill in
        default resolutions/normalization flags, and load (or compute)
        per-set statistics (sequence lengths, mean/std).

        Args:
          dataset_name: name of this data set
          file_list: a list of file name strings (without extension)
          input_dirs: a list of dirs from which each input feature is loaded
          input_exts: a list of input feature name extentions
          input_dims: a list of input feature dimensions
          input_reso: a list of input feature temporal resolutions
          input_norm: a list of bool, whether to normalize each input feature
          output_dirs: a list of dirs from which each output feature is loaded
          output_exts: a list of output feature name extentions
          output_dims: a list of output feature dimensions
          output_reso: a list of output feature temporal resolutions
          output_norm: a list of bool, whether to normalize each output feature
          stats_path: path to the directory that saves mean/std,
            utterance length
          data_format: method to load the data
            '<f4' (default): load data as float32 little-endian
            (NOTE: only '<f4' is accepted by this implementation; any
             other value, including 'htk', terminates with an error below)
          truncate_seq: None or int, truncate sequence into truncks.
            truncate_seq > 0 specifies the trunck length
          min_seq_len: None or int, minimum sequence length to keep
          save_mean_std: bool, save computed mean/std to disk (default True)
          wav_samp_rate: None or int, sampling rate for waveform features
        """
        # initialization
        self.m_set_name = dataset_name
        self.m_file_list = file_list
        self.m_input_dirs = input_dirs
        self.m_input_exts = input_exts
        self.m_input_dims = input_dims

        self.m_output_dirs = output_dirs
        self.m_output_exts = output_exts
        self.m_output_dims = output_dims

        # input dirs/exts/dims must be parallel lists of the same length
        if len(self.m_input_dirs) != len(self.m_input_exts) or \
           len(self.m_input_dirs) != len(self.m_input_dims):
            nii_warn.f_print("Input dirs, exts, dims, unequal length",
                             'error')
            nii_warn.f_print(str(self.m_input_dirs), 'error')
            nii_warn.f_print(str(self.m_input_exts), 'error')
            nii_warn.f_print(str(self.m_input_dims), 'error')
            nii_warn.f_die("Please check input dirs, exts, dims")

        # output lists must agree too (dirs checked only when non-empty,
        # since an output-less configuration is allowed)
        if len(self.m_output_dims) != len(self.m_output_exts) or \
           (self.m_output_dirs and \
            len(self.m_output_dirs) != len(self.m_output_exts)):
            nii_warn.f_print("Output dirs, exts, dims, unequal length", \
                             'error')
            nii_warn.f_die("Please check output dirs, exts, dims")

        # fill in m_*_reso and m_*_norm
        # when reso/norm lists are not given, use the default value for
        # every feature stream
        def _tmp_f(list2, default_value, length):
            if list2 is None:
                return [default_value for x in range(length)]
            else:
                return list2

        self.m_input_reso = _tmp_f(input_reso, 1, len(input_dims))
        self.m_input_norm = _tmp_f(input_norm, True, len(input_dims))
        self.m_output_reso = _tmp_f(output_reso, 1, len(output_dims))
        self.m_output_norm = _tmp_f(output_norm, True, len(output_dims))
        if len(self.m_input_reso) != len(self.m_input_dims):
            nii_warn.f_die("Please check input_reso")
        if len(self.m_output_reso) != len(self.m_output_dims):
            nii_warn.f_die("Please check output_reso")
        if len(self.m_input_norm) != len(self.m_input_dims):
            nii_warn.f_die("Please check input_norm")
        if len(self.m_output_norm) != len(self.m_output_dims):
            nii_warn.f_die("Please check output_norm")

        # dimensions
        # total dims of the concatenated input / output feature vectors
        self.m_input_all_dim = sum(self.m_input_dims)
        self.m_output_all_dim = sum(self.m_output_dims)
        self.m_io_dim = self.m_input_all_dim + self.m_output_all_dim

        self.m_truncate_seq = truncate_seq
        self.m_min_seq_len = min_seq_len
        self.m_save_ms = save_mean_std

        # in case there is waveform data in input or output features
        self.m_wav_sr = wav_samp_rate

        # sanity check on resolution configuration
        # currently, only input features can have different reso,
        # and the m_input_reso must be the same for all input features
        if any([x != self.m_input_reso[0] for x in self.m_input_reso]):
            nii_warn.f_print("input_reso: %s" % (str(self.m_input_reso)),\
                             'error')
            nii_warn.f_print("NIIDataSet not support", 'error', end='')
            nii_warn.f_die(" different input_reso")
        if any([x != 1 for x in self.m_output_reso]):
            nii_warn.f_print("NIIDataSet only supports", 'error', end='')
            nii_warn.f_die(" output_reso = [1, 1, ... 1]")
        self.m_single_reso = self.m_input_reso[0]

        # To make sure that target waveform length is exactly equal
        #  to the up-sampled sequence length
        # self.m_truncate_seq must be changed to be N * up_sample
        if self.m_truncate_seq is not None:
            # assume input resolution is the same
            self.m_truncate_seq = self.f_adjust_len(self.m_truncate_seq)

        # method to load/write raw data
        if data_format == '<f4':
            self.f_load_data = _data_reader
            self.f_length_data = _data_len_reader
            # writer needs the sampling rate for waveform output
            self.f_write_data = lambda x, y: _data_writer(x, y, \
                                                          self.m_wav_sr)
        else:
            nii_warn.f_print("Unsupported dtype %s" % (data_format))
            nii_warn.f_die("Only supports np.float32 <f4")

        # check the validity of data
        self.f_check_file_list()

        # log down statiscs
        #  1. length of each data utterance
        #  2. mean / std of feature feature file
        def get_name(stats_path, set_name, file_name):
            # stats files are named "<set_name>_<file_name>" under stats_path
            tmp = set_name + '_' + file_name
            return os.path.join(stats_path, tmp)

        self.m_ms_input_path = get_name(stats_path, self.m_set_name, \
                                        nii_dconf.mean_std_i_file)
        self.m_ms_output_path = get_name(stats_path, self.m_set_name, \
                                         nii_dconf.mean_std_o_file)
        self.m_data_len_path = get_name(stats_path, self.m_set_name, \
                                        nii_dconf.data_len_file)

        # initialize data length and mean /std
        # flags indicate whether stats must be (re-)computed from data
        flag_cal_len = self.f_init_data_len_stats(self.m_data_len_path)
        flag_cal_mean_std = self.f_init_mean_std(self.m_ms_input_path,
                                                 self.m_ms_output_path)

        # if data information is not available, read it again from data
        if flag_cal_len or flag_cal_mean_std:
            self.f_calculate_stats(flag_cal_len, flag_cal_mean_std)

        # check
        if self.__len__() < 1:
            nii_warn.f_print("Fail to load any data", "error")
            nii_warn.f_die("Please check configuration")
        # done
        return
def f_run_one_epoch_WGAN(
        args, pt_model_G, pt_model_D, loss_wrapper, \
        device, monitor,  \
        data_loader, epoch_idx, \
        optimizer_G = None, optimizer_D = None, \
        target_norm_method = None):
    """
    f_run_one_epoch_WGAN:
       similar to f_run_one_epoch_GAN, but for WGAN

    Args:
       args:         from argpase
       pt_model_G:   generator, pytorch model (torch.nn.Module)
       pt_model_D:   discriminator (critic), pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss functions
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_procfess_monitor.py
       data_loader:  pytorch DataLoader.
       epoch_idx:    int, index of the current epoch
       optimizer_G:  torch optimizer or None, for generator
       optimizer_D:  torch optimizer or None, for discriminator
                     if None, back propagation is skipped (validation)
       target_norm_method: method to normalize target data
                           (by default, use pt_model_G.normalize_target)
    """
    # timer
    start_time = time.time()

    # number of critic iterations per generator update (default 5)
    # NOTE bug fix: the attribute names must use underscores; argparse
    # converts "--wgan-critic-num" to dest "wgan_critic_num", so
    # hasattr(args, "wgan-critic-num") was always False and the
    # configured value was silently ignored
    if hasattr(args, "wgan_critic_num"):
        num_critic = args.wgan_critic_num
    else:
        num_critic = 5
    # weight-clipping value for the critic (same fix as above)
    if hasattr(args, "wgan_clamp"):
        wgan_clamp = args.wgan_clamp
    else:
        wgan_clamp = 0.01

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        # send data to device
        if optimizer_G is not None:
            optimizer_G.zero_grad()
        if optimizer_D is not None:
            optimizer_D.zero_grad()

        # prepare data
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            if target_norm_method is None:
                normed_target = pt_model_G.normalize_target(data_tar)
            else:
                normed_target = target_norm_method(data_tar)
        else:
            nii_display.f_die("target data is required")

        # to device (we assume noise will be generated by the model itself)
        # here we only provide external condition
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)

        ############################
        # Update Discriminator
        ############################
        # train with real
        pt_model_D.zero_grad()
        d_out_real = pt_model_D(data_tar)
        errD_real = loss_wrapper.compute_gan_D_real(d_out_real)
        if optimizer_D is not None:
            errD_real.backward()
        d_out_real_mean = d_out_real.mean()

        # train with fake
        #  generate sample
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model_G(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model_G(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # specifcal case when model.forward requires data_info
                data_gen = pt_model_G(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model_G(data_in)

        # data_gen.detach() is required
        #  https://github.com/pytorch/examples/issues/116
        d_out_fake = pt_model_D(data_gen.detach())
        errD_fake = loss_wrapper.compute_gan_D_fake(d_out_fake)
        if optimizer_D is not None:
            errD_fake.backward()
        d_out_fake_mean = d_out_fake.mean()

        # total critic loss (kept for reference; gradients already
        # accumulated by the two backward() calls above)
        errD = errD_real + errD_fake

        if optimizer_D is not None:
            optimizer_D.step()

        # clip weights of discriminator (WGAN weight clipping)
        for p in pt_model_D.parameters():
            p.data.clamp_(-wgan_clamp, wgan_clamp)

        ############################
        # Update Generator
        ############################
        pt_model_G.zero_grad()
        d_out_fake_for_G = pt_model_D(data_gen)
        errG_gan = loss_wrapper.compute_gan_G(d_out_fake_for_G)
        errG_aux = loss_wrapper.compute_aux(data_gen, data_tar)
        errG = errG_gan + errG_aux

        # only update after num_crictic iterations on discriminator
        if data_idx % num_critic == 0 and optimizer_G is not None:
            errG.backward()
            optimizer_G.step()

        d_out_fake_for_G_mean = d_out_fake_for_G.mean()

        # construct the loss for logging and early stopping
        # only use errG_aux for early-stopping
        loss_computed = [
            [errG_aux, errG_gan, errD_real, errD_fake,
             d_out_real_mean, d_out_fake_mean, d_out_fake_for_G_mean],
            [True, False, False, False, False, False, False]]

        # to handle cases where there are multiple loss functions
        loss, loss_vals, loss_flags = nii_nn_tools.f_process_loss(
            loss_computed)

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the the batch, thus, just loss_value
            # rather loss_value / batchsize
            monitor.log_loss(loss_vals, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print infor for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # lopp done
    return
def f_run_one_epoch(args, pt_model, loss_wrapper, \
                    device, monitor,  \
                    data_loader, epoch_idx, optimizer = None):
    """
    f_run_one_epoch:
       run one poech over the dataset (for training or validation sets)

    Args:
       args:         from argpase
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target)
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_procfess_monitor.py
       data_loader:  pytorch DataLoader.
       epoch_idx:    int, index of the current epoch
       optimizer:    torch optimizer or None
                     if None, the back propgation will be skipped
                     (for developlement set)
    """
    # timer
    start_time = time.time()

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        # idx_orig is the original idx in the dataset
        # which can be different from data_idx when shuffle = True
        #idx_orig = idx_orig.numpy()[0]
        #data_seq_info = data_info[0]

        # send data to device
        if optimizer is not None:
            optimizer.zero_grad()

        # compute
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                data_gen = pt_model(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                # fixed message: original had a double negative
                # ("but no data_tar is not loaded"); now consistent with
                # the sibling f_run_one_epoch implementation
                nii_display.f_die("but data_tar is not loaded")
        else:
            # normal case for model.forward(input)
            data_gen = pt_model(data_in)

        # compute loss and do back propagate
        loss_value = 0
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            normed_target = pt_model.normalize_target(data_tar)
            loss = loss_wrapper.compute(data_gen, normed_target)
            loss_value = loss.item()
            if optimizer is not None:
                loss.backward()
                optimizer.step()

        # log down process information
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            monitor.log_loss(loss_value / batchsize, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print infor for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
            #
        # start the timer for a new batch
        start_time = time.time()

    # lopp done
    return
def f_train_wrapper_GAN(
        args, pt_model_G, pt_model_D, loss_wrapper, device, \
        optimizer_G_wrapper, optimizer_D_wrapper, \
        train_dataset_wrapper, \
        val_dataset_wrapper = None, \
        checkpoint_G = None, checkpoint_D = None):
    """
    f_train_wrapper_GAN(
       args, pt_model_G, pt_model_D, loss_wrapper, device,
       optimizer_G_wrapper, optimizer_D_wrapper,
       train_dataset_wrapper, val_dataset_wrapper = None,
       check_point = None):

      A wrapper to run the training process

    Args:
       args:         argument information given by argpase
       pt_model_G:   generator, pytorch model (torch.nn.Module)
       pt_model_D:   discriminator, pytorch model (torch.nn.Module)

       loss_wrapper: a wrapper over loss functions
                     loss_wrapper.compute_D_real(discriminator_output)
                     loss_wrapper.compute_D_fake(discriminator_output)
                     loss_wrapper.compute_G(discriminator_output)
                     loss_wrapper.compute_G(fake, real)

       device:       torch.device("cuda") or torch.device("cpu")

       optimizer_G_wrapper:
           a optimizer wrapper for generator (defined in op_manager.py)
       optimizer_D_wrapper:
           a optimizer wrapper for discriminator (defined in op_manager.py)

       train_dataset_wrapper:
           a wrapper over training data set (data_io/default_data_io.py)
           train_dataset_wrapper.get_loader() returns torch.DataSetLoader

       val_dataset_wrapper:
           a wrapper over validation data set (data_io/default_data_io.py)
           it can None.

       checkpoint_G:
           a check_point that stores every thing to resume training
       checkpoint_D:
           a check_point that stores every thing to resume training
    """
    nii_display.f_print_w_date("Start model training")

    # get the optimizer
    optimizer_G_wrapper.print_info()
    optimizer_D_wrapper.print_info()
    optimizer_G = optimizer_G_wrapper.optimizer
    optimizer_D = optimizer_D_wrapper.optimizer
    # epoch/early-stop budgets are taken from the generator's wrapper
    epoch_num = optimizer_G_wrapper.get_epoch_num()
    no_best_epoch_num = optimizer_G_wrapper.get_no_best_epoch_num()

    # get data loader for training set
    train_dataset_wrapper.print_info()
    train_data_loader = train_dataset_wrapper.get_loader()
    train_seq_num = train_dataset_wrapper.get_seq_num()

    # get the training process monitor
    monitor_trn = nii_monitor.Monitor(epoch_num, train_seq_num)

    # if validation data is provided, get data loader for val set
    if val_dataset_wrapper is not None:
        val_dataset_wrapper.print_info()
        val_data_loader = val_dataset_wrapper.get_loader()
        val_seq_num = val_dataset_wrapper.get_seq_num()
        monitor_val = nii_monitor.Monitor(epoch_num, val_seq_num)
    else:
        monitor_val = None

    # training log information
    train_log = ''
    # suffixes used to name saved generator / discriminator models
    model_tags = ["_G", "_D"]

    # prepare for DataParallism if available
    # pytorch.org/tutorials/beginner/blitz/data_parallel_tutorial.html
    if torch.cuda.device_count() > 1 and args.multi_gpu_data_parallel:
        nii_display.f_die("data_parallel not implemented for GAN")
    else:
        nii_display.f_print("Use single GPU: %s" % \
                            (torch.cuda.get_device_name(device)))
        flag_multi_device = False
        normtarget_f = None
    pt_model_G.to(device, dtype=nii_dconf.d_dtype)
    pt_model_D.to(device, dtype=nii_dconf.d_dtype)

    # print the network
    nii_display.f_print("Setup generator")
    f_model_show(pt_model_G)
    nii_display.f_print("Setup discriminator")
    f_model_show(pt_model_D)

    # resume training or initialize the model if necessary
    cp_names = CheckPointKey()
    if checkpoint_G is not None or checkpoint_D is not None:
        # load G and D (with their optimizers) in one loop
        for checkpoint, optimizer, pt_model, model_name in \
            zip([checkpoint_G, checkpoint_D], [optimizer_G, optimizer_D],
                [pt_model_G, pt_model_D], ["Generator", "Discriminator"]):
            nii_display.f_print("For %s" % (model_name))
            if type(checkpoint) is dict:
                # checkpoint
                # load model parameter and optimizer state
                if cp_names.state_dict in checkpoint:
                    # wrap the state_dic in f_state_dict_wrapper
                    # in case the model is saved when DataParallel is on
                    pt_model.load_state_dict(
                        nii_nn_tools.f_state_dict_wrapper(
                            checkpoint[cp_names.state_dict],
                            flag_multi_device))
                # load optimizer state
                if cp_names.optimizer in checkpoint:
                    optimizer.load_state_dict(checkpoint[cp_names.optimizer])
                # optionally, load training history
                if not args.ignore_training_history_in_trained_model:
                    #nii_display.f_print("Load ")
                    if cp_names.trnlog in checkpoint:
                        monitor_trn.load_state_dic(
                            checkpoint[cp_names.trnlog])
                    if cp_names.vallog in checkpoint and monitor_val:
                        monitor_val.load_state_dic(
                            checkpoint[cp_names.vallog])
                    if cp_names.info in checkpoint:
                        train_log = checkpoint[cp_names.info]
                    nii_display.f_print("Load check point, resume training")
                else:
                    nii_display.f_print("Load pretrained model and optimizer")
            elif checkpoint is not None:
                # only model status
                #pt_model.load_state_dict(checkpoint)
                pt_model.load_state_dict(
                    nii_nn_tools.f_state_dict_wrapper(
                        checkpoint, flag_multi_device))
                nii_display.f_print("Load pretrained model")
            else:
                nii_display.f_print("No pretrained model")
    # done for resume training

    # other variables
    flag_early_stopped = False
    # resuming may move the starting epoch forward
    start_epoch = monitor_trn.get_epoch()
    epoch_num = monitor_trn.get_max_epoch()

    # select the per-epoch routine: WGAN variant when the loss wrapper
    # declares flag_wgan, plain GAN otherwise
    if hasattr(loss_wrapper, "flag_wgan") and loss_wrapper.flag_wgan:
        f_wrapper_gan_one_epoch = f_run_one_epoch_WGAN
    else:
        f_wrapper_gan_one_epoch = f_run_one_epoch_GAN

    # print
    _ = nii_op_display_tk.print_log_head()
    nii_display.f_print_message(train_log, flush=True, end='')

    # loop over multiple epochs
    for epoch_idx in range(start_epoch, epoch_num):

        # training one epoch
        pt_model_D.train()
        pt_model_G.train()
        f_wrapper_gan_one_epoch(
            args, pt_model_G, pt_model_D, loss_wrapper, device, \
            monitor_trn, train_data_loader, \
            epoch_idx, optimizer_G, optimizer_D, normtarget_f)
        time_trn = monitor_trn.get_time(epoch_idx)
        loss_trn = monitor_trn.get_loss(epoch_idx)

        # if necessary, do validataion
        if val_dataset_wrapper is not None:
            # set eval() if necessary
            if args.eval_mode_for_validation:
                pt_model_G.eval()
                pt_model_D.eval()
            with torch.no_grad():
                # optimizers are None: no parameter update on val set
                f_wrapper_gan_one_epoch(
                    args, pt_model_G, pt_model_D, loss_wrapper, \
                    device, \
                    monitor_val, val_data_loader, \
                    epoch_idx, None, None, normtarget_f)
            time_val = monitor_val.get_time(epoch_idx)
            loss_val = monitor_val.get_loss(epoch_idx)
        else:
            time_val, loss_val = 0, 0

        # "new best" is judged on validation loss when available,
        # otherwise every epoch counts as best
        if val_dataset_wrapper is not None:
            flag_new_best = monitor_val.is_new_best()
        else:
            flag_new_best = True

        # print information
        train_log += nii_op_display_tk.print_train_info(
            epoch_idx, time_trn, loss_trn, time_val, loss_val,
            flag_new_best)

        # save the best model
        if flag_new_best:
            for pt_model, model_tag in \
                zip([pt_model_G, pt_model_D], model_tags):
                tmp_best_name = f_save_trained_name_GAN(args, model_tag)
                torch.save(pt_model.state_dict(), tmp_best_name)

        # save intermediate model if necessary
        if not args.not_save_each_epoch:
            # save model discrminator and generator
            for pt_model, optimizer, model_tag in \
                zip([pt_model_G, pt_model_D],
                    [optimizer_G, optimizer_D], model_tags):
                tmp_model_name = f_save_epoch_name_GAN(args, epoch_idx,
                                                       model_tag)
                if monitor_val is not None:
                    tmp_val_log = monitor_val.get_state_dic()
                else:
                    tmp_val_log = None
                # save
                tmp_dic = {
                    cp_names.state_dict: pt_model.state_dict(),
                    cp_names.info: train_log,
                    cp_names.optimizer: optimizer.state_dict(),
                    cp_names.trnlog: monitor_trn.get_state_dic(),
                    cp_names.vallog: tmp_val_log
                }
                torch.save(tmp_dic, tmp_model_name)
                if args.verbose == 1:
                    nii_display.f_eprint(str(datetime.datetime.now()))
                    nii_display.f_eprint("Save {:s}".format(tmp_model_name),
                                         flush=True)

        # early stopping
        if monitor_val is not None and \
           monitor_val.should_early_stop(no_best_epoch_num):
            flag_early_stopped = True
            break
    # loop done

    nii_op_display_tk.print_log_tail()
    if flag_early_stopped:
        nii_display.f_print("Training finished by early stopping")
    else:
        nii_display.f_print("Training finished")
    nii_display.f_print("Model is saved to", end='')
    for model_tag in model_tags:
        nii_display.f_print("{}".format(
            f_save_trained_name_GAN(args, model_tag)))
    return
def f_train_wrapper(args, pt_model, loss_wrapper, device, \
                    optimizer_wrapper, \
                    train_dataset_wrapper, \
                    val_dataset_wrapper = None, \
                    checkpoint = None):
    """
    f_train_wrapper(args, pt_model, loss_wrapper, device,
                    optimizer_wrapper
                    train_dataset_wrapper, val_dataset_wrapper = None,
                    check_point = None):
      A wrapper to run the training process

    Args:
       args:         argument information given by argpase
       pt_model:     pytorch model (torch.nn.Module)
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target)
       device:       torch.device("cuda") or torch.device("cpu")

       optimizer_wrapper:
           a wrapper over optimizer (defined in op_manager.py)
           optimizer_wrapper.optimizer is torch.optimizer

       train_dataset_wrapper:
           a wrapper over training data set (data_io/default_data_io.py)
           train_dataset_wrapper.get_loader() returns torch.DataSetLoader

       val_dataset_wrapper:
           a wrapper over validation data set (data_io/default_data_io.py)
           it can None.

       check_point:
           a check_point that stores every thing to resume training
    """
    nii_display.f_print_w_date("Start model training")

    # get the optimizer
    optimizer_wrapper.print_info()
    optimizer = optimizer_wrapper.optimizer
    epoch_num = optimizer_wrapper.get_epoch_num()
    no_best_epoch_num = optimizer_wrapper.get_no_best_epoch_num()

    # get data loader for training set
    train_dataset_wrapper.print_info()
    train_data_loader = train_dataset_wrapper.get_loader()
    train_seq_num = train_dataset_wrapper.get_seq_num()

    # get the training process monitor
    monitor_trn = nii_monitor.Monitor(epoch_num, train_seq_num)

    # if validation data is provided, get data loader for val set
    if val_dataset_wrapper is not None:
        val_dataset_wrapper.print_info()
        val_data_loader = val_dataset_wrapper.get_loader()
        val_seq_num = val_dataset_wrapper.get_seq_num()
        monitor_val = nii_monitor.Monitor(epoch_num, val_seq_num)
    else:
        monitor_val = None

    # training log information
    train_log = ''

    # print the network
    pt_model.to(device, dtype=nii_dconf.d_dtype)
    f_model_show(pt_model)

    # resume training or initialize the model if necessary
    cp_names = CheckPointKey()
    if checkpoint is not None:
        if type(checkpoint) is dict:
            # checkpoint
            # a full checkpoint: restore model, optimizer, and monitors
            if cp_names.state_dict in checkpoint:
                pt_model.load_state_dict(checkpoint[cp_names.state_dict])
            if cp_names.optimizer in checkpoint:
                optimizer.load_state_dict(checkpoint[cp_names.optimizer])
            if cp_names.trnlog in checkpoint:
                monitor_trn.load_state_dic(checkpoint[cp_names.trnlog])
            if cp_names.vallog in checkpoint and monitor_val:
                monitor_val.load_state_dic(checkpoint[cp_names.vallog])
            if cp_names.info in checkpoint:
                train_log = checkpoint[cp_names.info]
            nii_display.f_print("Load check point and resume training")
        else:
            # only model status
            pt_model.load_state_dict(checkpoint)
            nii_display.f_print("Load pre-trained model")

    # other variables
    flag_early_stopped = False
    # resuming may move the starting epoch forward
    start_epoch = monitor_trn.get_epoch()
    epoch_num = monitor_trn.get_max_epoch()

    # print
    _ = nii_op_display_tk.print_log_head()
    nii_display.f_print_message(train_log, flush=True, end='')

    # loop over multiple epochs
    for epoch_idx in range(start_epoch, epoch_num):

        # training one epoch
        pt_model.train()
        f_run_one_epoch(args, pt_model, loss_wrapper, device, \
                        monitor_trn, train_data_loader, \
                        epoch_idx, optimizer)
        time_trn = monitor_trn.get_time(epoch_idx)
        loss_trn = monitor_trn.get_loss(epoch_idx)

        # if necessary, do validataion
        if val_dataset_wrapper is not None:
            # set eval() if necessary
            if args.eval_mode_for_validation:
                pt_model.eval()
            with torch.no_grad():
                # optimizer is None: no parameter update on val set
                f_run_one_epoch(args, pt_model, loss_wrapper, \
                                device, \
                                monitor_val, val_data_loader, \
                                epoch_idx, None)
            time_val = monitor_val.get_time(epoch_idx)
            loss_val = monitor_val.get_loss(epoch_idx)
        else:
            time_val, loss_val = 0, 0

        # "new best" is judged on validation loss when available,
        # otherwise every epoch counts as best
        if val_dataset_wrapper is not None:
            flag_new_best = monitor_val.is_new_best()
        else:
            flag_new_best = True

        # print information
        train_log += nii_op_display_tk.print_train_info(epoch_idx, \
                                                        time_trn, \
                                                        loss_trn, \
                                                        time_val, \
                                                        loss_val, \
                                                        flag_new_best)

        # save the best model
        if flag_new_best:
            tmp_best_name = f_save_trained_name(args)
            torch.save(pt_model.state_dict(), tmp_best_name)

        # save intermediate model if necessary
        if not args.not_save_each_epoch:
            tmp_model_name = f_save_epoch_name(args, epoch_idx)
            if monitor_val is not None:
                tmp_val_log = monitor_val.get_state_dic()
            else:
                tmp_val_log = None
            # save
            tmp_dic = {
                cp_names.state_dict: pt_model.state_dict(),
                cp_names.info: train_log,
                cp_names.optimizer: optimizer.state_dict(),
                cp_names.trnlog: monitor_trn.get_state_dic(),
                cp_names.vallog: tmp_val_log
            }
            torch.save(tmp_dic, tmp_model_name)
            if args.verbose == 1:
                nii_display.f_eprint(str(datetime.datetime.now()))
                nii_display.f_eprint("Save {:s}".format(tmp_model_name),
                                     flush=True)

        # early stopping
        if monitor_val is not None and \
           monitor_val.should_early_stop(no_best_epoch_num):
            flag_early_stopped = True
            break
    # loop done

    nii_op_display_tk.print_log_tail()
    if flag_early_stopped:
        nii_display.f_print("Training finished by early stopping")
    else:
        nii_display.f_print("Training finished")
    nii_display.f_print("Model is saved to", end='')
    nii_display.f_print("{}".format(f_save_trained_name(args)))
    return
def f_run_one_epoch_GAN(
        args, pt_model_G, pt_model_D, loss_wrapper, \
        device, monitor, \
        data_loader, epoch_idx, optimizer_G = None, optimizer_D = None, \
        target_norm_method = None):
    """
    f_run_one_epoch_GAN:
       run one epoch over the dataset (for training or validation sets)

    Args:
       args:         from argparse
       pt_model_G:   pytorch model (torch.nn.Module) generator
       pt_model_D:   pytorch model (torch.nn.Module) discriminator
       loss_wrapper: a wrapper over loss function
                     loss_wrapper.compute(generated, target)
       device:       torch.device("cuda") or torch.device("cpu")
       monitor:      defined in op_process_monitor.py
       data_loader:  pytorch DataLoader.
       epoch_idx:    int, index of the current epoch
       optimizer_G:  torch optimizer or None, for generator
       optimizer_D:  torch optimizer or None, for discriminator
                     if None, the back propagation will be skipped
                     (for development set)
       target_norm_method: method to normalize target data
                           (by default, use pt_model.normalize_target)
    """
    # timer for the first batch
    start_time = time.time()

    # loop over samples
    for data_idx, (data_in, data_tar, data_info, idx_orig) in \
        enumerate(data_loader):

        # clear accumulated gradients before this batch
        # (only when an optimizer is given, i.e., in training mode)
        if optimizer_G is not None:
            optimizer_G.zero_grad()
        if optimizer_D is not None:
            optimizer_D.zero_grad()

        # prepare data
        if isinstance(data_tar, torch.Tensor):
            data_tar = data_tar.to(device, dtype=nii_dconf.d_dtype)
            # there is no way to normalize the data inside loss
            # thus, do normalization here
            # NOTE(review): normed_target is computed but never read in this
            # function body — confirm whether loss_wrapper.compute_aux should
            # receive it instead of the raw data_tar
            if target_norm_method is None:
                normed_target = pt_model_G.normalize_target(data_tar)
            else:
                normed_target = target_norm_method(data_tar)
        else:
            # GAN training requires a real (target) waveform/feature tensor
            nii_display.f_die("target data is required")

        # to device (we assume noise will be generated by the model itself)
        # here we only provide external condition
        data_in = data_in.to(device, dtype=nii_dconf.d_dtype)

        ############################
        # Update Discriminator
        ############################
        # train with real data
        pt_model_D.zero_grad()
        d_out_real = pt_model_D(data_tar)
        errD_real = loss_wrapper.compute_gan_D_real(d_out_real)
        if optimizer_D is not None:
            errD_real.backward()

        # train with fake data
        # generate sample through the generator
        if args.model_forward_with_target:
            # if model.forward requires (input, target) as arguments
            # for example, for auto-encoder & autoregressive model
            if isinstance(data_tar, torch.Tensor):
                # data_tar is already on device; this .to() is a no-op copy
                # kept for token-identical behavior
                data_tar_tm = data_tar.to(device, dtype=nii_dconf.d_dtype)
                if args.model_forward_with_file_name:
                    data_gen = pt_model_G(data_in, data_tar_tm, data_info)
                else:
                    data_gen = pt_model_G(data_in, data_tar_tm)
            else:
                nii_display.f_print("--model-forward-with-target is set")
                nii_display.f_die("but data_tar is not loaded")
        else:
            if args.model_forward_with_file_name:
                # special case when model.forward requires data_info
                data_gen = pt_model_G(data_in, data_info)
            else:
                # normal case for model.forward(input)
                data_gen = pt_model_G(data_in)

        # data_gen.detach() is required so that the D loss does not
        # back-propagate into the generator
        # https://github.com/pytorch/examples/issues/116
        d_out_fake = pt_model_D(data_gen.detach())
        errD_fake = loss_wrapper.compute_gan_D_fake(d_out_fake)
        if optimizer_D is not None:
            errD_fake.backward()

        # NOTE(review): errD is computed but not used below (losses are
        # logged separately) — kept for reference / debugging
        errD = errD_real + errD_fake

        if optimizer_D is not None:
            optimizer_D.step()

        ############################
        # Update Generator
        ############################
        pt_model_G.zero_grad()
        # run D again on the non-detached generator output so gradients
        # flow back into G
        d_out_fake_for_G = pt_model_D(data_gen)
        errG_gan = loss_wrapper.compute_gan_G(d_out_fake_for_G)

        # if defined, calculate auxiliary loss (e.g., spectral loss)
        if hasattr(loss_wrapper, "compute_aux"):
            errG_aux = loss_wrapper.compute_aux(data_gen, data_tar)
        else:
            errG_aux = torch.zeros_like(errG_gan)

        # if defined, calculate feature-matching loss
        if hasattr(loss_wrapper, "compute_feat_match"):
            errG_feat = loss_wrapper.compute_feat_match(
                d_out_real, d_out_fake_for_G)
        else:
            errG_feat = torch.zeros_like(errG_gan)

        # sum loss for generator
        errG = errG_gan + errG_aux + errG_feat

        if optimizer_G is not None:
            errG.backward()
            optimizer_G.step()

        # construct the loss for logging and early stopping
        # only use errG_aux for early-stopping (flag True in second list)
        loss_computed = [
            [errG_aux, errD_real, errD_fake, errG_gan, errG_feat],
            [True, False, False, False, False]]

        # to handle cases where there are multiple loss functions
        _, loss_vals, loss_flags = nii_nn_tools.f_process_loss(loss_computed)

        # save the training process information to the monitor
        end_time = time.time()
        batchsize = len(data_info)
        for idx, data_seq_info in enumerate(data_info):
            # loss_value is supposed to be the average loss value
            # over samples in the batch, thus, just loss_value
            # rather than loss_value / batchsize
            monitor.log_loss(loss_vals, loss_flags, \
                             (end_time-start_time) / batchsize, \
                             data_seq_info, idx_orig.numpy()[idx], \
                             epoch_idx)
            # print info for one sentence
            if args.verbose == 1:
                monitor.print_error_for_batch(data_idx*batchsize + idx,\
                                              idx_orig.numpy()[idx], \
                                              epoch_idx)
        # start the timer for a new batch
        start_time = time.time()

    # loop done
    return