def train(self, auto_resume=True, eval_freq=0):
    """Run the full training loop.

    Args:
        auto_resume (bool): when True, restore the most recent checkpoint
            (``model_{epoch}.pth`` in ``self.results_dir``) and continue
            training from that epoch.
        eval_freq (int): evaluate every ``eval_freq`` epochs; 0 (the
            default) disables periodic evaluation.
    """
    # Decide the starting epoch: 0, or the latest checkpoint if resuming.
    start_epoch = 0
    if auto_resume:
        resumed_epoch = self.resume_latest_model()
        if resumed_epoch is not None:
            start_epoch = resumed_epoch

    for epoch in range(start_epoch, self.optimizer.max_epochs):
        # Checkpoint the pre-epoch state so a later resume restarts
        # exactly at this epoch.
        self.save_model(epoch)
        # Periodic evaluation (never at epoch 0; eval_freq > 0 also
        # guards the modulo against division by zero).
        if eval_freq > 0 and epoch > 0 and epoch % eval_freq == 0:
            self.eval(onebyone=False)
        # Train one epoch and log its results.
        results = self.train_an_epoch(epoch)
        self.logging(EPOCH=epoch, TIME=time_now(), RESULTS=results)

    # Save and evaluate the fully trained model.
    self.save_model(self.optimizer.max_epochs)
    self.eval(onebyone=False)
def resume_latest_model(self):
    """Restore model weights from the newest checkpoint in ``self.results_dir``.

    Only files named exactly ``model_{epoch}.pth`` are considered; anything
    else (e.g. ``final_model.pth.tar``) is ignored.

    Returns:
        int | None: the epoch of the restored checkpoint, or ``None`` when
        no checkpoint file is found.
    """
    root, _, files = os_walk(self.results_dir)
    # Parse epochs only from strict 'model_{int}.pth' names. The previous
    # filter (`'.pth' in file`) admitted names such as 'model_5.pth.tar',
    # which then crashed int() when extracting the epoch number.
    pth_epochs = []
    for file in files:
        if file.startswith('model_') and file.endswith('.pth'):
            stem = file[len('model_'):-len('.pth')]
            if stem.isdigit():
                pth_epochs.append(int(stem))
    if not pth_epochs:
        return None
    max_epoch = max(pth_epochs)
    model_path = os.path.join(root, 'model_{}.pth'.format(max_epoch))
    self.model.load_state_dict(torch.load(model_path), strict=True)
    self.logging(time_now(), 'restore from {}'.format(model_path))
    return max_epoch