class Grapher(object):
    '''A helper class to assist with plotting to visdom.

    Plot data is staged in ``param_map`` via :meth:`register` /
    :meth:`register_single` and pushed to the visdom server by :meth:`show`.
    '''

    def __init__(self, env, server, port=8097):
        '''
        :param env: visdom environment name
        :param server: visdom server URL
        :param port: visdom server port (default: 8097)
        '''
        self.vis = Visdom(server=server, port=port, env=env)
        self.env = env
        self.param_map = self._init_map()
        # dispatch table: plot type -> plotting member
        self.function_map = {
            'line': self._plot_line,
            'imgs': self._plot_imgs,
            'img': self._plot_img,
            'hist': self._plot_hist,
            'video': self._plot_video
        }
        # persisted through the lifespan of the object;
        # it contains the window objects of already-created line plots
        self.registered_lines = {}

    def save(self):
        '''Persist the visdom environment server-side.'''
        self.vis.save([self.env])

    def _init_map(self):
        '''Internal member to return a fresh map of plot-type -> list.'''
        return {'line': [], 'imgs': [], 'img': [], 'video': [], 'hist': []}

    def clear(self):
        '''Helper to clear and reset the internal map.'''
        if hasattr(self, 'param_map'):
            self.param_map.clear()
        self.param_map = self._init_map()

    def _plot_img(self, img_list):
        '''Push each single image in img_list to its own visdom window.'''
        for img_map in img_list:
            for key, value in img_map.items():
                self.vis.image(to_data(value).detach().cpu().numpy(),
                               opts=dict(title=key),
                               win=key)

    def _plot_imgs(self, imgs_list):
        '''Push each batch of images in imgs_list to its own visdom window.'''
        for imgs_map in imgs_list:
            for key, value in imgs_map.items():
                self.vis.images(to_data(value).detach().cpu().numpy(),
                                opts=dict(title=key),
                                win=key)

    def _plot_line(self, line_list):
        '''Create (first call) or append to (later calls) a line per key.'''
        for line_map in line_list:
            for key, value in line_map.items():
                x = np.asarray(value[0])  # time-point
                y = np.asarray(value[1])  # value
                # visdom requires at least 1-d arrays
                if len(y.shape) < 1:
                    y = np.expand_dims(y, -1)
                if len(x.shape) < 1:
                    x = np.expand_dims(x, -1)

                if key not in self.registered_lines:
                    self.registered_lines[key] = self.vis.line(
                        Y=y, X=x, opts=dict(title=key), win=key)
                else:
                    self.vis.line(Y=y, X=x,
                                  opts=dict(title=key),
                                  win=self.registered_lines[key],
                                  update='append')

    def _plot_hist(self, hist_list):
        '''Plot histograms; each value is a (numbins, data) pair.'''
        for hist_map in hist_list:
            for key, value in hist_map.items():
                numbins = value[0]
                hist_value = value[1]
                self.vis.histogram(hist_value,
                                   opts=dict(title=key, numbins=numbins),
                                   win=key)

    def _plot_video(self, video_list):
        '''Plot videos; only in-memory tensors are supported, not files.'''
        for video_map in video_list:
            # BUGFIX: was video_map.item(), which raises AttributeError
            for key, value in video_map.items():
                assert isinstance(value, torch.Tensor), "files not supported"
                self.vis.video(tensor=to_data(value),
                               opts=dict(title=key),
                               win=key)

    def register(self, param_map, plot_types, override=True):
        '''Submit a bulk list of maps; see register_single for detail.

        :param param_map: list of single-entry dicts
        :param plot_types: list of plot-type strings, parallel to param_map
        :param override: bool or parallel list of bools
        '''
        assert len(param_map) == len(plot_types)
        if type(override) != list:
            override = [override] * len(param_map)

        for pm, pt, o in zip(param_map, plot_types, override):
            # BUGFIX: o was passed positionally into the `append` parameter,
            # which (with the default override=True) tripped the
            # "cant override and append" assertion in register_single.
            self.register_single(pm, pt, override=o)

    def _find_and_append(self, param_map, plot_type):
        '''Extend an existing line entry's (x, y) lists, or add a new one.'''
        assert plot_type == 'line', "only line append supported currently"
        exists = False
        list_item = self.param_map[plot_type]
        for key, value in param_map.items():
            for j in range(len(list_item)):
                if key in list_item[j]:
                    list_item[j][key][0].extend(value[0])
                    list_item[j][key][1].extend(value[1])
                    exists = True

        if not exists:
            self.param_map[plot_type].append(param_map)

    def _find_and_replace(self, param_map, plot_type):
        '''Replace an existing entry's value in-place, or add a new one.'''
        exists = False
        list_item = self.param_map[plot_type]
        for key, value in param_map.items():
            for j in range(len(list_item)):
                if key in list_item[j]:
                    list_item[j][key] = value
                    exists = True

        if not exists:
            self.param_map[plot_type].append(param_map)

    def register_single(self, param_map, plot_type='line', append=False,
                        override=True):
        '''Register a single plot which will be added to the current map.

        eg: register_single({'title': value}, 'line')

        :param param_map: single-entry dict {name: value}
        :param plot_type: 'line', 'hist', 'imgs', 'img', 'video'
        :param append: if True appends to the line; mainly useful if you are
                       extending a line before show()
        :param override: if True then overwrite an item if it exists
        Note: you can't override and append.
        '''
        assert len(param_map) == 1, "only one register per call"
        assert not (override is True and append is True), \
            "cant override and append"
        plot_type = plot_type.lower().strip()
        assert plot_type == 'line' \
            or plot_type == 'hist' \
            or plot_type == 'imgs' \
            or plot_type == 'img' \
            or plot_type == 'video'

        if append:
            self._find_and_append(param_map, plot_type)

        if override:
            self._find_and_replace(param_map, plot_type)

    def _check_exists(self, plot_type, param_map):
        '''Return True iff no key of param_map is already registered under
        plot_type.

        BUGFIX: the previous version returned on the very first registered
        entry, so only one item was ever inspected.
        '''
        for key in param_map:  # {'name': value}
            for list_item in self.param_map[plot_type]:
                if key in list_item:
                    return False
        return True

    def show(self, clear=True):
        '''This helper is called to actually push the data to visdom.'''
        for key, value_list in self.param_map.items():
            self.function_map[key](value_list)

        if clear:
            # BUGFIX: the fresh map returned by _init_map() was previously
            # discarded, so the plot map was never cleared; reassign it.
            self.param_map = self._init_map()
class Trainer(object):
    '''General-purpose training/validation driver that logs loss, learning
    rate and top-1/top-5 precision to a visdom server and checkpoints the
    best model by top-1 precision.'''

    def __init__(self, train_dataset, val_dataset, model, loss_fn, optimizer,
                 lr_scheduler, params):
        """
        General purpose training script
        :param train_dataset: PyTorch dataset that loads training images
        :param val_dataset: PyTorch dataset that loads testing / validation
            images
        :param model: Network model
        :param optimizer: PyTorch optimizer object
        :param lr_scheduler: PyTorch learning rate scheduler object
        :param loss_fn: loss function
        :param params: dictionary containing parameters for the training
            process. It can contain the following fields (fields with no
            default value mentioned are mandatory):
            n_epochs: number of epochs of training
            batch_size: batch size for one iteration
            do_val: perform validation? (default: True)
            shuffle: shuffle training data? (default: True)
            num_workers: number of CPU threads for loading data (default: 4)
            val_freq: frequency of validation (in number of epochs)
                (default: 1)
            print_freq: progress printing frequency (in number of iterations)
            experiment: name of the experiment, used to create logs and
                checkpoints
            checkpoint_file: Name of file with saved weights. Loaded before
                start of training if provided (default: None)
            resume_optim: whether to resume optimization from loaded weights
                (default: True)
        """
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.lr_scheduler = lr_scheduler
        # best top-1 precision seen so far; drives best-model checkpointing
        self.best_prec1 = -float('inf')

        # parse params with default values
        self.config = {
            'n_epochs': params['n_epochs'],
            'batch_size': params['batch_size'],
            'do_val': params.get('do_val', True),
            'shuffle': params.get('shuffle', True),
            'num_workers': params.get('num_workers', 4),
            'val_freq': params.get('val_freq', 1),
            'print_freq': params.get('print_freq', 100),
            'experiment': params['experiment'],
            'checkpoint_file': params.get('checkpoint_file'),
            'resume_optim': params.get('resume_optim', True)
        }

        # checkpoints and logs live under logs/<experiment> in the cwd
        self.logdir = osp.join(os.getcwd(), 'logs', self.config['experiment'])
        if not osp.isdir(self.logdir):
            os.makedirs(self.logdir)

        # visdom plots: pre-create each window with dummy (1, k) data so the
        # per-epoch updates below can append to named traces
        self.vis_env = self.config['experiment']
        self.loss_win = 'loss_win'
        self.vis = Visdom()
        self.vis.line(X=np.zeros((1, 2)),
                      Y=np.zeros((1, 2)),
                      win=self.loss_win,
                      opts={
                          'legend': ['train_loss', 'val_loss'],
                          'xlabel': 'epochs',
                          'ylabel': 'loss'
                      },
                      env=self.vis_env)
        self.lr_win = 'lr_win'
        self.vis.line(X=np.zeros(1),
                      Y=np.zeros(1),
                      win=self.lr_win,
                      opts={
                          'legend': ['learning_rate'],
                          'xlabel': 'epochs',
                          'ylabel': 'log(lr)'
                      },
                      env=self.vis_env)
        self.top1_win = 'top1_win'
        self.vis.line(X=np.zeros((1, 2)),
                      Y=np.zeros((1, 2)),
                      win=self.top1_win,
                      opts={
                          'legend': ['train_top1_prec', 'val_top1_prec'],
                          'xlabel': 'epochs',
                          'ylabel': 'top1_prec (%)'
                      },
                      env=self.vis_env)
        self.top5_win = 'top5_win'
        self.vis.line(X=np.zeros((1, 2)),
                      Y=np.zeros((1, 2)),
                      win=self.top5_win,
                      opts={
                          'legend': ['train_top5_prec', 'val_top5_prec'],
                          'xlabel': 'epochs',
                          'ylabel': 'top5_prec (%)'
                      },
                      env=self.vis_env)

        # log all the command line options
        print('---------------------------------------')
        print('Experiment: {:s}'.format(self.config['experiment']))
        for k, v in self.config.items():
            print('{:s}: {:s}'.format(k, str(v)))
        print('---------------------------------------')

        # optionally resume weights (and optimizer state / epoch counter)
        self.start_epoch = int(0)
        checkpoint_file = self.config['checkpoint_file']
        if checkpoint_file:
            if osp.isfile(checkpoint_file):
                checkpoint = torch.load(checkpoint_file)
                self.model.load_state_dict(checkpoint['model_state_dict'])
                self.best_prec1 = checkpoint['best_prec1']
                if self.config['resume_optim']:
                    self.optimizer.load_state_dict(
                        checkpoint['optim_state_dict'])
                    self.start_epoch = checkpoint['epoch']
                print('Loaded checkpoint {:s} epoch {:d}'.format(
                    checkpoint_file, checkpoint['epoch']))

        self.train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.config['batch_size'],
            shuffle=self.config['shuffle'],
            num_workers=self.config['num_workers'])
        if self.config['do_val']:
            # validation data is never shuffled
            self.val_loader = torch.utils.data.DataLoader(
                val_dataset,
                batch_size=self.config['batch_size'],
                shuffle=False,
                num_workers=self.config['num_workers'])
        else:
            self.val_loader = None

    def save_checkpoint(self, epoch, is_best):
        '''Write the rolling checkpoint; additionally copy it to
        best_model.pth.tar when is_best is True.'''
        filename = osp.join(self.logdir, 'checkpoint.pth.tar')
        checkpoint_dict = \
            {'epoch': epoch, 'model_state_dict': self.model.state_dict(),
             'optim_state_dict': self.optimizer.state_dict(),
             'best_prec1': self.best_prec1}
        torch.save(checkpoint_dict, filename)
        if is_best:
            shutil.copyfile(filename,
                            osp.join(self.logdir, 'best_model.pth.tar'))

    def step_func(self, train):
        '''Run one full pass over the train (train=True) or validation
        loader and return (avg_loss, avg_top1, avg_top5).'''
        batch_time = AverageMeter()
        data_time = AverageMeter()
        losses = AverageMeter()
        top1 = AverageMeter()
        top5 = AverageMeter()

        # select model mode and data loader according to the phase
        if train:
            self.model.train()
            status = 'train'
            loader = self.train_loader
        else:
            self.model.eval()
            status = 'val'
            loader = self.val_loader

        end = time.time()
        for batch_idx, (data, target) in enumerate(loader):
            data_time.update(time.time() - end)

            # step_feedfwd runs forward (and backward when train=True)
            kwargs = dict(target=target,
                          loss_fn=self.loss_fn,
                          optim=self.optimizer,
                          train=train)
            loss, output = step_feedfwd(data, self.model, **kwargs)

            # measure accuracy and calculate loss (averaged by batch size)
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            losses.update(loss, data.size(0))
            top1.update(prec1[0], data.size(0))
            top5.update(prec5[0], data.size(0))

            # measure batch time
            batch_time.update(time.time() - end)
            end = time.time()

            if batch_idx % self.config['print_freq'] == 0:
                print(
                    '{:s} {:s}: batch {:d}/{:d}, loss {:4.3f}, top-1 accuracy {:4.3f},'
                    ' top-5 accuracy {:4.3f}'.format(status,
                                                     self.config['experiment'],
                                                     batch_idx,
                                                     len(loader) - 1, loss,
                                                     prec1[0], prec5[0]))

        print('{:s} {:s}: loss {:f}'.format(status,
                                            self.config['experiment'],
                                            losses.avg))
        return losses.avg, top1.avg, top5.avg

    def train_val(self):
        '''Main loop: per epoch adjust LR, train, optionally validate, log
        everything to visdom and save a checkpoint. Returns best top-1.'''
        for epoch in range(self.start_epoch, self.config['n_epochs']):
            print('{:s} Epoch {:d} / {:d}'.format(self.config['experiment'],
                                                  epoch,
                                                  self.config['n_epochs']))

            # ADJUST LR
            # NOTE(review): scheduler.step() is called before the epoch's
            # optimizer steps; recent PyTorch expects the opposite order --
            # confirm against the torch version in use.
            self.lr_scheduler.step()
            lr = self.lr_scheduler.get_lr()[0]
            self.vis.line(X=np.asarray([epoch]),
                          Y=np.asarray([np.log10(lr)]),
                          win=self.lr_win,
                          name='learning_rate',
                          update='append',
                          env=self.vis_env)

            # TRAIN
            loss, top1_prec, top5_prec = self.step_func(train=True)
            self.vis.line(X=np.asarray([epoch]),
                          Y=np.asarray([loss]),
                          win=self.loss_win,
                          name='train_loss',
                          update='append',
                          env=self.vis_env)
            self.vis.line(X=np.asarray([epoch]),
                          Y=np.asarray([top1_prec]),
                          win=self.top1_win,
                          name='train_top1_prec',
                          update='append',
                          env=self.vis_env)
            self.vis.line(X=np.asarray([epoch]),
                          Y=np.asarray([top5_prec]),
                          win=self.top5_win,
                          name='train_top5_prec',
                          update='append',
                          env=self.vis_env)
            self.vis.save(envs=[self.vis_env])

            # VALIDATION (every val_freq epochs, and always on the last one)
            if self.config['do_val'] and (
                    (epoch % self.config['val_freq'] == 0) or
                    (epoch == self.config['n_epochs'] - 1)):
                loss, top1_prec, top5_prec = self.step_func(train=False)
                self.vis.line(X=np.asarray([epoch]),
                              Y=np.asarray([loss]),
                              win=self.loss_win,
                              name='val_loss',
                              update='append',
                              env=self.vis_env)
                self.vis.line(X=np.asarray([epoch]),
                              Y=np.asarray([top1_prec]),
                              win=self.top1_win,
                              name='val_top1_prec',
                              update='append',
                              env=self.vis_env)
                self.vis.line(X=np.asarray([epoch]),
                              Y=np.asarray([top5_prec]),
                              win=self.top5_win,
                              name='val_top5_prec',
                              update='append',
                              env=self.vis_env)
                self.vis.save(envs=[self.vis_env])

            # SAVE CHECKPOINT
            # NOTE(review): on epochs without validation, top1_prec here is
            # the *training* precision -- confirm that mixing the two when
            # tracking best_prec1 is intended.
            is_best = top1_prec > self.best_prec1
            self.best_prec1 = max(self.best_prec1, top1_prec)
            self.save_checkpoint(epoch, is_best)
            print('Checkpoint saved')
            if is_best:
                print('BEST TOP1 ACCURACY SO FAR')

        return self.best_prec1
class Trainer(object):
    '''Training/validation driver configured from an .ini file, with optional
    visdom logging and learnable-criterion parameter tracking.

    BUGFIX: this class was written in Python 2 (print statements, xrange,
    dict.has_key, and dict views passed where sequences are required) inside
    an otherwise Python 3 file; it has been ported to Python 3 with behavior
    otherwise unchanged.
    '''

    def __init__(self, model, optimizer, train_criterion, config_file,
                 experiment, train_dataset, val_dataset, device,
                 checkpoint_file=None, resume_optim=False,
                 val_criterion=None):
        """
        General purpose training script
        :param model: Network model
        :param optimizer: object of the Optimizer class, wrapping
            torch.optim and lr
        :param train_criterion: Training loss function
        :param config_file: configuration .ini file for training parameters
        :param experiment: name of the experiment, used to create logging dir
        :param train_dataset: PyTorch dataset
        :param val_dataset: PyTorch dataset
        :param device: IDs of the GPUs to use - value of
            $CUDA_VISIBLE_DEVICES
        :param checkpoint_file: Name of file with saved weights and optim
            params
        :param resume_optim: whether to resume optimization
        :param val_criterion: loss function to be used for validation
            (defaults to train_criterion)
        """
        self.model = model
        self.train_criterion = train_criterion
        if val_criterion is None:
            self.val_criterion = self.train_criterion
        else:
            self.val_criterion = val_criterion
        self.experiment = experiment
        self.optimizer = optimizer
        if 'CUDA_VISIBLE_DEVICES' not in os.environ:
            os.environ['CUDA_VISIBLE_DEVICES'] = device

        # read the config
        settings = configparser.ConfigParser()
        with open(config_file, 'r') as f:
            settings.read_file(f)
        self.config = {}

        section = settings['training']
        self.config['n_epochs'] = section.getint('n_epochs')
        self.config['batch_size'] = section.getint('batch_size')
        self.config['do_val'] = section.getboolean('do_val')
        self.config['shuffle'] = section.getboolean('shuffle')
        self.config['seed'] = section.getint('seed')
        self.config['num_workers'] = section.getint('num_workers')
        self.config['snapshot'] = section.getint('snapshot')
        self.config['val_freq'] = section.getint('val_freq')
        self.config['cuda'] = torch.cuda.is_available()
        self.config['max_grad_norm'] = section.getfloat('max_grad_norm', 0)

        section = settings['logging']
        self.config['log_visdom'] = section.getboolean('visdom')
        self.config['print_freq'] = section.getint('print_freq')

        self.logdir = osp.join(os.getcwd(), 'logs', self.experiment)
        if not osp.isdir(self.logdir):
            os.makedirs(self.logdir)

        if self.config['log_visdom']:
            # start plots: pre-create windows so later appends find them
            self.vis_env = experiment
            self.loss_win = 'loss_win'
            self.vis = Visdom()
            self.vis.line(X=np.zeros((1, 2)),
                          Y=np.zeros((1, 2)),
                          win=self.loss_win,
                          opts={
                              'legend': ['train_loss', 'val_loss'],
                              'xlabel': 'epochs',
                              'ylabel': 'loss'
                          },
                          env=self.vis_env)
            self.lr_win = 'lr_win'
            self.vis.line(X=np.zeros(1),
                          Y=np.zeros(1),
                          win=self.lr_win,
                          opts={
                              'legend': ['learning_rate'],
                              'xlabel': 'epochs',
                              'ylabel': 'log(lr)'
                          },
                          env=self.vis_env)
            # learnable loss-function parameters (if any) get their own plot
            criterion_params = {
                k: v.data.cpu().numpy()[0]
                for k, v in self.train_criterion.named_parameters()
            }
            self.n_criterion_params = len(criterion_params)
            if self.n_criterion_params:
                self.criterion_param_win = 'cparam_win'
                # BUGFIX: dict views must be materialized as lists for
                # np.asarray / the visdom legend under Python 3
                self.vis.line(X=np.zeros((1, self.n_criterion_params)),
                              Y=np.asarray(
                                  list(criterion_params.values()))[np.newaxis, :],
                              win=self.criterion_param_win,
                              env=self.vis_env,
                              opts={
                                  'legend': list(criterion_params.keys()),
                                  'xlabel': 'epochs',
                                  'ylabel': 'value'
                              })

        # tee stdout into a log file under the experiment directory
        logfile = osp.join(self.logdir, 'log.txt')
        stdout = Logger.Logger(logfile)
        print('Logging to {:s}'.format(logfile))
        sys.stdout = stdout

        # log all the command line options
        print('---------------------------------------')
        print('Experiment: {:s}'.format(self.experiment))
        for k, v in self.config.items():
            print('{:s}: {:s}'.format(k, str(v)))
        print('Using GPU {:s} / {:d}'.format(device,
                                             torch.cuda.device_count()))
        print('---------------------------------------')

        # set random seed
        torch.manual_seed(self.config['seed'])
        if self.config['cuda']:
            torch.cuda.manual_seed(self.config['seed'])

        self.start_epoch = int(0)
        if checkpoint_file:
            if osp.isfile(checkpoint_file):
                # map to CPU when CUDA is unavailable
                loc_func = None if self.config[
                    'cuda'] else lambda storage, loc: storage
                checkpoint = torch.load(checkpoint_file,
                                        map_location=loc_func)
                load_state_dict(self.model, checkpoint['model_state_dict'])
                if resume_optim:
                    self.optimizer.learner.load_state_dict(
                        checkpoint['optim_state_dict'])
                    self.start_epoch = checkpoint['epoch']
                    # BUGFIX: dict.has_key() does not exist in Python 3
                    if 'criterion_state_dict' in checkpoint:
                        c_state = checkpoint['criterion_state_dict']
                        # pad missing criterion params with zeros so that
                        # load_state_dict does not complain
                        append_dict = {
                            k: torch.Tensor([0.0])
                            for k, _ in
                            self.train_criterion.named_parameters()
                            if not k in c_state
                        }
                        c_state.update(append_dict)
                        self.train_criterion.load_state_dict(c_state)
                print('Loaded checkpoint {:s} epoch {:d}'.format(
                    checkpoint_file, checkpoint['epoch']))

        self.train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=self.config['batch_size'],
            shuffle=self.config['shuffle'],
            num_workers=self.config['num_workers'],
            pin_memory=True,
            collate_fn=safe_collate)
        if self.config['do_val']:
            self.val_loader = torch.utils.data.DataLoader(
                val_dataset,
                batch_size=self.config['batch_size'],
                shuffle=self.config['shuffle'],
                num_workers=self.config['num_workers'],
                pin_memory=True,
                collate_fn=safe_collate)
        else:
            self.val_loader = None

        # activate GPUs
        if self.config['cuda']:
            self.model.cuda()
            self.train_criterion.cuda()
            self.val_criterion.cuda()

    def save_checkpoint(self, epoch):
        '''Write a per-epoch checkpoint (model, optimizer and criterion
        state) under the log directory.'''
        filename = osp.join(self.logdir,
                            'epoch_{:03d}.pth.tar'.format(epoch))
        checkpoint_dict = \
            {'epoch': epoch, 'model_state_dict': self.model.state_dict(),
             'optim_state_dict': self.optimizer.learner.state_dict(),
             'criterion_state_dict': self.train_criterion.state_dict()}
        torch.save(checkpoint_dict, filename)

    def train_val(self, lstm):
        """
        Function that does the training and validation
        :param lstm: whether the model is an LSTM
        :return:
        """
        for epoch in range(self.start_epoch, self.config['n_epochs']):
            # VALIDATION (every val_freq epochs, and always on the last one)
            if self.config['do_val'] and (
                    (epoch % self.config['val_freq'] == 0) or
                    (epoch == self.config['n_epochs'] - 1)):
                val_batch_time = Logger.AverageMeter()
                val_loss = Logger.AverageMeter()
                self.model.eval()
                end = time.time()
                val_data_time = Logger.AverageMeter()
                for batch_idx, (data, target) in enumerate(self.val_loader):
                    val_data_time.update(time.time() - end)

                    kwargs = dict(target=target,
                                  criterion=self.val_criterion,
                                  optim=self.optimizer,
                                  train=False)
                    if lstm:
                        loss, _ = step_lstm(data, self.model,
                                            self.config['cuda'], **kwargs)
                    else:
                        loss, _ = step_feedfwd(data, self.model,
                                               self.config['cuda'], **kwargs)

                    val_loss.update(loss)
                    val_batch_time.update(time.time() - end)

                    if batch_idx % self.config['print_freq'] == 0:
                        print('Val {:s}: Epoch {:d}\t'
                              'Batch {:d}/{:d}\t'
                              'Data time {:.4f} ({:.4f})\t'
                              'Batch time {:.4f} ({:.4f})\t'
                              'Loss {:f}'
                              .format(self.experiment, epoch, batch_idx,
                                      len(self.val_loader) - 1,
                                      val_data_time.val, val_data_time.avg,
                                      val_batch_time.val, val_batch_time.avg,
                                      loss))
                        if self.config['log_visdom']:
                            self.vis.save(envs=[self.vis_env])

                    end = time.time()

                print('Val {:s}: Epoch {:d}, val_loss {:f}'.format(
                    self.experiment, epoch, val_loss.avg))

                if self.config['log_visdom']:
                    # NOTE(review): updateTrace is a legacy visdom API
                    # (replaced by vis.line(update='append')) -- confirm the
                    # installed visdom version still supports it.
                    self.vis.updateTrace(X=np.asarray([epoch]),
                                         Y=np.asarray([val_loss.avg]),
                                         win=self.loss_win,
                                         name='val_loss',
                                         append=True,
                                         env=self.vis_env)
                    self.vis.save(envs=[self.vis_env])

            # SAVE CHECKPOINT every `snapshot` epochs
            if epoch % self.config['snapshot'] == 0:
                self.save_checkpoint(epoch)
                print('Epoch {:d} checkpoint saved for {:s}'.format(
                    epoch, self.experiment))

            # ADJUST LR
            lr = self.optimizer.adjust_lr(epoch)
            if self.config['log_visdom']:
                self.vis.updateTrace(X=np.asarray([epoch]),
                                     Y=np.asarray([np.log10(lr)]),
                                     win=self.lr_win,
                                     name='learning_rate',
                                     append=True,
                                     env=self.vis_env)

            # TRAIN
            self.model.train()
            train_data_time = Logger.AverageMeter()
            train_batch_time = Logger.AverageMeter()
            end = time.time()
            for batch_idx, (data, target) in enumerate(self.train_loader):
                train_data_time.update(time.time() - end)

                kwargs = dict(target=target,
                              criterion=self.train_criterion,
                              optim=self.optimizer,
                              train=True,
                              max_grad_norm=self.config['max_grad_norm'])
                if lstm:
                    loss, _ = step_lstm(data, self.model,
                                        self.config['cuda'], **kwargs)
                else:
                    loss, _ = step_feedfwd(data, self.model,
                                           self.config['cuda'], **kwargs)

                train_batch_time.update(time.time() - end)

                if batch_idx % self.config['print_freq'] == 0:
                    # fractional epoch position for finer-grained plots
                    n_iter = epoch * len(self.train_loader) + batch_idx
                    epoch_count = float(n_iter) / len(self.train_loader)
                    print('Train {:s}: Epoch {:d}\t'
                          'Batch {:d}/{:d}\t'
                          'Data Time {:.4f} ({:.4f})\t'
                          'Batch Time {:.4f} ({:.4f})\t'
                          'Loss {:f}\t'
                          'lr: {:f}'.format(self.experiment, epoch, batch_idx,
                                            len(self.train_loader) - 1,
                                            train_data_time.val,
                                            train_data_time.avg,
                                            train_batch_time.val,
                                            train_batch_time.avg, loss, lr))
                    if self.config['log_visdom']:
                        self.vis.updateTrace(X=np.asarray([epoch_count]),
                                             Y=np.asarray([loss]),
                                             win=self.loss_win,
                                             name='train_loss',
                                             append=True,
                                             env=self.vis_env)
                        if self.n_criterion_params:
                            for name, v in \
                                    self.train_criterion.named_parameters():
                                v = v.data.cpu().numpy()[0]
                                self.vis.updateTrace(
                                    X=np.asarray([epoch_count]),
                                    Y=np.asarray([v]),
                                    win=self.criterion_param_win,
                                    name=name,
                                    append=True,
                                    env=self.vis_env)
                        self.vis.save(envs=[self.vis_env])

                end = time.time()

        # Save final checkpoint
        epoch = self.config['n_epochs']
        self.save_checkpoint(epoch)
        print('Epoch {:d} checkpoint saved'.format(epoch))
        if self.config['log_visdom']:
            self.vis.save(envs=[self.vis_env])
def train(optimizer_options, data_options, logger_options, model_options,
          scheduler_options):
    '''Run k-fold cross-validated training of a ResNet feature extractor on
    the surgical-workflow dataset, logging losses/scores to visdom and
    checkpointing the best model per fold.

    :param optimizer_options: dict with device, epochs, optimizer settings,
        switch_optimizer / validation_interval_epochs / run_nfolds etc.
    :param data_options: dict with base_path, n_folds, batch_size, n_threads
    :param logger_options: dict with vislogger_env, vislogger_port,
        save_model path and checkpoint suffix
    :param model_options: dict with optional 'pretrained' checkpoint path
    :param scheduler_options: scheduler settings forwarded to get_optimizer
    '''
    vis = Visdom(env=logger_options['vislogger_env'],
                 port=logger_options['vislogger_port'])
    device = torch.device(optimizer_options['device'])
    epochs = optimizer_options['epochs']

    # ---------------- Save model ----------------
    if logger_options['save_model'] == "":
        model_checkpoint = ModelCheckpoint()
    else:
        suffix = (optimizer_options['optimizer'] + "_" +
                  str(optimizer_options['learning_rate']) + "_" +
                  logger_options['suffix'])
        model_checkpoint = ModelCheckpoint(
            save_model=True,
            save_path=logger_options['save_model'],
            use_loss=True,
            suffix=suffix)

    # ---------------- Data ----------------
    image_transform = Compose([
        RangeNormalize(0., 1.0),  # faster than the BatchGenerators version
        ChannelFirst(),
        MeanStdNormalizationTransform(mean=[0.3610, 0.2131, 0.2324],
                                      std=[0.0624, 0.0463, 0.0668]),
        NumpyToTensor(keys=['data', 'target'])
    ])
    kfoldWorkflowSet = kFoldWorkflowSplitMT(
        data_options['base_path'],
        image_transform=image_transform,
        video_extn='.avi',
        shuffle=True,
        n_folds=data_options['n_folds'],
        num_phases=14,
        batch_size=data_options['batch_size'],
        num_workers=data_options['n_threads'],
        video_folder='videos_480x272')

    nfolds_training_loss_avg = CumulativeMovingAvgStd()
    nfolds_validation_loss_avg = CumulativeMovingAvgStd()
    nfolds_validation_score_avg = CumulativeMovingAvgStd()
    folds_pbar = ProgressBar(kfoldWorkflowSet,
                             desc="Folds",
                             pb_len=optimizer_options['run_nfolds'])
    max_folds = folds_pbar.total

    for iFold, (train_loader, val_loader) in enumerate(folds_pbar):
        # ---------------- Create plot windows for this fold ----------------
        create_plot_window(vis, "Epochs+Iterations", "CE Loss",
                           "Training loss Fold " + str(iFold + 1),
                           tag='Training_Loss_Fold_' + str(iFold + 1),
                           name='Training Loss Fold ' + str(iFold + 1))
        create_plot_window(vis, "Epochs+Iterations", "CE Loss",
                           "Validation loss Fold " + str(iFold + 1),
                           tag='Validation_Loss_Fold_' + str(iFold + 1),
                           name='Validation Loss Fold ' + str(iFold + 1))
        # BUGFIX: the score window was created with the trace name
        # 'Validation Loss Fold ...' while the append below uses
        # 'Validation Score Fold ...', so appends never hit the created trace
        create_plot_window(vis, "Epochs+Iterations", "Score",
                           "Validation Score Fold " + str(iFold + 1),
                           tag='Validation_Score_Fold_' + str(iFold + 1),
                           name='Validation Score Fold ' + str(iFold + 1))

        # ---------------- Model ----------------
        # TODO: Pass 'models.resnet50' as string
        model = ResFeatureExtractor(pretrained_model=models.resnet101,
                                    device=device)
        if model_options['pretrained'] is not None:
            checkpoint = torch.load(model_options['pretrained'])
            model.load_state_dict(checkpoint['model'])

        # ---------------- Parts of training step ----------------
        criterion_CE = nn.CrossEntropyLoss().to(device)

        epoch_pbar = ProgressBar(range(epochs), desc="Epochs")
        epoch_training_avg_loss = CumulativeMovingAvgStd()
        epoch_training_avg_score = CumulativeMovingAvgStd()
        epoch_validation_loss = BestScore()
        epoch_msg_dict = {}
        evaluator = Engine(model, None, criterion_CE, None, val_loader, 0,
                           device, False,
                           use_half_precision=optimizer_options[
                               "use_half_precision"],
                           score_type="f1")

        for epoch in epoch_pbar:
            if (optimizer_options['switch_optimizer'] > 0) and \
                    ((epoch + 1) % optimizer_options['switch_optimizer'] == 0):
                # BUGFIX: a plain assignment aliased optimizer_options, so
                # the switch permanently mutated the caller's settings and
                # every later epoch/fold silently ran with sgd @ 1e-3;
                # work on a shallow copy instead
                temp_optimizer_options = dict(optimizer_options)
                temp_optimizer_options['optimizer'] = 'sgd'
                temp_optimizer_options['learning_rate'] = 1e-3
                optimizer, scheduler = get_optimizer(model.parameters(),
                                                     temp_optimizer_options,
                                                     scheduler_options,
                                                     train_loader, vis)
            else:
                optimizer, scheduler = get_optimizer(model.parameters(),
                                                     optimizer_options,
                                                     scheduler_options,
                                                     train_loader, vis)

            runEpoch(train_loader, model, criterion_CE, optimizer, scheduler,
                     device, vis, epoch, iFold, folds_pbar,
                     epoch_training_avg_loss, epoch_training_avg_score,
                     logger_options, optimizer_options, epoch_msg_dict)

            # ---------------- Validation ----------------
            validation_loss, validation_score = None, None
            if optimizer_options["validation_interval_epochs"] > 0:
                if (epoch + 1) % \
                        optimizer_options["validation_interval_epochs"] == 0:
                    validation_loss, validation_score = predict(
                        evaluator,
                        optimizer_options['max_valid_iterations'],
                        device, vis)
                    epoch_validation_loss.step(validation_loss,
                                               [validation_score])
                    vis.line(X=np.array([epoch]),
                             Y=np.array([validation_loss]),
                             update='append',
                             win='Validation_Loss_Fold_' + str(iFold + 1),
                             name='Validation Loss Fold ' + str(iFold + 1))
                    vis.line(X=np.array([epoch]),
                             Y=np.array([validation_score]),
                             update='append',
                             win='Validation_Score_Fold_' + str(iFold + 1),
                             name='Validation Score Fold ' + str(iFold + 1))
                    # current/best validation loss and score for the pbar
                    epoch_msg_dict['CVL'] = validation_loss
                    epoch_msg_dict['CVS'] = validation_score
                    epoch_msg_dict['BVL'] = epoch_validation_loss.score()[0]
                    epoch_msg_dict['BVS'] = \
                        epoch_validation_loss.score()[1][0]
                    folds_pbar.update_message(msg_dict=epoch_msg_dict)

            # ---------------- Save model ----------------
            model_checkpoint.step(curr_loss=validation_loss, model=model,
                                  suffix='_Fold_' + str(iFold))
            vis.save([logger_options['vislogger_env']])

        # stop after run_nfolds folds even if the splitter offers more
        if (iFold + 1) == max_folds:
            folds_pbar.refresh()
            folds_pbar.close()
            break

    print("\n\n\n\n=================================== DONE "
          "===================================\n\n")
class Trainer(object):
    """Generic PyTorch train/validation driver with visdom logging.

    Reads hyper-parameters from an .ini config file, creates a fresh
    versioned logging directory under ./logs, optionally resumes from a
    checkpoint, and plots train/val losses (plus learning rate and any
    learnable criterion parameters) to a visdom server.
    """

    def __init__(self, model, optimizer, train_criterion, config_file, experiment,
                 train_dataset, val_dataset, device, checkpoint_file=None,
                 resume_optim=False, val_criterion=None,
                 visdom_server='http://localhost', visdom_port=8097):
        """
        General purpose training script
        :param model: Network model
        :param optimizer: object of the Optimizer class, wrapping torch.optim and lr
        :param train_criterion: Training loss function
        :param config_file: configuration .ini file for training parameters
        :param experiment: name of the experiment, used to create logging dir
        :param train_dataset: PyTorch dataset
        :param val_dataset: PyTorch dataset
        :param device: IDs of the GPUs to use - value of $CUDA_VISIBLE_DEVICES
        :param checkpoint_file: Name of file with saved weights and optim params
        :param resume_optim: whether to resume optimization
        :param val_criterion: loss function to be used for validation
        """
        self.model = model
        self.train_criterion = train_criterion
        #if val_criterion is None:
        #    self.val_criterion = self.train_criterion
        #else:
        # optional extra criterion: evaluated without gradients in addition
        # to train_criterion and plotted in its own window
        self.extra_criterion = val_criterion
        self.experiment = experiment
        self.optimizer = optimizer
        if device is not None:
            # only set CUDA_VISIBLE_DEVICES if the caller has not already
            if 'CUDA_VISIBLE_DEVICES' not in os.environ:
                os.environ['CUDA_VISIBLE_DEVICES'] = device

        # read the config
        settings = configparser.ConfigParser()
        with open(config_file, 'r') as f:
            settings.read_file(f)
        self.config = {}

        section = settings['training']
        self.config['n_epochs'] = section.getint('n_epochs')
        self.config['batch_size'] = section.getint('batch_size')
        self.config['do_val'] = section.getboolean('do_val')
        self.config['shuffle'] = section.getboolean('shuffle')
        self.config['seed'] = section.getint('seed')
        self.config['num_workers'] = section.getint('num_workers')
        # checkpoint period and validation period, both in epochs
        self.config['snapshot'] = section.getint('snapshot')
        self.config['val_freq'] = section.getint('val_freq')
        self.config['cuda'] = torch.cuda.is_available()
        # defaults to 0 when absent; presumably 0 disables clipping inside
        # step_feedfwd -- TODO confirm against step_feedfwd
        self.config['max_grad_norm'] = section.getfloat('max_grad_norm', 0)

        section = settings['logging']
        self.config['log_visdom'] = section.getboolean('visdom')
        self.config['print_freq'] = section.getint('print_freq')

        # pick a fresh logs/<experiment>_version<i> directory
        self.logdir = osp.join(os.getcwd(), 'logs', self.experiment+'_version0')
        if osp.isdir(self.logdir):
            i = 1
            tmp_l = self.logdir
            while(osp.isdir(tmp_l)):
                tmp_e = self.experiment+ '_version' + str(i)
                tmp_l = osp.join(os.getcwd(), 'logs', tmp_e)
                i += 1
            self.experiment = tmp_e
            self.logdir = tmp_l
        else:
            self.experiment = self.experiment+'_version0'
        os.makedirs(self.logdir)
        # keep a copy of the config next to the logs for reproducibility
        shutil.copyfile(config_file, os.path.join(self.logdir, 'config.ini'))

        if self.config['log_visdom']:
            # start plots
            self.vis_env = self.experiment
            self.loss_win = 'loss_win'
            self.vis = Visdom(server=visdom_server, port=visdom_port)
            self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)), win=self.loss_win,
                          opts={'legend': ['train_loss', 'val_loss'],
                                'xlabel': 'epochs', 'ylabel': 'loss'},
                          env=self.vis_env)
            if self.extra_criterion:
                self.extra_loss_win = 'extra_' + self.loss_win
                self.vis.line(X=np.zeros((1, 2)), Y=np.zeros((1, 2)),
                              win=self.extra_loss_win,
                              opts={'legend': ['train_extra_loss', 'val_extra_loss'],
                                    'xlabel': 'epochs', 'ylabel': 'loss'},
                              env=self.vis_env)
            # 'multitask' experiments log three loss components, 'mapnet' two
            self.multiloss_logging = 'multitask' in self.experiment or 'mapnet' in self.experiment
            if self.multiloss_logging:
                self.multi_losses = 'multilosses'
                num = 6 if 'multitask' in self.experiment else 4
                names = ['t_loss_val','q_loss_val', 's_loss_val',
                         't_loss_train','q_loss_train', 's_loss_train']
                if 'mapnet' in self.experiment:
                    names = names[0:2]+names[3:5]  # drop the s_loss columns
                self.vis.line(X=np.zeros((1, num)), Y=np.zeros((1, num)),
                              win=self.multi_losses,
                              opts={'legend': names, 'xlabel': 'epochs',
                                    'ylabel': 'loss'},
                              env=self.vis_env)
            self.lr_win = 'lr_win'
            self.vis.line(X=np.zeros(1), Y=np.zeros(1), win=self.lr_win,
                          opts={'legend': ['learning_rate'], 'xlabel': 'epochs',
                                'ylabel': 'log(lr)'},
                          env=self.vis_env)
            # any learnable parameters of the loss get their own window
            criterion_params = {k: v.item() for k, v in
                                self.train_criterion.named_parameters()}
            self.n_criterion_params = len(criterion_params)
            if self.n_criterion_params:
                self.criterion_param_win = 'cparam_win'
                self.vis.line(X=np.zeros((1, self.n_criterion_params)),
                              Y=np.asarray(list(criterion_params.values()))[
                                  np.newaxis, :],
                              win=self.criterion_param_win, env=self.vis_env,
                              opts={'legend': list(criterion_params.keys()),
                                    'xlabel': 'epochs', 'ylabel': 'value'})

        # redirect stdout so subsequent prints also land in log.txt
        # (presumably Logger.Logger tees to both -- confirm in Logger module)
        logfile = osp.join(self.logdir, 'log.txt')
        stdout = Logger.Logger(logfile)
        print('Logging to {:s}'.format(logfile))
        sys.stdout = stdout

        # log all the command line options
        print('---------------------------------------')
        print('Experiment: {:s}'.format(self.experiment))
        print('Start time: %s'%str(datetime.datetime.now()))
        for k, v in list(self.config.items()):
            print('{:s}: {:s}'.format(k, str(v)))
        print('Using GPU {:s} / {:d}'.format(device if device is not None
                                             else str(torch.cuda.current_device()),
                                             torch.cuda.device_count()))
        for i in range(torch.cuda.device_count()):
            print('Device %d: %s \tCapability: %s'%(i,
                                                    torch.cuda.get_device_name(i),
                                                    torch.cuda.get_device_capability(i)))
        print('---------------------------------------')

        # set random seed
        torch.manual_seed(self.config['seed'])
        if self.config['cuda']:
            torch.cuda.manual_seed(self.config['seed'])

        self.start_epoch = int(0)
        if checkpoint_file:
            if osp.isfile(checkpoint_file):
                # map tensors onto CPU when CUDA is unavailable
                loc_func = None if self.config['cuda'] else lambda storage, loc: storage
                checkpoint = torch.load(checkpoint_file, map_location=loc_func)
                if checkpoint['epoch'] is None:
                    print('WARNING: SAVED EPOCH NOT SPECIFIED IN SAVED FILE - ASSUMING 150')
                    checkpoint['epoch'] = 150
                load_state_dict(self.model, checkpoint['model_state_dict'])
                if resume_optim:
                    self.optimizer.learner.load_state_dict(
                        checkpoint['optim_state_dict'])
                    if self.config['cuda']:
                        # optimizer state tensors must follow the model to GPU
                        for state in optimizer.learner.state.values():
                            for k, v in state.items():
                                if isinstance(v, torch.Tensor):
                                    state[k] = v.cuda()
                    self.start_epoch = checkpoint['epoch']
                    if 'criterion_state_dict' in checkpoint:
                        c_state = checkpoint['criterion_state_dict']
                        # pad parameters missing from the checkpoint with zeros
                        # so load_state_dict does not fail on newer criteria
                        append_dict = {k: torch.Tensor([0.0])
                                       for k, _ in self.train_criterion.named_parameters()
                                       if not k in c_state}
                        c_state.update(append_dict)
                        self.train_criterion.load_state_dict(c_state)
                print('Loaded checkpoint {:s} epoch {:d}'.format(
                    str(checkpoint_file), checkpoint['epoch']))

        self.train_loader = torch.utils.data.DataLoader(train_dataset,
                                                        batch_size=self.config['batch_size'],
                                                        shuffle=self.config['shuffle'],
                                                        num_workers=self.config['num_workers'],
                                                        pin_memory=True,
                                                        collate_fn=safe_collate)
        if self.config['do_val']:
            self.val_loader = torch.utils.data.DataLoader(val_dataset,
                                                          batch_size=self.config['batch_size'],
                                                          shuffle=self.config['shuffle'],
                                                          num_workers=self.config['num_workers'],
                                                          pin_memory=True,
                                                          collate_fn=safe_collate)
        else:
            self.val_loader = None

        # activate GPUs
        if self.config['cuda']:
            self.model.cuda()
            self.train_criterion.cuda()
            if self.extra_criterion:
                self.extra_criterion.cuda()

    def save_checkpoint(self, epoch, final_epoch):
        """Save model/optimizer/criterion state; the final epoch is written to
        final_model.pth.tar, earlier snapshots to epoch_NNN.pth.tar."""
        if epoch >= final_epoch:
            # NOTE(review): .format(epoch) on a placeholder-free string is a
            # no-op; kept as-is for byte-compatibility
            filename = osp.join(self.logdir, 'final_model.pth.tar'.format(epoch))
        else:
            filename = osp.join(self.logdir, 'epoch_{:03d}.pth.tar'.format(epoch))
        checkpoint_dict =\
            {'epoch': epoch, 'model_state_dict': self.model.state_dict(),
             'optim_state_dict': self.optimizer.learner.state_dict(),
             'criterion_state_dict': self.train_criterion.state_dict()}
        torch.save(checkpoint_dict, filename)

    def train_val(self, lstm, dual_target=None):
        """
        Function that does the training and validation
        :param lstm: whether the model is an LSTM
        :return:
        """
        #print("Dual target in train_val: %r"%dual_target)
        for epoch in range(self.start_epoch, self.config['n_epochs']):
            # VALIDATION -- runs every val_freq epochs and on the last epoch
            if self.config['do_val'] and ((epoch % self.config['val_freq'] == 0) or
                                          (epoch == self.config['n_epochs'] - 1)):
                val_batch_time = Logger.AverageMeter()
                val_loss = Logger.AverageMeter()
                if self.extra_criterion:
                    val_extra_loss = Logger.AverageMeter()
                self.model.eval()
                end = time.time()
                val_data_time = Logger.AverageMeter()
                for batch_idx, (data, target) in enumerate(self.val_loader):
                    val_data_time.update(time.time() - end)
                    #print(target[1].size())
                    #a = 1.0/0.0
                    # train=False: forward pass only, no optimizer step
                    kwargs = dict(target=target, criterion=self.train_criterion,
                                  optim=self.optimizer, train=False)
                    if lstm:
                        loss, output = step_lstm(
                            data, self.model, self.config['cuda'], **kwargs)
                    else:
                        loss, output, loss_list = step_feedfwd(data, self.model,
                                                               self.config['cuda'],
                                                               **kwargs)
                    val_loss.update(loss)
                    val_batch_time.update(time.time() - end)
                    if self.extra_criterion:
                        # a list/tuple target means multiple target tensors
                        dual_target = type(target) is list or type(target) is tuple
                        with torch.set_grad_enabled(False):
                            if self.config['cuda']:
                                # NOTE(review): `async` is a reserved word in
                                # Python >= 3.7 -- this code requires an older
                                # Python/torch where .cuda(async=True) meant a
                                # non-blocking transfer; confirm target versions
                                if dual_target:
                                    target = tuple(single_target.cuda(async=True)
                                                   for single_target in target)
                                else:
                                    target = target.cuda(async=True)
                            if dual_target:
                                target_var = tuple(Variable(t, requires_grad=False)
                                                   for t in target)
                            else:
                                target_var = Variable(target, requires_grad=False)
                            extra_loss, _ = self.extra_criterion(output, target_var)
                            extra_loss = extra_loss.item()
                        val_extra_loss.update(extra_loss)
                    if batch_idx % self.config['print_freq'] == 0:
                        print_string = 'Val {:s}: Epoch {:d}\t' \
                                       'Batch {:d}/{:d}\t' \
                                       'Data Time {:.4f} ({:.4f})\t' \
                                       'Batch Time {:.4f} ({:.4f})\t' \
                                       'Loss {:f}\t' \
                            .format(self.experiment, epoch, batch_idx,
                                    len(self.val_loader) - 1, val_data_time.val,
                                    val_data_time.avg, val_batch_time.val,
                                    val_batch_time.avg, loss)
                        if self.extra_criterion:
                            print_string += 'Loss Extra Scale {:f}\t'.format(extra_loss)
                        print(print_string)
                        if self.config['log_visdom']:
                            self.vis.save(envs=[self.vis_env])
                    end = time.time()

                print_string = 'Val {:s}: Epoch {:d}, val_loss {:f}' \
                    .format(self.experiment, epoch, val_loss.avg)
                if self.extra_criterion:
                    print_string += ' val_extra_loss {:f}\t'.format(val_extra_loss.avg)
                print(print_string)
                if self.config['log_visdom']:
                    val_loss_avg = val_loss.avg
                    self.vis.line(X=np.asarray([epoch]), Y=np.asarray([val_loss_avg]),
                                  win=self.loss_win, name='val_loss',
                                  update='append', env=self.vis_env)
                    if self.multiloss_logging:
                        # NOTE(review): loss_list holds the values of the LAST
                        # validation batch only, not an epoch average
                        self.vis.line(X=np.asarray([epoch]),
                                      Y=np.asarray([loss_list[0]]),
                                      win=self.multi_losses, name='t_loss_val',
                                      update='append', env=self.vis_env)
                        self.vis.line(X=np.asarray([epoch]),
                                      Y=np.asarray([loss_list[1]]),
                                      win=self.multi_losses, name='q_loss_val',
                                      update='append', env=self.vis_env)
                        if 'multitask' in self.experiment:
                            self.vis.line(X=np.asarray([epoch]),
                                          Y=np.asarray([loss_list[2]]),
                                          win=self.multi_losses, name='s_loss_val',
                                          update='append', env=self.vis_env)
                    if self.extra_criterion:
                        val_extra_loss_avg = val_extra_loss.avg
                        self.vis.line(X=np.asarray([epoch]),
                                      Y=np.asarray([val_extra_loss_avg]),
                                      win=self.extra_loss_win, name='val_extra_loss',
                                      update='append', env=self.vis_env)
                    self.vis.save(envs=[self.vis_env])

            # SAVE CHECKPOINT -- periodic, plus the last 5 epochs
            if epoch % self.config['snapshot'] == 0 or abs(self.config['n_epochs']-epoch) < 5:
                self.save_checkpoint(epoch, final_epoch=self.config['n_epochs'])
                print('Epoch {:d} checkpoint saved for {:s}'.\
                    format(epoch, self.experiment))

            # ADJUST LR
            lr = self.optimizer.adjust_lr(epoch)
            if self.config['log_visdom']:
                self.vis.line(X=np.asarray([epoch]), Y=np.asarray([np.log10(lr)]),
                              win=self.lr_win, name='learning_rate',
                              update='append', env=self.vis_env)

            # TRAIN
            self.model.train()
            train_data_time = Logger.AverageMeter()
            train_batch_time = Logger.AverageMeter()
            end = time.time()
            for batch_idx, (data, target) in enumerate(self.train_loader):
                train_data_time.update(time.time() - end)
                #print(target[1].size())
                kwargs = dict(target=target, criterion=self.train_criterion,
                              optim=self.optimizer, train=True,
                              max_grad_norm=self.config['max_grad_norm'])
                if lstm:
                    loss, output = step_lstm(
                        data, self.model, self.config['cuda'], **kwargs)
                else:
                    loss, output, loss_list = step_feedfwd(data, self.model,
                                                           self.config['cuda'],
                                                           **kwargs)
                if self.extra_criterion:
                    dual_target = type(target) is list or type(target) is tuple
                    with torch.set_grad_enabled(False):
                        if self.config['cuda']:
                            # see NOTE above about the legacy `async` kwarg
                            if dual_target:
                                target = tuple(single_target.cuda(async=True)
                                               for single_target in target)
                            else:
                                target = target.cuda(async=True)
                        if dual_target:
                            target_var = tuple(Variable(t, requires_grad=False)
                                               for t in target)
                        else:
                            target_var = Variable(target, requires_grad=False)
                        extra_loss, _ = self.extra_criterion(output, target_var)
                        extra_loss = extra_loss.item()
                train_batch_time.update(time.time() - end)
                if batch_idx % self.config['print_freq'] == 0:
                    n_iter = epoch * len(self.train_loader) + batch_idx
                    # fractional epoch used as the x-axis of training plots
                    epoch_count = float(n_iter) / len(self.train_loader)
                    print_string = 'Train {:s}: Epoch {:d}\t' \
                                   'Batch {:d}/{:d}\t' \
                                   'Data Time {:.4f} ({:.4f})\t' \
                                   'Batch Time {:.4f} ({:.4f})\t' \
                                   'Loss {:f}\t' \
                        .format(self.experiment, epoch, batch_idx,
                                len(self.train_loader) - 1, train_data_time.val,
                                train_data_time.avg, train_batch_time.val,
                                train_batch_time.avg, loss)
                    if self.extra_criterion:
                        print_string += 'Loss Extra Scale {:f}\t'.format(extra_loss)
                    print_string += 'lr: {:f}'.format(lr)
                    print(print_string)
                    end = time.time()
                    if self.config['log_visdom']:
                        self.vis.line(X=np.asarray([epoch_count]),
                                      Y=np.asarray([loss]),
                                      win=self.loss_win, name='train_loss',
                                      update='append', env=self.vis_env)
                        if self.multiloss_logging:
                            self.vis.line(X=np.asarray([epoch_count]),
                                          Y=np.asarray([loss_list[0]]),
                                          win=self.multi_losses, name='t_loss_train',
                                          update='append', env=self.vis_env)
                            self.vis.line(X=np.asarray([epoch_count]),
                                          Y=np.asarray([loss_list[1]]),
                                          win=self.multi_losses, name='q_loss_train',
                                          update='append', env=self.vis_env)
                            if 'multitask' in self.experiment:
                                self.vis.line(X=np.asarray([epoch_count]),
                                              Y=np.asarray([loss_list[2]]),
                                              win=self.multi_losses,
                                              name='s_loss_train',
                                              update='append', env=self.vis_env)
                        if self.extra_criterion:
                            self.vis.line(X=np.asarray([epoch_count]),
                                          Y=np.asarray([extra_loss]),
                                          win=self.extra_loss_win,
                                          name='train_extra_loss',
                                          update='append', env=self.vis_env)
                        if self.n_criterion_params:
                            # track learnable loss-parameter values over training
                            for name, v in self.train_criterion.named_parameters():
                                v = v.item()
                                self.vis.line(X=np.asarray([epoch_count]),
                                              Y=np.asarray([v]),
                                              win=self.criterion_param_win,
                                              name=name, update='append',
                                              env=self.vis_env)
                        self.vis.save(envs=[self.vis_env])
                end = time.time()

        # Save final checkpoint
        epoch = self.config['n_epochs']
        self.save_checkpoint(epoch=epoch, final_epoch=epoch)
        print('Epoch {:d} checkpoint saved'.format(epoch))
        if self.config['log_visdom']:
            self.vis.save(envs=[self.vis_env])
def trainer(enTrainSet, cnTrainSet, enTrainLen, cnTrainLen, enDevSet, cnDevSet,
            enDevLen, cnDevLen, enVocabSize, cnVocabSize):
    """Train the GRU seq2seq translation model, with logging, visdom plots
    and per-epoch encoder/decoder checkpoints.

    :param enTrainSet/cnTrainSet: English/Chinese training batches (tensors)
    :param enTrainLen/cnTrainLen: per-batch sequence lengths for the above
    :param enDevSet/cnDevSet: English/Chinese dev-set batches
    :param enDevLen/cnDevLen: per-batch dev-set sequence lengths
    :param enVocabSize/cnVocabSize: vocabulary sizes for encoder/decoder
    """
    # Creating the logging.
    logging.basicConfig(filename=Cfg.logDir + f'/logging-{currentTime}.txt',
                        filemode='a', level=logging.INFO,
                        format='%(asctime)s %(levelname)s %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S %p')
    # Logging the hyper-parameter configuration.
    logging.info(f'''
        Vocabulary Size: {Cfg.vs}
        Hidden Size: {Cfg.hs}
        Learning Rate: {Cfg.lr}
        Adam Beta One: {Cfg.beta1}
        Adam Beta Two: {Cfg.beta2}
        Batch Size: {Cfg.bs}
        Epoches: {Cfg.epoches}
        Random Seed: {Cfg.seed}
        GPU ID: {Cfg.GPUID}
        Model Directory: {Cfg.modelDir}
        Log Directory: {Cfg.logDir}
        Dataset Directory: {Cfg.dataDir}
    ''')
    # Creating the visdom environment and the two dual-trace plots.
    vis = Visdom(env='Seq2SeqGRUModel')
    lossGraph = vis.line(
        X=[0], Y=[0],
        opts=dict(legend=['TrainingLoss', 'EvaluatingLoss'], xlabel='Epoches',
                  ylabel='Loss',
                  title=f'Training and Evaluating Loss - {currentTime}'),
        name='TrainingLoss')
    vis.line(X=[0], Y=[0], win=lossGraph, update='append', name='EvaluatingLoss')
    accGraph = vis.line(
        X=[0], Y=[0],
        opts=dict(legend=['TrainingAcc', 'EvaluatingAcc'], xlabel='Epoches',
                  ylabel='Acc',
                  title=f'Training and Evaluating Acc - {currentTime}'),
        name='TrainingAcc')
    vis.line(X=[0], Y=[0], win=accGraph, update='append', name='EvaluatingAcc')
    # Model components: encoder, decoder, seq2seq wrapper, loss, optimizer.
    encoder = Encoder(enVocabSize, Cfg.hs).to(device)
    decoder = Decoder(cnVocabSize, Cfg.hs).to(device)
    model = Seq2SeqGRUModelNN(encoder, decoder).to(device)
    loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=Cfg.lr,
                           betas=[Cfg.beta1, Cfg.beta2])
    # Per-epoch history for the visdom curves.
    trainLosses = []
    trainAccs = []
    evalLosses = []
    evalAccs = []
    # FIX: create the checkpoint directories up-front instead of the original
    # try/bare-except around torch.save, which swallowed every error (not just
    # a missing directory) and then blindly retried the save.
    os.makedirs(Cfg.modelDir + f'/Encoder-{currentTime}/', exist_ok=True)
    os.makedirs(Cfg.modelDir + f'/Decoder-{currentTime}/', exist_ok=True)
    # Training the model.
    for epoch in range(Cfg.epoches):
        trainLoss = []
        trainAcc = []
        with tqdm(total=len(enTrainSet), desc=f'Epoch {epoch + 1}/{Cfg.epoches}',
                  unit='batches', dynamic_ncols=True) as pbars:
            for i, batch in enumerate(enTrainSet):  # fixed typo: bacth -> batch
                enBatch = batch.to(device)
                enLength = enTrainLen[i]
                # Teacher forcing: decoder input drops the last token,
                # the prediction target drops the first token.
                cnBatchIn = cnTrainSet[i][:, :-1].to(device)
                cnBatchOut = cnTrainSet[i][:, 1:].to(device)
                cnLength = [j - 1 for j in cnTrainLen[i]]
                # Forward, loss, backward, update.
                prediction = model(enBatch, enLength, cnBatchIn, cnLength)
                cost = loss(prediction, cnBatchOut.reshape(-1))
                trainLoss.append(cost.item())
                optimizer.zero_grad()
                cost.backward()
                optimizer.step()
                # Token-level accuracy over the flattened targets.
                accuracy = (torch.argmax(prediction, 1) == cnBatchOut.reshape(-1))
                accuracy = accuracy.sum().float() / len(accuracy)
                trainAcc.append(accuracy.item())
                # Updating the loading bar.
                pbars.update(1)
                pbars.set_postfix_str(' - Train Loss %.4f - Train Acc %.4f' %
                                      (np.mean(trainLoss), np.mean(trainAcc)))
            pbars.close()
        # Evaluating the model (model.eval() disables dropout etc.).
        print('Evaluating...', end=' ')
        evalLoss, evalAcc = evaluator(enDevSet, cnDevSet, enDevLen, cnDevLen,
                                      model.eval(), loss)
        print(' - Eval Loss %.4f - Eval Acc %.4f' % (evalLoss, evalAcc))
        # Storing the training and evaluating information.
        trainLosses.append(np.mean(trainLoss))
        trainAccs.append(np.mean(trainAcc))
        evalLosses.append(evalLoss)
        evalAccs.append(evalAcc)
        logging.info(
            'Epoch [%d/%d] -> Training: Loss [%.4f] - Acc [%.4f] || Evaluating: Loss [%.4f] - Acc [%.4f]'
            % (epoch + 1, Cfg.epoches, np.mean(trainLoss), np.mean(trainAcc),
               evalLoss, evalAcc))
        # Redrawing the full curves each epoch.
        # NOTE(review): visdom's documented update modes are
        # 'append'/'replace'/'remove' -- confirm 'new' behaves as intended.
        vis.line(X=[k for k in range(1, len(trainLosses) + 1)], Y=trainLosses,
                 win=lossGraph, update='new', name='TrainingLoss')
        vis.line(X=[k for k in range(1, len(evalLosses) + 1)], Y=evalLosses,
                 win=lossGraph, update='new', name='EvaluatingLoss')
        vis.line(X=[k for k in range(1, len(trainAccs) + 1)], Y=trainAccs,
                 win=accGraph, update='new', name='TrainingAcc')
        vis.line(X=[k for k in range(1, len(evalAccs) + 1)], Y=evalAccs,
                 win=accGraph, update='new', name='EvaluatingAcc')
        # Saving the model (directories were created before the loop).
        logging.info("Model Saved")
        torch.save(encoder.train().state_dict(),
                   Cfg.modelDir + f'/Encoder-{currentTime}/Seq2SeqEncoder-Epoch{epoch + 1}.pt')
        torch.save(decoder.train().state_dict(),
                   Cfg.modelDir + f'/Decoder-{currentTime}/Seq2SeqDecoder-Epoch{epoch + 1}.pt')
        # Back to training mode for the next epoch.
        model = model.train()
    # Saving the graph.
    vis.save(envs=['Seq2SeqGRUModel'])
class Trainer:
    """GAN trainer with alternating discriminator/generator update bursts.

    D and G are updated in alternating bursts of ``n_critic_D`` and
    ``n_critic_G`` optimizer steps (losses are still computed every
    iteration; only the ``step()`` calls are gated). Running loss averages
    are pushed to visdom every ``interval`` iterations, a checkpoint is
    written each epoch, and ``val`` renders a grid from a fixed noise
    vector for visual comparison across epochs.
    """

    def __init__(self, netG, netD, loader, optimizerD, optimizerG, checkpoint,
                 epochs, output='./outputs', interval=50, n_critic_D=5,
                 n_critic_G=5, device='cuda', resume=False,
                 server='http://192.168.1.121', port=9999, env='GAN'):
        """
        :param netG/netD: generator / discriminator modules (netG.nz = latent dim)
        :param loader: DataLoader yielding batches of real images
        :param optimizerD/optimizerG: optimizers for D and G
        :param checkpoint: checkpoint file path (loaded when resume=True)
        :param epochs: total number of training epochs
        :param output: sample-image directory -- wiped and recreated each run!
        :param interval: iterations between visdom plot updates
        :param n_critic_D/n_critic_G: lengths of the D / G update bursts
        :param resume: load netD/netG weights from `checkpoint` before training
        :param server/port/env: visdom connection settings
        """
        self.netG, self.netD = netG, netD
        self.loader = loader
        self.N_batch = len(loader)
        self.optimizerD, self.optimizerG = optimizerD, optimizerG
        self.checkpoint = checkpoint
        self.epochs = epochs
        self.device = torch.device(device)
        self.resume = resume
        if resume:
            if not os.path.exists(checkpoint):
                raise NameError('[%s] not exist' % checkpoint)
            cp = torch.load(checkpoint)
            self.netD.load_state_dict(cp['netD'])
            self.netG.load_state_dict(cp['netG'])
        self.viz = Visdom(server=server, port=port, env=env)
        self.env = env
        self.plotter = LinePlotter(self.viz)
        self.criterion = nn.BCELoss()
        # fixed noise so per-epoch sample grids are comparable
        self.fixed_noise = torch.randn(16, self.netG.nz, device=self.device)
        self.dir_output = output
        # always start from a clean output directory
        if os.path.exists(self.dir_output):
            shutil.rmtree(self.dir_output)
        os.mkdir(self.dir_output)
        self.interval = interval
        self.iters = 0
        self.image_list = []
        self.n_critic_D = n_critic_D
        self.n_critic_G = n_critic_G
        # countdown counters for the alternating schedule;
        # -10000 is a sentinel meaning "this network is currently frozen"
        self.count_D = self.n_critic_D
        self.count_G = self.n_critic_G

    def train(self, epoch):
        """Run one training epoch, plot running averages, save a checkpoint."""
        self.netD.train()
        self.netG.train()
        N = len(self.loader)
        # running sums, flushed to visdom every `interval` iterations
        Loss_D, Loss_G, Loss_D_x, Loss_D_G_z1, Loss_D_G_z2 = 0., 0., 0., 0., 0.
        local_iter = 0
        pbar = tqdm(enumerate(self.loader))
        for idx, real_x in pbar:
            real_x = real_x.to(self.device)
            n = real_x.size(0)
            # NOTE(review): newer torch infers an integer dtype from fill
            # value 1/0 here while BCELoss needs float -- confirm the torch
            # version this targets
            real_y = torch.full((n, 1), 1, device=self.device, requires_grad=False)
            fake_y = torch.full((n, 1), 0, device=self.device, requires_grad=False)
            noise = torch.randn(n, self.netG.nz, device=self.device,
                                requires_grad=False)
            fake_x = self.netG(noise)

            # ---- Discriminator ----
            self.netD.zero_grad()
            # real batch
            output = self.netD(real_x)
            errD_real = self.criterion(output, real_y)
            D_x = output.mean().item()
            # fake batch (detached so no gradient reaches G here)
            output = self.netD(fake_x.detach())
            errD_fake = self.criterion(output, fake_y)
            D_G_z1 = output.mean().item()
            errD = (errD_real + errD_fake) / 2.
            errD.backward()
            # apply the D step only while its burst counter is positive
            self.count_D -= 1
            if self.count_D >= 0:
                self.optimizerD.step()
                self.count_G = -10000  # freeze G during D's burst
            elif self.count_D != -10000 - 1:
                self.count_G = self.n_critic_G  # D burst over: unfreeze G

            # ---- Generator ----
            self.netG.zero_grad()
            output = self.netD(fake_x)
            errG = self.criterion(output, real_y)  # G wants D to output "real"
            D_G_z2 = output.mean().item()
            errG.backward()
            self.count_G -= 1
            if self.count_G >= 0:
                self.optimizerG.step()
                self.count_D = -10000  # freeze D during G's burst
            elif self.count_G != -10000 - 1:
                self.count_D = self.n_critic_D  # G burst over: unfreeze D

            pbar.set_description(
                '[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                % (epoch, self.epochs, idx, N, errD.item(), errG.item(),
                   D_x, D_G_z1, D_G_z2))

            Loss_D += errD.item()
            Loss_G += errG.item()
            Loss_D_x += D_x
            Loss_D_G_z1 += D_G_z1
            Loss_D_G_z2 += D_G_z2
            local_iter += 1
            self.iters += 1
            if self.iters % self.interval == 0:
                self.plotter.plot('Loss', 'D', 'Loss', self.iters, Loss_D / local_iter)
                self.plotter.plot('Loss', 'G', 'Loss', self.iters, Loss_G / local_iter)
                self.plotter.plot('Error', 'real', 'Error', self.iters,
                                  Loss_D_x / local_iter)
                self.plotter.plot('Error', 'Before', 'Error', self.iters,
                                  Loss_D_G_z1 / local_iter)
                self.plotter.plot('Error', 'After', 'Error', self.iters,
                                  Loss_D_G_z2 / local_iter)
                # BUG FIX: reset the SAME accumulators that are summed above.
                # The original assigned to misspelled names (Loss_D_z1 /
                # Loss_D_z2), so Loss_D_G_z1/Loss_D_G_z2 were never reset and
                # the plotted D(G(z)) averages drifted upward over time.
                Loss_D, Loss_G, Loss_D_x, Loss_D_G_z1, Loss_D_G_z2 = 0., 0., 0., 0., 0.
                local_iter = 0
        # save a checkpoint every epoch
        state = {
            'netD': self.netD.state_dict(),
            'netG': self.netG.state_dict(),
            'epoch': epoch
        }
        torch.save(state, self.checkpoint)

    def val(self, epoch):
        """Render a sample grid from the fixed noise; push to visdom and disk."""
        self.netG.eval()
        with torch.no_grad():
            fake = self.netG(self.fixed_noise).detach().cpu()
            images = make_grid(fake, padding=2, normalize=True)
            images = images.numpy() * 255
            images = images.astype(np.uint8)
            self.viz.image(images, env=self.env,
                           opts=dict(title='Output - %d' % epoch, ))
            img = Image.fromarray(np.transpose(images, (1, 2, 0)))
            img.save(os.path.join(self.dir_output, '%d.png' % epoch))

    def run(self):
        """Train for `epochs` epochs, validating each epoch, then save the env."""
        for epoch in range(self.epochs):
            self.train(epoch)
            self.val(epoch)
        self.viz.save([self.env])
class Trainer:
    """Conditional GAN trainer: D and G are conditioned on attribute tag
    vectors and updated in alternating bursts of n_critic_D / n_critic_G
    steps, with visdom loss plotting and per-epoch checkpoints."""

    def __init__(self, netG, netD, loader, optimizerD, optimizerG, checkpoint,
                 epochs, output='./outputs', interval=50, n_critic_D=5,
                 n_critic_G=5, device='cuda', resume=False,
                 server='http://192.168.1.121', port=9999, env='GAN'):
        """
        :param netG/netD: conditional generator / discriminator modules
        :param loader: yields (real_x, real_c, fake_c) -- image, matching tag
            vector, mismatched tag vector (presumably; confirm in the dataset)
        :param optimizerD/optimizerG: optimizers for D and G
        :param checkpoint: checkpoint file path (loaded when resume=True)
        :param epochs: total number of training epochs
        :param output: sample-image directory -- wiped and recreated each run
        :param interval: iterations between visdom plot updates
        :param n_critic_D/n_critic_G: lengths of the D / G update bursts
        :param resume: load netD/netG weights from `checkpoint` before training
        :param server/port/env: visdom connection settings
        """
        self.netG, self.netD = netG, netD
        self.loader = loader
        self.N_batch = len(loader)
        self.optimizerD, self.optimizerG = optimizerD, optimizerG
        self.checkpoint = checkpoint
        self.epochs = epochs
        self.device = torch.device(device)
        self.resume = resume
        if resume:
            if os.path.exists(checkpoint) is not True:
                raise NameError('[%s] not exist' % checkpoint)
            cp = torch.load(checkpoint)
            self.netD.load_state_dict(cp['netD'])
            self.netG.load_state_dict(cp['netG'])
        self.viz = Visdom(server=server, port=port, env=env)
        self.env = env
        self.plotter = LinePlotter(self.viz)
        self.criterion = nn.BCELoss()
        # fixed noise so per-epoch sample grids are comparable
        self.fixed_noise = torch.randn(16, self.netG.nz, device=self.device)
        # fixed condition vector: red hair, red eyes
        # (translated from: 红头发红眼睛)
        t = [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
             0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]
        t = t * 16  # replicate the 22-dim one-hot pair for all 16 samples
        self.fixed_c = torch.Tensor(t).to(self.device).view(16, -1)
        self.dir_output = output
        # always start from a clean output directory
        if os.path.exists(self.dir_output) is True:
            shutil.rmtree(self.dir_output)
        os.mkdir(self.dir_output)
        self.interval = interval
        self.iters = 0
        self.image_list = []
        self.n_critic_D = n_critic_D
        self.n_critic_G = n_critic_G
        # countdown counters for the alternating schedule;
        # -10000 is a sentinel meaning "this network is currently frozen"
        self.count_D = self.n_critic_D
        self.count_G = self.n_critic_G

    def train(self, epoch):
        """Run one epoch of conditional GAN training and save a checkpoint."""
        self.netD.train()
        self.netG.train()
        N = len(self.loader)
        # running loss sums, flushed to visdom every `interval` iterations
        LossD, LossG = 0., 0.
        local_iter = 0
        pbar = tqdm(enumerate(self.loader))
        for idx, (real_x, real_c, fake_c) in pbar:
            # move this batch's image and tag tensors to the device
            # (translated from: 整理该batch的头)
            real_x = real_x.to(self.device)
            real_c = real_c.to(self.device)
            fake_c = fake_c.to(self.device)
            n = real_x.size(0)
            real_y = torch.full((n, 1), 1, device=self.device, requires_grad=False)
            fake_y = torch.full((n, 1), 0, device=self.device, requires_grad=False)
            noise = torch.randn(n, self.netG.nz, device=self.device,
                                requires_grad=False)
            fake_x_realc_G = self.netG(noise, real_c)
            # detached copy for the D passes so no gradient reaches G there
            fake_x_realc_D = fake_x_realc_G.detach()
            # Discriminator
            self.netD.zero_grad()
            # Real x, real c -> D should classify as real
            output_realistic = self.netD(real_x, real_c)
            errD_realx_realc_realistic = self.criterion(output_realistic, real_y)
            errD_realx_realc = errD_realx_realc_realistic
            # Real x, fake c -> mismatched condition, D should classify as fake
            output_realistic = self.netD(real_x, fake_c)
            errD_realx_fakec_realistic = self.criterion(output_realistic, fake_y)
            errD_realx_fakec = errD_realx_fakec_realistic
            # fake x (generated with real c), real c D -> D should classify as fake
            output_realistic = self.netD(fake_x_realc_D, real_c)
            errD_fakex_realc_realc_realistic = self.criterion(output_realistic, fake_y)
            errD_fakex_realc_realc = errD_fakex_realc_realc_realistic
            errD = errD_realx_realc + errD_realx_fakec + errD_fakex_realc_realc
            errD.backward()
            # apply the D step only while its burst counter is positive
            self.count_D -= 1
            if self.count_D >= 0:
                self.optimizerD.step()
                self.count_G = -10000  # freeze G during D's burst
            elif self.count_D != -10000 - 1:
                self.count_G = self.n_critic_G  # D burst over: unfreeze G
            # Generator
            self.netG.zero_grad()
            # fake x, real c -> G wants D to classify as real
            output_realistic = self.netD(fake_x_realc_G, real_c)
            errG_realc_realc_realistic = self.criterion(output_realistic, real_y)
            errG_realc_realc = errG_realc_realc_realistic
            errG = errG_realc_realc
            errG.backward()
            self.count_G -= 1
            if self.count_G >= 0:
                self.optimizerG.step()
                self.count_D = -10000  # freeze D during G's burst
            elif self.count_G != -10000 - 1:
                self.count_D = self.n_critic_D  # G burst over: unfreeze D
            # scalar copies for display/accumulation
            errD = errD.item()
            errG = errG.item()
            pbar.set_description('[%d/%d][%d/%d]\terrD: %.4f\terrG: %.4f' %
                                 (epoch, self.epochs, idx, N, errD, errG))
            LossD += errD
            LossG += errG
            local_iter += 1
            self.iters += 1
            if self.iters % self.interval == 0:
                self.plotter.plot('Loss', 'D', 'Loss', self.iters, LossD / local_iter)
                self.plotter.plot('Loss', 'G', 'Loss', self.iters, LossG / local_iter)
                LossD, LossG = 0., 0.
                local_iter = 0
        # save a checkpoint every epoch (translated from: 每一个epoch都保存)
        state = {
            'netD': self.netD.state_dict(),
            'netG': self.netG.state_dict(),
            'epoch': epoch
        }
        torch.save(state, self.checkpoint)

    def val(self, epoch):
        """Generate a sample grid from the fixed noise/condition pair; push
        it to visdom and save it to the output directory."""
        self.netG.eval()
        with torch.no_grad():
            fake = self.netG(self.fixed_noise, self.fixed_c).detach().cpu()
            images = make_grid(fake, padding=2, normalize=True)
            images = images.numpy() * 255
            images = images.astype(np.uint8)
            self.viz.image(images, env=self.env, opts=dict(
                title='Output - %d' % epoch,
            ))
            img = Image.fromarray(np.transpose(images, (1, 2, 0)))
            img.save(os.path.join(self.dir_output, '%d.png' % epoch))

    def run(self):
        """Full loop: train + val each epoch, then persist the visdom env."""
        for epoch in range(self.epochs):
            self.train(epoch)
            self.val(epoch)
        self.viz.save([self.env])
((epoch + 1), loss_D.item(), loss_GE.item(), loss_info.item())) if (epoch+1)%10 == 0: G.eval() D.eval() E.eval() """ G network """ C = torch.cat([fixed_c, fixed_class],1) valid_x_fake = G(fixed_z, fixed_c, FG.axis) # valid_x_fake = G(fixed_z, C, FG.axis) fake = (valid_x_fake*0.5)+0.5 printers['fake']('fake', fake[0,:,:,:]) if ((epoch+1) % 50 == 0): saver = Image3D(vis, 'output_'+str(epoch+1)) saver('output_'+str(epoch+1), fake[0,:,:,:]) valid_scores.clear() if ((epoch+1) % 10 == 0): with torch.no_grad(): torch.save(D.state_dict(), '%s/D_%d.pth' % (save_dir, epoch+1)) torch.save(G.state_dict(), '%s/G_%d.pth' % (save_dir, epoch+1)) torch.save(E.state_dict(), '%s/E_%d.pth' % (save_dir, epoch+1)) # torch.save(D.state_dict(), os.path.join(save_dir, 'D.pth')) # torch.save(G.state_dict(), os.path.join(save_dir, 'G.pth')) # torch.save(E.state_dict(), os.path.join(save_dir, 'E.pth')) timer.toc() print('Time elapse {}h {}m {}s'.format(*timer.total())) vis.save([vis.env]) time.sleep(0.5)
# Plot the pre-update predictions against the target sinusoid, then the
# predictions after each inner-loop adaptation step (MAML-style evaluation).
predictions = f(params, xrange_inputs)
win_inference = viz.line(Y=targets, X=xrange_inputs, name='target',
                         opts=dict(title='sinusoid inference',
                                   xlabel='x',
                                   ylabel='y',
                                   )
                         )
viz.line(Y=predictions, X=xrange_inputs, win=win_inference, update='append',
         name='pre-update predictions')

# Sample a fresh support set from the task: x ~ U(-5, 5), y = sin(x)
# (amplitude 1, phase 0 written explicitly).
x1 = onp.random.uniform(low=-5., high=5., size=(args.n_support, 1))
y1 = 1. * onp.sin(x1 + 0.)

for i in range(1, args.n_inner_step + 1):
    # one gradient step at a time so every intermediate fit can be plotted
    params, _ = inner_optimization(params, x1, y1, n_inner_step=1)
    predictions = f(params, xrange_inputs)
    viz.line(Y=predictions, X=xrange_inputs, win=win_inference, update='append',
             name=f'{i}-step predictions')

# Re-send the final trace with the full legend attached (`i` still holds
# args.n_inner_step from the loop above).
viz.line(Y=predictions, X=xrange_inputs, win=win_inference, update='replace',
         name=f'{i}-step predictions',
         opts=dict(legend=['target', 'pre-update predictions'] +
                   [f'{i}-step predictions'
                    for i in range(1, args.n_inner_step + 1)]))

# serialize the training log as a .npy file
np_dir = os.path.join(args.log_dir, 'np')
os.makedirs(np_dir, exist_ok=True)
onp.save(file=os.path.join(np_dir, f'log'), arr=log)

# serialize visdom envs
viz.save(viz.get_env_list())
def train(optimizer_options, data_options, logger_options, model_options, scheduler_options): #, results_path=None): torch.manual_seed(42) np.random.seed(42) vis = Visdom(env=logger_options['vislogger_env'], port=logger_options['vislogger_port']) device = torch.device(optimizer_options['device']) epochs = optimizer_options['epochs'] ## ======================================= Early Stop ======================================= ## early_stop = False if not (optimizer_options['early_stopping'] == ""): #['min', '0.01', '21'] mode = optimizer_options['early_stopping'][0] min_delta = float(optimizer_options['early_stopping'][1]) patience = int(optimizer_options['early_stopping'][2]) early_stopping = EarlyStopping(mode=mode, min_delta=min_delta, patience=patience) ## ======================================= Early Stop ======================================= ## ## ======================================= Scheduler ======================================= ## scheduler = LRSchedulerWithRestart_V2( scheduler_type=scheduler_options['scheduler'], n_restarts=scheduler_options['n_restarts'], n_lr_updates=scheduler_options['n_param_updates'], restart_factor=scheduler_options['restart_factor'], init_lr_factor=scheduler_options['init_lr_factor'], eta_min=scheduler_options['eta_min'], vis=vis) ## ======================================= Scheduler ======================================= ## ## ======================================= Save model ======================================= ## if (logger_options['save_model'] == ""): model_checkpoint = ModelCheckpoint() else: suffix = optimizer_options['optimizer'] + "_" + str( optimizer_options['learning_rate']) model_checkpoint = ModelCheckpoint( save_model=True, save_path=logger_options['save_model'], use_loss=True, suffix=suffix) ####### FILL PARAMTERS!!!! 
## ======================================= Save model ======================================= ## ## ======================================= Data ======================================= ## # image_transform = Compose([Resize(data_options['image_size'])]) # image_transform = Compose([Resize(data_options['image_size']), ToTensor()]) image_transform = Compose([ Resize(data_options['image_size']), ToTensor(), Normalize(mean=[0.3610, 0.2131, 0.2324], std=[0.0624, 0.0463, 0.0668]) ]) kfoldWorkflowSet = kFoldWorkflowSplit( data_options['base_path'], image_transform=image_transform, video_extn='.avi', shuffle=True, n_folds=data_options['n_folds'], num_phases=14, batch_size=data_options['batch_size'], num_workers=data_options['n_threads']) ## ======================================= Data ======================================= ## nfolds_training_loss_avg = CumulativeMovingAvgStd() nfolds_validation_loss_avg = CumulativeMovingAvgStd() folds_pbar = ProgressBar(kfoldWorkflowSet, desc="Folds") for iFold, ( train_loader, val_loader) in enumerate(folds_pbar): #= next(kfoldWorkflowSet) ## ======================================= Create Plot ======================================= ## create_plot_window(vis, "Epochs+Iterations", "CE Loss", "Training loss Fold " + str(iFold + 1), tag='Training_Loss_Fold_' + str(iFold + 1), name='Training Loss Fold ' + str(iFold + 1)) create_plot_window(vis, "Epochs+Iterations", "CE Loss", "Validation loss Fold " + str(iFold + 1), tag='Validation_Loss_Fold_' + str(iFold + 1), name='Validation Loss Fold ' + str(iFold + 1)) ## ======================================= Create Plot ======================================= ## ## ======================================= Model ======================================= ## # TODO: Pass 'models.resnet50' as string model = ResFeatureExtractor(pretrained_model=models.resnet101, device=device) ## ======================================= Model ======================================= ## ### ============================== 
Parts of Training step ============================== ### criterion_CE = nn.CrossEntropyLoss().to(device) optimizer = get_optimizer(model.parameters(), optimizer_options) cycle_length = scheduler_options['cycle_length'] if scheduler_options[ 'cycle_length'] > 0 else len(train_loader) scheduler(optimizer, cycle_length) trainer = Engine(model, optimizer, criterion_CE, scheduler, train_loader, optimizer_options["accumulate_count"], device) evaluator = Engine(model, None, criterion_CE, None, val_loader, 0, device, False) ### ============================== Parts of Training step ============================== ### epoch_pbar = ProgressBar(range(epochs), desc="Epochs") #tqdm(range(epochs)) epoch_training_avg_loss = CumulativeMovingAvgStd() epoch_validation_avg_loss = CumulativeMovingAvgStd() epoch_msg_dict = {} for epoch in epoch_pbar: iteration_pbar = ProgressBar( train_loader, desc="Iteration", pb_len=optimizer_options['max_iterations']) max_iterations = iteration_pbar.total for iteration, (images, phase_annotations) in enumerate(iteration_pbar): ### ============================== Training ============================== ### train_loss, train_accuracy = trainer( images.to(device=device), phase_annotations.to(device=device)) epoch_training_avg_loss.update(train_loss) epoch_msg_dict['ATL'] = epoch_training_avg_loss.get_value()[0] ### ============================== Training ============================== ### ### ============================== Validation ============================== ### if (optimizer_options["validation_interval"] > 0): if ((iteration + 1) % optimizer_options["validation_interval"] == 0) and (iteration > 0): validation_loss, validation_accuracy = predict( evaluator, optimizer_options['max_valid_iterations'], device, vis) epoch_validation_avg_loss.update(validation_loss) vis.line(X=np.array( [epoch + (iteration / iteration_pbar.total)]), Y=np.array([ epoch_validation_avg_loss.get_value()[0] ]), update='append', win='Validation_Loss_Fold_' + str(iFold + 1), 
name='Validation Loss Fold ' + str(iFold + 1)) epoch_msg_dict[ 'AVL'] = epoch_validation_avg_loss.get_value()[0] model_checkpoint.step( curr_loss=epoch_validation_avg_loss.get_value()[0], model=model, suffix='_Fold_' + str(iFold)) ### ============================== Validation ============================== ### ### ============================== Plot ============================== ### if (iteration % logger_options["vislogger_interval"] == 0): vis.line(X=np.array( [epoch + (iteration / iteration_pbar.total)]), Y=np.array( [epoch_training_avg_loss.get_value()[0]]), update='append', win='Training_Loss_Fold_' + str(iFold + 1), name='Training Loss Fold ' + str(iFold + 1)) ### ============================== Plot ============================== ### if early_stop: loader_pbar.close() print( "\n==========================\nEarly stop\n==========================\n" ) break folds_pbar.update_message(msg_dict=epoch_msg_dict) if iteration == max_iterations: iteration_pbar.close() break ### ============================== Validation ============================== ### if (optimizer_options["validation_interval_epochs"] > 0): if ((epoch + 1) % optimizer_options["validation_interval_epochs"] == 0): validation_loss, validation_accuracy = predict( evaluator, optimizer_options['max_valid_iterations'], device, vis) epoch_validation_avg_loss.update(validation_loss) vis.line(X=np.array( [epoch + (iteration / iteration_pbar.total)]), Y=np.array( [epoch_validation_avg_loss.get_value()[0]]), update='append', win='Validation_Loss_Fold_' + str(iFold + 1), name='Validation Loss Fold ' + str(iFold + 1)) epoch_msg_dict[ 'AVL'] = epoch_validation_avg_loss.get_value()[0] folds_pbar.update_message(msg_dict=epoch_msg_dict) ### ============================== Validation ============================== ### ### ============================== Save model ============================== ### model_checkpoint.step( curr_loss=epoch_validation_avg_loss.get_value()[0], model=model, suffix='_Fold_' + str(iFold)) 
vis.save([logger_options['vislogger_env']]) ### ============================== Save model ============================== ### if early_stop: epoch_pbar.close() break # torch.cuda.empty_cache() print( "\n\n\n\n=================================== DONE ===================================\n\n" )
def main(): torch.set_num_threads(1) if args.vis: from visdom import Visdom viz = Visdom(port=args.port) win = None # Environment stuffs envs = [] for i in range(args.num_processes): if args.scene_dir: scene_dir = os.path.join(args.scene_dir, "seed{}".format(args.seed + i)) assert os.path.exists(scene_dir) else: scene_dir = None envs.append( make_env(args.env_name, args.seed, i, log_path, args.add_timestep, scene_dir)) # Hack infomation of gym environment tmp_env = envs[0]() sensor_type = tmp_env.unwrapped.hp_sensing_mode num_agent = tmp_env.unwrapped.hp_uav_n dim = tmp_env.unwrapped.hp_dim # Shape of o_env for each agent, required by the observation feature extraction module of the model if sensor_type == "lidar": atom_o_env_shape = tmp_env.unwrapped.hp_lidar_n + dim elif sensor_type == "pos": atom_o_env_shape = (dim + 1) * tmp_env.unwrapped.hp_n_nearest_obs else: raise Exception( "No implementation for sensing mode {}".format(sensor_type)) if args.num_processes > 1: envs = SubprocVecEnv(envs) else: envs = DummyVecEnv(envs) if len(envs.observation_space.shape) == 1: if not args.unordered: envs = VecNormalize( envs, gamma=args.gamma ) # Different observation normalization factors for different agents else: envs = VecNormalize(envs, gamma=args.gamma, num_agent=num_agent) num_subagents = num_agent if args.indep else 1 # The way you view the robot team (i.e., a virtual structure or many robots) obs_shape = envs.observation_space.shape atom_obs_shape = (obs_shape[0] // num_subagents * args.num_stack, *obs_shape[1:]) # Shape for each logical agent action_shape = envs.action_space.shape atom_action_shape = (action_shape[0] // num_subagents, *action_shape[1:]) # Agent stuffs (core elements of PPO) if args.load_dir: # Resume from breakpoint print("Loading model parameters from: " + args.load_dir) actor_critic, ob_rms, ret_rms = torch.load(args.load_dir) assert envs.ob_rms.mean.shape == ob_rms.mean.shape, "Mismatched observation shape, which may be induced by wrong flags 
(e.g., --unordered / --num_stack)" envs.ob_rms = ob_rms envs.ret_rms = ret_rms else: actor_critic = Policy(atom_obs_shape, atom_action_shape, sensor_type, atom_o_env_shape, dim, num_agent, args.unordered, args.indep, args.sigmoid, args.share, args.no_rnn) if args.cuda: actor_critic.cuda() agent = algo.PPO(actor_critic, args.clip_param, args.ppo_epoch, args.num_mini_batch, args.value_loss_coef, args.entropy_coef, lr=args.lr, eps=args.eps, max_grad_norm=args.max_grad_norm) rollouts = [ RolloutStorage(args.num_steps, args.num_processes, atom_obs_shape, atom_action_shape, actor_critic.state_size) for _ in range(num_subagents) ] # Auxiliary stuffs current_obs = [ torch.zeros(args.num_processes, *atom_obs_shape) for _ in range(num_subagents) ] # Stack sequent observations to get current_obs, using the trick of reshaping. # # current_obs # Index |1 |2 |3 # Observation |a1 a2 a3 |b1 b2 b3 |c1 c2 c3 def update_current_obs(obs, idx): nonlocal current_obs shape_dim0 = atom_obs_shape[0] // args.num_stack obs = torch.from_numpy(obs).float() if args.num_stack > 1: current_obs[idx][:, :-shape_dim0] = current_obs[idx][:, shape_dim0:] current_obs[idx][:, -shape_dim0:] = obs obs = envs.reset() for i in range(num_subagents): update_current_obs( obs[:, i * atom_obs_shape[0]:(i + 1) * atom_obs_shape[0]], i) rollouts[i].observations[0].copy_(current_obs[i]) # These variables are used to compute average rewards for all processes. episode_rewards = torch.zeros([args.num_processes, 1]) final_rewards = torch.zeros([args.num_processes, 1]) if args.cuda: for i in range(num_subagents): current_obs[i] = current_obs[i].cuda() rollouts[i].cuda() # Main loop train_start = datetime.datetime.now() print("Training starts at: {}".format(train_start)) env_time = 0. # time cost of interaction with environment env_compute_time = 0. env_step_time = 0. env_rollout_time = 0. update_time = 0. # time cost of updating parameters log_time = 0. 
# time cost of logging for j in range(num_updates): # Interact with the environment start_env_time = time.time() # Timer for step in range(args.num_steps): start_env_compute_time = time.time() # Sample actions with torch.no_grad(): l_value, l_action, l_action_log_prob, l_states = [], [], [], [] for i in range(num_subagents): value, action, action_log_prob, states = actor_critic.act( rollouts[i].observations[step], rollouts[i].states[step], rollouts[i].masks[step]) l_value.append(value) l_action.append(action) l_action_log_prob.append(action_log_prob) l_states.append(states) action = torch.cat(l_action, dim=1) cpu_actions = action.squeeze(1).cpu().numpy() env_compute_time += time.time() - start_env_compute_time start_env_step_time = time.time() obs, reward, done, info = envs.step(cpu_actions) env_step_time += time.time() - start_env_step_time start_env_rollout_time = time.time() reward = torch.from_numpy(np.expand_dims(np.stack(reward), 1)).float() episode_rewards += reward # If done then clean the history of observations. masks = torch.FloatTensor([[0.0] if done_ else [1.0] for done_ in done]) # final_rewards is the accumulated reward of the last trajectory, episode_rewards is an auxuliary variable. # The motivation is to enable logging in arbitrary time step. 
final_rewards *= masks final_rewards += ( 1 - masks ) * episode_rewards # If not done, mask=1, final_rewards doesn't change episode_rewards *= masks if args.cuda: masks = masks.cuda() for i in range(num_subagents): current_obs[i] *= masks # Useful when args.num_stack > 1 update_current_obs( obs[:, i * atom_obs_shape[0]:(i + 1) * atom_obs_shape[0]], i) rollouts[i].insert(current_obs[i], l_states[i], l_action[i], l_action_log_prob[i], l_value[i], reward, masks) env_rollout_time += time.time() - start_env_rollout_time env_time += time.time() - start_env_time # Update parameters start_update_time = time.time() # Timer for i in range(num_subagents): with torch.no_grad(): next_value = actor_critic.get_value( rollouts[i].observations[-1], rollouts[i].states[-1], rollouts[i].masks[-1]).detach() rollouts[i].compute_returns(next_value, args.use_gae, args.gamma, args.tau) value_loss, action_loss, dist_entropy = agent.update(rollouts[i]) rollouts[i].after_update() update_time += time.time() - start_update_time # Logging start_log_time = time.time() # Timer # Save models if j % args.save_interval == 0 or j == num_updates - 1: # A really ugly way to save a model to CPU save_model = actor_critic if args.cuda: save_model = copy.deepcopy(actor_critic).cpu() save_model = [ save_model, hasattr(envs, 'ob_rms') and envs.ob_rms or None, hasattr(envs, 'ret_rms') and envs.ret_rms or None ] torch.save(save_model, os.path.join(model_path, "model" + str(j) + ".pt")) # For logging training information if j % args.log_interval == 0 or j == num_updates - 1: log_env_time = [] for i, info_i in enumerate(info): log_reset_i = " Average reset time for env{}: {:.1f}ms = {:.1f}h / {}".format( i, info_i['reset_time'] * 1000 / info_i['reset_num'], info_i['reset_time'] / 3600, info_i['reset_num']) log_step_i = " Average step time for env{}: {:.1f}ms = {:.1f}h / {}".format( i, info_i['step_time'] * 1000 / info_i['step_num'], info_i['step_time'] / 3600, info_i['step_num']) log_env_time.append(log_reset_i) 
log_env_time.append(log_step_i) log_env_time = '\n'.join(log_env_time) current_time = datetime.datetime.now() summary = '\n'.join([ "Training starts at: {}".format(train_start), "Current time: {}".format(current_time), "Elapsed time: {}".format(current_time - train_start), " Environment interaction: {:.1f}h".format( env_time / 3600), " Compute action: {:.1f}h".format( env_compute_time / 3600), " Rollout: {:.1f}h".format(env_rollout_time / 3600), " Interaction with gym: {:.1f}h".format( env_step_time / 3600), log_env_time, " Parameters update: {:.1f}h".format(update_time / 3600), " logging: {:.1f}h".format(log_time / 3600) ]) + '\n' # Write down summary of the training with open(os.path.join(root_path, "summary.txt"), 'w') as f: f.write(summary) # For Visdom visualization if args.vis and (j % args.vis_interval == 0 or j == num_updates - 1): # Sometimes monitor doesn't properly flush the outputs win = visdom_plot(viz, win, args.vis_env, log_path, title, args.algo, args.num_frames, save_dir=root_path) viz.save([args.vis_env]) log_time += time.time() - start_log_time print(summary)
def trainer(trainSet, devSet): # Getting the current time. currentTime = time.strftime('%Y-%m-%d-%H-%M-%S') # Setting the logging. logging.basicConfig(filename=Cfg.logDir + f'/logging-{currentTime}.txt', filemode='a', level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S %p') # Logging the information. logging.info(f''' Vocabulary Size: {Cfg.vs} Embedding Size: {Cfg.es} Hidden Size: {Cfg.hs} Batch Size: {Cfg.bs} Learning Rate: {Cfg.lr} Epoches: {Cfg.epoches} Random Seed: {Cfg.seed} GPU ID: {Cfg.GPUID} Model Directory: {Cfg.modelDir} Log Directory: {Cfg.logDir} Data Directory: {Cfg.dataDir} ''') # Setting the visdom. vis = Visdom(env='LanguageModel') # Setting the graph. lossGraph = vis.line( X=[0], Y=[0], opts=dict(legend=['TrainingLoss', 'EvaluatingLoss'], xlabel='Epoches', ylabel='Loss', title=f'Training and Evaluating Loss for {currentTime}'), name='TrainingLoss') vis.line(X=[0], Y=[0], win=lossGraph, update='append', name='EvaluatingLoss') accGraph = vis.line( X=[0], Y=[0], opts=dict(legend=['TrainingAcc', 'EvaluatingAcc'], xlabel='Epoches', ylabel='Acc', title=f'Training and Evaluating Acc for {currentTime}'), name='TrainingAcc') vis.line(X=[0], Y=[0], win=accGraph, update='append', name='EvaluatingAcc') # Setting the list for training loss. trainLosses = [] # Setting the list for training accuracy. trainAccs = [] # Setting the list for evaluating loss. evalLosses = [] # Setting the list for evaluating accuracy. evalAccs = [] # Creating the model. model = LanguageModelNN(Cfg.vs, Cfg.es, Cfg.hs) # Sending the model into the specific device. model = model.to(device) # Creating the loss function. loss = nn.CrossEntropyLoss() # Creating the optimizer. optimizer = optim.Adam(model.parameters(), lr=Cfg.lr) # Training the model. for epoch in range(Cfg.epoches): # Initializing the training loss. trainLoss = [] # Initializing the training accuracy. trainAcc = [] # Initializing the hidden. 
hidden = model.initHidden(Cfg.bs, Cfg.hs) # Setting the loading bar. with tqdm( total=len(trainSet), desc=f'Epoch {epoch + 1}/{Cfg.epoches}', unit='batches', dynamic_ncols=True, ) as pbars: # Training the model. for i, trainData in enumerate(trainSet): # Remembering the historical hidden. hidden = model.splitHiddenHistory(hidden) # Feeding the data into the model. prediction, hidden = model(trainData.text, hidden) # Getting the value of the loss. cost = loss(prediction, trainData.target.view(-1)) # Storing the cost. trainLoss.append(cost.item()) # Clearing the previous gradient. optimizer.zero_grad() # Applying the backward propagation. cost.backward() # Updating the parameters. optimizer.step() # Computing the accuracy. accuracy = (torch.argmax(prediction, 1) == trainData.target.view(-1)) accuracy = accuracy.sum().float() / len(accuracy) # Storing the accuracy. trainAcc.append(accuracy.item()) # Updating the loading bar. pbars.update(1) # Updating the training information. pbars.set_postfix_str('Train Loss: %.4f - Train Acc: %.4f' % (np.mean(trainLoss), np.mean(trainAcc))) # Closing the loading bar. pbars.close() # Printing the evaluating hint. print('Evaluating...', end=' ') # Evaluating the model. evalLoss, evalAcc = evaluator(model.eval(), loss, devSet, Cfg.bs, Cfg.hs) # Printing the evluating information. print('- Eval Loss: %.4f - Eval Acc: %.4f' % (evalLoss, evalAcc)) # Storing the training and evaluating information. trainLosses.append(np.mean(trainLoss)) trainAccs.append(np.mean(trainAcc)) evalLosses.append(evalLoss) evalAccs.append(evalAcc) # Logging the training and evaluating information. logging.info( 'Epoch [%d/%d] -> Training: Loss [%.4f] - Acc [%.4f] || Evaluating: Loss [%.4f] - Acc [%.4f]' % (epoch + 1, Cfg.epoches, np.mean(trainLoss), np.mean(trainAcc), evalLoss, evalAcc)) # Drawing the graph. 
vis.line(X=[k for k in range(1, len(trainLosses) + 1)], Y=trainLosses, win=lossGraph, update='new', name='TrainingLoss') vis.line(X=[k for k in range(1, len(evalLosses) + 1)], Y=evalLosses, win=lossGraph, update='new', name='EvaluatingLoss') vis.line(X=[k for k in range(1, len(trainAccs) + 1)], Y=trainAccs, win=accGraph, update='new', name='TrainingAcc') vis.line(X=[k for k in range(1, len(evalAccs) + 1)], Y=evalAccs, win=accGraph, update='new', name='EvaluatingAcc') # Saving the model. torch.save(model.state_dict(), Cfg.modelDir + f'/LanguageModel-Epoch{epoch + 1}.pt') logging.info("Model Saved") # Converting the model mode. model.train() # Saving the visdom. vis.save(envs=['LanguageModel'])
def trainer(textField, trainSet, devSet): # Creating the logging. logging.basicConfig(filename=Cfg.logDir + f'/logging-{currentTime}.txt', filemode='a', level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s', datefmt='%Y-%m-%d %H:%M:%S %p') # Logging the information. logging.info(f''' Vocabulary Size: {Cfg.vs} Embedding Size: {Cfg.es} Hidden Size: {Cfg.hs} Class Size: {Cfg.cs} Learning Rate: {Cfg.lr} Adam Beta One: {Cfg.beta1} Adam Beta Two: {Cfg.beta2} Weight Decay: {Cfg.wd} Batch Size: {Cfg.bs} Epoches: {Cfg.epoches} Random Seed: {Cfg.seed} GPU ID: {Cfg.GPUID} Model Directory: {Cfg.modelDir} Log Directory: {Cfg.logDir} Dataset Directory: {Cfg.dataDir} ''') # Creating the visdom. vis = Visdom(env='WordAverageModel') # Creating the graph. lossGraph = vis.line( X=[0], Y=[0], opts=dict(legend=['TrainingLoss', 'EvaluatingLoss'], xlabel='Epoches', ylabel='Loss', title=f'Training and Evaluating Loss - {currentTime}'), name='TrainingLoss') vis.line(X=[0], Y=[0], win=lossGraph, update='append', name='EvaluatingLoss') accGraph = vis.line( X=[0], Y=[0], opts=dict(legend=['TrainingAcc', 'EvaluatingAcc'], xlabel='Epoches', ylabel='Acc', title=f'Training and Evaluating Acc - {currentTime}'), name='TrainingAcc') vis.line(X=[0], Y=[0], win=accGraph, update='append', name='EvaluatingAcc') # Creating the sequence to sequence model. model = WordAverageModelNN( Cfg.vs + 2, Cfg.es, Cfg.hs, Cfg.cs, textField.vocab.stoi[textField.pad_token]).to(device) # Customizing the initialized parameters of the embedding layer. # Getting the vocabulary as the vectors. gloveVector = textField.vocab.vectors # Reinitializing the parameters of the embedding layer. model.embedding.weight.data.copy_(gloveVector) # Adding the '<unk>' and '<pad>' tokens into the parameters of the embedding layer. model.embedding.weight.data[textField.vocab.stoi[textField.pad_token]] model.embedding.weight.data[textField.vocab.stoi[textField.unk_token]] # Setting the optimizer. 
optimizer = optim.Adam(model.parameters(), lr=Cfg.lr, weight_decay=Cfg.wd, betas=[Cfg.beta1, Cfg.beta2]) # Setting the loss function. loss = nn.BCEWithLogitsLoss() # Setting the list to storing the training loss. trainLosses = [] # Setting the list to storing the training accuracy. trainAccs = [] # Setting the list to storing the evaluating loss. evalLosses = [] # Setting the list to storing the evaluating accuracy. evalAccs = [] # Training the model. for epoch in range(Cfg.epoches): # Setting the list for storing the training loss and accuracy. trainLoss = [] trainAcc = [] # Setting the loading bar. with tqdm(total=len(trainSet), desc=f'Epoch {epoch + 1}/{Cfg.epoches}', unit='batches', dynamic_ncols=True) as pbars: for i, trainData in enumerate(trainSet): # Feeding the data into the model. prediction = model(trainData.text) # Computing the loss. cost = loss(prediction, trainData.label) # Storing the loss. trainLoss.append(cost.item()) # Clearing the previous gradient. optimizer.zero_grad() # Applying the backward propagation. cost.backward() # Updating the parameters. optimizer.step() # Computing the accuracy. accuracy = (torch.round( torch.sigmoid(prediction)) == trainData.label) accuracy = accuracy.sum().float() / len(accuracy) # Storing the accurcy. trainAcc.append(accuracy.item()) # Updating the loading bar. pbars.update(1) # Updating the training information. pbars.set_postfix_str(' - Train Loss %.4f - Train Acc %.4f' % (np.mean(trainLoss), np.mean(trainAcc))) # Closing the loading bar. pbars.close() # Printing the hint for evaluating. print('Evaluating...', end=' ') # Evalutaing the model. evalLoss, evalAcc = evaluator(model.eval(), loss, devSet) # Printing the evaluating information. print(' - Eval Loss %.4f - Eval Acc %.4f' % (evalLoss, evalAcc)) # Storing the training and evaluating information. trainLosses.append(np.mean(trainLoss)) trainAccs.append(np.mean(trainAcc)) evalLosses.append(evalLoss) evalAccs.append(evalAcc) # Logging the information. 
logging.info( 'Epoch [%d/%d] -> Training: Loss [%.4f] - Acc [%.4f] || Evaluating: Loss [%.4f] - Acc [%.4f]' % (epoch + 1, Cfg.epoches, np.mean(trainLoss), np.mean(trainAcc), evalLoss, evalAcc)) # Drawing the graph. vis.line(X=[k for k in range(1, len(trainLosses) + 1)], Y=trainLosses, win=lossGraph, update='new', name='TrainingLoss') vis.line(X=[k for k in range(1, len(evalLosses) + 1)], Y=evalLosses, win=lossGraph, update='new', name='EvaluatingLoss') vis.line(X=[k for k in range(1, len(trainAccs) + 1)], Y=trainAccs, win=accGraph, update='new', name='TrainingAcc') vis.line(X=[k for k in range(1, len(evalAccs) + 1)], Y=evalAccs, win=accGraph, update='new', name='EvaluatingAcc') # Giving the hint for saving the model. logging.info("Model Saved") # Saving the model. torch.save( model.train().state_dict(), Cfg.modelDir + f'/{currentTime}/WordAverageModel-Epoch{epoch + 1}.pt') # Converting the model state. model = model.train() # Saving the graph. vis.save(envs=['WordAverageModel'])