def train_one_epoch(self, epoch, accum_iter):
    """Run one training epoch over ``self.train_loader``.

    Args:
        epoch: zero-based epoch index (displayed one-based in the progress bar).
        accum_iter: running count of training instances seen so far.

    Returns:
        The updated ``accum_iter`` after this epoch.
    """
    self.model.train()
    average_meter_set = AverageMeterSet()
    tqdm_dataloader = tqdm(self.train_loader)
    for batch_idx, batch in enumerate(tqdm_dataloader):
        batch_size = batch[0].size(0)
        batch = [x.to(self.device) for x in batch]

        # Standard step: clear grads, forward, backward, update.
        self.optimizer.zero_grad()
        loss = self.calculate_loss(batch)
        loss.backward()
        self.optimizer.step()

        average_meter_set.update('loss', loss.item())
        tqdm_dataloader.set_description(
            'Epoch {}, loss {:.3f} '.format(epoch + 1, average_meter_set['loss'].avg))

        accum_iter += batch_size

        if self._needs_to_log(accum_iter):
            tqdm_dataloader.set_description('Logging to Tensorboard')
            log_data = {
                'state_dict': (self._create_state_dict()),
                'epoch': epoch,
                'accum_iter': accum_iter,
            }
            log_data.update(average_meter_set.averages())
            self.log_extra_train_info(log_data)
            self.logger_service.log_train(log_data)

    # BUGFIX: the scheduler was previously stepped at the TOP of the epoch,
    # i.e. before any optimizer.step(). PyTorch >= 1.1 requires
    # lr_scheduler.step() to follow optimizer.step(); stepping first skips
    # the initial LR value and triggers a runtime warning. This also matches
    # the sibling trainer that steps the scheduler at end of epoch.
    self.lr_scheduler.step()
    return accum_iter
def validate(self, epoch, accum_iter):
    """Evaluate on the validation split and hand averaged metrics to the logger.

    Args:
        epoch: current epoch index, recorded in the log payload.
        accum_iter: training-instance counter, recorded in the log payload.
    """
    self.model.eval()
    meters = AverageMeterSet()
    with torch.no_grad():
        progress = tqdm(self.val_loader)
        for batch in progress:
            batch = [tensor.to(self.device) for tensor in batch]
            for name, value in self.calculate_metrics(batch).items():
                meters.update(name, value)

            # Show the top-3 cutoffs for NDCG and Recall, abbreviated to fit.
            shown = ['NDCG@%d' % k for k in self.metric_ks[:3]] +\
                ['Recall@%d' % k for k in self.metric_ks[:3]]
            template = 'Val: ' + ', '.join(name + ' {:.3f}' for name in shown)
            template = template.replace('NDCG', 'N').replace('Recall', 'R')
            progress.set_description(
                template.format(*(meters[name].avg for name in shown)))

        log_data = {
            'state_dict': (self._create_state_dict()),
            'epoch': epoch,
            'accum_iter': accum_iter,
        }
        log_data.update(meters.averages())
        self.logger_service.log_val(log_data)
def test(self):
    """Load the best checkpoint, evaluate on the test set, and dump metrics.

    Reads ``models/best_acc_model.pth`` under ``self.export_root`` and writes
    averaged metrics to ``logs/test_metrics.json``.
    """
    print('Test best model with test set!')

    best_model = torch.load(
        os.path.join(self.export_root, 'models',
                     'best_acc_model.pth')).get('model_state_dict')
    self.model.load_state_dict(best_model)
    self.model.eval()

    average_meter_set = AverageMeterSet()
    with torch.no_grad():
        tqdm_dataloader = tqdm(self.test_loader)
        for batch_idx, batch in enumerate(tqdm_dataloader):
            batch = [x.to(self.device) for x in batch]
            metrics, preds = self.calculate_metrics(batch)
            for k, v in metrics.items():
                average_meter_set.update(k, v)
            description_metrics = ['NDCG@%d' % k for k in self.metric_ks[:3]] +\
                ['Recall@%d' % k for k in self.metric_ks[:3]]
            # BUGFIX: the bar previously said 'Val:' while evaluating the TEST
            # set (copy-paste from validate); label it correctly.
            description = 'Test: ' + ', '.join(
                s + ' {:.3f}' for s in description_metrics)
            description = description.replace('NDCG', 'N').replace('Recall', 'R')
            description = description.format(
                *(average_meter_set[k].avg for k in description_metrics))
            tqdm_dataloader.set_description(description)

        average_metrics = average_meter_set.averages()
        with open(
                os.path.join(self.export_root, 'logs', 'test_metrics.json'),
                'w') as f:
            json.dump(average_metrics, f, indent=4)
        print(average_metrics)
def validate(self, epoch, accum_iter, mode, doLog=True, **kwargs):
    """Evaluate on the 'val' or 'test' split; optionally send results to loggers.

    Args:
        epoch: current epoch index, recorded in the log payload.
        accum_iter: training-instance counter, recorded in the log payload.
        mode: 'val' or 'test' — selects the loader and logging channel.
        doLog: when False, compute metrics without logging them.
        **kwargs: extra key/values merged into the returned log payload.

    Returns:
        The assembled log payload dict (metrics, state dict, counters).

    Raises:
        ValueError: if ``mode`` is neither 'val' nor 'test'.
    """
    if mode == 'val':
        loader = self.val_loader
    elif mode == 'test':
        loader = self.test_loader
    else:
        raise ValueError

    self.model.eval()
    meters = AverageMeterSet()
    num_instance = 0

    with torch.no_grad():
        # Pilot mode skips the progress bar and caps the number of batches.
        iterator = loader if self.pilot else tqdm(loader)
        for batch_idx, batch in enumerate(iterator):
            if self.pilot and batch_idx >= self.pilot_batch_cnt:
                # print('Break validation due to pilot mode')
                break
            batch = {key: tensor.to(self.device)
                     for key, tensor in batch.items()}
            num_instance += next(iter(batch.values())).size(0)

            for name, value in self.calculate_metrics(batch).items():
                meters.update(name, value)

            if not self.pilot:
                shown = ['NDCG@%d' % k for k in self.metric_ks] +\
                    ['Recall@%d' % k for k in self.metric_ks]
                template = '{}: '.format(mode.capitalize()) + ', '.join(
                    name + ' {:.4f}' for name in shown)
                template = template.replace('NDCG', 'N').replace(
                    'Recall', 'R')
                iterator.set_description(
                    template.format(*(meters[name].avg for name in shown)))

        log_data = {
            'state_dict': (self._create_state_dict(epoch, accum_iter)),
            'epoch': epoch,
            'accum_iter': accum_iter,
            'num_eval_instance': num_instance,
        }
        log_data.update(meters.averages())
        log_data.update(kwargs)
        if doLog:
            if mode == 'val':
                self.logger_service.log_val(log_data)
            elif mode == 'test':
                self.logger_service.log_test(log_data)
            else:
                raise ValueError
    return log_data
def validate(self, epoch, accum_iter):
    """Evaluate on the validation split, log metrics, and dump predictions.

    Side effects: stores per-user predictions in ``self.all_preds``, writes a
    ``config.json`` (args snapshot) and ``predictions.csv`` under
    ``self.args.output_predictions_folder``, and mutates ``self.args``
    (``recommender``/``seed``) for the config dump.

    Args:
        epoch: zero-based epoch index (logged one-based).
        accum_iter: training-instance counter, recorded in the log payload.
    """
    self.model.eval()
    self.all_preds = []
    average_meter_set = AverageMeterSet()
    with torch.no_grad():
        tqdm_dataloader = tqdm(self.val_loader)
        for batch_idx, batch in enumerate(tqdm_dataloader):
            batch = [x.to(self.device) for x in batch]
            metrics, preds = self.calculate_metrics(batch)
            for p in preds:
                self.all_preds.append(p.tolist())
            for k, v in metrics.items():
                average_meter_set.update(k, v)
            description_metrics = ['NDCG@%d' % k for k in self.metric_ks[:3]] + \
                ['Recall@%d' % k for k in self.metric_ks[:3]]
            description = 'Val: ' + ', '.join(
                s + ' {:.3f}' for s in description_metrics)
            description = description.replace('NDCG', 'N').replace('Recall', 'R')
            description = description.format(
                *(average_meter_set[k].avg for k in description_metrics))
            tqdm_dataloader.set_description(description)

        log_data = {
            'state_dict': (self._create_state_dict()),
            'epoch': epoch + 1,
            'accum_iter': accum_iter,
        }
        log_data.update(average_meter_set.averages())
        self.log_extra_val_info(log_data)
        self.logger_service.log_val(log_data)

        df = pd.DataFrame(self.all_preds, columns=[
            'prediction_' + str(i) for i in range(len(self.all_preds[0]))
        ])
        # FIX: makedirs(exist_ok=True) replaces the racy isdir-then-makedirs
        # check (TOCTOU between the test and the create).
        os.makedirs(self.args.output_predictions_folder, exist_ok=True)
        with open(
                os.path.join(self.args.output_predictions_folder,
                             'config.json'), 'w') as f:
            self.args.recommender = "BERT4rec"
            self.args.seed = str(self.args.model_init_seed)
            args_dict = {}
            args_dict['args'] = vars(self.args)
            f.write(json.dumps(args_dict, indent=4, sort_keys=True))
        # FIX: build the CSV path with os.path.join instead of string
        # concatenation, consistent with the config.json path above.
        df.to_csv(os.path.join(self.args.output_predictions_folder,
                               "predictions.csv"), index=False)
def train_one_epoch(self, epoch, accum_iter):
    """Run one training epoch; step the LR scheduler at end of epoch if enabled.

    Args:
        epoch: zero-based epoch index (displayed one-based).
        accum_iter: running count of training instances seen so far.

    Returns:
        The updated ``accum_iter`` after this epoch.
    """
    self.model.train()
    average_meter_set = AverageMeterSet()
    tqdm_dataloader = tqdm(self.train_loader)
    for batch_idx, batch in enumerate(tqdm_dataloader):
        batch = self.batch_to_device(batch)
        batch_size = self.args.train_batch_size

        # forward pass
        self.optimizer.zero_grad()
        loss = self.calculate_loss(batch)

        # backward pass
        loss.backward()
        self.optimizer.step()

        # update metrics
        average_meter_set.update('loss', loss.item())
        # BUGFIX: optimizer.defaults['lr'] is the *initial* default and never
        # changes when the scheduler updates the LR; the live value lives in
        # param_groups.
        average_meter_set.update('lr', self.optimizer.param_groups[0]['lr'])
        tqdm_dataloader.set_description(
            'Epoch {}, loss {:.3f} '.format(epoch + 1,
                                            average_meter_set['loss'].avg))

        accum_iter += batch_size

        if self._needs_to_log(accum_iter):
            tqdm_dataloader.set_description('Logging to Tensorboard')
            log_data = {
                'state_dict': (self._create_state_dict()),
                'epoch': epoch + 1,
                'accum_iter': accum_iter,
            }
            log_data.update(average_meter_set.averages())
            self.log_extra_train_info(log_data)
            self.logger_service.log_train(log_data)

        # Local smoke-test mode: stop after a handful of batches.
        if self.args.local and batch_idx == 20:
            break

    # adapt learning rate
    if self.args.enable_lr_schedule:
        self.lr_scheduler.step()
        # NOTE(review): assumes a StepLR-style scheduler exposing step_size —
        # confirm against scheduler construction.
        if epoch % self.lr_scheduler.step_size == 0:
            # BUGFIX: print the live LR, not the immutable default.
            print(self.optimizer.param_groups[0]['lr'])

    return accum_iter
def validate(self, epoch, accum_iter):
    """Evaluate on the validation split and log metrics (plus optional embeddings).

    When ``self.args.dump_useritem_embeddings == 'True'``, the user/item
    embedding matrices are attached to the log payload as numpy arrays.

    Args:
        epoch: zero-based epoch index (logged one-based).
        accum_iter: training-instance counter, recorded in the log payload.
    """
    self.model.eval()
    meters = AverageMeterSet()
    with torch.no_grad():
        progress = tqdm(self.val_loader)
        for batch in progress:
            batch = [tensor.to(self.device) for tensor in batch]
            for name, value in self.calculate_metrics(batch).items():
                meters.update(name, value)

            shown = ['NDCG@%d' % k for k in self.metric_ks[:3]] + \
                ['Recall@%d' % k for k in self.metric_ks[:3]]
            # An accuracy-only run replaces the ranking metrics in the bar.
            if 'accuracy' in self.args.metrics_to_log:
                shown = ['accuracy']
            template = 'Val: ' + ', '.join(name + ' {:.3f}' for name in shown)
            template = template.replace('NDCG', 'N').replace('Recall', 'R')
            progress.set_description(
                template.format(*(meters[name].avg for name in shown)))

        dump_embeddings = self.args.dump_useritem_embeddings == 'True'
        log_data = {
            'state_dict': (self._create_state_dict()),
            'epoch': epoch + 1,
            'accum_iter': accum_iter,
            'user_embedding':
                self.model.embedding.user.weight.cpu().detach().numpy()
                if dump_embeddings and self.model.embedding.user is not None
                else None,
            'item_embedding':
                self.model.embedding.token.weight.cpu().detach().numpy()
                if dump_embeddings else None,
        }
        log_data.update(meters.averages())
        self.log_extra_val_info(log_data)
        self.logger_service.log_val(log_data)
def train_one_epoch(self, epoch, accum_iter, train_loader, **kwargs):
    """Run one training epoch with KL-annealing and optional gradient clipping.

    Args:
        epoch: current epoch index, recorded in the log payloads.
        accum_iter: running count of training instances seen so far.
        train_loader: dataloader yielding dict batches of tensors.
        **kwargs: extra key/values merged into every log payload.

    Returns:
        The updated ``accum_iter`` after this epoch.
    """
    self.model.train()
    meters = AverageMeterSet()
    num_instance = 0
    # Pilot mode skips the progress bar and caps the number of batches.
    iterator = train_loader if self.pilot else tqdm(train_loader)

    for batch_idx, batch in enumerate(iterator):
        if self.pilot and batch_idx >= self.pilot_batch_cnt:
            # print('Break training due to pilot mode')
            break
        batch_size = next(iter(batch.values())).size(0)
        batch = {key: tensor.to(self.device) for key, tensor in batch.items()}
        num_instance += batch_size

        # KL-annealing: ramp toward anneal_cap over total_anneal_steps updates.
        if self.total_anneal_steps > 0:
            anneal = min(self.anneal_cap,
                         1. * self.update_count / self.total_anneal_steps)
        else:
            anneal = self.anneal_cap

        self.optimizer.zero_grad()
        loss = self.calculate_loss(batch, anneal)
        # calculate_loss may also return auxiliary metrics to track.
        if isinstance(loss, tuple):
            loss, extra_info = loss
            for name, value in extra_info.items():
                meters.update(name, value)
        loss.backward()
        if self.clip_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.clip_grad_norm)
        self.optimizer.step()
        self.update_count += 1

        meters.update('loss', loss.item())
        if not self.pilot:
            iterator.set_description('Epoch {}, loss {:.3f} '.format(
                epoch, meters['loss'].avg))

        accum_iter += batch_size
        if self._needs_to_log(accum_iter):
            if not self.pilot:
                iterator.set_description('Logging')
            log_data = {
                # 'state_dict': (self._create_state_dict()),
                'epoch': epoch,
                'accum_iter': accum_iter,
            }
            log_data.update(meters.averages())
            log_data.update(kwargs)
            self.log_extra_train_info(log_data)
            self.logger_service.log_train(log_data)

    # Final end-of-epoch log, including the instance count.
    log_data = {
        # 'state_dict': (self._create_state_dict()),
        'epoch': epoch,
        'accum_iter': accum_iter,
        'num_train_instance': num_instance,
    }
    log_data.update(meters.averages())
    log_data.update(kwargs)
    self.log_extra_train_info(log_data)
    self.logger_service.log_train(log_data)
    return accum_iter