def attach_test(validation_engine, verbose=VERBOSE_BATCH_WISE):
    # Attaching would be repeated for several metrics.
    # Thus, we can reduce the repeated code by using this function.
    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine,
            metric_name,
        )

    # If the verbosity is set, a progress bar is shown for mini-batch iterations.
    # Without ignite, you can use tqdm to implement a progress bar.
    validation_metric_names = ['loss', 'accuracy']

    for metric_name in validation_metric_names:
        attach_running_average(validation_engine, metric_name)

    # Do the same things for the validation engine.
    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_logs(engine):
            print('Test - loss={:.4e} accuracy={:.4f}'.format(
                engine.state.metrics['loss'],
                engine.state.metrics['accuracy']))
def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
    # Attaching would be repeated for several metrics.
    # Thus, we can reduce the repeated code by using this function.
    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine,
            metric_name,
        )

    # RunningAverage: given the value returned for each mini-batch,
    # it keeps a running statistic of that value automatically.
    training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']

    for metric_name in training_metric_names:
        attach_running_average(train_engine, metric_name)

    # If the verbosity is set, progress bar would be shown for mini-batch iterations.
    # Without ignite, you can use tqdm to implement progress bar.
    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)  # Display a progress bar.

    # If the verbosity is set, statistics would be shown after each epoch.
    if verbose >= VERBOSE_EPOCH_WISE:
        @train_engine.on(Events.EPOCH_COMPLETED)  # Print when the epoch is completed.
        def print_train_logs(engine):
            print(
                'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'
                .format(
                    engine.state.epoch,
                    engine.state.metrics['|param|'],
                    engine.state.metrics['|g_param|'],
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                ))

    validation_metric_names = ['loss', 'accuracy']

    for metric_name in validation_metric_names:
        attach_running_average(validation_engine, metric_name)

    # Do the same things for the validation engine.
    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_logs(engine):
            print(
                'Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'
                .format(
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                    engine.best_loss,
                ))
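# Note: the attach() helpers in these snippets compare `verbose` against
# VERBOSE_EPOCH_WISE / VERBOSE_BATCH_WISE without defining them. A minimal
# sketch of the constants and imports they assume (the exact values are an
# assumption, chosen so that batch-wise logging implies epoch-wise logging):
from ignite.engine import Events
from ignite.metrics import RunningAverage
from ignite.contrib.handlers.tqdm_logger import ProgressBar

VERBOSE_SILENT = 0      # no logging at all
VERBOSE_EPOCH_WISE = 1  # print a summary line after each epoch
VERBOSE_BATCH_WISE = 2  # additionally show a per-iteration progress bar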
def attach_pbar_and_metrics(trainer, evaluator):
    loss_metric = Average(output_transform=lambda output: output["loss"])
    accuracy_metric = Accuracy(
        output_transform=lambda output: (output["logit"], output["label"]))
    pbar = ProgressBar()
    loss_metric.attach(trainer, "loss")
    accuracy_metric.attach(trainer, "accuracy")
    accuracy_metric.attach(evaluator, "accuracy")
    pbar.attach(trainer)
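# The Average/Accuracy output_transforms above assume each engine step returns
# a dict with "loss", "logit", and "label" keys. A hedged sketch of a matching
# process function (model/optimizer are placeholders, not from the original code):
import torch.nn.functional as F
from ignite.engine import Engine

def make_trainer(model, optimizer):
    def train_step(engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = batch
        logit = model(x)
        loss = F.cross_entropy(logit, y)
        loss.backward()
        optimizer.step()
        # Keys must match the lambdas used in attach_pbar_and_metrics above.
        return {"loss": loss.item(), "logit": logit.detach(), "label": y}
    return Engine(train_step)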
def main(config):
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    train_loader, valid_loader, test_loader = load_dataloader_for_featureNet(
        config)

    model = DeepSleepNet(input_dim=1,
                         n_classes=5,
                         is_train=True,
                         use_dropout=config.use_dropout,
                         use_rnn=config.use_rnn).to(device)
    optimizer = optim.Adam(model.parameters())
    crit = nn.CrossEntropyLoss()

    data = torch.load("./folder0_model.pth")
    model.load_state_dict(data["model"])

    def validate(engine, mini_batch):
        engine.model.eval()

        with torch.no_grad():
            x, y = mini_batch
            x, y = x.to(engine.device), y.to(engine.device)

            y_hat = engine.model(x)
            loss = engine.crit(y_hat, y)

            if isinstance(y, torch.LongTensor) or isinstance(
                    y, torch.cuda.LongTensor):
                accuracy = (torch.argmax(y_hat, dim=-1) == y).sum() / float(
                    y.size(0))
            else:
                accuracy = 0

        return {'loss': float(loss), 'accuracy': float(accuracy)}

    test_engine = MyEngine(validate, model, crit, optimizer, config)

    if config.verbose >= 2:
        print(model)
        print(optimizer)
        print(crit)

    def log_metrics(engine, title):
        print(f"{title} accuracy: {engine.state.metrics['accuracy']:.2f}")

    test_engine.add_event_handler(Events.EPOCH_COMPLETED, log_metrics, 'test')

    RunningAverage(output_transform=lambda x: x['accuracy']).attach(
        test_engine, 'accuracy')
    pbar = ProgressBar()
    pbar.attach(test_engine, ['accuracy'])

    test_engine.run(test_loader, max_epochs=1)
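# MyEngine above is a project-specific Engine subclass. A hedged sketch of the
# minimal shape it would need for validate() to work; the attribute names are
# inferred from the usage above (engine.model, engine.crit, engine.device) and
# this is an assumption, not the original class definition:
from ignite.engine import Engine

class MyEngine(Engine):
    def __init__(self, func, model, crit, optimizer, config):
        self.model = model
        self.crit = crit
        self.optimizer = optimizer
        self.config = config
        self.device = next(model.parameters()).device
        super().__init__(func)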
def attach(train_engine,
           validation_engine,
           training_metric_names=[
               'actor', 'baseline', 'risk', '|param|', '|g_param|'
           ],
           validation_metric_names=[
               'BLEU',
           ],
           verbose=VERBOSE_BATCH_WISE):
    # Attaching would be repeated for several metrics.
    # Thus, we can reduce the repeated code by using this function.
    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine,
            metric_name,
        )

    for metric_name in training_metric_names:
        attach_running_average(train_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @train_engine.on(Events.EPOCH_COMPLETED)
        def print_train_logs(engine):
            avg_p_norm = engine.state.metrics['|param|']
            avg_g_norm = engine.state.metrics['|g_param|']
            avg_reward = engine.state.metrics['actor']

            print('Epoch {} - |param|={:.2e} |g_param|={:.2e} BLEU={:.2f}'.format(
                engine.state.epoch,
                avg_p_norm,
                avg_g_norm,
                avg_reward,
            ))

    for metric_name in validation_metric_names:
        attach_running_average(validation_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_logs(engine):
            avg_bleu = engine.state.metrics['BLEU']
            print('Validation - BLEU={:.2f} best_BLEU={:.2f}'.format(
                avg_bleu,
                -engine.best_loss,
            ))
def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
    '''Report and print the current training/validation status.'''

    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine, metric_name)

    ''' Train Attach Process '''
    training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']
    for metric_name in training_metric_names:
        attach_running_average(train_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @train_engine.on(Events.EPOCH_COMPLETED)
        def print_train_tag(engine):
            print(
                'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} accuracy={:.4f}'
                .format(
                    engine.state.epoch,
                    engine.state.metrics['|param|'],
                    engine.state.metrics['|g_param|'],
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                ))

    ''' Validate Attach Process '''
    validation_metric_names = ['loss', 'accuracy']
    for metric_name in validation_metric_names:
        attach_running_average(validation_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_logs(engine):
            print(
                'Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'
                .format(
                    engine.state.metrics['loss'],
                    engine.state.metrics['accuracy'],
                    engine.best_loss,
                ))
def create_supervised_trainer_skipgram(model,
                                       optimizer,
                                       prepare_batch,
                                       metrics={},
                                       device=None,
                                       log_dir='output/log/',
                                       checkpoint_dir='output/checkpoints/',
                                       checkpoint_every=None,
                                       tensorboard_every=50) -> Engine:
    def _prepare_batch(batch):
        return batch

    def _update(engine, batch):
        model.train()
        optimizer.zero_grad()
        batch = _prepare_batch(batch)
        batch_loss = model._loss(batch)
        loss = batch_loss.mean()
        loss.backward()
        optimizer.step()
        # The skip-gram update only produces a loss; there are no predictions
        # or targets available here for accuracy-style metrics.
        return {'loss': loss.item()}

    model.to(device)
    engine = Engine(_update)

    # Metrics
    RunningAverage(output_transform=lambda x: x['loss']).attach(
        engine, 'average_loss')

    # TQDM
    pbar = ProgressBar(persist=True)
    pbar.attach(engine, ['average_loss'])

    # Checkpoint saving
    # to_save = {'model': model, 'optimizer': optimizer, 'engine': engine}
    final_checkpoint_handler = Checkpoint({'model': model},
                                          DiskSaver(checkpoint_dir,
                                                    create_dir=True),
                                          n_saved=None,
                                          filename_prefix='final')
    engine.add_event_handler(Events.COMPLETED, final_checkpoint_handler)

    @engine.on(Events.EPOCH_COMPLETED)
    def log_epoch_results(engine):
        # Only 'average_loss' is attached above; accuracy metrics are not
        # computed by this trainer.
        print(f"Epoch results - Avg loss: {engine.state.metrics['average_loss']:.6f}")

    return engine
def attach(trainer, evaluator, verbose=VERBOSE_BATCH_WISE): from ignite.engine import Events from ignite.metrics import RunningAverage from ignite.contrib.handlers.tqdm_logger import ProgressBar RunningAverage(output_transform=lambda x: x[0]).attach( trainer, 'actor') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, 'baseline') RunningAverage(output_transform=lambda x: x[2]).attach( trainer, 'reward') RunningAverage(output_transform=lambda x: x[3]).attach( trainer, '|param|') RunningAverage(output_transform=lambda x: x[4]).attach( trainer, '|g_param|') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach( trainer, ['|param|', '|g_param|', 'actor', 'baseline', 'reward']) if verbose >= VERBOSE_EPOCH_WISE: @trainer.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_reward = engine.state.metrics['actor'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} BLEU={:.2f}'. format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_reward, )) RunningAverage(output_transform=lambda x: x).attach(evaluator, 'BLEU') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(evaluator, ['BLEU']) if verbose >= VERBOSE_EPOCH_WISE: @evaluator.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_bleu = engine.state.metrics['BLEU'] print('Validation - BLEU={:.2f} best_BLEU={:.2f}'.format( avg_bleu, -engine.best_loss, ))
def attach( train_engine, validation_engine, training_metric_names=['loss', 'ppl', '|param|', '|g_param|'], validation_metric_names=['loss', 'ppl'], verbose=VERBOSE_BATCH_WISE, ): def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, training_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @train_engine.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}'.format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss, np.exp(avg_loss), )) for metric_name in validation_metric_names: attach_running_average(validation_engine, metric_name) if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(validation_engine, validation_metric_names) if verbose >= VERBOSE_EPOCH_WISE: @validation_engine.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_loss = engine.state.metrics['loss'] print('Validation - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'.format( avg_loss, np.exp(avg_loss), engine.best_loss, np.exp(engine.best_loss), ))
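# Several validation loggers in these snippets read engine.best_loss, but none
# of them set it. A minimal sketch of how it could be maintained on the
# validation engine (the attribute-based bookkeeping here is an assumption):
from ignite.engine import Events

def attach_best_loss_tracking(validation_engine):
    validation_engine.best_loss = float('inf')

    @validation_engine.on(Events.EPOCH_COMPLETED)
    def update_best_loss(engine):
        loss = float(engine.state.metrics['loss'])
        if loss <= engine.best_loss:
            engine.best_loss = loss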
def attach(trainer, evaluator, verbose=VERBOSE_BATCH_WISE): from ignite.engine import Events from ignite.metrics import RunningAverage from ignite.contrib.handlers.tqdm_logger import ProgressBar RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, '|param|') RunningAverage(output_transform=lambda x: x[2]).attach( trainer, '|g_param|') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(trainer, ['|param|', '|g_param|', 'loss']) if verbose >= VERBOSE_EPOCH_WISE: @trainer.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print( 'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}' .format( engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss, np.exp(avg_loss), )) RunningAverage(output_transform=lambda x: x).attach(evaluator, 'loss') if verbose >= VERBOSE_BATCH_WISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(evaluator, ['loss']) if verbose >= VERBOSE_EPOCH_WISE: @evaluator.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_loss = engine.state.metrics['loss'] print( 'Validation - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}' .format( avg_loss, np.exp(avg_loss), engine.best_loss, np.exp(engine.best_loss), ))
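# The positional output_transforms above (x[0], x[1], x[2]) imply the trainer's
# process function returns a tuple (loss, |param|, |g_param|). A hedged sketch
# of such a step; the norm computations are illustrative, not the original code:
import torch
from ignite.engine import Engine

def make_train_step(model, criterion, optimizer):
    def step(engine, batch):
        model.train()
        optimizer.zero_grad()
        x, y = batch
        loss = criterion(model(x), y)
        loss.backward()
        # Parameter and gradient norms, reported alongside the loss.
        p_norm = sum(float(p.norm()) for p in model.parameters())
        g_norm = sum(float(p.grad.norm()) for p in model.parameters()
                     if p.grad is not None)
        optimizer.step()
        return float(loss), p_norm, g_norm
    return Engine(step)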
def inference( cfg, model, val_loader, num_query ): device = cfg.MODEL.DEVICE logger = logging.getLogger("reid_baseline.inference") logger.info("Enter inferencing") if cfg.TEST.RE_RANKING == 'no': print("Create evaluator") if 'test_all' in cfg.TEST.TEST_MODE: if len(val_loader.dataset.dataset[0]) == 4: # mask no new eval evaluator = create_supervised_all_evaluator_with_mask(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, seq_len=cfg.INPUT.SEQ_LEN,device=device) elif len(val_loader.dataset.dataset[0]) == 6: # mask , new eval evaluator = create_supervised_all_evaluator_with_mask_new_eval(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM,new_eval=True)}, seq_len=cfg.INPUT.SEQ_LEN,device=device) else: evaluator = create_supervised_all_evaluator(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, seq_len=cfg.INPUT.SEQ_LEN,device=device) else: if len(val_loader.dataset.dataset[0]) == 6: # mask , new eval evaluator = create_supervised_evaluator_with_mask_new_eval(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM,new_eval=True)}, device=device) elif len(val_loader.dataset.dataset[0]) == 4 : # mask, no new eval evaluator = create_supervised_evaluator_with_mask(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, device=device) else: evaluator = create_supervised_evaluator(model, metrics={'r1_mAP': R1_mAP(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, device=device) elif cfg.TEST.RE_RANKING == 'yes': # haven't implement with mask print("Create evaluator for reranking") if 'test_all' in cfg.TEST.TEST_MODE: evaluator = create_supervised_all_evaluator(model, metrics={'r1_mAP': R1_mAP_reranking(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, seq_len=cfg.INPUT.SEQ_LEN,device=device) else: evaluator = create_supervised_evaluator(model, metrics={'r1_mAP': R1_mAP_reranking(num_query, max_rank=50, feat_norm=cfg.TEST.FEAT_NORM)}, device=device) else: print("Unsupported re_ranking config. Only support for no or yes, but got {}.".format(cfg.TEST.RE_RANKING)) pbar = ProgressBar(persist=True,ncols=120) pbar.attach(evaluator) evaluator.run(val_loader) cmc, mAP = evaluator.state.metrics['r1_mAP'] logger.info('Validation Results') logger.info("mAP: {:.1%}".format(mAP)) for r in [1, 5, 10]: logger.info("CMC curve, Rank-{:<3}:{:.1%}".format(r, cmc[r - 1]))
def train(self):
    """ Full training logic """
    if self.train_logger is not None:
        self.train_logger.watch(self.model)

    engine = Engine(self._train_update_func)

    @engine.on(Events.EPOCH_STARTED)
    def reset_total_loss(engine):
        engine.state.total_loss = 0

    @engine.on(Events.ITERATION_COMPLETED)
    def accumulate_training_loss(engine):
        engine.state.total_loss += engine.state.output['loss']

    for name, metric in self.metrics.items():
        metric.attach(engine, name)

    pbar = ProgressBar()
    pbar.attach(engine)

    if self.valid:
        # TODO proper implementation: currently handled only in subclass
        evaluator = self._prepare_evaluator()
        engine.add_event_handler(Events.EPOCH_COMPLETED, self.run_validate,
                                 evaluator)

    @engine.on(Events.EPOCH_COMPLETED)
    def mk_checkpoints(engine):  # TODO use checkpointing/scheduling from ignite
        log = {
            'epoch': engine.state.epoch,
            'loss': engine.state.total_loss / len(engine.state.dataloader),
            'metrics': engine.state.metrics
        }
        if hasattr(engine.state, 'validation_result'):
            log['val_loss'] = engine.state.validation_result.total_loss / len(
                engine.state.validation_result.dataloader)
        self._prepare_checkpoint(log=log)
        self._reschedule_lr(epoch=engine.state.epoch)
        if self.train_logger is not None:
            self.train_logger.add_entry(log)
        if self.verbosity >= 1:
            for key, value in log.items():
                self.logger.info('    {:15s}: {}'.format(str(key), value))

    engine.run(
        self.data_loader, max_epochs=self.epochs
    )  # TODO restore resume logic over range(self.start_epoch, self.epochs + 1)
def train(self, epochs: int, train_loader, test_loader=None, trainsize=None, valsize=None): self.model.train() train_engine = Engine(lambda e, b: self.train_step(b)) @train_engine.on(Events.EPOCH_COMPLETED(every=self.track_loss_freq)) def eval_test(engine): if self.track_loss: self.tb_log(train_loader, engine.state.epoch, is_train=True, eval_length=valsize) if test_loader is not None: self.tb_log(test_loader, engine.state.epoch, is_train=False, eval_length=valsize) @train_engine.on(Events.EPOCH_COMPLETED) def save_state(engine): torch.save(self.model.state_dict(), self.snail_path) torch.save(self.opt.state_dict(), self.snail_opt_path) @train_engine.on( Events.ITERATION_COMPLETED(every=self.track_params_freq)) def tb_log_histogram_params(engine): if self.track_layers: for name, params in self.model.named_parameters(): self.logger.add_histogram(name.replace('.', '/'), params, engine.state.iteration) if params.grad is not None: self.logger.add_histogram( name.replace('.', '/') + '/grad', params.grad, engine.state.iteration) if self.trainpbar: RunningAverage(output_transform=lambda x: x).attach( train_engine, 'loss') p = ProgressBar() p.attach(train_engine, ['loss']) train_engine.run(train_loader, max_epochs=epochs, epoch_length=trainsize)
def attach(train_engine, validation_engine, verbose=VERBOSE_BATCH_WISE):
    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine, metric_name)

    training_metric_names = ["loss", "accuracy", "|param|", "|g_param|"]
    for metric_name in training_metric_names:
        attach_running_average(train_engine, metric_name)

    # Print at every iteration.
    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)

    # Print when the epoch is finished.
    if verbose >= VERBOSE_EPOCH_WISE:
        @train_engine.on(Events.EPOCH_COMPLETED)
        def print_train_loss(engine):
            print(
                "Epoch {} - |param| = {:.2e} |g_param| = {:.2e} loss = {:.4e} accuracy = {:.4f}"
                .format(engine.state.epoch,
                        engine.state.metrics["|param|"],
                        engine.state.metrics["|g_param|"],
                        engine.state.metrics["loss"],
                        engine.state.metrics["accuracy"]))

    validation_metric_names = ["loss", "accuracy"]
    for metric_name in validation_metric_names:
        attach_running_average(validation_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_loss(engine):
            print(
                "Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}"
                .format(engine.state.metrics["loss"],
                        engine.state.metrics["accuracy"],
                        engine.best_loss))
def attach(trainer, evaluator, verbose=2): from ignite.engine import Events from ignite.metrics import RunningAverage from ignite.contrib.handlers.tqdm_logger import ProgressBar RunningAverage(output_transform=lambda x: x[0]).attach(trainer, 'loss') RunningAverage(output_transform=lambda x: x[1]).attach( trainer, '|param|') RunningAverage(output_transform=lambda x: x[2]).attach( trainer, '|g_param|') if verbose >= 2: pbar = ProgressBar() pbar.attach(trainer, ['|param|', '|g_param|', 'loss']) if verbose >= 1: @trainer.on(Events.EPOCH_COMPLETED) def print_train_logs(engine): avg_p_norm = engine.state.metrics['|param|'] avg_g_norm = engine.state.metrics['|g_param|'] avg_loss = engine.state.metrics['loss'] print('Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e}'. format(engine.state.epoch, avg_p_norm, avg_g_norm, avg_loss)) RunningAverage(output_transform=lambda x: x).attach(evaluator, 'loss') if verbose >= 2: pbar = ProgressBar() pbar.attach(evaluator, ['loss']) if verbose >= 1: @evaluator.on(Events.EPOCH_COMPLETED) def print_valid_logs(engine): avg_loss = engine.state.metrics['loss'] print('Validation - loss={:.4e} lowest_loss={:.4e}'.format( avg_loss, engine.lowest_loss))
def attach_running_average(engine, metric_name):
    RunningAverage(output_transform=lambda x: x[metric_name]).attach(
        engine,
        metric_name,
    )

training_metric_names = ['loss', 'accuracy', '|param|', '|g_param|']

for metric_name in training_metric_names:
    attach_running_average(train_engine, metric_name)

if verbose >= VERBOSE_BATCH_WISE:
    pbar = ProgressBar(bar_format=None, ncols=120)
    pbar.attach(train_engine, training_metric_names)

if verbose >= VERBOSE_EPOCH_WISE:
    @train_engine.on(Events.EPOCH_COMPLETED)
    def print_train_logs(engine):
        print('Epoch {} - |param| = {:.2e} |g_param| = {:.2e} loss = {:.4e} accuracy = {:.4f}'.format(
            engine.state.epoch,
            engine.state.metrics['|param|'],
            engine.state.metrics['|g_param|'],
            engine.state.metrics['loss'],
            engine.state.metrics['accuracy'],
        ))

validation_metric_names = ['loss', 'accuracy']

for metric_name in validation_metric_names:
    attach_running_average(validation_engine, metric_name)

if verbose >= VERBOSE_EPOCH_WISE:
    @validation_engine.on(Events.EPOCH_COMPLETED)
    def print_valid_logs(engine):
        print('Validation - loss={:.4e} accuracy={:.4f} best_loss={:.4e}'.format(
            engine.state.metrics['loss'],
            engine.state.metrics['accuracy'],
            engine.best_loss,
        ))

@staticmethod
def attach(
    train_engine,
    # validation_engine,
    training_metric_names=['loss', 'ppl', '|param|', '|g_param|'],
    # validation_metric_names=['loss', 'ppl'],
    verbose=VERBOSE_BATCH_WISE,
):
    # Attaching would be repeated for several metrics.
    # Thus, we can reduce the repeated code by using this function.
    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine,
            metric_name,
        )

    for metric_name in training_metric_names:
        attach_running_average(train_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @train_engine.on(Events.EPOCH_COMPLETED)
        def print_train_logs(engine):
            avg_p_norm = engine.state.metrics['|param|']
            avg_g_norm = engine.state.metrics['|g_param|']
            avg_loss = engine.state.metrics['loss']

            print(
                'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss={:.4e} ppl={:.2f}'
                .format(
                    engine.state.epoch,
                    avg_p_norm,
                    avg_g_norm,
                    avg_loss,
                    np.exp(avg_loss),
                ))
def attach(train_engine, val_engine, verbose=EPOCHWISE): def attach_running_average(engine, metric_name): RunningAverage(output_transform=lambda x: x[metric_name]).attach( engine, metric_name, ) training_metric_names = ['loss', 'accuracy'] for metric_name in training_metric_names: attach_running_average(train_engine, metric_name) if verbose == BATCHWISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, ['loss', 'accuracy']) if verbose == EPOCHWISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(train_engine, ['loss', 'accuracy']) @train_engine.on(Events.EPOCH_COMPLETED) def print_logs(engine): print('Epoch {} Train - Accuracy: {:.4f} Loss: {:.4f}'.format( engine.state.epoch, engine.state.metrics['accuracy'], engine.state.metrics['loss'], )) validation_metric_names = ['loss', 'accuracy'] for metric_name in validation_metric_names: attach_running_average(val_engine, metric_name) if verbose == BATCHWISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(val_engine, ['loss', 'accuracy']) if verbose == EPOCHWISE: pbar = ProgressBar(bar_format=None, ncols=120) pbar.attach(val_engine, ['loss', 'accuracy'])
def attach(train_engine,
           validation_engine,
           training_metric_names=[
               'x2y', 'y2x', 'reg', '|param|', '|g_param|'
           ],
           validation_metric_names=['x2y', 'y2x'],
           verbose=VERBOSE_BATCH_WISE):
    # Attaching would be repeated for several metrics.
    # Thus, we can reduce the repeated code by using this function.
    def attach_running_average(engine, metric_name):
        RunningAverage(output_transform=lambda x: x[metric_name]).attach(
            engine,
            metric_name,
        )

    for metric_name in training_metric_names:
        attach_running_average(train_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(train_engine, training_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @train_engine.on(Events.EPOCH_COMPLETED)
        def print_train_logs(engine):
            avg_p_norm = engine.state.metrics['|param|']
            avg_g_norm = engine.state.metrics['|g_param|']
            avg_x2y = engine.state.metrics['x2y']
            avg_y2x = engine.state.metrics['y2x']
            avg_reg = engine.state.metrics['reg']

            print(
                'Epoch {} - |param|={:.2e} |g_param|={:.2e} loss_x2y={:.4e} ppl_x2y={:.2f} loss_y2x={:.4e} ppl_y2x={:.2f} dual_loss={:.4e}'
                .format(
                    engine.state.epoch,
                    avg_p_norm,
                    avg_g_norm,
                    avg_x2y,
                    np.exp(avg_x2y),
                    avg_y2x,
                    np.exp(avg_y2x),
                    avg_reg,
                ))

    for metric_name in validation_metric_names:
        attach_running_average(validation_engine, metric_name)

    if verbose >= VERBOSE_BATCH_WISE:
        pbar = ProgressBar(bar_format=None, ncols=120)
        pbar.attach(validation_engine, validation_metric_names)

    if verbose >= VERBOSE_EPOCH_WISE:
        @validation_engine.on(Events.EPOCH_COMPLETED)
        def print_valid_logs(engine):
            avg_x2y = engine.state.metrics['x2y']
            avg_y2x = engine.state.metrics['y2x']

            print(
                'Validation X2Y - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'
                .format(
                    avg_x2y,
                    np.exp(avg_x2y),
                    engine.best_x2y,
                    np.exp(engine.best_x2y),
                ))
            print(
                'Validation Y2X - loss={:.4e} ppl={:.2f} best_loss={:.4e} best_ppl={:.2f}'
                .format(
                    avg_y2x,
                    np.exp(avg_y2x),
                    engine.best_y2x,
                    np.exp(engine.best_y2x),
                ))
def run(root_path, log_path, student_class_module, teacher_class_module, student_class_name, teacher_class_name, init_interval, hard_ratio, train_targets, test_targets, test_camera_base, augmentation_types, batch_size, n_workers, save_interval, n_saved, gpu_ids, max_epochs=150, init_lr_student_conv=.01, init_lr_teacher_conv=.01, init_lr_student_classifier=.01, init_lr_teacher_classifier=.1, lr_decay_step=100, lr_decay_rate=.1): device = 'cuda:{}'.format(gpu_ids[0]) train_transformer = Transformer(True, augmentation_types) test_transformer = Transformer(False, []) train_dataset = TrainDatasetWrapper(root_path, train_targets, train_transformer) train_loader = utils.data.DataLoader(train_dataset, batch_size, shuffle=True, num_workers=n_workers, pin_memory=True) test_datasets = [] for test_target in test_targets: test_datasets.append( TestDatasetWrapper(root_path, test_target, test_transformer, test_camera_base)) loader_caller = _get_test_data_loader_caller(batch_size, n_workers) student_class_module = importlib.import_module(student_class_module) student_model_class = getattr(student_class_module, student_class_name) teacher_class_module = importlib.import_module(teacher_class_module) teacher_model_class = getattr(teacher_class_module, teacher_class_name) models = { 'student': student_model_class(train_dataset.n_classes), 'teacher': teacher_model_class(train_dataset.n_classes), 'generator': teacher_model_class(train_dataset.n_classes) } loss_functions = { 'student': SoftLabelLoss(), 'teacher': nn.CrossEntropyLoss() } student_classifier_parameters = list( models['student'].classifier.parameters()) student_classifier_parameters_ids = [] for p in student_classifier_parameters: student_classifier_parameters_ids.append(id(p)) student_conv_parameters = [] for p in models['student'].parameters(): if id(p) not in student_classifier_parameters_ids: student_conv_parameters.append(p) teacher_classifier_parameters = list( models['teacher'].classifier.parameters()) teacher_classifier_parameters_ids = [] for p in teacher_classifier_parameters: teacher_classifier_parameters_ids.append(id(p)) teacher_conv_parameters = [] for p in models['teacher'].parameters(): if id(p) not in teacher_classifier_parameters_ids: teacher_conv_parameters.append(p) optimizers = { 'student_conv': optim.SGD(student_conv_parameters, init_lr_student_conv, momentum=.9, weight_decay=5e-4, nesterov=True), 'student_classifier': optim.SGD(student_classifier_parameters, init_lr_student_classifier, momentum=.9, weight_decay=5e-4, nesterov=True), 'teacher_conv': optim.SGD(teacher_conv_parameters, init_lr_teacher_conv, momentum=.9, weight_decay=5e-4, nesterov=True), 'teacher_classifier': optim.SGD(teacher_classifier_parameters, init_lr_teacher_classifier, momentum=.9, weight_decay=5e-4, nesterov=True), } schedulers = { 'student_conv': optim.lr_scheduler.StepLR(optimizers['student_conv'], lr_decay_step, gamma=lr_decay_rate), 'student_classifier': optim.lr_scheduler.StepLR(optimizers['student_classifier'], lr_decay_step, gamma=lr_decay_rate), 'teacher_conv': optim.lr_scheduler.StepLR(optimizers['teacher_conv'], lr_decay_step, gamma=lr_decay_rate), } writer = SummaryWriter(log_dir=log_path) trainer = create_supervised_soft_label_trainer( models, optimizers, loss_functions, hard_ratio, init_interval, device=device, non_blocking=True, output_transform=lambda x, y, y_pred_student, y_pred_teacher, loss_student, loss_teacher: (y, y_pred_student, y_pred_teacher, loss_student, loss_teacher)) RunningAverage(output_transform=lambda output: 
output[3].item()).attach( trainer, 'loss_student') RunningAverage(output_transform=lambda output: output[4].item()).attach( trainer, 'loss_teacher') Accuracy(output_transform=lambda output: (output[1], output[0])).attach( trainer, 'accuracy_student') Accuracy(output_transform=lambda output: (output[2], output[0])).attach( trainer, 'accuracy_teacher') progress_bar = ProgressBar() progress_bar.attach(trainer, ['loss_student', 'loss_teacher']) checkpointer = ModelCheckpoint(log_path, 'checkpoint', save_interval=save_interval, n_saved=n_saved) rank_accuracy = RankAccuracy(n_workers) evaluator = create_supervised_evaluator(models['student'], metrics={'rank': rank_accuracy}, device=device, non_blocking=True) trainer.add_event_handler( Events.EPOCH_COMPLETED, _get_result_write_function(rank_accuracy, test_datasets, loader_caller, evaluator, writer)) trainer.add_event_handler( Events.EPOCH_COMPLETED, _get_init_classifier_function(models, optimizers['teacher_classifier'], init_interval)) trainer.add_event_handler(Events.ITERATION_COMPLETED, _get_loss_write_function(writer)) trainer.add_event_handler(Events.EPOCH_COMPLETED, _get_lr_decay_function(schedulers)) trainer.add_event_handler(Events.EPOCH_COMPLETED, _get_lr_write_function(optimizers, writer)) trainer.add_event_handler( Events.EPOCH_COMPLETED, checkpointer, { 'student_model': models['student'], 'teacher_model': models['teacher'], 'generator_model': models['generator'] }) trainer.run(train_loader, max_epochs=max_epochs) writer.close()
class BasicTrainTask(BaseTask): name = "Train Task" def _validate(self, config): """ Method to check if specific configuration is correct. Raises AssertError if is incorrect. """ assert isinstance(config, BasicTrainConfig), \ "Configuration should be instance of `BasicTrainConfig`, but given {}".format(type(config)) def _start(self): """Method to run the task """ if 'cuda' in self.device: self.model = self.model.to(self.device) mlflow.log_param("model", get_object_name(self.model)) self.logger.debug("Setup criterion") if "cuda" in self.device: self.criterion = self.criterion.to(self.device) mlflow.log_param("criterion", get_object_name(self.criterion)) mlflow.log_param("optimizer", get_object_name(self.optimizer)) self.logger.debug("Setup ignite trainer") trainer = self._setup_trainer() self._setup_trainer_handlers(trainer) metrics = {'loss': Loss(self.criterion)} metrics.update(self.metrics) self.logger.debug("Input data info: ") msg = "- train data loader: {} number of batches".format( len(self.train_dataloader)) if isinstance(self.train_dataloader, DataLoader): msg += " | {} number of samples".format( len(self.train_dataloader.sampler)) self.logger.debug(msg) if isinstance(self.train_dataloader, DataLoader): write_model_graph(self.writer, model=self.model, data_loader=self.train_dataloader, device=self.device) self.pbar_eval = None if self.train_eval_dataloader is not None: self.pbar_eval = ProgressBar() self._setup_offline_train_metrics_computation(trainer, metrics) if self.val_dataloader is not None: if self.val_metrics is None: self.val_metrics = metrics if self.pbar_eval is None: self.pbar_eval = ProgressBar() val_evaluator = self._setup_val_metrics_computation(trainer) if self.reduce_lr_on_plateau is not None: assert self.reduce_lr_on_plateau_var in self.val_metrics, \ "Monitor variable {} is not found in metrics {}" \ .format(self.reduce_lr_on_plateau_var, metrics) @val_evaluator.on(Events.COMPLETED) def update_reduce_on_plateau(engine): val_var = engine.state.metrics[ self.reduce_lr_on_plateau_var] self.reduce_lr_on_plateau.step(val_var) def default_score_function(engine): val_loss = engine.state.metrics['loss'] # Objects with highest scores will be retained. 
return -val_loss # Setup early stopping: if self.early_stopping_kwargs is not None: if 'score_function' in self.early_stopping_kwargs: es_score_function = self.early_stopping_kwargs[ 'score_function'] else: es_score_function = default_score_function self._setup_early_stopping(trainer, val_evaluator, es_score_function) # Setup model checkpoint: if self.model_checkpoint_kwargs is None: self.model_checkpoint_kwargs = { "filename_prefix": "model", "score_name": "val_loss", "score_function": default_score_function, "n_saved": 3, "atomic": True, "create_dir": True, "save_as_state_dict": True } self._setup_best_model_checkpointing(val_evaluator) self.logger.debug("Setup other handlers") if self.lr_scheduler is not None: @trainer.on(Events.ITERATION_STARTED) def update_lr_scheduler(engine): self.lr_scheduler.step() self._setup_log_learning_rate(trainer) self.logger.info("Start training: {} epochs".format(self.num_epochs)) mlflow.log_param("num_epochs", self.num_epochs) trainer.run(self.train_dataloader, max_epochs=self.num_epochs) self.logger.debug("Training is ended") def _setup_trainer(self): trainer = create_supervised_trainer(self.model, self.optimizer, self.criterion, device=self.device, non_blocking='cuda' in self.device) return trainer def _setup_trainer_handlers(self, trainer): # Setup timer to measure training time timer = setup_timer(trainer) self._setup_log_training_loss(trainer) @trainer.on(Events.EPOCH_COMPLETED) def log_training_time(engine): self.logger.info("One epoch training time (seconds): {}".format( timer.value())) last_model_saver = ModelCheckpoint( self.log_dir.as_posix(), filename_prefix="checkpoint", save_interval=self.trainer_checkpoint_interval, n_saved=1, atomic=True, create_dir=True, save_as_state_dict=True) model_name = get_object_name(self.model) to_save = { model_name: self.model, "optimizer": self.optimizer, } if self.lr_scheduler is not None: to_save['lr_scheduler'] = self.lr_scheduler trainer.add_event_handler(Events.ITERATION_COMPLETED, last_model_saver, to_save) trainer.add_event_handler(Events.ITERATION_COMPLETED, TerminateOnNan()) def _setup_log_training_loss(self, trainer): self.avg_output = RunningAverage(output_transform=lambda out: out) self.avg_output.attach(trainer, 'running_avg_loss') self.pbar.attach(trainer, ['running_avg_loss']) @trainer.on(Events.ITERATION_COMPLETED) def log_training_loss(engine): iteration = (engine.state.iteration - 1) % len( self.train_dataloader) + 1 if iteration % self.log_interval == 0: # self.logger.info("Epoch[{}] Iteration[{}/{}] Loss: {:.4f}".format(engine.state.epoch, iteration, # len(self.train_dataloader), # engine.state.output)) self.writer.add_scalar("training/loss_vs_iterations", engine.state.output, engine.state.iteration) mlflow.log_metric("training_loss_vs_iterations", engine.state.output) def _setup_log_learning_rate(self, trainer): @trainer.on(Events.EPOCH_STARTED) def log_lrs(engine): if len(self.optimizer.param_groups) == 1: lr = float(self.optimizer.param_groups[0]['lr']) self.logger.debug("Learning rate: {}".format(lr)) self.writer.add_scalar("learning_rate", lr, engine.state.epoch) mlflow.log_metric("learning_rate", lr) else: for i, param_group in enumerate(self.optimizer.param_groups): lr = float(param_group['lr']) self.logger.debug("Learning rate (group {}): {}".format( i, lr)) self.writer.add_scalar("learning_rate_group_{}".format(i), lr, engine.state.epoch) mlflow.log_metric("learning_rate_group_{}".format(i), lr) def _setup_offline_train_metrics_computation(self, trainer, metrics): train_eval_loader = 
self.train_eval_dataloader msg = "- train evaluation data loader: {} number of batches".format( len(train_eval_loader)) if isinstance(train_eval_loader, DataLoader): msg += " | {} number of samples".format( len(train_eval_loader.sampler)) self.logger.debug(msg) train_evaluator = create_supervised_evaluator(self.model, metrics=metrics, device=self.device, non_blocking="cuda" in self.device) self.pbar_eval.attach(train_evaluator) @trainer.on(Events.EPOCH_COMPLETED) def log_training_metrics(engine): epoch = engine.state.epoch if epoch % self.val_interval_epochs == 0: self.logger.debug("Compute training metrics") metrics_results = train_evaluator.run( train_eval_loader).metrics self.logger.info("Training Results - Epoch: {}".format(epoch)) for name in metrics_results: self.logger.info("\tAverage {}: {:.5f}".format( name, metrics_results[name])) self.writer.add_scalar("training/avg_{}".format(name), metrics_results[name], epoch) mlflow.log_metric("training_avg_{}".format(name), metrics_results[name]) return train_evaluator def _setup_val_metrics_computation(self, trainer): val_evaluator = create_supervised_evaluator(self.model, metrics=self.val_metrics, device=self.device, non_blocking="cuda" in self.device) self.pbar_eval.attach(val_evaluator) msg = "- validation data loader: {} number of batches".format( len(self.val_dataloader)) if isinstance(self.val_dataloader, DataLoader): msg += " | {} number of samples".format( len(self.val_dataloader.sampler)) self.logger.debug(msg) @trainer.on(Events.EPOCH_COMPLETED) def log_validation_results(engine): epoch = engine.state.epoch if epoch % self.val_interval_epochs == 0: self.logger.debug("Compute validation metrics") metrics_results = val_evaluator.run( self.val_dataloader).metrics self.logger.info( "Validation Results - Epoch: {}".format(epoch)) for name in metrics_results: self.logger.info("\tAverage {}: {:.5f}".format( name, metrics_results[name])) self.writer.add_scalar("validation/avg_{}".format(name), metrics_results[name], epoch) mlflow.log_metric("validation_avg_{}".format(name), metrics_results[name]) return val_evaluator def _setup_early_stopping(self, trainer, val_evaluator, score_function): kwargs = dict(self.early_stopping_kwargs) if 'score_function' not in kwargs: kwargs['score_function'] = score_function handler = EarlyStopping(trainer=trainer, **kwargs) setup_logger(handler._logger, self.log_filepath, self.log_level) val_evaluator.add_event_handler(Events.COMPLETED, handler) def _setup_best_model_checkpointing(self, val_evaluator): model_name = get_object_name(self.model) best_model_saver = ModelCheckpoint(self.log_dir.as_posix(), **self.model_checkpoint_kwargs) val_evaluator.add_event_handler(Events.COMPLETED, best_model_saver, {model_name: self.model})
def main(): args = get_args() if 'e-SNLI-VE' in args.data_path: args.no_image = False else: args.no_image = True if not args.no_image: args.no_premise = True args.with_expl = True '''Setup''' t = datetime.today() output_dir = os.path.join(args.output_folder, f"{t.month}_{t.day}_{t.hour}_{t.minute}_{t.second}") if not os.path.exists(output_dir): os.makedirs(output_dir, exist_ok=True) # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. logger.info => log main process only, logger.warning => log all processes logging.basicConfig(filename=os.path.join(output_dir, 'app.log'), filemode='a', level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) # This is a logger.warning: it will be printed by all distributed processes logger.warning(f"Running process {args.local_rank}") logger.info(f"Arguments: {pformat(args)}") logger.info(f'Image not used:{args.no_image}') logger.info(f'Premise not used:{args.no_premise}') logger.info(f'Explanations used:{args.with_expl}') '''Initialize distributed training if needed''' args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') logger.info( "Prepare tokenizer, pretrained model and optimizer - add special tokens for fine-tuning") tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint) tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT) if args.no_image: model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint) else: import image_gpt2_291 model = image_gpt2_291.GPT2LMHeadModel.from_pretrained( args.model_checkpoint) model.resize_token_embeddings(len(tokenizer)) model.to(args.device) optimizer = AdamW(model.parameters(), lr=args.lr) ''' Prepare model for FP16 and distributed training if needed (order is important, distributed should be the last) ''' if args.fp16: from apex import amp # Apex is only required if we use fp16 training model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16) if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) model = model.module logger.info("Prepare datasets") train_loader, val_loader = get_data_loaders(args, tokenizer) '''Training function and trainer''' def train(engine, batch): model.train() batch = tuple(input_tensor.to(args.device) for input_tensor in batch) if args.no_image: input_ids, lm_label, label, input_mask = batch else: image, input_ids, lm_label, label, input_mask = batch if args.no_image: output = model(input_ids=input_ids, # attention_mask=input_mask, labels=lm_label) else: output = model(input_ids=input_ids, images=image, # attention_mask=input_mask, labels=lm_label) loss, logits, _ = output loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_norm) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() if not args.with_expl: lbl_accuracy = torch.eq(label, logits.argmax( dim=1)).float().sum() / len(label) return { 'loss': loss.item(), 'lbl_accuracy': lbl_accuracy.item() } else: if engine.state.iteration % (args.gradient_accumulation_steps * 500) == 0: input_output = list(zip(input_ids, logits)) random_item = 
random.choice(input_output) in_sent = tokenizer.decode(list(filter( lambda x: x != tokenizer.eos_token_id, random_item[0]))) out_expl = tokenizer.decode(random_item[1].argmax(dim=1), skip_special_tokens=True) logger.info(f'MODEL INPUT: {in_sent}') logger.info(f'GEN. EXPL {out_expl}') logger.info('--------------------------------') return { 'loss': loss.item(), } '''Validation function and validator (validator output is the input of the metrics)''' def validation(engine, batch): model.eval() with torch.no_grad(): batch = tuple(input_tensor.to(args.device) for input_tensor in batch) if args.no_image: input_ids, lm_label, label, input_mask = batch else: image, input_ids, lm_label, label, input_mask = batch if args.no_image: output = model(input_ids=input_ids, # attention_mask=input_mask ) else: output = model(input_ids=input_ids, images=image, # attention_mask=input_mask ) logits, _ = output logits_shifted = logits[..., :-1, :].contiguous().view(-1, logits.size(-1)) labels_shifted = lm_label[..., 1:].contiguous().view(-1) return logits_shifted, labels_shifted '''Engines''' trainer = Engine(train) validator = Engine(validation) # t_total = len( # train_loader) // args.gradient_accumulation_steps * args.n_epochs # scheduler = get_linear_schedule_with_warmup( # optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) '''Linearly decrease the learning rate from lr to zero''' scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) ''' Attach validation to trainer: we evaluate when we start the training and at the end of each epoch ''' trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: validator.run(val_loader)) if args.eval_before_start: trainer.add_event_handler(Events.STARTED, lambda _: validator.run(val_loader)) '''Prepare metrics - note how we compute distributed metrics''' RunningAverage(output_transform=lambda x: x['loss']).attach( trainer, "loss") RunningAverage(output_transform=lambda x: math.exp( average_distributed_scalar(x['loss'], args))).attach(trainer, "ppl") if not args.with_expl: RunningAverage(output_transform=lambda x: 100 * x['lbl_accuracy']).attach( trainer, "lbl_accuracy") metrics = {} metrics["lbl_loss"] = Loss(torch.nn.CrossEntropyLoss(), output_transform=lambda x: (x[0], x[1])) metrics["loss"] = MetricsLambda( lambda l, a: average_distributed_scalar( l / a.gradient_accumulation_steps, a), metrics["lbl_loss"], args) metrics["ppl"] = MetricsLambda(math.exp, metrics["loss"]) if not args.with_expl: metrics["lbl_accuracy"] = 100 * \ Accuracy(output_transform=lambda x: (x[0], x[1])) for name, metric in metrics.items(): metric.attach(validator, name) ''' On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train ''' if args.local_rank in [-1, 0]: pbar = ProgressBar(persist=True) pbar.attach(trainer, metric_names=["loss", 'ppl'] if args.with_expl else ["loss", 'lbl_accuracy', 'ppl']) validator.add_event_handler(Events.COMPLETED, lambda _: pbar.log_message( "Validation: %s" % pformat(validator.state.metrics))) tb_logger = TensorboardLogger(log_dir=output_dir) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", metric_names=["loss"]), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", 
metric_names=["ppl"] if args.with_expl else ["lbl_accuracy", "ppl"]), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(validator, log_handler=OutputHandler( tag="validation", metric_names=[ 'ppl', 'loss'] if args.with_expl else['ppl', 'loss', 'lbl_accuracy'], global_step_transform=lambda *args, **kwargs: trainer.state.iteration), event_name=Events.EPOCH_COMPLETED) checkpoint_handler = ModelCheckpoint(output_dir, 'checkpoint', n_saved=8, require_empty=False) trainer.add_event_handler(Events.EPOCH_COMPLETED(every=1), checkpoint_handler, {'mymodel': getattr(model, 'module', model)}) # "getattr" take care of distributed encapsulation torch.save(args, os.path.join(output_dir, 'model_training_args.bin')) getattr(model, 'module', model).config.to_json_file( os.path.join(output_dir, CONFIG_NAME)) tokenizer.save_vocabulary(output_dir) '''Run the training''' trainer.run(train_loader, max_epochs=args.n_epochs)
def main(args): os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu) cfg = load_config(args.config) path, config_name = os.path.split(args.config) copyfile(args.config, os.path.join(cfg.workdir, config_name)) copyfile(os.path.join(path, "model.py"), os.path.join(cfg.workdir, "model.py")) pbar = ProgressBar() tb_logger = TensorboardLogger(log_dir=os.path.join(cfg.workdir, "tb_logs")) checkpointer = ModelCheckpoint(os.path.join(cfg.workdir, "checkpoints"), '', save_interval=1, n_saved=cfg.n_epochs, create_dir=True, atomic=True) def _update(engine, batch): cfg.model.train() cfg.optimizer.zero_grad() x, y = cfg.prepare_train_batch(batch) y_pred = cfg.model(**x) loss = cfg.loss_fn(y_pred, y) loss['loss'].backward() cfg.optimizer.step() for k in loss: loss[k] = loss[k].item() return loss trainer = Engine(_update) pbar.attach(trainer, output_transform=lambda x: {k: "{:.5f}".format(v) for k, v in x.items()}) trainer.add_event_handler(Events.ITERATION_STARTED, cfg.scheduler) trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpointer, {'model': cfg.model, 'optimizer': cfg.optimizer}) tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", output_transform=lambda x: x), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(cfg.optimizer), event_name=Events.ITERATION_STARTED) # tb_logger.attach(trainer, # log_handler=WeightsScalarHandler(cfg.model), # event_name=Events.ITERATION_COMPLETED) # tb_logger.attach(trainer, # log_handler=WeightsHistHandler(cfg.model), # event_name=Events.EPOCH_COMPLETED) # tb_logger.attach(trainer, # log_handler=GradsScalarHandler(cfg.model), # event_name=Events.ITERATION_COMPLETED) # tb_logger.attach(trainer, # log_handler=GradsHistHandler(cfg.model), # event_name=Events.EPOCH_COMPLETED) def _evaluate(engine, batch): cfg.model.eval() x, y = cfg.prepare_train_batch(batch) batch_size = len(batch[list(batch.keys())[0]]) with torch.no_grad(): y_pred = cfg.model(**x) loss = cfg.loss_fn(y_pred, y) for k in loss: loss[k] = loss[k].item() if k not in engine.state.metrics: engine.state.metrics[k] = 0.0 engine.state.metrics[k] += loss[k] * batch_size / len(cfg.valid_ds) return loss evaluator = Engine(_evaluate) pbar.attach(evaluator, output_transform=lambda x: {k: "{:.5f}".format(v) for k, v in x.items()}) @trainer.on(Events.EPOCH_COMPLETED) def evaluate_on_valid_dl(engine): evaluator.run(cfg.valid_dl) tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=['loss', 'rot_loss_cos', 'rot_loss_l1', 'trans_loss', 'true_distance', 'cls_loss'], global_step_transform=global_step_from_engine(trainer)), event_name=Events.EPOCH_COMPLETED) trainer.run(cfg.train_dl, cfg.n_epochs) tb_logger.close()
# sample from the prior prior_sample_args = {} prior_sample_args.update(svi_args) prior_sample_args["cond"] = False prior_sample_args["cond_label"] = False fwd_trace = poutine.trace(forward_model).get_trace(x, y, N=x.shape[0], **prior_sample_args) prior_sample = fwd_trace.nodes["pixels"]["fn"].mean prior_canonical_sample = fwd_trace.nodes["canonical_view"]["value"] tb.add_image("prior_samples", torchvision.utils.make_grid(prior_sample), epoch) tb.add_image( "canonical_prior_samples", torchvision.utils.make_grid(prior_canonical_sample), epoch, ) pbar = ProgressBar() pbar.attach(train_engine) @train_engine.on(Events.EPOCH_COMPLETED(every=eval_every)) def eval(engine): eval_engine.run(test_dl, seed=engine.state.epoch) train_engine.run(train_dl, max_epochs=50)
def setup(self, training_metrics: Dict): def metric_name(n) -> str: if n.endswith('Accuracy'): n = 'acc' else: n = n[:-6] if n.endswith('Metric') else n return n def print_metrics(metrics) -> str: rv = '' metric_keys = sorted(k for k in metrics) for k in metric_keys: if k == 'Accuracy': rv += f'{metric_name(k)}: {metrics[k]:.3} | ' else: rv += f'{metric_name(k)}: {metrics[k]} | ' return rv def store_metrics(metrics: Dict, mode: str): metric_keys = sorted(k for k in metrics) for k in metric_keys: self.metrics_history[mode][metric_name(k)].append(metrics[k]) if self.seed: set_seed_everywhere(self.seed, self.cuda) pbar = ProgressBar(persist=True) names = [] for k, v in training_metrics.items(): name = f'r{k}' names.append(name) RunningAverage(v).attach(self.trainer, name) RunningAverage(None, output_transform=lambda x: x[-1]).attach( self.trainer, 'rloss') names.append('rloss') pbar.attach(self.trainer, names) ProgressBar(persist=True).attach(engine=self.evaluator, metric_names=names) # A few events handler. To add / modify the events handler, you need to extend the __init__ method of RunnerABC # Ignite provides the necessary abstractions and a furnished repository of useful tools @self.trainer.on(Events.EPOCH_COMPLETED) def log_training_validation_results(trainer): self.evaluator.run(self.dataset_splits.train_data_loader()) metrics = self.evaluator.state.metrics store_metrics(metrics=metrics, mode="training") logger.info( f"Training Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}" ) self.evaluator.run(self.dataset_splits.val_data_loader()) metrics = self.evaluator.state.metrics store_metrics(metrics=metrics, mode="validation") logger.info( f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}" ) metrics = self.trainer.state.metrics if self.scheduler: self.scheduler.step(metrics["rloss"]) # self.scheduler.step(metrics[self.loss_metric.__class__.__name__]) @self.trainer.on(Events.COMPLETED) def log_test_results(trainer): if self.dataset_splits.test_set: self.evaluator.run(self.dataset_splits.test_data_loader()) metrics = self.evaluator.state.metrics store_metrics(metrics=metrics, mode="test") logger.info( f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}" )
def __call__(self, model, train_dataset, val_dataset=None, **_): """Train a PyTorch model. Args: model (torch.nn.Module): PyTorch model to train. train_dataset (torch.utils.data.Dataset): Dataset used to train. val_dataset (torch.utils.data.Dataset, optional): Dataset used to validate. Returns: trained_model (torch.nn.Module): Trained PyTorch model. """ assert train_dataset is not None train_params = self.train_params mlflow_logging = self.mlflow_logging if mlflow_logging: try: import mlflow # NOQA except ImportError: log.warning( "Failed to import mlflow. MLflow logging is disabled.") mlflow_logging = False loss_fn = train_params.get("loss_fn") assert loss_fn epochs = train_params.get("epochs") seed = train_params.get("seed") optimizer = train_params.get("optimizer") assert optimizer optimizer_params = train_params.get("optimizer_params", dict()) train_dataset_size_limit = train_params.get("train_dataset_size_limit") if train_dataset_size_limit: train_dataset = PartialDataset(train_dataset, train_dataset_size_limit) log.info("train dataset size is set to {}".format( len(train_dataset))) val_dataset_size_limit = train_params.get("val_dataset_size_limit") if val_dataset_size_limit and (val_dataset is not None): val_dataset = PartialDataset(val_dataset, val_dataset_size_limit) log.info("val dataset size is set to {}".format(len(val_dataset))) train_data_loader_params = train_params.get("train_data_loader_params", dict()) val_data_loader_params = train_params.get("val_data_loader_params", dict()) evaluation_metrics = train_params.get("evaluation_metrics") evaluate_train_data = train_params.get("evaluate_train_data") evaluate_val_data = train_params.get("evaluate_val_data") progress_update = train_params.get("progress_update") scheduler = train_params.get("scheduler") scheduler_params = train_params.get("scheduler_params", dict()) model_checkpoint = train_params.get("model_checkpoint") model_checkpoint_params = train_params.get("model_checkpoint_params") early_stopping_params = train_params.get("early_stopping_params") time_limit = train_params.get("time_limit") cudnn_deterministic = train_params.get("cudnn_deterministic") cudnn_benchmark = train_params.get("cudnn_benchmark") if seed: torch.manual_seed(seed) np.random.seed(seed) if cudnn_deterministic: torch.backends.cudnn.deterministic = cudnn_deterministic if cudnn_benchmark: torch.backends.cudnn.benchmark = cudnn_benchmark device = "cuda" if torch.cuda.is_available() else "cpu" model.to(device) optimizer_ = optimizer(model.parameters(), **optimizer_params) trainer = create_supervised_trainer(model, optimizer_, loss_fn=loss_fn, device=device) train_data_loader_params.setdefault("shuffle", True) train_data_loader_params.setdefault("drop_last", True) train_data_loader_params["batch_size"] = _clip_batch_size( train_data_loader_params.get("batch_size", 1), train_dataset, "train") train_loader = DataLoader(train_dataset, **train_data_loader_params) RunningAverage(output_transform=lambda x: x, alpha=0.98).attach(trainer, "ema_loss") RunningAverage(output_transform=lambda x: x, alpha=2**(-1022)).attach(trainer, "batch_loss") if scheduler: class ParamSchedulerSavingAsMetric( ParamSchedulerSavingAsMetricMixIn, scheduler): pass cycle_epochs = scheduler_params.pop("cycle_epochs", 1) scheduler_params.setdefault("cycle_size", int(cycle_epochs * len(train_loader))) scheduler_params.setdefault("param_name", "lr") scheduler_ = ParamSchedulerSavingAsMetric(optimizer_, **scheduler_params) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler_) if 
evaluate_train_data: evaluator_train = create_supervised_evaluator( model, metrics=evaluation_metrics, device=device) if evaluate_val_data: val_data_loader_params["batch_size"] = _clip_batch_size( val_data_loader_params.get("batch_size", 1), val_dataset, "val") val_loader = DataLoader(val_dataset, **val_data_loader_params) evaluator_val = create_supervised_evaluator( model, metrics=evaluation_metrics, device=device) if model_checkpoint_params: assert isinstance(model_checkpoint_params, dict) minimize = model_checkpoint_params.pop("minimize", True) save_interval = model_checkpoint_params.get("save_interval", None) if not save_interval: model_checkpoint_params.setdefault( "score_function", get_score_function("ema_loss", minimize=minimize)) model_checkpoint_params.setdefault("score_name", "ema_loss") mc = model_checkpoint(**model_checkpoint_params) trainer.add_event_handler(Events.EPOCH_COMPLETED, mc, {"model": model}) if early_stopping_params: assert isinstance(early_stopping_params, dict) metric = early_stopping_params.pop("metric", None) assert (metric is None) or (metric in evaluation_metrics) minimize = early_stopping_params.pop("minimize", False) if metric: assert ( "score_function" not in early_stopping_params ), "Remove either 'metric' or 'score_function' from early_stopping_params: {}".format( early_stopping_params) early_stopping_params["score_function"] = get_score_function( metric, minimize=minimize) es = EarlyStopping(trainer=trainer, **early_stopping_params) if evaluate_val_data: evaluator_val.add_event_handler(Events.COMPLETED, es) elif evaluate_train_data: evaluator_train.add_event_handler(Events.COMPLETED, es) elif early_stopping_params: log.warning( "Early Stopping is disabled because neither " "evaluate_val_data nor evaluate_train_data is set True.") if time_limit: assert isinstance(time_limit, (int, float)) tl = TimeLimit(limit_sec=time_limit) trainer.add_event_handler(Events.ITERATION_COMPLETED, tl) pbar = None if progress_update: if not isinstance(progress_update, dict): progress_update = dict() progress_update.setdefault("persist", True) progress_update.setdefault("desc", "") pbar = ProgressBar(**progress_update) pbar.attach(trainer, ["ema_loss"]) else: def log_train_metrics(engine): log.info("[Epoch: {} | {}]".format(engine.state.epoch, engine.state.metrics)) trainer.add_event_handler(Events.EPOCH_COMPLETED, log_train_metrics) if evaluate_train_data: def log_evaluation_train_data(engine): evaluator_train.run(train_loader) train_report = _get_report_str(engine, evaluator_train, "Train Data") if pbar: pbar.log_message(train_report) else: log.info(train_report) eval_train_event = (Events[evaluate_train_data] if isinstance( evaluate_train_data, str) else Events.EPOCH_COMPLETED) trainer.add_event_handler(eval_train_event, log_evaluation_train_data) if evaluate_val_data: def log_evaluation_val_data(engine): evaluator_val.run(val_loader) val_report = _get_report_str(engine, evaluator_val, "Val Data") if pbar: pbar.log_message(val_report) else: log.info(val_report) eval_val_event = (Events[evaluate_val_data] if isinstance( evaluate_val_data, str) else Events.EPOCH_COMPLETED) trainer.add_event_handler(eval_val_event, log_evaluation_val_data) if mlflow_logging: mlflow_logger = MLflowLogger() logging_params = { "train_n_samples": len(train_dataset), "train_n_batches": len(train_loader), "optimizer": _name(optimizer), "loss_fn": _name(loss_fn), "pytorch_version": torch.__version__, "ignite_version": ignite.__version__, } logging_params.update(_loggable_dict(optimizer_params, 
"optimizer")) logging_params.update( _loggable_dict(train_data_loader_params, "train")) if scheduler: logging_params.update({"scheduler": _name(scheduler)}) logging_params.update( _loggable_dict(scheduler_params, "scheduler")) if evaluate_val_data: logging_params.update({ "val_n_samples": len(val_dataset), "val_n_batches": len(val_loader), }) logging_params.update( _loggable_dict(val_data_loader_params, "val")) mlflow_logger.log_params(logging_params) batch_metric_names = ["batch_loss", "ema_loss"] if scheduler: batch_metric_names.append(scheduler_params.get("param_name")) mlflow_logger.attach( trainer, log_handler=OutputHandler( tag="step", metric_names=batch_metric_names, global_step_transform=global_step_from_engine(trainer), ), event_name=Events.ITERATION_COMPLETED, ) if evaluate_train_data: mlflow_logger.attach( evaluator_train, log_handler=OutputHandler( tag="train", metric_names=list(evaluation_metrics.keys()), global_step_transform=global_step_from_engine(trainer), ), event_name=Events.COMPLETED, ) if evaluate_val_data: mlflow_logger.attach( evaluator_val, log_handler=OutputHandler( tag="val", metric_names=list(evaluation_metrics.keys()), global_step_transform=global_step_from_engine(trainer), ), event_name=Events.COMPLETED, ) trainer.run(train_loader, max_epochs=epochs) try: if pbar and pbar.pbar: pbar.pbar.close() except Exception as e: log.error(e, exc_info=True) model = load_latest_model(model_checkpoint_params)(model) return model
# Reconstructed opening (assumption): the fragment below is the body of a gradual
# layer-unfreezing handler. The decorator, the epoch check, and the loop over
# model.named_children() are missing from the original snippet; the condition
# `name == 'fc'` (train only the classifier head during the first epoch) is a guess.
# NOTE: pbar is defined after the handler but is only used once trainer.run() starts,
# so the reference is valid at call time.
@trainer.on(Events.EPOCH_STARTED)
def turn_on_layers(engine):
    epoch = engine.state.epoch
    if epoch == 1:
        for name, child in model.named_children():
            if name == 'fc':
                pbar.log_message(name + ' is unfrozen')
                for param in child.parameters():
                    param.requires_grad = True
            else:
                pbar.log_message(name + ' is frozen')
                for param in child.parameters():
                    param.requires_grad = False
    elif epoch > 1:
        pbar.log_message("Turn on all the layers")
        for name, child in model.named_children():
            for param in child.parameters():
                param.requires_grad = True


pbar = ProgressBar(bar_format='')
pbar.attach(trainer, output_transform=lambda x: {'loss': x})

trainer.run(loader, max_epochs=500)

# with torch.no_grad():
#     preds = np.empty(0)
#     for x, _ in tqdm_notebook(tloader):
#         x = x.to(device)
#         output = model_resnet_18(x)
#         idx = output.max(dim=-1)[1].cpu().numpy()
#         preds = np.append(preds, idx, axis=0)
#
# submission = pd.read_csv(path_data + '/test.csv')
# submission['sirna'] = preds.astype(int)
# submission.to_csv('submission_1.csv', index=False, columns=['id_code', 'sirna'])
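# --- Hedged sketch (not part of the original snippet) ---
# The same greedy-argmax inference as the commented-out block above, wrapped in a
# reusable function. The model, the test DataLoader, and the device are taken from the
# surrounding script; the function name is illustrative.
import numpy as np
import torch


def predict(model, loader, device):
    model.eval()
    preds = np.empty(0)
    with torch.no_grad():
        for x, _ in loader:
            output = model(x.to(device))
            idx = output.max(dim=-1)[1].cpu().numpy()  # most probable class per sample
            preds = np.append(preds, idx, axis=0)
    return preds.astype(int)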
# Reconstructed opening (assumption): the fragment below is the body of a training-log
# handler for the EWC trainer `itrainer`; the decorator, the function header, and the
# `batch_loss` lookup are missing from the original snippet.
@itrainer.on(Events.ITERATION_COMPLETED)
def log_training(engine):
    batch_loss = engine.state.output.get('loss', 0)  # assumed key in the engine output dict
    ewc_loss = 0
    if 'ewc_loss' in engine.state.output:
        ewc_loss = engine.state.output['ewc_loss']
    metric = model.get_metrics()
    lr = optimizer.param_groups[0]['lr']
    e = engine.state.epoch
    n = engine.state.max_epochs
    i = engine.state.iteration
    print(
        "Epoch {}/{} : {} - batch loss: {}, ewc loss: {}, lr: {}, accuracy: {}, average: {} "
        .format(e, n, i, batch_loss, ewc_loss, lr, metric['accuracy'], metric['average']))


pbar = ProgressBar()
pbar.attach(itrainer, ['loss'])

current_task = None


@itrainer.on(Events.EPOCH_COMPLETED)
def run_validation(engine):
    val_iterator = BucketIterator(batch_size=args.bs,
                                  sorting_keys=[("tokens", "num_tokens")])
    val_iterator.index_with(vocabulary[current_task])
    raw_val_generator = iterator(dev_data[current_task], num_epochs=1)
    val_groups = list(raw_val_generator)
    model.get_metrics(True)
    ievaluator.run(val_groups)
    # ignite stores computed metrics on `state.metrics` (plural), not `state.metric`.
    batch_loss = ievaluator.state.metrics['loss']
    metric = ievaluator.state.metrics
    lr = optimizer.param_groups[0]['lr']
def setup(self, training_metrics):
    def metric_name(n) -> str:
        if n.endswith('Accuracy'):
            n = 'acc'
        else:
            n = n[:-6] if n.endswith('Metric') else n
        return n

    def print_metrics(metrics) -> str:
        rv = ''
        metric_keys = sorted(k for k in metrics)
        for k in metric_keys:
            if k == 'Accuracy':
                rv += f'{metric_name(k)}: {metrics[k]:.3}'
            else:
                rv += f'{metric_name(k)}: {metrics[k]:.6}'
        return rv

    if self.seed:
        set_seed_everywhere(self.seed, self.cuda)

    pbar = ProgressBar()
    names = []
    for k, v in training_metrics.items():
        name = f'r{k}'
        names.append(name)
        RunningAverage(v).attach(self.trainer, name)
    RunningAverage(None,
                   output_transform=lambda x: x[-1] * self.loss_accumulation_steps
                   ).attach(self.trainer, 'rloss')
    names.append('rloss')
    pbar.attach(self.trainer, names)

    pbar = ProgressBar()
    pbar.attach(self.evaluator)

    # A few event handlers. To add or modify event handlers, extend the __init__ method
    # of RunnerABC; Ignite provides the necessary abstractions and a well-stocked
    # collection of useful tools.
    @self.trainer.on(Events.EPOCH_COMPLETED)
    def log_validation_results(trainer):
        self.evaluator.run(self.dataset_splits.val_data_loader())
        metrics = self.evaluator.state.metrics
        logger.info(
            f"Validation Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}")
        if self.scheduler:
            self.scheduler.step(metrics[self.loss_metric.__class__.__name__])

    @self.trainer.on(Events.COMPLETED)
    def log_test_results(trainer):
        self.evaluator.run(self.dataset_splits.test_data_loader())
        metrics = self.evaluator.state.metrics
        logger.info(
            f"Test Results - Epoch: {trainer.state.epoch} {print_metrics(metrics)}")

    if self.tensorboard_logs:
        tb_logger = TensorboardLogger(log_dir=self.tensorboard_logs)
        tb_logger.attach(self.trainer,
                         log_handler=OutputHandler(
                             tag="training",
                             output_transform=lambda loss: {'loss': loss}),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.evaluator,
                         log_handler=OutputHandler(
                             tag="validation",
                             metric_names=["LossMetric"],
                             another_engine=self.trainer),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=OptimizerParamsHandler(self.optimizer),
                         event_name=Events.ITERATION_STARTED)
        tb_logger.attach(self.trainer,
                         log_handler=WeightsScalarHandler(self.model),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=WeightsHistHandler(self.model),
                         event_name=Events.EPOCH_COMPLETED)
        tb_logger.attach(self.trainer,
                         log_handler=GradsScalarHandler(self.model),
                         event_name=Events.ITERATION_COMPLETED)

        # This is important: close the TensorBoard file logger when training ends.
        @self.trainer.on(Events.COMPLETED)
        def end_tensorboard(trainer):
            logger.info("Training completed")
            tb_logger.close()

    if self.embeddings_name:

        @self.trainer.on(Events.COMPLETED)
        def log_embeddings(trainer):
            if hasattr(self.model, self.embeddings_name) and hasattr(
                    self.dataset_splits, "vectorizer"):
                logger.info(
                    f"Logging embeddings ({self.embeddings_name}) to Tensorboard!")
                embeddings = getattr(self.model, self.embeddings_name).weight.data
                metadata = [
                    str(self.dataset_splits.vectorizer.data_vocab._id2token[token_index]
                        ).encode('utf-8')
                    for token_index in range(embeddings.shape[0])
                ]
                self.writer.add_embedding(
                    mat=embeddings,
                    metadata=metadata,
                    global_step=self.trainer.state.epoch)
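# --- Hedged sketch (not part of the original snippet) ---
# The "LossMetric" referenced above (in metric_names and self.loss_metric) is not
# defined in this snippet. A minimal ignite Metric that averages a loss over an epoch
# could look like the sketch below; note that ignite also ships ignite.metrics.Loss,
# which serves essentially the same purpose.
from ignite.metrics import Metric


class LossMetric(Metric):
    """Epoch-level average of a loss, computed from (y_pred, y) pairs."""

    def __init__(self, loss_fn, output_transform=lambda x: x):
        self.loss_fn = loss_fn
        self._sum = 0.0
        self._num_examples = 0
        super().__init__(output_transform=output_transform)

    def reset(self):
        self._sum = 0.0
        self._num_examples = 0

    def update(self, output):
        y_pred, y = output
        batch_size = y.shape[0]
        self._sum += float(self.loss_fn(y_pred, y)) * batch_size
        self._num_examples += batch_size

    def compute(self):
        return self._sum / max(self._num_examples, 1)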
def main(local_rank): params = init_parms(local_rank) device = params.get('device') model = ASRModel(input_features=config.num_mel_banks, num_classes=config.vocab_size).to(device) logger.info( f'Model initialized with {get_model_size(model):.3f}M parameters') optimizer = Ranger(model.parameters(), lr=config.lr, eps=1e-5) model = DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank, check_reduction=True) load_checkpoint(model, optimizer, params) print(f"Loaded model on {local_rank}") start_epoch = params['start_epoch'] sup_criterion = CustomCTCLoss() unsup_criterion = UDALoss() if args.local_rank == 0: tb_logger = TensorboardLogger(log_dir=log_path) pbar = ProgressBar(persist=True, desc="Training") pbar_valid = ProgressBar(persist=True, desc="Validation Clean") pbar_valid_other = ProgressBar(persist=True, desc="Validation Other") pbar_valid_airtel = ProgressBar(persist=True, desc="Validation Airtel") pbar_valid_airtel_payments = ProgressBar( persist=True, desc="Validation Airtel Payments") timer = Timer(average=True) best_meter = params.get('best_stats', BestMeter()) trainCleanPath = os.path.join(lmdb_root_path, 'train-labelled') trainOtherPath = os.path.join(lmdb_root_path, 'train-unlabelled') trainCommonVoicePath = os.path.join(lmdb_commonvoice_root_path, 'train-labelled-en') trainAirtelPath = os.path.join(lmdb_airtel_root_path, 'train-labelled-en') trainAirtelPaymentsPath = os.path.join(lmdb_airtel_payments_root_path, 'train-labelled-en') testCleanPath = os.path.join(lmdb_root_path, 'test-clean') testOtherPath = os.path.join(lmdb_root_path, 'test-other') testAirtelPath = os.path.join(lmdb_airtel_root_path, 'test-labelled-en') testAirtelPaymentsPath = os.path.join(lmdb_airtel_payments_root_path, 'test-labelled-en') devOtherPath = os.path.join(lmdb_root_path, 'dev-other') train_clean = lmdbMultiDataset(roots=[ trainCleanPath, trainOtherPath, trainCommonVoicePath, trainAirtelPath, trainAirtelPaymentsPath ], transform=image_train_transform) train_other = lmdbMultiDataset(roots=[devOtherPath], transform=image_train_transform) test_clean = lmdbMultiDataset(roots=[testCleanPath], transform=image_val_transform) test_other = lmdbMultiDataset(roots=[testOtherPath], transform=image_val_transform) test_airtel = lmdbMultiDataset(roots=[testAirtelPath], transform=image_val_transform) test_payments_airtel = lmdbMultiDataset(roots=[testAirtelPaymentsPath], transform=image_val_transform) logger.info( f'Loaded Train & Test Datasets, train_labbeled={len(train_clean)}, train_unlabbeled={len(train_other)}, test_clean={len(test_clean)}, test_other={len(test_other)}, test_airtel={len(test_airtel)}, test_payments_airtel={len(test_payments_airtel)} examples' ) def train_update_function(engine, _): optimizer.zero_grad() # Supervised gt, pred imgs_sup, labels_sup, label_lengths = next( engine.state.train_loader_labbeled) imgs_sup = imgs_sup.cuda(local_rank, non_blocking=True) labels_sup = labels_sup probs_sup = model(imgs_sup) # Unsupervised gt, pred # imgs_unsup, augmented_imgs_unsup = next(engine.state.train_loader_unlabbeled) # with torch.no_grad(): # probs_unsup = model(imgs_unsup.to(device)) # probs_aug_unsup = model(augmented_imgs_unsup.to(device)) sup_loss = sup_criterion(probs_sup, labels_sup, label_lengths) # unsup_loss = unsup_criterion(probs_unsup, probs_aug_unsup) # Blend supervised and unsupervised losses till unsupervision_warmup_epoch # alpha = get_alpha(engine.state.epoch) # final_loss = ((1 - alpha) * sup_loss) + (alpha * unsup_loss) # final_loss = sup_loss 
sup_loss.backward() optimizer.step() return sup_loss.item() @torch.no_grad() def validate_update_function(engine, batch): img, labels, label_lengths = batch y_pred = model(img.cuda(local_rank, non_blocking=True)) if np.random.rand() > 0.99: pred_sentences = get_most_probable(y_pred) labels_list = labels.tolist() idx = 0 for i, length in enumerate(label_lengths.cpu().tolist()): pred_sentence = pred_sentences[i] gt_sentence = get_sentence(labels_list[idx:idx + length]) idx += length print(f"Pred sentence: {pred_sentence}, GT: {gt_sentence}") return (y_pred, labels, label_lengths) train_sampler_labbeled = torch.utils.data.distributed.DistributedSampler( train_clean, num_replicas=3, rank=args.local_rank) train_sampler_unlabbeled = torch.utils.data.distributed.DistributedSampler( train_other, num_replicas=3, rank=args.local_rank) test_sampler_clean = torch.utils.data.distributed.DistributedSampler( test_clean, num_replicas=3, rank=args.local_rank, shuffle=False) test_sampler_other = torch.utils.data.distributed.DistributedSampler( test_other, num_replicas=3, rank=args.local_rank, shuffle=False) test_sampler_airtel = torch.utils.data.distributed.DistributedSampler( test_airtel, num_replicas=3, rank=args.local_rank, shuffle=False) test_sampler_airtel_payments = torch.utils.data.distributed.DistributedSampler( test_payments_airtel, num_replicas=3, rank=args.local_rank, shuffle=False) train_loader_labbeled_loader = torch.utils.data.DataLoader( train_clean, batch_size=train_batch_size // 3, sampler=train_sampler_labbeled, num_workers=config.workers // 3, pin_memory=True, collate_fn=allign_collate) train_loader_unlabbeled_loader = torch.utils.data.DataLoader( train_other, batch_size=train_batch_size * 4, sampler=train_sampler_unlabbeled, num_workers=config.workers // 3, pin_memory=True, collate_fn=allign_collate) test_loader_clean = torch.utils.data.DataLoader( test_clean, batch_size=1, sampler=test_sampler_clean, num_workers=config.workers // 3, pin_memory=True, collate_fn=allign_collate) test_loader_other = torch.utils.data.DataLoader( test_other, batch_size=1, sampler=test_sampler_other, num_workers=config.workers // 3, pin_memory=True, collate_fn=allign_collate) test_loader_airtel = torch.utils.data.DataLoader( test_airtel, batch_size=1, sampler=test_sampler_airtel, num_workers=config.workers // 3, pin_memory=True, collate_fn=allign_collate) test_loader_airtel_payments = torch.utils.data.DataLoader( test_payments_airtel, batch_size=1, sampler=test_sampler_airtel_payments, num_workers=config.workers // 3, pin_memory=True, collate_fn=allign_collate) trainer = Engine(train_update_function) iteration_log_step = int(0.33 * len(train_loader_labbeled_loader)) evaluator_clean = Engine(validate_update_function) evaluator_other = Engine(validate_update_function) evaluator_airtel = Engine(validate_update_function) evaluator_airtel_payments = Engine(validate_update_function) metrics = {'wer': WordErrorRate(), 'cer': CharacterErrorRate()} for name, metric in metrics.items(): metric.attach(evaluator_clean, name) metric.attach(evaluator_other, name) metric.attach(evaluator_airtel, name) metric.attach(evaluator_airtel_payments, name) scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( optimizer, mode='min', factor=config.lr_gamma, patience=int(config.epochs * 0.05), verbose=True, threshold_mode="abs", cooldown=int(config.epochs * 0.025), min_lr=1e-5) if args.local_rank == 0: tb_logger.attach(trainer, log_handler=OutputHandler( tag="training", output_transform=lambda loss: {'loss': loss}), 
event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED) tb_logger.attach(trainer, log_handler=WeightsHistHandler(model), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(trainer, log_handler=WeightsScalarHandler(model), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=GradsScalarHandler(model), event_name=Events.ITERATION_COMPLETED) tb_logger.attach(trainer, log_handler=GradsHistHandler(model), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(evaluator_clean, log_handler=OutputHandler(tag="validation_clean", metric_names=["wer", "cer"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(evaluator_other, log_handler=OutputHandler(tag="validation_other", metric_names=["wer", "cer"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(evaluator_airtel, log_handler=OutputHandler(tag="validation_airtel", metric_names=["wer", "cer"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED) tb_logger.attach(evaluator_airtel_payments, log_handler=OutputHandler( tag="validation_airtel_payments", metric_names=["wer", "cer"], another_engine=trainer), event_name=Events.EPOCH_COMPLETED) pbar.attach(trainer, output_transform=lambda x: {'loss': x}) pbar_valid.attach(evaluator_clean, ['wer', 'cer'], event_name=Events.EPOCH_COMPLETED, closing_event_name=Events.COMPLETED) pbar_valid_other.attach(evaluator_other, ['wer', 'cer'], event_name=Events.EPOCH_COMPLETED, closing_event_name=Events.COMPLETED) pbar_valid_airtel.attach(evaluator_airtel, ['wer', 'cer'], event_name=Events.EPOCH_COMPLETED, closing_event_name=Events.COMPLETED) pbar_valid_airtel_payments.attach(evaluator_airtel_payments, ['wer', 'cer'], event_name=Events.EPOCH_COMPLETED, closing_event_name=Events.COMPLETED) timer.attach(trainer) @trainer.on(Events.STARTED) def set_init_epoch(engine): engine.state.epoch = params['start_epoch'] logger.info(f'Initial epoch for trainer set to {engine.state.epoch}') @trainer.on(Events.EPOCH_STARTED) def set_model_train(engine): if hasattr(engine.state, 'train_loader_labbeled'): del engine.state.train_loader_labbeled engine.state.train_loader_labbeled = iter(train_loader_labbeled_loader) # engine.state.train_loader_unlabbeled = iter(train_loader_unlabbeled_loader) @trainer.on(Events.ITERATION_COMPLETED) def iteration_completed(engine): if (engine.state.iteration % iteration_log_step == 0) and (engine.state.iteration > 0): engine.state.epoch += 1 train_clean.set_epochs(engine.state.epoch) train_other.set_epochs(engine.state.epoch) model.eval() logger.info('Model set to eval mode') evaluator_clean.run(test_loader_clean) evaluator_other.run(test_loader_other) evaluator_airtel.run(test_loader_airtel) evaluator_airtel_payments.run(test_loader_airtel_payments) model.train() logger.info('Model set back to train mode') if args.local_rank == 0: @evaluator_other.on(Events.EPOCH_COMPLETED) def save_checkpoints(engine): metrics = engine.state.metrics wer = metrics['wer'] cer = metrics['cer'] epoch = trainer.state.epoch scheduler.step(wer) save_checkpoint(model, optimizer, best_meter, wer, cer, epoch) best_meter.update(wer, cer, epoch) @trainer.on(Events.EPOCH_COMPLETED) def after_complete(engine): logger.info('Epoch {} done. Time per batch: {:.3f}[s]'.format( engine.state.epoch, timer.value())) timer.reset() trainer.run(train_loader_labbeled_loader, max_epochs=epochs) if args.local_rank == 0: tb_logger.close()
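# --- Hedged sketch (not part of the original snippet) ---
# main(local_rank) above expects one process per GPU; the DistributedSampler calls use
# num_replicas=3, which suggests three processes, e.g.
#
#   python -m torch.distributed.launch --nproc_per_node=3 train_asr.py
#
# The entry point below is an assumption: the real script presumably parses its own
# arguments (it already references a global `args.local_rank`), and the script name
# train_asr.py is illustrative.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    main(args.local_rank)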