# Assumes: import torch
#          from ignite.engine import Engine, Events
#          from ignite.handlers import ModelCheckpoint
#          from ignite.metrics import Average, RunningAverage
#          from ignite.contrib.handlers import ProgressBar
def setup_training(self):
    assert self.batch_size is not None
    trainer = Engine(lambda e, b: self.train_step(b))
    # Custom event fired once per epoch, after evaluation has populated metrics.
    trainer.register_events("EVAL_DONE")
    Average(output_transform=lambda o: o['loss']).attach(trainer, 'avg_loss')

    # Objects whose state is saved and restored together.
    state_vars = dict(model=self.model, opt=self.opt, trainer=trainer)
    checkpoint_handler = ModelCheckpoint(
        self.run_path, '',
        score_function=lambda e: e.state.metrics['val_accuracy'],
        score_name='val_accuracy',
        n_saved=2,
        global_step_transform=lambda e, evt_name: e.state.epoch,
    )
    if checkpoint_handler.last_checkpoint:
        # load_objects expects the loaded checkpoint dict, not a path.
        checkpoint = torch.load(self.run_path / checkpoint_handler.last_checkpoint)
        ModelCheckpoint.load_objects(to_load=state_vars, checkpoint=checkpoint)
    # Checkpoint only after evaluation, so val_accuracy is available for scoring.
    trainer.add_event_handler("EVAL_DONE", lambda e: checkpoint_handler(e, state_vars))

    if self.use_lr_decay:
        trainer.add_event_handler(
            Events.ITERATION_COMPLETED,
            lambda e: self.lr_decay.step(e.state.iteration * self.batch_size))

    RunningAverage(output_transform=lambda o: o['loss']).attach(trainer, 'running_avg_loss')
    ProgressBar().attach(trainer, ['running_avg_loss'])
    logger.setup_logger(self.run_path, trainer, self.model)

    @trainer.on(Events.EPOCH_COMPLETED)
    def eval_and_log(e: Engine):
        eval_results = self.eval()
        e.state.metrics['val_accuracy'] = eval_results['val'].metrics['accuracy']
        e.state.metrics['val_loss'] = eval_results['val'].metrics['avg_loss']
        e.state.eval_results = eval_results
        e.fire_event("EVAL_DONE")

    if self.use_early_stop:
        es = self.make_early_stopper(trainer)
        trainer.add_event_handler("EVAL_DONE", es)

    return trainer
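# `make_early_stopper` is project code that is not shown here. A minimal
# sketch, assuming it wraps ignite's EarlyStopping on the val_accuracy metric
# set by eval_and_log; the `self.patience` attribute is hypothetical:
from ignite.handlers import EarlyStopping

def make_early_stopper(self, trainer):
    # Higher score is better; terminates `trainer` after `patience`
    # evaluations without improvement.
    return EarlyStopping(
        patience=self.patience,
        score_function=lambda e: e.state.metrics['val_accuracy'],
        trainer=trainer,
    )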
import torch
from ignite.handlers import ModelCheckpoint
# Project-local helpers assumed importable: get_data_loaders, Net,
# compute_dist, generate_self_label.


def cluster(train_batch_size, val_batch_size):
    device = "cuda"
    train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)

    # Restore the trained model from the saved checkpoint.
    model = Net()
    checkpoint = torch.load("mnist-cluster/resnet50_model_4690.pth")
    ModelCheckpoint.load_objects(to_load={'model': model}, checkpoint=checkpoint)
    model.to(device)
    model.eval()

    # evaluator = create_supervised_evaluator(
    #     model, metrics={"accuracy": Accuracy()}, device=device)
    # evaluator.run(val_loader)
    # metrics = evaluator.state.metrics
    # print(f"Validation Results Avg accuracy: {metrics['accuracy']:.2f}")

    # Extract features and ground-truth labels for the whole validation set.
    features = []
    labels = []
    with torch.no_grad():
        for image, label in val_loader:
            image = image.to(device)
            label = label.to(device)
            features.append(model(image))
            labels.append(label)
    features = torch.cat(features)
    target = torch.cat(labels)

    # Cluster the features via their pairwise distance matrix and assign
    # pseudo-labels, then print them against the ground truth.
    dist_matrix = compute_dist(features, if_re_ranking=False)
    class_num, cluster_labels = generate_self_label(dist_matrix)
    print(f"class_num {class_num}")
    for i in range(target.size(0)):
        print(f"{target[i]} : {cluster_labels[i]}")
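# `compute_dist` and `generate_self_label` are project helpers that are not
# shown here. A minimal sketch of compute_dist, assuming it returns the
# pairwise Euclidean distance matrix of the features; the real version
# presumably implements re-ranking when if_re_ranking=True:
def compute_dist(features, if_re_ranking=False):
    if if_re_ranking:
        raise NotImplementedError("re-ranking is not sketched here")
    # (N, D) features -> (N, N) pairwise Euclidean distances
    return torch.cdist(features, features)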
# Assumes: import torch; import torch.nn as nn; import torch.utils.tensorboard as tb;
# import matplotlib.pyplot as plt; ignite's create_supervised_trainer,
# create_supervised_evaluator, ModelCheckpoint, Events, Loss, MeanAbsoluteError,
# RootMeanSquaredError, setup_logger; and the project-local config, models,
# get_data_loaders, create_ar_evaluator, plot_output.
def run(args, seed):
    config.make_paths()
    torch.random.manual_seed(seed)

    train_loader, val_loader, shape = get_data_loaders(
        config.Training.batch_size,
        proportion=config.Training.proportion,
        test_batch_size=config.Training.batch_size * 2,
    )
    n, d, t = shape
    model = models.ConvNet(d, seq_len=t)

    writer = tb.SummaryWriter(log_dir=config.TENSORBOARD)

    model.to(config.device)  # move model before creating the optimizer
    optimizer = torch.optim.Adam(model.parameters())
    criterion = nn.MSELoss()

    trainer = create_supervised_trainer(model, optimizer, criterion, device=config.device)
    trainer.logger = setup_logger("trainer")

    checkpointer = ModelCheckpoint(
        config.MODEL,
        model.__class__.__name__,
        n_saved=2,
        create_dir=True,
        save_as_state_dict=True,
    )
    trainer.add_event_handler(
        Events.EPOCH_COMPLETED(every=config.Training.save_every),
        checkpointer,
        {"model": model},
    )

    val_metrics = {
        "mse": Loss(criterion),
        "mae": MeanAbsoluteError(),
        "rmse": RootMeanSquaredError(),
    }
    evaluator = create_supervised_evaluator(model, metrics=val_metrics, device=config.device)
    evaluator.logger = setup_logger("evaluator")

    ar_evaluator = create_ar_evaluator(model, metrics=val_metrics, device=config.device)
    ar_evaluator.logger = setup_logger("ar")

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.save_every))
    def log_ar(engine):
        # Autoregressive evaluation: plot predictions against targets.
        ar_evaluator.run(val_loader)
        y_pred, y = ar_evaluator.state.output
        fig = plot_output(y, y_pred)
        writer.add_figure("eval/ar", fig, engine.state.epoch)
        plt.close(fig)

    @trainer.on(Events.ITERATION_COMPLETED(every=config.Training.log_every))
    def log_training_loss(engine):
        if args.verbose:
            # Skip parameters that did not receive a gradient this step.
            grad_norm = torch.stack(
                [p.grad.norm() for p in model.parameters() if p.grad is not None]
            ).sum()
            writer.add_scalar("train/grad_norm", grad_norm, engine.state.iteration)
        writer.add_scalar("train/loss", engine.state.output, engine.state.iteration)

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_training_results(engine):
        evaluator.run(train_loader)
        for k, v in evaluator.state.metrics.items():
            writer.add_scalar(f"train/{k}", v, engine.state.epoch)

    @trainer.on(Events.EPOCH_COMPLETED(every=config.Training.eval_every))
    def log_validation_results(engine):
        evaluator.run(val_loader)
        for k, v in evaluator.state.metrics.items():
            writer.add_scalar(f"eval/{k}", v, engine.state.epoch)
        y_pred, y = evaluator.state.output
        fig = plot_output(y, y_pred)
        writer.add_figure("eval/preds", fig, engine.state.epoch)
        plt.close(fig)

    if args.ckpt is not None:
        ckpt = torch.load(args.ckpt)
        ModelCheckpoint.load_objects(to_load={"model": model}, checkpoint=ckpt)

    try:
        trainer.run(train_loader, max_epochs=config.Training.max_epochs)
    except Exception:
        import traceback
        print(traceback.format_exc())

    writer.close()
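# `create_ar_evaluator` is project code that is not shown here. A minimal
# sketch, assuming it mirrors create_supervised_evaluator but keeps the raw
# (y_pred, y) pair as engine output so plot_output can consume it; the real
# version would roll the model forward autoregressively, feeding predictions
# back in as inputs:
from ignite.engine import Engine

def create_ar_evaluator(model, metrics, device):
    def step(engine, batch):
        model.eval()
        with torch.no_grad():
            x, y = batch
            x, y = x.to(device), y.to(device)
            y_pred = model(x)  # placeholder for the autoregressive rollout
        return y_pred, y

    engine = Engine(step)
    for name, metric in metrics.items():
        metric.attach(engine, name)
    return engine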
def build_trainer(experiment_dir: DirPath,
                  train_data_loader: data.DataLoader,
                  test_data_loader: data.DataLoader,
                  train_params: dict,
                  net_params: dict,
                  image_size: int,
                  optimizer_params: dict,
                  is_tabolar_mode: str,
                  runmode: str,
                  cpugpu: str = 'gpu') -> ignite.engine.Engine:
    checkpoint_dir = osp.join(experiment_dir, train_params['checkpoint_relative_path'])
    logging_dir = osp.join(experiment_dir, train_params['logging_dir_relative_path'])
    tb_dir = osp.join(logging_dir, 'tensorboard')

    # Build the network: with tabular (clinical) features fused in, or image-only.
    if is_tabolar_mode == 'yes':
        model = FeClinicNet(net_params['net_name'],
                            net_params[net_params['net_name']],
                            net_params['classifier_fc_size'],
                            net_params['pretrained'],
                            image_size)
    else:
        model = ModelFactory.create_model(net_params['net_name'],
                                          net_params[net_params['net_name']],
                                          net_params['pretrained'],
                                          net_params['classifier_layer_size'],
                                          image_size)

    if cpugpu == 'gpu':
        model = model.cuda()
        device = 'cuda'
    else:
        model = model.cpu()
        device = None

    loss = nn.CrossEntropyLoss()
    optimizer = OptimizerFactory.create_optimizer(
        optimizer_name=train_params['optimizer_type'],
        net_params=model.parameters(),
        optimizer_params=optimizer_params)
    lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9261)
    summary_writer = SummaryWriter(log_dir=tb_dir)

    # Project-local wrappers around ignite's factories; they additionally
    # accept is_tabolar_mode to route the extra tabular inputs.
    trainer = create_supervised_trainer(
        model=model,
        optimizer=optimizer,
        device=device,
        is_tabolar_mode=is_tabolar_mode,
        prepare_batch=choose_prepare_batch(is_tabolar_mode),
        non_blocking=True,
        loss_fn=loss)
    metrics = build_metrics()
    evaluator = create_supervised_evaluator(
        model=model,
        metrics=metrics,
        device=device,
        non_blocking=True,
        prepare_batch=choose_prepare_batch(is_tabolar_mode),
        is_tabolar_mode=is_tabolar_mode)
    eval_dir = create_logging_dir_for_eval(log_dir=logging_dir)

    checkpoint_handler = ModelCheckpoint(dirname=checkpoint_dir,
                                         filename_prefix='checkpoint',
                                         save_interval=1,
                                         n_saved=30,
                                         atomic=True,
                                         require_empty=False,
                                         create_dir=True,
                                         save_as_state_dict=True)

    if runmode == 'new':
        # Fresh run: clear any stale tensorboard logs.
        shutil.rmtree(tb_dir, ignore_errors=True)
        starting_epoch = 0
    elif runmode == 'resume':
        # Restore trainer/model/optimizer state from the latest checkpoint.
        to_load = {'trainer': trainer, 'model': model, 'optimizer': optimizer}
        checkpoint = torch.load(get_last_checkpoint(checkpoint_dir))
        ModelCheckpoint.load_objects(to_load=to_load, checkpoint=checkpoint)
        starting_epoch = trainer.state.epoch
    else:
        raise ValueError(f"Unknown runmode: {runmode!r}")

    attach_trainer_events(trainer=trainer,
                          evaluator=evaluator,
                          train_data_loader=train_data_loader,
                          test_data_loader=test_data_loader,
                          checkpoint_handler=checkpoint_handler,
                          model=model,
                          summary_writer=summary_writer,
                          eval_freq=train_params['eval_freq'],
                          starting_epoch=starting_epoch,
                          optimizer=optimizer,
                          eval_dir=eval_dir,
                          lr_scheduler=lr_scheduler)
    return trainer
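# `choose_prepare_batch` is project code that is not shown here. A minimal
# sketch, assuming the tabular mode yields (image, clinical_features, target)
# batches and the image-only mode yields (image, target); prepare_batch must
# follow ignite's (batch, device, non_blocking) -> (x, y) contract:
from ignite.utils import convert_tensor

def choose_prepare_batch(is_tabolar_mode):
    def prepare_batch(batch, device=None, non_blocking=False):
        if is_tabolar_mode == 'yes':
            image, tabular, target = batch
            x = (convert_tensor(image, device=device, non_blocking=non_blocking),
                 convert_tensor(tabular, device=device, non_blocking=non_blocking))
        else:
            image, target = batch
            x = convert_tensor(image, device=device, non_blocking=non_blocking)
        y = convert_tensor(target, device=device, non_blocking=non_blocking)
        return x, y
    return prepare_batch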