import logging
import math
import os
import time

from torch.optim.lr_scheduler import StepLR, MultiStepLR
from torch.utils.tensorboard import SummaryWriter

import utils


def train_and_evaluate_kd(model, teacher_model, train_dataloader, val_dataloader, optimizer,
                          loss_fn_kd, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the student network
        teacher_model: (torch.nn.Module) the teacher network, used only for inference
        train_dataloader: (DataLoader) loader for the training set
        val_dataloader: (DataLoader) loader for the validation set
        optimizer: (torch.optim.Optimizer) optimizer for the student's parameters
        loss_fn_kd: knowledge-distillation loss function
        metrics: (dict) mapping of metric names to functions computed on a batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) file to restore (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    restore_start = time.time()
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)
        logging.info("restore_load_time: {}".format(time.time() - restore_start))

    best_val_acc = 0.0

    # Tensorboard logger setup
    # board_logger = utils.Board_Logger(os.path.join(model_dir, 'board_logs'))

    # fetch teacher outputs using teacher_model under eval() mode
    loading_start = time.time()
    teacher_model.eval()
    teacher_outputs = fetch_teacher_outputs(teacher_model, train_dataloader, params)
    elapsed_time = math.ceil(time.time() - loading_start)
    logging.info("- Finished computing teacher outputs after {} secs..".format(elapsed_time))

    # learning rate schedulers for different models
    if params.model_version == "resnet18_distill":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn_distill":
        # for CNN models num_epochs is always < 100, so this schedule never actually fires
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    train_start = time.time()
    for epoch in range(params.num_epochs):
        scheduler.step()

        # Run one epoch
        logging.info("KD: Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # train for one epoch (one full pass over the training set)
        train_kd(model, teacher_outputs, optimizer, loss_fn_kd, train_dataloader,
                 metrics, params)

        # Evaluate for one epoch on the validation set
        val_metrics = evaluate_kd(model, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # #============ TensorBoard logging: uncomment below to turn it on ============#
        # # (1) Log the scalar values
        # info = {'val accuracy': val_acc}
        # for tag, value in info.items():
        #     board_logger.scalar_summary(tag, value, epoch + 1)
        # # (2) Log values and gradients of the parameters (histogram)
        # for tag, value in model.named_parameters():
        #     tag = tag.replace('.', '/')
        #     board_logger.histo_summary(tag, value.data.cpu().numpy(), epoch + 1)
        #     # board_logger.histo_summary(tag + '/grad', value.grad.data.cpu().numpy(), epoch + 1)

    logging.info("train: {}".format(time.time() - train_start))
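# The variant above computes the teacher's logits once, up front, and reuses the
# cached outputs on every epoch instead of re-running the teacher. The helper
# `fetch_teacher_outputs` is not shown in this section; below is a minimal sketch
# (not the repository's implementation), assuming it returns one logits array per
# batch, in dataloader order, which is how `train_kd` consumes it above. `params`
# is kept only to match the call signature used above.
import torch


def fetch_teacher_outputs(teacher_model, dataloader, params):
    """Run the teacher once over the dataloader and cache its logits per batch."""
    teacher_model.eval()
    device = next(teacher_model.parameters()).device
    teacher_outputs = []
    with torch.no_grad():
        for data_batch, _ in dataloader:
            data_batch = data_batch.to(device)
            # store on CPU as numpy so the cache does not pin GPU memory
            teacher_outputs.append(teacher_model(data_batch).cpu().numpy())
    return teacher_outputs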
def train_and_evaluate_kd(model, teacher_model, train_dataloader, val_dataloader, optimizer,
                          loss_fn_kd, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) file to restore (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # Tensorboard logger setup
    # board_logger = utils.Board_Logger(os.path.join(model_dir, 'board_logs'))

    # fetch teacher outputs using teacher_model under eval() mode
    loading_start = time.time()
    teacher_model.eval()
    teacher_outputs = fetch_teacher_outputs(teacher_model, train_dataloader, params)
    elapsed_time = math.ceil(time.time() - loading_start)
    logging.info("- Finished computing teacher outputs after {} secs..".format(elapsed_time))

    # learning rate schedulers for different models
    if params.model_version == "resnet18_distill":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn_distill":
        # for CNN models num_epochs is always < 100, so this schedule never actually fires
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):
        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # train for one epoch (one full pass over the training set)
        train_kd(model, teacher_outputs, optimizer, loss_fn_kd, train_dataloader,
                 metrics, params)

        # Evaluate for one epoch on the validation set
        val_metrics = evaluate_kd(model, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
def train_and_evaluate_kd(model, teacher_model, train_dataloader, val_dataloader, optimizer,
                          loss_fn_kd, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) file to restore (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # Tensorboard logger setup
    # board_logger = utils.Board_Logger(os.path.join(model_dir, 'board_logs'))

    # learning rate schedulers for different models
    if params.model_version == "resnet18_distill":
        scheduler = StepLR(optimizer, step_size=150, gamma=0.1)
    elif params.model_version == "cnn_distill":
        # for CNN models num_epochs is always < 100, so this schedule never actually fires
        scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):
        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # train for one epoch (one full pass over the training set);
        # the teacher model is passed down so its logits are computed per batch
        train_kd(model, teacher_model, optimizer, loss_fn_kd, train_dataloader,
                 metrics, params)

        # Evaluate for one epoch on the validation set
        val_metrics = evaluate_kd(model, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
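# Unlike the first two variants, the one above hands the full `teacher_model` to
# `train_kd`, so teacher logits are recomputed for every batch rather than cached.
# A minimal sketch of a single KD update in that style follows; it assumes a
# `loss_fn_kd(student_logits, labels, teacher_logits, params)` signature, which is
# an assumption not confirmed by this section.
import torch


def kd_train_step(model, teacher_model, optimizer, loss_fn_kd,
                  data_batch, labels_batch, params):
    """One KD update: student forward, teacher forward without gradients, loss, step."""
    output_batch = model(data_batch)                      # student logits
    with torch.no_grad():
        output_teacher_batch = teacher_model(data_batch)  # teacher logits, no grad tracking
    loss = loss_fn_kd(output_batch, labels_batch, output_teacher_batch, params)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()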
def train_and_evaluate_kd(model, teacher_model, train_dataloader, val_dataloader, optimizer,
                          loss_fn_kd, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) file to restore (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    # TensorBoard logger used for the validation metrics below
    board_logger = SummaryWriter(log_dir=os.path.join(model_dir, 'board_logs'))

    # fetch teacher outputs using teacher_model under eval() mode
    loading_start = time.time()
    teacher_model.eval()
    teacher_outputs = fetch_teacher_outputs(teacher_model, train_dataloader, params)
    teacher_outputs_val = fetch_teacher_outputs(teacher_model, val_dataloader, params)
    elapsed_time = math.ceil(time.time() - loading_start)
    logging.info("- Finished computing teacher outputs after {} secs..".format(elapsed_time))

    # for CNN models num_epochs is always < 100, so this schedule never actually fires
    scheduler = StepLR(optimizer, step_size=100, gamma=0.2)

    for epoch in range(params.num_epochs):
        scheduler.step()

        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # train for one epoch (one full pass over the training set)
        train_kd(model, teacher_outputs, optimizer, loss_fn_kd, train_dataloader,
                 metrics, params, epoch)

        # Evaluate for one epoch on the validation set
        val_metrics = evaluate_kd(model, val_dataloader, metrics, params,
                                  teacher_outputs_val, loss_fn_kd)

        # log validation metrics to TensorBoard
        for k, v in val_metrics.items():
            board_logger.add_scalars(k, {'test': v}, epoch)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir, "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

    board_logger.close()
def train_and_evaluate_kd(model, teacher_model, train_dataloader, val_dataloader, optimizer,
                          loss_fn_kd, warmup_scheduler, params, args, restore_file=None):
    """KD: train the model and evaluate every epoch."""
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(args.model_dir, args.restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    # tensorboard setting
    log_dir = os.path.join(args.model_dir, 'tensorboard')
    writer = SummaryWriter(log_dir=log_dir)

    best_val_acc = 0.0
    teacher_model.eval()
    teacher_acc = evaluate_kd(teacher_model, val_dataloader, params)
    print(">>>>>>>>>The teacher accuracy: {}>>>>>>>>>".format(teacher_acc['accuracy']))

    scheduler = MultiStepLR(optimizer, milestones=[60, 120, 160], gamma=0.2)

    for epoch in range(params.num_epochs):
        if epoch > 0:  # epoch 0 is the warm-up epoch
            scheduler.step()
        logging.info("Epoch {}/{}, lr: {}".format(epoch + 1, params.num_epochs,
                                                  optimizer.param_groups[0]['lr']))

        # KD train
        train_acc, train_loss = train_kd(model, teacher_model, optimizer, loss_fn_kd,
                                         train_dataloader, warmup_scheduler, params,
                                         args, epoch)

        # Evaluate
        val_metrics = evaluate_kd(model, val_dataloader, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': model.state_dict(),
                               'optim_dict': optimizer.state_dict()},
                              is_best=is_best,
                              checkpoint=args.model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(args.model_dir, "eval_best_result.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(args.model_dir, "eval_last_result.json")
        utils.save_dict_to_json(val_metrics, last_json_path)

        # Tensorboard
        writer.add_scalar('Train_accuracy', train_acc, epoch)
        writer.add_scalar('Train_loss', train_loss, epoch)
        writer.add_scalar('Test_accuracy', val_metrics['accuracy'], epoch)
        writer.add_scalar('Test_loss', val_metrics['loss'], epoch)

    # flush and close the TensorBoard writer
    writer.close()
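# All five variants rely on a `loss_fn_kd` that is defined elsewhere. Below is a
# minimal sketch of the standard distillation loss (Hinton et al., 2015); the
# `params.alpha` and `params.temperature` hyperparameters and the
# (student_logits, labels, teacher_logits, params) signature are assumptions, not
# facts taken from this section.
import torch.nn.functional as F


def loss_fn_kd_sketch(student_logits, labels, teacher_logits, params):
    """KL divergence to the softened teacher distribution plus hard-label cross entropy."""
    T = params.temperature
    alpha = params.alpha
    # soft targets: teacher probabilities at temperature T; the T*T factor keeps the
    # gradient magnitude of the soft term comparable to the hard-label term
    soft_loss = F.kl_div(F.log_softmax(student_logits / T, dim=1),
                         F.softmax(teacher_logits / T, dim=1),
                         reduction='batchmean') * (T * T)
    hard_loss = F.cross_entropy(student_logits, labels)
    return alpha * soft_loss + (1.0 - alpha) * hard_loss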