# Standard-library and framework imports this evaluation script relies on;
# project-local modules (dataset, metrics_code, evaluate, utils) are assumed to be importable.
import argparse
import importlib
import logging
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import models

import utils


def main():
    # Set the random seed for reproducible experiments
    torch.manual_seed(230)

    parser = argparse.ArgumentParser()
    parser.add_argument('--data_dir', help="Directory containing the dataset")
    parser.add_argument('--model_dir', help="Directory containing weights to restore and logs")
    parser.add_argument('--params', help="Path to params.json")
    parser.add_argument('--restore_file', default='best',
                        help="name of the file in --model_dir containing weights to load")
    args = parser.parse_args()

    params = utils.Params(args.params)

    # Get the logger
    utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

    # Create the input data pipeline
    logging.info("Creating the dataset...")
    test_dataset = dataset(file_path=params.metadata_file, split="Test",
                           classes=params.classes)
    test_loader = DataLoader(dataset=test_dataset,
                             batch_size=params.batch_size,
                             shuffle=True,
                             num_workers=8)
    logging.info("- done.")

    # Define the model
    if params.model != "Inception":
        net = importlib.import_module("features.models.{}".format(params.model))
        model = net.Net()
        inception = False
    else:
        model = models.inception_v3(pretrained=False)
        num_classes = len(params.classes)  # assumes the class list in params defines the output size
        model.fc = nn.Linear(2048, num_classes)
        model.AuxLogits.fc = nn.Linear(768, 1)
        inception = True
    model.cuda()

    metrics_save = metrics_code.metrics_save

    logging.info("Starting evaluation")

    # Reload weights from the saved file
    utils.load_checkpoint(
        os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

    # Evaluate
    experiment = None  # no experiment tracker is attached during standalone evaluation
    test_metrics = evaluate(model, test_loader, metrics_save, experiment, inception)
    save_path = os.path.join(args.model_dir,
                             "metrics_test_{}.json".format(args.restore_file))
    utils.save_dict_to_json(test_metrics, save_path)
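# Note: the `utils.load_checkpoint` / `utils.save_dict_to_json` helpers called above are
# not defined in this fragment. A minimal sketch consistent with how they are invoked here
# (and in several snippets below) might look like the following; the project's actual
# implementations may differ.
import json
import torch


def load_checkpoint(checkpoint_path, model, optimizer=None):
    # Load model (and optionally optimizer) state saved under the 'state_dict' /
    # 'optim_dict' keys used by the save_checkpoint calls in the training loops below.
    checkpoint = torch.load(checkpoint_path)
    model.load_state_dict(checkpoint['state_dict'])
    if optimizer is not None:
        optimizer.load_state_dict(checkpoint['optim_dict'])
    return checkpoint


def save_dict_to_json(d, json_path):
    # Save a dict of float-castable values to a JSON file.
    with open(json_path, 'w') as f:
        json.dump({k: float(v) for k, v in d.items()}, f, indent=4)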
def test(self, restore_from):
    """Test the model.

    Args:
        restore_from: (string) directory or file containing weights to restore the graph
    """
    hp = self.hp
    experiment_dir = hp.experiment_dir

    split = DatasetSplit.TEST
    model_spec = self._get_model_spec(split)

    # Initialize tf.Saver
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Initialize the lookup table
        sess.run(model_spec['variable_init_op'])

        # Reload weights from the weights subdirectory
        save_path = os.path.join(experiment_dir, restore_from)
        if os.path.isdir(save_path):
            save_path = tf.train.latest_checkpoint(save_path)
        saver.restore(sess, save_path)

        # Evaluate
        num_steps = (hp.test_size + hp.batch_size - 1) // hp.batch_size
        metrics = self.evaluate_epoch(sess, model_spec, num_steps)

        loss_string, acc_string = self.metrics_string(metrics)
        tf.logging.info("- Test metrics: " + acc_string)
        tf.logging.info("- Test metrics: " + loss_string)

        metrics_name = '_'.join(restore_from.split('/'))
        save_path = os.path.join(
            experiment_dir, "metrics_test_{}.json".format(metrics_name))
        utils.save_dict_to_json(metrics, save_path)
# Get the logger
utils.set_logger(os.path.join(args.model_dir, 'evaluate.log'))

# Create the input data pipeline
logging.info("Creating the dataset...")

# fetch dataloaders
dataloaders = data_loader.fetch_data_loader(['val'], args.data_dir, params)
test_dl = dataloaders['val']
logging.info("getting the test dataloader - done.")

# Define the model
model = PhdGifNet().cuda() if params.cuda else PhdGifNet()

# loss_fn and metrics are assumed to be imported alongside PhdGifNet
# (e.g. from the model definition module)

logging.info("Starting evaluation")

# Reload weights from the saved file
utils.load_checkpoint(
    os.path.join(args.model_dir, args.restore_file + '.pth.tar'), model)

# Evaluate
test_metrics = evaluate(model, loss_fn, test_dl, metrics, params)
save_path = os.path.join(args.model_dir,
                         "metrics_test_{}.json".format(args.restore_file))
utils.save_dict_to_json(test_metrics, save_path)
def train_and_evaluate(model, ad_net, grl, ad_net_m, grl_m, Myacc,
                       train_dataloader, val_dataloader, optimizer, loss_fn,
                       metrics, params, model_dir, logger, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) name of file to restore from (without its extension .pth.tar)
    """
    best_val_acc = 0.0

    # reload weights from restore_file if specified
    #if args.finetune:
    #    num_ftrs = model.fc8.in_features
    #    model.fc8 = nn.Linear(num_ftrs, 60)
    #    model = model.cuda()
    #logger.info(model)
    if restore_file is not None:
        logging.info("Restoring parameters from {}".format(restore_file))
        checkpoint = utils.load_checkpoint(restore_file, model, optimizer)
        params.start_epoch = checkpoint['epoch']
        best_val_acc = checkpoint['best_val_acc']
        print('best_val_acc=', best_val_acc)
        print(optimizer.state_dict()['param_groups'][0]['lr'],
              checkpoint['epoch'])

    # learning rate schedulers for different models:
    if params.lr_decay_type is None:
        logging.info("no lr decay")
    else:
        assert params.lr_decay_type in ['multistep', 'exp', 'plateau']
        logging.info("lr decay:{}".format(params.lr_decay_type))
        if params.lr_decay_type == 'multistep':
            scheduler = MultiStepLR(optimizer,
                                    milestones=params.lr_step,
                                    gamma=params.scheduler_gamma,
                                    last_epoch=params.start_epoch - 1)
        elif params.lr_decay_type == 'exp':
            scheduler = ExponentialLR(optimizer,
                                      gamma=params.scheduler_gamma2,
                                      last_epoch=params.start_epoch - 1)
        elif params.lr_decay_type == 'plateau':
            scheduler = ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=params.scheduler_gamma3,
                                          patience=params.patience,
                                          verbose=False,
                                          threshold=0.0001,
                                          threshold_mode='rel',
                                          cooldown=0,
                                          min_lr=0,
                                          eps=1e-08)

    if args.finetune:
        num_ftrs = model.fc8.in_features
        model.fc8 = nn.Linear(num_ftrs, 30)
        model = model.cuda()
        logger.info(model)

    for epoch in range(params.start_epoch, params.num_epochs):
        params.current_epoch = epoch
        # step every scheduler except plateau (which steps on the validation loss below)
        if params.lr_decay_type is not None and params.lr_decay_type != 'plateau':
            scheduler.step()

        # Run one epoch
        logger.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics, train_confusion_meter = train(model, ad_net, grl,
                                                     ad_net_m, grl_m,
                                                     optimizer, loss_fn,
                                                     train_dataloader, metrics,
                                                     params, logger)

        # Evaluate for one epoch on validation set
        val_metrics, val_confusion_meter = evaluate(model, loss_fn,
                                                    val_dataloader, metrics,
                                                    params, logger)

        # vis logger
        accs = [
            100. * (1 - train_metrics['accuracytop1']),
            100. * (1 - train_metrics['accuracytop5']),
            100. * (1 - val_metrics['accuracytop1']),
            100. * (1 - val_metrics['accuracytop5']),
        ]
        error_logger15.log([epoch] * 4, accs)
        Myacc.append(100. * (1 - val_metrics['accuracytop1']))
        losses = [train_metrics['loss'], val_metrics['loss']]
        loss_logger.log([epoch] * 2, losses)
        train_confusion_logger.log(train_confusion_meter.value())
        test_confusion_logger.log(val_confusion_meter.value())

        # log split loss
        if epoch == params.start_epoch:
            loss_key_train = []
            loss_key_val = []
            for key in [k for k, v in train_metrics.items()]:
                if 'ls' in key:
                    loss_key_train.append(key)
            for key in [k for k, v in val_metrics.items()]:
                if 'ls' in key:
                    loss_key_val.append(key)
            loss_split_key = ['train_' + k for k in loss_key_train
                              ] + ['val_' + k for k in loss_key_val]
            loss_logger_split.opts['legend'] = loss_split_key
        loss_split = [train_metrics[k] for k in loss_key_train
                      ] + [val_metrics[k] for k in loss_key_val]
        loss_logger_split.log([epoch] * len(loss_split_key), loss_split)

        if params.lr_decay_type == 'plateau':
            scheduler.step(val_metrics['ls_all'])

        val_acc = val_metrics['accuracytop1']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict(),
                'best_val_acc': best_val_acc
            },
            epoch=epoch + 1,
            is_best=is_best,
            save_best_ever_n_epoch=params.save_best_ever_n_epoch,
            checkpointpath=params.experiment_path + '/checkpoint',
            start_epoch=params.start_epoch)
        val_metrics['best_epoch'] = epoch + 1

        # If best_eval, best_save_path, metric
        if is_best:
            logger.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(params.experiment_path,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(params.experiment_path,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
def train_and_evaluate(model_source, model_target, transfer, train_dl,
                       val_dl_source, val_dl_target, opt, loss_fn, metrics,
                       params, lr_scheduler, checkpoint_dir, ckpt_filename,
                       log_dir, writer):
    ckpt_file_path = os.path.join(checkpoint_dir, ckpt_filename)
    best_value = -float('inf')
    early_stopping = utils.EarlyStopping(patience=10, verbose=True)
    start_epoch = 0

    batch_sample_source, batch_gt_source = next(iter(val_dl_source))
    batch_sample_target, batch_gt_target = next(iter(val_dl_target))

    if os.path.exists(ckpt_file_path):
        transfer, opt, lr_scheduler, start_epoch, best_value = utils.load_checkpoint(
            transfer, opt, lr_scheduler, start_epoch, False, best_value,
            checkpoint_dir, ckpt_filename)
        print("=> loaded transfer checkpoint from {} (epoch {})".format(
            ckpt_file_path, start_epoch))
    else:
        print("=> Initializing from scratch")

    source_encoder = model_source.backbone
    target_encoder = model_target.backbone
    target_decoder = model_target.classifier
    adaptive_model = get_adaptive_network(source_encoder, transfer, target_decoder)

    for epoch in range(start_epoch, params.num_epochs):
        # Run one epoch
        current_lr = get_lr(opt)
        logging.info('Epoch {}/{}, current lr={}'.format(
            epoch, params.num_epochs - 1, current_lr))
        writer.add_scalar('Learning_rate', current_lr, epoch)

        transfer.train()
        train_loss, train_metrics = train_epoch(
            source_encoder, target_encoder, transfer, loss_fn, train_dl, opt,
            lr_scheduler, params=params)

        transfer.eval()
        val_loss_source, _ = train_epoch(
            source_encoder, target_encoder, transfer, loss_fn, val_dl_source,
            params=params)

        # Evaluate for one epoch on validation set
        _, val_metrics_source = evaluate(
            adaptive_model, None, val_dl_source, metrics=metrics, params=params)
        _, val_metrics_target = evaluate(
            adaptive_model, None, val_dl_target, metrics=metrics, params=params)

        writer.add_scalars('Loss', {
            'Training': train_loss,
            'Validation': val_loss_source,
        }, epoch)
        for (val_metric_name_s, val_metric_results_s), \
                (val_metric_name_t, val_metric_results_t) in zip(
                    val_metrics_source.items(), val_metrics_target.items()):
            writer.add_scalars(val_metric_name_s, {
                'Validation_source': val_metric_results_s[0],
                'Validation_target': val_metric_results_t[0],
            }, epoch)

        if epoch % 5 == 0 or epoch == params.num_epochs - 1:
            predictions = inference(adaptive_model, batch_sample_source)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_source, predictions.cpu(), batch_gt_source)
            writer.add_image('Predictions_source', plot, epoch, dataformats='HWC')

            predictions = inference(adaptive_model, batch_sample_target)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_target, predictions.cpu(), batch_gt_target)
            writer.add_image('Predictions_target', plot, epoch, dataformats='HWC')

        current_value = list(val_metrics_source.values())[0][0]
        is_best = current_value >= best_value

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_value = current_value

            # Save best val metrics in a json file in the model directory
            # (note: the second call overwrites the first with the target metrics)
            best_json_path = os.path.join(log_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics_source, best_json_path)
            utils.save_dict_to_json(val_metrics_target, best_json_path)

        # Save weights
        utils.save_checkpoint({'epoch': epoch + 1,
                               'state_dict': transfer.state_dict(),
                               'optim_dict': opt.state_dict(),
                               'scheduler_dict': lr_scheduler.state_dict(),
                               'best_value': best_value},
                              is_best=is_best,
                              ckpt_dir=checkpoint_dir,
                              filename=ckpt_filename)

        logging.info("\ntrain loss: %.3f, val loss: %.3f" %
                     (train_loss, val_loss_source))
        for (val_metric_name_s, val_metric_results_s), \
                (val_metric_name_t, val_metric_results_t) in zip(
                    val_metrics_source.items(), val_metrics_target.items()):
            logging.info("source %s: %.3f, target %s: %.3f" %
                         (val_metric_name_s, val_metric_results_s[0],
                          val_metric_name_t, val_metric_results_t[0]))
        logging.info("-" * 20)

        early_stopping(val_loss_source)
        if early_stopping.early_stop:
            logging.info("Early stopping")
            break
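# Note: `utils.EarlyStopping` is used above (patience-based stop on the source validation
# loss) but not defined in this fragment. A minimal sketch matching the call pattern
# (instantiate, call with a loss each epoch, check `.early_stop`) could be:
class EarlyStopping:
    # Stop training once the monitored loss has not improved for `patience` epochs.
    def __init__(self, patience=10, verbose=False, delta=0.0):
        self.patience = patience
        self.verbose = verbose
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss):
        if self.best_loss is None or val_loss < self.best_loss - self.delta:
            # Improvement: remember the new best loss and reset the counter.
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.verbose:
                print("EarlyStopping counter: {}/{}".format(self.counter, self.patience))
            if self.counter >= self.patience:
                self.early_stop = True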
torch.cuda.manual_seed(seed)

# fetch dataloaders
val_dl = dataloader.fetch_dataloader(args.data_dir, args.txt_val, "val", params)

# Define the model
model = get_network(params).to(params.device)  # num_classes+1 for background.

metrics = OrderedDict({})
for metric in params.metrics:
    metrics[metric] = get_metrics(metric, params)

# Reload weights from the saved file
model = utils.load_checkpoint(model,
                              is_best=True,
                              checkpoint_dir=args.checkpoint_dir)[0]

# Evaluate
eval_loss, val_metrics = evaluate(model,
                                  val_dl,
                                  loss_fn=None,
                                  metrics=metrics,
                                  params=params)

best_json_path = os.path.join(args.model_dir, "logs/evaluation.json")
for val_metric_name, val_metric_results in val_metrics.items():
    print("{}: {}".format(val_metric_name, val_metric_results))
utils.save_dict_to_json(val_metrics, best_json_path)
def train_and_evaluate(model_source, model_target, transfer, train_dl_all,
                       val_dl_all, val_dl_target, opt1, opt2, opt3, loss_fn1,
                       loss_fn2, metrics_depth, metrics_segmentation, params,
                       lr_scheduler1, lr_scheduler2, lr_scheduler3,
                       checkpoint_dir_source, checkpoint_dir_target,
                       checkpoint_dir_transfer, ckpt_filename, log_dir, writer):
    ckpt_file_path = os.path.join(checkpoint_dir_transfer, ckpt_filename)
    best_value = -float('inf')
    start_epoch = 0

    batch_sample_carla, batch_gt_carla_sem, batch_gt_carla_depth, _, _ = next(
        iter(val_dl_all))
    batch_sample_cs, batch_gt_cs = next(iter(val_dl_target))

    if os.path.exists(ckpt_file_path):
        transfer, opt3, lr_scheduler3, start_epoch, best_value = utils.load_checkpoint(
            transfer, opt3, lr_scheduler3, start_epoch, False, best_value,
            checkpoint_dir_transfer, ckpt_filename)
        print("=> loaded transfer checkpoint from {} (epoch {})".format(
            ckpt_file_path, start_epoch))
    else:
        print("=> Initializing transfer from scratch")

    source_encoder = model_source.backbone
    target_decoder = model_target.classifier
    adaptive_model = get_adaptive_network(source_encoder, transfer, target_decoder)

    for epoch in range(start_epoch, params.num_epochs):
        # Run one epoch
        current_lr = get_lr(opt3)
        logging.info('Epoch {}/{}, current lr={}'.format(
            epoch, params.num_epochs - 1, current_lr))
        writer.add_scalar('Learning_rate', current_lr, epoch)

        transfer.train()
        train_loss_depth, train_loss_segmentation = train_epoch(
            model_source, model_target, transfer, train_dl_all, opt1, opt2,
            opt3, loss_fn1, loss_fn2, params, lr_scheduler1, lr_scheduler2,
            lr_scheduler3)

        writer.add_scalars(
            'Losses', {
                'Training_depth': train_loss_depth,
                'Training_segmentation': train_loss_segmentation,
            }, epoch)

        # if epoch % 5 == 0 or epoch == params.num_epochs - 1:
        predictions_sem = inference(model_target, batch_sample_carla)
        predictions_depth = inference(model_source, batch_sample_carla)
        plot = train_dl_all.dataset.get_predictions_plot(
            batch_sample_carla, predictions_sem.cpu(), batch_gt_carla_sem.cpu(),
            predictions_depth.cpu(), batch_gt_carla_depth.cpu())
        writer.add_image('Predictions_carla', plot, epoch, dataformats='HWC')

        predictions = inference(adaptive_model, batch_sample_cs)
        plot = val_dl_target.dataset.dataset.get_predictions_plot(
            batch_sample_cs, predictions.cpu(), batch_gt_cs)
        writer.add_image('Predictions_target', plot, epoch, dataformats='HWC')

        val_metrics_depth, val_metrics_segmentation = evaluate_source(
            model_source, model_target, val_dl_all, metrics_depth,
            metrics_segmentation, params)
        _, val_metrics_transfer = evaluate(adaptive_model,
                                           None,
                                           val_dl_target,
                                           metrics=metrics_segmentation,
                                           params=params)

        for (val_metric_name, val_metric_results) in val_metrics_depth.items():
            writer.add_scalar(val_metric_name, val_metric_results[0], epoch)
        for (val_metric_name, val_metric_results) in val_metrics_segmentation.items():
            writer.add_scalar(val_metric_name + '_target',
                              val_metric_results[0], epoch)
        for (val_metric_name, val_metric_results) in val_metrics_transfer.items():
            writer.add_scalar(val_metric_name + '_transfer',
                              val_metric_results[0], epoch)

        current_value = list(val_metrics_transfer.values())[0][0]
        is_best = current_value >= best_value

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best value")
            best_value = current_value
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(log_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics_transfer, best_json_path)

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model_source.state_dict(),
                'optim_dict': opt1.state_dict(),
                'scheduler_dict': lr_scheduler1.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            ckpt_dir=checkpoint_dir_source,
            filename=ckpt_filename)
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model_target.state_dict(),
                'optim_dict': opt2.state_dict(),
                'scheduler_dict': lr_scheduler2.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            ckpt_dir=checkpoint_dir_target,
            filename=ckpt_filename)
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': transfer.state_dict(),
                'optim_dict': opt3.state_dict(),
                'scheduler_dict': lr_scheduler3.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            ckpt_dir=checkpoint_dir_transfer,
            filename=ckpt_filename)

        logging.info(
            "\ntrain loss depth: %.3f, train loss segmentation: %.3f" %
            (train_loss_depth, train_loss_segmentation))
        for (val_metric_name, val_metric_results) in val_metrics_depth.items():
            logging.info("val depth %s: %.3f" %
                         (val_metric_name, val_metric_results[0]))
        for (val_metric_name, val_metric_results) in val_metrics_segmentation.items():
            logging.info("val segmentation target %s: %.3f" %
                         (val_metric_name, val_metric_results[0]))
        for (val_metric_name, val_metric_results) in val_metrics_transfer.items():
            logging.info("val segmentation transfer %s: %.3f" %
                         (val_metric_name, val_metric_results[0]))
        logging.info("-" * 20)
def train_and_evaluate(model, train_dataloader, val_dataloader, metrics_save,
                       model_dir, num_epochs, loss_func, optimizer,
                       learning_rate, decay, save_summary_steps,
                       experiment=None, inception=False, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        metrics_save: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # loss_func and optimizer are passed in as strings and instantiated here
    loss_func = eval(loss_func)
    optimizer = eval(optimizer)

    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(model_dir, restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0
    best_val_auc = 0.0

    for epoch in range(num_epochs):
        logging.info("Epoch {}/{}".format(epoch + 1, num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, train_dataloader, metrics_save, loss_func, optimizer,
              save_summary_steps, experiment, inception)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, val_dataloader, metrics_save, loss_func,
                               experiment, inception)

        val_acc = val_metrics['test_accuracy']
        val_auc = val_metrics['test_AUC']
        is_best = val_acc >= best_val_acc and val_auc >= best_val_auc

        # If best_eval and auc, best_save_path
        if is_best:
            logging.info("- Found new best accuracy or auc")
            best_val_acc = val_acc
            best_val_auc = val_auc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fns, scheduler, evaluator, writer, params,
                       model_dir, name, restore_file=None):
    # reload weights from restore_file if specified
    if restore_file is not None:
        restore_path = os.path.join(
            args.model_dir,
            args.model_type + '_' + args.restore_file + '.pth.tar')
        logging.info("Restoring parameters from {}".format(restore_path))
        utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, train_dataloader, optimizer, loss_fns, scheduler,
              evaluator, writer, epoch, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, val_dataloader, loss_fns, evaluator,
                               writer, epoch, params)

        val_acc = val_metrics['mIOU']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir,
            name=name)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
def train_and_eval(self, params, restore_from=None):
    """Train the model and evaluate every epoch.

    Args:
        train_model_spec: (dict) contains the graph operations or nodes needed for training
        params: (Params) contains hyperparameters of the model.
                Must define: num_epochs, train_size, batch_size, eval_size, save_summary_steps
        train_ds: training dataset
        eval_ds: evaluation dataset
        log_dir: directory for log
        restore_from: (string) directory or file containing weights to restore the graph
    """
    # set up the train summary writer
    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = os.path.join(self.log_dir, current_time, 'train_summaries')
    eval_log_dir = os.path.join(self.log_dir, current_time, 'eval_summaries')
    checkpoint_dir = os.path.join(self.log_dir, current_time,
                                  "training_checkpoints", 'ckpt')
    model_dir = os.path.join(self.log_dir, current_time)
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)
    eval_summary_writer = tf.summary.create_file_writer(eval_log_dir)

    begin_at_epoch = 0
    best_eval_acc = 100.0

    # TRAINING MAIN LOOP
    # ----------------------------------------------------------------------
    print("[INFO] training started ...")
    # loop over the number of epochs
    epochStart = time.time()
    for epoch in range(begin_at_epoch, begin_at_epoch + params.num_epochs):
        step = 0
        # sys.stdout.flush()

        # Compute number of batches in one epoch (one full pass over the training set)
        num_steps_train = int(np.ceil(params.train_size / params.batch_size))
        num_steps_eval = int(np.ceil(params.eval_size / params.batch_size))

        # Use tqdm for progress bar
        with tqdm(total=num_steps_train,
                  desc="[INFO] Epoch {0:d}".format(epoch + 1)) as pbar:
            # ----------------------------------------------------------------------
            # TRAIN SESSION
            # loop over the data in batch size increments
            for x_train, y_train in self.train_ds.take(num_steps_train):
                train_loss, logits = self.train_step(x_train, y_train)

                # Log the loss in the tqdm progress bar
                sleep(0.1)
                # Display metrics at the end of each epoch.
                metrics = {
                    "Train_MSE": '{:04.2f}'.format(
                        self.train_accuracy_mse.result().numpy()),
                    "Train_Loss": '{:04.2f}'.format(
                        self.train_loss.result().numpy())
                }
                pbar.set_postfix(metrics)
                pbar.update()

                # record train summary for tensorboard
                # if 0 < step < 30:
                with train_summary_writer.as_default():
                    tf.summary.image('training images', x_train,
                                     step=epoch + step + 1, max_outputs=5)
                    tf.summary.image('logit images', logits,
                                     step=epoch + step + 1, max_outputs=5)
                    tf.summary.image('label images', y_train,
                                     step=epoch + step + 1, max_outputs=5)
                step = step + 1

            with train_summary_writer.as_default():
                tf.summary.scalar('loss', self.train_loss.result(), step=epoch + 1)
                tf.summary.scalar('mse', self.train_accuracy_mse.result(), step=epoch + 1)

            # ----------------------------------------------------------------------
            # EVALUATION SESSION
            # loop over the eval data in batch size increments
            for x_eval, y_eval in self.eval_ds.take(num_steps_eval):
                eval_loss = self.test_step(x_eval, y_eval)

            # Display metrics at the end of each epoch.
            metrics["Eval_MSE"] = '{:04.2f}'.format(
                self.test_accuracy_mse.result().numpy())
            pbar.set_postfix(metrics)
            pbar.close()

        # record eval summary for tensorboard
        with eval_summary_writer.as_default():
            tf.summary.scalar('mse', self.test_accuracy_mse.result(), step=epoch + 1)
        # ----------------------------------------------------------------------

        metrics["Epoch"] = '{0:d}'.format(epoch + 1)

        # If best_eval, save the model at best_save_path
        eval_acc = self.test_accuracy_mse.result().numpy()
        if params.save_model:
            if eval_acc <= best_eval_acc:
                # Store new best accuracy
                best_eval_acc = eval_acc
                # Save weights
                best_save_path = os.path.join(model_dir,
                                              "model_{0:d}".format(epoch + 1))
                tf.keras.models.save_model(self.model, best_save_path,
                                           save_format="h5")
                print("[INFO] Found new best accuracy, saving in {}".format(
                    best_save_path))
                # Save best eval metrics in a json file in the model directory
                best_json_path = os.path.join(model_dir,
                                              "metrics_eval_best_weights.json")
                save_dict_to_json(metrics, best_json_path)

            # Save latest eval metrics in a json file in the model directory
            last_json_path = os.path.join(model_dir,
                                          "metrics_eval_last_weights.json")
            save_dict_to_json(metrics, last_json_path)
        # ----------------------------------------------------------------------

        # Reset training metrics at the end of each epoch
        self.train_loss.reset_states()
        self.train_accuracy_mse.reset_states()
        self.train_accuracy_kld.reset_states()
        self.test_accuracy_mse.reset_states()
        self.test_accuracy_kld.reset_states()

    # end of train and eval
    # show timing information for the epoch
    epochEnd = time.time()
    elapsed = (epochEnd - epochStart) / 60.0
    print("[INFO] Took {:.4} minutes".format(elapsed))
    # ----------------------------------------------------------------------

    if params.save_model:
        reconstructed_best_model = tf.keras.models.load_model(best_save_path)
        reconstructed_best_model.compile(optimizer=self.opt,
                                         loss=self.loss_object)
        best_final_path = os.path.join(model_dir, "best_full_model_path")
        tf.saved_model.save(reconstructed_best_model, best_final_path)
        print("[INFO] Final model saved in {}".format(best_final_path))

    print("[INFO] Training done and log saved in {}".format(model_dir))
config = get_config_from_json(config_path)

datadir = '../data_samples/ner/'
words_path = datadir + 'corpus_words.txt'
tags_path = datadir + 'corpus_tags.txt'

X, Y = [], []
# read the corpus in text mode so tokens are str, matching the vocab files written below
with Path(words_path).open('r') as f:
    for l in f:
        X.append(l.strip().split())
word_vocab = build_vocab(X)

with Path(tags_path).open('r') as f:
    for l in f:
        Y.append(l.strip().split())
tag_vocab = build_vocab(Y)

# add padding token
if PAD_WORD not in word_vocab:
    word_vocab.add(PAD_WORD)
if PAD_TAG not in tag_vocab:
    tag_vocab.add(PAD_TAG)

# save to disk
word_vocab_path = datadir + 'word_vocab.txt'
tag_vocab_path = datadir + 'tag_vocab.txt'
with Path(word_vocab_path).open('w') as f:
    f.write('\n'.join(word for word in word_vocab))
with Path(tag_vocab_path).open('w') as f:
    f.write('\n'.join(tag for tag in tag_vocab))

# update and save the json config with the vocabulary sizes
word_vocab_size = len(word_vocab)
tag_vocab_size = len(tag_vocab)
config = update_config_by_vocab(config, word_vocab_size, tag_vocab_size)
save_dict_to_json(config, config_path)
print("Updated config file with the new vocabulary sizes")
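# Note: `build_vocab` is called above but not defined in this fragment. Given how its
# result is used (membership tests, `.add`, `len`, iteration), a minimal sketch could be:
def build_vocab(sequences):
    # Collect the set of distinct tokens across a list of token lists.
    vocab = set()
    for tokens in sequences:
        vocab.update(tokens)
    return vocab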
def train_and_evaluate(model, train_dl, val_dl, opt, loss_fn, metrics, params,
                       lr_scheduler, checkpoint_dir, ckpt_filename, log_dir,
                       writer):
    # todo restore best checkpoint
    ckpt_file_path = os.path.join(checkpoint_dir, ckpt_filename)
    early_stopping = utils.EarlyStopping(patience=10, verbose=True)
    best_value = -float('inf')
    start_epoch = 0

    batch_sample_train, batch_gt_train = next(iter(train_dl))
    batch_sample_val, batch_gt_val = next(iter(val_dl))

    if os.path.exists(ckpt_file_path):
        model, opt, lr_scheduler, start_epoch, best_value = utils.load_checkpoint(
            model, opt, lr_scheduler, start_epoch, False, best_value,
            checkpoint_dir, ckpt_filename)
        print("=> loaded checkpoint from {} (epoch {})".format(
            ckpt_file_path, start_epoch))
    else:
        print("=> Initializing from scratch")

    for epoch in range(start_epoch, params.num_epochs):
        # Run one epoch
        current_lr = get_lr(opt)
        logging.info('Epoch {}/{}, current lr={}'.format(
            epoch, params.num_epochs - 1, current_lr))
        writer.add_scalar('Learning_rate', current_lr, epoch)

        if epoch % 5 == 0:
            predictions = inference(model, batch_sample_train)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_train, predictions.cpu(), batch_gt_train)
            writer.add_image('Predictions_train', plot, epoch, dataformats='HWC')

            predictions = inference(model, batch_sample_val)
            plot = train_dl.dataset.get_predictions_plot(
                batch_sample_val, predictions.cpu(), batch_gt_val)
            writer.add_image('Predictions_val', plot, epoch, dataformats='HWC')

        model.train()
        train_loss, train_metrics = train_epoch(model, loss_fn, train_dl, opt,
                                                lr_scheduler, metrics, params)

        # Evaluate for one epoch on validation set
        val_loss, val_metrics = evaluate(model, val_dl, loss_fn=loss_fn,
                                         metrics=metrics, params=params)

        writer.add_scalars('Loss', {
            'Training': train_loss,
            'Validation': val_loss,
        }, epoch)
        for (train_metric_name, train_metric_results), \
                (val_metric_name, val_metric_results) in zip(
                    train_metrics.items(), val_metrics.items()):
            writer.add_scalars(
                train_metric_name, {
                    'Training': train_metric_results[0],
                    'Validation': val_metric_results[0],
                }, epoch)

        # get value for first metric
        current_value = list(val_metrics.values())[0][0]
        is_best = current_value >= best_value

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_value = current_value
            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(log_dir, "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': opt.state_dict(),
                'scheduler_dict': lr_scheduler.state_dict(),
                'best_value': best_value
            },
            is_best=is_best,
            checkpoint_dir=checkpoint_dir,
            filename=ckpt_filename)

        logging.info("\ntrain loss: %.3f, val loss: %.3f" %
                     (train_loss, val_loss))
        for (train_metric_name, train_metric_results), \
                (val_metric_name, val_metric_results) in zip(
                    train_metrics.items(), val_metrics.items()):
            logging.info("train %s: %.3f, val %s: %.3f" %
                         (train_metric_name, train_metric_results[0],
                          val_metric_name, val_metric_results[0]))
        logging.info("-" * 20)
def train_and_evaluate(self, restore_from=None):
    """Train the model and evaluate every epoch.

    Args:
        restore_from: (string) directory or file containing weights to restore the graph
    """
    hp = self.hp
    experiment_dir = hp.experiment_dir

    tf.logging.info("Starting training for {} epoch(s)".format(hp.num_epochs))

    split = DatasetSplit.TRAIN
    train_model_spec = self._get_model_spec(split)
    split = DatasetSplit.EVAL
    eval_model_spec = self._get_model_spec(split)

    # Initialize tf.Saver instances to save weights during training
    last_saver = tf.train.Saver()  # will keep last 5 epochs
    best_saver = tf.train.Saver(
        max_to_keep=1)  # only keep 1 best checkpoint (best on eval)
    begin_at_epoch = 0

    with tf.Session() as sess:
        # Initialize model variables
        sess.run(train_model_spec['variable_init_op'])

        # Reload weights from directory if specified
        if restore_from is not None:
            tf.logging.info("Restoring parameters from {}".format(restore_from))
            if os.path.isdir(restore_from):
                restore_from = tf.train.latest_checkpoint(restore_from)
                begin_at_epoch = int(restore_from.split('-')[-1])
            last_saver.restore(sess, restore_from)

        # For tensorboard (takes care of writing summaries to files)
        train_writer = tf.summary.FileWriter(
            os.path.join(experiment_dir, 'train_summaries'), sess.graph)
        eval_writer = tf.summary.FileWriter(
            os.path.join(experiment_dir, 'eval_summaries'), sess.graph)

        tf.gfile.MakeDirs(os.path.join(experiment_dir, 'last_weights'))
        tf.gfile.MakeDirs(os.path.join(experiment_dir, 'best_weights'))

        best_eval_p_acc = 0.0
        for epoch in range(begin_at_epoch, begin_at_epoch + hp.num_epochs):
            # Run one epoch
            # Compute number of batches in one epoch (one full pass over the training set)
            total_train_steps = (hp.train_size + hp.batch_size - 1) // hp.batch_size
            total_train_steps_list = list(range(total_train_steps))
            length = int(np.ceil(total_train_steps / hp.eval_every))
            split_train_steps = np.array_split(total_train_steps_list, length)
            split_train_steps = [len(l) for l in split_train_steps]

            total_eval_steps = (hp.dev_size + hp.batch_size - 1) // hp.batch_size
            total_eval_steps_list = list(range(total_eval_steps))
            split_eval_steps = np.array_split(total_eval_steps_list, length)
            split_eval_steps = [len(l) for l in split_eval_steps]

            for i, (t_steps, e_steps) in enumerate(
                    zip(split_train_steps, split_eval_steps)):
                tf.logging.info(
                    "Epoch {} - {}/{} with {} train steps and {} eval steps"
                    .format(epoch + 1, i + 1, len(split_train_steps),
                            t_steps, e_steps))
                reset = False
                if i == 0:
                    reset = True

                train_metrics = self.train_epoch(sess, train_model_spec,
                                                 t_steps, train_writer, reset)
                train_loss_string, train_acc_string = self.metrics_string(
                    train_metrics)
                tf.logging.info("- Train metrics: " + train_acc_string)
                tf.logging.info("- Train metrics: " + train_loss_string)

                # Save weights
                last_save_path = os.path.join(
                    experiment_dir, 'last_weights',
                    'after-epoch-{}'.format(epoch + 1))
                last_saver.save(sess, last_save_path, global_step=i + 1)

                # Evaluate for one sub epoch on validation set
                eval_metrics = self.evaluate_epoch(sess, eval_model_spec,
                                                   e_steps, eval_writer, reset)
                test_loss_string, test_acc_string = self.metrics_string(
                    eval_metrics)
                tf.logging.info("- Eval metrics: " + test_acc_string)
                tf.logging.info("- Eval metrics: " + test_loss_string)

                # If best_eval, best_save_path
                eval_p_acc = eval_metrics['policy_accuracy']
                if eval_p_acc >= best_eval_p_acc:
                    # Store new best accuracy
                    best_eval_p_acc = eval_p_acc

                    # Save weights
                    best_save_path = os.path.join(
                        experiment_dir, 'best_weights',
                        'after-epoch-{}'.format(epoch + 1))
                    best_save_path = best_saver.save(sess, best_save_path,
                                                     global_step=i + 1)
                    tf.logging.info(
                        "- Found new best policy accuracy, saving in {}"
                        .format(best_save_path))

                    # Save best train metrics in a json file in the model directory
                    best_json_path = os.path.join(
                        experiment_dir, "metrics_train_best_weights.json")
                    utils.save_dict_to_json(train_metrics, best_json_path)

                    # Save best eval metrics in a json file in the model directory
                    best_json_path = os.path.join(
                        experiment_dir, "metrics_eval_best_weights.json")
                    utils.save_dict_to_json(eval_metrics, best_json_path)

                # Save latest train metrics in a json file in the model directory
                last_json_path = os.path.join(
                    experiment_dir, "metrics_train_last_weights.json")
                utils.save_dict_to_json(train_metrics, last_json_path)

                # Save latest eval metrics in a json file in the model directory
                last_json_path = os.path.join(
                    experiment_dir, "metrics_eval_last_weights.json")
                utils.save_dict_to_json(eval_metrics, last_json_path)
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, restore_file=None):
    """Train the model and evaluate every epoch.

    Args:
        model: (torch.nn.Module) the neural network
        train_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches training data
        val_dataloader: (DataLoader) a torch.utils.data.DataLoader object that fetches validation data
        optimizer: (torch.optim) optimizer for parameters of model
        loss_fn: a function that takes batch_output and batch_labels and computes the loss for the batch
        metrics: (dict) a dictionary of functions that compute a metric using the output and labels of each batch
        params: (Params) hyperparameters
        model_dir: (string) directory containing config, weights and log
        restore_file: (string) optional - name of file to restore from (without its extension .pth.tar)
    """
    # reload weights from restore_file if specified
    # if restore_file is not None:
    #     restore_path = os.path.join('experiments', params.exp_name, args.restore_file + '.pth.tar')
    #     logging.info("Restoring parameters from {}".format(restore_path))
    #     utils.load_checkpoint(restore_path, model, optimizer)

    best_val_acc = 0.0

    for epoch in range(params.num_epochs):
        # Run one epoch
        logging.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train(model, optimizer, loss_fn, train_dataloader, metrics, params)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params)

        val_acc = val_metrics['accuracy']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict()
            },
            is_best=is_best,
            checkpoint=model_dir)

        # If best_eval, best_save_path
        if is_best:
            logging.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(model_dir,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(model_dir,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)
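# Usage sketch (hypothetical): how the train_and_evaluate above is typically wired into a
# train.py entry point. Module names (net, data_loader), fetch_dataloader, and the args
# object (assumed to come from argparse, as in the evaluation script at the top) are
# illustrative assumptions, not part of this fragment.
import os
import torch
import torch.optim as optim

params = utils.Params(os.path.join(args.model_dir, 'params.json'))
params.cuda = torch.cuda.is_available()
torch.manual_seed(230)
utils.set_logger(os.path.join(args.model_dir, 'train.log'))

# Build dataloaders, model, and optimizer, then run the training/evaluation loop.
dataloaders = data_loader.fetch_dataloader(['train', 'val'], args.data_dir, params)
model = net.Net(params).cuda() if params.cuda else net.Net(params)
optimizer = optim.Adam(model.parameters(), lr=params.learning_rate)

train_and_evaluate(model, dataloaders['train'], dataloaders['val'], optimizer,
                   net.loss_fn, net.metrics, params, args.model_dir,
                   restore_file=args.restore_file)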
def train_and_evaluate(model, train_dataloader, val_dataloader, optimizer,
                       loss_fn, metrics, params, model_dir, logger,
                       restore_file=None, add_noise=False, noise_sigma=0.1):
    best_val_acc = 0.0

    # reload weights from restore_file if specified
    if restore_file is not None:
        logging.info("Restoring parameters from {}".format(restore_file))
        checkpoint = utils.load_checkpoint(restore_file, model, optimizer)
        params.start_epoch = checkpoint['epoch']
        best_val_acc = checkpoint['best_val_acc']
        print('best_val_acc=', best_val_acc, flush=True)
        print(optimizer.state_dict()['param_groups'][0]['lr'],
              checkpoint['epoch'], flush=True)

    # learning rate schedulers for different models:
    if params.lr_decay_type is None:
        logging.info("no lr decay")
    else:
        assert params.lr_decay_type in ['multistep', 'exp', 'plateau']
        logging.info("lr decay:{}".format(params.lr_decay_type))
        if params.lr_decay_type == 'multistep':
            scheduler = MultiStepLR(optimizer,
                                    milestones=params.lr_step,
                                    gamma=params.scheduler_gamma,
                                    last_epoch=params.start_epoch - 1)
        elif params.lr_decay_type == 'exp':
            scheduler = ExponentialLR(optimizer,
                                      gamma=params.scheduler_gamma2,
                                      last_epoch=params.start_epoch - 1)
        elif params.lr_decay_type == 'plateau':
            scheduler = ReduceLROnPlateau(optimizer,
                                          mode='min',
                                          factor=params.scheduler_gamma3,
                                          patience=params.patience,
                                          verbose=False,
                                          threshold=0.0001,
                                          threshold_mode='rel',
                                          cooldown=0,
                                          min_lr=0,
                                          eps=1e-08)

    for epoch in range(params.start_epoch, params.num_epochs):
        params.current_epoch = epoch
        # step every scheduler except plateau (which steps on the validation loss below)
        if params.lr_decay_type is not None and params.lr_decay_type != 'plateau':
            scheduler.step()

        # Run one epoch
        logger.info("Epoch {}/{}".format(epoch + 1, params.num_epochs))

        # compute number of batches in one epoch (one full pass over the training set)
        train_metrics = train(model, optimizer, loss_fn, train_dataloader,
                              metrics, params, logger,
                              add_noise=add_noise, noise_sigma=noise_sigma)

        # Evaluate for one epoch on validation set
        val_metrics = evaluate(model, loss_fn, val_dataloader, metrics, params,
                               logger, add_noise=add_noise,
                               noise_sigma=noise_sigma)

        # vis logger
        accs = [
            100. * (1 - train_metrics['accuracytop1']),
            100. * (1 - train_metrics['accuracytop5']),
            100. * (1 - val_metrics['accuracytop1']),
            100. * (1 - val_metrics['accuracytop5']),
        ]
        losses = [train_metrics['loss'], val_metrics['loss']]

        if params.lr_decay_type == 'plateau':
            scheduler.step(val_metrics['ls_all'])

        val_acc = val_metrics['accuracytop1']
        is_best = val_acc >= best_val_acc

        # Save weights
        utils.save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optim_dict': optimizer.state_dict(),
                'best_val_acc': best_val_acc
            },
            epoch=epoch + 1,
            is_best=is_best,
            save_best_ever_n_epoch=params.save_best_ever_n_epoch,
            checkpointpath=params.experiment_path + '/checkpoint',
            start_epoch=params.start_epoch)
        val_metrics['best_epoch'] = epoch + 1

        # If best_eval, best_save_path, metric
        if is_best:
            logger.info("- Found new best accuracy")
            best_val_acc = val_acc

            # Save best val metrics in a json file in the model directory
            best_json_path = os.path.join(params.experiment_path,
                                          "metrics_val_best_weights.json")
            utils.save_dict_to_json(val_metrics, best_json_path)

        # Save latest val metrics in a json file in the model directory
        last_json_path = os.path.join(params.experiment_path,
                                      "metrics_val_last_weights.json")
        utils.save_dict_to_json(val_metrics, last_json_path)