def __init__(self, config):
    """Initialize the evaluator from *config*.

    Reads the dense-annotations JSON from
    ``config.dense_annotations_jsonpath`` and discovers the model
    prediction types under ``config.model_preds_root``.
    The subset type is fixed to ``"_complement"`` in this variant.
    """
    super().__init__()
    # We are calculating NDCG directly based on ranks
    self.ndcg = NDCG(is_direct_ranks=True)
    # self.path_val_data = config.path_val_data
    self.dense_annotations_jsonpath = config.dense_annotations_jsonpath
    self.model_preds_root = config.model_preds_root
    self.models_list = self.get_model_type_list(self.model_preds_root)
    # Fix: use a context manager so the annotations file handle is
    # closed deterministically (original `json.load(open(...))` leaked it).
    with open(self.dense_annotations_jsonpath) as annotations_file:
        self.annotations_json = json.load(annotations_file)
    # Hard-coded image ids used for history-information analysis
    # (presumably COCO-style ids — TODO confirm against the dataset).
    self.hist_info_images = [
        257366, 425477, 191097, 552399, 12468, 458949, 109735, 311793,
        437200, 355853, 98849, 57743, 83289, 488471, 446567, 196905,
        308846, 328336, 289233, 52156, 366462, 511748, 457675, 518811,
        413085, 432039, 531270, 430580, 293582, 544148, 80366, 179366,
        150236, 400960, 10424, 451398, 498340, 268914, 384171, 172461,
        387266, 214227, 555578, 181772, 149373, 251385, 407878, 574545,
        544827, 120559, 19299, 73638, 496822, 204195, 97073, 209447,
        53433, 403234, 524006, 178300, 376460, 570468, 292100, 227006,
        170315, 456824, 525726, 179064, 98879, 558975, 193521, 377823,
        449230, 44468, 573552, 288308, 237956, 69538, 250654, 439842,
        146314, 458818, 122826, 33976, 322815, 239030, 209271, 560666,
        361734, 225491, 27366, 29060, 191186, 394073, 120870, 580183,
        111013
    ]
    self.subset_type = "_complement"
def eval_pred(pred, answer_index, round_id, gt_relevance):
    """
    Evaluate the predict results and report metrices. Only for val split.

    Parameters:
    -----------
    pred: ndarray of shape (n_samples, n_rounds, n_options).
    answer_index: ndarray of shape (n_sample, n_rounds).
    round_id: ndarray of shape (n_samples, ).
    gt_relevance: ndarray of shape (n_samples, n_options).

    Returns:
    --------
    None
    """
    # Convert everything to torch tensors so visdialch.metrics can consume it.
    scores = torch.Tensor(pred)
    ans_idx = torch.Tensor(answer_index).long()
    rounds = torch.Tensor(round_id).long()
    relevance = torch.Tensor(gt_relevance)

    sparse_metrics = SparseGTMetrics()
    ndcg = NDCG()

    # Sparse metrics use all rounds; NDCG only the dense-annotated round.
    sparse_metrics.observe(scores, ans_idx)
    dense_round_scores = scores[torch.arange(scores.size(0)), rounds - 1, :]
    ndcg.observe(dense_round_scores, relevance)

    # Merge both metric dictionaries and report.
    all_metrics = {
        **sparse_metrics.retrieve(reset=True),
        **ndcg.retrieve(reset=True),
    }
    for name, value in all_metrics.items():
        print(f"{name}: {value}")
def __init__(self, config):
    """Initialize the evaluator from *config*.

    Reads the dense-annotations JSON from
    ``config.dense_annotations_jsonpath``, discovers the model
    prediction types under ``config.model_preds_root``, and takes the
    subset type from ``config.subset_type``.
    """
    super().__init__()
    # We are calculating NDCG directly based on ranks
    self.ndcg = NDCG(is_direct_ranks=True)
    # self.path_val_data = config.path_val_data
    self.dense_annotations_jsonpath = config.dense_annotations_jsonpath
    self.model_preds_root = config.model_preds_root
    self.models_list = self.get_model_type_list(self.model_preds_root)
    # Fix: use a context manager so the annotations file handle is
    # closed deterministically (original `json.load(open(...))` leaked it).
    with open(self.dense_annotations_jsonpath) as annotations_file:
        self.annotations_json = json.load(annotations_file)
    self.subset_type = config.subset_type
def __init__(self, path_val_data, dense_annotations_jsonpath,
             path_images_root, model_preds_root):
    """Set up data paths, load val data, and index images and models."""
    self.path_val_data = path_val_data
    self.dense_annotations_jsonpath = dense_annotations_jsonpath
    # Ideally return the q and a here
    self.read_data()

    # Image and model-prediction roots, plus derived lookup structures.
    self.path_images_root = path_images_root
    self.model_preds_root = model_preds_root
    self.img_folder_list = self.get_img_folder_list(path_images_root)
    self.img_map = self.get_img_map(self.img_folder_list)
    self.models_list = self.get_model_type_list(model_preds_root)

    # Accumulators for ground-truth indices (0-indexed) and relevance.
    self.gt_indices_list, self.gt_relevance_list = [], []
    # NDCG computed directly from ranks rather than raw scores.
    self.ndcg = NDCG(is_direct_ranks=True)
# Share the word-embedding table between encoder and decoder, then
# initialize it from pre-trained GloVe vectors.
decoder.word_embed = encoder.word_embed
glove = np.load('data/glove.npy')
encoder.word_embed.weight.data = torch.tensor(glove)

# Wrap encoder and decoder in a model.
model = EncoderDecoderModel(encoder, decoder).to(device)
if -1 not in args.gpu_ids:
    # Multi-GPU requested: wrap in DataParallel over the given device ids.
    model = nn.DataParallel(model, args.gpu_ids)

# =============================================================================
#   SETUP BEFORE TRAINING LOOP
# =============================================================================
start_time = datetime.datetime.strftime(datetime.datetime.utcnow(),
                                        '%d-%b-%Y-%H:%M:%S')
sparse_metrics = SparseGTMetrics()
ndcg = NDCG()

# loading checkpoint
start_epoch = 0
model_state_dict, _ = load_checkpoint(args.load_pthpath)
if isinstance(model, nn.DataParallel):
    # DataParallel wraps the real model in `.module`.
    model.module.load_state_dict(model_state_dict)
else:
    model.load_state_dict(model_state_dict)
print("Loaded model from {}".format(args.load_pthpath))


def get_1round_batch_data(batch, rnd):
    # Extract a single-round view of `batch`; moves selected tensors to
    # `device`. NOTE(review): this definition appears truncated in the
    # visible source — the body below is only its beginning.
    temp_train_batch = {}
    for key in batch:
        if key in ['img_feat']:
            temp_train_batch[key] = batch[key].to(device)
def train(config, args, dataloader_dic, device, finetune: bool = False,
          load_pthpath: str = "", finetune_regression: bool = False,
          dense_scratch_train: bool = False,
          dense_annotation_type: str = "default"):
    """
    Run the training (or dense-annotation finetuning) loop.

    :param config:
    :param args:
    :param dataloader_dic:
    :param device:
    :param finetune:
    :param load_pthpath:
    :param finetune_regression:
    :param dense_scratch_train: when we want to start training only on
        2000 annotations
    :param dense_annotation_type: default
    :return:
    """
    # =========================================================================
    #   SETUP BEFORE TRAINING LOOP
    # =========================================================================
    train_dataset = dataloader_dic["train_dataset"]
    train_dataloader = dataloader_dic["train_dataloader"]
    val_dataloader = dataloader_dic["val_dataloader"]
    val_dataset = dataloader_dic["val_dataset"]

    model = get_model(config, args, train_dataset, device)

    # Finetuning (except dense-from-scratch) requires a pre-trained model.
    if finetune and not dense_scratch_train:
        assert load_pthpath != "", "Please provide a path" \
                                   " for pre-trained model before " \
                                   "starting fine tuning"
        print(f"\n Begin Finetuning:")

    optimizer, scheduler, iterations, lr_scheduler_type = get_solver(
        config, args, train_dataset, val_dataset, model, finetune=finetune)

    start_time = datetime.datetime.strftime(datetime.datetime.utcnow(),
                                            '%d-%b-%Y-%H:%M:%S')
    # Default save dir gets a unique encoder+decoder/timestamp suffix.
    if args.save_dirpath == 'checkpoints/':
        args.save_dirpath += '%s+%s/%s' % (config["model"]["encoder"],
                                           config["model"]["decoder"],
                                           start_time)
    summary_writer = SummaryWriter(log_dir=args.save_dirpath)
    checkpoint_manager = CheckpointManager(model, optimizer,
                                           args.save_dirpath, config=config)
    sparse_metrics = SparseGTMetrics()
    ndcg = NDCG()

    best_val_loss = np.inf  # SA: initially loss can be any number
    best_val_ndcg = 0.0

    # If loading from checkpoint, adjust start epoch and load parameters.
    # SA: 1. if finetuning -> load from saved model
    #     2. train -> default load_pthpath = ""
    #     3. else load pthpath
    if (not finetune and load_pthpath == "") or dense_scratch_train:
        start_epoch = 1
    else:
        # "path/to/checkpoint_xx.pth" -> xx
        ### To cater model finetuning from models with "best_ndcg" checkpoint
        try:
            start_epoch = int(load_pthpath.split("_")[-1][:-4]) + 1
        except:
            # Checkpoint name carries no epoch number (e.g. "best_ndcg.pth").
            start_epoch = 1
        model_state_dict, optimizer_state_dict = load_checkpoint(load_pthpath)
        # SA: updating last epoch
        checkpoint_manager.update_last_epoch(start_epoch)
        if isinstance(model, nn.DataParallel):
            model.module.load_state_dict(model_state_dict)
        else:
            model.load_state_dict(model_state_dict)
        # SA: for finetuning optimizer should start from its learning rate
        if not finetune:
            optimizer.load_state_dict(optimizer_state_dict)
        else:
            print("Optimizer not loaded. Different optimizer for finetuning.")
        print("Loaded model from {}".format(load_pthpath))

    # =========================================================================
    #   TRAINING LOOP
    # =========================================================================
    # Forever increasing counter to keep track of iterations (for tensorboard log).
    global_iteration_step = (start_epoch - 1) * iterations
    running_loss = 0.0  # New
    train_begin = datetime.datetime.utcnow()  # New

    # Select loss criterion and end epoch for finetune vs. normal training.
    if finetune:
        end_epoch = start_epoch + config["solver"]["num_epochs_curriculum"] - 1
        if finetune_regression:
            # criterion = nn.MSELoss(reduction='mean')
            # criterion = nn.KLDivLoss(reduction='mean')
            criterion = nn.MultiLabelSoftMarginLoss()
    else:
        end_epoch = config["solver"]["num_epochs"]
        # SA: normal training
        criterion = get_loss_criterion(config, train_dataset)

    # SA: end_epoch + 1 => for loop also doing last epoch
    for epoch in range(start_epoch, end_epoch + 1):
        # ---------------------------------------------------------------------
        # ON EPOCH START (combine dataloaders if training on train + val)
        # ---------------------------------------------------------------------
        if config["solver"]["training_splits"] == "trainval":
            combined_dataloader = itertools.chain(train_dataloader,
                                                  val_dataloader)
        else:
            combined_dataloader = itertools.chain(train_dataloader)

        print(f"\nTraining for epoch {epoch}:")
        for i, batch in enumerate(tqdm(combined_dataloader)):
            for key in batch:
                batch[key] = batch[key].to(device)
            optimizer.zero_grad()
            output = model(batch)
            if finetune:
                target = batch["gt_relevance"]
                # Same as for ndcg validation, only one round is present
                output = output[torch.arange(output.size(0)),
                                batch["round_id"] - 1, :]
                # SA: todo regression loss
                if finetune_regression:
                    batch_loss = mse_loss(output, target, criterion)
                else:
                    batch_loss = compute_ndcg_type_loss(output, target)
            else:
                batch_loss = get_batch_criterion_loss_value(config, batch,
                                                            criterion, output)
            batch_loss.backward()
            optimizer.step()

            # -----------------------------------------------------------------
            # update running loss and decay learning rates
            # -----------------------------------------------------------------
            # Exponential moving average of the loss (0.95 decay).
            if running_loss > 0.0:
                running_loss = 0.95 * running_loss + 0.05 * batch_loss.item()
            else:
                running_loss = batch_loss.item()

            # SA: lambda_lr was configured to reduce lr after milestone epochs
            if lr_scheduler_type == "lambda_lr":
                scheduler.step(global_iteration_step)
            global_iteration_step += 1

            if global_iteration_step % 100 == 0:
                # print current time, running average, learning rate, iteration, epoch
                print("[{}][Epoch: {:3d}][Iter: {:6d}][Loss: {:6f}][lr: {:8f}]".
                      format(datetime.datetime.utcnow() - train_begin, epoch,
                             global_iteration_step, running_loss,
                             optimizer.param_groups[0]['lr']))
                # tensorboardX
                summary_writer.add_scalar("train/loss", batch_loss,
                                          global_iteration_step)
                summary_writer.add_scalar("train/lr",
                                          optimizer.param_groups[0]["lr"],
                                          global_iteration_step)
        torch.cuda.empty_cache()

        # ---------------------------------------------------------------------
        # ON EPOCH END (checkpointing and validation)
        # ---------------------------------------------------------------------
        if not finetune:
            checkpoint_manager.step(epoch=epoch)
        else:
            # In finetune mode checkpoints are saved only after validation,
            # gated on best loss / best ndcg below.
            print("Validating before checkpointing.")

        # SA: ideally another function: too much work
        # Validate and report automatic metrics.
        if args.validate:
            # Switch dropout, batchnorm etc to the correct mode.
            model.eval()
            val_loss = 0
            print(f"\nValidation after epoch {epoch}:")
            for i, batch in enumerate(tqdm(val_dataloader)):
                for key in batch:
                    batch[key] = batch[key].to(device)
                with torch.no_grad():
                    output = model(batch)
                if finetune:
                    target = batch["gt_relevance"]
                    # Same as for ndcg validation, only one round is present
                    out_ndcg = output[torch.arange(output.size(0)),
                                      batch["round_id"] - 1, :]
                    # SA: todo regression loss
                    if finetune_regression:
                        batch_loss = mse_loss(out_ndcg, target, criterion)
                    else:
                        batch_loss = compute_ndcg_type_loss(out_ndcg, target)
                else:
                    batch_loss = get_batch_criterion_loss_value(config, batch,
                                                                criterion,
                                                                output)
                val_loss += batch_loss.item()
                sparse_metrics.observe(output, batch["ans_ind"])
                if "gt_relevance" in batch:
                    # NDCG is only defined on the dense-annotated round.
                    output = output[torch.arange(output.size(0)),
                                    batch["round_id"] - 1, :]
                    ndcg.observe(output, batch["gt_relevance"])

            all_metrics = {}
            all_metrics.update(sparse_metrics.retrieve(reset=True))
            all_metrics.update(ndcg.retrieve(reset=True))
            for metric_name, metric_value in all_metrics.items():
                print(f"{metric_name}: {metric_value}")
            summary_writer.add_scalars("metrics", all_metrics,
                                       global_iteration_step)

            model.train()
            torch.cuda.empty_cache()

            val_loss = val_loss / len(val_dataloader)
            print(f"Validation loss for {epoch} epoch is {val_loss}")
            print(f"Validation loss for batch is {batch_loss}")
            # NOTE(review): logs the *last batch* loss, not the epoch mean —
            # presumably intentional; confirm.
            summary_writer.add_scalar("val/loss", batch_loss,
                                      global_iteration_step)

            # Save checkpoint when validation loss improves.
            if val_loss < best_val_loss:
                print(f" Best model found at {epoch} epoch! Saving now.")
                best_val_loss = val_loss
                if dense_annotation_type == "default":
                    checkpoint_manager.save_best()
            else:
                print(f" Not saving the model at {epoch} epoch!")

            # SA: Saving the best model both for loss and ndcg now
            val_ndcg = all_metrics["ndcg"]
            if val_ndcg > best_val_ndcg:
                print(f" Best ndcg model found at {epoch} epoch! Saving now.")
                best_val_ndcg = val_ndcg
                if dense_annotation_type == "default":
                    checkpoint_manager.save_best(ckpt_name="best_ndcg")
                else:
                    # SA: trying for dense annotations
                    ckpt_name = f"best_ndcg_annotation_{dense_annotation_type}"
                    checkpoint_manager.save_best(ckpt_name=ckpt_name)
            else:
                print(f" Not saving the model at {epoch} epoch!")

            # SA: "reduce_lr_on_plateau" works only with validate for now
            if lr_scheduler_type == "reduce_lr_on_plateau":
                # scheduler.step(val_loss)
                # SA: Loss should decrease while ndcg should increase!
                # can also change the mode in LR reduce on plateau to max
                scheduler.step(-1 * val_ndcg)