image_feat_batch: batch['image_feat_batch']}) # visualization if run_vis: model.vis_batch_loc( data_reader, batch, fetch_list_val[-1], num_questions, bbox_correct, bbox_incorrect, vis_dir) # compute accuracy loc_scores_val, bbox_offset_val = fetch_list_val[0:2] bbox_pred = boxes.batch_feat_grid2bbox( np.argmax(loc_scores_val, axis=1), bbox_offset_val, data_reader.batch_loader.stride_H, data_reader.batch_loader.stride_W, data_reader.batch_loader.feat_H, data_reader.batch_loader.feat_W) bbox_gt = batch['bbox_batch'] bbox_correct += np.sum(boxes.batch_bbox_iou(bbox_pred, bbox_gt) >= iou_th) num_questions += len(bbox_gt) accuracy = bbox_correct / num_questions if n_batch % 20 == 0: print('exp: %s, iter = %d, accumulated P1@%.2f on %s = %f (%d / %d)' % (cfg.EXP_NAME, cfg.TEST.ITER, iou_th, cfg.TEST.SPLIT_LOC, accuracy, bbox_correct, num_questions)) with open(os.path.join( result_dir, 'loc_results_%s.txt' % cfg.TEST.SPLIT_LOC), 'w') as f: print('\nexp: %s, iter = %d, final P1@%.2f on %s = %f (%d / %d)' % (cfg.EXP_NAME, cfg.TEST.ITER, iou_th, cfg.TEST.SPLIT_LOC, accuracy, bbox_correct, num_questions)) print('exp: %s, iter = %d, final P1@%.2f on %s = %f (%d / %d)' % (cfg.EXP_NAME, cfg.TEST.ITER, iou_th, cfg.TEST.SPLIT_LOC, accuracy, bbox_correct, num_questions), file=f)
def forward(self, batch, run_vqa, run_ref):
    """Encode one batch, run LCGN, and evaluate the requested output heads.

    Args:
        batch: mapping whose entries are converted from numpy arrays here.
            Always read: 'input_seq_batch', 'seq_length_batch',
            'image_feat_batch' and 'image_valid_batch'. Additionally read:
            'answer_label_batch' when ``run_vqa`` is set; 'bbox_ind_batch',
            'bbox_offset_batch' and 'bbox_batch' when ``run_ref`` is set.
        run_vqa: when truthy, run the single-hop attention + classifier and
            add the answer loss.
        run_ref: when truthy, run the grounder + box regression and add the
            two box losses (asserts ``cfg.FEAT_TYPE == 'spatial'``).

    Returns:
        dict that always holds "batch_size" (int) and "loss" (a tensor
        created on the LCGN output's device; the sum of the enabled
        losses, or zero when no head is enabled). With ``run_vqa`` it also
        holds "predictions", "num_correct" and "accuracy"; with
        ``run_ref`` it also holds "bbox_predictions", "bbox_ious",
        "bbox_num_correct" and "bbox_accuracy".
    """

    def _to_gpu(array, dtype):
        # numpy -> torch with an explicit dtype cast, then moved via .cuda()
        return torch.from_numpy(array.astype(dtype)).cuda()

    num_examples = len(batch['image_feat_batch'])
    token_ids = _to_gpu(batch['input_seq_batch'], np.int64)
    token_lengths = _to_gpu(batch['seq_length_batch'], np.int64)
    image_feats = _to_gpu(batch['image_feat_batch'], np.float32)

    # 'image_valid_batch' is summed over axis 1 and handed to the sub-modules
    # as the per-example entity count.
    valid_flags = batch['image_valid_batch'].astype(np.int64)
    entity_counts = torch.from_numpy(np.sum(valid_flags, axis=1)).cuda()

    # Labels are converted before any sub-module runs, so a missing key
    # fails early.
    if run_vqa:
        answer_ids = _to_gpu(batch['answer_label_batch'], np.int64)
    if run_ref:
        gt_bbox_inds = _to_gpu(batch['bbox_ind_batch'], np.int64)
        gt_bbox_offsets = _to_gpu(batch['bbox_offset_batch'], np.float32)

    # LSTM
    word_contexts, question_vecs = self.encoder(token_ids, token_lengths)

    # LCGN
    node_feats = self.lcgn(
        images=image_feats,
        q_encoding=question_vecs,
        lstm_outputs=word_contexts,
        batch_size=num_examples,
        q_length=token_lengths,
        entity_num=entity_counts)

    # Losses of the enabled heads are accumulated in place into this tensor.
    total_loss = torch.tensor(0., device=node_feats.device)
    outputs = {}

    if run_vqa:
        # Single-Hop
        attended = self.single_hop(node_feats, question_vecs, entity_counts)
        logits = self.classifier(attended, question_vecs)
        predictions, num_correct = self.add_pred_op(logits, answer_ids)
        total_loss += self.add_answer_loss_op(logits, answer_ids)
        outputs["predictions"] = predictions
        outputs["num_correct"] = int(num_correct)
        outputs["accuracy"] = float(num_correct * 1. / num_examples)

    if run_ref:
        assert cfg.FEAT_TYPE == 'spatial'
        ref_scores = self.grounder(node_feats, question_vecs, entity_counts)
        bbox_offset, bbox_offset_fcn = self.bbox_regression(
            node_feats, ref_scores)

        # Box decoding happens in numpy on detached CPU copies.
        best_cells = np.argmax(ref_scores.detach().cpu().numpy(), axis=1)
        bbox_predictions = batch_feat_grid2bbox(
            best_cells,
            bbox_offset.detach().cpu().numpy(),
            cfg.IMG_H / cfg.H_FEAT,
            cfg.IMG_W / cfg.W_FEAT,
            cfg.H_FEAT,
            cfg.W_FEAT)

        bbox_ind_loss, bbox_offset_loss = self.add_bbox_loss_op(
            ref_scores, bbox_offset_fcn, gt_bbox_inds, gt_bbox_offsets)
        total_loss += (bbox_ind_loss + bbox_offset_loss)

        bbox_ious = batch_bbox_iou(bbox_predictions, batch['bbox_batch'])
        bbox_num_correct = np.sum(bbox_ious >= cfg.BBOX_IOU_THRESH)
        outputs["bbox_predictions"] = bbox_predictions
        outputs["bbox_ious"] = bbox_ious
        outputs["bbox_num_correct"] = int(bbox_num_correct)
        outputs["bbox_accuracy"] = float(
            bbox_num_correct * 1. / num_examples)

    outputs["batch_size"] = int(num_examples)
    outputs["loss"] = total_loss
    return outputs
def run_batch(self, sess, batch, train, run_vqa, run_ref, vis=False,
              lr=None):
    """Run one ``sess.run`` step on ``batch`` and package what was fetched.

    Outputs that are switched off are replaced by ``self.noOp`` in the fetch
    list, so the list always has the same nine slots in the same order.

    Args:
        sess: object whose ``run(fetches, feed_dict=...)`` is called once
            (presumably a TensorFlow session -- confirm against the caller).
        batch: passed to ``self.create_feed_dict``; 'bbox_batch' is also
            read from it when ``run_ref`` is set.
        train: when truthy, fetch ``self.trainOp`` and ``self.gradNorm``.
        run_vqa: when truthy, fetch predictions and the correct count.
        run_ref: when truthy, fetch ref scores and box offsets, then decode
            boxes (asserts ``cfg.FEAT_TYPE == 'spatial'``).
        vis: when truthy, fetch ``self.visAll``.
        lr: forwarded to ``self.create_feed_dict``; required when training.

    Returns:
        dict with "batch_size", "loss", "grad_norm" (-1. when not training)
        and "vis"; plus "predictions", "num_correct", "accuracy" with
        ``run_vqa``; plus "bbox_predictions", "bbox_ious",
        "bbox_num_correct", "bbox_accuracy" with ``run_ref``.

    Raises:
        AssertionError: if ``train`` is set but ``lr`` is None.
    """
    assert not (train and lr is None), 'lr must be set for training'

    # Attributes are looked up only for the outputs actually requested.
    batch_size_fetch = self.batchSizeAll
    if train:
        train_fetch = self.trainOp
        grad_norm_fetch = self.gradNorm
    else:
        train_fetch = grad_norm_fetch = self.noOp
    if vis:
        vis_fetch = self.visAll
    else:
        vis_fetch = self.noOp

    # vqa outputs
    if run_vqa:
        preds_fetch = self.predsAll
        correct_fetch = self.correctNumAll
    else:
        preds_fetch = correct_fetch = self.noOp

    # ref outputs
    if run_ref:
        ref_scores_fetch = self.refScoresAll
        bbox_offset_fetch = self.bboxOffsetAll
    else:
        ref_scores_fetch = bbox_offset_fetch = self.noOp

    feed_dict = self.create_feed_dict(
        batch, train, lr, feed_vqa=run_vqa, feed_ref=run_ref)
    fetches = [
        batch_size_fetch,
        train_fetch,
        self.lossAll,
        preds_fetch,
        correct_fetch,
        ref_scores_fetch,
        bbox_offset_fetch,
        grad_norm_fetch,
        vis_fetch,
    ]
    (batch_size, _, loss, predictions, num_correct, ref_scores,
     bbox_offset, grad_norm, vis_result) = sess.run(
        fetches, feed_dict=feed_dict)

    result = {
        "batch_size": int(batch_size),
        "loss": float(loss),
        "grad_norm": -1. if not train else float(grad_norm),
        "vis": vis_result,
    }

    if run_vqa:
        result["predictions"] = predictions
        result["num_correct"] = int(num_correct)
        result["accuracy"] = float(num_correct * 1. / batch_size)

    if run_ref:
        assert cfg.FEAT_TYPE == 'spatial'
        best_cells = np.argmax(ref_scores, axis=1)
        bbox_predictions = batch_feat_grid2bbox(
            best_cells,
            bbox_offset,
            cfg.IMG_H / cfg.H_FEAT,
            cfg.IMG_W / cfg.W_FEAT,
            cfg.H_FEAT,
            cfg.W_FEAT)
        bbox_ious = batch_bbox_iou(bbox_predictions, batch['bbox_batch'])
        bbox_num_correct = np.sum(bbox_ious >= cfg.BBOX_IOU_THRESH)
        result["bbox_predictions"] = bbox_predictions
        result["bbox_ious"] = bbox_ious
        result["bbox_num_correct"] = int(bbox_num_correct)
        result["bbox_accuracy"] = float(
            bbox_num_correct * 1. / batch_size)

    return result
# compute accuracy vqa_labels = batch_vqa['answer_label_batch'] vqa_predictions = np.argmax(vqa_scores_val, axis=1) vqa_accuracy = np.mean(vqa_predictions == vqa_labels) vqa_avg_accuracy += (1 - accuracy_decay) * (vqa_accuracy - vqa_avg_accuracy) bbox_pred = boxes.batch_feat_grid2bbox( np.argmax(loc_scores_val, axis=1), bbox_offset_val, data_reader_loc.batch_loader.stride_H, data_reader_loc.batch_loader.stride_W, data_reader_loc.batch_loader.feat_H, data_reader_loc.batch_loader.feat_W) bbox_gt = batch_loc['bbox_batch'] loc_accuracy = np.mean(boxes.batch_bbox_iou(bbox_pred, bbox_gt) >= iou_th) loc_avg_accuracy += (1 - accuracy_decay) * (loc_accuracy - loc_avg_accuracy) # Add to TensorBoard summary if (n_iter + 1) % cfg.TRAIN.LOG_INTERVAL == 0: loss_layout_val = (loss_layout_val1 + loss_layout_val2) / 2. print( "exp: %s, iter = %d\n\t" % (cfg.EXP_NAME, n_iter + 1) + "loss (vqa) = %f, loss (bbox_ind) = %f, " "loss (bbox_offset) = %f, loss (layout) = %f, loss (rec) = %f, " "loss (sharpen) = %f, sharpen_scale = %f\n\t" % (loss_vqa_val, loss_bbox_ind_val, loss_bbox_offset_val, loss_layout_val, loss_rec_val, loss_sharpen_val, sharpen_scale) + "accuracy (vqa, cur) = %f, accuracy (vqa, avg) = %f\n\t" % (vqa_accuracy, vqa_avg_accuracy) +
def vis_batch_loc(model, data_reader, batch, vis_outputs, start_idx,
                  start_idx_correct, start_idx_incorrect, vis_dir):
    """Render localization visualizations for selected examples of a batch.

    When ``cfg.TEST.VIS_SEPARATE_CORRECTNESS`` is set, examples are split by
    whether ``boxes.batch_bbox_iou`` of the predicted vs. given box reaches
    ``cfg.TEST.BBOX_IOU_THRESH``; at most
    ``cfg.TEST.NUM_VIS_CORRECT - start_idx_correct`` correct and
    ``cfg.TEST.NUM_VIS_INCORRECT - start_idx_incorrect`` incorrect examples
    are drawn (correct ones first). Otherwise the first
    ``cfg.TEST.NUM_VIS - start_idx`` examples of the batch are drawn.

    Args:
        model: object exposing ``nmn.module_names``.
        data_reader: object exposing ``batch_loader`` with ``stride_H``,
            ``stride_W``, ``feat_H``, ``feat_W`` and ``vocab_dict``.
        batch: mapping; reads 'image_path_list', 'input_seq_batch',
            'seq_length_batch', 'bbox_ind_batch', 'bbox_offset_batch' and,
            in separate-correctness mode, 'bbox_batch'.
        vis_outputs: mapping; reads 'loc_scores', 'bbox_offset', 'txt_att',
            'att_stack', 'stack_ptr' and 'module_prob'.
        start_idx: running index used in file names (plain mode).
        start_idx_correct: running index for 'correct_' file names.
        start_idx_incorrect: running index for 'incorrect_' file names.
        vis_dir: directory joined with each file name to form the path
            handed to the drawing function.

    Returns:
        None. The running indices are advanced on local copies only.
    """
    module_names = model.nmn.module_names
    iou_th = cfg.TEST.BBOX_IOU_THRESH

    def grid_geometry():
        # Looked up lazily: the loader is only touched when a box is decoded.
        loader = data_reader.batch_loader
        return (loader.stride_H, loader.stride_W,
                loader.feat_H, loader.feat_W)

    separate = cfg.TEST.VIS_SEPARATE_CORRECTNESS
    if separate:
        quota_correct = max(
            cfg.TEST.NUM_VIS_CORRECT - start_idx_correct, 0)
        quota_incorrect = max(
            cfg.TEST.NUM_VIS_INCORRECT - start_idx_incorrect, 0)
        batch_pred = boxes.batch_feat_grid2bbox(
            np.argmax(vis_outputs['loc_scores'], axis=1),
            vis_outputs['bbox_offset'],
            *grid_geometry())
        is_correct = boxes.batch_bbox_iou(
            batch_pred, batch['bbox_batch']) >= iou_th
        hits = np.where(is_correct)[0][:quota_correct]
        misses = np.where(~is_correct)[0][:quota_incorrect]
        inds = list(hits) + list(misses)
    else:
        num_images = len(batch['image_path_list'])
        inds = range(min(num_images, cfg.TEST.NUM_VIS - start_idx))

    for n in inds:
        img_path = batch['image_path_list'][n]
        # File stem is everything before the first '.' in the base name.
        stem = os.path.basename(img_path).split('.')[0]
        if not separate:
            save_name = '%08d_%s.png' % (start_idx, stem)
            start_idx += 1
        elif is_correct[n]:
            save_name = 'correct_%08d_%s.png' % (start_idx_correct, stem)
            start_idx_correct += 1
        else:
            save_name = 'incorrect_%08d_%s.png' % (start_idx_incorrect, stem)
            start_idx_incorrect += 1
        save_path = os.path.join(vis_dir, save_name)

        # Column n of the token matrix, cut to this example's length.
        seq_len = batch['seq_length_batch'][n]
        words = [
            data_reader.batch_loader.vocab_dict.idx2word(token)
            for token in batch['input_seq_batch'][:seq_len, n]
        ]

        loc_scores = vis_outputs['loc_scores'][n]
        pred_box = boxes.feat_grid2bbox(
            np.argmax(loc_scores),
            vis_outputs['bbox_offset'][n],
            *grid_geometry())
        # The ground-truth box is rebuilt from its grid index and offset
        # rather than read from batch['bbox_batch'][n].
        gt_box = boxes.feat_grid2bbox(
            batch['bbox_ind_batch'][n],
            batch['bbox_offset_batch'][n],
            *grid_geometry())

        txt_att = vis_outputs['txt_att'][n]
        att_stack = vis_outputs['att_stack'][n]
        stack_ptr = vis_outputs['stack_ptr'][n]
        module_prob = vis_outputs['module_prob'][n]

        if cfg.TEST.STEPWISE_VIS:
            vis_one_stepwise(
                img_path, words, module_names, txt_att, att_stack,
                stack_ptr, module_prob, save_path, vis_type='loc',
                loc_scores=loc_scores, bbox_pred=pred_box, bbox_gt=gt_box)
        else:
            vis_one_loc(
                img_path, words, loc_scores, pred_box, gt_box,
                module_names, txt_att, att_stack, stack_ptr, module_prob,
                save_path)