Example #1
0
            image_feat_batch: batch['image_feat_batch']})

    # visualization
    # When enabled, hand this batch, the last fetched value and the running
    # counters to the model's visualization routine.
    # NOTE(review): bbox_incorrect is passed here but is not updated anywhere
    # in the visible lines -- presumably maintained elsewhere; confirm.
    if run_vis:
        model.vis_batch_loc(
            data_reader, batch, fetch_list_val[-1], num_questions,
            bbox_correct, bbox_incorrect, vis_dir)

    # compute accuracy
    # The first two fetched values are the localization scores and the
    # bounding-box offsets.
    loc_scores_val, bbox_offset_val = fetch_list_val[0:2]
    # Convert the arg-max score index (along axis 1) plus the offsets into
    # predicted boxes, using the loader's strides and feature-map size.
    bbox_pred = boxes.batch_feat_grid2bbox(
        np.argmax(loc_scores_val, axis=1), bbox_offset_val,
        data_reader.batch_loader.stride_H, data_reader.batch_loader.stride_W,
        data_reader.batch_loader.feat_H, data_reader.batch_loader.feat_W)
    bbox_gt = batch['bbox_batch']
    # A prediction counts as correct when its IoU with the ground truth is at
    # least iou_th; both counters accumulate across batches.
    bbox_correct += np.sum(boxes.batch_bbox_iou(bbox_pred, bbox_gt) >= iou_th)
    num_questions += len(bbox_gt)
    # Accuracy over everything seen so far, not just this batch.
    # NOTE(review): under Python 2 without `from __future__ import division`
    # this could be an integer division -- confirm the file's imports.
    accuracy = bbox_correct / num_questions
    # Report progress every 20 batches.
    if n_batch % 20 == 0:
        print('exp: %s, iter = %d, accumulated P1@%.2f on %s = %f (%d / %d)' %
              (cfg.EXP_NAME, cfg.TEST.ITER, iou_th, cfg.TEST.SPLIT_LOC,
               accuracy, bbox_correct, num_questions))

# Report the final accumulated accuracy on stdout and store the same line in
# a per-split results file under result_dir.
results_path = os.path.join(
    result_dir, 'loc_results_%s.txt' % cfg.TEST.SPLIT_LOC)
with open(results_path, 'w') as f:
    final_msg = (
        'exp: %s, iter = %d, final P1@%.2f on %s = %f (%d / %d)' %
        (cfg.EXP_NAME, cfg.TEST.ITER, iou_th, cfg.TEST.SPLIT_LOC, accuracy,
         bbox_correct, num_questions))
    # The console copy is preceded by a blank line; the file copy is not.
    print('\n' + final_msg)
    print(final_msg, file=f)
Example #2
0
    def forward(self, batch, run_vqa, run_ref):
        """Run the network on one batch and gather losses and metrics.

        Args:
            batch: mapping of NumPy arrays. Always read: 'image_feat_batch',
                'input_seq_batch', 'seq_length_batch', 'image_valid_batch'.
                Read when run_vqa: 'answer_label_batch'. Read when run_ref:
                'bbox_ind_batch', 'bbox_offset_batch', 'bbox_batch'.
            run_vqa: if true, run the answer classifier and add its loss.
            run_ref: if true, run the grounding / box-regression head and add
                its losses (asserts cfg.FEAT_TYPE == 'spatial').

        Returns:
            dict with "batch_size" and "loss" (a tensor), plus
            "predictions" / "num_correct" / "accuracy" when run_vqa, and
            "bbox_predictions" / "bbox_ious" / "bbox_num_correct" /
            "bbox_accuracy" when run_ref.

        All inputs are moved to the GPU with ``.cuda()``.
        """
        def _to_cuda(array, dtype):
            # NumPy array -> CUDA tensor of the requested dtype.
            return torch.from_numpy(array.astype(dtype)).cuda()

        n_examples = len(batch['image_feat_batch'])
        q_tokens = _to_cuda(batch['input_seq_batch'], np.int64)
        q_lengths = _to_cuda(batch['seq_length_batch'], np.int64)
        image_feats = _to_cuda(batch['image_feat_batch'], np.float32)
        # Sum of the validity mask along axis 1, passed on as entity_num.
        valid_counts = np.sum(
            batch['image_valid_batch'].astype(np.int64), axis=1)
        entity_counts = torch.from_numpy(valid_counts).cuda()
        if run_vqa:
            answer_labels = _to_cuda(batch['answer_label_batch'], np.int64)
        if run_ref:
            gt_bbox_inds = _to_cuda(batch['bbox_ind_batch'], np.int64)
            gt_bbox_offsets = _to_cuda(
                batch['bbox_offset_batch'], np.float32)

        # LSTM question encoder
        word_contexts, q_vectors = self.encoder(q_tokens, q_lengths)

        # LCGN
        x_out = self.lcgn(
            images=image_feats,
            q_encoding=q_vectors,
            lstm_outputs=word_contexts,
            batch_size=n_examples,
            q_length=q_lengths,
            entity_num=entity_counts)

        # The total loss starts at zero on the same device as the features;
        # each enabled head adds its contribution in place.
        loss = torch.tensor(0., device=x_out.device)
        res = {}

        # Single-Hop attention + answer classifier
        if run_vqa:
            x_att = self.single_hop(x_out, q_vectors, entity_counts)
            logits = self.classifier(x_att, q_vectors)
            predictions, num_correct = self.add_pred_op(logits, answer_labels)
            loss += self.add_answer_loss_op(logits, answer_labels)
            res["predictions"] = predictions
            res["num_correct"] = int(num_correct)
            res["accuracy"] = float(num_correct * 1. / n_examples)

        # Grounding + bounding-box regression
        if run_ref:
            assert cfg.FEAT_TYPE == 'spatial'
            ref_scores = self.grounder(x_out, q_vectors, entity_counts)
            bbox_offset, bbox_offset_fcn = self.bbox_regression(
                x_out, ref_scores)
            # Box decoding and IoU are done in NumPy on detached CPU copies.
            best_cells = np.argmax(
                ref_scores.detach().cpu().numpy(), axis=1)
            offsets_np = bbox_offset.detach().cpu().numpy()
            bbox_predictions = batch_feat_grid2bbox(
                best_cells, offsets_np,
                cfg.IMG_H / cfg.H_FEAT, cfg.IMG_W / cfg.W_FEAT,
                cfg.H_FEAT, cfg.W_FEAT)
            bbox_ind_loss, bbox_offset_loss = self.add_bbox_loss_op(
                ref_scores, bbox_offset_fcn, gt_bbox_inds, gt_bbox_offsets)
            loss += (bbox_ind_loss + bbox_offset_loss)
            bbox_ious = batch_bbox_iou(bbox_predictions, batch['bbox_batch'])
            bbox_num_correct = np.sum(bbox_ious >= cfg.BBOX_IOU_THRESH)
            res["bbox_predictions"] = bbox_predictions
            res["bbox_ious"] = bbox_ious
            res["bbox_num_correct"] = int(bbox_num_correct)
            res["bbox_accuracy"] = float(bbox_num_correct * 1. / n_examples)

        res["batch_size"] = int(n_examples)
        res["loss"] = loss
        return res
Example #3
0
    def run_batch(self, sess, batch, train, run_vqa, run_ref, vis=False,
                  lr=None):
        """Execute one session step on a batch and summarize the outputs.

        Args:
            sess: session object; its ``run(fetches, feed_dict=...)`` is
                called exactly once.
            batch: batch mapping forwarded to ``self.create_feed_dict``;
                'bbox_batch' is also read here when run_ref is true.
            train: if true, the train op and gradient norm are fetched.
            run_vqa: if true, fetch predictions and the correct count.
            run_ref: if true, fetch grounding scores and box offsets and
                score the decoded boxes (asserts cfg.FEAT_TYPE == 'spatial').
            vis: if true, fetch the visualization outputs.
            lr: learning rate; must not be None when train is true.

        Returns:
            dict with "batch_size", "loss", "grad_norm" (-1. when not
            training) and "vis", plus the VQA and/or bbox metric entries for
            the enabled tasks.
        """
        assert (not train) or (lr is not None), 'lr must be set for training'

        # Every disabled output is replaced by the no-op so the fetch list
        # always has the same nine positions.
        fetches = [
            self.batchSizeAll,
            self.trainOp if train else self.noOp,
            self.lossAll,
            self.predsAll if run_vqa else self.noOp,
            self.correctNumAll if run_vqa else self.noOp,
            self.refScoresAll if run_ref else self.noOp,
            self.bboxOffsetAll if run_ref else self.noOp,
            self.gradNorm if train else self.noOp,
            self.visAll if vis else self.noOp,
        ]
        feed_dict = self.create_feed_dict(
            batch, train, lr, feed_vqa=run_vqa, feed_ref=run_ref)

        (n_examples, _, loss_val, predictions, num_correct, ref_scores,
         bbox_offset, grad_norm_val, vis_val) = sess.run(
             fetches, feed_dict=feed_dict)

        res = {}
        res["batch_size"] = int(n_examples)
        res["loss"] = float(loss_val)
        if train:
            res["grad_norm"] = float(grad_norm_val)
        else:
            res["grad_norm"] = -1.
        res["vis"] = vis_val

        if run_vqa:
            res["predictions"] = predictions
            res["num_correct"] = int(num_correct)
            res["accuracy"] = float(num_correct * 1. / n_examples)

        if run_ref:
            assert cfg.FEAT_TYPE == 'spatial'
            # Decode the arg-max score index (along axis 1) plus the offsets
            # into boxes, then score them against the ground truth.
            best_cells = np.argmax(ref_scores, axis=1)
            bbox_predictions = batch_feat_grid2bbox(
                best_cells, bbox_offset,
                cfg.IMG_H / cfg.H_FEAT, cfg.IMG_W / cfg.W_FEAT,
                cfg.H_FEAT, cfg.W_FEAT)
            bbox_ious = batch_bbox_iou(bbox_predictions, batch['bbox_batch'])
            bbox_num_correct = np.sum(bbox_ious >= cfg.BBOX_IOU_THRESH)
            res["bbox_predictions"] = bbox_predictions
            res["bbox_ious"] = bbox_ious
            res["bbox_num_correct"] = int(bbox_num_correct)
            res["bbox_accuracy"] = float(bbox_num_correct * 1. / n_examples)

        return res
Example #4
0
    # compute accuracy
    # VQA: fraction of examples in this batch whose arg-max score index
    # (along axis 1) equals the answer label.
    vqa_labels = batch_vqa['answer_label_batch']
    vqa_predictions = np.argmax(vqa_scores_val, axis=1)
    vqa_accuracy = np.mean(vqa_predictions == vqa_labels)
    # Running average: avg <- decay * avg + (1 - decay) * current, written
    # as an in-place increment.
    vqa_avg_accuracy += (1 - accuracy_decay) * (vqa_accuracy -
                                                vqa_avg_accuracy)

    # Localization: decode the arg-max score index plus the offsets into
    # boxes using the localization loader's strides and feature-map size.
    bbox_pred = boxes.batch_feat_grid2bbox(
        np.argmax(loc_scores_val, axis=1), bbox_offset_val,
        data_reader_loc.batch_loader.stride_H,
        data_reader_loc.batch_loader.stride_W,
        data_reader_loc.batch_loader.feat_H,
        data_reader_loc.batch_loader.feat_W)
    bbox_gt = batch_loc['bbox_batch']
    # Fraction of predicted boxes whose IoU with the ground truth is at
    # least iou_th, followed by the same running-average update as above.
    loc_accuracy = np.mean(boxes.batch_bbox_iou(bbox_pred, bbox_gt) >= iou_th)
    loc_avg_accuracy += (1 - accuracy_decay) * (loc_accuracy -
                                                loc_avg_accuracy)

    # Add to TensorBoard summary
    if (n_iter + 1) % cfg.TRAIN.LOG_INTERVAL == 0:
        loss_layout_val = (loss_layout_val1 + loss_layout_val2) / 2.
        print(
            "exp: %s, iter = %d\n\t" % (cfg.EXP_NAME, n_iter + 1) +
            "loss (vqa) = %f, loss (bbox_ind) = %f, "
            "loss (bbox_offset) = %f, loss (layout) = %f, loss (rec) = %f, "
            "loss (sharpen) = %f, sharpen_scale = %f\n\t" %
            (loss_vqa_val, loss_bbox_ind_val, loss_bbox_offset_val,
             loss_layout_val, loss_rec_val, loss_sharpen_val, sharpen_scale) +
            "accuracy (vqa, cur) = %f, accuracy (vqa, avg) = %f\n\t" %
            (vqa_accuracy, vqa_avg_accuracy) +
Example #5
0
def vis_batch_loc(model, data_reader, batch, vis_outputs, start_idx,
                  start_idx_correct, start_idx_incorrect, vis_dir):
    """Save localization visualizations for selected examples of a batch.

    When cfg.TEST.VIS_SEPARATE_CORRECTNESS is set, examples are split by
    whether the predicted box reaches cfg.TEST.BBOX_IOU_THRESH against
    batch['bbox_batch'], and at most
    ``cfg.TEST.NUM_VIS_CORRECT - start_idx_correct`` correct and
    ``cfg.TEST.NUM_VIS_INCORRECT - start_idx_incorrect`` incorrect ones are
    drawn. Otherwise the first ``cfg.TEST.NUM_VIS - start_idx`` examples
    (capped at the batch length) are drawn.

    Args:
        model: object exposing ``nmn.module_names``.
        data_reader: object exposing ``batch_loader`` (strides, feature-map
            size and ``vocab_dict``).
        batch: batch mapping ('image_path_list', 'input_seq_batch',
            'seq_length_batch', 'bbox_ind_batch', 'bbox_offset_batch',
            'bbox_batch').
        vis_outputs: mapping with 'loc_scores', 'bbox_offset', 'txt_att',
            'att_stack', 'stack_ptr' and 'module_prob'.
        start_idx: running index used in file names (non-separated mode).
        start_idx_correct: running index for 'correct_' file names.
        start_idx_incorrect: running index for 'incorrect_' file names.
        vis_dir: directory the output file names are joined to.

    The running indices are only advanced locally; nothing is returned.
    """
    module_names = model.nmn.module_names
    iou_th = cfg.TEST.BBOX_IOU_THRESH
    separate = cfg.TEST.VIS_SEPARATE_CORRECTNESS

    def _grid_geometry():
        # Strides and feature-map size, in the positional order taken by
        # the grid -> bbox helpers.
        loader = data_reader.batch_loader
        return (loader.stride_H, loader.stride_W,
                loader.feat_H, loader.feat_W)

    if separate:
        num_correct = max(cfg.TEST.NUM_VIS_CORRECT - start_idx_correct, 0)
        num_incorrect = max(
            cfg.TEST.NUM_VIS_INCORRECT - start_idx_incorrect, 0)

        batch_pred = boxes.batch_feat_grid2bbox(
            np.argmax(vis_outputs['loc_scores'], axis=1),
            vis_outputs['bbox_offset'], *_grid_geometry())
        is_correct = boxes.batch_bbox_iou(
            batch_pred, batch['bbox_batch']) >= iou_th
        correct_inds = np.where(is_correct)[0][:num_correct]
        incorrect_inds = np.where(~is_correct)[0][:num_incorrect]
        inds = list(correct_inds) + list(incorrect_inds)
    else:
        num = min(len(batch['image_path_list']), cfg.TEST.NUM_VIS - start_idx)
        inds = range(num)

    for n in inds:
        img_path = batch['image_path_list'][n]
        # File-name stem: base name up to its first dot.
        stem = os.path.basename(img_path).split('.')[0]
        if not separate:
            save_name = '%08d_%s.png' % (start_idx, stem)
            start_idx += 1
        elif is_correct[n]:
            save_name = 'correct_%08d_%s.png' % (start_idx_correct, stem)
            start_idx_correct += 1
        else:
            save_name = 'incorrect_%08d_%s.png' % (start_idx_incorrect, stem)
            start_idx_incorrect += 1
        save_path = os.path.join(vis_dir, save_name)

        # Column n of the sequence batch, truncated to its stated length.
        seq_len = batch['seq_length_batch'][n]
        words = [
            data_reader.batch_loader.vocab_dict.idx2word(word_idx)
            for word_idx in batch['input_seq_batch'][:seq_len, n]
        ]

        loc_scores = vis_outputs['loc_scores'][n]
        bbox_offset = vis_outputs['bbox_offset'][n]
        bbox_pred = boxes.feat_grid2bbox(
            np.argmax(loc_scores), bbox_offset, *_grid_geometry())
        # The drawn ground truth is rebuilt from the grid index and offset
        # rather than read from batch['bbox_batch'][n].
        bbox_gt = boxes.feat_grid2bbox(
            batch['bbox_ind_batch'][n], batch['bbox_offset_batch'][n],
            *_grid_geometry())

        txt_att = vis_outputs['txt_att'][n]
        att_stack = vis_outputs['att_stack'][n]
        stack_ptr = vis_outputs['stack_ptr'][n]
        module_prob = vis_outputs['module_prob'][n]

        if not cfg.TEST.STEPWISE_VIS:
            vis_one_loc(img_path, words, loc_scores, bbox_pred, bbox_gt,
                        module_names, txt_att, att_stack, stack_ptr,
                        module_prob, save_path)
            continue

        vis_one_stepwise(
            img_path, words, module_names, txt_att, att_stack, stack_ptr,
            module_prob, save_path,
            vis_type='loc', loc_scores=loc_scores,
            bbox_pred=bbox_pred, bbox_gt=bbox_gt)