Example #1
0
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor
    im_h, im_w = im.shape[:2]
    # make the short side equal to min_size while keeping the long side no larger than max_size
    scale = min(max(min_size/im_h, min_size/im_w), max_size/im_h, max_size/im_w)

    # resize and process the image
    new_h, new_w = int(scale*im_h), int(scale*im_w)
    im_resized = skimage.img_as_float(skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized*255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # annotate regions
    regions = iminfo['regions']
    if len(regions) == 0:
        raise IOError('no region annotations for image ' + im_path)
    region_bboxes = np.array([ann[0] for ann in regions], np.float32)
    # save coco_bboxes, needed for evaluation code
    coco_bboxes = region_bboxes.copy()
    # back to [x, y, w, h]
    coco_bboxes[:, 2:4] = coco_bboxes[:, 2:4] - coco_bboxes[:, 0:2] + 1
    region_bboxes *= scale
    region_bboxes = im_processing.rectify_bboxes(region_bboxes, height=new_h, width=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(region_bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = region_bboxes
    spatial_batch = spatial_feature_from_bbox(region_bboxes, im_h=new_h, im_w=new_w)

    # a region may have zero, one or more sentence annotations
    # align language sequences with regions
    text_seq_batch = []
    label_batch = []
    coco_ann_ids = []  # needed for evaluation code
    questions = []  # needed for evaluation code
    for n in range(len(regions)):
        for n_s in range(len(regions[n][1])):
            s = regions[n][1][n_s]
            text_seq_batch.append(text_processing.preprocess_sentence(s, vocab_dict, T))
            label_batch.append(n)
            coco_ann_ids.append(regions[n][2])
            questions.append(s)

    text_seq_batch = np.array(text_seq_batch, dtype=np.int32).T

    label_batch = np.array(label_batch, dtype=np.int32)

    batch = dict(text_seq_batch=text_seq_batch, im_batch=im_batch,
                 bbox_batch=bbox_batch, spatial_batch=spatial_batch,
                 label_batch=label_batch, coco_ann_ids=coco_ann_ids,
                 questions=questions, coco_bboxes=coco_bboxes)

    return batch
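
Every example on this page calls text_processing.preprocess_sentence(sentence, vocab_dict, T), but the helper itself is not shown. The sketch below is only an assumption of what it plausibly does, inferred from how it is called (it must return a length-T sequence of int32 vocabulary indices); the name preprocess_sentence_sketch and the '<unk>' fallback token are illustrative, not the library's actual API.

import re
import numpy as np

SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')  # same tokenizer as in Example #12

def preprocess_sentence_sketch(sentence, vocab_dict, T, unk='<unk>'):
    # lowercase and split the sentence into word tokens
    words = [w for w in SENTENCE_SPLIT_REGEX.split(sentence.strip().lower())
             if len(w.strip()) > 0]
    # map words to vocabulary indices, falling back to an assumed <unk> entry
    ids = [vocab_dict.get(w, vocab_dict.get(unk, 0)) for w in words][:T]
    # left-pad with zeros so the result always has exactly T entries
    seq = np.zeros(T, dtype=np.int32)
    seq[T - len(ids):] = ids
    return seq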
Example #2
0
def build_coco_batches(dataset, setname, T, input_H, input_W):
    im_dir = '/data/ryli/datasets/coco/images'
    im_type = 'train2014'
    vocab_file = './data/vocabulary_Gref.txt'

    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    if dataset == 'Gref':
        refer = REFER('./external/refer/data',
                      dataset='refcocog',
                      splitBy='google')
    elif dataset == 'unc':
        refer = REFER('./external/refer/data',
                      dataset='refcoco',
                      splitBy='unc')
    elif dataset == 'unc+':
        refer = REFER('./external/refer/data',
                      dataset='refcoco+',
                      splitBy='unc')
    else:
        raise ValueError('Unknown dataset %s' % dataset)
    refs = [
        refer.Refs[ref_id] for ref_id in refer.Refs
        if refer.Refs[ref_id]['split'] == setname
    ]
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    n_batch = 0
    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))
        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent']
            text = text_processing.preprocess_sentence(sent, vocab_dict, T)

            np.savez(file=data_folder + data_prefix + '_' + str(n_batch) +
                     '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[sent])
            n_batch += 1
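
Example #2 above writes one .npz file per (image, sentence) pair. Below is a minimal, assumed sketch of how such a file could be read back later; load_saved_batch is a hypothetical helper, not part of the original code, but the keys match the np.savez call above.

import numpy as np

def load_saved_batch(data_folder, data_prefix, n_batch):
    npz = np.load(data_folder + data_prefix + '_' + str(n_batch) + '.npz')
    # keys correspond to the np.savez arguments used when the batch was built
    return (npz['text_batch'], npz['im_batch'],
            npz['mask_batch'], npz['sent_batch'])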
Example #3
0
def vectorizeLearntEmbd(args):
    if args.checkpoint == '':
        # Network
        if args.savefile == "det":
            vocab_size = 8803
            embedding_dim = 1000
            vocab_file = './exp-referit/data/vocabulary_referit.txt'
            vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
            pretrained_model = './exp-referit/tfmodel/referit_fc8_det_iter_25000.tfmodel'
        else:
            vocab_size = len(vocab)
            embedding_dim = len(embd[0])
            vocab_dict = dict()
            for i in range(len(vocab)): vocab_dict[vocab[i]] = i
            pretrained_model = './coco/tfmodel/cls_coco_glove_20000.tfmodel'

        # Inputs
        text_seq_batch = tf.placeholder(tf.int32, [T, N])
        embedem = embedding_layer(text_seq_batch, vocab_size, embedding_dim)  

        # Load pretrained model
        snapshot_restorer = tf.train.Saver(None)
        sess = tf.Session()
        snapshot_restorer.restore(sess, pretrained_model)     

        # Initialize arrays
        vectors = list()
        text_seq_val = np.zeros((T, N), dtype=np.int32)

        # Generate vector embeddings
        count = 0
        for word in words: 
            count += 1
            if count % 100 == 0: print("%d out of %d words processed" % (count, len(words)))

            # Preprocess word
            text_seq = text_processing.preprocess_sentence(word, vocab_dict, T)
            text_seq_val[:, 0] = text_seq

            # Extract LSTM language feature
            embedded_seq = sess.run(embedem, feed_dict={text_seq_batch:text_seq_val})
            temp = np.squeeze(np.transpose(embedded_seq))
            vectors.append(temp)

            if count == vector_count: break

        # Save vectors for easy recovery
        backup = args.savefile + "_TSNE_backup.npz"
        np.savez(os.path.join(plot_dir, backup), words=words, vectors=vectors)

    else:
        # Load saved vectors
        npzfile = np.load(os.path.join(plot_dir, args.checkpoint))
        vectors = npzfile['vectors']

    return vectors
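
vectorizeLearntEmbd returns one embedding vector per word and saves them to a '*_TSNE_backup.npz' file, which suggests a t-SNE plot downstream. A sketch of projecting the vectors to 2-D, assuming scikit-learn is available; project_vectors is a hypothetical name and not part of the original script.

import numpy as np
from sklearn.manifold import TSNE

def project_vectors(vectors, perplexity=30.0, seed=0):
    X = np.asarray(vectors, dtype=np.float32)
    # reduce each embedding vector to a 2-D coordinate for plotting
    return TSNE(n_components=2, perplexity=perplexity,
                random_state=seed).fit_transform(X)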
Example #4
0
def build_referit_batches(setname, T, input_H, input_W):
    # data directory
    im_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/images/'
    mask_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/mask/'
    query_file = './data/referit/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    fp = open('./referit/trainval_list.txt', 'w')

    # load annotations
    query_dict = json.load(open(query_file))
    im_list = query_dict.keys()
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for n_im, name in enumerate(im_list):
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    num_batch = len(samples)
    for n_batch in range(num_batch):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent = samples[n_batch]
        fp.write('%d\t%s%s\n' % (n_batch, im_dir, im_name))
        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)

        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
    fp.close()
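
im_processing.resize_and_pad appears in most examples on this page. The sketch below captures the behaviour its callers assume (fit the image inside input_H x input_W while preserving the aspect ratio, then zero-pad to the target size); the real helper may differ, e.g. in interpolation or padding position, and resize_and_pad_sketch is only an illustrative stand-in.

import numpy as np
import skimage.transform

def resize_and_pad_sketch(im, input_h, input_w):
    h, w = im.shape[:2]
    scale = min(input_h / h, input_w / w)
    new_h, new_w = int(round(h * scale)), int(round(w * scale))
    resized = skimage.transform.resize(im, (new_h, new_w))
    # pad the bottom/right with zeros up to the requested size
    out = np.zeros((input_h, input_w) + im.shape[2:], dtype=resized.dtype)
    out[:new_h, :new_w, ...] = resized
    return out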
Example #5
0
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor
    im_h, im_w = im.shape[:2]
    # make the short side equal to min_size while keeping the long side no larger than max_size
    scale = min(max(min_size / im_h, min_size / im_w), max_size / im_h,
                max_size / im_w)

    # resize and process the image
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(
        skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Build batches from all QA pairs (each question comes with num_choices candidate boxes)
    qa_pairs = iminfo['processed_qa_pairs']
    num_questions = len(qa_pairs)
    num_choices = 4
    text_seq_batch = np.zeros((T, num_questions * num_choices), dtype=np.int32)
    label_batch = np.zeros(num_questions, dtype=np.int32)
    bboxes = np.zeros((num_questions * num_choices, 4), np.float32)
    for n_q in range(num_questions):
        this_bboxes, question, label = qa_pairs[n_q]
        bboxes[n_q * num_choices:(n_q + 1) * num_choices, :] = this_bboxes
        text_seq_batch[:, n_q*num_choices:(n_q+1)*num_choices] = \
            np.array(text_processing.preprocess_sentence(question, vocab_dict, T)).reshape((T, 1))
        label_batch[n_q] = label

    # annotate regions
    bboxes *= scale
    bboxes = im_processing.rectify_bboxes(bboxes, height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(bboxes, im_h=new_h, im_w=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = bboxes

    batch = dict(im_batch=im_batch,
                 bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch,
                 text_seq_batch=text_seq_batch,
                 label_batch=label_batch)

    return batch
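
spatial_feature_from_bbox is called with [x1, y1, x2, y2] boxes and the resized image size. One plausible 8-d spatial encoding (normalized corners, centre and relative size) is sketched below; this is an assumption for illustration, not the library's actual definition.

import numpy as np

def spatial_feature_sketch(bboxes, im_h, im_w):
    x1, y1, x2, y2 = [bboxes[:, i] for i in range(4)]
    # corners and centre scaled to [-1, 1], width/height relative to the image
    feats = np.stack([x1 / im_w * 2 - 1, y1 / im_h * 2 - 1,
                      x2 / im_w * 2 - 1, y2 / im_h * 2 - 1,
                      (x1 + x2) / im_w - 1, (y1 + y2) / im_h - 1,
                      (x2 - x1) / im_w, (y2 - y1) / im_h], axis=1)
    return feats.astype(np.float32)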
Example #6
0
def load_one_batch(n_iter):
    global imcrop_val
    global spatial_val
    global text_seq_val
    global label_val

    print('data reader: epoch = %d, batch = %d / %d' %
          (n_iter // num_images, n_iter % num_images, num_images))

    # Read one batch
    # Get images
    image = image_list[n_iter % num_images]
    for h in range(height):
        for w in range(width):
            crop = image[h * 10:(h + 1) * 10, w * 10:(w + 1) * 10, :]
            imcrop_val[h * width + w] = skimage.transform.resize(
                crop, [IM_H, IM_W])
            bboxes[h * width + w] = [w, h, w + 1, h + 1]  # [x1, y1, x2, y2]
    imcrop_val *= 255
    imcrop_val -= vgg_net.channel_mean

    # Get spatial batch
    spatial_val = spatial_feat.spatial_feature_from_bbox(bboxes,
                                                         im_h=height,
                                                         im_w=width)

    # Get text sequence
    expr_obj = query_list[n_iter % num_images]
    text_seq_val[:, 0] = text_processing.preprocess_sentence(
        expr_obj, vocab_dict, T)

    # Get labels
    matched_pairs = matched_pairs_list[n_iter % num_images]
    (h1, w1), (h2, w2) = matched_pairs[0]  # just take the first matched_pair
    if strong_supervision:
        label_val[...] = (h1 * width + w1) * N_bbox + (h2 * width + w2)
    else:
        label_val[...] = (h1 * width + w1)

    batch = dict(imcrop_batch=imcrop_val,
                 spatial_batch=spatial_val,
                 text_seq_batch=text_seq_val,
                 label_batch=label_val)
    return batch
Example #7
0
def preprocess_data(im, mask, sent, obj_id):
    anchors = io.read_anchors(anchor_file)
    mask_color = object_color[obj_id]
    mask_obj = np.asarray(((mask == mask_color)[:, :, 0]))
    im = skimage.img_as_ubyte(
        im_processing.resize_and_pad(im, input_H, input_W))
    mask = im_processing.resize_and_pad(mask_obj, input_H, input_W)
    bbox = im_processing.bboxes_from_masks(np.asarray(mask))
    bbox[:, 2:4] += bbox[:, :2]
    label_bbox, true_bbox = processing_tools.preprocess_true_boxes(
        bbox, input_H, anchors)
    text = text_processing.preprocess_sentence(sent, vocab_dict, T)
    return {
        'text_batch': np.asarray(text),
        'im_batch': np.asarray(im),
        'mask_batch': (mask > 0),
        'sent_batch': [sent],
        'label_bbox': label_bbox,
        'true_bbox': true_bbox
    }
Example #8
0
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]

    # Extract visual features from all proposals
    im = skimage.io.imread(image_dir + imname)
    processed_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
    if processed_im.ndim == 2:
        processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

    imcrop_val[...] = processed_im.astype(np.float32) - segmodel.vgg_net.channel_mean
    for imcrop_name, _, description in flat_query_dict[imname]:
        mask = load_gt_mask(mask_dir + imcrop_name + '.mat').astype(np.float32)
        labels = (mask > 0)
        processed_labels = im_processing.resize_and_pad(mask, input_H, input_W) > 0

        text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, T)
        scores_val = sess.run(scores, feed_dict={
                text_seq_batch  : text_seq_val,
                imcrop_batch    : imcrop_val
            })
        scores_val = np.squeeze(scores_val)

        # Evaluate the segmentation performance of the predicted mask
        pred_raw = (scores_val >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
        I, U = eval_tools.compute_mask_IU(predicts, labels)
        cum_I += I
        cum_U += U
        this_IoU = I/U
        for n_eval_iou in range(len(eval_seg_iou_list)):
            eval_seg_iou = eval_seg_iou_list[n_eval_iou]
Example #9
0
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_path, model_name, pre_emb=False):
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_path)
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # truncate long sentences to at most 20 words
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'refvos'
    vocab_file = './data/vocabulary_refvos.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, H=H, W=W,
                                       mode='eval', 
                                       vocab_size=vocab_size, 
                                       emb_name=emb_name, 
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name, H=H, W=W,
                                       mode='eval', vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)
     
    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']
    plt.figure(figsize=[15, 4])
    sorted_video_key = ['a9f23c9150', '6cc8bce61a', '03fe6115d4', 'a46012c642', 'c42fdedcdd', 'ee9415c553', '7daa6343e6', '4fe6619a47', '0e8a6b63bb', '65e0640a2a', '8939473ea7', 'b05faf54f7', '5d2020eff8', 'a00c3fa88e', '44e5d1a969', 'deed0ab4fc', 'b205d868e6', '48d2909d9e', 'c9ef04fe59', '1e20ceafae', '0f3f8b2b2f', 'b83923fd72', 'cb06f84b6e', '17cba76927', '35d5e5149d', '62bf7630b3', '0390fabe58', 'bf2d38aefe', '8b7b57b94d', '8d803e87f7', 'c16d9a4ade', '1a1dbe153e', 'd975e5f4a9', '226f1e10f7', '6cb5b08d93', '77df215672', '466734bc5c', '94fa9bd3b5', 'f2a45acf1c', 'ba8823f2d2', '06cd94d38d', 'b772ac822a', '246e38963b', 'b5514f75d8', '188cb4e03d', '3dd327ab4e', '8e2e5af6a8', '450bd2e238', '369919ef49', 'a4bce691c6', '64c6f2ed76', '0782a6df7e', '0062f687f1', 'c74fc37224', 'f7255a57d0', '4f5b3310e3', 'e027ebc228', '30fe0ed0ce', '6a75316e99', 'a2948d4116', '8273b59141', 'abae1ce57d', '621487be65', '45dc90f558', '9787f452bf', 'cdcfd9f93a', '4f6662e4e0', '853ca85618', '13ca7bbcfd', 'f143fede6f', '92fde455eb', '0b0c90e21a', '5460cc540a', '182dbfd6ba', '85968ae408', '541ccb0844', '43115c42b2', '65350fd60a', 'eb49ce8027', 'e11254d3b9', '20a93b4c54', 'a0fc95d8fc', '696e01387c', 'fef7e84268', '72d613f21a', '8c60938d92', '975be70866', '13c3cea202', '4ee0105885', '01c88b5b60', '33e8066265', '8dea7458de', 'c280d21988', 'fd8cf868b2', '35948a7fca', 'e10236eb37', 'a1251195e7', 'b2256e265c', '2b904b76c9', '1ab5f4bbc5', '47d01d34c8', 'd7a38bf258', '1a609fa7ee', '218ac81c2d', '9f16d17e42', 'fb104c286f', 'eb263ef128', '37b4ec2e1a', '0daaddc9da', 'cd69993923', '31d3a7d2ee', '60362df585', 'd7ff44ea97', '623d24ce2b', '6031809500', '54526e3c66', '0788b4033d', '3f4bacb16a', '06a5dfb511', '9f21474aca', '7a19a80b19', '9a38b8e463', '822c31928a', 'd1ac0d8b81', 'eea1a45e49', '9f429af409', '33c8dcbe09', '9da2156a73', '3be852ed44', '3674b2c70a', '547416bda1', '4037d8305d', '29c06df0f2', '1335b16cf9', 'b7b7e52e02', 'bc9ba8917e', 'dab44991de', '9fd2d2782b', 'f054e28786', 'b00ff71889', 'eeb18f9d47', '559a611d86', 'dea0160a12', '257f7fd5b8', 'dc197289ef', 'c2bbd6d121', 'f3678388a7', '332dabe378', '63883da4f5', 'b90f8c11db', 'dce363032d', '411774e9ff', '335fc10235', '7775043b5e', '3e03f623bb', '19cde15c4b', 'bf4cc89b18', '1a894a8f98', 'f7d7fb16d0', '61fca8cbf1', 'd69812339e', 'ab9a7583f1', 'e633eec195', '0a598e18a8', 'b3b92781d9', 'cd896a9bee', 'b7928ea5c0', '69c0f7494e', 'cc1a82ac2a', '39b7491321', '352ad66724', '749f1abdf9', '7f26b553ae', '0c04834d61', 'd1dd586cfd', '3b72dc1941', '39bce09d8d', 'cbea8f6bea', 'cc7c3138ff', 'd59c093632', '68dab8f80c', '1e0257109e', '4307020e0f', '4b783f1fc5', 'ebe7138e58', '1f390d22ea', '7a72130f21', 'aceb34fcbe', '9c0b55cae5', 'b58a97176b', '152fe4902a', 'a806e58451', '9ce299a510', '97b38cabcc', 'f39c805b54', '0620b43a31', '0723d7d4fe', '7741a0fbce', '7836afc0c2', 'a7462d6aaf', '34564d26d8', '31e0beaf99']
    # sorted_video_key = ['6cc8bce61a']
    for vid_ind, vid in enumerate(sorted_video_key):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = args.visdir
#             mask_dir = os.path.join(args.maskdir, str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
#             if not os.path.exists(mask_dir):
#                 os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0
#             Process text
            text = np.array(text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break
            for fid in frame_ids:
                frame_id = int(fid)
                if (frame_id % 20 != 0):
                    continue
                vis_path = os.path.join(vis_dir, str('{}_{}_{}.png'.format(vid,eid,fid)))
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()
#                 im = frame.copy()
                im = frame
#                 mask = np.array(frame, dtype=np.float32)

                proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                proc_im_ = proc_im_[:, :, ::-1]
                proc_im_ -= mu
                scores_val, up_val, sigm_val, up_c4 = sess.run([model.pred, 
                                                                                model.up, 
                                                                                model.sigm, 
                                                                                model.up_c4, 
                                                                                ],
                                                                                feed_dict={
                                                                                    model.words: np.expand_dims(text, axis=0),
                                                                                    model.im: np.expand_dims(proc_im_, axis=0),
                                                                                    model.valid_idx: np.expand_dims(valid_idx, axis=0)
                                                                                })
                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                up_c4 = im_processing.resize_and_crop(sigmoid(np.squeeze(up_c4)), frame.shape[0], frame.shape[1])
                sigm_val = im_processing.resize_and_crop(sigmoid(np.squeeze(sigm_val)), frame.shape[0], frame.shape[1])
                up_val = np.squeeze(up_val)
                # if (not math.isnan(consitency_score) and consitency_score < 0.3):
                plt.clf()
                plt.subplot(1, 3, 1)
                plt.imshow(frame)
                plt.text(-0.7, -0.7, exp)  # consistency_score is undefined in this snippet
                plt.subplot(1, 3, 2)
                plt.imshow(up_c4)
                plt.subplot(1, 3, 3)
                plt.imshow(sigm_val)
                plt.savefig(vis_path)
#                 pred_raw = (up_val >= score_thresh).astype('uint8') * 255
#                 pred_raw = (up_val >= score_thresh).astype(np.float32)
#                 predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])
#                 if dcrf:
#                     # Dense CRF post-processing
#                     sigm_val = np.squeeze(sigm_val) + 1e-7
#                     d = densecrf.DenseCRF2D(W, H, 2)
#                     U = np.expand_dims(-np.log(sigm_val), axis=0)
#                     U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
#                     unary = np.concatenate((U_, U), axis=0)
#                     unary = unary.reshape((2, -1))
#                     d.setUnaryEnergy(unary)
#                     d.addPairwiseGaussian(sxy=3, compat=3)
#                     d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
#                     Q = d.inference(5)
#                     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype('uint8') * 255
# #                     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype(np.float32)
# #                     predicts_dcrf = im_processing.resize_and_crop(pred_raw_dcrf, mask.shape[0], mask.shape[1])
#                 if visualize:
#                     if dcrf:
#                         cv2.imwrite(vis_path, pred_raw_dcrf)
# #                         np.save(mask_path, np.array(pred_raw_dcrf))
# #                         visualize_seg(vis_path, im, exp, predicts_dcrf)
#                     else:
#                         np.save(mask_path, np.array(sigm_val))
#                         cv2.imwrite(vis_path, pred_raw)
#                         visualize_seg(vis_path, im, exp, predicts)
#                         np.save(mask_path, np.array(pred_raw))
    # I, U = eval_tools.compute_mask_IU(predicts, mask)
    # IU_result.append({'batch_no': n_iter, 'I': I, 'U': U})
    # mean_IoU += float(I) / U
    # cum_I += I
    # cum_U += U
    # msg = 'cumulative IoU = %f' % (cum_I / cum_U)
    # for n_eval_iou in range(len(eval_seg_iou_list)):
    #     eval_seg_iou = eval_seg_iou_list[n_eval_iou]
    #     seg_correct[n_eval_iou] += (I / U >= eval_seg_iou)
    # if dcrf:
    #     I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
    #     mean_dcrf_IoU += float(I_dcrf) / U_dcrf
    #     cum_I_dcrf += I_dcrf
    #     cum_U_dcrf += U_dcrf
    #     msg += '\tcumulative IoU (dcrf) = %f' % (cum_I_dcrf / cum_U_dcrf)
    #     for n_eval_iou in range(len(eval_seg_iou_list)):
    #         eval_seg_iou = eval_seg_iou_list[n_eval_iou]
    #         seg_correct_dcrf[n_eval_iou] += (I_dcrf / U_dcrf >= eval_seg_iou)
    # print(msg)
    seg_total += 1
Example #10
0
def test(iter,
         dataset,
         visualize,
         setname,
         dcrf,
         mu,
         tfmodel_folder,
         model_name,
         pre_emb=False):
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_folder,
                           dataset + '_iter_' + str(iter) + '.tfmodel')
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # truncate long sentences to at most 20 words
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'Gref'
    vocab_file = './data/vocabulary_Gref.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name,
                                       H=H,
                                       W=W,
                                       mode='eval',
                                       vocab_size=vocab_size,
                                       emb_name=emb_name,
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name,
                                       H=H,
                                       W=W,
                                       mode='eval',
                                       vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']
    for vid_ind, vid in reversed(list(enumerate(videos.keys()))):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = os.path.join(args.visdir,
                                   str('{}/{}/'.format(vid, index)))
            mask_dir = os.path.join(args.maskdir,
                                    str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            if not os.path.exists(mask_dir):
                os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0
            #             Process text
            text = np.array(
                text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break
            for fid in frame_ids:
                vis_path = os.path.join(vis_dir, str('{}.png'.format(fid)))
                mask_path = os.path.join(mask_dir, str('{}.npy'.format(fid)))
                if os.path.exists(vis_path):
                    continue
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()
                #                 im = frame.copy()
                im = frame
                #                 mask = np.array(frame, dtype=np.float32)

                proc_im = skimage.img_as_ubyte(
                    im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                # proc_im_ = proc_im_[:, :, ::-1]
                proc_im_ -= mu
                scores_val, up_val, sigm_val = sess.run(
                    [model.pred, model.up, model.sigm],
                    feed_dict={
                        model.words: np.expand_dims(text, axis=0),
                        model.im: np.expand_dims(proc_im_, axis=0),
                        model.valid_idx: np.expand_dims(valid_idx, axis=0)
                    })
                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                up_val = np.squeeze(up_val)
                pred_raw = (up_val >= score_thresh).astype('uint8') * 255
                #                 pred_raw = (up_val >= score_thresh).astype(np.float32)
                #                 predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])
                if dcrf:
                    # Dense CRF post-processing
                    sigm_val = np.squeeze(sigm_val) + 1e-7
                    d = densecrf.DenseCRF2D(W, H, 2)
                    U = np.expand_dims(-np.log(sigm_val), axis=0)
                    U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
                    unary = np.concatenate((U_, U), axis=0)
                    unary = unary.reshape((2, -1))
                    d.setUnaryEnergy(unary)
                    d.addPairwiseGaussian(sxy=3, compat=3)
                    d.addPairwiseBilateral(sxy=20,
                                           srgb=3,
                                           rgbim=proc_im,
                                           compat=10)
                    Q = d.inference(5)
                    pred_raw_dcrf = np.argmax(Q, axis=0).reshape(
                        (H, W)).astype('uint8') * 255
#                     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype(np.float32)
#                     predicts_dcrf = im_processing.resize_and_crop(pred_raw_dcrf, mask.shape[0], mask.shape[1])
                if visualize:
                    if dcrf:
                        cv2.imwrite(vis_path, pred_raw_dcrf)
#                         np.save(mask_path, np.array(pred_raw_dcrf))
#                         visualize_seg(vis_path, im, exp, predicts_dcrf)
                    else:
                        np.save(mask_path, np.array(sigm_val))


#                         cv2.imwrite(vis_path, pred_raw)
#                         visualize_seg(vis_path, im, exp, predicts)
#                         np.save(mask_path, np.array(pred_raw))
# I, U = eval_tools.compute_mask_IU(predicts, mask)
# IU_result.append({'batch_no': n_iter, 'I': I, 'U': U})
# mean_IoU += float(I) / U
# cum_I += I
# cum_U += U
# msg = 'cumulative IoU = %f' % (cum_I / cum_U)
# for n_eval_iou in range(len(eval_seg_iou_list)):
#     eval_seg_iou = eval_seg_iou_list[n_eval_iou]
#     seg_correct[n_eval_iou] += (I / U >= eval_seg_iou)
# if dcrf:
#     I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
#     mean_dcrf_IoU += float(I_dcrf) / U_dcrf
#     cum_I_dcrf += I_dcrf
#     cum_U_dcrf += U_dcrf
#     msg += '\tcumulative IoU (dcrf) = %f' % (cum_I_dcrf / cum_U_dcrf)
#     for n_eval_iou in range(len(eval_seg_iou_list)):
#         eval_seg_iou = eval_seg_iou_list[n_eval_iou]
#         seg_correct_dcrf[n_eval_iou] += (I_dcrf / U_dcrf >= eval_seg_iou)
# print(msg)
    seg_total += 1
Example #11
0
        #im_, pad_h, pad_w, scale = resize_and_pad(im, config.input_H, config.input_W)
        #processed_im = skimage.img_as_ubyte(im_)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, config.input_H, config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[0, :] = processed_im.astype(
            np.float32) - segmodel.channel_mean
        imcrop_val = imcrop_val.transpose((0, 3, 1, 2))
        imcrop_val = imcrop_val[:, ::-1, :, :]

        spatial_val = processing_tools.generate_spatial_batch(
            config.N, config.featmap_H, config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        text_seq_val[:, 0] = text_processing.preprocess_sentence(
            query, vocab_dict, config.T)
        cont_val = text_processing.create_cont(text_seq_val)

        dummy_label = np.zeros((config.N, 1, config.input_H, config.input_W),
                               dtype=np.float32)

        # Forward pass to get response map
        net.blobs['language'].data[...] = text_seq_val
        net.blobs['cont'].data[...] = cont_val
        net.blobs['image'].data[...] = imcrop_val
        net.blobs['spatial'].data[...] = spatial_val
        net.blobs['label'].data[...] = dummy_label

        net.forward()

        upscores = net.blobs['upscores'].data[...].copy()
Example #12
0
def build_a2d_batches(T, input_H, input_W, video=False):
    """
    Build data batches of A2D Sentence dataset

    Args:
         T: maximum number of words kept per sentence
         input_H: height of the input frame for the I3D backbone
         input_W: width of the input frame for the I3D backbone
         video: if True, use consecutive frames; otherwise a single standalone frame
    """

    query_file = os.path.join(a2d_dir, 'a2d_annotation.txt')
    frame_dir = os.path.join(a2d_dir, 'Release/frames')
    vocab_file = os.path.join(root_dir, 'data/vocabulary_Gref.txt')

    dataset_name = 'a2d_sent_new'
    out_dataset_dir = os.path.join(root_dir, dataset_name)
    if not os.path.exists(out_dataset_dir):
        os.mkdir(out_dataset_dir)
    test_batch = os.path.join(out_dataset_dir, 'test_batch')
    train_batch = os.path.join(out_dataset_dir, 'train_batch')
    if not os.path.exists(test_batch):
        os.mkdir(test_batch)
    if not os.path.exists(train_batch):
        os.mkdir(train_batch)

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    test_prefix_list = list()
    train_prefix_list = list()
    split_dict = gen_split_dict()
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

    with open(query_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)
        total_count = 0
        test_count = 0
        train_count = 0
        all_zero_mask_count = 0
        for row in tqdm(reader):
            # each video belongs to test or train
            video_id = row[0]
            data_prefix = video_id
            if split_dict[data_prefix] == 1:
                save_dir = test_batch
                test_prefix_list.append(data_prefix)
                test = True
            else:
                save_dir = train_batch
                train_prefix_list.append(data_prefix)
                test = False
            # load sentence
            instance_id = int(row[1])
            sent = row[2].lower()
            words = SENTENCE_SPLIT_REGEX.split(sent.strip())
            words = [w for w in words if len(w.strip()) > 0]
            # drop trailing punctuation and keep at most T words
            if words[-1] == '.':
                words = words[:-1]
            if len(words) > T:
                words = words[:T]
            n_sent = ' '.join(words)
            n_sent = n_sent.encode('utf-8').decode("utf-8")
            text = text_processing.preprocess_sentence(n_sent, vocab_dict, T)

            image_paths = list()
            # for each video, get all the gt masks of a certain instance
            masks, frame_ids = get_masks(video_id, instance_id)

            for frame_id in frame_ids:
                image_path = os.path.join(frame_dir, video_id,
                                          '{:0>5d}.png'.format(frame_id))
                image_paths.append(image_path)

            for frame_id, image_path, mask in zip(frame_ids, image_paths,
                                                  masks):
                # skip frames whose ground-truth mask is all zeros
                if np.sum(mask) == 0:
                    print("all zeros mask caught")
                    all_zero_mask_count += 1
                    continue
                if video:
                    # obtain 16 consecutive frames centered at the gt frame
                    frame_paths = frame_range(frame_id=frame_id,
                                              frame_dir=os.path.join(
                                                  frame_dir, video_id))
                else:
                    # only use the gt frame
                    frame_paths = list()
                frames = list()
                if test:
                    count = test_count
                    test_count = test_count + 1
                    prefix = 'test_'
                    image = skimage.io.imread(image_path)
                    for frame_path in frame_paths:
                        frames.append(skimage.io.imread(frame_path))
                else:
                    prefix = 'train_'
                    count = train_count
                    train_count = train_count + 1
                    image = skimage.io.imread(image_path)
                    image = skimage.img_as_ubyte(
                        im_processing.resize_and_pad(image, input_H, input_W))
                    mask = im_processing.resize_and_pad(mask, input_H, input_W)
                    for frame_path in frame_paths:
                        frame = skimage.io.imread(frame_path)
                        frame = skimage.img_as_ubyte(
                            im_processing.resize_and_pad(
                                frame, input_H, input_W))
                        frames.append(frame)

                if debug:
                    m0 = mask[:, :, np.newaxis]
                    m0 = (m0 > 0).astype(np.uint8)
                    m0 = np.concatenate([m0, m0, m0], axis=2)
                    debug_image = image * m0
                    skimage.io.imsave(
                        './debug/{}_{}_{}.png'.format(data_prefix, frame_id,
                                                      sent.replace(' ', '_')),
                        debug_image)

                # save batches
                np.savez(file=os.path.join(
                    save_dir, dataset_name + '_' + prefix + str(count)),
                         text_batch=text,
                         mask_batch=(mask > 0),
                         sent_batch=[sent],
                         im_batch=image,
                         frame_id=frame_id,
                         frames=frames)
                total_count = total_count + 1

        print()
        print("num of all zeros masks is: {}".format(all_zero_mask_count))
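
A hypothetical invocation of build_a2d_batches; the argument values are placeholders, and a2d_dir, root_dir and debug are module-level globals in the original script.

if __name__ == '__main__':
    # writes one .npz batch per non-empty ground-truth frame and sentence
    build_a2d_batches(T=20, input_H=320, input_W=320, video=False)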
Example #13
0
def build_refvos_batch(setname,
                       T,
                       input_H,
                       input_W,
                       im_dir,
                       mask_dir,
                       meta_expressions,
                       save_dir,
                       inrange=None):
    vocab_file = './data/vocabulary_Gref.txt'

    print(save_dir)
    # saving directory
    data_folder = os.path.join(save_dir, 'refvos/' + setname + '_batch/')
    data_prefix = 'refvos_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations
    query_dict = json.load(open(meta_expressions))
    videos = query_dict['videos']
    samples = []
    for vid in videos:
        video = videos[vid]
        expressions = video['expressions']
        frames = video['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            obj_id = expressions[eid]['obj_id']
            for fid in frames:
                im_name = os.path.join(vid, fid + '.jpg')
                mask_name = os.path.join(vid, fid + '.png')
                samples.append((im_name, mask_name, exp, obj_id))

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # save batches to disk
    num_batch = len(samples)
    batch_ind = 0
    if inrange is None:
        inrange = range(num_batch)
    for n_batch in inrange:
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent, obj_id = samples[n_batch]
        im_path = os.path.join(im_dir, im_name)
        mask_path = os.path.join(mask_dir, mask_name)
        if not (os.path.exists(im_path) and os.path.exists(mask_path)):
            continue
        im = skimage.io.imread(im_path)
        mask = skimage.io.imread(mask_path)[:, :, :3]
        mask_color = object_color[obj_id]
        mask_obj = np.asarray((mask == mask_color))
        if (len(mask_obj.shape) == 0):
            continue
        mask_obj = mask_obj[:, :, 0]
        if np.max(mask_obj) == 0:
            print(im_name)
            continue
        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask_obj, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)

        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
        batch_ind += 1
Example #14
0
 def get_batch(self, split='train', shuffle=True, echo=True, image_id=None):
     if image_id is None:
         batch_list = self.batch_list[split][:]
         if len(batch_list) == 0:
             batch_list = self.image_split_ids[split][:]
             self.epoch[split] += 1
             if shuffle:
                 random.shuffle(batch_list)
         if echo:
             print('data reader: epoch = %d, batch = %d / %d' %
                   (self.epoch[split], len(self.image_split_ids[split]) -
                    len(batch_list), len(self.image_split_ids[split])))
         image_id = batch_list.pop(0)
         self.batch_list[split] = batch_list
     ann_ids = self.image_to_anns[image_id]
     batch = {}
     batch['im_id'] = image_id
     # coco_bboxes, category_batch
     coco_bboxes = []
     vis_batch = []
     spa_batch = []
     if self.use_category:
         category_batch = []
         visdif_batch = []
         spadif_batch = []
     for ann_id in self.image_to_anns[image_id]:
         coco_bboxes.append(self.ann_to_box[ann_id])
         vis_batch.append(self.ann_vis_feats[ann_id])
         spa_batch.append(self.ann_spa_feats[ann_id])
         if self.use_category:
             category_batch.append(self.ann_to_cat[ann_id])
             visdif_batch.append(self.ann_visdif_feats[ann_id])
             spadif_batch.append(self.ann_spadif_feats[ann_id])
     batch['coco_bboxes'] = np.array(coco_bboxes, dtype=np.float32)
     batch['vis_batch'] = np.array(vis_batch, dtype=np.float32)
     batch['spa_batch'] = np.array(spa_batch, dtype=np.float32)
     if self.use_category:
         batch['category_batch'] = np.array(category_batch, dtype=np.int32)
         batch['visdif_batch'] = np.array(visdif_batch, dtype=np.float32)
         batch['spadif_batch'] = np.array(spadif_batch, dtype=np.float32)
     # coco_ann_ids, label_batch
     coco_ann_ids = []
     label_batch = []
     questions = []
     text_zseq_batch = []  # zero + seq for comprehension
     text_seqz_batch = []  # seq + zero for generation
     for ref_id in self.image_to_refs[image_id]:
         ref = self.refs[ref_id]
         ann_id = self.ref_to_ann[ref_id]
         if ref['split'] == split:
             for sent_id in ref['sent_ids']:
                 sent = self.sents[sent_id]['sent']
                 # refine sentence
                 coco_ann_ids.append(ann_id)
                 label_batch.append(ann_ids.index(ann_id))
                 questions.append(sent)
                 text_zseq_batch.append(
                     text_processing.preprocess_sentence(sent,
                                                         self.vocab_dict,
                                                         T=20,
                                                         mode='zseq'))
                 text_seqz_batch.append(
                     text_processing.preprocess_sentence(sent,
                                                         self.vocab_dict,
                                                         T=20,
                                                         mode='seqz'))
     text_zseq_batch = np.array(text_zseq_batch, dtype=np.int32).T
     text_seqz_batch = np.array(text_seqz_batch, dtype=np.int32).T
     batch['coco_ann_ids'] = coco_ann_ids
     batch['label_batch'] = np.array(label_batch, dtype=np.int32)
     batch['questions'] = questions
     batch['text_zseq_batch'] = np.array(text_zseq_batch, dtype=np.int32)
     batch['text_seqz_batch'] = np.array(text_seqz_batch, dtype=np.int32)
     return batch
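
The two text batches built above differ only in where the zero padding goes, as the comments 'zero + seq for comprehension' and 'seq + zero for generation' indicate. A small illustration of the assumed layouts (pad_modes_sketch is hypothetical):

import numpy as np

def pad_modes_sketch(ids, T):
    zseq = np.zeros(T, dtype=np.int32)
    zseq[T - len(ids):] = ids   # zeros first, then the word ids ('zseq')
    seqz = np.zeros(T, dtype=np.int32)
    seqz[:len(ids)] = ids       # word ids first, then zeros ('seqz')
    return zseq, seqz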
Example #15
0
def inference():
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', test_config.N)))

    caffe.set_device(test_config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    test_config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(test_config.query_file))
    bbox_dict = json.load(open(test_config.bbox_file))
    imcrop_dict = json.load(open(test_config.imcrop_file))
    imsize_dict = json.load(open(test_config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(test_config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((test_config.N, test_config.input_H, test_config.input_W, 3), dtype=np.float32)
    text_seq_val = np.zeros((test_config.T, test_config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(test_config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, test_config.input_H, test_config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(test_config.N,
                                                              test_config.featmap_H,
                                                              test_config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(test_config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, test_config.input_H, test_config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, test_config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the segmentation performance of the predicted mask
            pred_raw = (upscores >= test_config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I/float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (I/float(U) >= eval_seg_iou)
            seg_total += 1


    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou]/seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I/cum_U)
    print(result_str)
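
eval_tools.compute_mask_IU is used above as returning intersection and union pixel counts for a predicted and a ground-truth mask. A minimal sketch of that assumed behaviour:

import numpy as np

def compute_mask_IU_sketch(pred, gt):
    pred = np.asarray(pred, dtype=bool)
    gt = np.asarray(gt, dtype=bool)
    # pixel counts of the overlap and of the combined foreground
    return np.sum(pred & gt), np.sum(pred | gt)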
Example #16
0
def inference(config):
    with open('./det_model/fc8.prototxt', 'w') as f:
        f.write(str(det_model.generate_fc8('val', config)))
    with open('./det_model/scores.prototxt', 'w') as f:
        f.write(str(det_model.generate_scores('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    fc8_net = caffe.Net('./det_model/fc8.prototxt', config.pretrained_model,
                        caffe.TEST)

    scores_net = caffe.Net('./det_model/scores.prototxt',
                           config.pretrained_model, caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    # Object proposals
    bbox_proposal_dict = {}
    for imname in imlist:
        bboxes = np.loadtxt(config.bbox_proposal_dir + imname[:-4] +
                            '.txt').astype(int).reshape((-1, 4))
        bbox_proposal_dict[imname] = bboxes

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append(
                    (imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    eval_bbox_num_list = [1, 10, 100]
    bbox_correct = np.zeros(len(eval_bbox_num_list), dtype=np.int32)
    bbox_total = 0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3),
                          dtype=np.float32)
    spatial_val = np.zeros((config.N, 8), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    dummy_text_seq = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_cont = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_label = np.zeros((config.N, 1))

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]
        imsize = imsize_dict[imname]
        bbox_proposals = bbox_proposal_dict[imname]
        num_proposal = bbox_proposals.shape[0]
        assert (config.N >= num_proposal)

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[:num_proposal,
                   ...] = im_processing.crop_bboxes_subtract_mean(
                       im, bbox_proposals, config.input_H,
                       det_model.channel_mean)
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract bounding box features from proposals
        spatial_val[:num_proposal, ...] = \
            processing_tools.spatial_feature_from_bbox(bbox_proposals, imsize)

        fc8_net.blobs['language'].data[...] = dummy_text_seq
        fc8_net.blobs['cont'].data[...] = dummy_cont
        fc8_net.blobs['image'].data[...] = imcrop_val_trans
        fc8_net.blobs['spatial'].data[...] = spatial_val
        fc8_net.blobs['label'].data[...] = dummy_label

        fc8_net.forward()
        fc8_val = fc8_net.blobs['fc8'].data[...].copy()

        # Extract textual features from sentences
        for imcrop_name, gt_bbox, description in flat_query_dict[imname]:
            proposal_IoUs = eval_tools.compute_bbox_iou(
                bbox_proposals, gt_bbox)

            # Extract language feature
            text = text_processing.preprocess_sentence(description, vocab_dict,
                                                       config.T)
            text_seq_val[...] = np.array(text, dtype=np.int32).reshape((-1, 1))

            cont_val = text_processing.create_cont(text_seq_val)
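            # The single tokenized sentence (shape (T, 1)) is broadcast across
            # all N columns of text_seq_val so it can be scored against every
            # proposal in one forward pass. create_cont is assumed to produce
            # the continuation markers the Caffe LSTM expects (0 at a sequence
            # start or padding position, 1 for subsequent tokens).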

            scores_net.blobs['language'].data[...] = text_seq_val
            scores_net.blobs['cont'].data[...] = cont_val
            scores_net.blobs['img_feature'].data[...] = fc8_val
            scores_net.blobs['spatial'].data[...] = spatial_val
            scores_net.blobs['label'].data[...] = dummy_label

            scores_net.forward()

            scores_val = scores_net.blobs['scores'].data.copy()
            scores_val = scores_val[:num_proposal, ...].reshape(-1)

            # Sort the scores for the proposals
            if config.use_nms:
                top_ids = eval_tools.nms(bbox_proposals.astype(np.float32),
                                         scores_val, config.nms_thresh)
            else:
                top_ids = np.argsort(scores_val)[::-1]

            # Evaluate on bounding boxes
            for n_eval_num in range(len(eval_bbox_num_list)):
                eval_bbox_num = eval_bbox_num_list[n_eval_num]
                bbox_correct[n_eval_num] += \
                    np.any(proposal_IoUs[top_ids[:eval_bbox_num]] >= config.correct_iou_thresh)
            bbox_total += 1
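            # A query counts as correct at top-k if any of the k highest-scoring
            # proposals overlaps the ground-truth box with
            # IoU >= config.correct_iou_thresh; the recall@k values printed
            # below are these counts divided by bbox_total.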

    print('Final results on the whole test set')
    result_str = ''
    for n_eval_num in range(len(eval_bbox_num_list)):
        result_str += 'recall@%s = %f\n' % \
            (str(eval_bbox_num_list[n_eval_num]), bbox_correct[n_eval_num]/bbox_total)
    print(result_str)
    imname = imcrop_name.split('_', 1)[0] + '.jpg'
    mask_name = imcrop_name + '.mat'
    im = skimage.io.imread(image_dir + imname)
    mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

    processed_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
    if processed_im.ndim == 2:
        processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
    processed_mask = im_processing.resize_and_pad(mask, input_H, input_W)
    subsampled_mask = skimage.transform.downscale_local_mean(processed_mask, (32, 32))

    labels_fine = (processed_mask > 0)
    labels_coarse = (subsampled_mask > 0)
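    # labels_fine is the binary mask at input resolution, while labels_coarse
    # is the same mask binarized after 32x32 local-mean downsampling, i.e. at
    # roughly the resolution of the network's final feature map.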

    for description in query_dict[imcrop_name]:
        text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)
        training_samples.append((processed_im, text_seq, labels_coarse, labels_fine))

# Shuffle the training instances
np.random.seed(3)
shuffle_idx = np.random.permutation(len(training_samples))
shuffled_training_samples = [training_samples[n] for n in shuffle_idx]
print('total training instance number: %d' % len(shuffled_training_samples))

# Create training batches
num_batch = len(shuffled_training_samples) // N
print('total batch number: %d' % num_batch)

################################################################################
# Save training samples to disk
################################################################################
def main(args):

    ################################################################################
    # Validate input arguments
    ################################################################################
    assert not (
        args.concat and (not args.multicrop)
    ), "Cannot test concatenated labels on single image crop per batch."
    assert not (args.classes and args.concat
                ), "Cannot test concatenated labels when using image classes"
    assert not (
        args.classes and (not args.multicrop)
    ), "Cannot test on single image per batch when using image classes"

    # Initialize GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.GPU_ID

    # Print evaluation settings
    print()
    print("Model:", pretrained_model)
    print("All crops per batch - True | First crop per batch - False:",
          args.multicrop)
    print("Concatenated captions - True | Simple captions - False:",
          args.concat)
    print("Image Classes - True | Image Descriptions - False:", args.classes)
    print()

    ################################################################################
    # Evaluation network
    ################################################################################

    # Inputs
    text_seq_batch = tf.placeholder(tf.int32, [T, N])
    imcrop_batch = tf.placeholder(tf.float32, [N, 224, 224, 3])
    lstm_top_batch = tf.placeholder(tf.float32, [N, D_text])
    fc8_crop_batch = tf.placeholder(tf.float32, [N, D_im])

    # Language feature (LSTM hidden state)
    lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim,
                                 lstm_dim)

    # Local image feature
    fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 1 (feature dimension)
    feat_all = tf.concat(axis=1,
                         values=[
                             tf.nn.l2_normalize(lstm_top_batch, 1),
                             tf.nn.l2_normalize(fc8_crop_batch, 1)
                         ])

    # Outputs
    # MLP Classifier over concatenate feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
    scores = mlp_l2
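    # The two-layer MLP maps each concatenated (language, image) feature to a
    # single matching score; at evaluation time a positive score is read as
    # "the sentence matches the image" (see the scores_val > 0 check below).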

    # Load pretrained model
    snapshot_restorer = tf.train.Saver(None)
    sess = tf.Session()
    snapshot_restorer.restore(sess, pretrained_model)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    coco = COCO(query_file)
    coco_captions = COCO(caption_file)
    imgid_list = coco.getImgIds()
    catid_list = coco.getCatIds()

    ################################################################################
    # Load testing data
    ################################################################################

    testing_samples_pos = []
    testing_samples_neg = []
    num_imcrop = len(imgid_list)

    # Gather a testing example per full image.
    for n_imcrop in range(num_imcrop):
        # image
        img_id = imgid_list[n_imcrop]

        # get the descriptions of the image
        caption_ids = coco_captions.getAnnIds(imgIds=img_id)
        captions = [
            x['caption'].strip() for x in coco_captions.loadAnns(caption_ids)
        ]

        if args.concat:
            # append two positive captions; one with itself if only one present
            pos_desc = captions[0] + ' and ' + captions[-1]
            testing_samples_pos.append((img_id, pos_desc, 1))

            # form negative examples by choosing random image
            # that is not the current image, get its descriptions,
            # and choose one at random.
            false_idx = n_imcrop
            while false_idx == n_imcrop:
                false_idx = randint(0, num_imcrop - 1)
            desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
            desc_idx = randint(0, len(desc_ids) - 1)
            neg_desc1 = coco_captions.loadAnns(
                desc_ids[desc_idx])[0]['caption'].strip()

            false_idx = n_imcrop
            while false_idx == n_imcrop:
                false_idx = randint(0, num_imcrop - 1)
            desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
            desc_idx = randint(0, len(desc_ids) - 1)
            neg_desc2 = coco_captions.loadAnns(
                desc_ids[desc_idx])[0]['caption'].strip()

            # negative example: append two negative captions
            neg_desc = neg_desc1 + ' and ' + neg_desc2
            testing_samples_neg.append((img_id, neg_desc, 0))

            # negative example: append one negative and one positive example
            neg_desc = neg_desc1 + ' and ' + captions[0].strip()
            testing_samples_neg.append((img_id, neg_desc, 0))
            neg_desc = captions[0].strip() + ' and ' + neg_desc1
            testing_samples_neg.append((img_id, neg_desc, 0))

        # for appending image captions
        elif args.classes:
            img_catids = coco.getCatIds(imgIds=img_id)
            img_cat_names = [cat['name'] for cat in coco.loadCats(img_catids)]
            for category in img_cat_names:
                testing_samples_pos.append((img_id, category, 1))

                # form one negative example by choosing random category that
                # img is not in
                false_catid = img_catids[0]
                while false_catid in img_catids:
                    false_catid = catid_list[randint(0, len(catid_list) - 1)]
                false_cat_name = coco.loadCats(false_catid)[0]['name']
                testing_samples_neg.append((img_id, false_cat_name, 0))

        else:
            for caption in captions:
                # append one positive sample per description
                testing_samples_pos.append((img_id, caption, 1))

                # form one negative example by choosing random image
                # that is not the current image, get its descriptions,
                # and choose one at random.
                false_idx = n_imcrop
                while false_idx == n_imcrop:
                    false_idx = randint(0, num_imcrop - 1)
                desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
                desc_idx = randint(0, len(desc_ids) - 1)
                false_cap = coco_captions.loadAnns(
                    desc_ids[desc_idx])[0]['caption'].strip()

                testing_samples_neg.append((img_id, false_cap, 0))

    # Combine samples
    print('#pos=', len(testing_samples_pos))
    print('#neg=', len(testing_samples_neg))

    # TODO: it is unclear what the multicrop=False mode is meant to test here;
    # it only drops the positive examples from evaluation.
    if args.multicrop:
        testing_samples = testing_samples_pos + testing_samples_neg
    else:
        testing_samples = testing_samples_neg

    print('#total testing examples=', len(testing_samples))
    num_batch = len(testing_samples) // N
    print('total batch number: %d' % num_batch)
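    # Note: the floor division above means any trailing samples beyond
    # num_batch * N are silently skipped during evaluation.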

    ################################################################################
    # Testing
    ################################################################################

    # Pre-allocate arrays
    imcrop_val = np.zeros((N, 224, 224, 3), dtype=np.float32)
    text_seq_val = np.zeros((T, N), dtype=np.int32)
    lstm_top_val = np.zeros((N, D_text))
    label_val = np.zeros((N, 1), dtype=bool)

    correct_predictions = 0
    total_predictions = 0

    # optimization for faster image loading
    last_img_id = -100
    last_imcrop = None

    for n_batch in range(num_batch):
        print('batch %d / %d' % (n_batch + 1, num_batch))
        batch_begin = n_batch * N
        batch_end = (n_batch + 1) * N

        # load and preprocess the anchor image for this batch: the last sample
        # of the previous batch (or the first sample for the first batch)
        first_img_id = testing_samples[max(batch_begin - 1, 0)][0]
        first_imname = coco.loadImgs(first_img_id)[0]['coco_url']
        first_im = skimage.io.imread(first_imname)
        # skip the whole batch if the anchor image is grayscale
        if first_im.ndim != 3: continue
        first_imcrop = skimage.img_as_ubyte(
            skimage.transform.resize(first_im, [224, 224]))

        for n_sample in range(batch_begin, batch_end):
            img_id, description, label = testing_samples[n_sample]

            # Preprocess image and caption
            if args.multicrop:
                # Optimization: do not reload image if it is the same as the last one
                if img_id == last_img_id:
                    imcrop = last_imcrop
                else:
                    imname = coco.loadImgs(img_id)[0]['coco_url']
                    im = skimage.io.imread(imname)

                    # ignore grayscale images
                    if len(np.shape(im)) != 3: continue

                    imcrop = skimage.img_as_ubyte(
                        skimage.transform.resize(im, [224, 224]))
                    last_img_id = img_id
                    last_imcrop = imcrop
            else:
                imcrop = first_imcrop
            text_seq = text_processing.preprocess_sentence(
                description, vocab_dict, T)

            # Form batch
            idx = n_sample - batch_begin
            text_seq_val[:, idx] = text_seq
            imcrop_val[idx, ...] = imcrop - vgg_net.channel_mean
            label_val[idx] = label

        # Extract visual feature
        fc8_crop_val = sess.run(fc8_crop, feed_dict={imcrop_batch: imcrop_val})

        # Extract language feature
        lstm_top_val[...] = sess.run(lstm_top,
                                     feed_dict={text_seq_batch: text_seq_val})

        # Compute scores per proposal
        scores_val = sess.run(scores,
                              feed_dict={
                                  lstm_top_batch: lstm_top_val,
                                  fc8_crop_batch: fc8_crop_val
                              })
        scores_val = scores_val[:batch_end - batch_begin, ...].reshape(-1)

        # Evaluate on bounding labels
        for indx in range(len(scores_val)):
            correct_predictions += ((scores_val[indx] > 0) == label_val[indx])
            total_predictions += 1

        print("%d correct predictions out of %d" %
              (correct_predictions, total_predictions))
        print(correct_predictions / total_predictions)

    print('Final results on the whole test set')
    result_str = 'accuracy = %0.4f\n' % (float(correct_predictions) /
                                         total_predictions)
    print(result_str)
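
# A minimal sketch of how main() might be driven from the command line; the
# flag names below simply mirror the attributes accessed above (args.GPU_ID,
# args.multicrop, args.concat, args.classes) and are an assumption, not the
# original script's argument parser.
#
# if __name__ == '__main__':
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--GPU_ID', default='0')
#     parser.add_argument('--multicrop', action='store_true')
#     parser.add_argument('--concat', action='store_true')
#     parser.add_argument('--classes', action='store_true')
#     main(parser.parse_args())
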
def inference(config):
    with open('./det_model/fc8.prototxt', 'w') as f:
        f.write(str(det_model.generate_fc8('val', config)))
    with open('./det_model/scores.prototxt', 'w') as f:
        f.write(str(det_model.generate_scores('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    fc8_net = caffe.Net('./det_model/fc8.prototxt',
                        config.pretrained_model,
                        caffe.TEST)

    scores_net = caffe.Net('./det_model/scores.prototxt',
                           config.pretrained_model,
                           caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    # Object proposals
    bbox_proposal_dict = {}
    for imname in imlist:
        bboxes = np.loadtxt(config.bbox_proposal_dir + imname[:-4] + '.txt').astype(int).reshape((-1, 4))
        bbox_proposal_dict[imname] = bboxes

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    eval_bbox_num_list = [1, 10, 100]
    bbox_correct = np.zeros(len(eval_bbox_num_list), dtype=np.int32)
    bbox_total = 0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    spatial_val = np.zeros((config.N, 8), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    dummy_text_seq = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_cont = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_label = np.zeros((config.N, 1))

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]
        imsize = imsize_dict[imname]
        bbox_proposals = bbox_proposal_dict[imname]
        num_proposal = bbox_proposals.shape[0]
        assert(config.N >= num_proposal)

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[:num_proposal, ...] = im_processing.crop_bboxes_subtract_mean(
            im, bbox_proposals, config.input_H, det_model.channel_mean)
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract bounding box features from proposals
        spatial_val[:num_proposal, ...] = \
            processing_tools.spatial_feature_from_bbox(bbox_proposals, imsize)

        fc8_net.blobs['language'].data[...] = dummy_text_seq
        fc8_net.blobs['cont'].data[...] = dummy_cont
        fc8_net.blobs['image'].data[...] = imcrop_val_trans
        fc8_net.blobs['spatial'].data[...] = spatial_val
        fc8_net.blobs['label'].data[...] = dummy_label

        fc8_net.forward()
        fc8_val = fc8_net.blobs['fc8'].data[...].copy()

        # Extract textual features from sentences
        for imcrop_name, gt_bbox, description in flat_query_dict[imname]:
            proposal_IoUs = eval_tools.compute_bbox_iou(bbox_proposals, gt_bbox)

            # Extract language feature
            text = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            text_seq_val[...] = np.array(text, dtype=np.int32).reshape((-1, 1))

            cont_val = text_processing.create_cont(text_seq_val)

            scores_net.blobs['language'].data[...] = text_seq_val
            scores_net.blobs['cont'].data[...] = cont_val
            scores_net.blobs['img_feature'].data[...] = fc8_val
            scores_net.blobs['spatial'].data[...] = spatial_val
            scores_net.blobs['label'].data[...] = dummy_label

            scores_net.forward()

            scores_val = scores_net.blobs['scores'].data.copy()
            scores_val = scores_val[:num_proposal, ...].reshape(-1)

            # Sort the scores for the proposals
            if config.use_nms:
                top_ids = eval_tools.nms(bbox_proposals.astype(np.float32), scores_val, config.nms_thresh)
            else:
                top_ids = np.argsort(scores_val)[::-1]

            # Evaluate on bounding boxes
            for n_eval_num in range(len(eval_bbox_num_list)):
                eval_bbox_num = eval_bbox_num_list[n_eval_num]
                bbox_correct[n_eval_num] += \
                    np.any(proposal_IoUs[top_ids[:eval_bbox_num]] >= config.correct_iou_thresh)
            bbox_total += 1

    print('Final results on the whole test set')
    result_str = ''
    for n_eval_num in range(len(eval_bbox_num_list)):
        result_str += 'recall@%s = %f\n' % \
            (str(eval_bbox_num_list[n_eval_num]), bbox_correct[n_eval_num]/bbox_total)
    print(result_str)
Example #20
0
def inference(config):
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and ground-truth boxes
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, config.input_H, config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(config.N,
                                                              config.featmap_H,
                                                              config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))
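        # generate_spatial_batch is assumed to return an
        # (N, featmap_H, featmap_W, D) grid of per-location spatial coordinates
        # for the feature map; like the image batch, it is transposed to NCHW
        # layout for Caffe.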

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, config.input_H, config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the predicted segmentation mask against the ground-truth mask
            pred_raw = (upscores >= config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1
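            # cum_I / cum_U accumulates the corpus-level ("overall IoU") metric,
            # while seg_correct counts queries whose per-query IoU clears each
            # threshold in eval_seg_iou_list (reported as precision@X below).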


    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou]/seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I/cum_U)
    print(result_str)