def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor:
    # make the short side equal to min_size, but keep the long side
    # no bigger than max_size
    im_h, im_w = im.shape[:2]
    scale = min(max(min_size / im_h, min_size / im_w),
                max_size / im_h, max_size / im_w)

    # resize and process the image
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # annotated regions
    regions = iminfo['regions']
    if len(regions) == 0:
        raise IOError('no region annotations for image ' + im_path)
    region_bboxes = np.array([ann[0] for ann in regions], np.float32)

    # save coco_bboxes, needed for the evaluation code;
    # convert back to [x, y, w, h]
    coco_bboxes = region_bboxes.copy()
    coco_bboxes[:, 2:4] = coco_bboxes[:, 2:4] - coco_bboxes[:, 0:2] + 1
    region_bboxes *= scale
    region_bboxes = im_processing.rectify_bboxes(region_bboxes, height=new_h, width=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(region_bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = region_bboxes
    spatial_batch = spatial_feature_from_bbox(region_bboxes, im_h=new_h, im_w=new_w)

    # a region may have zero, one or more sentence annotations;
    # align language sequences with regions
    text_seq_batch = []
    label_batch = []
    coco_ann_ids = []  # needed for the evaluation code
    questions = []     # needed for the evaluation code
    for n in range(len(regions)):
        for n_s in range(len(regions[n][1])):
            s = regions[n][1][n_s]
            text_seq_batch.append(text_processing.preprocess_sentence(s, vocab_dict, T))
            label_batch.append(n)
            coco_ann_ids.append(regions[n][2])
            questions.append(s)
    text_seq_batch = np.array(text_seq_batch, dtype=np.int32).T
    label_batch = np.array(label_batch, dtype=np.int32)

    batch = dict(text_seq_batch=text_seq_batch, im_batch=im_batch,
                 bbox_batch=bbox_batch, spatial_batch=spatial_batch,
                 label_batch=label_batch, coco_ann_ids=coco_ann_ids,
                 questions=questions, coco_bboxes=coco_bboxes)
    return batch
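
# The scale rule above is easy to misread, so here is a minimal standalone
# sketch of the same computation with a worked example (compute_resize_scale
# is an illustrative name, not part of the codebase):
def compute_resize_scale(im_h, im_w, min_size, max_size):
    # scale the short side up to min_size, unless that would push the
    # long side past max_size, in which case the long side is capped
    return min(max(min_size / im_h, min_size / im_w),
               max_size / im_h, max_size / im_w)

# e.g. a 400x600 image with min_size=600, max_size=1000: the short-side
# scale 600/400 = 1.5 keeps the long side at 900 <= 1000, so it is used.
assert compute_resize_scale(400.0, 600.0, 600.0, 1000.0) == 1.5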
def build_coco_batches(dataset, setname, T, input_H, input_W):
    im_dir = '/data/ryli/datasets/coco/images'
    im_type = 'train2014'
    vocab_file = './data/vocabulary_Gref.txt'

    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    if dataset == 'Gref':
        refer = REFER('./external/refer/data', dataset='refcocog', splitBy='google')
    elif dataset == 'unc':
        refer = REFER('./external/refer/data', dataset='refcoco', splitBy='unc')
    elif dataset == 'unc+':
        refer = REFER('./external/refer/data', dataset='refcoco+', splitBy='unc')
    else:
        raise ValueError('Unknown dataset %s' % dataset)
    refs = [refer.Refs[ref_id] for ref_id in refer.Refs
            if refer.Refs[ref_id]['split'] == setname]
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    n_batch = 0
    for ref in refs:
        im_name = 'COCO_' + im_type + '_' + str(ref['image_id']).zfill(12)
        im = skimage.io.imread('%s/%s/%s.jpg' % (im_dir, im_type, im_name))
        seg = refer.Anns[ref['ann_id']]['segmentation']
        rle = cocomask.frPyObjects(seg, im.shape[0], im.shape[1])
        mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        for sentence in ref['sentences']:
            print('saving batch %d' % (n_batch + 1))
            sent = sentence['sent']
            text = text_processing.preprocess_sentence(sent, vocab_dict, T)
            np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                     text_batch=text,
                     im_batch=im,
                     mask_batch=(mask > 0),
                     sent_batch=[sent])
            n_batch += 1
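
# For reference, the RLE decoding above follows the standard pycocotools
# pattern; a minimal standalone sketch with a made-up triangle polygon
# (the COCO 'segmentation' format: a list of flat [x1, y1, x2, y2, ...] rings):
import numpy as np
from pycocotools import mask as cocomask

seg = [[10.0, 10.0, 60.0, 10.0, 35.0, 50.0]]  # one triangular polygon
h, w = 100, 100
# frPyObjects converts polygons to one RLE per ring; decode returns an
# (h, w, n) array, so np.max over the last axis merges the rings into a
# single binary mask, exactly as in build_coco_batches above.
rle = cocomask.frPyObjects(seg, h, w)
mask = np.max(cocomask.decode(rle), axis=2).astype(np.float32)
print(mask.shape, mask.sum())  # (100, 100) and the triangle's pixel area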
def vectorizeLearntEmbd(args):
    # (vocab, embd, words, T, N, plot_dir and vector_count are module-level
    # globals defined elsewhere in this script)
    if args.checkpoint == '':
        # Network
        if args.savefile == "det":
            vocab_size = 8803
            embedding_dim = 1000
            vocab_file = './exp-referit/data/vocabulary_referit.txt'
            vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
            pretrained_model = './exp-referit/tfmodel/referit_fc8_det_iter_25000.tfmodel'
        else:
            vocab_size = len(vocab)
            embedding_dim = len(embd[0])
            vocab_dict = dict()
            for i in range(len(vocab)):
                vocab_dict[vocab[i]] = i
            pretrained_model = './coco/tfmodel/cls_coco_glove_20000.tfmodel'

        # Inputs
        text_seq_batch = tf.placeholder(tf.int32, [T, N])
        embedem = embedding_layer(text_seq_batch, vocab_size, embedding_dim)

        # Load pretrained model
        snapshot_restorer = tf.train.Saver(None)
        sess = tf.Session()
        snapshot_restorer.restore(sess, pretrained_model)

        # Initialize arrays
        vectors = list()
        text_seq_val = np.zeros((T, N), dtype=np.int32)

        # Generate vector embeddings
        count = 0
        for word in words:
            count += 1
            if count % 100 == 0:
                print("%d out of %d words processed" % (count, len(words)))

            # Preprocess word
            text_seq = text_processing.preprocess_sentence(word, vocab_dict, T)
            text_seq_val[:, 0] = text_seq

            # Look up the learnt embedding for this word
            embedded_seq = sess.run(embedem, feed_dict={text_seq_batch: text_seq_val})
            temp = np.squeeze(np.transpose(embedded_seq))
            vectors.append(temp)
            if count == vector_count:
                break

        # Save vectors for easy recovery
        backup = args.savefile + "_TSNE_backup.npz"
        np.savez(os.path.join(plot_dir, backup), words=words, vectors=vectors)
    else:
        # Load saved vectors
        npzfile = np.load(os.path.join(plot_dir, args.checkpoint))
        vectors = npzfile['vectors']
    return vectors
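
# The "_TSNE_backup.npz" naming suggests these vectors feed a t-SNE plot
# elsewhere in the script; a hedged sketch of that downstream step, assuming
# scikit-learn is available (the path below is illustrative):
import numpy as np
from sklearn.manifold import TSNE

npzfile = np.load('det_TSNE_backup.npz')
words, vectors = npzfile['words'], np.asarray(npzfile['vectors'])

# Project the learnt embeddings to 2-D for plotting.
coords = TSNE(n_components=2, random_state=0).fit_transform(vectors)
for word, (x, y) in zip(words, coords):
    print(word, x, y)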
def build_referit_batches(setname, T, input_H, input_W):
    # data directory
    im_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/images/'
    mask_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/mask/'
    query_file = './data/referit/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    fp = open('./referit/trainval_list.txt', 'w')

    # load annotations
    query_dict = json.load(open(query_file))
    im_list = query_dict.keys()
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for n_im, name in enumerate(im_list):
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    num_batch = len(samples)
    for n_batch in range(num_batch):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent = samples[n_batch]
        fp.write('%d\t%s%s\n' % (n_batch, im_dir, im_name))
        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

        if 'train' in setname:
            im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)
        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
    fp.close()
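
# Each saved .npz is self-describing, so a consumer only needs np.load;
# a minimal sketch of reading one batch back (index and setname illustrative):
import numpy as np

batch = np.load('./referit/train_batch/referit_train_0.npz')
text = batch['text_batch']     # (T,) int32 word ids, zero-padded
im = batch['im_batch']         # (H, W, 3) uint8 image
mask = batch['mask_batch']     # (H, W) boolean ground-truth mask
sent = batch['sent_batch'][0]  # the raw referring expression
print(text.shape, im.shape, mask.shape, sent)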
def load_one_batch(iminfo, im_mean, min_size, max_size, vocab_dict, T):
    im_path = iminfo['im_path']
    im = skimage.io.imread(im_path)
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))

    # calculate the resize scaling factor:
    # make the short side equal to min_size, but keep the long side
    # no bigger than max_size
    im_h, im_w = im.shape[:2]
    scale = min(max(min_size / im_h, min_size / im_w),
                max_size / im_h, max_size / im_w)

    # resize and process the image
    new_h, new_w = int(scale * im_h), int(scale * im_w)
    im_resized = skimage.img_as_float(skimage.transform.resize(im, [new_h, new_w]))
    im_processed = im_resized * 255 - im_mean
    im_batch = im_processed[np.newaxis, ...].astype(np.float32)

    # Collect all QA pairs of this image
    qa_pairs = iminfo['processed_qa_pairs']
    num_questions = len(qa_pairs)
    num_choices = 4
    text_seq_batch = np.zeros((T, num_questions * num_choices), dtype=np.int32)
    label_batch = np.zeros(num_questions, dtype=np.int32)
    bboxes = np.zeros((num_questions * num_choices, 4), np.float32)
    for n_q in range(num_questions):
        this_bboxes, question, label = qa_pairs[n_q]
        bboxes[n_q * num_choices:(n_q + 1) * num_choices, :] = this_bboxes
        # replicate the question sequence across the 4 candidate boxes
        text_seq_batch[:, n_q * num_choices:(n_q + 1) * num_choices] = \
            np.array(text_processing.preprocess_sentence(question, vocab_dict, T)).reshape((T, 1))
        label_batch[n_q] = label

    # rectify the candidate boxes and build spatial features
    bboxes *= scale
    bboxes = im_processing.rectify_bboxes(bboxes, height=new_h, width=new_w)
    spatial_batch = spatial_feature_from_bbox(bboxes, im_h=new_h, im_w=new_w)

    # For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
    bbox_batch = np.zeros((len(bboxes), 5), np.float32)
    bbox_batch[:, 1:5] = bboxes

    batch = dict(im_batch=im_batch, bbox_batch=bbox_batch,
                 spatial_batch=spatial_batch, text_seq_batch=text_seq_batch,
                 label_batch=label_batch)
    return batch
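
# The reshape-to-(T, 1) above relies on numpy broadcasting to copy one
# question across its num_choices candidate boxes; a minimal sketch of just
# that step with toy numbers:
import numpy as np

T, num_choices = 5, 4
text_seq = np.zeros((T, 2 * num_choices), dtype=np.int32)
question_ids = np.array([0, 0, 7, 42, 3])  # toy tokenized question, zero-padded

# Assigning a (T, 1) column into a (T, 4) slice broadcasts it, so all four
# choices of question 0 carry the same word ids.
text_seq[:, 0:num_choices] = question_ids.reshape((T, 1))
print(text_seq[:, :num_choices])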
def load_one_batch(n_iter):
    global imcrop_val
    global spatial_val
    global text_seq_val
    global label_val

    print('data reader: epoch = %d, batch = %d / %d' %
          (n_iter // num_images, n_iter % num_images, num_images))

    # Read one batch
    # Get images
    image = image_list[n_iter % num_images]
    for h in range(height):
        for w in range(width):
            crop = image[h * 10:(h + 1) * 10, w * 10:(w + 1) * 10, :]
            imcrop_val[h * width + w] = skimage.transform.resize(crop, [IM_H, IM_W])
            bboxes[h * width + w] = [w, h, w + 1, h + 1]  # [x1, y1, x2, y2]
    imcrop_val *= 255
    imcrop_val -= vgg_net.channel_mean

    # Get spatial batch
    spatial_val = spatial_feat.spatial_feature_from_bbox(bboxes, im_h=height, im_w=width)

    # Get text sequence
    expr_obj = query_list[n_iter % num_images]
    text_seq_val[:, 0] = text_processing.preprocess_sentence(expr_obj, vocab_dict, T)

    # Get labels
    matched_pairs = matched_pairs_list[n_iter % num_images]
    (h1, w1), (h2, w2) = matched_pairs[0]  # just take the first matched pair
    if strong_supervision:
        label_val[...] = (h1 * width + w1) * N_bbox + (h2 * width + w2)
    else:
        label_val[...] = (h1 * width + w1)

    batch = dict(imcrop_batch=imcrop_val, spatial_batch=spatial_val,
                 text_seq_batch=text_seq_val, label_batch=label_val)
    return batch
def preprocess_data(im, mask, sent, obj_id):
    anchors = io.read_anchors(anchor_file)
    mask_color = object_color[obj_id]
    mask_obj = np.asarray((mask == mask_color)[:, :, 0])

    im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
    mask = im_processing.resize_and_pad(mask_obj, input_H, input_W)

    bbox = im_processing.bboxes_from_masks(np.asarray(mask))
    bbox[:, 2:4] += bbox[:, :2]  # convert [x, y, w, h] to corners [x1, y1, x2, y2]
    label_bbox, true_bbox = processing_tools.preprocess_true_boxes(bbox, input_H, anchors)
    text = text_processing.preprocess_sentence(sent, vocab_dict, T)

    return {
        'text_batch': np.asarray(text),
        'im_batch': np.asarray(im),
        'mask_batch': (mask > 0),
        'sent_batch': [sent],
        'label_bbox': label_bbox,
        'true_bbox': true_bbox
    }
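
# bboxes_from_masks is a project helper; a hedged numpy equivalent of what
# it presumably computes for one binary mask, returning [x, y, w, h] so that
# the corner conversion bbox[:, 2:4] += bbox[:, :2] above applies:
import numpy as np

def bbox_from_mask(mask):
    ys, xs = np.nonzero(mask)
    x1, y1, x2, y2 = xs.min(), ys.min(), xs.max(), ys.max()
    return np.array([[x1, y1, x2 - x1 + 1, y2 - y1 + 1]])

demo = np.zeros((8, 8), dtype=bool)
demo[2:5, 3:7] = True
print(bbox_from_mask(demo))  # [[3 2 4 3]]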
print('testing image %d / %d' % (n_im, num_im))
imname = imlist[n_im]

# Extract visual features from all proposals
im = skimage.io.imread(image_dir + imname)
processed_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
if processed_im.ndim == 2:
    processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
imcrop_val[...] = processed_im.astype(np.float32) - segmodel.vgg_net.channel_mean

for imcrop_name, _, description in flat_query_dict[imname]:
    mask = load_gt_mask(mask_dir + imcrop_name + '.mat').astype(np.float32)
    labels = (mask > 0)
    processed_labels = im_processing.resize_and_pad(mask, input_H, input_W) > 0

    text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, T)
    scores_val = sess.run(scores, feed_dict={
        text_seq_batch: text_seq_val,
        imcrop_batch: imcrop_val
    })
    scores_val = np.squeeze(scores_val)

    # Evaluate the segmentation performance of using bounding box segmentation
    pred_raw = (scores_val >= score_thresh).astype(np.float32)
    predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
    I, U = eval_tools.compute_mask_IU(predicts, labels)
    cum_I += I
    cum_U += U
    this_IoU = I / U
    for n_eval_iou in range(len(eval_seg_iou_list)):
        eval_seg_iou = eval_seg_iou_list[n_eval_iou]
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_path, model_name,
         pre_emb=False):
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_path)
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # truncate long sentences
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'refvos'
    vocab_file = './data/vocabulary_refvos.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, H=H, W=W, mode='eval',
                                       vocab_size=vocab_size, emb_name=emb_name,
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name, H=H, W=W, mode='eval',
                                       vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']

    plt.figure(figsize=[15, 4])
    sorted_video_key = [
        'a9f23c9150', '6cc8bce61a', '03fe6115d4', 'a46012c642', 'c42fdedcdd', 'ee9415c553',
        '7daa6343e6', '4fe6619a47', '0e8a6b63bb', '65e0640a2a', '8939473ea7', 'b05faf54f7',
        '5d2020eff8', 'a00c3fa88e', '44e5d1a969', 'deed0ab4fc', 'b205d868e6', '48d2909d9e',
        'c9ef04fe59', '1e20ceafae', '0f3f8b2b2f', 'b83923fd72', 'cb06f84b6e', '17cba76927',
        '35d5e5149d', '62bf7630b3', '0390fabe58', 'bf2d38aefe', '8b7b57b94d', '8d803e87f7',
        'c16d9a4ade', '1a1dbe153e', 'd975e5f4a9', '226f1e10f7', '6cb5b08d93', '77df215672',
        '466734bc5c', '94fa9bd3b5', 'f2a45acf1c', 'ba8823f2d2', '06cd94d38d', 'b772ac822a',
        '246e38963b', 'b5514f75d8', '188cb4e03d', '3dd327ab4e', '8e2e5af6a8', '450bd2e238',
        '369919ef49', 'a4bce691c6', '64c6f2ed76', '0782a6df7e', '0062f687f1', 'c74fc37224',
        'f7255a57d0', '4f5b3310e3', 'e027ebc228', '30fe0ed0ce', '6a75316e99', 'a2948d4116',
        '8273b59141', 'abae1ce57d', '621487be65', '45dc90f558', '9787f452bf', 'cdcfd9f93a',
        '4f6662e4e0', '853ca85618', '13ca7bbcfd', 'f143fede6f', '92fde455eb', '0b0c90e21a',
        '5460cc540a', '182dbfd6ba', '85968ae408', '541ccb0844', '43115c42b2', '65350fd60a',
        'eb49ce8027', 'e11254d3b9', '20a93b4c54', 'a0fc95d8fc', '696e01387c', 'fef7e84268',
        '72d613f21a', '8c60938d92', '975be70866', '13c3cea202', '4ee0105885', '01c88b5b60',
        '33e8066265', '8dea7458de', 'c280d21988', 'fd8cf868b2', '35948a7fca', 'e10236eb37',
        'a1251195e7', 'b2256e265c', '2b904b76c9', '1ab5f4bbc5', '47d01d34c8', 'd7a38bf258',
        '1a609fa7ee', '218ac81c2d', '9f16d17e42', 'fb104c286f', 'eb263ef128', '37b4ec2e1a',
        '0daaddc9da', 'cd69993923', '31d3a7d2ee', '60362df585', 'd7ff44ea97', '623d24ce2b',
        '6031809500', '54526e3c66', '0788b4033d', '3f4bacb16a', '06a5dfb511', '9f21474aca',
        '7a19a80b19', '9a38b8e463', '822c31928a', 'd1ac0d8b81', 'eea1a45e49', '9f429af409',
        '33c8dcbe09', '9da2156a73', '3be852ed44', '3674b2c70a', '547416bda1', '4037d8305d',
        '29c06df0f2', '1335b16cf9', 'b7b7e52e02', 'bc9ba8917e', 'dab44991de', '9fd2d2782b',
        'f054e28786', 'b00ff71889', 'eeb18f9d47', '559a611d86', 'dea0160a12', '257f7fd5b8',
        'dc197289ef', 'c2bbd6d121', 'f3678388a7', '332dabe378', '63883da4f5', 'b90f8c11db',
        'dce363032d', '411774e9ff', '335fc10235', '7775043b5e', '3e03f623bb', '19cde15c4b',
        'bf4cc89b18', '1a894a8f98', 'f7d7fb16d0', '61fca8cbf1', 'd69812339e', 'ab9a7583f1',
        'e633eec195', '0a598e18a8', 'b3b92781d9', 'cd896a9bee', 'b7928ea5c0', '69c0f7494e',
        'cc1a82ac2a', '39b7491321', '352ad66724', '749f1abdf9', '7f26b553ae', '0c04834d61',
        'd1dd586cfd', '3b72dc1941', '39bce09d8d', 'cbea8f6bea', 'cc7c3138ff', 'd59c093632',
        '68dab8f80c', '1e0257109e', '4307020e0f', '4b783f1fc5', 'ebe7138e58', '1f390d22ea',
        '7a72130f21', 'aceb34fcbe', '9c0b55cae5', 'b58a97176b', '152fe4902a', 'a806e58451',
        '9ce299a510', '97b38cabcc', 'f39c805b54', '0620b43a31', '0723d7d4fe', '7741a0fbce',
        '7836afc0c2', 'a7462d6aaf', '34564d26d8', '31e0beaf99']
    # sorted_video_key = ['6cc8bce61a']

    for vid_ind, vid in enumerate(sorted_video_key):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = args.visdir
            # mask_dir = os.path.join(args.maskdir, str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            # if not os.path.exists(mask_dir):
            #     os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0

            # Process text
            text = np.array(text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break

            for fid in frame_ids:
                frame_id = int(fid)
                if frame_id % 20 != 0:
                    continue
                vis_path = os.path.join(vis_dir, str('{}_{}_{}.png'.format(vid, eid, fid)))
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()

                # im = frame.copy()
                im = frame
                # mask = np.array(frame, dtype=np.float32)
                proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                proc_im_ = proc_im_[:, :, ::-1]
                proc_im_ -= mu

                scores_val, up_val, sigm_val, up_c4 = sess.run(
                    [model.pred, model.up, model.sigm, model.up_c4],
                    feed_dict={
                        model.words: np.expand_dims(text, axis=0),
                        model.im: np.expand_dims(proc_im_, axis=0),
                        model.valid_idx: np.expand_dims(valid_idx, axis=0)
                    })

                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                up_c4 = im_processing.resize_and_crop(sigmoid(np.squeeze(up_c4)),
                                                      frame.shape[0], frame.shape[1])
                sigm_val = im_processing.resize_and_crop(sigmoid(np.squeeze(sigm_val)),
                                                         frame.shape[0], frame.shape[1])
                up_val = np.squeeze(up_val)

                # if (not math.isnan(consitency_score) and consitency_score < 0.3):
                plt.clf()
                plt.subplot(1, 3, 1)
                plt.imshow(frame)
                # NOTE: the original appended str(consitency_score) to the text
                # label, but that variable is only defined inside the commented-out
                # consistency check above and would raise a NameError, so only
                # the expression is shown here.
                plt.text(-0.7, -0.7, exp)
                plt.subplot(1, 3, 2)
                plt.imshow(up_c4)
                plt.subplot(1, 3, 3)
                plt.imshow(sigm_val)
                plt.savefig(vis_path)

                # pred_raw = (up_val >= score_thresh).astype('uint8') * 255
                # pred_raw = (up_val >= score_thresh).astype(np.float32)
                # predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])
                # if dcrf:
                #     # Dense CRF post-processing
                #     sigm_val = np.squeeze(sigm_val) + 1e-7
                #     d = densecrf.DenseCRF2D(W, H, 2)
                #     U = np.expand_dims(-np.log(sigm_val), axis=0)
                #     U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
                #     unary = np.concatenate((U_, U), axis=0)
                #     unary = unary.reshape((2, -1))
                #     d.setUnaryEnergy(unary)
                #     d.addPairwiseGaussian(sxy=3, compat=3)
                #     d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
                #     Q = d.inference(5)
                #     pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype('uint8') * 255
                #     # pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype(np.float32)
                #     # predicts_dcrf = im_processing.resize_and_crop(pred_raw_dcrf, mask.shape[0], mask.shape[1])
                # if visualize:
                #     if dcrf:
                #         cv2.imwrite(vis_path, pred_raw_dcrf)
                #         # np.save(mask_path, np.array(pred_raw_dcrf))
                #         # visualize_seg(vis_path, im, exp, predicts_dcrf)
                #     else:
                #         np.save(mask_path, np.array(sigm_val))
                #         # cv2.imwrite(vis_path, pred_raw)
                #         # visualize_seg(vis_path, im, exp, predicts)
                #         # np.save(mask_path, np.array(pred_raw))
                # I, U = eval_tools.compute_mask_IU(predicts, mask)
                # IU_result.append({'batch_no': n_iter, 'I': I, 'U': U})
                # mean_IoU += float(I) / U
                # cum_I += I
                # cum_U += U
                # msg = 'cumulative IoU = %f' % (cum_I / cum_U)
                # for n_eval_iou in range(len(eval_seg_iou_list)):
                #     eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                #     seg_correct[n_eval_iou] += (I / U >= eval_seg_iou)
                # if dcrf:
                #     I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
                #     mean_dcrf_IoU += float(I_dcrf) / U_dcrf
                #     cum_I_dcrf += I_dcrf
                #     cum_U_dcrf += U_dcrf
                #     msg += '\tcumulative IoU (dcrf) = %f' % (cum_I_dcrf / cum_U_dcrf)
                #     for n_eval_iou in range(len(eval_seg_iou_list)):
                #         eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                #         seg_correct_dcrf[n_eval_iou] += (I_dcrf / U_dcrf >= eval_seg_iou)
                # print(msg)
                seg_total += 1
def test(iter, dataset, visualize, setname, dcrf, mu, tfmodel_folder, model_name,
         pre_emb=False):
    data_folder = './' + dataset + '/' + setname + '_batch/'
    data_prefix = dataset + '_' + setname
    if visualize:
        save_dir = './' + dataset + '/visualization/' + str(iter) + '/'
        if not os.path.isdir(save_dir):
            os.makedirs(save_dir)
    weights = os.path.join(tfmodel_folder, dataset + '_iter_' + str(iter) + '.tfmodel')
    print("Loading trained weights from {}".format(weights))

    score_thresh = 1e-9
    eval_seg_iou_list = [.5, .6, .7, .8, .9]
    cum_I, cum_U = 0, 0
    mean_IoU, mean_dcrf_IoU = 0, 0
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    if dcrf:
        cum_I_dcrf, cum_U_dcrf = 0, 0
        seg_correct_dcrf = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.
    T = 20  # truncate long sentences
    H, W = 320, 320
    vocab_size = 8803 if dataset == 'referit' else 12112
    emb_name = 'referit' if dataset == 'referit' else 'Gref'
    vocab_file = './data/vocabulary_Gref.txt'
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    IU_result = list()

    if pre_emb:
        # use pretrained embedding
        print("Use pretrained Embeddings.")
        model = get_segmentation_model(model_name, H=H, W=W, mode='eval',
                                       vocab_size=vocab_size, emb_name=emb_name,
                                       emb_dir=args.embdir)
    else:
        model = get_segmentation_model(model_name, H=H, W=W, mode='eval',
                                       vocab_size=vocab_size)

    # Load pretrained model
    snapshot_restorer = tf.train.Saver()
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    sess.run(tf.global_variables_initializer())
    snapshot_restorer.restore(sess, weights)

    meta_expression = {}
    with open(args.meta) as meta_file:
        meta_expression = json.load(meta_file)
    videos = meta_expression['videos']

    for vid_ind, vid in reversed(list(enumerate(videos.keys()))):
        print("Running on video {}/{}".format(vid_ind + 1, len(videos.keys())))
        expressions = videos[vid]['expressions']
        # instance_ids = [expression['obj_id'] for expression_id in videos[vid]['expressions']]
        frame_ids = videos[vid]['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            index = int(eid)
            vis_dir = os.path.join(args.visdir, str('{}/{}/'.format(vid, index)))
            mask_dir = os.path.join(args.maskdir, str('{}/{}/'.format(vid, index)))
            if not os.path.exists(vis_dir):
                os.makedirs(vis_dir)
            if not os.path.exists(mask_dir):
                os.makedirs(mask_dir)
            avg_time = 0
            total_frame = 0

            # Process text
            text = np.array(text_processing.preprocess_sentence(exp, vocab_dict, T))
            valid_idx = np.zeros([1], dtype=np.int32)
            for idx in range(text.shape[0]):
                if text[idx] != 0:
                    valid_idx[0] = idx
                    break

            for fid in frame_ids:
                vis_path = os.path.join(vis_dir, str('{}.png'.format(fid)))
                mask_path = os.path.join(mask_dir, str('{}.npy'.format(fid)))
                if os.path.exists(vis_path):
                    continue
                frame = load_frame_from_id(vid, fid)
                if frame is None:
                    continue
                last_time = time.time()

                # im = frame.copy()
                im = frame
                # mask = np.array(frame, dtype=np.float32)
                proc_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, H, W))
                proc_im_ = proc_im.astype(np.float32)
                # proc_im_ = proc_im_[:, :, ::-1]
                proc_im_ -= mu

                scores_val, up_val, sigm_val = sess.run(
                    [model.pred, model.up, model.sigm],
                    feed_dict={
                        model.words: np.expand_dims(text, axis=0),
                        model.im: np.expand_dims(proc_im_, axis=0),
                        model.valid_idx: np.expand_dims(valid_idx, axis=0)
                    })

                # scores_val = np.squeeze(scores_val)
                # pred_raw = (scores_val >= score_thresh).astype(np.float32)
                up_val = np.squeeze(up_val)
                pred_raw = (up_val >= score_thresh).astype('uint8') * 255
                # pred_raw = (up_val >= score_thresh).astype(np.float32)
                # predicts = im_processing.resize_and_crop(pred_raw, mask.shape[0], mask.shape[1])
                if dcrf:
                    # Dense CRF post-processing
                    sigm_val = np.squeeze(sigm_val) + 1e-7
                    d = densecrf.DenseCRF2D(W, H, 2)
                    U = np.expand_dims(-np.log(sigm_val), axis=0)
                    U_ = np.expand_dims(-np.log(1 - sigm_val), axis=0)
                    unary = np.concatenate((U_, U), axis=0)
                    unary = unary.reshape((2, -1))
                    d.setUnaryEnergy(unary)
                    d.addPairwiseGaussian(sxy=3, compat=3)
                    d.addPairwiseBilateral(sxy=20, srgb=3, rgbim=proc_im, compat=10)
                    Q = d.inference(5)
                    pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype('uint8') * 255
                    # pred_raw_dcrf = np.argmax(Q, axis=0).reshape((H, W)).astype(np.float32)
                    # predicts_dcrf = im_processing.resize_and_crop(pred_raw_dcrf, mask.shape[0], mask.shape[1])
                if visualize:
                    if dcrf:
                        cv2.imwrite(vis_path, pred_raw_dcrf)
                        # np.save(mask_path, np.array(pred_raw_dcrf))
                        # visualize_seg(vis_path, im, exp, predicts_dcrf)
                    else:
                        np.save(mask_path, np.array(sigm_val))
                        # cv2.imwrite(vis_path, pred_raw)
                        # visualize_seg(vis_path, im, exp, predicts)
                        # np.save(mask_path, np.array(pred_raw))
                # I, U = eval_tools.compute_mask_IU(predicts, mask)
                # IU_result.append({'batch_no': n_iter, 'I': I, 'U': U})
                # mean_IoU += float(I) / U
                # cum_I += I
                # cum_U += U
                # msg = 'cumulative IoU = %f' % (cum_I / cum_U)
                # for n_eval_iou in range(len(eval_seg_iou_list)):
                #     eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                #     seg_correct[n_eval_iou] += (I / U >= eval_seg_iou)
                # if dcrf:
                #     I_dcrf, U_dcrf = eval_tools.compute_mask_IU(predicts_dcrf, mask)
                #     mean_dcrf_IoU += float(I_dcrf) / U_dcrf
                #     cum_I_dcrf += I_dcrf
                #     cum_U_dcrf += U_dcrf
                #     msg += '\tcumulative IoU (dcrf) = %f' % (cum_I_dcrf / cum_U_dcrf)
                #     for n_eval_iou in range(len(eval_seg_iou_list)):
                #         eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                #         seg_correct_dcrf[n_eval_iou] += (I_dcrf / U_dcrf >= eval_seg_iou)
                # print(msg)
                seg_total += 1
# im_, pad_h, pad_w, scale = resize_and_pad(im, config.input_H, config.input_W)
# processed_im = skimage.img_as_ubyte(im_)
processed_im = skimage.img_as_ubyte(
    im_processing.resize_and_pad(im, config.input_H, config.input_W))
if processed_im.ndim == 2:
    processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))

imcrop_val[0, :] = processed_im.astype(np.float32) - segmodel.channel_mean
imcrop_val = imcrop_val.transpose((0, 3, 1, 2))  # NHWC -> NCHW for Caffe
imcrop_val = imcrop_val[:, ::-1, :, :]           # RGB -> BGR channel order

spatial_val = processing_tools.generate_spatial_batch(config.N, config.featmap_H,
                                                      config.featmap_W)
spatial_val = spatial_val.transpose((0, 3, 1, 2))

text_seq_val[:, 0] = text_processing.preprocess_sentence(query, vocab_dict, config.T)
cont_val = text_processing.create_cont(text_seq_val)

dummy_label = np.zeros((config.N, 1, config.input_H, config.input_W), dtype=np.float32)

# Forward pass to get the response map
net.blobs['language'].data[...] = text_seq_val
net.blobs['cont'].data[...] = cont_val
net.blobs['image'].data[...] = imcrop_val
net.blobs['spatial'].data[...] = spatial_val
net.blobs['label'].data[...] = dummy_label
net.forward()
upscores = net.blobs['upscores'].data[...].copy()
def build_a2d_batches(T, input_H, input_W, video=False):
    """Build data batches of the A2D Sentences dataset.

    Args:
        T: limit on the number of words
        input_H: height of the input frame of the I3D backbone
        input_W: width of the input frame of the I3D backbone
        video: select consecutive frames rather than a standalone frame
    """
    query_file = os.path.join(a2d_dir, 'a2d_annotation.txt')
    frame_dir = os.path.join(a2d_dir, 'Release/frames')
    vocab_file = os.path.join(root_dir, 'data/vocabulary_Gref.txt')

    dataset_name = 'a2d_sent_new'
    out_dataset_dir = os.path.join(root_dir, dataset_name)
    if not os.path.exists(out_dataset_dir):
        os.mkdir(out_dataset_dir)
    test_batch = os.path.join(out_dataset_dir, 'test_batch')
    train_batch = os.path.join(out_dataset_dir, 'train_batch')
    if not os.path.exists(test_batch):
        os.mkdir(test_batch)
    if not os.path.exists(train_batch):
        os.mkdir(train_batch)

    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)
    test_prefix_list = list()
    train_prefix_list = list()
    split_dict = gen_split_dict()
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

    with open(query_file, 'r') as f:
        reader = csv.reader(f)
        next(reader)
        total_count = 0
        test_count = 0
        train_count = 0
        all_zero_mask_count = 0
        for row in tqdm(reader):
            # each video belongs to test or train
            video_id = row[0]
            data_prefix = video_id
            if split_dict[data_prefix] == 1:
                save_dir = test_batch
                test_prefix_list.append(data_prefix)
                test = True
            else:
                save_dir = train_batch
                train_prefix_list.append(data_prefix)
                test = False

            # load sentence
            instance_id = int(row[1])
            sent = row[2].lower()
            words = SENTENCE_SPLIT_REGEX.split(sent.strip())
            words = [w for w in words if len(w.strip()) > 0]
            # remove trailing punctuation and restrict the sentence to T words
            if words[-1] == '.':
                words = words[:-1]
            if len(words) > T:
                words = words[:T]
            n_sent = ""
            for w in words:
                n_sent = n_sent + w + ' '
            n_sent = n_sent.strip()
            n_sent = n_sent.encode('utf-8').decode("utf-8")
            text = text_processing.preprocess_sentence(n_sent, vocab_dict, T)

            image_paths = list()
            # for each video, get all the gt masks of a certain instance
            masks, frame_ids = get_masks(video_id, instance_id)
            for frame_id in frame_ids:
                image_path = os.path.join(frame_dir, video_id,
                                          '{:0>5d}.png'.format(frame_id))
                image_paths.append(image_path)

            for frame_id, image_path, mask in zip(frame_ids, image_paths, masks):
                # abandon all-zero mask batches
                if np.sum(mask) == 0:
                    print("all zeros mask caught")
                    all_zero_mask_count += 1
                    continue
                if video:
                    # obtain 16 consecutive frames centered at the gt frame
                    frame_paths = frame_range(frame_id=frame_id,
                                              frame_dir=os.path.join(frame_dir, video_id))
                else:
                    # only use the gt frame
                    frame_paths = list()
                frames = list()
                if test:
                    count = test_count
                    test_count = test_count + 1
                    prefix = 'test_'
                    image = skimage.io.imread(image_path)
                    for frame_path in frame_paths:
                        frames.append(skimage.io.imread(frame_path))
                else:
                    prefix = 'train_'
                    count = train_count
                    train_count = train_count + 1
                    image = skimage.io.imread(image_path)
                    image = skimage.img_as_ubyte(
                        im_processing.resize_and_pad(image, input_H, input_W))
                    mask = im_processing.resize_and_pad(mask, input_H, input_W)
                    for frame_path in frame_paths:
                        frame = skimage.io.imread(frame_path)
                        frame = skimage.img_as_ubyte(
                            im_processing.resize_and_pad(frame, input_H, input_W))
                        frames.append(frame)

                if debug:
                    m0 = mask[:, :, np.newaxis]
                    m0 = (m0 > 0).astype(np.uint8)
                    m0 = np.concatenate([m0, m0, m0], axis=2)
                    debug_image = image * m0
                    skimage.io.imsave(
                        './debug/{}_{}_{}.png'.format(data_prefix, frame_id,
                                                      sent.replace(' ', '_')),
                        debug_image)

                # save batches
                np.savez(file=os.path.join(save_dir,
                                           dataset_name + '_' + prefix + str(count)),
                         text_batch=text,
                         mask_batch=(mask > 0),
                         sent_batch=[sent],
                         im_batch=image,
                         frame_id=frame_id,
                         frames=frames)
                total_count = total_count + 1
    print()
    print("num of all zeros masks is: {}".format(all_zero_mask_count))
def build_refvos_batch(setname, T, input_H, input_W, im_dir, mask_dir,
                       meta_expressions, save_dir, inrange=None):
    vocab_file = './data/vocabulary_Gref.txt'
    print(save_dir)

    # saving directory
    data_folder = os.path.join(save_dir, 'refvos/' + setname + '_batch/')
    data_prefix = 'refvos_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations
    query_dict = json.load(open(meta_expressions))
    videos = query_dict['videos']
    samples = []
    for vid in videos:
        video = videos[vid]
        expressions = video['expressions']
        frames = video['frames']
        for eid in expressions:
            exp = expressions[eid]['exp']
            obj_id = expressions[eid]['obj_id']
            for fid in frames:
                im_name = os.path.join(vid, fid + '.jpg')
                mask_name = os.path.join(vid, fid + '.png')
                samples.append((im_name, mask_name, exp, obj_id))
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # save batches to disk
    num_batch = len(samples)
    batch_ind = 0
    if inrange is None:
        inrange = range(num_batch)
    for n_batch in inrange:
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent, obj_id = samples[n_batch]
        im_path = os.path.join(im_dir, im_name)
        mask_path = os.path.join(mask_dir, mask_name)
        if not (os.path.exists(im_path) and os.path.exists(mask_path)):
            continue
        im = skimage.io.imread(im_path)
        mask = skimage.io.imread(mask_path)[:, :, :3]
        mask_color = object_color[obj_id]
        mask_obj = np.asarray(mask == mask_color)
        if len(mask_obj.shape) == 0:
            continue
        mask_obj = mask_obj[:, :, 0]
        if np.max(mask_obj) == 0:
            print(im_name)
            continue

        if 'train' in setname:
            im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask_obj, input_H, input_W)
        else:
            # NOTE: without this branch, `mask` would still hold the raw color
            # mask image outside the train split, and mask_batch below would
            # not be the binary object mask.
            mask = mask_obj
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        text = text_processing.preprocess_sentence(sent, vocab_dict, T)
        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
        batch_ind += 1
def get_batch(self, split='train', shuffle=True, echo=True, image_id=None):
    if image_id is None:
        batch_list = self.batch_list[split][:]
        if len(batch_list) == 0:
            batch_list = self.image_split_ids[split][:]
            self.epoch[split] += 1
            if shuffle:
                random.shuffle(batch_list)
        if echo:
            print('data reader: epoch = %d, batch = %d / %d' %
                  (self.epoch[split],
                   len(self.image_split_ids[split]) - len(batch_list),
                   len(self.image_split_ids[split])))
        image_id = batch_list.pop(0)
        self.batch_list[split] = batch_list

    ann_ids = self.image_to_anns[image_id]
    batch = {}
    batch['im_id'] = image_id

    # coco_bboxes, category_batch
    coco_bboxes = []
    vis_batch = []
    spa_batch = []
    if self.use_category:
        category_batch = []
        visdif_batch = []
        spadif_batch = []
    for ann_id in self.image_to_anns[image_id]:
        coco_bboxes.append(self.ann_to_box[ann_id])
        vis_batch.append(self.ann_vis_feats[ann_id])
        spa_batch.append(self.ann_spa_feats[ann_id])
        if self.use_category:
            category_batch.append(self.ann_to_cat[ann_id])
            visdif_batch.append(self.ann_visdif_feats[ann_id])
            spadif_batch.append(self.ann_spadif_feats[ann_id])
    batch['coco_bboxes'] = np.array(coco_bboxes, dtype=np.float32)
    batch['vis_batch'] = np.array(vis_batch, dtype=np.float32)
    batch['spa_batch'] = np.array(spa_batch, dtype=np.float32)
    if self.use_category:
        batch['category_batch'] = np.array(category_batch, dtype=np.int32)
        batch['visdif_batch'] = np.array(visdif_batch, dtype=np.float32)
        batch['spadif_batch'] = np.array(spadif_batch, dtype=np.float32)

    # coco_ann_ids, label_batch
    coco_ann_ids = []
    label_batch = []
    questions = []
    text_zseq_batch = []  # zero + seq, for comprehension
    text_seqz_batch = []  # seq + zero, for generation
    for ref_id in self.image_to_refs[image_id]:
        ref = self.refs[ref_id]
        ann_id = self.ref_to_ann[ref_id]
        if ref['split'] == split:
            for sent_id in ref['sent_ids']:
                sent = self.sents[sent_id]['sent']
                # refine sentence
                coco_ann_ids.append(ann_id)
                label_batch.append(ann_ids.index(ann_id))
                questions.append(sent)
                text_zseq_batch.append(
                    text_processing.preprocess_sentence(sent, self.vocab_dict,
                                                        T=20, mode='zseq'))
                text_seqz_batch.append(
                    text_processing.preprocess_sentence(sent, self.vocab_dict,
                                                        T=20, mode='seqz'))
    text_zseq_batch = np.array(text_zseq_batch, dtype=np.int32).T
    text_seqz_batch = np.array(text_seqz_batch, dtype=np.int32).T

    batch['coco_ann_ids'] = coco_ann_ids
    batch['label_batch'] = np.array(label_batch, dtype=np.int32)
    batch['questions'] = questions
    batch['text_zseq_batch'] = np.array(text_zseq_batch, dtype=np.int32)
    batch['text_seqz_batch'] = np.array(text_seqz_batch, dtype=np.int32)
    return batch
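
# The 'zseq'/'seqz' modes are not shown in this file; judging by the comments
# above ('zero + seq' vs 'seq + zero'), they zero-pad the token ids at the
# front or at the back of the length-T window. A hedged sketch of that
# assumption:
import numpy as np

def pad_sequence(token_ids, T, mode='zseq'):
    seq = np.zeros(T, dtype=np.int32)
    token_ids = token_ids[:T]
    if mode == 'zseq':   # left-pad: tokens end at position T-1 (comprehension)
        seq[T - len(token_ids):] = token_ids
    else:                # 'seqz', right-pad: tokens start at 0 (generation)
        seq[:len(token_ids)] = token_ids
    return seq

print(pad_sequence([7, 42, 3], 6, 'zseq'))  # [ 0  0  0  7 42  3]
print(pad_sequence([7, 42, 3], 6, 'seqz'))  # [ 7 42  3  0  0  0]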
def inference():
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', test_config.N)))

    caffe.set_device(test_config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt', test_config.pretrained_model, caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(test_config.query_file))
    bbox_dict = json.load(open(test_config.bbox_file))
    imcrop_dict = json.load(open(test_config.imcrop_file))
    imsize_dict = json.load(open(test_config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(test_config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((test_config.N, test_config.input_H, test_config.input_W, 3),
                          dtype=np.float32)
    text_seq_val = np.zeros((test_config.T, test_config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(test_config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, test_config.input_H, test_config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(test_config.N,
                                                              test_config.featmap_H,
                                                              test_config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(test_config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, test_config.input_H,
                                                            test_config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description,
                                                                     vocab_dict,
                                                                     test_config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels
            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the segmentation performance of using bounding box segmentation
            pred_raw = (upscores >= test_config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (I / float(U) >= eval_seg_iou)
            seg_total += 1

    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I / cum_U)
    print(result_str)
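
# eval_tools.compute_mask_IU returns the raw intersection and union pixel
# counts that the cumulative IoU above is built from; a hedged numpy
# equivalent for two boolean masks of equal shape:
import numpy as np

def compute_mask_IU_sketch(pred, gt):
    I = np.sum(np.logical_and(pred, gt))
    U = np.sum(np.logical_or(pred, gt))
    return I, U

a = np.zeros((4, 4), dtype=bool); a[:2, :2] = True  # 4 pixels
b = np.zeros((4, 4), dtype=bool); b[:2, :] = True   # 8 pixels, contains a
I, U = compute_mask_IU_sketch(a, b)
print(I, U, float(I) / U)  # 4 8 0.5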
def inference(config):
    with open('./det_model/fc8.prototxt', 'w') as f:
        f.write(str(det_model.generate_fc8('val', config)))
    with open('./det_model/scores.prototxt', 'w') as f:
        f.write(str(det_model.generate_scores('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    fc8_net = caffe.Net('./det_model/fc8.prototxt', config.pretrained_model, caffe.TEST)
    scores_net = caffe.Net('./det_model/scores.prototxt', config.pretrained_model,
                           caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    # Object proposals
    bbox_proposal_dict = {}
    for imname in imlist:
        bboxes = np.loadtxt(config.bbox_proposal_dir + imname[:-4] + '.txt') \
            .astype(int).reshape((-1, 4))
        bbox_proposal_dict[imname] = bboxes

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    eval_bbox_num_list = [1, 10, 100]
    bbox_correct = np.zeros(len(eval_bbox_num_list), dtype=np.int32)
    bbox_total = 0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    spatial_val = np.zeros((config.N, 8), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_text_seq = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_cont = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_label = np.zeros((config.N, 1))

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]
        imsize = imsize_dict[imname]
        bbox_proposals = bbox_proposal_dict[imname]
        num_proposal = bbox_proposals.shape[0]
        assert config.N >= num_proposal

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[:num_proposal, ...] = im_processing.crop_bboxes_subtract_mean(
            im, bbox_proposals, config.input_H, det_model.channel_mean)
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract bounding box features from proposals
        spatial_val[:num_proposal, ...] = \
            processing_tools.spatial_feature_from_bbox(bbox_proposals, imsize)

        fc8_net.blobs['language'].data[...] = dummy_text_seq
        fc8_net.blobs['cont'].data[...] = dummy_cont
        fc8_net.blobs['image'].data[...] = imcrop_val_trans
        fc8_net.blobs['spatial'].data[...] = spatial_val
        fc8_net.blobs['label'].data[...] = dummy_label
        fc8_net.forward()
        fc8_val = fc8_net.blobs['fc8'].data[...].copy()

        # Extract textual features from sentences
        for imcrop_name, gt_bbox, description in flat_query_dict[imname]:
            proposal_IoUs = eval_tools.compute_bbox_iou(bbox_proposals, gt_bbox)

            # Extract language feature
            text = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            text_seq_val[...] = np.array(text, dtype=np.int32).reshape((-1, 1))
            cont_val = text_processing.create_cont(text_seq_val)

            scores_net.blobs['language'].data[...] = text_seq_val
            scores_net.blobs['cont'].data[...] = cont_val
            scores_net.blobs['img_feature'].data[...] = fc8_val
            scores_net.blobs['spatial'].data[...] = spatial_val
            scores_net.blobs['label'].data[...] = dummy_label
            scores_net.forward()
            scores_val = scores_net.blobs['scores'].data.copy()
            scores_val = scores_val[:num_proposal, ...].reshape(-1)

            # Sort the scores for the proposals
            if config.use_nms:
                # NOTE: the original passed an undefined name `proposal` here;
                # bbox_proposals is clearly what was meant.
                top_ids = eval_tools.nms(bbox_proposals.astype(np.float32),
                                         scores_val, config.nms_thresh)
            else:
                top_ids = np.argsort(scores_val)[::-1]

            # Evaluate on bounding boxes
            for n_eval_num in range(len(eval_bbox_num_list)):
                eval_bbox_num = eval_bbox_num_list[n_eval_num]
                bbox_correct[n_eval_num] += \
                    np.any(proposal_IoUs[top_ids[:eval_bbox_num]] >= config.correct_iou_thresh)
            bbox_total += 1

    print('Final results on the whole test set')
    result_str = ''
    for n_eval_num in range(len(eval_bbox_num_list)):
        result_str += 'recall@%s = %f\n' % \
            (str(eval_bbox_num_list[n_eval_num]), bbox_correct[n_eval_num] / bbox_total)
    print(result_str)
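
# eval_tools.nms is project code; for reference, a hedged sketch of the
# standard greedy NMS it presumably implements: keep boxes in score order,
# suppressing any remaining box whose IoU with a kept box exceeds thresh.
import numpy as np

def nms_sketch(boxes, scores, thresh):
    # boxes are [x1, y1, x2, y2] rows; returns kept indices, best first
    boxes = boxes.astype(np.float64)
    x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = np.argsort(scores)[::-1]
    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1 = np.maximum(x1[i], x1[order[1:]])
        yy1 = np.maximum(y1[i], y1[order[1:]])
        xx2 = np.minimum(x2[i], x2[order[1:]])
        yy2 = np.minimum(y2[i], y2[order[1:]])
        inter = np.maximum(0.0, xx2 - xx1 + 1) * np.maximum(0.0, yy2 - yy1 + 1)
        iou = inter / (areas[i] + areas[order[1:]] - inter)
        order = order[1:][iou <= thresh]
    return np.array(keep)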
imname = imcrop_name.split('_', 1)[0] + '.jpg'
mask_name = imcrop_name + '.mat'
im = skimage.io.imread(image_dir + imname)
mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

processed_im = skimage.img_as_ubyte(im_processing.resize_and_pad(im, input_H, input_W))
if processed_im.ndim == 2:
    processed_im = processed_im[:, :, np.newaxis]

processed_mask = im_processing.resize_and_pad(mask, input_H, input_W)
subsampled_mask = skimage.transform.downscale_local_mean(processed_mask, (32, 32))
labels_fine = (processed_mask > 0)
labels_coarse = (subsampled_mask > 0)

for description in query_dict[imcrop_name]:
    text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)
    training_samples.append((processed_im, text_seq, labels_coarse, labels_fine))

# Shuffle the training instances
np.random.seed(3)
shuffle_idx = np.random.permutation(len(training_samples))
shuffled_training_samples = [training_samples[n] for n in shuffle_idx]
print('total training instance number: %d' % len(shuffled_training_samples))

# Create training batches
num_batch = len(shuffled_training_samples) // N
print('total batch number: %d' % num_batch)

################################################################################
# Save training samples to disk
################################################################################
def main(args):
    ################################################################################
    # Validate input arguments
    ################################################################################

    assert not (args.concat and (not args.multicrop)), \
        "Cannot test concatenated labels on single image crop per batch."
    assert not (args.classes and args.concat), \
        "Cannot test concatenated labels when using image classes"
    assert not (args.classes and (not args.multicrop)), \
        "Cannot test on single image per batch when using image classes"

    # Initialize GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = args.GPU_ID

    # print mode
    print()
    print("Model:", pretrained_model)
    print("All crops per batch - True | First crop per batch - False:", args.multicrop)
    print("Concatenated captions - True | Simple captions - False:", args.concat)
    print("Image Classes - True | Image Descriptions - False:", args.classes)
    print()

    ################################################################################
    # Evaluation network
    ################################################################################

    # Inputs
    text_seq_batch = tf.placeholder(tf.int32, [T, N])
    imcrop_batch = tf.placeholder(tf.float32, [N, 224, 224, 3])
    lstm_top_batch = tf.placeholder(tf.float32, [N, D_text])
    fc8_crop_batch = tf.placeholder(tf.float32, [N, D_im])

    # Language feature (LSTM hidden state)
    lstm_top = lstm_net.lstm_net(text_seq_batch, num_vocab, embed_dim, lstm_dim)

    # Local image feature
    fc8_crop = vgg_net.vgg_fc8(imcrop_batch, 'vgg_local', apply_dropout=False)

    # L2-normalize the features (except for spatial_batch)
    # and concatenate them along axis 1 (feature dimension)
    feat_all = tf.concat(axis=1, values=[tf.nn.l2_normalize(lstm_top_batch, 1),
                                         tf.nn.l2_normalize(fc8_crop_batch, 1)])

    # Outputs: MLP classifier over the concatenated feature
    with tf.variable_scope('classifier'):
        mlp_l1 = fc_relu('mlp_l1', feat_all, output_dim=mlp_hidden_dims)
        mlp_l2 = fc('mlp_l2', mlp_l1, output_dim=1)
    scores = mlp_l2

    # Load pretrained model
    snapshot_restorer = tf.train.Saver(None)
    sess = tf.Session()
    snapshot_restorer.restore(sess, pretrained_model)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    coco = COCO(query_file)
    coco_captions = COCO(caption_file)
    imgid_list = coco.getImgIds()
    catid_list = coco.getCatIds()

    ################################################################################
    # Load testing data
    ################################################################################

    testing_samples_pos = []
    testing_samples_neg = []
    num_imcrop = len(imgid_list)

    # Gather testing examples per full image.
    for n_imcrop in range(num_imcrop):
        # image
        img_id = imgid_list[n_imcrop]

        # get the descriptions of the image
        caption_ids = coco_captions.getAnnIds(imgIds=img_id)
        captions = [x['caption'].strip() for x in coco_captions.loadAnns(caption_ids)]

        if args.concat:
            # append two positive captions; one with itself if only one is present
            pos_desc = captions[0] + ' and ' + captions[len(captions) - 1]
            testing_samples_pos.append((img_id, pos_desc, 1))

            # form negative examples by choosing a random image that is not
            # the current image, getting its descriptions, and choosing one
            # at random
            false_idx = n_imcrop
            while false_idx == n_imcrop:
                false_idx = randint(0, num_imcrop - 1)
            desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
            desc_idx = randint(0, len(desc_ids) - 1)
            neg_desc1 = coco_captions.loadAnns(desc_ids[desc_idx])[0]['caption'].strip()

            false_idx = n_imcrop
            while false_idx == n_imcrop:
                false_idx = randint(0, num_imcrop - 1)
            desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
            desc_idx = randint(0, len(desc_ids) - 1)
            neg_desc2 = coco_captions.loadAnns(desc_ids[desc_idx])[0]['caption'].strip()

            # negative example: append two negative captions
            neg_desc = neg_desc1 + ' and ' + neg_desc2
            testing_samples_neg.append((img_id, neg_desc, 0))

            # negative examples: append one negative and one positive caption,
            # in both orders
            neg_desc = neg_desc1 + ' and ' + captions[0].strip()
            testing_samples_neg.append((img_id, neg_desc, 0))
            neg_desc = captions[0].strip() + ' and ' + neg_desc1
            testing_samples_neg.append((img_id, neg_desc, 0))

        # for appending image classes
        elif args.classes:
            img_catids = coco.getCatIds(imgIds=img_id)
            img_cat_names = [cat['name'] for cat in coco.loadCats(img_catids)]
            for category in img_cat_names:
                testing_samples_pos.append((img_id, category, 1))

                # form one negative example by choosing a random category
                # that the image is not in
                false_catid = img_catids[0]
                while false_catid in img_catids:
                    false_catid = catid_list[randint(0, len(catid_list) - 1)]
                false_cat_name = coco.loadCats(false_catid)[0]['name']
                testing_samples_neg.append((img_id, false_cat_name, 0))
        else:
            for caption in captions:
                # append one positive sample per description
                testing_samples_pos.append((img_id, caption, 1))

                # form one negative example by choosing a random image that is
                # not the current image, getting its descriptions, and choosing
                # one at random
                false_idx = n_imcrop
                while false_idx == n_imcrop:
                    false_idx = randint(0, num_imcrop - 1)
                desc_ids = coco_captions.getAnnIds(imgid_list[false_idx])
                desc_idx = randint(0, len(desc_ids) - 1)
                false_cap = coco_captions.loadAnns(desc_ids[desc_idx])[0]['caption'].strip()
                testing_samples_neg.append((img_id, false_cap, 0))

    # Combine samples
    print('#pos=', len(testing_samples_pos))
    print('#neg=', len(testing_samples_neg))

    # TODO: Not exactly sure what multicrop is testing here. Does it just
    # remove the positive examples from being tested? How is this useful?
    if args.multicrop:
        testing_samples = testing_samples_pos + testing_samples_neg
    else:
        testing_samples = testing_samples_neg
    print('#total testing examples=', len(testing_samples))

    num_batch = len(testing_samples) // N
    print('total batch number: %d' % num_batch)

    ################################################################################
    # Testing
    ################################################################################

    # Pre-allocate arrays
    imcrop_val = np.zeros((N, 224, 224, 3), dtype=np.float32)
    text_seq_val = np.zeros((T, N), dtype=np.int32)
    lstm_top_val = np.zeros((N, D_text))
    label_val = np.zeros((N, 1), dtype=bool)

    correct_predictions = 0
    total_predictions = 0

    # optimization for faster image loading
    last_img_id = -100
    last_imcrop = None

    for n_batch in range(num_batch):
        print('batch %d / %d' % (n_batch + 1, num_batch))
        batch_begin = n_batch * N
        batch_end = (n_batch + 1) * N

        # load and preprocess the last image from the previous batch
        first_img_id = testing_samples[max(batch_begin - 1, 0)][0]
        first_imname = coco.loadImgs(first_img_id)[0]['coco_url']
        first_im = skimage.io.imread(first_imname)
        first_imcrop = skimage.img_as_ubyte(skimage.transform.resize(first_im, [224, 224]))
        if len(np.shape(first_im)) != 3:
            continue

        for n_sample in range(batch_begin, batch_end):
            img_id, description, label = testing_samples[n_sample]

            # Preprocess image and caption
            if args.multicrop:
                # Optimization: do not reload the image if it is the same as the last one
                if img_id == last_img_id:
                    imcrop = last_imcrop
                else:
                    imname = coco.loadImgs(img_id)[0]['coco_url']
                    im = skimage.io.imread(imname)
                    # ignore grayscale images
                    if len(np.shape(im)) != 3:
                        continue
                    imcrop = skimage.img_as_ubyte(skimage.transform.resize(im, [224, 224]))
                    last_img_id = img_id
                    last_imcrop = imcrop
            else:
                imcrop = first_imcrop
            text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)

            # Form batch
            idx = n_sample - batch_begin
            text_seq_val[:, idx] = text_seq
            imcrop_val[idx, ...] = imcrop - vgg_net.channel_mean
            label_val[idx] = label

        # Extract visual feature
        fc8_crop_val = sess.run(fc8_crop, feed_dict={imcrop_batch: imcrop_val})

        # Extract language feature
        lstm_top_val[...] = sess.run(lstm_top, feed_dict={text_seq_batch: text_seq_val})

        # Compute scores per proposal
        scores_val = sess.run(scores, feed_dict={
            lstm_top_batch: lstm_top_val,
            fc8_crop_batch: fc8_crop_val
        })
        # NOTE: the original sliced [:batch_end - batch_begin + 1]; the batch
        # holds exactly batch_end - batch_begin samples, so the +1 is dropped.
        scores_val = scores_val[:batch_end - batch_begin, ...].reshape(-1)

        # Evaluate against the labels
        for indx in range(len(scores_val)):
            correct_predictions += ((scores_val[indx] > 0) == label_val[indx])
            total_predictions += 1
        print("%d correct predictions out of %d" % (correct_predictions, total_predictions))
        print(correct_predictions / total_predictions)

    print('Final results on the whole test set')
    result_str = 'recall = %0.4f \n' % (float(correct_predictions) / total_predictions)
    print(result_str)
def inference(config):
    with open('./det_model/fc8.prototxt', 'w') as f:
        f.write(str(det_model.generate_fc8('val', config)))
    with open('./det_model/scores.prototxt', 'w') as f:
        f.write(str(det_model.generate_scores('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    fc8_net = caffe.Net('./det_model/fc8.prototxt',
                        config.pretrained_model,
                        caffe.TEST)
    scores_net = caffe.Net('./det_model/scores.prototxt',
                           config.pretrained_model,
                           caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    # Object proposals
    bbox_proposal_dict = {}
    for imname in imlist:
        bboxes = np.loadtxt(config.bbox_proposal_dir + imname[:-4] + '.txt')\
            .astype(int).reshape((-1, 4))
        bbox_proposal_dict[imname] = bboxes

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    eval_bbox_num_list = [1, 10, 100]
    bbox_correct = np.zeros(len(eval_bbox_num_list), dtype=np.int32)
    bbox_total = 0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    spatial_val = np.zeros((config.N, 8), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)
    # Dummy inputs to satisfy the prototxt; the fc8 net only needs visual inputs
    dummy_text_seq = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_cont = np.zeros((config.T, config.N), dtype=np.int32)
    dummy_label = np.zeros((config.N, 1))

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]
        imsize = imsize_dict[imname]
        bbox_proposals = bbox_proposal_dict[imname]
        num_proposal = bbox_proposals.shape[0]
        assert config.N >= num_proposal

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[:num_proposal, ...] = im_processing.crop_bboxes_subtract_mean(
            im, bbox_proposals, config.input_H, det_model.channel_mean)
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features from the proposal boxes
        spatial_val[:num_proposal, ...] = \
            processing_tools.spatial_feature_from_bbox(bbox_proposals, imsize)

        # Run the visual branch once per image, reusing fc8_val for every query
        fc8_net.blobs['language'].data[...] = dummy_text_seq
        fc8_net.blobs['cont'].data[...] = dummy_cont
        fc8_net.blobs['image'].data[...] = imcrop_val_trans
        fc8_net.blobs['spatial'].data[...] = spatial_val
        fc8_net.blobs['label'].data[...] = dummy_label
        fc8_net.forward()
        fc8_val = fc8_net.blobs['fc8'].data[...].copy()

        # Score every proposal against each sentence for this image
        for imcrop_name, gt_bbox, description in flat_query_dict[imname]:
            proposal_IoUs = eval_tools.compute_bbox_iou(bbox_proposals, gt_bbox)

            # Extract language feature
            text = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            text_seq_val[...] = np.array(text, dtype=np.int32).reshape((-1, 1))
            cont_val = text_processing.create_cont(text_seq_val)

            scores_net.blobs['language'].data[...] = text_seq_val
            scores_net.blobs['cont'].data[...] = cont_val
            scores_net.blobs['img_feature'].data[...] = fc8_val
            scores_net.blobs['spatial'].data[...] = spatial_val
            scores_net.blobs['label'].data[...] = dummy_label
            scores_net.forward()
            scores_val = scores_net.blobs['scores'].data.copy()
            # Only the first num_proposal rows are valid; the rest are padding
            scores_val = scores_val[:num_proposal, ...].reshape(-1)

            # Rank the proposals by score
            if config.use_nms:
                top_ids = eval_tools.nms(bbox_proposals.astype(np.float32),
                                         scores_val, config.nms_thresh)
            else:
                top_ids = np.argsort(scores_val)[::-1]

            # Evaluate on bounding boxes
            for n_eval_num in range(len(eval_bbox_num_list)):
                eval_bbox_num = eval_bbox_num_list[n_eval_num]
                bbox_correct[n_eval_num] += \
                    np.any(proposal_IoUs[top_ids[:eval_bbox_num]] >= config.correct_iou_thresh)
            bbox_total += 1

    print('Final results on the whole test set')
    result_str = ''
    for n_eval_num in range(len(eval_bbox_num_list)):
        result_str += 'recall@%s = %f\n' % \
            (str(eval_bbox_num_list[n_eval_num]), bbox_correct[n_eval_num] / bbox_total)
    print(result_str)
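# ------------------------------------------------------------------------------
# For reference, the recall@k computed above reduces to: a query counts as
# correct at k if any of the k highest-scoring proposals overlaps the ground
# truth with IoU >= threshold. A minimal numpy sketch follows, assuming boxes
# are [x1, y1, x2, y2] with inclusive coordinates (hence the +1 terms, matching
# the convention used elsewhere in this code); bbox_iou is a stand-in for
# eval_tools.compute_bbox_iou, not the actual implementation.
# ------------------------------------------------------------------------------
import numpy as np

def bbox_iou(boxes, gt_box):
    """IoU between each row of `boxes` (N x 4) and a single `gt_box` (4,)."""
    x1 = np.maximum(boxes[:, 0], gt_box[0])
    y1 = np.maximum(boxes[:, 1], gt_box[1])
    x2 = np.minimum(boxes[:, 2], gt_box[2])
    y2 = np.minimum(boxes[:, 3], gt_box[3])
    inter = np.maximum(x2 - x1 + 1, 0) * np.maximum(y2 - y1 + 1, 0)
    area_boxes = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
    area_gt = (gt_box[2] - gt_box[0] + 1) * (gt_box[3] - gt_box[1] + 1)
    return inter / (area_boxes + area_gt - inter).astype(np.float64)

def recall_at_k(scores, boxes, gt_box, k, iou_thresh=0.5):
    """1.0 if any of the top-k scored boxes has IoU >= iou_thresh, else 0.0."""
    top_ids = np.argsort(scores)[::-1][:k]
    return float(np.any(bbox_iou(boxes[top_ids], gt_box) >= iou_thresh))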
def inference(config):
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3), dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Preprocess the image; the same image is broadcast across the batch
        im = skimage.io.imread(config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, config.input_H, config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(config.N,
                                                              config.featmap_H,
                                                              config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            # Ground-truth mask at the original resolution (for evaluation)
            # and at the network input resolution (for the label blob)
            mask = load_gt_mask(config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, config.input_H, config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels
            net.forward()
            upscores = np.squeeze(net.blobs['upscores'].data[...].copy())

            # Evaluate the segmentation at the original image resolution
            pred_raw = (upscores >= config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1

    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I / cum_U)
    print(result_str)
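# ------------------------------------------------------------------------------
# The segmentation metrics above aggregate in two ways: overall IoU pools
# intersection and union over the whole test set (cum_I / cum_U), while
# precision@X counts the fraction of queries whose per-query IoU clears the
# threshold X. A minimal sketch of both follows; compute_mask_iu is a stand-in
# for eval_tools.compute_mask_IU, not the actual implementation.
# ------------------------------------------------------------------------------
import numpy as np

def compute_mask_iu(pred, gt):
    """Intersection and union pixel counts of two same-shape binary masks."""
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    return np.sum(pred & gt), np.sum(pred | gt)

def aggregate_seg_metrics(mask_pairs, thresholds=(0.5, 0.6, 0.7, 0.8, 0.9)):
    """mask_pairs: iterable of (pred_mask, gt_mask) pairs.
    Returns (overall IoU, {threshold: precision@threshold})."""
    cum_i, cum_u = 0.0, 0.0
    correct = np.zeros(len(thresholds))
    total = 0
    for pred, gt in mask_pairs:
        i, u = compute_mask_iu(pred, gt)
        cum_i += i
        cum_u += u
        iou = i / float(u) if u > 0 else 0.0
        correct += (iou >= np.asarray(thresholds))
        total += 1
    return cum_i / cum_u, {t: c / total for t, c in zip(thresholds, correct)}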