def build_referit_batches(setname, T, input_H, input_W):
    # data directory
    im_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/images/'
    mask_dir = '/data/ryli/text_objseg/exp-referit/referit-dataset/mask/'
    query_file = './data/referit/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)
    fp = open('./referit/trainval_list.txt', 'w')

    # load annotations
    query_dict = json.load(open(query_file))
    im_list = query_dict.keys()
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for n_im, name in enumerate(im_list):
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    num_batch = len(samples)
    for n_batch in range(num_batch):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent = samples[n_batch]
        fp.write('%d\t%s%s\n' % (n_batch, im_dir, im_name))
        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)
        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
        text = text_processing.preprocess_sentence(sent, vocab_dict, T)
        np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[sent])
    fp.close()
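# ------------------------------------------------------------------------------
# load_gt_mask is called above but not defined in this snippet. Below is a
# minimal sketch of what it typically looks like for the ReferIt .mat masks;
# the key name 'segimg_t' and the "0 marks the region" convention are
# assumptions, not confirmed by the code above.
# ------------------------------------------------------------------------------
import scipy.io

def load_gt_mask(mask_path):
    # Load the annotation .mat file and binarize it into a boolean mask.
    mat = scipy.io.loadmat(mask_path)
    mask = (mat['segimg_t'] == 0)  # assumed key and convention; adjust to your data
    return mask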
imcrop_val = np.zeros((N, input_H, input_W, 3), dtype=np.float32)

num_im = len(imlist)
for n_im in range(num_im):
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]

    # Extract visual features from all proposals
    im = skimage.io.imread(image_dir + imname)
    processed_im = skimage.img_as_ubyte(
        im_processing.resize_and_pad(im, input_H, input_W))
    if processed_im.ndim == 2:
        processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
    imcrop_val[...] = processed_im.astype(np.float32) - segmodel.vgg_net.channel_mean

    for imcrop_name, _, description in flat_query_dict[imname]:
        mask = load_gt_mask(mask_dir + imcrop_name + '.mat').astype(np.float32)
        labels = (mask > 0)
        processed_labels = im_processing.resize_and_pad(mask, input_H, input_W) > 0

        text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, T)
        scores_val = sess.run(scores,
                              feed_dict={
                                  text_seq_batch: text_seq_val,
                                  imcrop_batch: imcrop_val
                              })
        scores_val = np.squeeze(scores_val)

        # Evaluate the segmentation performance
        pred_raw = (scores_val >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
        I, U = eval_tools.compute_mask_IU(predicts, labels)
        cum_I += I
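# ------------------------------------------------------------------------------
# A minimal sketch of eval_tools.compute_mask_IU as used above: it returns the
# raw intersection and union pixel counts between a predicted mask and the
# ground-truth labels (boolean arrays of the same shape). The helper's behavior
# is inferred from its call sites, so treat this as an assumption.
# ------------------------------------------------------------------------------
import numpy as np

def compute_mask_IU(pred, labels):
    assert pred.shape == labels.shape
    intersection = np.sum(np.logical_and(pred, labels))
    union = np.sum(np.logical_or(pred, labels))
    return intersection, union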
################################################################################
# Collect training samples
################################################################################

training_samples = []
num_imcrop = len(imcrop_list)
for n_imcrop in range(num_imcrop):
    if n_imcrop % 200 == 0:
        print('processing %d / %d' % (n_imcrop + 1, num_imcrop))
    imcrop_name = imcrop_list[n_imcrop]

    # Image and mask
    imname = imcrop_name.split('_', 1)[0] + '.jpg'
    mask_name = imcrop_name + '.mat'
    im = skimage.io.imread(image_dir + imname)
    mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

    processed_im = skimage.img_as_ubyte(
        im_processing.resize_and_pad(im, input_H, input_W))
    if processed_im.ndim == 2:
        processed_im = processed_im[:, :, np.newaxis]

    processed_mask = im_processing.resize_and_pad(mask, input_H, input_W)
    subsampled_mask = skimage.transform.downscale_local_mean(processed_mask, (32, 32))

    labels_fine = (processed_mask > 0)
    labels_coarse = (subsampled_mask > 0)

    for description in query_dict[imcrop_name]:
        text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)
        training_samples.append((processed_im, text_seq, labels_coarse, labels_fine))

# Shuffle the training instances
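# ------------------------------------------------------------------------------
# im_processing.resize_and_pad is used throughout these snippets. A sketch of
# the usual behavior, assuming an aspect-ratio-preserving resize followed by
# zero-padding to the target size; details such as interpolation order and pad
# placement are assumptions.
# ------------------------------------------------------------------------------
import numpy as np
import skimage.transform

def resize_and_pad(im, input_h, input_w):
    # Scale so the image fits inside (input_h, input_w), keeping aspect ratio.
    im_h, im_w = im.shape[:2]
    scale = min(input_h / im_h, input_w / im_w)
    resized_h = int(np.round(im_h * scale))
    resized_w = int(np.round(im_w * scale))
    pad_h = (input_h - resized_h) // 2
    pad_w = (input_w - resized_w) // 2

    resized_im = skimage.transform.resize(im, (resized_h, resized_w))
    # Place the resized image in the center of a zero canvas.
    new_im = np.zeros((input_h, input_w) + im.shape[2:], dtype=resized_im.dtype)
    new_im[pad_h:pad_h + resized_h, pad_w:pad_w + resized_w, ...] = resized_im
    return new_im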
def build_referit_batches(setname, T, input_H, input_W):
    # data directory
    im_dir = './data/referit/images/'
    mask_dir = './data/referit/mask/'
    query_file = './data/referit_query_' + setname + '.json'
    vocab_file = './data/vocabulary_spacy_referit.txt'

    # saving directory
    data_folder = './referit/' + setname + '_batch/'
    data_prefix = 'referit_' + setname
    if not os.path.isdir(data_folder):
        os.makedirs(data_folder)

    # load annotations
    query_dict = json.load(open(query_file))
    im_list = query_dict.keys()
    vocab_dict = text_processing.load_vocab_dict_from_file(vocab_file)

    # collect training samples
    samples = []
    for n_im, name in enumerate(im_list):
        im_name = name.split('_', 1)[0] + '.jpg'
        mask_name = name + '.mat'
        for sent in query_dict[name]:
            samples.append((im_name, mask_name, sent))

    # save batches to disk
    # spacy load
    nlp = spacy.load("en_core_web_sm")
    SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
    num_batch = len(samples)
    valid = 0
    for n_batch in range(num_batch):
        print('saving batch %d / %d' % (n_batch + 1, num_batch))
        im_name, mask_name, sent = samples[n_batch]
        im = skimage.io.imread(im_dir + im_name)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)
        if 'train' in setname:
            im = skimage.img_as_ubyte(
                im_processing.resize_and_pad(im, input_H, input_W))
            mask = im_processing.resize_and_pad(mask, input_H, input_W)
        if im.ndim == 2:
            im = np.tile(im[:, :, np.newaxis], (1, 1, 3))

        # tokenize: lowercase, split on non-word characters, drop a trailing
        # period, and cap the sentence at 20 tokens
        sent = sent.lower()
        words = SENTENCE_SPLIT_REGEX.split(sent.strip())
        words = [w for w in words if len(w.strip()) > 0]
        if words and words[-1] == '.':
            words = words[:-1]
        if len(words) > 20:
            words = words[:20]
        n_sent = ' '.join(words)
        # (strings are already Unicode in Python 3; the old str.decode("utf-8")
        # guard was a Python 2 leftover and has been dropped)
        doc = nlp(n_sent)
        if len(doc) > 30:
            continue
        text, graph, height = text_processing.preprocess_spacy_sentence(
            doc, vocab_dict, T)
        np.savez(file=data_folder + data_prefix + '_' + str(valid) + '.npz',
                 text_batch=text,
                 im_batch=im,
                 mask_batch=(mask > 0),
                 sent_batch=[n_sent],
                 graph_batch=graph,
                 height_batch=np.array([height], dtype=np.int32))
        valid += 1
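# ------------------------------------------------------------------------------
# Usage sketch: reading one saved batch back. The path below is illustrative
# (a 'train' batch with index 0); the keys match the np.savez call above, and
# the shape comments are assumptions about what the preprocessing produces.
# ------------------------------------------------------------------------------
import numpy as np

batch = np.load('./referit/train_batch/referit_train_0.npz', allow_pickle=True)
text = batch['text_batch']      # vocab indices (shape set by preprocess_spacy_sentence)
im = batch['im_batch']          # (input_H, input_W, 3) uint8 image
mask = batch['mask_batch']      # (input_H, input_W) bool ground-truth mask
sent = batch['sent_batch']      # the cleaned referring expression, as a string
graph = batch['graph_batch']    # spacy dependency-graph encoding
height = batch['height_batch']  # (1,) int32 parse-tree height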
imcrop_val = np.zeros((N, input_H, input_W, 3), dtype=np.float32)

num_im = len(imlist)
for n_im in range(num_im):
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]

    # Extract visual features from all proposals
    im = skimage.io.imread(image_dir + imname)
    processed_im = skimage.img_as_ubyte(
        im_processing.resize_and_pad(im, input_H, input_W))
    if processed_im.ndim == 2:
        processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
    imcrop_val[...] = processed_im.astype(np.float32) - segmodel.vgg_net.channel_mean

    for imcrop_name, _, description in flat_query_dict[imname]:
        mask = load_gt_mask(mask_dir + imcrop_name[:-4] + '.mat').astype(np.float32)
        labels = (mask > 0)
        processed_labels = im_processing.resize_and_pad(mask, input_H, input_W) > 0

        text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, T)
        scores_val = sess.run(scores,
                              feed_dict={
                                  text_seq_batch: text_seq_val,
                                  imcrop_batch: imcrop_val
                              })
        scores_val = np.squeeze(scores_val)

        # Evaluate the segmentation performance
        pred_raw = (scores_val >= score_thresh).astype(np.float32)
        predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
        I, U = eval_tools.compute_mask_IU(predicts, labels)
        cum_I += I
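# ------------------------------------------------------------------------------
# A minimal sketch of text_processing.preprocess_sentence, assuming it maps a
# sentence to a fixed-length vector of T vocabulary indices, truncating long
# sentences and front-padding short ones. The tokenizer, the '<unk>' fallback,
# and the padding side (front, with index 0) are all assumptions.
# ------------------------------------------------------------------------------
import re
import numpy as np

SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')

def preprocess_sentence(sentence, vocab_dict, T):
    words = [w.strip() for w in SENTENCE_SPLIT_REGEX.split(sentence.strip().lower())]
    words = [w for w in words if w]
    # Map out-of-vocabulary words to an assumed '<unk>' entry (index 0 otherwise).
    indices = [vocab_dict.get(w, vocab_dict.get('<unk>', 0)) for w in words]
    indices = indices[:T]                          # truncate long sentences
    indices = [0] * (T - len(indices)) + indices   # front-pad short ones
    return np.array(indices, dtype=np.int32)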
text_seq_batch = np.zeros((T, N), dtype=np.int32)
imcrop_batch = np.zeros((N, input_H, input_W, 3), dtype=np.uint8)
label_coarse_batch = np.zeros((N, featmap_H, featmap_W, 1), dtype=bool)
label_fine_batch = np.zeros((N, input_H, input_W, 1), dtype=bool)

if not os.path.isdir(data_folder):
    os.mkdir(data_folder)

for n_batch in range(num_batch):
    print('saving batch %d / %d' % (n_batch + 1, num_batch))
    batch_begin = n_batch * N
    batch_end = (n_batch + 1) * N
    for n_sample in range(batch_begin, batch_end):
        imname, mask_name, description = shuffled_training_samples[n_sample]
        im = skimage.io.imread(image_dir + imname)
        mask = load_gt_mask(mask_dir + mask_name).astype(np.float32)

        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, input_H, input_W))
        if processed_im.ndim == 2:
            processed_im = processed_im[:, :, np.newaxis]

        processed_mask = im_processing.resize_and_pad(mask, input_H, input_W)
        subsampled_mask = skimage.transform.downscale_local_mean(
            processed_mask, (8, 8))

        labels_fine = (processed_mask > 0)
        labels_coarse = (subsampled_mask > 0)

        text_seq = text_processing.preprocess_sentence(description, vocab_dict, T)
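        # ----------------------------------------------------------------------
        # The excerpt above stops here, mid-loop. A hedged sketch of how such a
        # batch loop typically finishes (an assumption, not the original code):
        # copy each sample into its slot of the pre-allocated arrays, then save
        # the completed batch with np.savez. The data_prefix naming is assumed.
        # ----------------------------------------------------------------------
        idx = n_sample - batch_begin
        text_seq_batch[:, idx] = text_seq
        imcrop_batch[idx, ...] = processed_im
        label_coarse_batch[idx, ..., 0] = labels_coarse
        label_fine_batch[idx, ..., 0] = labels_fine

    np.savez(file=data_folder + data_prefix + '_' + str(n_batch) + '.npz',
             text_seq_batch=text_seq_batch,
             imcrop_batch=imcrop_batch,
             label_coarse_batch=label_coarse_batch,
             label_fine_batch=label_fine_batch)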
def inference():
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', test_config.N)))

    caffe.set_device(test_config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    test_config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(test_config.query_file))
    bbox_dict = json.load(open(test_config.bbox_file))
    imcrop_dict = json.load(open(test_config.imcrop_file))
    imsize_dict = json.load(open(test_config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(test_config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((test_config.N, test_config.input_H, test_config.input_W, 3),
                          dtype=np.float32)
    text_seq_val = np.zeros((test_config.T, test_config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(test_config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, test_config.input_H, test_config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(test_config.N,
                                                              test_config.featmap_H,
                                                              test_config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(test_config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, test_config.input_H, test_config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, test_config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the segmentation performance
            pred_raw = (upscores >= test_config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1

    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I / cum_U)
    print(result_str)
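# ------------------------------------------------------------------------------
# A sketch of processing_tools.generate_spatial_batch: 8-dimensional normalized
# coordinate features (cell corners, center, and inverse width/height) for
# every cell of the feature map, tiled across the batch. The exact feature
# layout is an assumption based on common spatial-coordinate implementations.
# ------------------------------------------------------------------------------
import numpy as np

def generate_spatial_batch(N, featmap_H, featmap_W):
    spatial_batch_val = np.zeros((N, featmap_H, featmap_W, 8), dtype=np.float32)
    for h in range(featmap_H):
        for w in range(featmap_W):
            # Normalize cell coordinates to [-1, 1].
            xmin = w / featmap_W * 2 - 1
            xmax = (w + 1) / featmap_W * 2 - 1
            xctr = (xmin + xmax) / 2
            ymin = h / featmap_H * 2 - 1
            ymax = (h + 1) / featmap_H * 2 - 1
            yctr = (ymin + ymax) / 2
            spatial_batch_val[:, h, w, :] = (
                xmin, ymin, xmax, ymax, xctr, yctr, 1 / featmap_W, 1 / featmap_H)
    return spatial_batch_val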
def inference(config):
    with open('./seg_model/test.prototxt', 'w') as f:
        f.write(str(seg_model.generate_model('val', config)))

    caffe.set_device(config.gpu_id)
    caffe.set_mode_gpu()

    # Load pretrained model
    net = caffe.Net('./seg_model/test.prototxt',
                    config.pretrained_model,
                    caffe.TEST)

    ################################################################################
    # Load annotations and bounding box proposals
    ################################################################################

    query_dict = json.load(open(config.query_file))
    bbox_dict = json.load(open(config.bbox_file))
    imcrop_dict = json.load(open(config.imcrop_file))
    imsize_dict = json.load(open(config.imsize_file))
    imlist = list({name.split('_', 1)[0] + '.jpg' for name in query_dict})
    vocab_dict = text_processing.load_vocab_dict_from_file(config.vocab_file)

    ################################################################################
    # Flatten the annotations
    ################################################################################

    flat_query_dict = {imname: [] for imname in imlist}
    for imname in imlist:
        this_imcrop_names = imcrop_dict[imname]
        for imcrop_name in this_imcrop_names:
            gt_bbox = bbox_dict[imcrop_name]
            if imcrop_name not in query_dict:
                continue
            this_descriptions = query_dict[imcrop_name]
            for description in this_descriptions:
                flat_query_dict[imname].append((imcrop_name, gt_bbox, description))

    ################################################################################
    # Testing
    ################################################################################

    cum_I, cum_U = 0.0, 0.0
    eval_seg_iou_list = [0.5, 0.6, 0.7, 0.8, 0.9]
    seg_correct = np.zeros(len(eval_seg_iou_list), dtype=np.int32)
    seg_total = 0.0

    # Pre-allocate arrays
    imcrop_val = np.zeros((config.N, config.input_H, config.input_W, 3),
                          dtype=np.float32)
    text_seq_val = np.zeros((config.T, config.N), dtype=np.int32)

    num_im = len(imlist)
    for n_im in tqdm(range(num_im)):
        imname = imlist[n_im]

        # Extract visual features from all proposals
        im = skimage.io.imread(config.image_dir + imname)
        processed_im = skimage.img_as_ubyte(
            im_processing.resize_and_pad(im, config.input_H, config.input_W))
        if processed_im.ndim == 2:
            processed_im = np.tile(processed_im[:, :, np.newaxis], (1, 1, 3))
        imcrop_val[...] = processed_im.astype(np.float32) - seg_model.channel_mean
        imcrop_val_trans = imcrop_val.transpose((0, 3, 1, 2))

        # Extract spatial features
        spatial_val = processing_tools.generate_spatial_batch(config.N,
                                                              config.featmap_H,
                                                              config.featmap_W)
        spatial_val = spatial_val.transpose((0, 3, 1, 2))

        for imcrop_name, _, description in flat_query_dict[imname]:
            mask = load_gt_mask(config.mask_dir + imcrop_name + '.mat').astype(np.float32)
            labels = (mask > 0)
            processed_labels = im_processing.resize_and_pad(mask, config.input_H, config.input_W)
            processed_labels = processed_labels > 0

            text_seq_val[:, 0] = text_processing.preprocess_sentence(description, vocab_dict, config.T)
            cont_val = text_processing.create_cont(text_seq_val)

            net.blobs['language'].data[...] = text_seq_val
            net.blobs['cont'].data[...] = cont_val
            net.blobs['image'].data[...] = imcrop_val_trans
            net.blobs['spatial'].data[...] = spatial_val
            net.blobs['label'].data[...] = processed_labels

            net.forward()
            upscores = net.blobs['upscores'].data[...].copy()
            upscores = np.squeeze(upscores)

            # Evaluate the segmentation performance
            pred_raw = (upscores >= config.score_thresh).astype(np.float32)
            predicts = im_processing.resize_and_crop(pred_raw, im.shape[0], im.shape[1])
            I, U = eval_tools.compute_mask_IU(predicts, labels)
            cum_I += I
            cum_U += U
            this_IoU = I / float(U)
            for n_eval_iou in range(len(eval_seg_iou_list)):
                eval_seg_iou = eval_seg_iou_list[n_eval_iou]
                seg_correct[n_eval_iou] += (this_IoU >= eval_seg_iou)
            seg_total += 1

    # Print results
    print('Final results on the whole test set')
    result_str = ''
    for n_eval_iou in range(len(eval_seg_iou_list)):
        result_str += 'precision@%s = %f\n' % \
            (str(eval_seg_iou_list[n_eval_iou]), seg_correct[n_eval_iou] / seg_total)
    result_str += 'overall IoU = %f\n' % (cum_I / cum_U)
    print(result_str)
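# ------------------------------------------------------------------------------
# A hedged sketch of text_processing.create_cont, assuming sentences are
# front-padded with index 0. Caffe's recurrent layers expect a "cont" indicator
# that is 0 where the hidden state should be reset (padding positions and the
# first real token) and 1 for every subsequent token of the sequence.
# ------------------------------------------------------------------------------
import numpy as np

def create_cont(text_seq):
    # text_seq: (T, N) int32 batch of front-padded token indices.
    cont = np.zeros_like(text_seq)
    for n in range(text_seq.shape[1]):
        real = np.nonzero(text_seq[:, n])[0]  # positions of non-pad tokens
        if len(real) > 1:
            cont[real[1]:, n] = 1  # continue the sequence after its first token
    return cont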