import array
import io

import numpy as np
from PIL import Image

# Captioner (the LRCN captioning wrapper) and Row (the output record type,
# e.g. pyspark.sql.Row) are assumed to come from the surrounding project.


class CaptionExperiment:
    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        # device_id=-1 selects CPU mode in the Captioner wrapper
        self.captioner = Captioner(image_model, image_net_proto,
                                   lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, row):
        # Decode the raw image bytes stored in the row into a numpy array
        image_bytes = array.array('b', row.image).tobytes()
        im = Image.open(io.BytesIO(image_bytes))
        image = np.array(im, dtype=np.uint8)

        dataset = [image]
        descriptors = self.captioner.compute_descriptors(dataset)

        images = dataset
        num_images = len(images)
        batch_size = num_images

        # Generate captions for all images; temp=inf gives greedy
        # (highest-probability) decoding.
        all_captions = [None] * num_images
        for image_index in range(0, num_images, batch_size):
            batch_end_index = min(image_index + batch_size, num_images)
            output_captions, output_probs = self.captioner.sample_captions(
                descriptors[image_index:batch_end_index], temp=float('inf'))
            for batch_index, output in zip(
                    range(image_index, batch_end_index), output_captions):
                all_captions[batch_index] = output

        # For each image, decode the sampled word indices into a sentence.
        model_captions = [''] * len(images)
        for image_index in range(len(images)):
            model_captions[image_index] = self.captioner.sentence(
                all_captions[image_index])

        generation_result = [Row(row.id, model_captions[image_index])
                             for image_index in range(len(images))]
        return generation_result
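# --- Usage sketch (illustrative; not part of the original source). The model
# and vocabulary paths below are placeholders; substitute the files from your
# own LRCN setup. The input row only needs `.id` and raw `.image` bytes.
from collections import namedtuple

ImageRow = namedtuple('ImageRow', ['id', 'image'])

experiment = CaptionExperiment(
    './models/lrcn_iter_110000.caffemodel',    # hypothetical path
    './models/deploy_image_net.prototxt',      # hypothetical path
    './models/deploy_lstm_net.prototxt',       # hypothetical path
    './models/vocabulary.txt')                 # hypothetical path

with open('./data/demo.jpg', 'rb') as f:       # hypothetical image
    row = ImageRow(id=0, image=f.read())

print(experiment.getCaption(row))  # -> [Row(0, '<generated caption>')]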
imlist = util.io.load_str_list('./data/training/train_imcrop_list.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))
    im = skimage.io.imread(image_dir + imlist[n_im])
    # Gray scale to RGB
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    # RGBA to RGB
    im = im[:, :, :3]
    loaded_images.append(im)

# Compute fc7 feature from loaded images, as whole-image bbox feature
descriptors = captioner.compute_descriptors(loaded_images, output_name='fc7')

# Save computed bbox features
if not os.path.isdir(cached_local_features_dir):
    os.mkdir(cached_local_features_dir)
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('saving local features %d / %d' % (n_im, num_im))
    save_path = cached_local_features_dir + imlist[n_im] + '_fc7.npy'
    np.save(save_path, descriptors[n_im, :])
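# --- Reading a cached feature back (illustrative sketch). The file naming
# mirrors the save loop above.
feat = np.load(cached_local_features_dir + imlist[0] + '_fc7.npy')
print(feat.shape)  # one fc7 descriptor per crop, e.g. (4096,) for CaffeNet/VGG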
print("Find best candidate..!") for i in range(8): im_file = './splited_image/test' + str(i) + '.jpg' edgebox_file = './proposal_box/selective_box' + str( i) + '.txt' # pre-extracted EdgeBox proposals im = skimage.io.imread(im_file) imsize = np.array([im.shape[1], im.shape[0]]) # [width, height] candidate_boxes = np.loadtxt(edgebox_file).astype(int) candidate_boxes = np.reshape(candidate_boxes, (-1, 4)) # Compute features region_feature = retriever.compute_descriptors_edgebox( captioner, im, candidate_boxes) spatial_feature = retriever.compute_spatial_feat( candidate_boxes, imsize) descriptors = np.concatenate((region_feature, spatial_feature), axis=1) context_feature = captioner.compute_descriptors([im], output_name='fc7') # Compute scores of each candidate region scores = retriever.score_descriptors_context(descriptors, query, context_feature, captioner, vocab_dict) #candidate_boxes = (i, candidate_boxes) candidate_boxes = np.insert(candidate_boxes, 0, i, axis=1) if (i == 0): sum_candidate_box = candidate_boxes else: #sum_candidate_box=np.concatenate(sum_candidate_box,candidate_boxes,axis=1) sum_candidate_box = np.vstack((sum_candidate_box, candidate_boxes)) sum_score_box = np.concatenate((sum_score_box, scores)) #print (sum_score_box)
# Sample distractor_per_object*2 distractors, so that the test image itself
# can always be excluded from the distractor set below
distractor_ids = np.random.choice(num_distractors, distractor_per_object*2,
                                  replace=False)
distractor_names = [distractor_list[n]
                    for n in distractor_ids[:distractor_per_object]]
# Use the second half if the imname is among the first half
if imname not in distractor_names:
    distractor_ids_per_im[imname] = distractor_ids[:distractor_per_object]
else:
    distractor_ids_per_im[imname] = distractor_ids[distractor_per_object:]

# Compute descriptors for both object images and distractor images
image_path_list = [image_dir + imname + '.JPEG' for imname in imlist]
distractor_path_list = [distractor_dir + imname + '.JPEG'
                        for imname in distractor_list]
obj_descriptors = captioner.compute_descriptors(image_path_list)
dis_descriptors = captioner.compute_descriptors(distractor_path_list)

################################################################################

# Test top-1 precision
correct_num = 0
total_num = 0
for n_im in range(num_im):
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]
    for sentence in query_dict[imname]:
        # compute test image (target object) score given the description sentence
        obj_score = retriever.score_descriptors(
            obj_descriptors[n_im:n_im+1, :], sentence, captioner,
            vocab_dict)[0]
        # compute distractor scores given the description sentence
        dis_idx = distractor_ids_per_im[imname]
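        # --- Illustrative completion of the inner loop (a sketch, not
        # necessarily the original code): score the sampled distractors with
        # the same sentence and count a hit when the target outscores them all.
        dis_scores = retriever.score_descriptors(
            dis_descriptors[dis_idx, :], sentence, captioner, vocab_dict)
        if obj_score >= np.max(dis_scores):
            correct_num += 1
        total_num += 1

print('top-1 precision: %f' % (correct_num / float(total_num)))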
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))
    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # Gray scale to RGB
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    # RGBA to RGB
    im = im[:, :, :3]
    loaded_images.append(im)

# Compute fc7 feature from loaded images, as whole-image contextual feature
descriptors = captioner.compute_descriptors(loaded_images, output_name='fc7')

# Save computed contextual features
if not os.path.isdir(cached_context_features_dir):
    os.mkdir(cached_context_features_dir)
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('saving contextual features %d / %d' % (n_im, num_im))
    save_path = cached_context_features_dir + imlist[n_im] + '_fc7.npy'
    np.save(save_path, descriptors[n_im, :])
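# --- Using a cached contextual feature at retrieval time (illustrative
# sketch): load the saved fc7 vector instead of recomputing it per query.
# `box_descriptors`, `query`, and `vocab_dict` are placeholders standing in
# for the values built in the candidate-box scoring loop earlier.
imname = imlist[0]
context_feature = np.load(
    cached_context_features_dir + imname + '_fc7.npy').reshape(1, -1)
scores = retriever.score_descriptors_context(
    box_descriptors, query, context_feature, captioner, vocab_dict)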