Example #1
import array
import io

import numpy as np
from PIL import Image

# Captioner and Row are provided by the surrounding project: Captioner is the
# Caffe LRCN captioning wrapper, and Row is used like pyspark.sql.Row(id, caption).


class CaptionExperiment:
    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        # -1 selects CPU mode (a non-negative value would be a GPU id).
        self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, image):
        row = image
        # Decode the raw image bytes stored on the row into a numpy array.
        raw_bytes = array.array("b", row.image).tostring()
        im = Image.open(io.BytesIO(raw_bytes))
        image = np.array(im, dtype=np.uint8)
        dataset = [image]
        descriptors = self.captioner.compute_descriptors(dataset)
        images = dataset
        num_images = len(images)
        batch_size = num_images

        # Generate captions for all images; temp=float('inf') makes the LRCN
        # sampler deterministic (argmax at each step).
        all_captions = [None] * num_images
        for image_index in xrange(0, num_images, batch_size):
            batch_end_index = min(image_index + batch_size, num_images)
            output_captions, output_probs = self.captioner.sample_captions(
                descriptors[image_index:batch_end_index], temp=float("inf")
            )
            for batch_index, output in zip(range(image_index, batch_end_index), output_captions):
                all_captions[batch_index] = output

        # Decode each caption's word ids into a string, keeping the
        # highest-probability caption for each image.
        model_captions = [""] * len(images)
        for image_index, image in enumerate(images):
            model_captions[image_index] = self.captioner.sentence(all_captions[image_index])

        # Pair the row id with the generated caption for each image.
        return [Row(row.id, model_captions[image_index])
                for image_index in range(len(images))]
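
A minimal usage sketch for the class above; the model and prototxt paths, the vocab, and the rows iterable are placeholders, not part of the original example:

# Hypothetical driver code; all paths and `rows` are assumptions.
experiment = CaptionExperiment('lrcn.caffemodel', 'image_net.prototxt',
                               'lstm_net.prototxt', vocab)
for row in rows:  # each row carries an `id` and raw `image` bytes
    for result in experiment.getCaption(row):
        print(result)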
imlist = util.io.load_str_list('./data/training/train_imcrop_list.txt')

num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im])
    # Gray scale to RGB
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    # RGBA to RGB
    im = im[:, :, :3]
    loaded_images.append(im)

# Compute fc7 feature from loaded images, as whole image bbox feature
descriptors = captioner.compute_descriptors(loaded_images, output_name='fc7')

# Save computed bbox features
if not os.path.isdir(cached_local_features_dir):
    os.mkdir(cached_local_features_dir)
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('saving local features %d / %d' % (n_im, num_im))
    save_path = cached_local_features_dir + imlist[n_im] + '_fc7.npy'
    np.save(save_path, descriptors[n_im, :])
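
The cached features can be reloaded per image later; a minimal sketch under the same directory layout as the save loop above:

# Reload one cached fc7 descriptor (sketch; same paths as the save loop).
feat = np.load(cached_local_features_dir + imlist[0] + '_fc7.npy')
print(feat.shape)  # one fc7 vector per image crop, e.g. (4096,) for VGG-16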

    print("Find best candidate..!")
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
        edgebox_file = './proposal_box/selective_box' + str(
            i) + '.txt'  # pre-extracted EdgeBox proposals
        im = skimage.io.imread(im_file)
        imsize = np.array([im.shape[1], im.shape[0]])  # [width, height]
        candidate_boxes = np.loadtxt(edgebox_file).astype(int)
        candidate_boxes = np.reshape(candidate_boxes, (-1, 4))
        # Compute features
        region_feature = retriever.compute_descriptors_edgebox(
            captioner, im, candidate_boxes)
        spatial_feature = retriever.compute_spatial_feat(
            candidate_boxes, imsize)
        descriptors = np.concatenate((region_feature, spatial_feature), axis=1)
        context_feature = captioner.compute_descriptors([im],
                                                        output_name='fc7')

        # Compute scores of each candidate region
        scores = retriever.score_descriptors_context(descriptors, query,
                                                     context_feature,
                                                     captioner, vocab_dict)
        #candidate_boxes = (i, candidate_boxes)
        candidate_boxes = np.insert(candidate_boxes, 0, i, axis=1)
        if (i == 0):
            sum_candidate_box = candidate_boxes
        else:
            #sum_candidate_box=np.concatenate(sum_candidate_box,candidate_boxes,axis=1)
            sum_candidate_box = np.vstack((sum_candidate_box, candidate_boxes))
        sum_score_box = np.concatenate((sum_score_box, scores))
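
Once scores from all eight tiles are pooled, the best candidate is the argmax over sum_score_box; a minimal sketch (the [x1, y1, x2, y2] box layout after the tile-index column is an assumption):

# Pick the overall highest-scoring proposal across tiles.
best = np.argmax(sum_score_box)
tile_idx = sum_candidate_box[best, 0]   # which test<i>.jpg tile the box is in
best_box = sum_candidate_box[best, 1:]  # assumed [x1, y1, x2, y2]
print('best box %s in tile %d' % (best_box, tile_idx))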

# Sample distractor_per_object*2 distractors so the test image itself can be
# kept out of the distractor set.
    distractor_ids = np.random.choice(num_distractors,
                                      distractor_per_object*2, replace=False)
    distractor_names = [distractor_list[n] for n in distractor_ids[:distractor_per_object]]
    # Use the second half if the imname is among the first half
    if imname not in distractor_names:
        distractor_ids_per_im[imname] = distractor_ids[:distractor_per_object]
    else:
        distractor_ids_per_im[imname] = distractor_ids[distractor_per_object:]

# Compute descriptors for both object images and distractor images
image_path_list = [image_dir+imname+'.JPEG' for imname in imlist]
distractor_path_list = [distractor_dir+imname+'.JPEG' for imname in distractor_list]

obj_descriptors = captioner.compute_descriptors(image_path_list)
dis_descriptors = captioner.compute_descriptors(distractor_path_list)

################################################################################
# Test top-1 precision
correct_num = 0
total_num = 0
for n_im in range(num_im):
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]
    for sentence in query_dict[imname]:
        # compute test image (target object) score given the description sentence
        obj_score = retriever.score_descriptors(obj_descriptors[n_im:n_im+1, :],
                                                sentence, captioner, vocab_dict)[0]
        # compute distractor scores given the description sentence
        dis_idx = distractor_ids_per_im[imname]
        dis_scores = retriever.score_descriptors(dis_descriptors[dis_idx, :],
                                                 sentence, captioner, vocab_dict)
        # Count a hit when the target object outscores every distractor.
        if obj_score > np.max(dis_scores):
            correct_num += 1
        total_num += 1

print('top-1 precision: %f' % (correct_num / float(total_num)))

captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # Gray scale to RGB
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    # RGBA to RGB
    im = im[:, :, :3]
    loaded_images.append(im)

# Compute fc7 feature from loaded images, as whole image contextual feature
descriptors = captioner.compute_descriptors(loaded_images, output_name='fc7')

# Save computed contextual features
if not os.path.isdir(cached_context_features_dir):
    os.mkdir(cached_context_features_dir)
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('saving contextual features %d / %d' % (n_im, num_im))
    save_path = cached_context_features_dir + imlist[n_im] + '_fc7.npy'
    np.save(save_path, descriptors[n_im, :])
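
As with the local features, a cached contextual feature can be reloaded and fed back into the context-aware scorer; a minimal sketch (query, vocab_dict, and candidate_descriptors are assumed to be in scope, and the restored batch axis is an assumption about the shape score_descriptors_context expects):

# Reload one whole-image contextual feature and reuse it for scoring
# candidate regions; candidate_descriptors is assumed to be the
# region+spatial feature matrix built as in the earlier snippet.
context_feature = np.load(cached_context_features_dir + imlist[0] + '_fc7.npy')
context_feature = context_feature[np.newaxis, :]  # assumed (1, feat_dim) shape
scores = retriever.score_descriptors_context(candidate_descriptors, query,
                                             context_feature, captioner,
                                             vocab_dict)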
# Sample distractor_per_object*2 distractors so the test image itself can be
# kept out of the distractor set (same scheme as above).
distractor_ids = np.random.choice(num_distractors,
                                  distractor_per_object*2, replace=False)
distractor_names = [
    distractor_list[n] for n in distractor_ids[:distractor_per_object]
]
# Use the second half if the imname is among the first half
if imname not in distractor_names:
    distractor_ids_per_im[imname] = distractor_ids[:distractor_per_object]
else:
    distractor_ids_per_im[imname] = distractor_ids[distractor_per_object:]

# Compute descriptors for both object images and distractor images
image_path_list = [image_dir + imname + '.JPEG' for imname in imlist]
distractor_path_list = [
    distractor_dir + imname + '.JPEG' for imname in distractor_list
]

obj_descriptors = captioner.compute_descriptors(image_path_list)
dis_descriptors = captioner.compute_descriptors(distractor_path_list)

################################################################################
# Test top-1 precision
correct_num = 0
total_num = 0
for n_im in range(num_im):
    print('testing image %d / %d' % (n_im, num_im))
    imname = imlist[n_im]
    for sentence in query_dict[imname]:
        # compute test image (target object) score given the description sentence
        obj_score = retriever.score_descriptors(
            obj_descriptors[n_im:n_im + 1, :], sentence, captioner,
            vocab_dict)[0]
        # compute distractor scores given the description sentence
        dis_idx = distractor_ids_per_im[imname]
        dis_scores = retriever.score_descriptors(dis_descriptors[dis_idx, :],
                                                 sentence, captioner, vocab_dict)
        # Count a hit when the target object outscores every distractor.
        if obj_score > np.max(dis_scores):
            correct_num += 1
        total_num += 1

print('top-1 precision: %f' % (correct_num / float(total_num)))