Example No. 1
import array
import io

import numpy as np
from PIL import Image

# `Captioner` and `Row` are provided by the surrounding project (e.g. the LRCN
# captioner module and a Spark SQL Row-like record type).


class CaptionExperiment:
    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, image):
        row = image
        image_bytes = array.array("b", row.image).tostring()
        im = Image.open(io.BytesIO(image_bytes))
        image = np.array(im, dtype=np.uint8)
        dataset = [image]
        descriptors = self.captioner.compute_descriptors(dataset)
        images = dataset
        num_images = len(images)
        batch_size = num_images

        # Generate captions for all images.
        all_captions = [None] * num_images
        for image_index in xrange(0, num_images, batch_size):
            batch_end_index = min(image_index + batch_size, num_images)
            output_captions, output_probs = self.captioner.sample_captions(
                descriptors[image_index:batch_end_index], temp=float("inf")
            )
            for batch_index, output in zip(range(image_index, batch_end_index), output_captions):
                all_captions[batch_index] = output
            # Collect the model's captions, formatting them as a list of
            # len(images) strings (one caption per image).
            model_captions = [""] * len(images)
            for image_index, image in enumerate(images):
                caption = self.captioner.sentence(all_captions[image_index])
                model_captions[image_index] = caption

            generation_result = [
                Row(row.id, model_captions[image_index]) for image_index in range(len(images))
            ]
            return generation_result
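A minimal usage sketch for the class above; the weight/prototxt/vocabulary paths, the test image, and the InputRow stand-in (any record exposing an id and raw JPEG bytes in .image) are assumptions for illustration:

import collections

InputRow = collections.namedtuple('InputRow', ['id', 'image'])  # hypothetical stand-in for the Row input

experiment = CaptionExperiment(
    './snapshots/lrcn_caffenet_iter_110000.caffemodel',            # assumed weights path
    './models/bvlc_reference_caffenet/deploy.prototxt',            # assumed image net prototxt
    './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',  # assumed LSTM net prototxt
    './examples/coco_caption/h5_data/buffer_100/vocabulary.txt')   # assumed vocabulary file
with open('example.jpg', 'rb') as f:                               # assumed local test image
    row = InputRow(id=0, image=f.read())
print experiment.getCaption(row)  # -> [Row(0, '<generated caption>')]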
Example No. 3
def main():
  MAX_IMAGES = -1  # -1 to use all images
  TAG = 'coco_2layer_factored'
  if MAX_IMAGES >= 0:
    TAG += '_%dimages' % MAX_IMAGES
  eval_on_test = False
  if eval_on_test:
    ITER = 100000
    MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
    DATASET_NAME = 'test'
  else:  # eval on val
    ITER = 50000
    MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER
    DATASET_NAME = 'val'
  TAG += '_%s' % DATASET_NAME
  MODEL_DIR = './examples/coco_caption'
  MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME)
  IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt'
  LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
  NET_TAG = '%s_%s' % (TAG, MODEL_FILENAME)
  DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME,
      str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
  DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
  VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt'
  DEVICE_ID = 0
  with open(VOCAB_FILE, 'r') as vocab_file:
    vocab = [line.strip() for line in vocab_file.readlines()]
  coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
  image_root = COCO_IMAGE_PATTERN % DATASET_NAME
  sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                             align=False, shuffle=False)
  dataset = {}
  for image_path, sentence in sg.image_sentence_pairs:
    if image_path not in dataset:
      dataset[image_path] = []
    dataset[image_path].append((sg.line_to_stream(sentence), sentence))
  print 'Original dataset contains %d images' % len(dataset.keys())
  if 0 <= MAX_IMAGES < len(dataset.keys()):
    all_keys = dataset.keys()
    perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
    chosen_keys = set([all_keys[p] for p in perm])
    for key in all_keys:
      if key not in chosen_keys:
        del dataset[key]
    print 'Reduced dataset to %d images' % len(dataset.keys())
  if MAX_IMAGES < 0: MAX_IMAGES = len(dataset.keys())
  captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE,
                        device_id=DEVICE_ID)
  beam_size = 1
  generation_strategy = {'type': 'beam', 'beam_size': beam_size}
  if generation_strategy['type'] == 'beam':
    strategy_name = 'beam%d' % generation_strategy['beam_size']
  elif generation_strategy['type'] == 'sample':
    strategy_name = 'sample%f' % generation_strategy['temp']
  else:
    raise Exception('Unknown generation strategy type: %s' % generation_strategy['type'])
  CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
  experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR, CACHE_DIR, sg)
  captioner.set_image_batch_size(min(100, MAX_IMAGES))
  experimenter.generation_experiment(generation_strategy)
  captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000))
  experimenter.retrieval_experiment()
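The generation_strategy dict above also supports a sampling mode; a minimal sketch of that configuration (the temperature value is illustrative, not from the original):

# With this dict, the naming logic above yields strategy_name = 'sample1.000000'.
generation_strategy = {'type': 'sample', 'temp': 1.0}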
Example No. 4
import numpy as np
import skimage.io

import util.io
from captioner import Captioner


vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './data/resized_imcrop/'
cached_local_features_dir = './data/referit_local_features/'

image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/training/train_imcrop_list.txt')

num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im])
    # Gray scale to RGB
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    loaded_images.append(im)  # keep the image in memory for later processing
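Once loaded, the images would typically be run through the captioner in batches; a hedged sketch using the compute_descriptors() call seen in Example No. 1 (the batching loop itself is an assumption, not part of the original script):

# Hedged sketch: extract image descriptors batch by batch.
descriptors = []
for n_begin in range(0, num_im, batch_size):
    n_end = min(n_begin + batch_size, num_im)
    descriptors.append(captioner.compute_descriptors(loaded_images[n_begin:n_end]))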
Example No. 5
import math

from captioner import Captioner
import retriever
#import image_convert
#import image_convert

#im_file = './demo_data/test2.jpg'

pretrained_weights_path = '../models/two_layer_LSTM.caffemodel'
gpu_id = 0

# Initialize the retrieval model
image_net_proto = '../prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = '../prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = '../data/vocabulary.txt'
# utilize the captioner module from LRCN
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)  # decrease the number if your GPU memory is small
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

while True:
    sum_candidate_box = []
    sum_score_box = []

    query = raw_input("type the input query: ")
    #query = 'bike on the red house'

    print("query =", query)
    print("Find best candidate..!")
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
        # pre-extracted EdgeBox proposals
        edgebox_file = './proposal_box/selective_box' + str(i) + '.txt'
Example No. 6
# Excerpt: assumes distractor_set, image_dir, tst_imlist_file,
# pretrained_weights_path and gpu_id are defined earlier in the script.
if distractor_set == "kitchen":
    distractor_dir = image_dir
    distractor_imlist_file = tst_imlist_file
else:
    distractor_dir = './datasets/Kitchen/images/ImageNET/'
    distractor_imlist_file = './data/split/kitchen_imagenet_imlist.txt'

query_file = './data/metadata/kitchen_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
lstm_net_proto = './prototxt/scrc_word_to_preds_no_spatial_no_context.prototxt'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)

# Load distractors
distractor_list = util.io.load_str_list(distractor_imlist_file)
num_distractors = len(distractor_list)

# Sample distractor images for each test image
distractor_ids_per_im = {}
np.random.seed(3)  # fix random seed for test repeatability
for imname in imlist:
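    # The loop body is truncated in the source; below is a minimal sketch of the
    # sampling step the surrounding code implies. The count of 100 distractors per
    # image is an assumption, not taken from the original.
    distractor_ids_per_im[imname] = np.random.choice(
        num_distractors, min(100, num_distractors), replace=False)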
Example No. 7
def main(model_name='',
         image_net='',
         LM_net='',
         dataset_name='val',
         vocab='vocabulary',
         precomputed_feats=None,
         feats_bool_in=False,
         precomputed_h5=None,
         experiment={'type': 'generation'},
         prev_word_restriction=False,
         gpu=0):
    # model_name: the trained model (path relative to /home/lisa/caffe-LSTM-video)
    # image_net: the net used to extract length-1000 image features (path relative
    #   to the snapshots folder; the ".caffemodel" extension is not needed)
    # dataset_name: which dataset split to evaluate on
    # vocab: which vocabulary file to use
    # feats_bool_in: whether the images are stored as pickled feature files (True)
    #   or as ordinary image files (False)
    # experiment: dict with all the info needed for the experiment; must have a
    #   'type' field indicating a madlib versus generation experiment

    if not precomputed_feats:
        precomputed_feats = model_name

    MAX_IMAGES = -1  # -1 to use all images
    TAG = 'coco_2layer_factored'
    if MAX_IMAGES >= 0:
        TAG += '_%dimages' % MAX_IMAGES
    eval_on_test = False
    if eval_on_test:
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        MODEL_FILENAME = model_name
        DATASET_NAME = dataset_name
    TAG += '_%s' % DATASET_NAME
    #MODEL_DIR = home_dir + '/examples/coco_caption/snapshots'
    MODEL_DIR = ''
    MODEL_FILE = ['%s.caffemodel' % (MF) for MF in MODEL_FILENAME]
    #IMAGE_NET_FILE = home_dir + '/models/bvlc_reference_caffenet/deploy.prototxt'
    IMAGE_NET_FILE = home_dir + image_net
    #LSTM_NET_FILE = home_dir + '/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    LSTM_NET_FILE = home_dir + LM_net
    DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME, str(MAX_IMAGES)
                                    if MAX_IMAGES >= 0 else 'all')
    #DATASET_CACHE_DIR = home_dir + '/retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
    DATASET_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, '_'.join(MODEL_FILENAME))
    FEATURE_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, precomputed_feats)
    VOCAB_FILE = '../../examples/coco_caption/h5_data/buffer_100/%s.txt' % vocab
    DEVICE_ID = gpu
    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file.readlines()]
    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images/%s2014'
    COCO_IMAGE_PATTERN = '../../data/coco/coco/images/%s2014'
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images2/%s2014'
    #image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    image_root = COCO_IMAGE_PATTERN % 'val'
    #image_root = '/z/lisaanne/imageData/imagenet/'
    sg = CocoSequenceGenerator(coco,
                               BUFFER_SIZE,
                               image_root,
                               vocab=vocab,
                               max_words=MAX_WORDS,
                               align=False,
                               shuffle=False,
                               gt_captions=True,
                               pad=True,
                               truncate=True,
                               split_ids=None,
                               feats_bool=feats_bool_in)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())
    if 0 <= MAX_IMAGES < len(dataset.keys()):
        all_keys = dataset.keys()
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set([all_keys[p] for p in perm])
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print 'Reduced dataset to %d images' % len(dataset.keys())
    if MAX_IMAGES < 0: MAX_IMAGES = len(dataset.keys())
    captioner = Captioner(MODEL_FILE,
                          IMAGE_NET_FILE,
                          LSTM_NET_FILE,
                          VOCAB_FILE,
                          device_id=DEVICE_ID,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    if 'beam_size' in experiment.keys():
        beam_size = experiment['beam_size']
    else:
        beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' %
                        generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
    experimenter = CaptionExperiment(captioner, dataset, FEATURE_CACHE_DIR,
                                     CACHE_DIR, sg, feats_bool_in)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))
    if experiment['type'] == 'madlib':
        all_mean_index = []
        all_mean_prob = []
        all_top_words = []
        for fw in experiment['fill_words']:
            for cw in experiment['cooccur_words']:
                mean_index, mean_prob, top_words = experimenter.madlib_experiment(
                    fw, [cw])
                all_mean_index.append(mean_index)
                all_mean_prob.append(mean_prob)
                all_top_words.append(top_words)
        return all_mean_index, all_mean_prob, all_top_words
    if experiment['type'] == 'generation':
        experimenter.generation_experiment(generation_strategy, 1000)
    if experiment['type'] == 'score_generation':
        if 'read_file' in experiment.keys():
            read_file = experiment['read_file']
        else:
            read_file = True
        experimenter.score_generation(experiment['json_file'], read_file)
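A hedged sketch of how main() might be invoked for the madlib experiment described in the parameter comments; the model name, prototxt paths, and word lists are illustrative assumptions:

# Illustrative call only; every path and word list here is an assumption.
experiment = {'type': 'madlib',
              'fill_words': ['red', 'blue'],
              'cooccur_words': ['car'],
              'beam_size': 1}
all_mean_index, all_mean_prob, all_top_words = main(
    model_name=['lrcn_caffenet_iter_110000'],  # iterated over above, so pass a list
    image_net='/models/bvlc_reference_caffenet/deploy.prototxt',
    LM_net='/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',
    dataset_name='val',
    vocab='vocabulary',
    experiment=experiment,
    gpu=0)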
Example No. 8
################################################################################

image_dir = './datasets/ReferIt/ImageCLEF/images/'
proposal_dir = './data/referit_edgeboxes_top100/'
cached_context_features_dir = './data/referit_context_features/'

imcrop_dict_file = './data/metadata/referit_imcrop_dict.json'
imcrop_bbox_dict_file = './data/metadata/referit_imcrop_bbox_dict.json'
query_file = './data/metadata/referit_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)
imcrop_dict = util.io.load_json(imcrop_dict_file)
imcrop_bbox_dict = util.io.load_json(imcrop_bbox_dict_file)

# Load candidate regions (bounding boxes)
load_proposal = (candidate_regions == 'proposal_regions')
candidate_boxes_dict = {imname: None for imname in imlist}
for n_im in range(num_im):
    if n_im % 1000 == 0:
        print('loading candidate regions %d / %d' % (n_im, num_im))
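    # Continuation truncated in the source; a hedged sketch of the load step the
    # surrounding code implies (the proposal file naming under proposal_dir is an
    # assumption).
    if load_proposal:
        boxes = np.loadtxt(proposal_dir + imlist[n_im] + '.txt', ndmin=2)
        candidate_boxes_dict[imlist[n_im]] = boxes.astype(int)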
Example No. 9
import numpy as np
import skimage.io

import util.io
from captioner import Captioner

vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './datasets/ReferIt/ImageCLEF/images/'
cached_context_features_dir = './data/referit_context_features/'

image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # Gray scale to RGB
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    # RGBA to RGB
    if im.shape[2] == 4:
        im = im[:, :, :3]
    loaded_images.append(im)  # keep the image in memory for later processing