class CaptionExperiment:
    """Caption a single encoded image with a ``Captioner``.

    The captioner is constructed with ``-1`` as its fifth argument
    (presumably the device id, i.e. CPU mode -- confirm against
    ``Captioner``) and its image batch size is fixed to 1.
    """

    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        """Build the underlying captioner.

        Args:
            image_model: forwarded as the first ``Captioner`` argument.
            image_net_proto: forwarded as the second ``Captioner`` argument.
            lstm_net_proto: forwarded as the third ``Captioner`` argument.
            vocab: forwarded as the fourth ``Captioner`` argument.
        """
        self.captioner = Captioner(image_model, image_net_proto,
                                   lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, image):
        """Decode one image record and return its generated caption.

        Args:
            image: a row-like object exposing ``.image`` (a sequence of
                signed byte values holding an encoded image file) and
                ``.id``.

        Returns:
            A one-element list containing ``Row(row.id, caption)``.
        """
        row = image

        # Rebuild the raw file bytes from the signed-byte sequence.
        # array.tostring() was removed in Python 3.9; tobytes() is the
        # equivalent spelling where it exists, so prefer it and fall back
        # to tostring() on Python 2.
        packed = array.array("b", row.image)
        if hasattr(packed, "tobytes"):
            encoded = packed.tobytes()
        else:
            encoded = packed.tostring()
        pixels = np.array(Image.open(io.BytesIO(encoded)), dtype=np.uint8)

        images = [pixels]
        num_images = len(images)
        descriptors = self.captioner.compute_descriptors(images)

        # One batch covers every image, so no batching loop is required.
        # NOTE(review): temp=inf presumably makes sampling pick the most
        # likely word at each step -- confirm in Captioner.sample_captions.
        sampled, _ = self.captioner.sample_captions(
            descriptors[0:num_images], temp=float("inf"))

        # Slots stay None if the captioner returns fewer outputs than images.
        all_captions = [None] * num_images
        for index, output in zip(range(num_images), sampled):
            all_captions[index] = output

        return [Row(row.id, self.captioner.sentence(caption))
                for caption in all_captions]
class CaptionExperiment():
    """Produce a caption for one encoded image through a ``Captioner``.

    ``Captioner`` receives ``-1`` as its fifth argument (presumably the
    device id selecting CPU mode -- confirm against ``Captioner``), and the
    image batch size is set to 1.
    """

    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        """Create the captioner from the four given arguments.

        All four arguments are passed through, in order, as the first four
        positional arguments of ``Captioner``.
        """
        self.captioner = Captioner(image_model, image_net_proto,
                                   lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, image):
        """Return the generated caption for a single image record.

        Args:
            image: a row-like object with an ``.image`` attribute (sequence
                of signed byte values forming an encoded image file) and an
                ``.id`` attribute.

        Returns:
            A list with exactly one ``Row(row.id, caption)`` entry.
        """
        row = image

        # array.tostring() no longer exists on Python 3.9+; use tobytes()
        # when available and keep tostring() as the Python 2 fallback.
        signed_bytes = array.array('b', row.image)
        if hasattr(signed_bytes, 'tobytes'):
            payload = signed_bytes.tobytes()
        else:
            payload = signed_bytes.tostring()
        decoded = np.array(Image.open(io.BytesIO(payload)), dtype=np.uint8)

        images = [decoded]
        num_images = len(images)
        descriptors = self.captioner.compute_descriptors(images)

        # The single image forms the only batch, so sample once.
        # NOTE(review): temp=inf presumably selects the highest-probability
        # word at every step -- confirm in Captioner.sample_captions.
        outputs, _ = self.captioner.sample_captions(
            descriptors[0:num_images], temp=float('inf'))

        # An entry remains None when no output was produced for that slot.
        all_captions = [None] * num_images
        for slot, output in zip(range(num_images), outputs):
            all_captions[slot] = output

        return [Row(row.id, self.captioner.sentence(caption))
                for caption in all_captions]
def main():
    """Run the caption generation and retrieval experiments on COCO.

    Reads the vocabulary file, groups the ``(image_path, sentence)`` pairs
    yielded by ``CocoSequenceGenerator`` into a dict keyed by image path,
    optionally subsamples it to ``MAX_IMAGES`` images, builds a
    ``Captioner`` and a ``CaptionExperiment``, then calls
    ``generation_experiment`` followed by ``retrieval_experiment``.

    Relies on the module-level names ``COCO``, ``COCO_ANNO_PATH``,
    ``COCO_IMAGE_PATTERN``, ``BUFFER_SIZE``, ``CocoSequenceGenerator``,
    ``Captioner``, ``CaptionExperiment`` and ``np``.
    """
    MAX_IMAGES = -1  # -1 to use all images
    eval_on_test = False
    if eval_on_test:
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        ITER = 50000
        MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER
        DATASET_NAME = 'val'

    MODEL_DIR = './examples/coco_caption'
    MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME)
    IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt'
    LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    DATASET_SUBDIR = '%s/%s_ims' % (
        DATASET_NAME, str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
    DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR,
                                                     MODEL_FILENAME)
    VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt'
    DEVICE_ID = 0

    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file]

    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                               align=False, shuffle=False)

    # image path -> list of (word stream, raw sentence) reference captions.
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        dataset.setdefault(image_path, []).append(
            (sg.line_to_stream(sentence), sentence))
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('Original dataset contains %d images' % len(dataset))

    if 0 <= MAX_IMAGES < len(dataset):
        # Snapshot the keys: they are indexed below and the dict is mutated
        # while walking them, neither of which works on a Python 3 dict view.
        all_keys = list(dataset.keys())
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set(all_keys[p] for p in perm)
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print('Reduced dataset to %d images' % len(dataset))
    if MAX_IMAGES < 0:
        MAX_IMAGES = len(dataset)

    captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE,
                          VOCAB_FILE, device_id=DEVICE_ID)

    beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' %
                        generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)

    experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR,
                                     CACHE_DIR, sg)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))
    experimenter.generation_experiment(generation_strategy)
    captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000))
    experimenter.retrieval_experiment()
from captioner import Captioner

# Model definition, weights and vocabulary handed to the captioner.
vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'
gpu_id = 0

# Input image folder and the local-feature cache folder.
image_dir = './data/resized_imcrop/'
cached_local_features_dir = './data/referit_local_features/'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/training/train_imcrop_list.txt')
num_im = len(imlist)

# Read every listed image, reporting progress once per 200 images.
loaded_images = []
for n_im in range(num_im):
    if not n_im % 200:
        print('loading image %d / %d into memory' % (n_im, num_im))
    im = skimage.io.imread(image_dir + imlist[n_im])
    # A 2-D array is grayscale: replicate it into three channels.
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
import math

# Weights, network definitions and vocabulary for the retrieval model.
pretrained_weights_path = '../models/two_layer_LSTM.caffemodel'
image_net_proto = '../prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = '../prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = '../data/vocabulary.txt'
gpu_id = 0

# The retrieval model reuses the captioner module from LRCN.
captioner = Captioner(pretrained_weights_path, image_net_proto,
                      lstm_net_proto, vocab_file, gpu_id)
# Lower this batch size if GPU memory is limited.
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Prompt for a query forever; each round starts with empty accumulators.
while True:
    sum_candidate_box = []
    sum_score_box = []
    # Example query: 'bike on the red house'
    query = raw_input("type the input query: ")
    print("query =", query)
    print("Find best candidate..!")
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
        # Pre-extracted EdgeBox proposals for the i-th image.
        edgebox_file = './proposal_box/selective_box' + str(i) + '.txt'
if distractor_set == "kitchen": distractor_dir = image_dir distractor_imlist_file = tst_imlist_file else: distractor_dir = './datasets/Kitchen/images/ImageNET/' distractor_imlist_file = './data/split/kitchen_imagenet_imlist.txt' query_file = './data/metadata/kitchen_query_dict.json' vocab_file = './data/vocabulary.txt' # utilize the captioner module from LRCN lstm_net_proto = './prototxt/scrc_word_to_preds_no_spatial_no_context.prototxt' image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt' captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id) captioner.set_image_batch_size(50) vocab_dict = retriever.build_vocab_dict_from_captioner(captioner) # Load image and caption list imlist = util.io.load_str_list(tst_imlist_file) num_im = len(imlist) query_dict = util.io.load_json(query_file) # Load distractors distractor_list = util.io.load_str_list(distractor_imlist_file) num_distractors = len(distractor_list) # Sample distractor images for each test image distractor_ids_per_im = {} np.random.seed(3) # fix random seed for test repeatibility for imname in imlist:
def main(model_name='', image_net='', LM_net='', dataset_name='val',
         vocab='vocabulary', precomputed_feats=None, feats_bool_in=False,
         precomputed_h5=None, experiment=None, prev_word_restriction=False,
         gpu=0):
    """Run a madlib, generation or score-generation captioning experiment.

    Args:
        model_name: iterable of trained-model names; ``'.caffemodel'`` is
            appended to each entry and the entries are joined with ``'_'``
            to name the cache directory.
        image_net: path appended to the module-level ``home_dir`` and passed
            to ``Captioner`` as its second argument.
        LM_net: path appended to the module-level ``home_dir`` and passed to
            ``Captioner`` as its third argument.
        dataset_name: dataset split name, used for the annotation path and
            the cache sub-directory.
        vocab: base name (without ``.txt``) of the vocabulary file.
        precomputed_feats: name used for the feature cache directory;
            falls back to ``model_name`` when falsy.
        feats_bool_in: forwarded to ``CocoSequenceGenerator`` as
            ``feats_bool`` and to ``CaptionExperiment``. The original
            comment says it tells whether images are stored as pickled
            feature files rather than normal images.
        precomputed_h5: forwarded to ``Captioner`` as ``precomputed_feats``.
        experiment: dict describing the experiment. It must contain
            ``'type'`` (``'madlib'``, ``'generation'`` or
            ``'score_generation'``). Optional keys read here are
            ``'beam_size'``, ``'fill_words'``, ``'cooccur_words'``,
            ``'json_file'`` and ``'read_file'``. Defaults to
            ``{'type': 'generation'}``.
        prev_word_restriction: forwarded to ``Captioner``.
        gpu: device id forwarded to ``Captioner``.

    Returns:
        For ``'madlib'`` experiments, the tuple
        ``(all_mean_index, all_mean_prob, all_top_words)``; otherwise None.
    """
    # A dict literal as the default would be one object shared by all calls.
    if experiment is None:
        experiment = {'type': 'generation'}
    if not precomputed_feats:
        precomputed_feats = model_name

    MAX_IMAGES = -1  # -1 to use all images
    eval_on_test = False
    if eval_on_test:
        # NOTE(review): this branch yields a plain string, while the code
        # below iterates MODEL_FILENAME as a collection of names -- confirm
        # before ever enabling it.
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        MODEL_FILENAME = model_name
        DATASET_NAME = dataset_name

    MODEL_FILE = ['%s.caffemodel' % (MF) for MF in MODEL_FILENAME]
    IMAGE_NET_FILE = home_dir + image_net
    LSTM_NET_FILE = home_dir + LM_net
    DATASET_SUBDIR = '%s/%s_ims' % (
        DATASET_NAME, str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
    DATASET_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, '_'.join(MODEL_FILENAME))
    FEATURE_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, precomputed_feats)
    VOCAB_FILE = '../../examples/coco_caption/h5_data/buffer_100/%s.txt' % vocab
    DEVICE_ID = gpu

    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file]

    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    COCO_IMAGE_PATTERN = '../../data/coco/coco/images/%s2014'
    # NOTE(review): the image root is pinned to 'val' regardless of
    # DATASET_NAME -- confirm this is intentional.
    image_root = COCO_IMAGE_PATTERN % 'val'
    sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                               max_words=MAX_WORDS, align=False,
                               shuffle=False, gt_captions=True, pad=True,
                               truncate=True, split_ids=None,
                               feats_bool=feats_bool_in)

    # image path -> list of (word stream, raw sentence) reference captions.
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        dataset.setdefault(image_path, []).append(
            (sg.line_to_stream(sentence), sentence))
    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('Original dataset contains %d images' % len(dataset))

    if 0 <= MAX_IMAGES < len(dataset):
        # Snapshot the keys: they are indexed below and the dict is mutated
        # while walking them, neither of which works on a Python 3 dict view.
        all_keys = list(dataset.keys())
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set(all_keys[p] for p in perm)
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print('Reduced dataset to %d images' % len(dataset))
    if MAX_IMAGES < 0:
        MAX_IMAGES = len(dataset)

    captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE,
                          VOCAB_FILE, device_id=DEVICE_ID,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)

    beam_size = experiment.get('beam_size', 1)
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' %
                        generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)

    experimenter = CaptionExperiment(captioner, dataset, FEATURE_CACHE_DIR,
                                     CACHE_DIR, sg, feats_bool_in)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))

    experiment_type = experiment['type']
    if experiment_type == 'madlib':
        # One madlib run per (fill word, co-occurring word) combination.
        all_mean_index = []
        all_mean_prob = []
        all_top_words = []
        for fw in experiment['fill_words']:
            for cw in experiment['cooccur_words']:
                mean_index, mean_prob, top_words = (
                    experimenter.madlib_experiment(fw, [cw]))
                all_mean_index.append(mean_index)
                all_mean_prob.append(mean_prob)
                all_top_words.append(top_words)
        return all_mean_index, all_mean_prob, all_top_words
    elif experiment_type == 'generation':
        experimenter.generation_experiment(generation_strategy, 1000)
    elif experiment_type == 'score_generation':
        read_file = experiment.get('read_file', True)
        experimenter.score_generation(experiment['json_file'], read_file)
################################################################################
# Dataset locations and metadata files.
image_dir = './datasets/ReferIt/ImageCLEF/images/'
proposal_dir = './data/referit_edgeboxes_top100/'
cached_context_features_dir = './data/referit_context_features/'

imcrop_dict_file = './data/metadata/referit_imcrop_dict.json'
imcrop_bbox_dict_file = './data/metadata/referit_imcrop_bbox_dict.json'
query_file = './data/metadata/referit_query_dict.json'
vocab_file = './data/vocabulary.txt'

# The model is driven through the captioner module from LRCN.
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto,
                      lstm_net_proto, vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Image list plus the query / crop / bounding-box dictionaries.
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)
imcrop_dict = util.io.load_json(imcrop_dict_file)
imcrop_bbox_dict = util.io.load_json(imcrop_bbox_dict_file)

# Candidate regions (bounding boxes): every image starts with no boxes.
load_proposal = (candidate_regions == 'proposal_regions')
candidate_boxes_dict = dict.fromkeys(imlist)
for n_im in range(num_im):
    # Progress report once per 1000 images.
    if not n_im % 1000:
        print('loading candidate regions %d / %d' % (n_im, num_im))
from captioner import Captioner

# Model definition, weights and vocabulary handed to the captioner.
vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'
gpu_id = 0

# Input image folder and the context-feature cache folder.
image_dir = './datasets/ReferIt/ImageCLEF/images/'
cached_context_features_dir = './data/referit_context_features/'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Read every listed image, reporting progress once per 200 images.
loaded_images = []
for n_im in range(num_im):
    if not n_im % 200:
        print('loading image %d / %d into memory' % (n_im, num_im))
    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # A 2-D array is grayscale: replicate it into three channels.
    if im.ndim == 2:
        im = np.tile(im[..., np.newaxis], (1, 1, 3))
    # RGBA to RGB