def __init__(self, detector_cfg_path, detector_weights_path, bert_cfg_path,
             bert_weights_path, object_vocab_path, tacotron_weights_path,
             waveglow_cfg_path, waveglow_weights_path, cpu_device, gpu_device,
             fc_layer=0, max_caption_length=67, sampling_rate=22050):
    """
    Build the captioning model and the text-to-speech model.

    args:
        detector_cfg_path: path to the detector config
        detector_weights_path: path to the detector weights
        bert_cfg_path: path to the bert decoder config
        bert_weights_path: path to the bert decoder weights
        object_vocab_path: path to the object vocabulary, forwarded to
                           the Captioner
        tacotron_weights_path: path to the tacotron weights
        waveglow_cfg_path: path to the waveglow config, forwarded to TTS
        waveglow_weights_path: path to the waveglow weights
        cpu_device: The cpu device to run some parts of visualization
        gpu_device: The gpu device to run the bulk of computations,
                    currently requires at least 1 GPU device
        fc_layer: the fully connected layer from the detector to extract
                  features from, 0-indexed
        max_caption_length: the maximum number of tokens the caption can be
        sampling_rate: the rate that audio representations are sampled
                       per second
    """
    self.captioner = Captioner(
        detector_cfg_path,
        detector_weights_path,
        bert_cfg_path,
        bert_weights_path,
        object_vocab_path,
        cpu_device,
        gpu_device,
        fc_layer,
        max_caption_length,
    )

    # The TTS model runs on the GPU when one was supplied, otherwise on CPU.
    if gpu_device:
        tts_device = gpu_device
    else:
        tts_device = cpu_device

    self.tts = TTS(
        tacotron_weights_path,
        waveglow_cfg_path,
        waveglow_weights_path,
        tts_device,
        sampling_rate,
    )
def prepare_captioner(self):
    """Create ``self.captioner`` from the settings stored in ``config.yaml``.

    Reads ``config.yaml`` from the current working directory, resolves the
    ``checkpoint_path`` and ``vocab_file_path`` entries relative to
    ``project_root_dir`` and hands them, together with ``self.sess``, to
    ``Captioner``.

    Raises:
        IOError/OSError: if ``config.yaml`` cannot be opened.
        KeyError: if one of the expected configuration keys is missing.
    """
    # Use a context manager so the config file handle is closed right after
    # parsing instead of being left open until garbage collection.
    with open('config.yaml', 'r') as config_file:
        # NOTE: FullLoader is kept for compatibility; config.yaml is a local,
        # trusted file. Do not point this at untrusted input.
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    root_dir = config['project_root_dir']
    checkpoint_path = os.path.join(root_dir, config['checkpoint_path'])
    vocab_file_path = os.path.join(root_dir, config['vocab_file_path'])
    self.captioner = Captioner(self.sess, checkpoint_path, vocab_file_path)
def build_captioner(model_name, image_net, LM_net, dataset_name='coco',
                    split_name='val', vocab='vocabulary', precomputed_h5=None,
                    gpu=0, prev_word_restriction=True):
    """Build a Captioner plus the sequence generator and dataset it runs on.

    Args:
        model_name: iterable of model names; '.caffemodel' is appended to each.
        image_net: image network file relative to ``home_dir``; a falsy value
            means no image network file is passed to the Captioner.
        LM_net: language model network file relative to ``home_dir``.
        dataset_name: dataset used to look up vocabulary, annotations, images.
        split_name: split used for the same lookups.
        vocab: vocabulary file name, without the '.txt' extension.
        precomputed_h5: forwarded to Captioner as ``precomputed_feats``.
        gpu: forwarded to Captioner as ``device_id``.
        prev_word_restriction: forwarded to Captioner.

    Returns:
        Tuple ``(captioner, sg, dataset)`` where ``dataset`` maps each image
        path to a list of ``(stream, sentence)`` pairs.
    """
    weight_files = ['%s.caffemodel' % weights for weights in model_name]
    image_proto = home_dir + image_net if image_net else None
    language_proto = home_dir + LM_net

    vocab_dir = determine_vocab_folder(dataset_name, split_name)
    vocab_path = '%s/%s.txt' % (vocab_dir, vocab)
    with open(vocab_path, 'r') as handle:
        words = [entry.strip() for entry in handle.readlines()]

    anno_path = determine_anno_path(dataset_name, split_name)
    image_root = determine_image_pattern(dataset_name, split_name)
    sg = build_sequence_generator(anno_path, 50, image_root,
                                  vocab=words,
                                  max_words=50,
                                  align=False,
                                  shuffle=False,
                                  gt_captions=True,
                                  pad=True,
                                  truncate=True,
                                  split_ids=None)

    # Group every ground-truth sentence under the image it describes.
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        entries = dataset.setdefault(image_path, [])
        entries.append((sg.line_to_stream(sentence), sentence))
    # Single parenthesised expression: prints the same text as the print
    # statement under Python 2.
    print('Original dataset contains %d images' % len(dataset))

    captioner = Captioner(weight_files, image_proto, language_proto,
                          vocab_path,
                          device_id=gpu,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    return captioner, sg, dataset
def main(image_path, nsamples, temperature):
    """Caption an image and print GPT-2 generated questions about it.

    Args:
        image_path: image passed to ``Captioner.caption``.
        nsamples: forwarded to ``gpt2_gen_questions`` as ``nsamples``.
        temperature: forwarded to ``gpt2_gen_questions`` as ``temperature``.

    Raises:
        IOError/OSError: if ``config.yaml`` cannot be opened.
        KeyError: if one of the expected configuration keys is missing.
    """
    # Generate questions
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess)

    # Use a context manager so the config file handle is closed right after
    # parsing instead of being left open for the rest of the run.
    with open('config.yaml', 'r') as config_file:
        # NOTE: FullLoader is kept for compatibility; config.yaml is a local,
        # trusted file. Do not point this at untrusted input.
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    root_dir = config['project_root_dir']
    checkpoint_path = os.path.join(root_dir, config['checkpoint_path'])
    vocab_file_path = os.path.join(root_dir, config['vocab_file_path'])

    # The captioner shares the session that holds the GPT-2 model.
    captioner = Captioner(sess, checkpoint_path, vocab_file_path)
    caption = captioner.caption(image_path)
    questions = gpt2_gen_questions(sess, caption, nsamples=nsamples,
                                   temperature=temperature)

    # Print generated questions
    print('----------\nQuestions:')
    for i, question in enumerate(questions):
        print('%d. %s' % (i + 1, question))
def main():
    """Run the caption generation and retrieval experiments on COCO.

    All settings are hard-coded below. The function loads the vocabulary,
    groups the ground-truth sentences by image, optionally sub-samples the
    images, builds the Captioner and then runs the generation experiment
    followed by the retrieval experiment.
    """
    max_images = -1  # -1 to use all images
    tag = 'coco_2layer_factored'
    if max_images >= 0:
        tag += '_%dimages' % max_images

    eval_on_test = False
    if eval_on_test:
        n_iter = 100000
        model_filename = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % n_iter
        dataset_name = 'test'
    else:  # eval on val
        n_iter = 50000
        model_filename = 'lrcn_finetune_iter_%d' % n_iter
        dataset_name = 'val'
    tag += '_%s' % dataset_name

    model_dir = './examples/coco_caption'
    model_file = '%s/%s.caffemodel' % (model_dir, model_filename)
    image_net_file = './models/bvlc_reference_caffenet/deploy.prototxt'
    lstm_net_file = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    net_tag = '%s_%s' % (tag, model_filename)  # not used further below
    images_label = str(max_images) if max_images >= 0 else 'all'
    dataset_subdir = '%s/%s_ims' % (dataset_name, images_label)
    dataset_cache_dir = './retrieval_cache/%s/%s' % (dataset_subdir,
                                                     model_filename)
    vocab_path = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt'
    device_id = 0

    with open(vocab_path, 'r') as handle:
        vocab = [entry.strip() for entry in handle.readlines()]

    coco = COCO(COCO_ANNO_PATH % dataset_name)
    image_root = COCO_IMAGE_PATTERN % dataset_name
    sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                               align=False, shuffle=False)

    # Group every ground-truth sentence under the image it describes.
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        entries = dataset.setdefault(image_path, [])
        entries.append((sg.line_to_stream(sentence), sentence))
    # Single parenthesised expression: prints the same text as the print
    # statement under Python 2.
    print('Original dataset contains %d images' % len(dataset))

    if 0 <= max_images < len(dataset):
        # Keep a random subset of max_images images and drop the rest.
        all_keys = list(dataset.keys())
        perm = np.random.permutation(len(all_keys))[:max_images]
        chosen_keys = set(all_keys[p] for p in perm)
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print('Reduced dataset to %d images' % len(dataset))
    if max_images < 0:
        max_images = len(dataset)

    captioner = Captioner(model_file, image_net_file, lstm_net_file,
                          vocab_path, device_id=device_id)

    beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    strategy_type = generation_strategy['type']
    if strategy_type == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif strategy_type == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' % strategy_type)
    cache_dir = '%s/%s' % (dataset_cache_dir, strategy_name)

    experimenter = CaptionExperiment(captioner, dataset, dataset_cache_dir,
                                     cache_dir, sg)
    captioner.set_image_batch_size(min(100, max_images))
    experimenter.generation_experiment(generation_strategy)
    captioner.set_caption_batch_size(min(max_images * 5, 1000))
    experimenter.retrieval_experiment()
from captioner import Captioner
import retriever
import math
#import image_convert

# Interactive retrieval demo: builds a Captioner once, then repeatedly reads
# a text query and walks over the image tiles under ./splited_image/.

#im_file = './demo_data/test2.jpg'
pretrained_weights_path = '../models/two_layer_LSTM.caffemodel'
gpu_id = 0

# Initialize the retrieval model
image_net_proto = '../prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = '../prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = '../data/vocabulary.txt'
# utilize the captioner module from LRCN
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id)
captioner.set_image_batch_size( 50)  # decrease the number if your GPU memory is small
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Endless prompt loop: one query per iteration.
while (1):
    # Reset the per-query accumulators.
    sum_candidate_box = []
    sum_score_box = []
    query = raw_input("type the input query: ")
    #query = 'bike on the red house'
    print("query =", query)
    print("Find best candidate..!")
    # Visit the eight tiles test0.jpg .. test7.jpg.
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
def main(model_name='', image_net='', LM_net='', dataset_name='val',
         vocab='vocabulary', precomputed_feats=None, feats_bool_in=False,
         precomputed_h5=None, experiment={'type': 'generation'},
         prev_word_restriction=False, gpu=0):
    """Run a madlib, generation or score_generation caption experiment.

    Args:
        model_name: the trained model(s), path relative to
            /home/lisa/caffe-LSTM-video; iterated over, with '.caffemodel'
            appended to each entry.
        image_net: the model to extract length 1000 image features, path
            relative to the snapshots folder; do not include "caffemodel".
        LM_net: language model network file, appended to ``home_dir``.
        dataset_name: indicates which dataset to look at.
        vocab: indicates which vocabulary file to look at.
        precomputed_feats: name used for the feature cache directory;
            defaults to ``model_name`` when falsy.
        feats_bool_in: whether the images are saved as pickle feature files
            or are normal images.
        precomputed_h5: forwarded to Captioner as ``precomputed_feats``.
        experiment: dict which has all info needed for experiments. Must have
            field 'type' which indicates madlib versus generation expt.
            The default dict is shared between calls and is only read here.
        prev_word_restriction: forwarded to Captioner.
        gpu: forwarded to Captioner as ``device_id``.

    Returns:
        ``(all_mean_index, all_mean_prob, all_top_words)`` for a 'madlib'
        experiment; ``None`` otherwise.
    """
    if not precomputed_feats:
        precomputed_feats = model_name

    max_images = -1  # -1 to use all images
    tag = 'coco_2layer_factored'
    if max_images >= 0:
        tag += '_%dimages' % max_images

    eval_on_test = False
    if eval_on_test:
        n_iter = 100000
        model_filename = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % n_iter
        split = 'test'
    else:  # eval on val
        model_filename = model_name
        split = dataset_name
    tag += '_%s' % split

    #MODEL_DIR = home_dir + '/examples/coco_caption/snapshots'
    model_dir = ''
    model_files = ['%s.caffemodel' % (weights) for weights in model_filename]
    #IMAGE_NET_FILE = home_dir + '/models/bvlc_reference_caffenet/deploy.prototxt'
    image_net_file = home_dir + image_net
    #LSTM_NET_FILE = home_dir + '/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    lstm_net_file = home_dir + LM_net
    images_label = str(max_images) if max_images >= 0 else 'all'
    dataset_subdir = '%s/%s_ims' % (split, images_label)
    #DATASET_CACHE_DIR = home_dir + '/retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
    dataset_cache_dir = '/x/lisaanne/retrieval_cache/%s/%s' % (
        dataset_subdir, '_'.join(model_filename))
    feature_cache_dir = '/x/lisaanne/retrieval_cache/%s/%s' % (
        dataset_subdir, precomputed_feats)
    vocab_path = '../../examples/coco_caption/h5_data/buffer_100/%s.txt' % vocab

    with open(vocab_path, 'r') as handle:
        words = [entry.strip() for entry in handle.readlines()]

    coco = COCO(COCO_ANNO_PATH % split)
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images/%s2014'
    image_pattern = '../../data/coco/coco/images/%s2014'
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images2/%s2014'
    #image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    image_root = image_pattern % 'val'
    #image_root = '/z/lisaanne/imageData/imagenet/'
    sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root,
                               vocab=words,
                               max_words=MAX_WORDS,
                               align=False,
                               shuffle=False,
                               gt_captions=True,
                               pad=True,
                               truncate=True,
                               split_ids=None,
                               feats_bool=feats_bool_in)

    # Group every ground-truth sentence under the image it describes.
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        entries = dataset.setdefault(image_path, [])
        entries.append((sg.line_to_stream(sentence), sentence))
    # Single parenthesised expression: prints the same text as the print
    # statement under Python 2.
    print('Original dataset contains %d images' % len(dataset))

    if 0 <= max_images < len(dataset):
        # Keep a random subset of max_images images and drop the rest.
        all_keys = list(dataset.keys())
        perm = np.random.permutation(len(all_keys))[:max_images]
        chosen_keys = set(all_keys[p] for p in perm)
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print('Reduced dataset to %d images' % len(dataset))
    if max_images < 0:
        max_images = len(dataset)

    captioner = Captioner(model_files, image_net_file, lstm_net_file,
                          vocab_path,
                          device_id=gpu,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)

    beam_size = experiment['beam_size'] if 'beam_size' in experiment else 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    strategy_type = generation_strategy['type']
    if strategy_type == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif strategy_type == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' % strategy_type)
    cache_dir = '%s/%s' % (dataset_cache_dir, strategy_name)

    experimenter = CaptionExperiment(captioner, dataset, feature_cache_dir,
                                     cache_dir, sg, feats_bool_in)
    captioner.set_image_batch_size(min(100, max_images))

    if experiment['type'] == 'madlib':
        # One madlib run per (fill word, co-occurring word) pair.
        all_mean_index = []
        all_mean_prob = []
        all_top_words = []
        for fill_word in experiment['fill_words']:
            for cooccur_word in experiment['cooccur_words']:
                mean_index, mean_prob, top_words = (
                    experimenter.madlib_experiment(fill_word, [cooccur_word]))
                all_mean_index.append(mean_index)
                all_mean_prob.append(mean_prob)
                all_top_words.append(top_words)
        return all_mean_index, all_mean_prob, all_top_words

    if experiment['type'] == 'generation':
        experimenter.generation_experiment(generation_strategy, 1000)

    if experiment['type'] == 'score_generation':
        read_file = experiment['read_file'] if 'read_file' in experiment else True
        experimenter.score_generation(experiment['json_file'], read_file)
import caffe import util from captioner import Captioner vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel' gpu_id = 0 image_dir = './datasets/ReferIt/ImageCLEF/images/' cached_context_features_dir = './data/referit_context_features/' image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt' lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt' vocab_file = './data/vocabulary.txt' captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id) batch_size = 100 captioner.set_image_batch_size(batch_size) imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt') num_im = len(imlist) # Load all images into memory loaded_images = [] for n_im in range(num_im): if n_im % 200 == 0: print('loading image %d / %d into memory' % (n_im, num_im)) im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg') # Gray scale to RGB if im.ndim == 2:
def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
    """Wrap a Captioner that processes one image per batch.

    The four arguments are forwarded unchanged to ``Captioner``; the fifth
    positional argument passed to it is fixed to -1.
    """
    fifth_arg = -1
    captioner = Captioner(image_model, image_net_proto, lstm_net_proto,
                          vocab, fifth_arg)
    self.captioner = captioner
    self.captioner.set_image_batch_size(1)
def main():
    """Caption the COCO Karpathy validation images and evaluate the captions.

    Parses the command-line options, builds a ``Captioner`` from the detector
    and Bert decoder files, captions every batch produced by
    ``CocoCaptionsKarpathyValidImgs`` and hands the predictions to
    ``language_eval`` together with the annotation file.

    Raises:
        RuntimeError: if no CUDA device is available.
    """
    parser = argparse.ArgumentParser()

    # Model Setup
    parser.add_argument("--detector_config", default=None, type=str,
                        help="detector config file path.")
    parser.add_argument("--detector_weights", default=None, type=str,
                        help="pretrained detector weights.")
    parser.add_argument("--decoder_config", default=None, type=str,
                        help="Bert decoder config file path.")
    parser.add_argument("--decoder_weights", default=None, type=str,
                        help="pretrained Bert decoder weights.")
    parser.add_argument("--object_vocab", default=None, type=str,
                        help="object vocabulary, maps object ids to object names")

    # For COCO
    parser.add_argument('--coco_root', type=str, default='~/Datasets/coco')
    parser.add_argument("--coco_data_info",
                        default='annotations/dataset_coco.json', type=str,
                        help="The input data file name.")
    parser.add_argument("--coco_ann_file",
                        default='annotations/captions_val2014.json', type=str,
                        help="caption annotations file (i.e. answer key)")
    parser.add_argument('--valid_jpgs_file',
                        default='annotations/coco_valid_jpgs.json', type=str,
                        help="lists the valid jpgs")

    # For data pipeline
    parser.add_argument('--batch_size', type=int, default=1,
                        help="Batch size for decoding. "
                             "Highly recommended to be a multiple of 8")
    parser.add_argument('--dl_workers', type=int, default=0,
                        help="Number of dataloader workers")

    # For reproducibility
    parser.add_argument('--seed', type=int, default=-1,
                        help="random seed for initialization")

    args = parser.parse_args()

    # An explicit raise instead of `assert`, which is removed under `python -O`.
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is required but no GPU device is available")
    cpu_device = torch.device("cpu")
    gpu_device = torch.device("cuda:0")

    # fix random seed (optional)
    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # The default root starts with '~', which neither open() nor os.path.join
    # expands; resolve it once so every later path is usable.
    coco_root = os.path.expanduser(args.coco_root)

    predictions = []
    # Pure inference: keep model construction and decoding out of autograd.
    with torch.no_grad():
        captioner = Captioner(args.detector_config,
                              args.detector_weights,
                              args.decoder_config,
                              args.decoder_weights,
                              args.object_vocab,
                              cpu_device,
                              gpu_device)
        # TODO: optimize for amp, data-parallel

        torch.cuda.empty_cache()  # Empty everything

        valid_dataset = CocoCaptionsKarpathyValidImgs(coco_root)
        valid_dl = DataLoader(valid_dataset,
                              batch_size=args.batch_size,
                              collate_fn=ccc_karpathy_valid_collate,
                              num_workers=args.dl_workers,
                              pin_memory=True)
        total_batch = math.ceil(len(valid_dataset) / args.batch_size)

        print('start the caption evaluation...')
        with tqdm(total=total_batch) as pbar:
            for img_ids, img_npys in valid_dl:
                captions = captioner.forward(img_npys)
                predictions.extend(
                    {'image_id': img_id, 'caption': caption}
                    for img_id, caption in zip(img_ids, captions))
                # The bar total counts batches, so advance once per batch.
                pbar.update(1)

    language_eval(preds=predictions,
                  annFile=os.path.join(coco_root, args.coco_ann_file),
                  model_id='0',
                  split='val')