def prepare_captioner(self):
    config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    checkpoint_path = os.path.join(config['project_root_dir'], config['checkpoint_path'])
    vocab_file_path = os.path.join(config['project_root_dir'], config['vocab_file_path'])
    self.captioner = Captioner(self.sess, checkpoint_path, vocab_file_path)
def __init__(self, detector_cfg_path, detector_weights_path, bert_cfg_path, bert_weights_path,
             object_vocab_path, tacotron_weights_path, waveglow_cfg_path, waveglow_weights_path,
             cpu_device, gpu_device, fc_layer=0, max_caption_length=67, sampling_rate=22050):
    """
    args:
        detector_cfg_path: path to the detector config
        detector_weights_path: path to the detector weights
        bert_cfg_path: path to the bert decoder config
        bert_weights_path: path to the bert decoder weights
        object_vocab_path: path to the object vocabulary (maps object ids to object names)
        tacotron_weights_path: path to the tacotron weights
        waveglow_cfg_path: path to the waveglow config
        waveglow_weights_path: path to the waveglow weights
        cpu_device: the cpu device used to run some parts of visualization
        gpu_device: the gpu device used to run the bulk of computations; currently requires
            at least 1 GPU device
        fc_layer: the fully connected layer of the detector to extract features from, 0-indexed
        max_caption_length: the maximum number of tokens a caption can contain
        sampling_rate: the audio sampling rate, in samples per second
    """
    self.captioner = Captioner(detector_cfg_path, detector_weights_path, bert_cfg_path,
                               bert_weights_path, object_vocab_path, cpu_device, gpu_device,
                               fc_layer, max_caption_length)
    device = gpu_device if gpu_device else cpu_device
    self.tts = TTS(tacotron_weights_path, waveglow_cfg_path, waveglow_weights_path, device,
                   sampling_rate)
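# A minimal instantiation sketch for the constructor above. The class name
# SpeakingCaptioner and every path below are placeholders (not names from this
# codebase); the cpu/gpu arguments are assumed to be torch devices, as in the
# evaluation script later in this collection.
import torch

speaker = SpeakingCaptioner(
    detector_cfg_path='configs/detector.yaml',
    detector_weights_path='weights/detector.pth',
    bert_cfg_path='configs/bert_decoder.json',
    bert_weights_path='weights/bert_decoder.pth',
    object_vocab_path='data/objects_vocab.txt',
    tacotron_weights_path='weights/tacotron2.pt',
    waveglow_cfg_path='configs/waveglow.json',
    waveglow_weights_path='weights/waveglow.pt',
    cpu_device=torch.device('cpu'),
    gpu_device=torch.device('cuda:0'))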
def main(image_path, nsamples, temperature):
    # Generate questions
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess)
    config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    checkpoint_path = os.path.join(config['project_root_dir'], config['checkpoint_path'])
    vocab_file_path = os.path.join(config['project_root_dir'], config['vocab_file_path'])
    captioner = Captioner(sess, checkpoint_path, vocab_file_path)
    caption = captioner.caption(image_path)
    questions = gpt2_gen_questions(sess, caption, nsamples=nsamples, temperature=temperature)

    # Print generated questions
    print('----------\nQuestions:')
    for i, question in enumerate(questions):
        print('%d. %s' % (i + 1, question))
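# A hedged invocation sketch for main() above: the image path is a placeholder
# and the sampling settings are illustrative, not values from this project.
if __name__ == '__main__':
    main('example.jpg', nsamples=3, temperature=0.7)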
class CaptionExperiment:
    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, image):
        row = image
        bytes = array.array("b", row.image).tostring()
        im = Image.open(io.BytesIO(bytes))
        image = np.array(im, dtype=np.uint8)
        dataset = [image]
        descriptors = self.captioner.compute_descriptors(dataset)
        images = dataset
        num_images = len(images)
        batch_size = num_images

        # Generate captions for all images.
        all_captions = [None] * num_images
        for image_index in xrange(0, num_images, batch_size):
            batch_end_index = min(image_index + batch_size, num_images)
            output_captions, output_probs = self.captioner.sample_captions(
                descriptors[image_index:batch_end_index], temp=float("inf"))
            for batch_index, output in zip(range(image_index, batch_end_index),
                                           output_captions):
                all_captions[batch_index] = output

        # Collect the model's captions as a list of len(images) strings.
        # For each image, write out the highest probability caption.
        model_captions = [""] * len(images)
        for image_index, image in enumerate(images):
            caption = self.captioner.sentence(all_captions[image_index])
            model_captions[image_index] = caption
        generation_result = [Row(row.id, model_captions[image_index])
                             for (image_index, image_path) in enumerate(images)]
        return generation_result
def build_captioner(model_name, image_net, LM_net, dataset_name='coco', split_name='val',
                    vocab='vocabulary', precomputed_h5=None, gpu=0,
                    prev_word_restriction=True):
    model_files = ['%s.caffemodel' % (mf) for mf in model_name]
    if image_net:
        image_net_file = home_dir + image_net
    else:
        image_net_file = None
    lstm_net_file = home_dir + LM_net
    vocab_file = '%s/%s.txt' % (determine_vocab_folder(dataset_name, split_name), vocab)
    device_id = gpu
    with open(vocab_file, 'r') as vocab_file_read:
        vocab = [line.strip() for line in vocab_file_read.readlines()]
    anno_path = determine_anno_path(dataset_name, split_name)
    image_root = determine_image_pattern(dataset_name, split_name)

    sg = build_sequence_generator(anno_path, 50, image_root, vocab=vocab,
                                  max_words=50, align=False, shuffle=False,
                                  gt_captions=True, pad=True, truncate=True,
                                  split_ids=None)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())

    captioner = Captioner(model_files, image_net_file, lstm_net_file, vocab_file,
                          device_id=device_id, precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    return captioner, sg, dataset
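# A hedged usage sketch for build_captioner() above. The snapshot name and prototxt
# paths are placeholders; only the argument names follow the signature above, and
# set_image_batch_size() mirrors the Captioner calls shown elsewhere in this collection.
captioner, sg, dataset = build_captioner(
    model_name=['lrcn_finetune_iter_50000'],   # hypothetical snapshot name (no ".caffemodel")
    image_net='/models/bvlc_reference_caffenet/deploy.prototxt',
    LM_net='/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',
    dataset_name='coco', split_name='val', gpu=0)
captioner.set_image_batch_size(1)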
def main(model_name='', image_net='', LM_net='', dataset_name='val', vocab='vocabulary',
         precomputed_feats=None, feats_bool_in=False, precomputed_h5=None,
         experiment={'type': 'generation'}, prev_word_restriction=False, gpu=0):
    # model_name is the trained model: path relative to /home/lisa/caffe-LSTM-video
    # image_net is the model to extract length-1000 image features: path relative to the
    #   snapshots folder; no need to include ".caffemodel"
    # dataset_name indicates which dataset to look at
    # vocab indicates which vocabulary file to look at
    # feats_bool_in is whether the images are saved as pickled feature files or as normal images
    # experiment: dict with all info needed for experiments. Must have a 'type' field that
    #   indicates a madlib versus generation experiment.

    if not precomputed_feats:
        precomputed_feats = model_name

    MAX_IMAGES = -1  # -1 to use all images
    TAG = 'coco_2layer_factored'
    if MAX_IMAGES >= 0:
        TAG += '_%dimages' % MAX_IMAGES
    eval_on_test = False
    if eval_on_test:
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        MODEL_FILENAME = model_name
        DATASET_NAME = dataset_name
    TAG += '_%s' % DATASET_NAME

    # MODEL_DIR = home_dir + '/examples/coco_caption/snapshots'
    MODEL_DIR = ''
    MODEL_FILE = ['%s.caffemodel' % (MF) for MF in MODEL_FILENAME]
    # IMAGE_NET_FILE = home_dir + '/models/bvlc_reference_caffenet/deploy.prototxt'
    IMAGE_NET_FILE = home_dir + image_net
    # LSTM_NET_FILE = home_dir + '/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    LSTM_NET_FILE = home_dir + LM_net
    DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME,
                                    str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
    # DATASET_CACHE_DIR = home_dir + '/retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
    DATASET_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, '_'.join(MODEL_FILENAME))
    FEATURE_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, precomputed_feats)
    VOCAB_FILE = '../../examples/coco_caption/h5_data/buffer_100/%s.txt' % vocab
    DEVICE_ID = gpu
    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file.readlines()]
    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    # COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images/%s2014'
    COCO_IMAGE_PATTERN = '../../data/coco/coco/images/%s2014'
    # COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images2/%s2014'
    # image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    image_root = COCO_IMAGE_PATTERN % 'val'
    # image_root = '/z/lisaanne/imageData/imagenet/'
    sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                               max_words=MAX_WORDS, align=False, shuffle=False,
                               gt_captions=True, pad=True, truncate=True,
                               split_ids=None, feats_bool=feats_bool_in)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())

    if 0 <= MAX_IMAGES < len(dataset.keys()):
        all_keys = dataset.keys()
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set([all_keys[p] for p in perm])
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print 'Reduced dataset to %d images' % len(dataset.keys())
    if MAX_IMAGES < 0:
        MAX_IMAGES = len(dataset.keys())

    captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE,
                          device_id=DEVICE_ID, precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)

    if 'beam_size' in experiment.keys():
        beam_size = experiment['beam_size']
    else:
        beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' % generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
    experimenter = CaptionExperiment(captioner, dataset, FEATURE_CACHE_DIR, CACHE_DIR, sg,
                                     feats_bool_in)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))

    if experiment['type'] == 'madlib':
        all_mean_index = []
        all_mean_prob = []
        all_top_words = []
        for fw in experiment['fill_words']:
            for cw in experiment['cooccur_words']:
                mean_index, mean_prob, top_words = experimenter.madlib_experiment(fw, [cw])
                all_mean_index.append(mean_index)
                all_mean_prob.append(mean_prob)
                all_top_words.append(top_words)
        return all_mean_index, all_mean_prob, all_top_words
    if experiment['type'] == 'generation':
        experimenter.generation_experiment(generation_strategy, 1000)
    if experiment['type'] == 'score_generation':
        if 'read_file' in experiment.keys():
            read_file = experiment['read_file']
        else:
            read_file = True
        experimenter.score_generation(experiment['json_file'], read_file)
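# A hedged usage sketch for main() above. The snapshot name and prototxt paths are
# placeholders; the experiment dict uses the keys that main() reads
# ('type', 'fill_words', 'cooccur_words', and optionally 'beam_size').
all_mean_index, all_mean_prob, all_top_words = main(
    model_name=['lrcn_finetune_iter_50000'],   # hypothetical snapshot name
    image_net='/models/bvlc_reference_caffenet/deploy.prototxt',
    LM_net='/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',
    dataset_name='val', gpu=0,
    experiment={'type': 'madlib', 'fill_words': ['red'], 'cooccur_words': ['car']})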
def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
    self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab, -1)
    self.captioner.set_image_batch_size(1)
tst_imlist_file = './data/split/referit_test_imlist.txt'

################################################################################

image_dir = './datasets/ReferIt/ImageCLEF/images/'
proposal_dir = './data/referit_edgeboxes_top100/'
cached_context_features_dir = './data/referit_context_features/'
imcrop_dict_file = './data/metadata/referit_imcrop_dict.json'
imcrop_bbox_dict_file = './data/metadata/referit_imcrop_bbox_dict.json'
query_file = './data/metadata/referit_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)
imcrop_dict = util.io.load_json(imcrop_dict_file)
imcrop_bbox_dict = util.io.load_json(imcrop_bbox_dict_file)

# Load candidate regions (bounding boxes)
load_proposal = (candidate_regions == 'proposal_regions')
candidate_boxes_dict = {imname: None for imname in imlist}
for n_im in range(num_im):
    if n_im % 1000 == 0:
import util
from captioner import Captioner

vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './data/resized_imcrop/'
cached_local_features_dir = './data/referit_local_features/'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/training/train_imcrop_list.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))
    im = skimage.io.imread(image_dir + imlist[n_im])
    # Gray scale to RGB
image_dir = './datasets/Kitchen/images/Kitchen/'
if distractor_set == "kitchen":
    distractor_dir = image_dir
    distractor_imlist_file = tst_imlist_file
else:
    distractor_dir = './datasets/Kitchen/images/ImageNET/'
    distractor_imlist_file = './data/split/kitchen_imagenet_imlist.txt'
query_file = './data/metadata/kitchen_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
lstm_net_proto = './prototxt/scrc_word_to_preds_no_spatial_no_context.prototxt'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)

# Load distractors
distractor_list = util.io.load_str_list(distractor_imlist_file)
num_distractors = len(distractor_list)

# Sample distractor images for each test image
distractor_ids_per_im = {}
np.random.seed(3)  # fix random seed for test repeatability
class WindowWidget(QtWidgets.QWidget):
    def __init__(self, voice):
        super(WindowWidget, self).__init__()
        self.sess = None
        self.captioner = None
        self.prepare_questioner()
        self.prepare_captioner()
        self.threadpool = QtCore.QThreadPool()
        self.questioner_running = False
        self.applying_output = False
        global graph
        graph = tf.get_default_graph()
        if voice:
            self.tts = TTS()
        else:
            self.tts = None

        # Viewing region
        self.viewing_region = QtWidgets.QLabel(self)
        layout = QtWidgets.QHBoxLayout()
        layout.addWidget(self.viewing_region)

        # Load button
        self.load_button = QtWidgets.QPushButton('Load image')
        self.load_button.clicked.connect(self.load_button_clicked)
        right_sidebar = QtWidgets.QVBoxLayout()
        right_sidebar.addWidget(self.load_button)

        # Extra instructions region
        self.instr_region = QtWidgets.QLabel(self)
        self.instr_region.setText('Or drop an image onto this window.')
        right_sidebar.addWidget(self.instr_region)
        right_sidebar.addStretch()

        # Progress bar
        self.progress = QtWidgets.QProgressBar(self)
        self.progress.setMaximum(100)
        right_sidebar.addWidget(self.progress)
        self.progress.hide()

        # Text region
        self.text_region = QtWidgets.QLabel(self)
        self.text_region.setFrameStyle(QtWidgets.QFrame.Panel | QtWidgets.QFrame.Sunken)
        self.text_region.setWordWrap(True)
        self.text_region.setMargin(8)
        self.text_region.setText('...')
        right_sidebar.addWidget(self.text_region)
        layout.addLayout(right_sidebar)

        # Launch
        self.setWindowIcon(QtGui.QIcon(os.path.join('icon', 'question.png')))
        self.setLayout(layout)
        self.setAcceptDrops(True)
        self.show()

    def prepare_questioner(self):
        self.sess = gpt2.start_tf_sess()
        gpt2.load_gpt2(self.sess)

    def prepare_captioner(self):
        config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
        checkpoint_path = os.path.join(config['project_root_dir'], config['checkpoint_path'])
        vocab_file_path = os.path.join(config['project_root_dir'], config['vocab_file_path'])
        self.captioner = Captioner(self.sess, checkpoint_path, vocab_file_path)

    def load_button_clicked(self):
        if self.questioner_running or self.applying_output:
            print("Can't load an image right now. Questioner is busy.")
        else:
            image_path, _ = QtWidgets.QFileDialog.getOpenFileName(self, 'Open file')
            if image_path:
                self.load_image(image_path)

    def load_image(self, image_path):
        pixmap = QtGui.QPixmap(image_path)
        pixmap = pixmap.scaled(500, 500, QtCore.Qt.KeepAspectRatio)
        self.viewing_region.setPixmap(pixmap)
        self.text_region.setText('Questioner is working.')
        self.adjustSize()
        self.questioner_running = True
        worker = Worker(self.run_questioner, image_path)
        worker.signals.finished.connect(self.questioner_finished)
        worker.signals.error.connect(self.questioner_failed)
        worker.signals.result.connect(self.apply_questioner_output)
        self.threadpool.start(worker)

    def run_questioner(self, image_path):
        global graph
        with graph.as_default():
            # this is run on a separate thread
            caption = self.captioner.caption(image_path)
            questions = gpt2_gen_questions(self.sess, caption, nsamples=1, temperature=0.7)
            return questions[0] if len(questions) > 0 else ''

    def questioner_finished(self):
        self.questioner_running = False

    def questioner_failed(self, e):
        print(e)

    def apply_questioner_output(self, question):
        self.applying_output = True
        if len(question) > 0:
            self.text_region.setText(question)
            if self.tts:
                self.progress.show()
                self.tts.speak(question, self.tts_callback)
                self.progress.hide()
        self.applying_output = False

    def tts_callback(self, i, seq_len, batch_size, gen_rate):
        percentage = i * 100 / seq_len
        self.progress.setValue(int(percentage))  # QProgressBar expects an int value

    def dragEnterEvent(self, evt):
        if evt.mimeData().hasUrls:
            evt.accept()
        else:
            evt.ignore()

    def dragMoveEvent(self, evt):
        if evt.mimeData().hasUrls:
            evt.accept()
        else:
            evt.ignore()

    def dropEvent(self, evt):
        if evt.mimeData().hasUrls \
                and not self.questioner_running \
                and not self.applying_output:
            evt.setDropAction(QtCore.Qt.CopyAction)
            evt.accept()
            for url in evt.mimeData().urls():
                if op_sys == 'Darwin':
                    image_path = str(
                        NSURL.URLWithString_(str(url.toString())).filePathURL().path())
                else:
                    image_path = str(url.toLocalFile())
                self.load_image(image_path)
        else:
            evt.ignore()
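# A minimal launch sketch for WindowWidget above, assuming the usual PyQt5 entry
# point and that sys is imported alongside the Qt modules used in this file; the
# GPT-2 checkpoint and config.yaml the widget loads must already be in place.
if __name__ == '__main__':
    app = QtWidgets.QApplication(sys.argv)
    window = WindowWidget(voice=False)
    sys.exit(app.exec_())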
from captioner import Captioner
import retriever
import math
# import image_convert

# im_file = './demo_data/test2.jpg'
pretrained_weights_path = '../models/two_layer_LSTM.caffemodel'
gpu_id = 0

# Initialize the retrieval model
image_net_proto = '../prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = '../prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = '../data/vocabulary.txt'

# utilize the captioner module from LRCN
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)  # decrease the number if your GPU memory is small
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

while (1):
    sum_candidate_box = []
    sum_score_box = []
    query = raw_input("type the input query: ")
    # query = 'bike on the red house'
    print("query =", query)
    print("Find best candidate..!")
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
import caffe
import util
from captioner import Captioner

vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './datasets/ReferIt/ImageCLEF/images/'
cached_context_features_dir = './data/referit_context_features/'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))
    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # Gray scale to RGB
    if im.ndim == 2:
def main():
    MAX_IMAGES = -1  # -1 to use all images
    TAG = 'coco_2layer_factored'
    if MAX_IMAGES >= 0:
        TAG += '_%dimages' % MAX_IMAGES
    eval_on_test = False
    if eval_on_test:
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        ITER = 50000
        MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER
        DATASET_NAME = 'val'
    TAG += '_%s' % DATASET_NAME

    MODEL_DIR = './examples/coco_caption'
    MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME)
    IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt'
    LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    NET_TAG = '%s_%s' % (TAG, MODEL_FILENAME)
    DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME,
                                    str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
    DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
    VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt'
    DEVICE_ID = 0
    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file.readlines()]
    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                               align=False, shuffle=False)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())

    if 0 <= MAX_IMAGES < len(dataset.keys()):
        all_keys = dataset.keys()
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set([all_keys[p] for p in perm])
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print 'Reduced dataset to %d images' % len(dataset.keys())
    if MAX_IMAGES < 0:
        MAX_IMAGES = len(dataset.keys())

    captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE,
                          device_id=DEVICE_ID)
    beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' % generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
    experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR, CACHE_DIR, sg)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))
    experimenter.generation_experiment(generation_strategy)
    captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000))
    experimenter.retrieval_experiment()
def main():
    parser = argparse.ArgumentParser()

    # Model Setup
    parser.add_argument("--detector_config", default=None, type=str,
                        help="detector config file path.")
    parser.add_argument("--detector_weights", default=None, type=str,
                        help="pretrained detector weights.")
    parser.add_argument("--decoder_config", default=None, type=str,
                        help="Bert decoder config file path.")
    parser.add_argument("--decoder_weights", default=None, type=str,
                        help="pretrained Bert decoder weights.")
    parser.add_argument("--object_vocab", default=None, type=str,
                        help="object vocabulary, maps object ids to object names")

    # For COCO
    parser.add_argument('--coco_root', type=str, default='~/Datasets/coco')
    parser.add_argument("--coco_data_info", default='annotations/dataset_coco.json', type=str,
                        help="The input data file name.")
    parser.add_argument("--coco_ann_file", default='annotations/captions_val2014.json', type=str,
                        help="caption annotations file (i.e. answer key)")
    parser.add_argument('--valid_jpgs_file', default='annotations/coco_valid_jpgs.json', type=str,
                        help="lists the valid jpgs")

    # For data pipeline
    parser.add_argument('--batch_size', type=int, default=1,
                        help="Batch size for decoding. Highly recommended to be a multiple of 8")
    parser.add_argument('--dl_workers', type=int, default=0,
                        help="Number of dataloader workers")

    # For reproducibility
    parser.add_argument('--seed', type=int, default=-1,
                        help="random seed for initialization")

    args = parser.parse_args()

    assert(torch.cuda.is_available())
    cpu_device = torch.device("cpu")
    gpu_device = torch.device("cuda:0")
    n_gpu = torch.cuda.device_count()

    # fix random seed (optional)
    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    with torch.no_grad():
        captioner = Captioner(args.detector_config, args.detector_weights, args.decoder_config,
                              args.decoder_weights, args.object_vocab, cpu_device, gpu_device)
        # TODO: optimize for amp, data-parallel
        torch.cuda.empty_cache()  # Empty everything

        valid_dataset = CocoCaptionsKarpathyValidImgs(args.coco_root)
        valid_dl = DataLoader(valid_dataset, batch_size=args.batch_size,
                              collate_fn=ccc_karpathy_valid_collate,
                              num_workers=args.dl_workers, pin_memory=True)
        total_batch = math.ceil(len(valid_dataset) / args.batch_size)

        predictions = []
        print('start the caption evaluation...')
        with tqdm(total=total_batch) as pbar:
            for img_ids, img_npys in valid_dl:
                captions = captioner.forward(img_npys)
                for img_id, caption in zip(img_ids, captions):
                    predictions.append({'image_id': img_id, 'caption': caption})
                pbar.update(1)

        language_eval(preds=predictions,
                      annFile=os.path.join(args.coco_root, args.coco_ann_file),
                      model_id='0', split='val')
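# A hedged example invocation of the evaluation script above (the file name and all
# paths are placeholders; the flags come from the argparse setup in main()):
#
#   python eval_coco_captions.py \
#       --detector_config configs/detector.yaml \
#       --detector_weights weights/detector.pth \
#       --decoder_config configs/bert_decoder.json \
#       --decoder_weights weights/bert_decoder.pth \
#       --object_vocab data/objects_vocab.txt \
#       --coco_root ~/Datasets/coco \
#       --batch_size 8 --seed 42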