Example #1
 def prepare_captioner(self):
     config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
     checkpoint_path = os.path.join(config['project_root_dir'],
                                    config['checkpoint_path'])
     vocab_file_path = os.path.join(config['project_root_dir'],
                                    config['vocab_file_path'])
     self.captioner = Captioner(self.sess, checkpoint_path, vocab_file_path)
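The method above expects a config.yaml with three keys: project_root_dir, checkpoint_path, and vocab_file_path (the latter two are joined onto the root). A minimal sketch of writing such a file from Python, with purely illustrative values:

# Hypothetical values; only the three keys read by prepare_captioner are included.
import yaml

example_config = {
    'project_root_dir': '/home/user/questioner-project',  # assumed project root
    'checkpoint_path': 'models/captioner/checkpoint',     # joined onto project_root_dir
    'vocab_file_path': 'models/captioner/vocab.txt',      # joined onto project_root_dir
}

with open('config.yaml', 'w') as f:
    yaml.safe_dump(example_config, f, default_flow_style=False)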
Example #2
 def __init__(self,
              detector_cfg_path,
              detector_weights_path,
              bert_cfg_path,
              bert_weights_path,
              object_vocab_path,
              tacotron_weights_path,
              waveglow_cfg_path,
              waveglow_weights_path,
              cpu_device,
              gpu_device,
              fc_layer=0,
              max_caption_length=67,
              sampling_rate=22050):
     """
     args:
         detector_cfg_path: path to the detector config
         detector_weights_path: path to the detector weights
         bert_cfg_path: path to the bert decoder config
         bert_weights_path: path to the bert decoder weights
         tacotron_weights_path: path to the tacotron weights
         waveglow_weights_path: path to the waveglow weights
         cpu_device: The cpu device to run some parts of visualization
         gpu_device: The gpu device to run the bulk of computations, currently requires at least 1 GPU device
         fc_layer: the fully connected layer from the detector to extract features from, 0-indexed
         max_caption_length: the maximum number of tokens the caption can be
         sampling_rate: the rate that audio representations are sampled per second
     """
     self.captioner = Captioner(detector_cfg_path, detector_weights_path,
                                bert_cfg_path, bert_weights_path,
                                object_vocab_path, cpu_device, gpu_device,
                                fc_layer, max_caption_length)
     device = gpu_device if gpu_device else cpu_device
     self.tts = TTS(tacotron_weights_path, waveglow_cfg_path,
                    waveglow_weights_path, device, sampling_rate)
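Given the docstring above, a minimal instantiation sketch. The class name SpeakingCaptioner and every file path below are assumptions for illustration; the torch devices follow the usage shown in Example #19.

import torch

cpu_device = torch.device('cpu')
gpu_device = torch.device('cuda:0')  # at least one GPU is required

speaking_captioner = SpeakingCaptioner(  # hypothetical class name
    detector_cfg_path='configs/detector.yaml',
    detector_weights_path='weights/detector.pth',
    bert_cfg_path='configs/bert_decoder.json',
    bert_weights_path='weights/bert_decoder.pth',
    object_vocab_path='data/objects_vocab.txt',
    tacotron_weights_path='weights/tacotron2.pt',
    waveglow_cfg_path='configs/waveglow.json',
    waveglow_weights_path='weights/waveglow.pt',
    cpu_device=cpu_device,
    gpu_device=gpu_device,
)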
Example #3
def main(image_path, nsamples, temperature):
    # Generate questions
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess)
    config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
    checkpoint_path = os.path.join(config['project_root_dir'],
                                   config['checkpoint_path'])
    vocab_file_path = os.path.join(config['project_root_dir'],
                                   config['vocab_file_path'])
    captioner = Captioner(sess, checkpoint_path, vocab_file_path)
    caption = captioner.caption(image_path)
    questions = gpt2_gen_questions(sess,
                                   caption,
                                   nsamples=nsamples,
                                   temperature=temperature)

    # Print generated questions
    print('----------\nQuestions:')
    for i, question in enumerate(questions):
        print('%d. %s' % (i + 1, question))
Example #4
class CaptionExperiment:
    def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
        self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab, -1)
        self.captioner.set_image_batch_size(1)

    def getCaption(self, image):
        row = image
        bytes = array.array("b", row.image).tostring()
        im = Image.open(io.BytesIO(bytes))
        image = np.array(im, dtype=np.uint8)
        dataset = [image]
        descriptors = self.captioner.compute_descriptors(dataset)
        images = dataset
        num_images = len(images)
        batch_size = num_images

        # Generate captions for all images.
        all_captions = [None] * num_images
        for image_index in xrange(0, num_images, batch_size):
            batch_end_index = min(image_index + batch_size, num_images)
            output_captions, output_probs = self.captioner.sample_captions(
                descriptors[image_index:batch_end_index], temp=float("inf")
            )
            for batch_index, output in zip(range(image_index, batch_end_index), output_captions):
                all_captions[batch_index] = output
            # Collect model/reference captions, formatting the model's captions and
            # each set of reference captions as a list of len(self.images) strings.
            # For each image, write out the highest probability caption.
            model_captions = [""] * len(images)
            for image_index, image in enumerate(images):
                caption = self.captioner.sentence(all_captions[image_index])
                model_captions[image_index] = caption

            generation_result = [
                Row(row.id, model_captions[image_index]) for (image_index, image_path) in enumerate(images)
            ]
            return generation_result
Example #5
class CaptionExperiment():
  def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
    self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab,-1)
    self.captioner.set_image_batch_size(1)
  

  def getCaption(self, image):
    row=image
    bytes = array.array('b', row.image).tostring()
    im = Image.open(io.BytesIO(bytes))
    image = np.array(im,dtype=np.uint8)
    dataset = [image]
    descriptors = self.captioner.compute_descriptors(dataset)
    images = dataset
    num_images = len(images)
    batch_size = num_images
  
    #Generate captions for all images.
    all_captions = [None] * num_images
    for image_index in xrange(0, num_images, batch_size):
      batch_end_index = min(image_index + batch_size, num_images)
      output_captions, output_probs = self.captioner.sample_captions(
        descriptors[image_index:batch_end_index], temp=float('inf'))
      for batch_index, output in zip(range(image_index, batch_end_index),
                                     output_captions):
        all_captions[batch_index] = output
      # Collect model/reference captions, formatting the model's captions and
      # each set of reference captions as a list of len(self.images) strings.
      # For each image, write out the highest probability caption.
      model_captions = [''] * len(images)
      for image_index, image in enumerate(images):
        caption = self.captioner.sentence(all_captions[image_index])
        model_captions[image_index] = caption

      generation_result = [Row(row.id,model_captions[image_index]) for (image_index, image_path) in enumerate(images)]      
      return generation_result
Example #6
def build_captioner(model_name,
                    image_net,
                    LM_net,
                    dataset_name='coco',
                    split_name='val',
                    vocab='vocabulary',
                    precomputed_h5=None,
                    gpu=0,
                    prev_word_restriction=True):
    model_files = ['%s.caffemodel' % (mf) for mf in model_name]
    if image_net:
        image_net_file = home_dir + image_net
    else:
        image_net_file = None
    lstm_net_file = home_dir + LM_net
    vocab_file = '%s/%s.txt' % (determine_vocab_folder(dataset_name,
                                                       split_name), vocab)
    device_id = gpu
    with open(vocab_file, 'r') as vocab_file_read:
        vocab = [line.strip() for line in vocab_file_read.readlines()]
    anno_path = determine_anno_path(dataset_name, split_name)
    image_root = determine_image_pattern(dataset_name, split_name)

    sg = build_sequence_generator(anno_path,
                                  50,
                                  image_root,
                                  vocab=vocab,
                                  max_words=50,
                                  align=False,
                                  shuffle=False,
                                  gt_captions=True,
                                  pad=True,
                                  truncate=True,
                                  split_ids=None)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())
    captioner = Captioner(model_files,
                          image_net_file,
                          lstm_net_file,
                          vocab_file,
                          device_id=device_id,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    return captioner, sg, dataset
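A hedged usage sketch for build_captioner; the snapshot and prototxt names are illustrative, and home_dir is assumed to be defined at module level, as the function body implies.

# Illustrative arguments only; each model_name entry resolves to '<name>.caffemodel'.
captioner, sg, dataset = build_captioner(
    model_name=['lrcn_caption_snapshot'],       # hypothetical snapshot name
    image_net='/models/vgg16/deploy.prototxt',  # appended to home_dir
    LM_net='/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',
    dataset_name='coco',
    split_name='val',
    vocab='vocabulary',
    gpu=0)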
Example #7
def main(model_name='',
         image_net='',
         LM_net='',
         dataset_name='val',
         vocab='vocabulary',
         precomputed_feats=None,
         feats_bool_in=False,
         precomputed_h5=None,
         experiment={'type': 'generation'},
         prev_word_restriction=False,
         gpu=0):
    # model_name is the trained model: path relative to /home/lisa/caffe-LSTM-video
    # image_net is the model used to extract length-1000 image features: path relative to the snapshots folder; does not need to include "caffemodel"
    # dataset_name indicates which dataset to look at
    # vocab indicates which vocabulary file to look at
    # feats_bool_in indicates whether the images are saved as pickled feature files or as normal images
    # experiment: dict with all the info needed for the experiment. Must have a 'type' field indicating a madlib versus a generation experiment.
    # (an illustrative call is sketched after this function)

    if not precomputed_feats:
        precomputed_feats = model_name

    MAX_IMAGES = -1  # -1 to use all images
    TAG = 'coco_2layer_factored'
    if MAX_IMAGES >= 0:
        TAG += '_%dimages' % MAX_IMAGES
    eval_on_test = False
    if eval_on_test:
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        MODEL_FILENAME = model_name
        DATASET_NAME = dataset_name
    TAG += '_%s' % DATASET_NAME
    #MODEL_DIR = home_dir + '/examples/coco_caption/snapshots'
    MODEL_DIR = ''
    MODEL_FILE = ['%s.caffemodel' % (MF) for MF in MODEL_FILENAME]
    #IMAGE_NET_FILE = home_dir + '/models/bvlc_reference_caffenet/deploy.prototxt'
    IMAGE_NET_FILE = home_dir + image_net
    #LSTM_NET_FILE = home_dir + '/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    LSTM_NET_FILE = home_dir + LM_net
    DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME, str(MAX_IMAGES)
                                    if MAX_IMAGES >= 0 else 'all')
    #DATASET_CACHE_DIR = home_dir + '/retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
    DATASET_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, '_'.join(MODEL_FILENAME))
    FEATURE_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, precomputed_feats)
    VOCAB_FILE = '../../examples/coco_caption/h5_data/buffer_100/%s.txt' % vocab
    DEVICE_ID = gpu
    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file.readlines()]
    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images/%s2014'
    COCO_IMAGE_PATTERN = '../../data/coco/coco/images/%s2014'
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images2/%s2014'
    #image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    image_root = COCO_IMAGE_PATTERN % 'val'
    #image_root = '/z/lisaanne/imageData/imagenet/'
    sg = CocoSequenceGenerator(coco,
                               BUFFER_SIZE,
                               image_root,
                               vocab=vocab,
                               max_words=MAX_WORDS,
                               align=False,
                               shuffle=False,
                               gt_captions=True,
                               pad=True,
                               truncate=True,
                               split_ids=None,
                               feats_bool=feats_bool_in)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())
    if 0 <= MAX_IMAGES < len(dataset.keys()):
        all_keys = dataset.keys()
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set([all_keys[p] for p in perm])
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print 'Reduced dataset to %d images' % len(dataset.keys())
    if MAX_IMAGES < 0: MAX_IMAGES = len(dataset.keys())
    captioner = Captioner(MODEL_FILE,
                          IMAGE_NET_FILE,
                          LSTM_NET_FILE,
                          VOCAB_FILE,
                          device_id=DEVICE_ID,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    if 'beam_size' in experiment.keys():
        beam_size = experiment['beam_size']
    else:
        beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' %
                        generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
    experimenter = CaptionExperiment(captioner, dataset, FEATURE_CACHE_DIR,
                                     CACHE_DIR, sg, feats_bool_in)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))
    if experiment['type'] == 'madlib':
        all_mean_index = []
        all_mean_prob = []
        all_top_words = []
        for fw in experiment['fill_words']:
            for cw in experiment['cooccur_words']:
                mean_index, mean_prob, top_words = experimenter.madlib_experiment(
                    fw, [cw])
                all_mean_index.append(mean_index)
                all_mean_prob.append(mean_prob)
                all_top_words.append(top_words)
        return all_mean_index, all_mean_prob, all_top_words
    if experiment['type'] == 'generation':
        experimenter.generation_experiment(generation_strategy, 1000)
    if experiment['type'] == 'score_generation':
        if 'read_file' in experiment.keys():
            read_file = experiment['read_file']
        else:
            read_file = True
        experimenter.score_generation(experiment['json_file'], read_file)
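As noted in the parameter comments at the top of main(), experiment must carry a 'type' field. A hedged sketch of a generation-experiment call; every path and name below is illustrative, not taken from the source.

# MODEL_FILE is built as '<name>.caffemodel' for each entry of model_name,
# so a list of snapshot names is expected.
main(model_name=['lrcn_caption_snapshot'],       # hypothetical snapshot name
     image_net='/models/vgg16/deploy.prototxt',  # appended to home_dir
     LM_net='/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',
     dataset_name='val',
     vocab='vocabulary',
     experiment={'type': 'generation', 'beam_size': 3},
     gpu=0)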
Example #8
 def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
     self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab, -1)
     self.captioner.set_image_batch_size(1)
Example #9
tst_imlist_file = './data/split/referit_test_imlist.txt'
################################################################################

image_dir = './datasets/ReferIt/ImageCLEF/images/'
proposal_dir = './data/referit_edgeboxes_top100/'
cached_context_features_dir = './data/referit_context_features/'

imcrop_dict_file = './data/metadata/referit_imcrop_dict.json'
imcrop_bbox_dict_file = './data/metadata/referit_imcrop_bbox_dict.json'
query_file = './data/metadata/referit_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)
imcrop_dict = util.io.load_json(imcrop_dict_file)
imcrop_bbox_dict = util.io.load_json(imcrop_bbox_dict_file)

# Load candidate regions (bounding boxes)
load_proposal = (candidate_regions == 'proposal_regions')
candidate_boxes_dict = {imname: None for imname in imlist}
for n_im in range(num_im):
    if n_im % 1000 == 0:
Example #10
tst_imlist_file = './data/split/referit_test_imlist.txt'
################################################################################

image_dir = './datasets/ReferIt/ImageCLEF/images/'
proposal_dir = './data/referit_edgeboxes_top100/'
cached_context_features_dir = './data/referit_context_features/'

imcrop_dict_file = './data/metadata/referit_imcrop_dict.json'
imcrop_bbox_dict_file = './data/metadata/referit_imcrop_bbox_dict.json'
query_file = './data/metadata/referit_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)
imcrop_dict = util.io.load_json(imcrop_dict_file)
imcrop_bbox_dict = util.io.load_json(imcrop_bbox_dict_file)

# Load candidate regions (bounding boxes)
load_proposal = (candidate_regions == 'proposal_regions')
candidate_boxes_dict = {imname: None for imname in imlist}
for n_im in range(num_im):
    if n_im % 1000 == 0:
Example #11
import util
from captioner import Captioner


vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './data/resized_imcrop/'
cached_local_features_dir = './data/referit_local_features/'

image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto, vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/training/train_imcrop_list.txt')

num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im])
    # Gray scale to RGB
Example #12
image_dir = './datasets/Kitchen/images/Kitchen/'

if distractor_set == "kitchen":
    distractor_dir = image_dir
    distractor_imlist_file = tst_imlist_file
else:
    distractor_dir = './datasets/Kitchen/images/ImageNET/'
    distractor_imlist_file = './data/split/kitchen_imagenet_imlist.txt'

query_file = './data/metadata/kitchen_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
lstm_net_proto = './prototxt/scrc_word_to_preds_no_spatial_no_context.prototxt'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)

# Load distractors
distractor_list = util.io.load_str_list(distractor_imlist_file)
num_distractors = len(distractor_list)

# Sample distractor images for each test image
distractor_ids_per_im = {}
np.random.seed(3)  # fix random seed for test repeatability
Example #13
class WindowWidget(QtWidgets.QWidget):
    def __init__(self, voice):
        super(WindowWidget, self).__init__()
        self.sess = None
        self.captioner = None
        self.prepare_questioner()
        self.prepare_captioner()
        self.threadpool = QtCore.QThreadPool()
        self.questioner_running = False
        self.applying_output = False

        global graph
        graph = tf.get_default_graph()

        if voice:
            self.tts = TTS()
        else:
            self.tts = None

        # Viewing region
        self.viewing_region = QtWidgets.QLabel(self)
        layout = QtWidgets.QHBoxLayout()
        layout.addWidget(self.viewing_region)

        # Load button
        self.load_button = QtWidgets.QPushButton('Load image')
        self.load_button.clicked.connect(self.load_button_clicked)
        right_sidebar = QtWidgets.QVBoxLayout()
        right_sidebar.addWidget(self.load_button)

        # Extra instructions region
        self.instr_region = QtWidgets.QLabel(self)
        self.instr_region.setText('Or drop an image onto this window.')
        right_sidebar.addWidget(self.instr_region)
        right_sidebar.addStretch()

        # Progress bar
        self.progress = QtWidgets.QProgressBar(self)
        self.progress.setMaximum(100)
        right_sidebar.addWidget(self.progress)
        self.progress.hide()

        # Text region
        self.text_region = QtWidgets.QLabel(self)
        self.text_region.setFrameStyle(QtWidgets.QFrame.Panel
                                       | QtWidgets.QFrame.Sunken)
        self.text_region.setWordWrap(True)
        self.text_region.setMargin(8)
        self.text_region.setText('...')
        right_sidebar.addWidget(self.text_region)
        layout.addLayout(right_sidebar)

        # Launch
        self.setWindowIcon(QtGui.QIcon(os.path.join('icon', 'question.png')))
        self.setLayout(layout)
        self.setAcceptDrops(True)
        self.show()

    def prepare_questioner(self):
        self.sess = gpt2.start_tf_sess()
        gpt2.load_gpt2(self.sess)

    def prepare_captioner(self):
        config = yaml.load(open('config.yaml', 'r'), Loader=yaml.FullLoader)
        checkpoint_path = os.path.join(config['project_root_dir'],
                                       config['checkpoint_path'])
        vocab_file_path = os.path.join(config['project_root_dir'],
                                       config['vocab_file_path'])
        self.captioner = Captioner(self.sess, checkpoint_path, vocab_file_path)

    def load_button_clicked(self):
        if self.questioner_running or self.applying_output:
            print("Can't load an image right now. Questioner is busy.")
        else:
            image_path, _ = QtWidgets.QFileDialog.getOpenFileName(
                self, 'Open file')
            if image_path:
                self.load_image(image_path)

    def load_image(self, image_path):
        pixmap = QtGui.QPixmap(image_path)
        pixmap = pixmap.scaled(500, 500, QtCore.Qt.KeepAspectRatio)
        self.viewing_region.setPixmap(pixmap)
        self.text_region.setText('Questioner is working.')
        self.adjustSize()

        self.questioner_running = True
        worker = Worker(self.run_questioner, image_path)
        worker.signals.finished.connect(self.questioner_finished)
        worker.signals.error.connect(self.questioner_failed)
        worker.signals.result.connect(self.apply_questioner_output)
        self.threadpool.start(worker)

    def run_questioner(self, image_path):
        global graph
        with graph.as_default():  # this is run on a separate thread
            caption = self.captioner.caption(image_path)
            questions = gpt2_gen_questions(self.sess,
                                           caption,
                                           nsamples=1,
                                           temperature=0.7)
            return questions[0] if len(questions) > 0 else ''

    def questioner_finished(self):
        self.questioner_running = False

    def questioner_failed(self, e):
        print(e)

    def apply_questioner_output(self, question):
        self.applying_output = True
        if len(question) > 0:
            self.text_region.setText(question)
            if self.tts:
                self.progress.show()
                self.tts.speak(question, self.tts_callback)
                self.progress.hide()
        self.applying_output = False

    def tts_callback(self, i, seq_len, batch_size, gen_rate):
        percentage = int(i * 100 / seq_len)  # QProgressBar.setValue expects an int
        self.progress.setValue(percentage)

    def dragEnterEvent(self, evt):
        if evt.mimeData().hasUrls:
            evt.accept()
        else:
            evt.ignore()

    def dragMoveEvent(self, evt):
        if evt.mimeData().hasUrls:
            evt.accept()
        else:
            evt.ignore()

    def dropEvent(self, evt):
        if evt.mimeData().hasUrls \
                and not self.questioner_running \
                and not self.applying_output:
            evt.setDropAction(QtCore.Qt.CopyAction)
            evt.accept()
            for url in evt.mimeData().urls():
                if op_sys == 'Darwin':
                    image_path = str(
                        NSURL.URLWithString_(str(
                            url.toString())).filePathURL().path())
                else:
                    image_path = str(url.toLocalFile())
            self.load_image(image_path)
        else:
            evt.ignore()
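The Worker class used by load_image is not shown in this example. Judging only from the signals connected above (finished, error, result) and assuming PyQt5, a minimal sketch of the kind of QRunnable wrapper it implies:

import traceback
from PyQt5 import QtCore


class WorkerSignals(QtCore.QObject):
    finished = QtCore.pyqtSignal()
    error = QtCore.pyqtSignal(object)
    result = QtCore.pyqtSignal(object)


class Worker(QtCore.QRunnable):
    """Runs fn(*args, **kwargs) on the thread pool and reports via signals."""

    def __init__(self, fn, *args, **kwargs):
        super(Worker, self).__init__()
        self.fn = fn
        self.args = args
        self.kwargs = kwargs
        self.signals = WorkerSignals()

    @QtCore.pyqtSlot()
    def run(self):
        try:
            result = self.fn(*self.args, **self.kwargs)
        except Exception as e:
            traceback.print_exc()
            self.signals.error.emit(e)
        else:
            self.signals.result.emit(result)
        finally:
            self.signals.finished.emit()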
Example #14
 def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
   self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto, vocab,-1)
   self.captioner.set_image_batch_size(1)
Example #15
image_dir = './datasets/Kitchen/images/Kitchen/'

if distractor_set == "kitchen":
    distractor_dir = image_dir
    distractor_imlist_file = tst_imlist_file
else:
    distractor_dir = './datasets/Kitchen/images/ImageNET/'
    distractor_imlist_file = './data/split/kitchen_imagenet_imlist.txt'

query_file = './data/metadata/kitchen_query_dict.json'
vocab_file = './data/vocabulary.txt'

# utilize the captioner module from LRCN
lstm_net_proto = './prototxt/scrc_word_to_preds_no_spatial_no_context.prototxt'
image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(50)
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

# Load image and caption list
imlist = util.io.load_str_list(tst_imlist_file)
num_im = len(imlist)
query_dict = util.io.load_json(query_file)

# Load distractors
distractor_list = util.io.load_str_list(distractor_imlist_file)
num_distractors = len(distractor_list)

# Sample distractor images for each test image
distractor_ids_per_im = {}
np.random.seed(3)  # fix random seed for test repeatability
Example #16
from captioner import Captioner
import retriever
import math
#import image_convert

#im_file = './demo_data/test2.jpg'

pretrained_weights_path = '../models/two_layer_LSTM.caffemodel'
gpu_id = 0

# Initialize the retrieval model
image_net_proto = '../prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = '../prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = '../data/vocabulary.txt'
# utilize the captioner module from LRCN
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(
    50)  # decrease the number if your GPU memory is small
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

while (1):
    sum_candidate_box = []
    sum_score_box = []

    query = raw_input("type the input query: ")
    #query = 'bike on the red house'

    print("query =", query)
    print("Find best candidate..!")
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
Example #17
import caffe

import util
from captioner import Captioner

vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './datasets/ReferIt/ImageCLEF/images/'
cached_context_features_dir = './data/referit_context_features/'

image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # Gray scale to RGB
    if im.ndim == 2:
Example #18
def main():
  MAX_IMAGES = -1  # -1 to use all images
  TAG = 'coco_2layer_factored'
  if MAX_IMAGES >= 0:
    TAG += '_%dimages' % MAX_IMAGES
  eval_on_test = False
  if eval_on_test:
    ITER = 100000
    MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
    DATASET_NAME = 'test'
  else:  # eval on val
    ITER = 50000
    MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER
    DATASET_NAME = 'val'
  TAG += '_%s' % DATASET_NAME
  MODEL_DIR = './examples/coco_caption'
  MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME)
  IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt'
  LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
  NET_TAG = '%s_%s' % (TAG, MODEL_FILENAME)
  DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME,
      str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
  DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
  VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt'
  DEVICE_ID = 0
  with open(VOCAB_FILE, 'r') as vocab_file:
    vocab = [line.strip() for line in vocab_file.readlines()]
  coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
  image_root = COCO_IMAGE_PATTERN % DATASET_NAME
  sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                             align=False, shuffle=False)
  dataset = {}
  for image_path, sentence in sg.image_sentence_pairs:
    if image_path not in dataset:
      dataset[image_path] = []
    dataset[image_path].append((sg.line_to_stream(sentence), sentence))
  print 'Original dataset contains %d images' % len(dataset.keys())
  if 0 <= MAX_IMAGES < len(dataset.keys()):
    all_keys = dataset.keys()
    perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
    chosen_keys = set([all_keys[p] for p in perm])
    for key in all_keys:
      if key not in chosen_keys:
        del dataset[key]
    print 'Reduced dataset to %d images' % len(dataset.keys())
  if MAX_IMAGES < 0: MAX_IMAGES = len(dataset.keys())
  captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE,
                        device_id=DEVICE_ID)
  beam_size = 1
  generation_strategy = {'type': 'beam', 'beam_size': beam_size}
  if generation_strategy['type'] == 'beam':
    strategy_name = 'beam%d' % generation_strategy['beam_size']
  elif generation_strategy['type'] == 'sample':
    strategy_name = 'sample%f' % generation_strategy['temp']
  else:
    raise Exception('Unknown generation strategy type: %s' % generation_strategy['type'])
  CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
  experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR, CACHE_DIR, sg)
  captioner.set_image_batch_size(min(100, MAX_IMAGES))
  experimenter.generation_experiment(generation_strategy)
  captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000))
  experimenter.retrieval_experiment()
Example #19
def main():
    parser = argparse.ArgumentParser()

    # Model Setup
    parser.add_argument("--detector_config", default=None, type=str,
                        help="detector config file path.")
    parser.add_argument("--detector_weights", default=None, type=str,
                        help="pretrained detector weights.")
    parser.add_argument("--decoder_config", default=None, type=str,
                        help="Bert decoder config file path.")
    parser.add_argument("--decoder_weights", default=None, type=str,
                        help="pretrained Bert decoder weights.")
    parser.add_argument("--object_vocab", default=None, type=str,
                        help="object vocabulary, maps object ids to object names")

    # For COCO
    parser.add_argument('--coco_root', type=str, default='~/Datasets/coco')
    parser.add_argument("--coco_data_info", default='annotations/dataset_coco.json', type=str,
                        help="The input data file name.")
    parser.add_argument("--coco_ann_file", default='annotations/captions_val2014.json', type=str,
                        help="caption annotations file (i.e. answer key)")
    parser.add_argument('--valid_jpgs_file', default='annotations/coco_valid_jpgs.json', type=str,
                        help="lists the valid jpgs")

    # For data pipeline
    parser.add_argument('--batch_size', type=int, default=1,
                        help="Batch size for decoding. Highly recommended to be a multiple of 8")
    parser.add_argument('--dl_workers', type=int, default=0, help="Number of dataloader workers")

    # For reproducibility
    parser.add_argument('--seed', type=int, default=-1, help="random seed for initialization")

    args = parser.parse_args()

    assert(torch.cuda.is_available())

    cpu_device = torch.device("cpu")
    gpu_device = torch.device("cuda:0")
    n_gpu = torch.cuda.device_count()

    # fix random seed (optional)
    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    with torch.no_grad():

        captioner = Captioner(args.detector_config, args.detector_weights, args.decoder_config, args.decoder_weights,
                              args.object_vocab, cpu_device, gpu_device)

        # TODO: optimize for amp, data-parallel
        torch.cuda.empty_cache()  # Empty everything

        valid_dataset = CocoCaptionsKarpathyValidImgs(args.coco_root)
        valid_dl = DataLoader(valid_dataset, batch_size=args.batch_size, collate_fn=ccc_karpathy_valid_collate,
                              num_workers=args.dl_workers, pin_memory=True)

        total_batch = math.ceil(len(valid_dataset) / args.batch_size)

        predictions = []

        print('start the caption evaluation...')
        with tqdm(total=total_batch) as pbar:
            for img_ids, img_npys in valid_dl:
                captions = captioner.forward(img_npys)

                for img_id, caption in zip(img_ids, captions):
                    predictions.append({'image_id': img_id, 'caption': caption})
                pbar.update(1)

        language_eval(preds=predictions, annFile=os.path.join(args.coco_root, args.coco_ann_file),
                      model_id='0', split='val')
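A hedged sketch of launching this evaluation script from Python; the script file name and every path below are assumptions, not taken from the source.

import subprocess

subprocess.run([
    'python', 'eval_coco_captions.py',             # hypothetical script name
    '--detector_config', 'configs/detector.yaml',
    '--detector_weights', 'weights/detector.pth',
    '--decoder_config', 'configs/bert_decoder.json',
    '--decoder_weights', 'weights/bert_decoder.pth',
    '--object_vocab', 'data/objects_vocab.txt',
    '--coco_root', '/data/coco',
    '--batch_size', '8',                           # the help text recommends a multiple of 8
    '--seed', '42',
], check=True)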