Example 1
 def __init__(self,
              detector_cfg_path,
              detector_weights_path,
              bert_cfg_path,
              bert_weights_path,
              object_vocab_path,
              tacotron_weights_path,
              waveglow_cfg_path,
              waveglow_weights_path,
              cpu_device,
              gpu_device,
              fc_layer=0,
              max_caption_length=67,
              sampling_rate=22050):
     """
     args:
         detector_cfg_path: path to the detector config
         detector_weights_path: path to the detector weights
         bert_cfg_path: path to the bert decoder config
         bert_weights_path: path to the bert decoder weights
         object_vocab_path: path to the object vocabulary (maps object ids to object names)
         tacotron_weights_path: path to the tacotron weights
         waveglow_cfg_path: path to the waveglow config
         waveglow_weights_path: path to the waveglow weights
         cpu_device: The cpu device to run some parts of visualization
         gpu_device: The gpu device to run the bulk of computations, currently requires at least 1 GPU device
         fc_layer: the fully connected layer from the detector to extract features from, 0-indexed
         max_caption_length: the maximum number of tokens the caption can be
         sampling_rate: the rate that audio representations are sampled per second
     """
     self.captioner = Captioner(detector_cfg_path, detector_weights_path,
                                bert_cfg_path, bert_weights_path,
                                object_vocab_path, cpu_device, gpu_device,
                                fc_layer, max_caption_length)
     device = gpu_device if gpu_device else cpu_device
     self.tts = TTS(tacotron_weights_path, waveglow_cfg_path,
                    waveglow_weights_path, device, sampling_rate)
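The enclosing class is not shown in this excerpt. Purely as a sketch, assuming it is called ImageCaptionSpeaker (a hypothetical name) and that every path below points to a real file, construction might look like this:

import torch

# Hypothetical class name and placeholder paths throughout.
speaker = ImageCaptionSpeaker(
    detector_cfg_path='configs/detector.yaml',
    detector_weights_path='weights/detector.pth',
    bert_cfg_path='configs/bert_decoder.json',
    bert_weights_path='weights/bert_decoder.pth',
    object_vocab_path='data/objects_vocab.txt',
    tacotron_weights_path='weights/tacotron2.pt',
    waveglow_cfg_path='configs/waveglow.json',
    waveglow_weights_path='weights/waveglow.pt',
    cpu_device=torch.device('cpu'),
    gpu_device=torch.device('cuda:0'))  # the docstring notes at least one GPU is required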
Example 2
 def prepare_captioner(self):
     with open('config.yaml', 'r') as config_file:
         config = yaml.load(config_file, Loader=yaml.FullLoader)
     checkpoint_path = os.path.join(config['project_root_dir'],
                                    config['checkpoint_path'])
     vocab_file_path = os.path.join(config['project_root_dir'],
                                    config['vocab_file_path'])
     self.captioner = Captioner(self.sess, checkpoint_path, vocab_file_path)
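prepare_captioner reads exactly three keys from config.yaml. A minimal sketch of the lookups it performs, with a placeholder dict standing in for the parsed YAML file:

import os

# Placeholder values; a real config.yaml would define these three keys.
config = {
    'project_root_dir': '/home/user/vqg_project',
    'checkpoint_path': 'checkpoints/captioner.ckpt',
    'vocab_file_path': 'data/vocab.txt',
}
checkpoint_path = os.path.join(config['project_root_dir'], config['checkpoint_path'])
vocab_file_path = os.path.join(config['project_root_dir'], config['vocab_file_path'])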
Example 3
def build_captioner(model_name,
                    image_net,
                    LM_net,
                    dataset_name='coco',
                    split_name='val',
                    vocab='vocabulary',
                    precomputed_h5=None,
                    gpu=0,
                    prev_word_restriction=True):
    model_files = ['%s.caffemodel' % (mf) for mf in model_name]
    if image_net:
        image_net_file = home_dir + image_net
    else:
        image_net_file = None
    lstm_net_file = home_dir + LM_net
    vocab_file = '%s/%s.txt' % (determine_vocab_folder(dataset_name,
                                                       split_name), vocab)
    device_id = gpu
    with open(vocab_file, 'r') as vocab_file_read:
        vocab = [line.strip() for line in vocab_file_read.readlines()]
    anno_path = determine_anno_path(dataset_name, split_name)
    image_root = determine_image_pattern(dataset_name, split_name)

    sg = build_sequence_generator(anno_path,
                                  50,
                                  image_root,
                                  vocab=vocab,
                                  max_words=50,
                                  align=False,
                                  shuffle=False,
                                  gt_captions=True,
                                  pad=True,
                                  truncate=True,
                                  split_ids=None)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())
    captioner = Captioner(model_files,
                          image_net_file,
                          lstm_net_file,
                          vocab_file,
                          device_id=device_id,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    return captioner, sg, dataset
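As a rough illustration only (home_dir, determine_vocab_folder, determine_anno_path, determine_image_pattern and build_sequence_generator all come from the surrounding module, and the snapshot/prototxt paths are placeholders), build_captioner could be called like this:

# model_name is a list of caffemodel prefixes; '.caffemodel' is appended internally.
captioner, sg, dataset = build_captioner(
    model_name=['/snapshots/lrcn_iter_110000'],
    image_net='/models/bvlc_reference_caffenet/deploy.prototxt',
    LM_net='/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt',
    dataset_name='coco',
    split_name='val',
    gpu=0)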
Example 4
def main(image_path, nsamples, temperature):
    # Generate questions
    sess = gpt2.start_tf_sess()
    gpt2.load_gpt2(sess)
    with open('config.yaml', 'r') as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    checkpoint_path = os.path.join(config['project_root_dir'],
                                   config['checkpoint_path'])
    vocab_file_path = os.path.join(config['project_root_dir'],
                                   config['vocab_file_path'])
    captioner = Captioner(sess, checkpoint_path, vocab_file_path)
    caption = captioner.caption(image_path)
    questions = gpt2_gen_questions(sess,
                                   caption,
                                   nsamples=nsamples,
                                   temperature=temperature)

    # Print generated questions
    print('----------\nQuestions:')
    for i, question in enumerate(questions):
        print('%d. %s' % (i + 1, question))
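A minimal, purely illustrative way to drive this entry point (the image path and sampling settings are placeholders; Captioner and gpt2_gen_questions are assumed to come from the surrounding project):

if __name__ == '__main__':
    # Generate three candidate questions about one image.
    main(image_path='examples/kitchen.jpg', nsamples=3, temperature=0.7)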
Example 5
def main():
  MAX_IMAGES = -1  # -1 to use all images
  TAG = 'coco_2layer_factored'
  if MAX_IMAGES >= 0:
    TAG += '_%dimages' % MAX_IMAGES
  eval_on_test = False
  if eval_on_test:
    ITER = 100000
    MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
    DATASET_NAME = 'test'
  else:  # eval on val
    ITER = 50000
    MODEL_FILENAME = 'lrcn_finetune_iter_%d' % ITER
    DATASET_NAME = 'val'
  TAG += '_%s' % DATASET_NAME
  MODEL_DIR = './examples/coco_caption'
  MODEL_FILE = '%s/%s.caffemodel' % (MODEL_DIR, MODEL_FILENAME)
  IMAGE_NET_FILE = './models/bvlc_reference_caffenet/deploy.prototxt'
  LSTM_NET_FILE = './examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
  NET_TAG = '%s_%s' % (TAG, MODEL_FILENAME)
  DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME,
      str(MAX_IMAGES) if MAX_IMAGES >= 0 else 'all')
  DATASET_CACHE_DIR = './retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
  VOCAB_FILE = './examples/coco_caption/h5_data/buffer_100/vocabulary.txt'
  DEVICE_ID = 0
  with open(VOCAB_FILE, 'r') as vocab_file:
    vocab = [line.strip() for line in vocab_file.readlines()]
  coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
  image_root = COCO_IMAGE_PATTERN % DATASET_NAME
  sg = CocoSequenceGenerator(coco, BUFFER_SIZE, image_root, vocab=vocab,
                             align=False, shuffle=False)
  dataset = {}
  for image_path, sentence in sg.image_sentence_pairs:
    if image_path not in dataset:
      dataset[image_path] = []
    dataset[image_path].append((sg.line_to_stream(sentence), sentence))
  print 'Original dataset contains %d images' % len(dataset.keys())
  if 0 <= MAX_IMAGES < len(dataset.keys()):
    all_keys = dataset.keys()
    perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
    chosen_keys = set([all_keys[p] for p in perm])
    for key in all_keys:
      if key not in chosen_keys:
        del dataset[key]
    print 'Reduced dataset to %d images' % len(dataset.keys())
  if MAX_IMAGES < 0: MAX_IMAGES = len(dataset.keys())
  captioner = Captioner(MODEL_FILE, IMAGE_NET_FILE, LSTM_NET_FILE, VOCAB_FILE,
                        device_id=DEVICE_ID)
  beam_size = 1
  generation_strategy = {'type': 'beam', 'beam_size': beam_size}
  if generation_strategy['type'] == 'beam':
    strategy_name = 'beam%d' % generation_strategy['beam_size']
  elif generation_strategy['type'] == 'sample':
    strategy_name = 'sample%f' % generation_strategy['temp']
  else:
    raise Exception('Unknown generation strategy type: %s' % generation_strategy['type'])
  CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
  experimenter = CaptionExperiment(captioner, dataset, DATASET_CACHE_DIR, CACHE_DIR, sg)
  captioner.set_image_batch_size(min(100, MAX_IMAGES))
  experimenter.generation_experiment(generation_strategy)
  captioner.set_caption_batch_size(min(MAX_IMAGES * 5, 1000))
  experimenter.retrieval_experiment()
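The script hard-codes beam search; judging from the 'sample' branch above, switching to temperature sampling would presumably just mean changing the strategy dict:

# Illustrative sampling strategy (temperature value is a placeholder).
generation_strategy = {'type': 'sample', 'temp': 0.7}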
Example 6
from captioner import Captioner
import retriever
import math
#import image_convert

#im_file = './demo_data/test2.jpg'

pretrained_weights_path = '../models/two_layer_LSTM.caffemodel'
gpu_id = 0

# Initialize the retrieval model
image_net_proto = '../prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = '../prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = '../data/vocabulary.txt'
# utilize the captioner module from LRCN
captioner = Captioner(pretrained_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
captioner.set_image_batch_size(
    50)  # decrease the number if your GPU memory is small
vocab_dict = retriever.build_vocab_dict_from_captioner(captioner)

while True:
    sum_candidate_box = []
    sum_score_box = []

    query = raw_input("type the input query: ")
    #query = 'bike on the red house'

    print("query =", query)
    print("Find best candidate..!")
    for i in range(8):
        im_file = './splited_image/test' + str(i) + '.jpg'
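The loop above expects the query image to have been split beforehand into eight tiles saved as ./splited_image/test0.jpg through test7.jpg, presumably by the commented-out image_convert module. A rough sketch of such a preprocessing step with Pillow, offered only as an assumption (including the 2x4 grid layout):

import os
from PIL import Image

def split_image(im_file, out_dir='./splited_image', rows=2, cols=4):
    # Cut the image into rows*cols tiles named test0.jpg, test1.jpg, ...
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    im = Image.open(im_file)
    w, h = im.size
    tile_w, tile_h = w // cols, h // rows
    for idx in range(rows * cols):
        r, c = divmod(idx, cols)
        box = (c * tile_w, r * tile_h, (c + 1) * tile_w, (r + 1) * tile_h)
        im.crop(box).save(os.path.join(out_dir, 'test%d.jpg' % idx))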
Example 7
def main(model_name='',
         image_net='',
         LM_net='',
         dataset_name='val',
         vocab='vocabulary',
         precomputed_feats=None,
         feats_bool_in=False,
         precomputed_h5=None,
         experiment={'type': 'generation'},
         prev_word_restriction=False,
         gpu=0):
    # model_name: the trained model; path relative to /home/lisa/caffe-LSTM-video
    # image_net: the model used to extract length-1000 image features; path relative
    #   to the snapshots folder (do not include the ".caffemodel" extension)
    # dataset_name: which dataset to evaluate on
    # vocab: which vocabulary file to use
    # feats_bool_in: whether the images are stored as pickled feature files or as
    #   ordinary image files
    # experiment: dict with all info needed for the experiment; must have a 'type'
    #   field indicating a madlib, generation, or score_generation experiment

    if not precomputed_feats:
        precomputed_feats = model_name

    MAX_IMAGES = -1  # -1 to use all images
    TAG = 'coco_2layer_factored'
    if MAX_IMAGES >= 0:
        TAG += '_%dimages' % MAX_IMAGES
    eval_on_test = False
    if eval_on_test:
        ITER = 100000
        MODEL_FILENAME = 'lrcn_finetune_trainval_stepsize40k_iter_%d' % ITER
        DATASET_NAME = 'test'
    else:  # eval on val
        MODEL_FILENAME = model_name
        DATASET_NAME = dataset_name
    TAG += '_%s' % DATASET_NAME
    #MODEL_DIR = home_dir + '/examples/coco_caption/snapshots'
    MODEL_DIR = ''
    MODEL_FILE = ['%s.caffemodel' % (MF) for MF in MODEL_FILENAME]
    #IMAGE_NET_FILE = home_dir + '/models/bvlc_reference_caffenet/deploy.prototxt'
    IMAGE_NET_FILE = home_dir + image_net
    #LSTM_NET_FILE = home_dir + '/examples/coco_caption/lrcn_word_to_preds.deploy.prototxt'
    LSTM_NET_FILE = home_dir + LM_net
    DATASET_SUBDIR = '%s/%s_ims' % (DATASET_NAME, str(MAX_IMAGES)
                                    if MAX_IMAGES >= 0 else 'all')
    #DATASET_CACHE_DIR = home_dir + '/retrieval_cache/%s/%s' % (DATASET_SUBDIR, MODEL_FILENAME)
    DATASET_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, '_'.join(MODEL_FILENAME))
    FEATURE_CACHE_DIR = '/x/lisaanne/retrieval_cache/%s/%s' % (
        DATASET_SUBDIR, precomputed_feats)
    VOCAB_FILE = '../../examples/coco_caption/h5_data/buffer_100/%s.txt' % vocab
    DEVICE_ID = gpu
    with open(VOCAB_FILE, 'r') as vocab_file:
        vocab = [line.strip() for line in vocab_file.readlines()]
    coco = COCO(COCO_ANNO_PATH % DATASET_NAME)
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images/%s2014'
    COCO_IMAGE_PATTERN = '../../data/coco/coco/images/%s2014'
    #COCO_IMAGE_PATTERN = '/y/lisaanne/coco/images2/%s2014'
    #image_root = COCO_IMAGE_PATTERN % DATASET_NAME
    image_root = COCO_IMAGE_PATTERN % 'val'
    #image_root = '/z/lisaanne/imageData/imagenet/'
    sg = CocoSequenceGenerator(coco,
                               BUFFER_SIZE,
                               image_root,
                               vocab=vocab,
                               max_words=MAX_WORDS,
                               align=False,
                               shuffle=False,
                               gt_captions=True,
                               pad=True,
                               truncate=True,
                               split_ids=None,
                               feats_bool=feats_bool_in)
    dataset = {}
    for image_path, sentence in sg.image_sentence_pairs:
        if image_path not in dataset:
            dataset[image_path] = []
        dataset[image_path].append((sg.line_to_stream(sentence), sentence))
    print 'Original dataset contains %d images' % len(dataset.keys())
    if 0 <= MAX_IMAGES < len(dataset.keys()):
        all_keys = dataset.keys()
        perm = np.random.permutation(len(all_keys))[:MAX_IMAGES]
        chosen_keys = set([all_keys[p] for p in perm])
        for key in all_keys:
            if key not in chosen_keys:
                del dataset[key]
        print 'Reduced dataset to %d images' % len(dataset.keys())
    if MAX_IMAGES < 0: MAX_IMAGES = len(dataset.keys())
    captioner = Captioner(MODEL_FILE,
                          IMAGE_NET_FILE,
                          LSTM_NET_FILE,
                          VOCAB_FILE,
                          device_id=DEVICE_ID,
                          precomputed_feats=precomputed_h5,
                          prev_word_restriction=prev_word_restriction)
    if 'beam_size' in experiment.keys():
        beam_size = experiment['beam_size']
    else:
        beam_size = 1
    generation_strategy = {'type': 'beam', 'beam_size': beam_size}
    if generation_strategy['type'] == 'beam':
        strategy_name = 'beam%d' % generation_strategy['beam_size']
    elif generation_strategy['type'] == 'sample':
        strategy_name = 'sample%f' % generation_strategy['temp']
    else:
        raise Exception('Unknown generation strategy type: %s' %
                        generation_strategy['type'])
    CACHE_DIR = '%s/%s' % (DATASET_CACHE_DIR, strategy_name)
    experimenter = CaptionExperiment(captioner, dataset, FEATURE_CACHE_DIR,
                                     CACHE_DIR, sg, feats_bool_in)
    captioner.set_image_batch_size(min(100, MAX_IMAGES))
    if experiment['type'] == 'madlib':
        all_mean_index = []
        all_mean_prob = []
        all_top_words = []
        for fw in experiment['fill_words']:
            for cw in experiment['cooccur_words']:
                mean_index, mean_prob, top_words = experimenter.madlib_experiment(
                    fw, [cw])
                all_mean_index.append(mean_index)
                all_mean_prob.append(mean_prob)
                all_top_words.append(top_words)
        return all_mean_index, all_mean_prob, all_top_words
    if experiment['type'] == 'generation':
        experimenter.generation_experiment(generation_strategy, 1000)
    if experiment['type'] == 'score_generation':
        if 'read_file' in experiment.keys():
            read_file = experiment['read_file']
        else:
            read_file = True
        experimenter.score_generation(experiment['json_file'], read_file)
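For reference, the shape of the experiment dicts this function accepts can be read off the branches above; some illustrative examples (all values are placeholders):

# Caption generation with a wider beam.
generation_expt = {'type': 'generation', 'beam_size': 3}

# Madlib-style probing: probability of each fill word given a co-occurring word.
madlib_expt = {'type': 'madlib',
               'fill_words': ['red', 'blue'],
               'cooccur_words': ['car']}

# Scoring captions that were already generated and written to a JSON file.
scoring_expt = {'type': 'score_generation',
                'json_file': 'generated_captions.json',
                'read_file': True}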
Example 8
import caffe
import skimage.io

import util
from captioner import Captioner

vgg_weights_path = './models/VGG_ILSVRC_16_layers.caffemodel'
gpu_id = 0

image_dir = './datasets/ReferIt/ImageCLEF/images/'
cached_context_features_dir = './data/referit_context_features/'

image_net_proto = './prototxt/VGG_ILSVRC_16_layers_deploy.prototxt'
lstm_net_proto = './prototxt/scrc_word_to_preds_full.prototxt'
vocab_file = './data/vocabulary.txt'

captioner = Captioner(vgg_weights_path, image_net_proto, lstm_net_proto,
                      vocab_file, gpu_id)
batch_size = 100
captioner.set_image_batch_size(batch_size)

imlist = util.io.load_str_list('./data/split/referit_all_imlist.txt')
num_im = len(imlist)

# Load all images into memory
loaded_images = []
for n_im in range(num_im):
    if n_im % 200 == 0:
        print('loading image %d / %d into memory' % (n_im, num_im))

    im = skimage.io.imread(image_dir + imlist[n_im] + '.jpg')
    # Gray scale to RGB
    if im.ndim == 2:
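The excerpt cuts off inside the grayscale check. One common way to finish that conversion (an assumption here, not necessarily this project's exact code) is to replicate the single channel into three:

import numpy as np

# If the image is grayscale (H, W), tile it into an (H, W, 3) RGB array.
if im.ndim == 2:
    im = np.tile(im[:, :, np.newaxis], (1, 1, 3))
loaded_images.append(im)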
Example 9
def __init__(self, image_model, image_net_proto, lstm_net_proto, vocab):
    # device_id = -1 presumably runs the underlying Caffe captioner in CPU mode.
    self.captioner = Captioner(image_model, image_net_proto, lstm_net_proto,
                               vocab, -1)
    self.captioner.set_image_batch_size(1)
Example 10
def main():
    parser = argparse.ArgumentParser()

    # Model Setup
    parser.add_argument("--detector_config", default=None, type=str,
                        help="detector config file path.")
    parser.add_argument("--detector_weights", default=None, type=str,
                        help="pretrained detector weights.")
    parser.add_argument("--decoder_config", default=None, type=str,
                        help="Bert decoder config file path.")
    parser.add_argument("--decoder_weights", default=None, type=str,
                        help="pretrained Bert decoder weights.")
    parser.add_argument("--object_vocab", default=None, type=str,
                        help="object vocabulary, maps object ids to object names")

    # For COCO
    parser.add_argument('--coco_root', type=str, default='~/Datasets/coco')
    parser.add_argument("--coco_data_info", default='annotations/dataset_coco.json', type=str,
                        help="The input data file name.")
    parser.add_argument("--coco_ann_file", default='annotations/captions_val2014.json', type=str,
                        help="caption annotations file (i.e. answer key)")
    parser.add_argument('--valid_jpgs_file', default='annotations/coco_valid_jpgs.json', type=str,
                        help="lists the valid jpgs")

    # For data pipeline
    parser.add_argument('--batch_size', type=int, default=1,
                        help="Batch size for decoding. Highly recommended to be a multiple of 8")
    parser.add_argument('--dl_workers', type=int, default=0, help="Number of dataloader workers")

    # For reproducibility
    parser.add_argument('--seed', type=int, default=-1, help="random seed for initialization")

    args = parser.parse_args()

    assert(torch.cuda.is_available())

    cpu_device = torch.device("cpu")
    gpu_device = torch.device("cuda:0")
    n_gpu = torch.cuda.device_count()

    # fix random seed (optional)
    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    with torch.no_grad():

        captioner = Captioner(args.detector_config, args.detector_weights, args.decoder_config, args.decoder_weights,
                              args.object_vocab, cpu_device, gpu_device)

        # TODO: optimize for amp, data-parallel
        torch.cuda.empty_cache()  # Empty everything

        valid_dataset = CocoCaptionsKarpathyValidImgs(args.coco_root)
        valid_dl = DataLoader(valid_dataset, batch_size=args.batch_size, collate_fn=ccc_karpathy_valid_collate,
                              num_workers=args.dl_workers, pin_memory=True)

        total_batch = math.ceil(len(valid_dataset) / args.batch_size)

        predictions = []

        print('start the caption evaluation...')
        with tqdm(total=total_batch) as pbar:
            for img_ids, img_npys in valid_dl:
                captions = captioner.forward(img_npys)

                for img_id, caption in zip(img_ids, captions):
                    predictions.append({'image_id': img_id, 'caption': caption})
                pbar.update(1)

        language_eval(preds=predictions, annFile=os.path.join(args.coco_root, args.coco_ann_file),
                      model_id='0', split='val')
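Everything above sits inside main(), so one hedged way to smoke-test the script (it requires a CUDA device because of the assert) is to fake a command line and call it; the module name and every path below are placeholders:

import sys
import eval_captions  # placeholder name for the module containing main() above

sys.argv = [
    'eval_captions.py',
    '--detector_config', 'configs/detector.yaml',
    '--detector_weights', 'weights/detector.pth',
    '--decoder_config', 'configs/bert_decoder.json',
    '--decoder_weights', 'weights/bert_decoder.pth',
    '--object_vocab', 'data/objects_vocab.txt',
    '--coco_root', '/data/coco',
    '--batch_size', '8',
    '--seed', '42',
]
eval_captions.main()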