def _create_caption_generator():
    # Build a CaptionGenerator from the module-level _CONFIG dict and
    # optionally restore it from a checkpoint.
    cg = CaptionGenerator(
        model_type=_CONFIG['model_type'],
        vocabulary_path=_CONFIG['vocabulary_path'],
        word_embedding_size=_CONFIG['word_embedding_size'],
        visual_feature_size=_CONFIG['visual_feature_size'],
        spatial_feature_size=_CONFIG['spatial_feature_size'],
        hidden_size=_CONFIG['cg_hidden_size'],
        use_all_regions=((_CONFIG['model_type'] == 'region_attention')
                         and (_CONFIG['use_all_regions'] == 'enforced')),
        inference_only=True,
        num_layers=_CONFIG['num_rnn_layers'],
        learning_rate=_CONFIG['learning_rate'],
        dropout_lstm=_CONFIG['dropout_lstm'],
        dropout_word_embedding=_CONFIG['dropout_word_embedding'],
        l2_weight=_CONFIG['l2_weight'],
        block_unnecessary_tokens=_CONFIG['block_unnecessary_tokens'],
        device=_CONFIG['device'])

    if _CONFIG['load_path_cg'] is not None:
        print("Starting from PATH", _CONFIG['load_path_cg'])
        cg.load(checkpoint_path=_CONFIG['load_path_cg'],
                load_optimizer=_CONFIG['load_cg_optimizer'])

    return cg
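
The factory above reads every hyperparameter from a module-level _CONFIG dict. A minimal sketch of the keys it expects (the values are illustrative assumptions, not the original defaults):

_CONFIG = {
    'model_type': 'region_attention',       # value assumed
    'vocabulary_path': 'data/vocab.json',   # path assumed
    'word_embedding_size': 512,
    'visual_feature_size': 2048,
    'spatial_feature_size': 5,
    'cg_hidden_size': 512,
    'use_all_regions': 'enforced',          # only honored for region_attention
    'num_rnn_layers': 1,
    'learning_rate': 1e-4,
    'dropout_lstm': 0.5,
    'dropout_word_embedding': 0.5,
    'l2_weight': 1e-5,
    'block_unnecessary_tokens': True,
    'device': 'cuda',
    'load_path_cg': None,                   # set to a checkpoint path to resume
    'load_cg_optimizer': False,
}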
Example #2
def main(_):
    model = ShowAndTellModel(FLAGS.model_path)
    vocab = Vocabulary(FLAGS.vocab_file)
    filenames = _load_filenames()

    generator = CaptionGenerator(model, vocab)

    for filename in filenames:
        with tf.gfile.GFile(filename, "rb") as f:
            image = f.read()
        captions = generator.beam_search(image)
        print("Captions for image %s:" % os.path.basename(filename))
        for i, caption in enumerate(captions):
            # Ignore begin and end tokens <S> and </S>.
            sentence = [vocab.id_to_token(w) for w in caption.sentence[1:-1]]
            sentence = " ".join(sentence)
            print("  %d) %s (p=%f)" % (i, sentence, math.exp(caption.logprob)))
def test_model_with_feature():
    image = None
    generated_words = None
    if not os.path.exists('data/ixtoword.npy'):
        print('You must run 1. O\'reilly Training.ipynb first.')
    else:
        # index-to-word mapping produced by the training notebook
        ixtoword = np.load('data/ixtoword.npy').tolist()
        n_words = len(ixtoword)
        maxlen = 15

        tf.reset_default_graph()
        sess = tf.InteractiveSession()

        # dim_in, dim_hidden, dim_embed and batch_size are module-level
        # globals defined alongside this function
        caption_generator = CaptionGenerator(dim_in, dim_hidden, dim_embed,
                                             batch_size, maxlen + 2, n_words)
        image, generated_words = caption_generator.build_generator(
            maxlen=maxlen)
        # keep this call inside the else branch: sess and ixtoword only
        # exist when the vocabulary file was found
        test_with_feature(sess, image, generated_words, ixtoword, 55)
Example #4
def train_model(weight=None, batch_size=32, epochs=10):

    cg = CaptionGenerator()
    model = cg.create_model()

    if weight is not None:
        model.load_weights(weight)

    file_name = 'weights-improvement-{epoch:02d}.hdf5'
    checkpoint = ModelCheckpoint(file_name, monitor='loss', verbose=1, save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit_generator(cg.data_generator(batch_size=batch_size),
                        steps_per_epoch=cg.total_samples // batch_size,
                        epochs=epochs, verbose=1, callbacks=callbacks_list)
    try:
        model.save('Models/WholeModel.h5', overwrite=True)
        model.save_weights('Models/Weights.h5', overwrite=True)
    except Exception as e:
        print("Error in saving model:", e)
    print("Training complete...\n")
Example #5
def main(unused_args):

  length_normalization_factor = FLAGS.length_normalization_factor

  # Load model configuration
  config_path = os.path.join(os.path.dirname(__file__), 'model_conf', FLAGS.model_name + '.py')
  config = utility.load_config(config_path)

  config.trainCollection = FLAGS.train_collection
  config.word_cnt_thr = FLAGS.word_cnt_thr
  config.rootpath = FLAGS.rootpath

  train_collection = FLAGS.train_collection
  test_collection = FLAGS.test_collection
  overwrite = FLAGS.overwrite
  feature = FLAGS.vf_name
  rootpath = FLAGS.rootpath  # used below to locate the video/image set file


  img_set_file = os.path.join(rootpath, test_collection, 'VideoSets', '%s.txt' % test_collection)
  if not os.path.exists(img_set_file):
      img_set_file = os.path.join(rootpath, test_collection, 'ImageSets', '%s.txt' % test_collection)
  img_list = [line.strip() for line in open(img_set_file)]

  # make sure the visual features are available
  vf_dir = utility.get_feat_dir(test_collection, feature, rootpath)
  vf_reader = BigFile(vf_dir)

  textbank = TextBank(utility.get_train_vocab_file(FLAGS))
  config.vocab_size = len(textbank.vocab)
  config.vf_size = int(open(os.path.join(vf_dir, 'shape.txt')).read().split()[1])

  model_dir = utility.get_model_dir(FLAGS)
  output_dir = utility.get_pred_dir(FLAGS)

  checkpoint_style = FLAGS.checkpoint_style

  if checkpoint_style == 'file':
    #output_per_filename = 'model_perf_in_topk_%d_%s' % (FLAGS.top_k, FLAGS.eval_model_list_file)
    # read validated top models
    validation_output_dir = utility.get_sim_dir(FLAGS)
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    eval_model_list_file = os.path.join(validation_output_dir, 'loss_info.txt') #FLAGS.eval_model_list_file)
    shutil.copy(eval_model_list_file, output_dir)
    test_iter_list = []
    for line in open(eval_model_list_file).readlines()[:FLAGS.top_k]:
      iter_current = int(line.strip().split()[0])
      test_iter_list.append(iter_current)

  elif checkpoint_style == 'iter_interval':
    #output_per_filename =  'model_perf_in_%s' % FLAGS.eval_stat
    # FLAGS.eval_stat encodes a "start-stop[-step]" range of checkpoint iterations
    test_iter_list = range(*[int(x) for x in FLAGS.eval_stat.split("-")])
  elif checkpoint_style == 'iter_num':
    #output_per_filename =  'model_perf_in_iter_%d' % FLAGS.iter_num
    test_iter_list = [FLAGS.iter_num]

  with_image_embedding = FLAGS.with_image_embedding != 0
  g = tf.Graph()
  with g.as_default():
    model = InferenceWrapper(config=config, model_dir=model_dir,
                             gpu_memory_fraction=FLAGS.gpu_memory_fraction,
                             gpu=FLAGS.gpu,
                             with_image_embedding=with_image_embedding)
    model.build_model()
  
  for k, iter_n in enumerate(test_iter_list):
    model_path = os.path.join(model_dir, 'variables', 'model_%d.ckpt' % iter_n)
    if not os.path.exists(model_path + '.meta'):
      logger.error('Model path: %s', model_path)
      logger.error('Cannot load model file, exiting')
      sys.exit(1)

    top_one_pred_sent_file = os.path.join(output_dir, 'top%d' % k, 'top_one_pred_sent.txt')
    top_n_pred_sent_file = os.path.join(output_dir, 'top%d' % k, 'top_n_pred_sent.txt')
    # perf_file = os.path.join(output_dir, 'model_%d.ckpt' % iter_n, 'perf.txt')

    if os.path.exists(top_one_pred_sent_file) and not overwrite:
      # predictions already exist; skip unless overwrite is set
      logger.info('%s exists. skip', top_one_pred_sent_file)
      continue

    if not os.path.exists(os.path.split(top_one_pred_sent_file)[0]):
      os.makedirs(os.path.split(top_one_pred_sent_file)[0])

    logger.info('save results to %s', top_one_pred_sent_file)

    # load the trained model
    generator = CaptionGenerator(
        config, model,
        length_normalization_factor=length_normalization_factor)
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    config_proto = tf.ConfigProto(
      intra_op_parallelism_threads=FLAGS.ses_threads, gpu_options=gpu_options, allow_soft_placement=True)
    #with  tf.Session(config=config_proto) as session:
      #model.build_model(session, model_path)
    model.load_model(model_path)

    fout_one_sent = codecs.open(top_one_pred_sent_file, 'w', 'utf-8')
    fout_n_sent = codecs.open(top_n_pred_sent_file, 'w', 'utf-8')

    for progress, img in enumerate(img_list):
        # predict sentences given a visual feature
        visual_feature = np.array(vf_reader.read_one(img))
        sentences = generator.beam_search(visual_feature, FLAGS.beam_size)

        # output top one sentence info
        sent_score = sentences[0].score
        sent = ' '.join(sentences[0].words)
        fout_one_sent.write(img + ' ' + '%.3f' % sent_score + ' ' + sent + '\n')
        logger.debug(img + ' ' + '%.3f' % sent_score + ' ' + sent)

        # output top n sentences info
        fout_n_sent.write(img)
        for sentence in sentences:
            sent_score = sentence.score
            sent = ' '.join(sentence.words)
            fout_n_sent.write('\t' + '%.3f' % sent_score + '\t' + sent)
        fout_n_sent.write('\n')
      
        if (progress + 1) % 100 == 0:
          logger.info('%d images decoded' % (progress + 1))

    logger.info('%d images decoded' % (progress+1))
 
    fout_one_sent.close()
    fout_n_sent.close()
Example #6
def train(learning_rate=0.001, continue_training=False, transfer=True):

    tf.reset_default_graph()

    feats, captions = get_data(annotations_path, features_path)
    wordtoix, ixtoword, init_b = build_word_vocab(captions)

    np.save('data/ixtoword', ixtoword)

    index = np.arange(len(feats)).astype(int)
    np.random.shuffle(index)

    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = max(len(caption.split(' ')) for caption in captions)
    caption_generator = CaptionGenerator(dim_in, dim_hidden, dim_embed,
                                         batch_size, maxlen + 2, n_words,
                                         init_b)

    loss, image, sentence, mask = caption_generator.build_model()

    saver = tf.train.Saver(max_to_keep=100)
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                               int(len(index) / batch_size),
                                               0.95)
    # pass global_step so each step increments it and the decay takes effect
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(
        loss, global_step=global_step)
    tf.global_variables_initializer().run()

    if continue_training:
        if not transfer:
            saver.restore(sess, tf.train.latest_checkpoint(model_path))
        else:
            saver.restore(sess,
                          tf.train.latest_checkpoint(model_path_transfer))
    losses = []
    for epoch in range(n_epochs):
        for start, end in zip(range(0, len(index), batch_size),
                              range(batch_size, len(index), batch_size)):

            current_feats = feats[index[start:end]]
            current_captions = captions[index[start:end]]
            current_caption_ind = [
                [wordtoix[word] for word in cap.lower().split(' ')[:-1]
                 if word in wordtoix]
                for cap in current_captions
            ]

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=maxlen + 1)
            # prepend a zero column so every caption row starts with index 0
            current_caption_matrix = np.hstack([
                np.zeros((len(current_caption_matrix), 1), dtype=int),
                current_caption_matrix
            ])

            # mask: 1 for real token positions (token count plus two, covering
            # the start and end positions), 0 for padding
            current_mask_matrix = np.zeros((current_caption_matrix.shape[0],
                                            current_caption_matrix.shape[1]))
            nonzeros = np.array([(row != 0).sum() + 2
                                 for row in current_caption_matrix])

            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run(
                [train_op, loss],
                feed_dict={
                    image: current_feats.astype(np.float32),
                    sentence: current_caption_matrix.astype(np.int32),
                    mask: current_mask_matrix.astype(np.float32)
                })

            print("Current Cost: ", loss_value,
                  "\t Epoch {}/{}".format(epoch, n_epochs),
                  "\t Iter {}/{}".format(start, len(feats)))
        print("Saving the model from epoch: ", epoch)
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
Example #7
import logging

import numpy as np
import tensorflow as tf
from flask import Flask, request, jsonify

from model import ShowAndTellModel
from vocabulary import Vocabulary
from caption_generator import CaptionGenerator

FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string("model", "./show-and-tell.pb", "Model graph def path")
tf.flags.DEFINE_string("vocab", "./word_counts.txt", "Text file containing the vocabulary.")
tf.flags.DEFINE_string("port", "5000", "Port of the server.")
tf.flags.DEFINE_string("host", "localhost", "Host of the server.")
tf.flags.DEFINE_integer("beam_size", 3, "Size of the beam.")
tf.flags.DEFINE_integer("max_caption_length", 20, "Maximum length of the generate caption.")

vocab = Vocabulary(vocab_file_path=FLAGS.vocab)
model = ShowAndTellModel(model_path=FLAGS.model)
generator = CaptionGenerator(model=model, vocab=vocab, beam_size=FLAGS.beam_size,
                             max_caption_length=FLAGS.max_caption_length)

logger = logging.getLogger(__name__)
app = Flask(__name__)


@app.route('/api/image-caption/predict', methods=['GET', 'POST'])
def caption():
    if request.method == 'POST':
        file = request.files['image']
        image = file.read()
        captions = generator.beam_search(image)
        sentences = []
        for caption in captions:
            # ignore the begin and end tokens <S> and </S>
            sentence = [vocab.id_to_token(w) for w in caption.sentence[1:-1]]
            sentences.append((" ".join(sentence), np.exp(caption.logprob)))
        # the original example is truncated here; returning the decoded
        # captions as JSON is an assumed, minimal response format
        return jsonify(captions=[{'caption': s, 'probability': float(p)}
                                 for s, p in sentences])
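
Once the server is running, the endpoint can be exercised with a short client sketch; the requests dependency, the test image name, and the JSON response shape are assumptions rather than part of the original example:

import requests

with open('example.jpg', 'rb') as f:  # any local test image
    resp = requests.post('http://localhost:5000/api/image-caption/predict',
                         files={'image': f})
print(resp.json())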
Example #8
import pickle

import numpy as np
import nltk
from keras.preprocessing import sequence

from caption_generator import CaptionGenerator

cg = CaptionGenerator()


def process_caption(caption):
    caption_split = caption.split()
    processed_caption = caption_split[1:]  # drop the leading token (the start marker)
    try:
        end_index = processed_caption.index('<end>')
        processed_caption = processed_caption[:end_index]
    except ValueError:
        # caption has no <end> token; keep it whole
        pass
    return " ".join(processed_caption)


def get_best_caption(captions):
    # sort ascending by score; the best candidate ends up last
    captions.sort(key=lambda l: l[1])
    best_caption = captions[-1][0]
    return " ".join([cg.index_word[index] for index in best_caption])


def get_all_captions(captions):
    # reconstructed ending (the original example is truncated here): decode
    # every candidate, mirroring get_best_caption above
    final_captions = []
    captions.sort(key=lambda l: l[1])
    for indices, score in captions:
        final_captions.append(
            (" ".join(cg.index_word[index] for index in indices), score))
    return final_captions
Example #9
def evaluate():
    with open(os.path.join(FLAGS.data_dir, 'feature.test'), 'rb') as f:
        feature = cPickle.load(f)
    with open(os.path.join(FLAGS.data_dir, 'caption.test'), 'rb') as f:
        sentence = cPickle.load(f)

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"), (Rouge(), "ROUGE_L"), (Cider(), "CIDEr")]
    vocab, re_vocab = data_utils.initialize_vocabulary()
    GTS = {}
    RES = {}
    batch_size = 1
    max_meteor = 0

    with tf.Session() as sess:
        model = Seq2Seq(FLAGS.num_units,
                        FLAGS.use_lstm,
                        1.0,
                        FLAGS.num_layers,
                        FLAGS.encoder_max_sequence_length,
                        1,
                        FLAGS.feature_size,
                        FLAGS.vocab_size,
                        FLAGS.learning_rate,
                        FLAGS.learning_rate_decay_factor,
                        FLAGS.max_gradient_norm,
                        forward_only=False)
        step = 7000
        while True:
            step += FLAGS.steps_per_checkpoint
            ckpt_path = os.path.join(FLAGS.checkpoint_dir, 'ckpt-%d' % step)
            if os.path.isfile(ckpt_path + '.meta'):
                model.saver.restore(sess, ckpt_path)
                cg = CaptionGenerator(
                    model=model,
                    start_id=data_utils.GO_ID,
                    end_id=data_utils.EOS_ID,
                    beam_size=3,
                    max_caption_length=FLAGS.decoder_max_sentence_length,
                    length_normalization_factor=0.0)
                for vid, _ in feature.items():
                    feature_inputs, batch_decoder_inputs, batch_weights = model.get_batch(
                        feature, [(vid, [0])])
                    outputs = cg.beam_search(sess, feature_inputs)
                    sen = " ".join([
                        tf.compat.as_str(re_vocab[output])
                        for output in outputs
                    ])
                    print("%s: %s" % (sen, sentence[vid][9]))
                    GTS[vid] = sentence[vid]
                    RES[vid] = [sen]
                print('STEP: %d' % step)
                for scorer, method in scorers:
                    score, scores = scorer.compute_score(GTS, RES)
                    if method == "METEOR" and score > max_meteor:
                        max_meteor = score
                    if isinstance(method, list):
                        for k, v in zip(method, score):
                            print("%s:\t%f" % (k, v))
                    else:
                        print("%s:\t%f" % (method, score))
                sys.stdout.flush()
            else:
                break
    print("Max METEOR:\t%f" % max_meteor)
Example #11
    # set model path
    if args.model_path == '':
        model_paths = list(iglob('./experiments/**/*.hdf5', recursive=True))
        model_paths = sorted(model_paths, key=os.path.getctime)
        model_path = model_paths[-1]  # most recently written weights
    else:
        model_path = args.model_path
    print('Model path: {}'.format(model_path))

    # load image encoder and caption generator
    image_encoder = build_image_encoder(dm_train.image_height,
                                        dm_train.image_width,
                                        dm_train.n_channels)
    caption_generator = CaptionGenerator(model_path, image_encoder,
                                         dm_train.vocabulary,
                                         dm_train.caption_length)

    # calculate bleu score
    predicted_captions = []
    true_caption_lists = []

    for image, caption_list in tqdm(
            dm_val.flow_test(return_encoded_captions=False),
            total=len(dm_val.images)):
        predicted_caption = caption_generator.generate_captions(
            image,
            max_caption_length=args.max_caption_length,
            beam_size=args.beam_size)[0]
        predicted_captions.append(predicted_caption)
        true_caption_lists.append(caption_list)  # keep the references for BLEU
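
The example is truncated at this point; what follows is a minimal sketch of the BLEU computation announced by the comment above, assuming captions are whitespace-separated strings (the tokenization and NLTK's corpus_bleu are assumptions, not the original code):

    from nltk.translate.bleu_score import corpus_bleu

    # one list of tokenized reference captions per image, and one
    # tokenized hypothesis per image
    references = [[caption.split() for caption in caption_list]
                  for caption_list in true_caption_lists]
    hypotheses = [caption.split() for caption in predicted_captions]
    print('BLEU-4: {:.4f}'.format(corpus_bleu(references, hypotheses)))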