def _create_caption_generator():
    cg = CaptionGenerator(
        model_type=_CONFIG['model_type'],
        vocabulary_path=_CONFIG['vocabulary_path'],
        word_embedding_size=_CONFIG['word_embedding_size'],
        visual_feature_size=_CONFIG['visual_feature_size'],
        spatial_feature_size=_CONFIG['spatial_feature_size'],
        hidden_size=_CONFIG['cg_hidden_size'],
        use_all_regions=((_CONFIG['model_type'] == 'region_attention')
                         and (_CONFIG['use_all_regions'] == 'enforced')),
        inference_only=True,
        num_layers=_CONFIG['num_rnn_layers'],
        learning_rate=_CONFIG['learning_rate'],
        dropout_lstm=_CONFIG['dropout_lstm'],
        dropout_word_embedding=_CONFIG['dropout_word_embedding'],
        l2_weight=_CONFIG['l2_weight'],
        block_unnecessary_tokens=_CONFIG['block_unnecessary_tokens'],
        device=_CONFIG['device'])
    if _CONFIG['load_path_cg'] is not None:
        print("Starting from PATH", _CONFIG['load_path_cg'])
        cg.load(checkpoint_path=_CONFIG['load_path_cg'],
                load_optimizer=_CONFIG['load_cg_optimizer'])
    return cg
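# A minimal usage sketch (assumed, not from the original source): _CONFIG is a
# plain dict read by _create_caption_generator above. The keys are exactly the
# ones the function accesses; the values here are illustrative placeholders.
_CONFIG = {
    'model_type': 'region_attention',
    'vocabulary_path': 'data/vocab.json',   # hypothetical path
    'word_embedding_size': 256,
    'visual_feature_size': 2048,
    'spatial_feature_size': 5,
    'cg_hidden_size': 512,
    'use_all_regions': 'enforced',
    'num_rnn_layers': 1,
    'learning_rate': 1e-4,
    'dropout_lstm': 0.5,
    'dropout_word_embedding': 0.5,
    'l2_weight': 1e-6,
    'block_unnecessary_tokens': True,
    'device': 'cuda',
    'load_path_cg': None,                   # or a checkpoint path to resume from
    'load_cg_optimizer': False,
}
caption_generator = _create_caption_generator()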
def main(_):
    model = ShowAndTellModel(FLAGS.model_path)
    vocab = Vocabulary(FLAGS.vocab_file)
    filenames = _load_filenames()

    generator = CaptionGenerator(model, vocab)

    for filename in filenames:
        with tf.gfile.GFile(filename, "rb") as f:
            image = f.read()
        captions = generator.beam_search(image)
        print("Captions for image %s:" % os.path.basename(filename))
        for i, caption in enumerate(captions):
            # Ignore begin and end tokens <S> and </S>.
            sentence = [vocab.id_to_token(w) for w in caption.sentence[1:-1]]
            sentence = " ".join(sentence)
            print("  %d) %s (p=%f)" % (i, sentence, math.exp(caption.logprob)))
def test_model_with_feature():
    image = None
    generated_words = None
    if not os.path.exists('data/ixtoword.npy'):
        print('You must run 1. O\'reilly Training.ipynb first.')
    else:
        ixtoword = np.load('data/ixtoword.npy').tolist()
        n_words = len(ixtoword)
        maxlen = 15
        tf.reset_default_graph()
        sess = tf.InteractiveSession()
        caption_generator = CaptionGenerator(dim_in, dim_hidden, dim_embed,
                                             batch_size, maxlen + 2, n_words)
        image, generated_words = caption_generator.build_generator(maxlen=maxlen)
        test_with_feature(sess, image, generated_words, ixtoword, 55)
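# The function above relies on module-level hyperparameters that are not shown
# in this snippet. A sketch of what they might look like (assumed; the values
# below are illustrative, not taken from the source):
dim_in = 4096      # e.g. size of a CNN fc-layer feature vector
dim_embed = 256    # word embedding size
dim_hidden = 256   # LSTM hidden size
batch_size = 128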
def train_model(weight=None, batch_size=32, epochs=10):
    cg = CaptionGenerator()
    model = cg.create_model()

    if weight is not None:
        model.load_weights(weight)

    file_name = 'weights-improvement-{epoch:02d}.hdf5'
    checkpoint = ModelCheckpoint(file_name, monitor='loss', verbose=1,
                                 save_best_only=True, mode='min')
    callbacks_list = [checkpoint]
    model.fit_generator(cg.data_generator(batch_size=batch_size),
                        steps_per_epoch=cg.total_samples // batch_size,
                        epochs=epochs, verbose=1, callbacks=callbacks_list)
    try:
        model.save('Models/WholeModel.h5', overwrite=True)
        model.save_weights('Models/Weights.h5', overwrite=True)
    except (IOError, OSError):
        print("Error in saving model.")
    print("Training complete...\n")
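# A minimal usage sketch (assumed): resume training from the weights file that
# train_model itself writes. The path matches the save call above.
if __name__ == '__main__':
    train_model(weight='Models/Weights.h5', batch_size=32, epochs=10)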
def main(unused_args):
    length_normalization_factor = FLAGS.length_normalization_factor

    # Load model configuration
    config_path = os.path.join(os.path.dirname(__file__), 'model_conf',
                               FLAGS.model_name + '.py')
    config = utility.load_config(config_path)

    config.trainCollection = FLAGS.train_collection
    config.word_cnt_thr = FLAGS.word_cnt_thr
    config.rootpath = FLAGS.rootpath

    rootpath = FLAGS.rootpath
    train_collection = FLAGS.train_collection
    test_collection = FLAGS.test_collection
    overwrite = FLAGS.overwrite
    feature = FLAGS.vf_name

    img_set_file = os.path.join(rootpath, test_collection, 'VideoSets',
                                '%s.txt' % test_collection)
    if not os.path.exists(img_set_file):
        img_set_file = os.path.join(rootpath, test_collection, 'ImageSets',
                                    '%s.txt' % test_collection)
    img_list = list(map(str.strip, open(img_set_file).readlines()))

    # have visual feature ready
    vf_dir = utility.get_feat_dir(test_collection, feature, rootpath)
    vf_reader = BigFile(vf_dir)

    textbank = TextBank(utility.get_train_vocab_file(FLAGS))
    config.vocab_size = len(textbank.vocab)
    config.vf_size = int(open(os.path.join(vf_dir, 'shape.txt')).read().split()[1])

    model_dir = utility.get_model_dir(FLAGS)
    output_dir = utility.get_pred_dir(FLAGS)

    checkpoint_style = FLAGS.checkpoint_style
    if checkpoint_style == 'file':
        # read validated top models
        validation_output_dir = utility.get_sim_dir(FLAGS)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        eval_model_list_file = os.path.join(validation_output_dir, 'loss_info.txt')
        shutil.copy(eval_model_list_file, output_dir)
        test_iter_list = []
        for line in open(eval_model_list_file).readlines()[:FLAGS.top_k]:
            iter_current = int(line.strip().split()[0])
            test_iter_list.append(iter_current)
    elif checkpoint_style == 'iter_interval':
        test_iter_list = range(*[int(x) for x in FLAGS.eval_stat.split("-")])
    elif checkpoint_style == 'iter_num':
        test_iter_list = [FLAGS.iter_num]

    with_image_embedding = True if FLAGS.with_image_embedding != 0 else False
    g = tf.Graph()
    with g.as_default():
        model = InferenceWrapper(config=config, model_dir=model_dir,
                                 gpu_memory_fraction=FLAGS.gpu_memory_fraction,
                                 gpu=FLAGS.gpu,
                                 with_image_embedding=with_image_embedding)
        model.build_model()

        for k, iter_n in enumerate(test_iter_list):
            model_path = os.path.join(model_dir, 'variables',
                                      'model_%d.ckpt' % iter_n)
            if not os.path.exists(model_path + '.meta'):
                logger.error('Model path: %s', model_path)
                logger.error('Cannot load model file and exit')
                sys.exit(0)

            top_one_pred_sent_file = os.path.join(output_dir, 'top%d' % k,
                                                  'top_one_pred_sent.txt')
            top_n_pred_sent_file = os.path.join(output_dir, 'top%d' % k,
                                                'top_n_pred_sent.txt')
            if os.path.exists(top_one_pred_sent_file) and not overwrite:
                logger.info('%s exists. skip', top_one_pred_sent_file)
                continue

            if not os.path.exists(os.path.split(top_one_pred_sent_file)[0]):
                os.makedirs(os.path.split(top_one_pred_sent_file)[0])
            logger.info('save results to %s', top_one_pred_sent_file)

            # load the trained model
            generator = CaptionGenerator(
                config, model,
                length_normalization_factor=length_normalization_factor)
            gpu_options = tf.GPUOptions(
                per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
            config_proto = tf.ConfigProto(
                intra_op_parallelism_threads=FLAGS.ses_threads,
                gpu_options=gpu_options,
                allow_soft_placement=True)
            model.load_model(model_path)

            fout_one_sent = codecs.open(top_one_pred_sent_file, 'w', 'utf-8')
            fout_n_sent = codecs.open(top_n_pred_sent_file, 'w', 'utf-8')

            for progress, img in enumerate(img_list):
                # predict sentences given a visual feature
                visual_feature = np.array(vf_reader.read_one(img))
                sentences = generator.beam_search(visual_feature, FLAGS.beam_size)

                # output top one sentence info
                sent_score = sentences[0].score
                sent = ' '.join(sentences[0].words)
                fout_one_sent.write(img + ' ' + '%.3f' % sent_score + ' ' + sent + '\n')
                logger.debug(img + ' ' + '%.3f' % sent_score + ' ' + sent)

                # output top n sentences info
                fout_n_sent.write(img)
                for sentence in sentences:
                    sent_score = sentence.score
                    sent = ' '.join(sentence.words)
                    fout_n_sent.write('\t' + '%.3f' % sent_score + '\t' + sent)
                fout_n_sent.write('\n')

                if progress % 100 == 0:
                    logger.info('%d images decoded' % (progress + 1))
            logger.info('%d images decoded' % (progress + 1))

            fout_one_sent.close()
            fout_n_sent.close()
def train(learning_rate=0.001, continue_training=False, transfer=True):
    tf.reset_default_graph()

    feats, captions = get_data(annotations_path, features_path)
    wordtoix, ixtoword, init_b = build_word_vocab(captions)
    np.save('data/ixtoword', ixtoword)

    index = np.arange(len(feats)).astype(int)
    np.random.shuffle(index)

    sess = tf.InteractiveSession()
    n_words = len(wordtoix)
    maxlen = np.max([len(x.split(' ')) for x in captions])
    caption_generator = CaptionGenerator(dim_in, dim_hidden, dim_embed,
                                         batch_size, maxlen + 2, n_words, init_b)

    loss, image, sentence, mask = caption_generator.build_model()

    saver = tf.train.Saver(max_to_keep=100)
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.exponential_decay(learning_rate, global_step,
                                               int(len(index) / batch_size), 0.95)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    tf.global_variables_initializer().run()

    if continue_training:
        if not transfer:
            saver.restore(sess, tf.train.latest_checkpoint(model_path))
        else:
            saver.restore(sess, tf.train.latest_checkpoint(model_path_transfer))

    losses = []
    for epoch in range(n_epochs):
        for start, end in zip(range(0, len(index), batch_size),
                              range(batch_size, len(index), batch_size)):
            current_feats = feats[index[start:end]]
            current_captions = captions[index[start:end]]
            # drop the trailing token and keep only in-vocabulary words
            current_caption_ind = [
                [wordtoix[word] for word in cap.lower().split(' ')[:-1]
                 if word in wordtoix]
                for cap in current_captions
            ]

            current_caption_matrix = sequence.pad_sequences(
                current_caption_ind, padding='post', maxlen=maxlen + 1)
            # prepend a start-token column (index 0)
            current_caption_matrix = np.hstack([
                np.full((len(current_caption_matrix), 1), 0),
                current_caption_matrix
            ])

            current_mask_matrix = np.zeros((current_caption_matrix.shape[0],
                                            current_caption_matrix.shape[1]))
            # +2 covers the start token and the step that predicts the end token
            nonzeros = np.array([(row != 0).sum() + 2
                                 for row in current_caption_matrix])
            for ind, row in enumerate(current_mask_matrix):
                row[:nonzeros[ind]] = 1

            _, loss_value = sess.run(
                [train_op, loss],
                feed_dict={
                    image: current_feats.astype(np.float32),
                    sentence: current_caption_matrix.astype(np.int32),
                    mask: current_mask_matrix.astype(np.float32)
                })
            print("Current Cost: ", loss_value,
                  "\t Epoch {}/{}".format(epoch, n_epochs),
                  "\t Iter {}/{}".format(start, len(feats)))
        print("Saving the model from epoch: ", epoch)
        saver.save(sess, os.path.join(model_path, 'model'), global_step=epoch)
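# A toy illustration (not from the source) of the mask logic used above: for a
# padded caption matrix with a leading start-token column of zeros, each row's
# mask covers its words plus two extra steps (the start token and the step
# that predicts the end token).
import numpy as np

toy = np.array([[0, 4, 9, 7, 0, 0],   # 3 non-zero entries -> mask length 5
                [0, 5, 2, 0, 0, 0]])  # 2 non-zero entries -> mask length 4
nonzeros = np.array([(row != 0).sum() + 2 for row in toy])
mask = np.zeros_like(toy, dtype=np.float32)
for i, n in enumerate(nonzeros):
    mask[i, :n] = 1
print(mask)
# [[1. 1. 1. 1. 1. 0.]
#  [1. 1. 1. 1. 0. 0.]]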
import logging

import numpy as np
import tensorflow as tf
from flask import Flask, jsonify, request

from model import ShowAndTellModel
from vocabulary import Vocabulary
from caption_generator import CaptionGenerator

FLAGS = tf.flags.FLAGS

tf.flags.DEFINE_string("model", "./show-and-tell.pb", "Model graph def path")
tf.flags.DEFINE_string("vocab", "./word_counts.txt",
                       "Text file containing the vocabulary.")
tf.flags.DEFINE_string("port", "5000", "Port of the server.")
tf.flags.DEFINE_string("host", "localhost", "Host of the server.")
tf.flags.DEFINE_integer("beam_size", 3, "Size of the beam.")
tf.flags.DEFINE_integer("max_caption_length", 20,
                        "Maximum length of the generated caption.")

vocab = Vocabulary(vocab_file_path=FLAGS.vocab)
model = ShowAndTellModel(model_path=FLAGS.model)
generator = CaptionGenerator(model=model, vocab=vocab,
                             beam_size=FLAGS.beam_size,
                             max_caption_length=FLAGS.max_caption_length)

logger = logging.getLogger(__name__)

app = Flask(__name__)


@app.route('/api/image-caption/predict', methods=['GET', 'POST'])
def caption():
    if request.method == 'POST':
        file = request.files['image']
        image = file.read()
        captions = generator.beam_search(image)
        sentences = []
        for caption in captions:
            sentence = [vocab.id_to_token(w) for w in caption.sentence[1:-1]]
            sentences.append((" ".join(sentence), np.exp(caption.logprob)))
        # the original snippet is truncated here; an assumed response format
        return jsonify(captions=[{"sentence": s, "probability": float(p)}
                                 for s, p in sentences])
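# A minimal client sketch (assumed, not part of the source): POST an image to
# the endpoint defined above. Requires the `requests` package; 'example.jpg'
# is a placeholder file name, and host/port match the flag defaults.
import requests

with open('example.jpg', 'rb') as f:
    resp = requests.post('http://localhost:5000/api/image-caption/predict',
                         files={'image': f})
print(resp.json())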
import pickle

import numpy as np
import nltk
from keras.preprocessing import sequence

from caption_generator import CaptionGenerator

cg = CaptionGenerator()


def process_caption(caption):
    """Strip the leading <start> token and everything from <end> onward."""
    caption_split = caption.split()
    processed_caption = caption_split[1:]
    try:
        end_index = processed_caption.index('<end>')
        processed_caption = processed_caption[:end_index]
    except ValueError:
        pass
    return " ".join(processed_caption)


def get_best_caption(captions):
    """Return the highest-scoring caption, decoded back to words."""
    captions.sort(key=lambda l: l[1])
    best_caption = captions[-1][0]
    return " ".join([cg.index_word[index] for index in best_caption])


def get_all_captions(captions):
    final_captions = []
    captions.sort(key=lambda l: l[1])
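# A toy illustration (assumed data, not from the source): `captions` is a list
# of (word-index sequence, score) pairs as produced by the repo's beam search,
# and cg.index_word maps indices back to words. get_best_caption sorts by the
# score and decodes the last (highest-scoring) sequence, here [1, 5, 3].
toy_captions = [([1, 7, 3], -2.1), ([1, 5, 3], -1.4)]
best = get_best_caption(toy_captions)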
def evaluate():
    with open(os.path.join(FLAGS.data_dir, 'feature.test'), 'rb') as f:
        feature = cPickle.load(f)
    with open(os.path.join(FLAGS.data_dir, 'caption.test'), 'rb') as f:
        sentence = cPickle.load(f)

    scorers = [(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
               (Meteor(), "METEOR"),
               (Rouge(), "ROUGE_L"),
               (Cider(), "CIDEr")]
    vocab, re_vocab = data_utils.initialize_vocabulary()
    GTS = {}
    RES = {}
    batch_size = 1
    max_meteor = 0

    with tf.Session() as sess:
        model = Seq2Seq(FLAGS.num_units, FLAGS.use_lstm, 1.0, FLAGS.num_layers,
                        FLAGS.encoder_max_sequence_length, 1, FLAGS.feature_size,
                        FLAGS.vocab_size, FLAGS.learning_rate,
                        FLAGS.learning_rate_decay_factor,
                        FLAGS.max_gradient_norm, forward_only=False)
        step = 7000
        while True:
            step += FLAGS.steps_per_checkpoint
            ckpt_path = os.path.join(FLAGS.checkpoint_dir, 'ckpt-%d' % step)
            if os.path.isfile(ckpt_path + '.meta'):
                model.saver.restore(sess, ckpt_path)
                cg = CaptionGenerator(
                    model=model,
                    start_id=data_utils.GO_ID,
                    end_id=data_utils.EOS_ID,
                    beam_size=3,
                    max_caption_length=FLAGS.decoder_max_sentence_length,
                    length_normalization_factor=0.0)
                for vid, _ in feature.iteritems():
                    feature_inputs, batch_decoder_inputs, batch_weights = \
                        model.get_batch(feature, [(vid, [0])])
                    outputs = cg.beam_search(sess, feature_inputs)
                    sen = " ".join([tf.compat.as_str(re_vocab[output])
                                    for output in outputs])
                    print("%s: %s" % (sen, sentence[vid][9]))
                    GTS[vid] = sentence[vid]
                    RES[vid] = [sen]
                print('STEP: %d' % step)
                for scorer, method in scorers:
                    score, scores = scorer.compute_score(GTS, RES)
                    if method == "METEOR" and score > max_meteor:
                        max_meteor = score
                    if isinstance(method, list):
                        for k, v in zip(method, score):
                            print("%s:\t%f" % (k, v))
                    else:
                        print("%s:\t%f" % (method, score))
                sys.stdout.flush()
                exit(0)  # note: stops after the first checkpoint evaluated
            else:
                break
    print("Max METEOR:\t%f" % max_meteor)
# set model path
if args.model_path == '':
    model_paths = list(iglob('./experiments/**/*.hdf5', recursive=True))
    model_paths = sorted(model_paths, key=lambda x: os.path.getctime(x))
    model_path = model_paths[-1]  # most recently written weights
else:
    model_path = args.model_path
print('Model path: {}'.format(model_path))

# load image encoder and caption generator
image_encoder = build_image_encoder(dm_train.image_height,
                                    dm_train.image_width,
                                    dm_train.n_channels)
caption_generator = CaptionGenerator(model_path, image_encoder,
                                     dm_train.vocabulary,
                                     dm_train.caption_length)

# calculate bleu score
predicted_captions = []
true_caption_lists = []
for image, caption_list in tqdm(
        dm_val.flow_test(return_encoded_captions=False),
        total=len(dm_val.images)):
    predicted_caption = caption_generator.generate_captions(
        image,
        max_caption_length=args.max_caption_length,
        beam_size=args.beam_size)[0]
    predicted_captions.append(predicted_caption)
    true_caption_lists.append(caption_list)
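# The BLEU computation itself is not shown above; a minimal sketch using
# NLTK's corpus_bleu, assuming each predicted caption is a string and each
# caption_list is a list of reference strings (both whitespace-tokenized).
from nltk.translate.bleu_score import corpus_bleu

references = [[ref.split() for ref in refs] for refs in true_caption_lists]
hypotheses = [pred.split() for pred in predicted_captions]
bleu4 = corpus_bleu(references, hypotheses)  # default 4-gram BLEU
print('BLEU-4: {:.4f}'.format(bleu4))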