def train():
    """Build the captioning graph and train it under a ``tf.train.Supervisor``.

    The vocabulary comes from ``get_word_to_id`` and is regenerated with
    ``gen_word_to_id`` when the loaded mapping is empty.  For every epoch in
    ``range(config.max_max_epoch)`` the decayed learning rate is assigned to
    the LSTM and ``ImageCaption.run_epoch`` is executed; its return value is
    printed together with the epoch index.
    """
    cfg = lstm.config()

    vocab, _inverse_vocab = get_word_to_id()
    if len(vocab) == 0:
        vocab, _inverse_vocab = gen_word_to_id(FLAGS.caption_path,
                                               FLAGS.image_path)

    _f, image, _sentence = data_producer.get_data(
        FLAGS.caption_path, FLAGS.image_path, batch_size=cfg.batch_size)
    cfg.vocab_size = len(vocab)
    print("vocab size:", cfg.vocab_size)

    steps_per_epoch = 100
    # NOTE(review): `word` and `target` are never assigned in this function.
    # Unless they exist as module-level names this line raises NameError.
    model = ImageCaption(image, word, target, cfg)

    supervisor = tf.train.Supervisor(logdir=FLAGS.save_path)
    session_options = tf.ConfigProto(allow_soft_placement=True)
    with supervisor.managed_session(config=session_options) as sess:
        print("log save:", FLAGS.log_path)
        writer = tf.summary.FileWriter(FLAGS.log_path, sess.graph)
        for epoch in range(cfg.max_max_epoch):
            # The exponent stays at 0 (factor 1.0) until epoch + 1 exceeds
            # cfg.max_epoch, then grows by one per epoch.
            decay = cfg.lr_decay**max(epoch + 1 - cfg.max_epoch, 0.0)
            # The value printed under "lr:" is the decay factor, not the
            # learning rate that is assigned on the next line.
            print("lr:", decay)
            model.lstm.assign_lr(sess, cfg.learning_rate * decay)
            result = model.run_epoch(sess, decay, steps_per_epoch, writer,
                                     supervisor)
            print("step %d per %f" % (epoch, result))
def train():
    """Train the image-captioning model inside a plain ``tf.Session``.

    Overrides ``batch_size`` (2) and ``hidden_size`` (512) on the LSTM config,
    builds the input pipeline with ``data.get_data``, restores or initialises
    the session through ``load_session`` and then runs
    ``config.max_max_epoch`` epochs of ``ImageCaption.run_epoch``.

    The queue-runner threads are tied to a ``tf.train.Coordinator`` so they
    are stopped and joined, and the summary writer is closed, even when
    training raises.
    """
    config = lstm.config()
    config.batch_size = 2
    config.hidden_size = 512

    # Only the image/word/target tensors and the word->id mapping are used.
    _, image, _, word, target, w2d, _ = data.get_data(
        FLAGS.caption_path,
        FLAGS.image_path,
        max_len=config.num_steps + 1,
        batch_size=config.batch_size)
    epoch_size = 10000
    config.vocab_size = len(w2d)
    print("vb size:", len(w2d))

    image_caption = ImageCaption(image, word, target, config)
    config_proto = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config_proto) as sess:
        sv = load_session(sess, FLAGS.save_path)
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
        summary_writer = tf.summary.FileWriter(FLAGS.log_path, sess.graph)
        try:
            for i in range(config.max_max_epoch):
                # The exponent is 0 (factor 1.0) until i + 1 exceeds
                # config.max_epoch.
                x_lr_decay = config.lr_decay**max(i + 1 - config.max_epoch,
                                                  0.0)
                # Prints the decay factor; the assigned learning rate is
                # config.learning_rate * x_lr_decay.
                print("lr:", x_lr_decay)
                image_caption.lstm.assign_lr(
                    sess, config.learning_rate * x_lr_decay)
                p = image_caption.run_epoch(sess, x_lr_decay, epoch_size,
                                            summary_writer, sv)
                print("step %d per %f" % (i, p))
        finally:
            # Flush pending summaries and shut the input threads down before
            # the session is closed by the enclosing ``with``.
            summary_writer.close()
            coord.request_stop()
            coord.join(threads)
def predict_one(img, sess, image_caption, candidats, beam_width=20,
                max_steps=20, end_token=0):
    """Beam-search a caption for a single image.

    Args:
        img: image array; reshaped to ``(1, 224, 224, 3)``.
        sess: session used to evaluate ``image_caption.lstm.logits``.
        image_caption: model exposing the ``images`` and ``word`` feed
            tensors plus ``lstm.seqlen`` and ``lstm.logits``.
        candidats: initial beam, a list of objects with ``words`` (list of
            ids) and ``score`` attributes.  All entries are fed with the
            length of the first one.
        beam_width: number of candidates kept after every step.
        max_steps: maximum number of decoding steps.
        end_token: word id that terminates the search when it is the last
            word of the best candidate.

    Returns:
        The current beam, sorted by descending score.  It is returned as soon
        as the best candidate ends with ``end_token``, or after ``max_steps``
        steps (this case used to fall through and return ``None``).
    """
    if not candidats:
        # Nothing to extend; avoids an IndexError on seq[0] below.
        return candidats

    img = img.reshape((1, 224, 224, 3))
    for _ in range(max_steps):
        seq = [s.words for s in candidats]
        seq_len = [len(seq[0])] * len(seq)
        feed_dict = {
            # One copy of the image per candidate sequence.
            image_caption.images:
                np.array([img] * len(seq)).reshape(-1, 224, 224, 3),
            image_caption.word: np.array(seq),
            image_caption.lstm.seqlen: np.array(seq_len)
        }
        outputs = sess.run(image_caption.lstm.logits, feed_dict=feed_dict)

        new_list = []
        for s, s_out in zip(candidats, outputs):
            # Scores for the word following the last fed position.
            predict = s_out[-1]
            # Only the beam_width best continuations of a candidate can
            # survive the global cut below, so the rest of the vocabulary is
            # not expanded.  NaN entries sort to the front after the
            # reversal, so the assertion below still sees them.
            best_words = predict.argsort()[::-1][:beam_width]
            for w in best_words:
                # NOTE(review): np.log is applied directly, so `logits` is
                # presumably already a probability distribution -- confirm.
                score = np.log(predict[w])
                # The original `assert not isnan(score) % predict[w]` parsed
                # as `not (isnan(score) % predict[w])`; the value was meant
                # to be the assertion message.
                assert not math.isnan(score), predict[w]
                new_list.append(Sentence(s.words + [w], s.score + score))

        new_list.sort(key=lambda sentence: sentence.score, reverse=True)
        candidats = new_list[:beam_width]
        for l in candidats:
            print("sort list:", l.words, l.score)
        if candidats[0].words[-1] == end_token:
            return candidats
    return candidats
def predict():
    """Greedy-decode a caption for ``./test_data/tiger.jpeg`` and print it.

    Loads the vocabulary (rebuilding it through ``data.get_data`` when the
    stored mapping is empty), builds the inference graph on placeholders and
    restores it through a ``tf.train.Supervisor`` session.  Starting from the
    ``<start>`` token, up to 50 words are appended one at a time; after each
    step the id sequence and its words are printed.  Decoding stops early
    when id 2 is produced.

    All output goes through single-argument ``print(...)`` calls, which emit
    the same text on Python 2 and Python 3.
    """
    config = lstm.config()
    img = utils.load_image("./test_data/tiger.jpeg")
    img = img.reshape((1, 224, 224, 3))

    w2d, d2w = data.get_word_to_id()
    print("read w2d size: %d" % len(w2d))
    if len(w2d) == 0:
        # Only the vocabulary mappings are needed from the data pipeline.
        _, _, _, _, _, w2d, d2w = data.get_data(
            FLAGS.caption_path,
            FLAGS.image_path,
            max_len=config.num_steps + 1,
            batch_size=config.batch_size)
        print("reload read w2d size: %d" % len(w2d))
    config.vocab_size = len(w2d)

    images = tf.placeholder("float", [None, 224, 224, 3], name="image")
    word = tf.placeholder(tf.int32, [None, None], name="word_seq")
    image_caption = ImageCaption(images, word, None, config,
                                 is_training=False)

    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    config_proto = tf.ConfigProto(allow_soft_placement=True)
    with sv.managed_session(config=config_proto) as sess:
        seq = [w2d['<start>']]
        for _ in range(50):
            seq_len = np.array(len(seq)).reshape([-1])
            feed_dict = {
                images: img,
                word: np.array(seq).reshape([1, -1]),
                image_caption.lstm.seqlen: seq_len
            }
            output = sess.run(image_caption.lstm.logits, feed_dict=feed_dict)
            print(output.shape)
            # NOTE(review): np.argmax flattens its argument; this is a word
            # id only if output[-1] is a single vocabulary-sized vector --
            # confirm against the shape printed above.
            idx = np.argmax(output[-1])
            seq.append(idx)
            print(seq)
            print([d2w[s] for s in seq])
            # NOTE(review): 2 is presumably the end-of-sentence id -- confirm
            # against the vocabulary.
            if idx == 2:
                break
def evaluate():
    """Caption five test samples with beam search and print their scores.

    Uses ``hidden_size`` 512 and ``batch_size`` 1, reads samples from
    ``data.get_data(..., mode='test')`` and decodes each fetched image with
    ``predict_one``.  For every sample the fetched ``f`` value, each
    hypothesis (ids and words), the reference label (ids and words) and
    ``score.score(reference, hypotheses)`` are printed.

    All output goes through single-argument ``print(...)`` calls, which emit
    the same text on Python 2 and Python 3.
    """
    config = lstm.config()
    config.hidden_size = 512
    config.batch_size = 1

    w2d, d2w = data.get_word_to_id()
    print("read w2d size: %d" % len(w2d))
    config.vocab_size = len(w2d)
    print("%s %s %s %s" % (FLAGS.caption_path, FLAGS.image_path,
                           config.num_steps + 1, config.batch_size))

    # Only the first three returned tensors are fetched below.
    f, image, label, _, _, _, _ = data.get_data(
        FLAGS.caption_path,
        FLAGS.image_path,
        max_len=config.num_steps + 1,
        batch_size=config.batch_size,
        mode='test')

    images = tf.placeholder("float", [None, 224, 224, 3], name="image")
    word = tf.placeholder(tf.int32, [None, None], name="word_seq")
    image_caption = ImageCaption(images, word, None, config,
                                 is_training=False)

    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    config_proto = tf.ConfigProto(allow_soft_placement=True)
    with sv.managed_session(config=config_proto) as sess:
        for _ in range(5):
            # Distinct names: the original reused the loop index `i` for the
            # fetched image.
            f_, img, labels = sess.run([f, image, label])
            candidats = [Sentence([w2d['<start>']])]
            res = predict_one(img, sess, image_caption, candidats)
            print(f_)
            if res is None:
                # predict_one can finish without returning a beam when no
                # candidate reaches the end token; there is nothing to score.
                print("no finished caption for %s" % (f_,))
                continue
            for r in res:
                print(r.words)
                print([d2w[p] for p in r.words])
            print(labels)
            print([d2w[p] for p in labels[0]])
            hy = [r.words for r in res]
            print("score: %s" % (score.score(labels[0], hy),))
def predict():
    """Create the LSTM configuration and load the tiger test image.

    Both objects stay local to the function; nothing is returned.
    """
    model_config = lstm.config()
    test_image = utils.load_image("./test_data/tiger.jpeg")
def predict(image_path="/home/tusimple/junechen/ml_data/data/train2014/"
            "COCO_train2014_000000160629.jpg"):
    """Beam-search a caption for one image and print the beam after each step.

    Args:
        image_path: image to caption.  Defaults to the path that used to be
            hard-coded in the body, so existing zero-argument calls behave as
            before.

    Uses ``hidden_size`` 512 and ``batch_size`` 2, loads the vocabulary
    (rebuilding it through ``data.get_data`` when the stored mapping is
    empty), builds the inference graph on placeholders and runs 20 decoding
    steps.  After each step the 20 best candidates are kept and printed as
    ids and as words.

    Former Python 2 ``print`` statements are now single-argument
    ``print(...)`` calls, which emit the same text on Python 2 and Python 3.
    """
    config = lstm.config()
    config.hidden_size = 512
    config.batch_size = 2

    img = utils.load_image(image_path)
    img = img.reshape((1, 224, 224, 3))

    w2d, d2w = data.get_word_to_id()
    print("read w2d size: %d" % len(w2d))
    if len(w2d) == 0:
        # Only the vocabulary mappings are needed from the data pipeline.
        _, _, _, _, _, w2d, d2w = data.get_data(
            FLAGS.caption_path,
            FLAGS.image_path,
            max_len=config.num_steps + 1,
            batch_size=config.batch_size)
        print("reload read w2d size: %d" % len(w2d))
    config.vocab_size = len(w2d)

    images = tf.placeholder("float", [None, 224, 224, 3], name="image")
    word = tf.placeholder(tf.int32, [None, None], name="word_seq")
    image_caption = ImageCaption(images, word, None, config,
                                 is_training=False)

    sv = tf.train.Supervisor(logdir=FLAGS.save_path)
    config_proto = tf.ConfigProto(allow_soft_placement=True)
    with sv.managed_session(config=config_proto) as sess:
        candidats = [Sentence([w2d['<start>']])]
        for _ in range(20):
            seq = [s.words for s in candidats]
            print("run seq:", np.array(seq).shape)
            seq_len = [len(seq[0])] * len(seq)
            feed_dict = {
                # One copy of the image per candidate sequence.
                images: np.array([img] * len(seq)).reshape(-1, 224, 224, 3),
                word: np.array(seq),
                image_caption.lstm.seqlen: np.array(seq_len)
            }
            print(feed_dict[images].shape)
            print(feed_dict[word].shape)
            print(feed_dict[image_caption.lstm.seqlen].shape)
            outputs = sess.run(image_caption.lstm.logits,
                               feed_dict=feed_dict)
            print(np.array(outputs).shape)

            new_list = []
            for s, s_out in zip(candidats, outputs):
                # Scores for the word following the last fed position.
                predict = s_out[-1]
                sort_idx = predict.argsort()[::-1]
                for w in sort_idx:
                    # NOTE(review): np.log is applied directly, so `logits`
                    # is presumably already a probability distribution --
                    # confirm.
                    score = np.log(predict[w])
                    # The original `assert not isnan(score) % predict[w]`
                    # parsed as `not (isnan(score) % predict[w])`; the value
                    # was meant to be the assertion message.
                    assert not math.isnan(score), predict[w]
                    new_list.append(
                        Sentence(s.words + [w], s.score + score))

            new_list = sorted(
                new_list, key=lambda sentence: sentence.score, reverse=True)
            candidats = new_list[0:20]
            for l in candidats:
                print("sort list:", l.words, l.score)
                print([d2w[p] for p in l.words])