def main(unused_args):
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Start model training
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=FLAGS.ses_threads)) as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        assert len(config.buckets) >= 1
        assert config.buckets[-1] == config.max_num_steps
        models = []
        with tf.variable_scope("mRNNmodel", reuse=None,
                               initializer=initializer):
            m = mRNNModel(is_training=True,
                          num_steps=config.buckets[0],
                          config=config,
                          model_name=FLAGS.model_name,
                          flag_with_saver=True,
                          model_root=FLAGS.model_root)
            models.append(m)
        with tf.variable_scope("mRNNmodel", reuse=True):
            for bucket in config.buckets[1:]:
                m = mRNNModel(is_training=True,
                              num_steps=bucket,
                              config=config,
                              model_name=FLAGS.model_name,
                              model_root=FLAGS.model_root)
                models.append(m)

        # Log training progress to a file in the model directory
        hdlr = logging.FileHandler(os.path.join(m.model_dir, 'log.txt'))
        hdlr.setLevel(logging.INFO)
        hdlr.setFormatter(logging.Formatter(formatter_log))
        logger.addHandler(hdlr)

        if FLAGS.pre_trained_model_path:
            models[0].saver.restore(session, FLAGS.pre_trained_model_path)
            logger.info('Continue to train from %s',
                        FLAGS.pre_trained_model_path)
        else:
            tf.initialize_all_variables().run()

        iters_done = 0
        data_provider = mRNNCocoBucketDataProvider(
            FLAGS.anno_files_path.split(':'), FLAGS.vocab_path,
            config.vocab_size, FLAGS.vf_dir, config.vf_size)
        for i in range(config.num_epoch):
            train_cost, iters_done = run_epoch(session, iters_done, config,
                                               models, data_provider,
                                               verbose=True)
            logger.info("Train cost for epoch %d is %.3f" % (i, train_cost))

        # Save the final copy of the model
        models[0].saver.save(
            session,
            os.path.join(m.variable_dir, 'model_%d.ckpt' % iters_done))
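
# `run_epoch` is called above but not defined in this section. The following
# is a minimal sketch of its likely shape, assuming the (ind_buc,) + feed_res
# batch layout produced by mRNNCocoBucketDataProvider.generate_batches below
# and PTB-style `cost`/`train_op` accessors on mRNNModel; it is an assumption,
# not the original implementation.
def run_epoch(session, iters_done, config, models, data_provider,
              verbose=False):
    total_cost = 0.0
    num_batches = 0
    for batch in data_provider.generate_batches(config.batch_size,
                                                config.buckets):
        # Unpack the bucket index and the feeds for that bucket's model
        ind_buc, x, y, vf, fg, sl = batch
        m = models[ind_buc]
        cost, _ = session.run(
            [m.cost, m.train_op],
            {m.input_data: x, m.targets: y, m.visual_features: vf,
             m.valid_flags: fg, m.seq_lens: sl})
        total_cost += cost
        num_batches += 1
        iters_done += 1
        if verbose and iters_done % 100 == 0:
            logger.info('Iteration %d: batch cost %.3f', iters_done, cost)
    return total_cost / max(num_batches, 1), iters_done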
def main(unused_args):
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Evaluate trained models on the validation set
    decoder = mRNNDecoder(config, FLAGS.model_name, FLAGS.vocab_path,
                          gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    for i in xrange(*[int(x) for x in FLAGS.eval_stat.split()]):
        model_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                  'variables', 'model_%d.ckpt' % i)
        while not os.path.exists(model_path):
            logger.warn('Cannot load model file, sleep 1 hour to retry')
            time.sleep(3600)
        decoder.load_model(model_path)

        num_decode = 0
        pred_sentences = []
        for anno_file_path in FLAGS.anno_files_path.split(':'):
            annos = np.load(anno_file_path).tolist()
            for anno in annos:
                feat_path = os.path.join(
                    FLAGS.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                visual_features = np.loadtxt(feat_path)
                sentences = decoder.decode(visual_features, FLAGS.beam_size)
                sentence_coco = {}
                sentence_coco['image_id'] = anno['id']
                sentence_coco['caption'] = ' '.join(sentences[0]['words'])
                pred_sentences.append(sentence_coco)
                num_decode += 1
                if num_decode % 100 == 0:
                    logger.info('%d images are decoded' % num_decode)

        pred_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                 'decode_val_result', 'generated_%d.json' % i)
        result_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                   'decode_val_result', 'result_%d.txt' % i)
        cu.create_dir_if_not_exists(os.path.dirname(pred_path))
        with open(pred_path, 'w') as fout:
            json.dump(pred_sentences, fout)
        cu.coco_val_eval(pred_path, result_path)
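
# For reference: `pred_path` above is written in the JSON format that the COCO
# caption evaluation toolkit (invoked through cu.coco_val_eval) expects, i.e.
# a list of entries of the form
#     {"image_id": <int>, "caption": "<generated sentence>"}
# with one entry per decoded validation image.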
def __init__(self, is_training, config, num_steps, model_name,
             flag_with_saver=False,
             model_root='./cache/models/mscoco',
             flag_reset_state=False):
    # Set up paths and dirs
    self.cu = CommonUtiler()
    self.model_dir = os.path.join(model_root, model_name)
    self.variable_dir = os.path.join(self.model_dir, 'variables')
    self.cu.create_dir_if_not_exists(self.model_dir)
    self.cu.create_dir_if_not_exists(self.variable_dir)

    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps
    rnn_size = config.rnn_size
    emb_size = config.emb_size
    vocab_size = config.vocab_size
    vf_size = config.vf_size

    # Inputs to the model
    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._visual_features = tf.placeholder(tf.float32, [batch_size, vf_size])
    self._valid_flags = tf.placeholder(tf.float32, [batch_size, num_steps])
    self._seq_lens = tf.placeholder(tf.int32, [batch_size])

    # Create rnn cell
    if config.rnn_type == 'GRU':
        rnn_cell_basic = tf.nn.rnn_cell.GRUCell(rnn_size)
    elif config.rnn_type == 'LSTM':
        rnn_cell_basic = tf.nn.rnn_cell.LSTMCell(rnn_size,
                                                 input_size=emb_size,
                                                 use_peepholes=True)
    else:
        raise NameError("Unknown rnn type %s!" % config.rnn_type)
    if is_training and config.keep_prob_rnn < 1:
        rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
            rnn_cell_basic, output_keep_prob=config.keep_prob_rnn)
    cell = tf.nn.rnn_cell.MultiRNNCell(
        [rnn_cell_basic] * config.num_rnn_layers)
    state_size = cell.state_size

    # Create word embeddings
    self._embedding = embedding = tf.get_variable("embedding",
                                                  [vocab_size, emb_size])
    inputs = tf.nn.embedding_lookup(embedding, self._input_data)
    if is_training and config.keep_prob_emb < 1:
        inputs = tf.nn.dropout(inputs, config.keep_prob_emb)

    # Different ways to fuse text and visual information
    if config.multimodal_type == 'mrnn':
        mm_size = config.mm_size
        # Run RNNs
        if flag_reset_state:
            self._initial_state = initial_state = tf.placeholder(
                tf.float32, [batch_size, state_size])
        else:
            self._initial_state = initial_state = cell.zero_state(
                batch_size, tf.float32)
        inputs = [tf.squeeze(input_, [1])
                  for input_ in tf.split(1, num_steps, inputs)]
        outputs_rnn, state = tf.nn.rnn(cell, inputs,
                                       initial_state=initial_state,
                                       sequence_length=self._seq_lens)
        self._final_state = state
        output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])
        # Map RNN output to multimodal space
        w_r2m = tf.get_variable("w_r2m", [rnn_size, mm_size])
        b_r2m = tf.get_variable("b_r2m", [mm_size])
        multimodal_l = tf.nn.relu(tf.matmul(output_rnn, w_r2m) + b_r2m)
        # Map visual feature to multimodal space
        w_vf2m = tf.get_variable("w_vf2m", [vf_size, mm_size])
        b_vf2m = tf.get_variable("b_vf2m", [mm_size])
        mm_vf_single = tf.nn.relu(
            tf.matmul(self._visual_features, w_vf2m) + b_vf2m)
        mm_vf = tf.reshape(tf.tile(mm_vf_single, [1, num_steps]),
                           [-1, mm_size])
        multimodal_l = multimodal_l + mm_vf
        if is_training and config.keep_prob_mm < 1:
            multimodal_l = tf.nn.dropout(multimodal_l, config.keep_prob_mm)
        # Map multimodal space to word space
        w_m2w = tf.get_variable("w_m2w", [mm_size, emb_size])
        b_m2w = tf.get_variable("b_m2w", [emb_size])
        output = tf.nn.relu(tf.matmul(multimodal_l, w_m2w) + b_m2w)
    elif config.multimodal_type == 'init':
        # Map the visual feature to the initial RNN state
        w_vf2state = tf.get_variable("w_vf2state", [vf_size, state_size])
        b_vf2state = tf.get_variable("b_vf2state", [state_size])
        if flag_reset_state:
            self._initial_state = initial_state = tf.placeholder(
                tf.float32, [batch_size, state_size])
        else:
            self._initial_state = initial_state = tf.nn.relu(
                tf.matmul(self._visual_features, w_vf2state) + b_vf2state)
        # Run RNNs
        inputs = [tf.squeeze(input_, [1])
                  for input_ in tf.split(1, num_steps, inputs)]
        outputs_rnn, state = tf.nn.rnn(cell, inputs,
                                       initial_state=initial_state,
                                       sequence_length=self._seq_lens)
        self._final_state = state
        output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])
        # Map RNN output to word space
        w_m2w = tf.get_variable("w_m2w", [rnn_size, emb_size])
        b_m2w = tf.get_variable("b_m2w", [emb_size])
        output = tf.nn.relu(tf.matmul(output_rnn, w_m2w) + b_m2w)
    else:
        raise NameError("Unknown multimodal type %s!" %
                        config.multimodal_type)

    # Build softmax loss; share the weights between embedding and softmax
    # acc. to [2]
    w_loss = tf.transpose(embedding)
    b_loss = tf.get_variable("b_loss", [vocab_size])
    self._logit = logit = tf.matmul(output, w_loss) + b_loss
    target = tf.reshape(math_ops.to_int64(self._targets), [-1])
    valid_flag = tf.reshape(self._valid_flags, [-1])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, target)
    # Average the cross entropy over valid (non-padding) positions only
    self._cost = cost = tf.reduce_sum(loss * valid_flag) / (
        tf.reduce_sum(valid_flag) + 1e-12)

    # Create saver if necessary
    if flag_with_saver:
        self.saver = tf.train.Saver(max_to_keep=None)
    else:
        self.saver = None

    # Return the model if it is just for inference
    if not is_training:
        return

    # Create learning rate and gradient optimizer
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    if hasattr(config, 'optimizer'):
        if config.optimizer == 'ori':
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        elif config.optimizer == 'ada':  # No GPU
            optimizer = tf.train.AdagradOptimizer(self.lr)
        elif config.optimizer == 'adam':
            optimizer = tf.train.AdamOptimizer(self.lr)
        elif config.optimizer == 'rms':
            optimizer = tf.train.RMSPropOptimizer(self.lr)
        else:
            raise NameError("Unknown optimizer type %s!" % config.optimizer)
    else:
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
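
# The constructor above stores tensors on underscore-prefixed attributes,
# while callers read them as `m.logit`, `m.input_data`, etc., and the
# optimizer uses `self.lr`. The original class presumably exposes these
# PTB-style as read-only properties. A sketch of the assumed accessors,
# written here as a standalone mixin (in the original they would sit directly
# on mRNNModel):
class _mRNNModelPropertiesSketch(object):

    @property
    def input_data(self):
        return self._input_data

    @property
    def targets(self):
        return self._targets

    @property
    def visual_features(self):
        return self._visual_features

    @property
    def valid_flags(self):
        return self._valid_flags

    @property
    def seq_lens(self):
        return self._seq_lens

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def final_state(self):
        return self._final_state

    @property
    def logit(self):
        return self._logit

    @property
    def cost(self):
        return self._cost

    @property
    def lr(self):
        return self._lr

    @property
    def train_op(self):
        return self._train_op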
sys.path.append('./py_lib/')
from common_utils import CommonUtiler
from tf_mrnn_decoder import mRNNDecoder
from vision import ImageFeatureExtractor

# In[2]:

# set up paths
mrnn_model_path = './trained_models/coco_caption/mrnn_GRU_570K.ckpt'
mrnn_config_path = './model_conf/mrnn_GRU_conf.py'
mrnn_vocab_path = './trained_models/coco_caption/mscoco_mc3_vocab'
img_model_path = './external/tf_cnn_models/inception_v3.pb'

# initialize feature extractor and sentence decoder
cu = CommonUtiler()
config = cu.load_config(mrnn_config_path)
ife = ImageFeatureExtractor(img_model_path)
decoder = mRNNDecoder(config, 'demo', mrnn_vocab_path)

# In[3]:

demo_image_path = 'demo_image.jpg'
beam_size = 3

# extract visual feature for the image
visual_features = ife.extract_features(demo_image_path, flag_from_file=True)

# generate sentences
decoder.load_model(mrnn_model_path)
sentences = decoder.decode(visual_features, beam_size)
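
# In[4]:

# (assumed continuation of the notebook) `decode` returns the beam candidates
# sorted by score, where score is the negative log-likelihood (lower is
# better) and 'words' holds the decoded sentence:
for (ind_s, sentence) in enumerate(sentences):
    print('%d (%.2f): %s' % (ind_s, sentence['score'],
                             ' '.join(sentence['words'])))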
class mRNNDecoder(object):
    """The sentence decoder (generator) for mRNNModel."""

    def __init__(self, config, model_name, vocab_path,
                 ses_threads=2,
                 gpu_memory_fraction=1.0):
        self.cu = CommonUtiler()
        self.config = copy.deepcopy(config)
        self.config.batch_size = 1
        self.model_path = None
        self.model_name = model_name
        self.flag_load_model = False
        self.vocab_path = vocab_path
        self.vocab, self.rev_vocab = self.cu.load_vocabulary(vocab_path)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        self.session = session = tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=ses_threads,
            gpu_options=gpu_options))

        with tf.variable_scope("mRNNmodel", reuse=None):
            self.model_init = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=True)
        with tf.variable_scope("mRNNmodel", reuse=True):
            self.model_cont = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=False,
                                        flag_reset_state=True)

    def load_model(self, model_path):
        self.model_init.saver.restore(self.session, model_path)
        self.flag_load_model = True
        self.model_path = model_path
        logger.info('Load model from %s', model_path)

    def decode(self, visual_features, beam_size, max_steps=30):
        """Decode an image into sentences with beam search."""
        assert visual_features.shape[0] == self.config.vf_size
        assert self.flag_load_model, 'Must call load_model first'
        vocab = self.vocab
        rev_vocab = self.rev_vocab

        # Initialize beam search variables.
        # Each candidate is represented with a dictionary:
        #   "indexes": a list of word indexes denoting the sentence
        #   "words": words in the decoded sentence without <bos>
        #   "score": negative log-likelihood of the sentence (lower is better)
        #   "state": RNN state after generating the last word of the candidate
        good_sentences = []  # sentences that already ended with <bos>
        cur_best_cand = []  # current best candidates
        highest_score = 0.0  # worst score among the good sentences so far

        # Get the initial logit and state
        logit_init, state_init = self.get_logit_init(visual_features)
        logit_init = np.squeeze(logit_init)
        assert logit_init.shape[0] == self.config.vocab_size and len(
            logit_init.shape) == 1
        logit_init = self.cu.softmax(logit_init)
        logit_init_order = np.argsort(-logit_init)
        for ind_b in xrange(beam_size):
            cand = {}
            cand['indexes'] = [logit_init_order[ind_b]]
            cand['score'] = -np.log(logit_init[logit_init_order[ind_b]])
            cand['state'] = state_init
            cur_best_cand.append(cand)

        # Expand the current best candidates until max_steps or no candidate
        for i in xrange(max_steps):
            # Move candidates that end with <bos> to good_sentences, or prune
            # them if they cannot beat the good sentences found so far
            cand_left = []
            for cand in cur_best_cand:
                if len(good_sentences) > beam_size and \
                        cand['score'] > highest_score:
                    continue  # no need to expand that candidate
                if cand['indexes'][-1] == vocab['<bos>']:
                    good_sentences.append(cand)
                    highest_score = max(highest_score, cand['score'])
                else:
                    cand_left.append(cand)
            cur_best_cand = cand_left
            if not cur_best_cand:
                break
            # Expand the candidates left
            cand_pool = []
            for cand in cur_best_cand:
                logit, state = self.get_logit_cont(cand['state'],
                                                   cand['indexes'][-1],
                                                   visual_features)
                logit = np.squeeze(logit)
                logit = self.cu.softmax(logit)
                logit_order = np.argsort(-logit)
                for ind_b in xrange(beam_size):
                    cand_e = copy.deepcopy(cand)
                    cand_e['indexes'].append(logit_order[ind_b])
                    cand_e['score'] -= np.log(logit[logit_order[ind_b]])
                    cand_e['state'] = state
                    cand_pool.append(cand_e)
            # Keep the best beam_size candidates from the pool
            cur_best_cand = sorted(cand_pool, key=lambda cand: cand['score'])
            cur_best_cand = self.cu.truncate_list(cur_best_cand, beam_size)

        # Add the candidates left in cur_best_cand to the good sentences
        for cand in cur_best_cand:
            if len(good_sentences) > beam_size and \
                    cand['score'] > highest_score:
                continue
            if cand['indexes'][-1] != vocab['<bos>']:
                cand['indexes'].append(vocab['<bos>'])
            good_sentences.append(cand)
            highest_score = max(highest_score, cand['score'])

        # Sort the good sentences and return the final list
        good_sentences = sorted(good_sentences,
                                key=lambda cand: cand['score'])
        good_sentences = self.cu.truncate_list(good_sentences, beam_size)
        for sentence in good_sentences:
            sentence['words'] = self.cu.decode_sentence(
                sentence['indexes'], vocab, rev_vocab)
        return good_sentences

    def get_logit_init(self, visual_features):
        """Use the model to get the initial logit."""
        m = self.model_init
        session = self.session
        vocab = self.vocab
        config = self.config

        x = np.zeros([1, 1], dtype=np.int32)
        vf = np.zeros([1, config.vf_size], dtype=np.float32)
        fg = np.ones([1, 1], dtype=np.float32)
        sl = np.ones([1], dtype=np.int32)
        vf[0, :] = visual_features
        x[0] = vocab['<bos>']

        logit, state = session.run(
            [m.logit, m.final_state],
            {m.input_data: x,
             m.visual_features: vf,
             m.valid_flags: fg,
             m.seq_lens: sl})
        return (logit, state)

    def get_logit_cont(self, state_prev, index_word, visual_features):
        """Use the model to get the continued logit."""
        m = self.model_cont
        session = self.session
        config = self.config

        x = np.zeros([1, 1], dtype=np.int32)
        vf = np.zeros([1, config.vf_size], dtype=np.float32)
        fg = np.ones([1, 1], dtype=np.float32)
        sl = np.ones([1], dtype=np.int32)
        vf[0, :] = visual_features
        x[0] = index_word

        logit, state = session.run(
            [m.logit, m.final_state],
            {m.input_data: x,
             m.visual_features: vf,
             m.valid_flags: fg,
             m.seq_lens: sl,
             m.initial_state: state_prev})
        return (logit, state)
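
# The decoder above leans on a few CommonUtiler helpers that are not shown in
# this section. Minimal sketches of two of them, inferred from their call
# sites and shown as free functions (in the original they are methods of
# CommonUtiler; these are assumptions, not the original implementations):
def softmax(scores):
    """Numerically stable softmax over a 1-D array of logits."""
    e = np.exp(scores - np.max(scores))
    return e / np.sum(e)

def truncate_list(lst, num):
    """Keep at most `num` items of a sorted candidate list."""
    return lst[:num]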
format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s", datefmt='%d %b %H:%M:%S') logger.setLevel(logging.INFO) if __name__ == '__main__': flag_ignore_exists = True # Path model_path = './external/tf_cnn_models/inception_v3.pb' mscoco_root = './datasets/ms_coco' anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy', 'anno_list_mscoco_crVal_m_RNN.npy', 'anno_list_mscoco_test2014.npy'] feat_dir = './cache/mscoco_image_features/inception_v3' # Preparations cu = CommonUtiler() ife = ImageFeatureExtractor(model_path) cu.create_dir_if_not_exists(os.path.join(feat_dir, 'train2014')) cu.create_dir_if_not_exists(os.path.join(feat_dir, 'test2014')) cu.create_dir_if_not_exists(os.path.join(feat_dir, 'val2014')) # Extract features for anno_file_name in anno_file_names: anno_path = os.path.join(mscoco_root, 'mscoco_anno_files', anno_file_name) annos = np.load(anno_path).tolist() for (ind_a, anno) in enumerate(annos): image_path = os.path.join(mscoco_root, 'images', anno['file_path'], anno['file_name']) feat_path = os.path.join(feat_dir, anno['file_path'], anno['file_name'].split('.')[0] + '.txt')
class mRNNCocoBucketDataProvider(object):
    """mRNN TensorFlow Data Provider with Buckets on MS COCO."""

    def __init__(self, anno_files_path, vocab_path, vocab_size,
                 vf_dir, vf_size,
                 flag_shuffle=True):
        self.cu = CommonUtiler()
        self.anno_files_path = anno_files_path
        self.vocab_path = vocab_path
        self.vocab, _ = self.cu.load_vocabulary(vocab_path)
        assert len(self.vocab) == vocab_size
        assert self.vocab['<pad>'] == 0
        self.vf_dir = vf_dir
        self.vf_size = vf_size
        self.flag_shuffle = flag_shuffle
        self._load_data()

    def generate_batches(self, batch_size, buckets):
        """Return a generator of mini-batches of training data."""
        # Create one Batch per bucket
        batches = []
        for max_seq_len in buckets:
            batches.append(
                Batch(batch_size, max_seq_len, self.vf_size,
                      self.vocab['<bos>']))
        # Shuffle if necessary
        if self.flag_shuffle:
            np.random.shuffle(self._data_pointer)
        # Scan the data queue
        for ind_i, ind_s in self._data_pointer:
            sentence = self._data_queue[ind_i]['sentences'][ind_s]
            visual_features = self._data_queue[ind_i]['visual_features']
            # Feed the sample into the smallest bucket that fits it
            if len(sentence) >= buckets[-1]:
                feed_res = batches[-1].feed_and_vomit(visual_features,
                                                      sentence)
                ind_buc = len(buckets) - 1
            else:
                for (ind_b, batch) in enumerate(batches):
                    if len(sentence) < batch.max_seq_len:
                        feed_res = batches[ind_b].feed_and_vomit(
                            visual_features, sentence)
                        ind_buc = ind_b
                        break
            # feed_and_vomit returns the batch feeds once the batch is full
            if feed_res:
                yield (ind_buc,) + feed_res
                batches[ind_buc].empty()

    def _load_data(self, verbose=True):
        logger.info('Loading data')
        vocab = self.vocab
        self._data_queue = []
        self._data_pointer = []
        ind_img = 0
        num_failed = 0
        for anno_file_path in self.anno_files_path:
            annos = np.load(anno_file_path).tolist()
            for (ind_a, anno) in enumerate(annos):
                data = {}
                # Load visual features
                feat_path = os.path.join(
                    self.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                if os.path.exists(feat_path):
                    vf = np.loadtxt(feat_path)
                else:
                    num_failed += 1
                    continue
                data['visual_features'] = vf
                # Encode sentences
                data['sentences'] = []
                for (ind_s, sentence) in enumerate(anno['sentences']):
                    sentence_encode = self.cu.encode_sentence(
                        sentence, vocab, flag_add_bos=False)
                    self._data_pointer.append((ind_img, ind_s))
                    data['sentences'].append(np.array(sentence_encode))
                self._data_queue.append(data)
                ind_img += 1
                if verbose and (ind_a + 1) % 5000 == 0:
                    logger.info('Load %d/%d annotations from file %s',
                                ind_a + 1, len(annos), anno_file_path)
        logger.info(
            'Loaded %d images, %d sentences from %d files; %d images failed',
            len(self._data_queue), len(self._data_pointer),
            len(self.anno_files_path), num_failed)
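
# The `Batch` class used by the provider is not shown in this section. Below
# is a sketch of the interface its call sites imply: feed_and_vomit returns
# None until the batch is full, then the feed arrays; <bos> doubles as the
# sentence boundary token. This is an assumption, not the original code.
class Batch(object):
    def __init__(self, batch_size, max_seq_len, vf_size, bos_ind):
        self.batch_size = batch_size
        self.max_seq_len = max_seq_len
        self.vf_size = vf_size
        self.bos_ind = bos_ind
        self.empty()

    def empty(self):
        # Reset all accumulated samples
        self.x = np.zeros([self.batch_size, self.max_seq_len], dtype=np.int32)
        self.y = np.zeros([self.batch_size, self.max_seq_len], dtype=np.int32)
        self.vf = np.zeros([self.batch_size, self.vf_size], dtype=np.float32)
        self.fg = np.zeros([self.batch_size, self.max_seq_len],
                           dtype=np.float32)
        self.sl = np.zeros([self.batch_size], dtype=np.int32)
        self.num_feed = 0

    def feed_and_vomit(self, visual_features, sentence):
        # Input starts with <bos>; the target is the sentence shifted by one
        # position and terminated with <bos>
        i = self.num_feed
        n = min(len(sentence), self.max_seq_len - 1)
        self.x[i, 0] = self.bos_ind
        self.x[i, 1:n + 1] = sentence[:n]
        self.y[i, :n] = sentence[:n]
        self.y[i, n] = self.bos_ind
        self.fg[i, :n + 1] = 1.0
        self.sl[i] = n + 1
        self.vf[i, :] = visual_features
        self.num_feed += 1
        if self.num_feed == self.batch_size:
            return (self.x, self.y, self.vf, self.fg, self.sl)
        return None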
import logging
import os
import sys

import numpy as np

sys.path.append('./py_lib/')
from common_utils import CommonUtiler

logger = logging.getLogger('ExpMscoco')
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
    # Hyperparameters
    min_count = 3

    # Paths
    vocab_path = './cache/dctionary/mscoco_mc%d_vocab' % min_count
    mscoco_root = './datasets/ms_coco'
    anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy']

    # Preparations
    cu = CommonUtiler()
    cu.create_dir_if_not_exists(os.path.dirname(vocab_path))

    # Scan the anno files and count word frequencies
    vocab = {}
    for anno_file_name in anno_file_names:
        anno_path = os.path.join(mscoco_root, 'mscoco_anno_files',
                                 anno_file_name)
        annos = np.load(anno_path).tolist()
        for anno in annos:
            for sentence in anno['sentences']:
                for word in sentence:
                    word = word.strip().lower()
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
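
    # (assumed continuation) The original script presumably filters the counts
    # by `min_count` and writes one word per line, with the special tokens
    # first so that <pad> gets index 0, as asserted by the data provider; the
    # exact order of <unk> and <bos> is a guess.
    words = ['<pad>', '<unk>', '<bos>']
    words.extend(sorted([w for (w, c) in vocab.items() if c >= min_count]))
    with open(vocab_path, 'w') as fout:
        fout.write('\n'.join(words))
    logger.info('Vocabulary with %d words saved to %s', len(words),
                vocab_path)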