def main(unused_args):
    """Decode and evaluate the validation set for a range of checkpoints.

    For every checkpoint index ``i`` in ``xrange(*FLAGS.eval_stat.split())``
    this waits until ``<model_root>/<model_name>/variables/model_<i>.ckpt``
    exists, loads it into the decoder, decodes one caption per annotation
    (the first candidate returned by ``decoder.decode``), writes the captions
    to ``decode_val_result/generated_<i>.json`` and hands that file to
    ``CommonUtiler.coco_val_eval`` together with the path
    ``decode_val_result/result_<i>.txt``.

    Args:
        unused_args: Not used by this function.
    """
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Evaluate trained models on val
    decoder = mRNNDecoder(config, FLAGS.model_name, FLAGS.vocab_path,
                          gpu_memory_fraction=FLAGS.gpu_memory_fraction)

    # Values that do not depend on the checkpoint index are computed once.
    model_dir = os.path.join(FLAGS.model_root, FLAGS.model_name)
    result_dir = os.path.join(model_dir, 'decode_val_result')
    anno_file_paths = FLAGS.anno_files_path.split(':')

    for i in xrange(*[int(x) for x in FLAGS.eval_stat.split()]):
        model_path = os.path.join(model_dir, 'variables', 'model_%d.ckpt' % i)
        # The checkpoint may not have been written yet; poll until the file
        # shows up (this blocks indefinitely if it never does).
        while not os.path.exists(model_path):
            logger.warning('Cannot load model file, sleep 1 hour to retry')
            time.sleep(3600)
        decoder.load_model(model_path)

        num_decode = 0
        pred_sentences = []
        for anno_file_path in anno_file_paths:
            annos = np.load(anno_file_path).tolist()
            for anno in annos:
                # The feature file shares the image's base name (text before
                # the first '.') and carries a '.txt' extension.
                feat_path = os.path.join(
                    FLAGS.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                visual_features = np.loadtxt(feat_path)
                sentences = decoder.decode(visual_features, FLAGS.beam_size)

                # Only the first returned candidate is kept for evaluation.
                pred_sentences.append({
                    'image_id': anno['id'],
                    'caption': ' '.join(sentences[0]['words']),
                })
                num_decode += 1

                if num_decode % 100 == 0:
                    logger.info('%d images are decoded', num_decode)

        pred_path = os.path.join(result_dir, 'generated_%d.json' % i)
        result_path = os.path.join(result_dir, 'result_%d.txt' % i)
        cu.create_dir_if_not_exists(os.path.dirname(pred_path))
        with open(pred_path, 'w') as fout:
            json.dump(pred_sentences, fout)
        cu.coco_val_eval(pred_path, result_path)
def main(unused_args):
    """Decode and evaluate the validation set for a range of checkpoints.

    For every checkpoint index ``i`` in ``xrange(*FLAGS.eval_stat.split())``
    this waits until ``<model_root>/<model_name>/variables/model_<i>.ckpt``
    exists, loads it into the decoder, decodes one caption per annotation
    (the first candidate returned by ``decoder.decode``), writes the captions
    to ``decode_val_result/generated_<i>.json`` and hands that file to
    ``CommonUtiler.coco_val_eval`` together with the path
    ``decode_val_result/result_<i>.txt``.

    Args:
        unused_args: Not used by this function.
    """
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Evaluate trained models on val
    decoder = mRNNDecoder(config, FLAGS.model_name, FLAGS.vocab_path,
                          gpu_memory_fraction=FLAGS.gpu_memory_fraction)

    # Values that do not depend on the checkpoint index are computed once.
    model_dir = os.path.join(FLAGS.model_root, FLAGS.model_name)
    result_dir = os.path.join(model_dir, 'decode_val_result')
    anno_file_paths = FLAGS.anno_files_path.split(':')

    for i in xrange(*[int(x) for x in FLAGS.eval_stat.split()]):
        model_path = os.path.join(model_dir, 'variables', 'model_%d.ckpt' % i)
        # The checkpoint may not have been written yet; poll until the file
        # shows up (this blocks indefinitely if it never does).
        while not os.path.exists(model_path):
            logger.warning('Cannot load model file, sleep 1 hour to retry')
            time.sleep(3600)
        decoder.load_model(model_path)

        num_decode = 0
        pred_sentences = []
        for anno_file_path in anno_file_paths:
            annos = np.load(anno_file_path).tolist()
            for anno in annos:
                # The feature file shares the image's base name (text before
                # the first '.') and carries a '.txt' extension.
                feat_path = os.path.join(
                    FLAGS.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                visual_features = np.loadtxt(feat_path)
                sentences = decoder.decode(visual_features, FLAGS.beam_size)

                # Only the first returned candidate is kept for evaluation.
                pred_sentences.append({
                    'image_id': anno['id'],
                    'caption': ' '.join(sentences[0]['words']),
                })
                num_decode += 1

                if num_decode % 100 == 0:
                    logger.info('%d images are decoded', num_decode)

        pred_path = os.path.join(result_dir, 'generated_%d.json' % i)
        result_path = os.path.join(result_dir, 'result_%d.txt' % i)
        cu.create_dir_if_not_exists(os.path.dirname(pred_path))
        with open(pred_path, 'w') as fout:
            json.dump(pred_sentences, fout)
        cu.coco_val_eval(pred_path, result_path)
class mRNNModel(object):
    """The mRNN model with a shared weights strategy in [1, 2]."""

    def __init__(self,
                 is_training,
                 config,
                 num_steps,
                 model_name,
                 flag_with_saver=False,
                 model_root='./cache/models/mscoco',
                 flag_reset_state=False):
        """Build the model graph in the current default TensorFlow graph.

        Args:
            is_training: When true, dropout is applied wherever the matching
                ``config.keep_prob_*`` is below 1 and the learning-rate
                variable, optimizer and train op are created. When false the
                constructor returns right after the loss (and optional saver)
                is built.
            config: Object providing ``batch_size``, ``rnn_size``,
                ``emb_size``, ``vocab_size``, ``vf_size``, ``rnn_type``
                ('GRU' or 'LSTM'), ``num_rnn_layers``, ``multimodal_type``
                ('mrnn' or 'init'), ``keep_prob_rnn``, ``keep_prob_emb``,
                ``max_grad_norm``; ``mm_size`` and ``keep_prob_mm`` for the
                'mrnn' type; and optionally ``optimizer`` ('ori', 'ada',
                'adam' or 'rms').
            num_steps: Number of time steps the RNN is unrolled for.
            model_name: Name of the sub-directory of ``model_root`` used for
                this model.
            flag_with_saver: When true, a ``tf.train.Saver`` that keeps all
                checkpoints is stored in ``self.saver``; otherwise
                ``self.saver`` is ``None``.
            model_root: Parent directory of the model directory. The model
                directory and its ``variables`` sub-directory are created if
                missing.
            flag_reset_state: When true, the initial RNN state is a
                placeholder the caller must feed instead of being computed
                inside the graph.

        Raises:
            NameError: If ``config.rnn_type``, ``config.multimodal_type`` or
                ``config.optimizer`` holds an unsupported value.
        """
        # Set up paths and dirs
        self.cu = CommonUtiler()
        self.model_dir = os.path.join(model_root, model_name)
        self.variable_dir = os.path.join(self.model_dir, 'variables')
        self.cu.create_dir_if_not_exists(self.model_dir)
        self.cu.create_dir_if_not_exists(self.variable_dir)

        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps
        rnn_size = config.rnn_size
        emb_size = config.emb_size
        vocab_size = config.vocab_size
        vf_size = config.vf_size

        # Inputs to the model
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._visual_features = tf.placeholder(tf.float32,
                                               [batch_size, vf_size])
        self._valid_flags = tf.placeholder(tf.float32,
                                           [batch_size, num_steps])
        self._seq_lens = tf.placeholder(tf.int32, [batch_size])

        # Create rnn cell
        if config.rnn_type == 'GRU':
            rnn_cell_basic = tf.nn.rnn_cell.GRUCell(rnn_size)
        elif config.rnn_type == 'LSTM':
            rnn_cell_basic = tf.nn.rnn_cell.LSTMCell(
                rnn_size, input_size=emb_size, use_peepholes=True)
        else:
            raise NameError("Unknown rnn type %s!" % config.rnn_type)
        if is_training and config.keep_prob_rnn < 1:
            rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
                rnn_cell_basic, output_keep_prob=config.keep_prob_rnn)
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [rnn_cell_basic] * config.num_rnn_layers)
        state_size = cell.state_size

        # Create word embeddings
        self._embedding = embedding = tf.get_variable(
            "embedding", [vocab_size, emb_size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob_emb < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob_emb)

        # Different ways to fuse text and visual information
        if config.multimodal_type == 'mrnn':
            mm_size = config.mm_size
            # Run RNNs
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = cell.zero_state(
                    batch_size, tf.float32)
            # Split the embedded batch into one tensor per time step.
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell, inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            # Concatenate the per-step outputs and flatten them to rows of
            # rnn_size values.
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map RNN output to multimodal space
            w_r2m = tf.get_variable("w_r2m", [rnn_size, mm_size])
            b_r2m = tf.get_variable("b_r2m", [mm_size])
            multimodal_l = tf.nn.relu(tf.matmul(output_rnn, w_r2m) + b_r2m)

            # Map Visual feature to multimodal space
            w_vf2m = tf.get_variable("w_vf2m", [vf_size, mm_size])
            b_vf2m = tf.get_variable("b_vf2m", [mm_size])
            mm_vf_single = tf.nn.relu(
                tf.matmul(self._visual_features, w_vf2m) + b_vf2m)
            # Repeat each image's projection num_steps times so its rows line
            # up with the flattened RNN outputs before the two are summed.
            mm_vf = tf.reshape(tf.tile(mm_vf_single, [1, num_steps]),
                               [-1, mm_size])
            multimodal_l = multimodal_l + mm_vf
            if is_training and config.keep_prob_mm < 1:
                multimodal_l = tf.nn.dropout(multimodal_l,
                                             config.keep_prob_mm)

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [mm_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(multimodal_l, w_m2w) + b_m2w)

        elif config.multimodal_type == 'init':
            # Mapping visual feature to the RNN state
            w_vf2state = tf.get_variable("w_vf2state", [vf_size, state_size])
            b_vf2state = tf.get_variable("b_vf2state", [state_size])
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = tf.nn.relu(
                    tf.matmul(self._visual_features, w_vf2state) + b_vf2state)

            # Run RNNs
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell, inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [rnn_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(output_rnn, w_m2w) + b_m2w)

        else:
            raise NameError("Unknown multimodal type %s!" %
                            config.multimodal_type)

        # Build the softmax cross-entropy loss (full softmax, not sampled).
        # Share the weights between embedding and softmax acc. to [2]
        w_loss = tf.transpose(embedding)
        b_loss = tf.get_variable("b_loss", [vocab_size])
        self._logit = logit = tf.matmul(output, w_loss) + b_loss

        target = tf.reshape(math_ops.to_int64(self._targets), [-1])
        valid_flag = tf.reshape(self._valid_flags, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, target)
        # Average the loss over the positions flagged as valid; the epsilon
        # keeps the division defined when no position is valid.
        self._cost = cost = tf.reduce_sum(
            loss * valid_flag) / (tf.reduce_sum(valid_flag) + 1e-12)

        # Create saver if necessary
        if flag_with_saver:
            self.saver = tf.train.Saver(max_to_keep=None)
        else:
            self.saver = None

        # Return the model if it is just for inference
        if not is_training:
            return

        # Create learning rate and gradients optimizer
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        if hasattr(config, 'optimizer'):
            if config.optimizer == 'ori':
                optimizer = tf.train.GradientDescentOptimizer(self.lr)
            elif config.optimizer == 'ada':  # No GPU
                optimizer = tf.train.AdagradOptimizer(self.lr)
            elif config.optimizer == 'adam':
                optimizer = tf.train.AdamOptimizer(self.lr)
            elif config.optimizer == 'rms':
                optimizer = tf.train.RMSPropOptimizer(self.lr)
            else:
                raise NameError("Unknown optimizer type %s!" %
                                config.optimizer)
        else:
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Build the learning-rate update once, so that assign_lr() does not
        # add a new assign op to the graph every time it is called.
        self._new_lr = tf.placeholder(tf.float32, shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value`` using ``session``.

        Only available on models built with ``is_training=True``; other
        models have no learning-rate variable and raise ``AttributeError``.
        """
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input_data(self):
        return self._input_data

    @property
    def targets(self):
        return self._targets

    @property
    def valid_flags(self):
        return self._valid_flags

    @property
    def visual_features(self):
        return self._visual_features

    @property
    def seq_lens(self):
        return self._seq_lens

    @property
    def cost(self):
        return self._cost

    @property
    def final_state(self):
        return self._final_state

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def lr(self):
        return self._lr

    @property
    def train_op(self):
        return self._train_op

    @property
    def embedding(self):
        return self._embedding

    @property
    def logit(self):
        return self._logit
logger = logging.getLogger('ExpMscoco')
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
    # Hyperparameters
    min_count = 3
    vocab_path = './cache/dctionary/mscoco_mc%d_vocab' % min_count
    mscoco_root = './datasets/ms_coco'
    anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy']

    # Make sure the directory that will hold the vocabulary exists.
    cu = CommonUtiler()
    cu.create_dir_if_not_exists(os.path.dirname(vocab_path))

    # Count how often every normalized (stripped, lower-cased) word occurs
    # across all sentences of all annotation files.
    vocab = {}
    for anno_file_name in anno_file_names:
        anno_path = os.path.join(mscoco_root, 'mscoco_anno_files',
                                 anno_file_name)
        annos = np.load(anno_path).tolist()
        for anno in annos:
            for sentence in anno['sentences']:
                for word in sentence:
                    word = word.strip().lower()
                    vocab[word] = vocab.get(word, 0) + 1
class mRNNModel(object):
    """The mRNN model with a shared weights strategy in [1, 2]."""

    def __init__(self,
                 is_training,
                 config,
                 num_steps,
                 model_name,
                 flag_with_saver=False,
                 model_root='./cache/models/mscoco',
                 flag_reset_state=False):
        """Build the model graph in the current default TensorFlow graph.

        Args:
            is_training: When true, dropout is applied wherever the matching
                ``config.keep_prob_*`` is below 1 and the learning-rate
                variable, optimizer and train op are created. When false the
                constructor returns right after the loss (and optional saver)
                is built.
            config: Object providing ``batch_size``, ``rnn_size``,
                ``emb_size``, ``vocab_size``, ``vf_size``, ``rnn_type``
                ('GRU' or 'LSTM'), ``num_rnn_layers``, ``multimodal_type``
                ('mrnn' or 'init'), ``keep_prob_rnn``, ``keep_prob_emb``,
                ``max_grad_norm``; ``mm_size`` and ``keep_prob_mm`` for the
                'mrnn' type; and optionally ``optimizer`` ('ori', 'ada',
                'adam' or 'rms').
            num_steps: Number of time steps the RNN is unrolled for.
            model_name: Name of the sub-directory of ``model_root`` used for
                this model.
            flag_with_saver: When true, a ``tf.train.Saver`` that keeps all
                checkpoints is stored in ``self.saver``; otherwise
                ``self.saver`` is ``None``.
            model_root: Parent directory of the model directory. The model
                directory and its ``variables`` sub-directory are created if
                missing.
            flag_reset_state: When true, the initial RNN state is a
                placeholder the caller must feed instead of being computed
                inside the graph.

        Raises:
            NameError: If ``config.rnn_type``, ``config.multimodal_type`` or
                ``config.optimizer`` holds an unsupported value.
        """
        # Set up paths and dirs
        self.cu = CommonUtiler()
        self.model_dir = os.path.join(model_root, model_name)
        self.variable_dir = os.path.join(self.model_dir, 'variables')
        self.cu.create_dir_if_not_exists(self.model_dir)
        self.cu.create_dir_if_not_exists(self.variable_dir)

        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps
        rnn_size = config.rnn_size
        emb_size = config.emb_size
        vocab_size = config.vocab_size
        vf_size = config.vf_size

        # Inputs to the model
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._visual_features = tf.placeholder(tf.float32,
                                               [batch_size, vf_size])
        self._valid_flags = tf.placeholder(tf.float32,
                                           [batch_size, num_steps])
        self._seq_lens = tf.placeholder(tf.int32, [batch_size])

        # Create rnn cell
        if config.rnn_type == 'GRU':
            rnn_cell_basic = tf.nn.rnn_cell.GRUCell(rnn_size)
        elif config.rnn_type == 'LSTM':
            rnn_cell_basic = tf.nn.rnn_cell.LSTMCell(
                rnn_size, input_size=emb_size, use_peepholes=True)
        else:
            raise NameError("Unknown rnn type %s!" % config.rnn_type)
        if is_training and config.keep_prob_rnn < 1:
            rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
                rnn_cell_basic, output_keep_prob=config.keep_prob_rnn)
        cell = tf.nn.rnn_cell.MultiRNNCell(
            [rnn_cell_basic] * config.num_rnn_layers)
        state_size = cell.state_size

        # Create word embeddings
        self._embedding = embedding = tf.get_variable(
            "embedding", [vocab_size, emb_size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob_emb < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob_emb)

        # Different ways to fuse text and visual information
        if config.multimodal_type == 'mrnn':
            mm_size = config.mm_size
            # Run RNNs
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = cell.zero_state(
                    batch_size, tf.float32)
            # Split the embedded batch into one tensor per time step.
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell, inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            # Concatenate the per-step outputs and flatten them to rows of
            # rnn_size values.
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map RNN output to multimodal space
            w_r2m = tf.get_variable("w_r2m", [rnn_size, mm_size])
            b_r2m = tf.get_variable("b_r2m", [mm_size])
            multimodal_l = tf.nn.relu(tf.matmul(output_rnn, w_r2m) + b_r2m)

            # Map Visual feature to multimodal space
            w_vf2m = tf.get_variable("w_vf2m", [vf_size, mm_size])
            b_vf2m = tf.get_variable("b_vf2m", [mm_size])
            mm_vf_single = tf.nn.relu(
                tf.matmul(self._visual_features, w_vf2m) + b_vf2m)
            # Repeat each image's projection num_steps times so its rows line
            # up with the flattened RNN outputs before the two are summed.
            mm_vf = tf.reshape(tf.tile(mm_vf_single, [1, num_steps]),
                               [-1, mm_size])
            multimodal_l = multimodal_l + mm_vf
            if is_training and config.keep_prob_mm < 1:
                multimodal_l = tf.nn.dropout(multimodal_l,
                                             config.keep_prob_mm)

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [mm_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(multimodal_l, w_m2w) + b_m2w)

        elif config.multimodal_type == 'init':
            # Mapping visual feature to the RNN state
            w_vf2state = tf.get_variable("w_vf2state", [vf_size, state_size])
            b_vf2state = tf.get_variable("b_vf2state", [state_size])
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = tf.nn.relu(
                    tf.matmul(self._visual_features, w_vf2state) + b_vf2state)

            # Run RNNs
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell, inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [rnn_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(output_rnn, w_m2w) + b_m2w)

        else:
            raise NameError("Unknown multimodal type %s!" %
                            config.multimodal_type)

        # Build the softmax cross-entropy loss (full softmax, not sampled).
        # Share the weights between embedding and softmax acc. to [2]
        w_loss = tf.transpose(embedding)
        b_loss = tf.get_variable("b_loss", [vocab_size])
        self._logit = logit = tf.matmul(output, w_loss) + b_loss

        target = tf.reshape(math_ops.to_int64(self._targets), [-1])
        valid_flag = tf.reshape(self._valid_flags, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, target)
        # Average the loss over the positions flagged as valid; the epsilon
        # keeps the division defined when no position is valid.
        self._cost = cost = tf.reduce_sum(
            loss * valid_flag) / (tf.reduce_sum(valid_flag) + 1e-12)

        # Create saver if necessary
        if flag_with_saver:
            self.saver = tf.train.Saver(max_to_keep=None)
        else:
            self.saver = None

        # Return the model if it is just for inference
        if not is_training:
            return

        # Create learning rate and gradients optimizer
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        if hasattr(config, 'optimizer'):
            if config.optimizer == 'ori':
                optimizer = tf.train.GradientDescentOptimizer(self.lr)
            elif config.optimizer == 'ada':  # No GPU
                optimizer = tf.train.AdagradOptimizer(self.lr)
            elif config.optimizer == 'adam':
                optimizer = tf.train.AdamOptimizer(self.lr)
            elif config.optimizer == 'rms':
                optimizer = tf.train.RMSPropOptimizer(self.lr)
            else:
                raise NameError("Unknown optimizer type %s!" %
                                config.optimizer)
        else:
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

        # Build the learning-rate update once, so that assign_lr() does not
        # add a new assign op to the graph every time it is called.
        self._new_lr = tf.placeholder(tf.float32, shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def assign_lr(self, session, lr_value):
        """Set the learning-rate variable to ``lr_value`` using ``session``.

        Only available on models built with ``is_training=True``; other
        models have no learning-rate variable and raise ``AttributeError``.
        """
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input_data(self):
        return self._input_data

    @property
    def targets(self):
        return self._targets

    @property
    def valid_flags(self):
        return self._valid_flags

    @property
    def visual_features(self):
        return self._visual_features

    @property
    def seq_lens(self):
        return self._seq_lens

    @property
    def cost(self):
        return self._cost

    @property
    def final_state(self):
        return self._final_state

    @property
    def initial_state(self):
        return self._initial_state

    @property
    def lr(self):
        return self._lr

    @property
    def train_op(self):
        return self._train_op

    @property
    def embedding(self):
        return self._embedding

    @property
    def logit(self):
        return self._logit
logger.setLevel(logging.INFO) if __name__ == '__main__': flag_ignore_exists = True # Path model_path = './external/tf_cnn_models/inception_v3.pb' mscoco_root = './datasets/ms_coco' anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy', 'anno_list_mscoco_crVal_m_RNN.npy', 'anno_list_mscoco_test2014.npy'] feat_dir = './cache/mscoco_image_features/inception_v3' # Preparations cu = CommonUtiler() ife = ImageFeatureExtractor(model_path) cu.create_dir_if_not_exists(os.path.join(feat_dir, 'train2014')) cu.create_dir_if_not_exists(os.path.join(feat_dir, 'test2014')) cu.create_dir_if_not_exists(os.path.join(feat_dir, 'val2014')) # Extract features for anno_file_name in anno_file_names: anno_path = os.path.join(mscoco_root, 'mscoco_anno_files', anno_file_name) annos = np.load(anno_path).tolist() for (ind_a, anno) in enumerate(annos): image_path = os.path.join(mscoco_root, 'images', anno['file_path'], anno['file_name']) feat_path = os.path.join(feat_dir, anno['file_path'], anno['file_name'].split('.')[0] + '.txt') if flag_ignore_exists and os.path.exists(feat_path): logger.info('%d/%d exists for %s', ind_a+1, len(annos), anno_file_name)