Example #1
    def __init__(self,
                 config,
                 model_name,
                 vocab_path,
                 ses_threads=2,
                 gpu_memory_fraction=1.0):
        self.cu = CommonUtiler()
        self.config = copy.deepcopy(config)
        self.config.batch_size = 1
        self.model_path = None
        self.model_name = model_name
        self.flag_load_model = False
        self.vocab_path = vocab_path
        self.vocab, self.rev_vocab = self.cu.load_vocabulary(vocab_path)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        self.session = session = tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=ses_threads, gpu_options=gpu_options))

        with tf.variable_scope("mRNNmodel", reuse=None):
            self.model_init = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=True)

        with tf.variable_scope("mRNNmodel", reuse=True):
            self.model_cont = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=False,
                                        flag_reset_state=True)
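The snippet above builds two graphs inside the same "mRNNmodel" variable scope, first with reuse=None and then with reuse=True. A minimal sketch (not from the repo) of the sharing mechanism this relies on: with reuse=True, tf.get_variable returns the variable already created under the same name, so model_init and model_cont end up reading identical weights.

import tensorflow as tf

# The first scope creates the variable; the reused scope returns the same object.
with tf.variable_scope("demo", reuse=None):
    w1 = tf.get_variable("w", shape=[2, 2])
with tf.variable_scope("demo", reuse=True):
    w2 = tf.get_variable("w", shape=[2, 2])
assert w1 is w2  # both names resolve to one shared variable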
Example #2
def main(unused_args):
  # Load model configuration
  cu = CommonUtiler()
  config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
  config = cu.load_config(config_path)

  # Start model training
  with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
      intra_op_parallelism_threads=FLAGS.ses_threads)) as session:
    initializer = tf.random_uniform_initializer(-config.init_scale,
                                                config.init_scale)
    assert len(config.buckets) >= 1
    assert config.buckets[-1] == config.max_num_steps
    models = []
    with tf.variable_scope("mRNNmodel", reuse=None, initializer=initializer):
      m = mRNNModel(is_training=True,
          num_steps=config.buckets[0], 
          config=config,
          model_name=FLAGS.model_name,
          flag_with_saver=True,
          model_root=FLAGS.model_root)
      models.append(m)
      
    with tf.variable_scope("mRNNmodel", reuse=True):
      for bucket in config.buckets[1:]:
        m = mRNNModel(is_training=True, 
            num_steps=bucket, 
            config=config,
            model_name=FLAGS.model_name,
            model_root=FLAGS.model_root)
        models.append(m)
        
    hdlr = logging.FileHandler(os.path.join(m.model_dir, 'log.txt'))
    hdlr.setLevel(logging.INFO)
    hdlr.setFormatter(logging.Formatter(formatter_log))
    logger.addHandler(hdlr)
    
    if FLAGS.pre_trained_model_path:
      models[0].saver.restore(session, FLAGS.pre_trained_model_path)
      logger.info('Continue to train from %s', FLAGS.pre_trained_model_path)
    else:
      tf.initialize_all_variables().run()

    iters_done = 0
    data_provider = mRNNCocoBucketDataProvider(FLAGS.anno_files_path.split(':'),
        FLAGS.vocab_path, config.vocab_size, FLAGS.vf_dir, config.vf_size)
    for i in range(config.num_epoch):
      train_cost, iters_done = run_epoch(session, iters_done, config, models, 
          data_provider, verbose=True)
      logger.info("Train cost for epoch %d is %.3f" % (i, train_cost))
    
    # Save final copy of the model
    models[0].saver.save(session, os.path.join(m.variable_dir, 
        'model_%d.ckpt' % iters_done))
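All the bucket models above share one set of weights; training later routes each sentence to the smallest bucket it fits into. A hypothetical helper (pick_bucket is not part of the repo) that mirrors the routing rule used by the data provider:

def pick_bucket(buckets, sentence_len):
    # Smallest bucket whose max length is still above the sentence length;
    # sentences at least as long as the largest bucket go to the last one.
    for (ind, max_len) in enumerate(buckets):
        if sentence_len < max_len:
            return ind
    return len(buckets) - 1

assert pick_bucket([10, 20, 30], 7) == 0
assert pick_bucket([10, 20, 30], 35) == 2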
Example #3
 def __init__(self, config, model_name, vocab_path,
              ses_threads=2,
              gpu_memory_fraction=1.0):
   self.cu = CommonUtiler()
   self.config = copy.deepcopy(config)
   self.config.batch_size = 1
   self.model_path = None
   self.model_name = model_name
   self.flag_load_model = False
   self.vocab_path = vocab_path
   self.vocab, self.rev_vocab = self.cu.load_vocabulary(vocab_path)
   
   gpu_options = tf.GPUOptions(
       per_process_gpu_memory_fraction=gpu_memory_fraction)
   self.session = session = tf.Session(config=tf.ConfigProto(
       intra_op_parallelism_threads=ses_threads, 
       gpu_options=gpu_options))
   
   with tf.variable_scope("mRNNmodel", reuse=None):
     self.model_init = mRNNModel(
         is_training=False,
         num_steps=1, 
         config=self.config,
         model_name=self.model_name,
         flag_with_saver=True)
   
   with tf.variable_scope("mRNNmodel", reuse=True):
     self.model_cont = mRNNModel(
         is_training=False,
         num_steps=1, 
         config=self.config,
         model_name=self.model_name,
         flag_with_saver=False,
         flag_reset_state=True)
Example #4
 def __init__(self,
              anno_files_path,
              vocab_path,
              vocab_size,
              vf_dir,
              vf_size,
              flag_shuffle=True):
     self.cu = CommonUtiler()
     self.anno_files_path = anno_files_path
     self.vocab_path = vocab_path
     self.vocab, _ = self.cu.load_vocabulary(vocab_path)
     assert len(self.vocab) == vocab_size
     assert self.vocab['<pad>'] == 0
     self.vf_dir = vf_dir
     self.vf_size = vf_size
     self.flag_shuffle = flag_shuffle
     self._load_data()
Example #5
 def __init__(self, anno_files_path, vocab_path, vocab_size, vf_dir, vf_size,
     flag_shuffle=True):
   self.cu = CommonUtiler()
   self.anno_files_path = anno_files_path
   self.vocab_path = vocab_path
   self.vocab, _ = self.cu.load_vocabulary(vocab_path)
   assert len(self.vocab) == vocab_size
   assert self.vocab['<pad>'] == 0
   self.vf_dir = vf_dir
   self.vf_size = vf_size
   self.flag_shuffle = flag_shuffle
   self._load_data()
Example #6
def main(unused_args):
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Evaluate trained models on val
    decoder = mRNNDecoder(config,
                          FLAGS.model_name,
                          FLAGS.vocab_path,
                          gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    for i in xrange(*[int(x) for x in FLAGS.eval_stat.split()]):
        model_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                  'variables', 'model_%d.ckpt' % i)
        while not os.path.exists(model_path):
            logger.warn('Cannot load model file, sleeping 1 hour before retry')
            time.sleep(3600)

        decoder.load_model(model_path)

        num_decode = 0
        pred_sentences = []
        for anno_file_path in FLAGS.anno_files_path.split(':'):
            annos = np.load(anno_file_path).tolist()
            for anno in annos:
                feat_path = os.path.join(
                    FLAGS.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                visual_features = np.loadtxt(feat_path)
                sentences = decoder.decode(visual_features, FLAGS.beam_size)

                sentence_coco = {}
                sentence_coco['image_id'] = anno['id']
                sentence_coco['caption'] = ' '.join(sentences[0]['words'])
                pred_sentences.append(sentence_coco)
                num_decode += 1

                if num_decode % 100 == 0:
                    logger.info('%d images are decoded' % num_decode)

        pred_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                 'decode_val_result', 'generated_%d.json' % i)
        result_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                   'decode_val_result', 'result_%d.txt' % i)
        cu.create_dir_if_not_exists(os.path.dirname(pred_path))
        with open(pred_path, 'w') as fout:
            json.dump(pred_sentences, fout)
        cu.coco_val_eval(pred_path, result_path)
Example #7
def main(unused_args):
  # Load model configuration
  cu = CommonUtiler()
  config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
  config = cu.load_config(config_path)
      
  # Evaluate trained models on val
  decoder = mRNNDecoder(config, FLAGS.model_name, FLAGS.vocab_path,
      gpu_memory_fraction=FLAGS.gpu_memory_fraction)
  for i in xrange(*[int(x) for x in FLAGS.eval_stat.split()]):
    model_path = os.path.join(FLAGS.model_root, FLAGS.model_name, 
        'variables', 'model_%d.ckpt' % i)
    while not os.path.exists(model_path):
      logger.warn('Cannot load model file, sleeping 1 hour before retry')
      time.sleep(3600)
    
    decoder.load_model(model_path)
    
    num_decode = 0
    pred_sentences = []
    for anno_file_path in FLAGS.anno_files_path.split(':'):
      annos = np.load(anno_file_path).tolist()
      for anno in annos:
        feat_path = os.path.join(FLAGS.vf_dir, anno['file_path'],
            anno['file_name'].split('.')[0] + '.txt')
        visual_features = np.loadtxt(feat_path)
        sentences = decoder.decode(visual_features, FLAGS.beam_size)
        
        sentence_coco = {}
        sentence_coco['image_id'] = anno['id']
        sentence_coco['caption'] = ' '.join(sentences[0]['words'])
        pred_sentences.append(sentence_coco)
        num_decode += 1
        
        if num_decode % 100 == 0:
          logger.info('%d images are decoded' % num_decode)
          
    pred_path = os.path.join(FLAGS.model_root, FLAGS.model_name, 
        'decode_val_result', 'generated_%d.json' % i)
    result_path = os.path.join(FLAGS.model_root, FLAGS.model_name, 
        'decode_val_result', 'result_%d.txt' % i)
    cu.create_dir_if_not_exists(os.path.dirname(pred_path))
    with open(pred_path, 'w') as fout:
      json.dump(pred_sentences, fout)
    cu.coco_val_eval(pred_path, result_path)
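The xrange(*[int(x) for x in FLAGS.eval_stat.split()]) idiom above treats the flag as whitespace-separated range() arguments. Assuming an invented flag value of "10000 50001 10000", the loop would visit model_10000.ckpt through model_50000.ckpt in steps of 10000:

eval_stat = '10000 50001 10000'  # hypothetical FLAGS.eval_stat value
print([i for i in xrange(*[int(x) for x in eval_stat.split()])])
# [10000, 20000, 30000, 40000, 50000]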
Example #8
    def __init__(self,
                 is_training,
                 config,
                 num_steps,
                 model_name,
                 flag_with_saver=False,
                 model_root='./cache/models/mscoco',
                 flag_reset_state=False):
        # Set up paths and dirs
        self.cu = CommonUtiler()
        self.model_dir = os.path.join(model_root, model_name)
        self.variable_dir = os.path.join(self.model_dir, 'variables')

        self.cu.create_dir_if_not_exists(self.model_dir)
        self.cu.create_dir_if_not_exists(self.variable_dir)

        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps
        rnn_size = config.rnn_size
        emb_size = config.emb_size
        vocab_size = config.vocab_size
        vf_size = config.vf_size

        # Inputs to the model
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._visual_features = tf.placeholder(tf.float32,
                                               [batch_size, vf_size])
        self._valid_flags = tf.placeholder(tf.float32, [batch_size, num_steps])
        self._seq_lens = tf.placeholder(tf.int32, [batch_size])

        # Create rnn cell
        if config.rnn_type == 'GRU':
            rnn_cell_basic = tf.nn.rnn_cell.GRUCell(rnn_size)
        elif config.rnn_type == 'LSTM':
            rnn_cell_basic = tf.nn.rnn_cell.LSTMCell(rnn_size,
                                                     input_size=emb_size,
                                                     use_peepholes=True)
        else:
            raise NameError("Unknown rnn type %s!" % config.rnn_type)
        if is_training and config.keep_prob_rnn < 1:
            rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
                rnn_cell_basic, output_keep_prob=config.keep_prob_rnn)
        cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] *
                                           config.num_rnn_layers)
        state_size = cell.state_size

        # Create word embeddings
        self._embedding = embedding = tf.get_variable("embedding",
                                                      [vocab_size, emb_size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob_emb < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob_emb)

        # Different ways to fuse text and visual information
        if config.multimodal_type == 'mrnn':
            mm_size = config.mm_size
            # Run RNNs
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = cell.zero_state(
                    batch_size, tf.float32)
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell,
                                           inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map RNN output to multimodal space
            w_r2m = tf.get_variable("w_r2m", [rnn_size, mm_size])
            b_r2m = tf.get_variable("b_r2m", [mm_size])
            multimodal_l = tf.nn.relu(tf.matmul(output_rnn, w_r2m) + b_r2m)

            # Map Visual feature to multimodal space
            w_vf2m = tf.get_variable("w_vf2m", [vf_size, mm_size])
            b_vf2m = tf.get_variable("b_vf2m", [mm_size])
            mm_vf_single = tf.nn.relu(
                tf.matmul(self._visual_features, w_vf2m) + b_vf2m)
            mm_vf = tf.reshape(tf.tile(mm_vf_single, [1, num_steps]),
                               [-1, mm_size])
            multimodal_l = multimodal_l + mm_vf
            if is_training and config.keep_prob_mm < 1:
                multimodal_l = tf.nn.dropout(multimodal_l, config.keep_prob_mm)

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [mm_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(multimodal_l, w_m2w) + b_m2w)

        elif config.multimodal_type == 'init':
            # Mapping visual feature to the RNN state
            w_vf2state = tf.get_variable("w_vf2state", [vf_size, state_size])
            b_vf2state = tf.get_variable("b_vf2state", [state_size])
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = tf.nn.relu(
                    tf.matmul(self._visual_features, w_vf2state) + b_vf2state)

            # Run RNNs
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell,
                                           inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [rnn_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(output_rnn, w_m2w) + b_m2w)

        else:
            raise NameError("Unknown multimodal type %s!" %
                            config.multimodal_type)

        # Build softmax loss
        # share the weights between embedding and softmax acc. to [2]
        w_loss = tf.transpose(embedding)
        b_loss = tf.get_variable("b_loss", [vocab_size])
        self._logit = logit = tf.matmul(output, w_loss) + b_loss

        target = tf.reshape(math_ops.to_int64(self._targets), [-1])
        valid_flag = tf.reshape(self._valid_flags, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, target)
        self._cost = cost = tf.reduce_sum(
            loss * valid_flag) / (tf.reduce_sum(valid_flag) + 1e-12)

        # Create saver if necessary
        if flag_with_saver:
            self.saver = tf.train.Saver(max_to_keep=None)
        else:
            self.saver = None

        # Return the model if it is just for inference
        if not is_training:
            return

        # Create learning rate and gradients optimizer
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        if hasattr(config, 'optimizer'):
            if config.optimizer == 'ori':
                optimizer = tf.train.GradientDescentOptimizer(self.lr)
            elif config.optimizer == 'ada':  # No GPU
                optimizer = tf.train.AdagradOptimizer(self.lr)
            elif config.optimizer == 'adam':
                optimizer = tf.train.AdamOptimizer(self.lr)
            elif config.optimizer == 'rms':
                optimizer = tf.train.RMSPropOptimizer(self.lr)
            else:
                raise NameError("Unknown optimizer type %s!" %
                                config.optimizer)
        else:
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))
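As a standalone illustration of the 'mrnn' fusion branch above (NumPy only, toy dimensions, not the repo's code): the RNN output and the visual feature are each projected into the shared multimodal space with a ReLU and summed elementwise before being mapped back to word space.

import numpy as np

def relu(x):
    return np.maximum(x, 0.0)

rnn_size, vf_size, mm_size = 4, 6, 5  # toy sizes
output_rnn = np.random.randn(1, rnn_size)
visual_features = np.random.randn(1, vf_size)
w_r2m, b_r2m = np.random.randn(rnn_size, mm_size), np.zeros(mm_size)
w_vf2m, b_vf2m = np.random.randn(vf_size, mm_size), np.zeros(mm_size)

multimodal_l = (relu(np.dot(output_rnn, w_r2m) + b_r2m) +
                relu(np.dot(visual_features, w_vf2m) + b_vf2m))
assert multimodal_l.shape == (1, mm_size)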
Example #9
logger = logging.getLogger('ExpMscoco')
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
    # Hyperparameters
    min_count = 3
    vocab_path = './cache/dictionary/mscoco_mc%d_vocab' % min_count
    mscoco_root = './datasets/ms_coco'
    anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy']

    # Preparations
    cu = CommonUtiler()
    cu.create_dir_if_not_exists(os.path.dirname(vocab_path))

    # Scan the anno files
    vocab = {}
    for anno_file_name in anno_file_names:
        anno_path = os.path.join(mscoco_root, 'mscoco_anno_files',
                                 anno_file_name)
        annos = np.load(anno_path).tolist()
        for anno in annos:
            for sentence in anno['sentences']:
                for word in sentence:
                    word = word.strip().lower()
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
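The counting loop above is equivalent to this collections.Counter sketch; words occurring fewer than min_count times would then be dropped when the vocabulary file is written (a step not included in the snippet):

from collections import Counter

counts = Counter()
for sentence in [['a', 'dog', 'runs'], ['a', 'cat']]:  # toy sentences
    counts.update(word.strip().lower() for word in sentence)
kept = [w for (w, c) in counts.items() if c >= 3]  # min_count = 3 filter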
Example #10
sys.path.append('./py_lib/')
from common_utils import CommonUtiler
from tf_mrnn_decoder import mRNNDecoder
from vision import ImageFeatureExtractor


# In[2]:

# set up paths
mrnn_model_path = './trained_models/coco_caption/mrnn_GRU_570K.ckpt'
mrnn_config_path = './model_conf/mrnn_GRU_conf.py'
mrnn_vocab_path = './trained_models/coco_caption/mscoco_mc3_vocab'
img_model_path = './external/tf_cnn_models/inception_v3.pb'

# initialize feature extractor and sentence decoder
cu = CommonUtiler()
config = cu.load_config(mrnn_config_path)
ife = ImageFeatureExtractor(img_model_path)
decoder = mRNNDecoder(config, 'demo', mrnn_vocab_path)


# In[3]:

demo_image_path = 'demo_image.jpg'
beam_size = 3
# extract visual feature for the image
visual_features = ife.extract_features(demo_image_path, 
                                       flag_from_file=True)
# generate sentences
decoder.load_model(mrnn_model_path)
sentences = decoder.decode(visual_features, beam_size)
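A possible follow-up (not part of the snippet): decode returns beam candidates sorted by score, each carrying a 'words' list and a 'score', so the generated captions can be printed like this.

for (ind, sentence) in enumerate(sentences):
    print('%d (%.3f): %s' % (ind + 1, sentence['score'],
                             ' '.join(sentence['words'])))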
Example #11
class mRNNDecoder(object):
  """The sentence decoder (generator) for mRNNModel."""

  def __init__(self, config, model_name, vocab_path,
               ses_threads=2,
               gpu_memory_fraction=1.0):
    self.cu = CommonUtiler()
    self.config = copy.deepcopy(config)
    self.config.batch_size = 1
    self.model_path = None
    self.model_name = model_name
    self.flag_load_model = False
    self.vocab_path = vocab_path
    self.vocab, self.rev_vocab = self.cu.load_vocabulary(vocab_path)
    
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=gpu_memory_fraction)
    self.session = session = tf.Session(config=tf.ConfigProto(
        intra_op_parallelism_threads=ses_threads, 
        gpu_options=gpu_options))
    
    with tf.variable_scope("mRNNmodel", reuse=None):
      self.model_init = mRNNModel(
          is_training=False,
          num_steps=1, 
          config=self.config,
          model_name=self.model_name,
          flag_with_saver=True)
    
    with tf.variable_scope("mRNNmodel", reuse=True):
      self.model_cont = mRNNModel(
          is_training=False,
          num_steps=1, 
          config=self.config,
          model_name=self.model_name,
          flag_with_saver=False,
          flag_reset_state=True)
          
  def load_model(self, model_path):
    self.model_init.saver.restore(self.session, model_path)
    self.flag_load_model = True
    self.model_path = model_path
    logger.info('Loaded model from %s', model_path)
    
  def decode(self, visual_features, beam_size, max_steps=30):
    """Decode an image with a sentences."""
    assert visual_features.shape[0] == self.config.vf_size
    assert self.flag_load_model, 'Must call local_model first'
    vocab = self.vocab
    rev_vocab = self.rev_vocab
    
    # Initialize beam search variables
    # Each candidate is represented with a dictionary:
    #   "indexes": a list of word indexes denoting a sentence
    #   "words": words in the decoded sentence, without <bos>
    #   "score": negative log-likelihood of the sentence (lower is better)
    #   "state": RNN state when generating the last word of the candidate
    good_sentences = [] # store sentences already ended with <bos>
    cur_best_cand = [] # store current best candidates
    highest_score = 0.0 # highest score (neg. log-likelihood) among good ones
    
    # Get the initial logit and state
    logit_init, state_init = self.get_logit_init(visual_features)
    logit_init = np.squeeze(logit_init)
    assert logit_init.shape[0] == self.config.vocab_size and len(
        logit_init.shape) == 1
    logit_init = self.cu.softmax(logit_init)
    logit_init_order = np.argsort(-logit_init)
    for ind_b in xrange(beam_size):
      cand = {}
      cand['indexes'] = [logit_init_order[ind_b]]
      cand['score'] = -np.log(logit_init[logit_init_order[ind_b]])
      cand['state'] = state_init
      cur_best_cand.append(cand)
      
    # Expand the current best candidates until max_steps or no candidate
    for i in xrange(max_steps):
      # move candidates ending with <bos> to good_sentences or drop them
      cand_left = []
      for cand in cur_best_cand:
        if len(good_sentences) > beam_size and cand['score'] > highest_score:
          continue # No need to expand that candidate
        if cand['indexes'][-1] == vocab['<bos>']:
          good_sentences.append(cand)
          highest_score = max(highest_score, cand['score'])
        else:
          cand_left.append(cand)
      cur_best_cand = cand_left
      if not cur_best_cand:
        break
      # expand the remaining candidates
      cand_pool = []
      for cand in cur_best_cand:
        logit, state = self.get_logit_cont(cand['state'], cand['indexes'][-1],
            visual_features)
        logit = np.squeeze(logit)
        logit = self.cu.softmax(logit)
        logit_order = np.argsort(-logit)
        for ind_b in xrange(beam_size):
          cand_e = copy.deepcopy(cand)
          cand_e['indexes'].append(logit_order[ind_b])
          cand_e['score'] -= np.log(logit[logit_order[ind_b]])
          cand_e['state'] = state
          cand_pool.append(cand_e)
      # get final cand_pool
      cur_best_cand = sorted(cand_pool, key=lambda cand: cand['score'])
      cur_best_cand = self.cu.truncate_list(cur_best_cand, beam_size)
      
    # Add the remaining candidates in cur_best_cand to good sentences
    for cand in cur_best_cand:
      if len(good_sentences) > beam_size and cand['score'] > highest_score:
        continue
      if cand['indexes'][-1] != vocab['<bos>']:
        cand['indexes'].append(vocab['<bos>'])
      good_sentences.append(cand)
      highest_score = max(highest_score, cand['score'])
      
    # Sort good sentences and return the final list
    good_sentences = sorted(good_sentences, key=lambda cand: cand['score'])
    good_sentences = self.cu.truncate_list(good_sentences, beam_size)
    for sentence in good_sentences:
      sentence['words'] = self.cu.decode_sentence(
          sentence['indexes'], vocab, rev_vocab)
    
    return good_sentences
    
  def get_logit_init(self, visual_features):
    """Use the model to get initial logit"""
    m = self.model_init
    session = self.session
    vocab = self.vocab
    config = self.config
    
    x = np.zeros([1, 1], dtype=np.int32)
    vf = np.zeros([1, config.vf_size], dtype=np.float32)
    fg = np.ones([1, 1], dtype=np.float32)
    sl = np.ones([1], dtype=np.int32)
    vf[0, :] = visual_features
    x[0] = vocab['<bos>']
    
    logit, state = session.run([m.logit, m.final_state],
                               {m.input_data: x,
                                m.visual_features: vf,
                                m.valid_flags: fg,
                                m.seq_lens: sl})
                              
    return (logit, state)
    
  def get_logit_cont(self, state_prev, index_word, visual_features):
    """Use the model to get continued logit"""
    m = self.model_cont
    session = self.session
    config = self.config
    
    x = np.zeros([1, 1], dtype=np.int32)
    vf = np.zeros([1, config.vf_size], dtype=np.float32)
    fg = np.ones([1, 1], dtype=np.float32)
    sl = np.ones([1], dtype=np.int32)
    vf[0, :] = visual_features
    x[0] = index_word
    
    logit, state = session.run([m.logit, m.final_state],
                               {m.input_data: x,
                                m.visual_features: vf,
                                m.valid_flags: fg,
                                m.seq_lens: sl,
                                m.initial_state: state_prev})
                              
    return (logit, state)
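A note on the scoring used in decode above: cand['score'] accumulates -log(p) for every generated word, so it is a negative log-likelihood and lower means more probable. A two-word toy check of that bookkeeping:

import numpy as np

probs = [0.5, 0.25]  # per-word softmax probabilities along one beam path
score = -np.sum(np.log(probs))
assert np.isclose(score, -np.log(0.5 * 0.25))  # joint probability of the path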
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
  flag_ignore_exists = True
  # Path
  model_path = './external/tf_cnn_models/inception_v3.pb'
  mscoco_root = './datasets/ms_coco'
  anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy',
                     'anno_list_mscoco_crVal_m_RNN.npy',
                     'anno_list_mscoco_test2014.npy']
  feat_dir = './cache/mscoco_image_features/inception_v3'
  
  # Preparations
  cu = CommonUtiler()
  ife = ImageFeatureExtractor(model_path)
  cu.create_dir_if_not_exists(os.path.join(feat_dir, 'train2014'))
  cu.create_dir_if_not_exists(os.path.join(feat_dir, 'test2014'))
  cu.create_dir_if_not_exists(os.path.join(feat_dir, 'val2014'))
  
  # Extract features
  for anno_file_name in anno_file_names:
    anno_path = os.path.join(mscoco_root, 'mscoco_anno_files', anno_file_name)
    annos = np.load(anno_path).tolist()
    for (ind_a, anno) in enumerate(annos):
      image_path = os.path.join(mscoco_root, 'images', anno['file_path'],
          anno['file_name'])
      feat_path = os.path.join(feat_dir, anno['file_path'],
          anno['file_name'].split('.')[0] + '.txt')
          
Example #13
class mRNNCocoBucketDataProvider(object):
    """mRNN TensorFlow Data Provider with Buckets on MS COCO."""
    def __init__(self,
                 anno_files_path,
                 vocab_path,
                 vocab_size,
                 vf_dir,
                 vf_size,
                 flag_shuffle=True):
        self.cu = CommonUtiler()
        self.anno_files_path = anno_files_path
        self.vocab_path = vocab_path
        self.vocab, _ = self.cu.load_vocabulary(vocab_path)
        assert len(self.vocab) == vocab_size
        assert self.vocab['<pad>'] == 0
        self.vf_dir = vf_dir
        self.vf_size = vf_size
        self.flag_shuffle = flag_shuffle
        self._load_data()

    def generate_batches(self, batch_size, buckets):
        """Return a list generator of mini-batches of training data."""
        # create Batches
        batches = []
        for max_seq_len in buckets:
            batches.append(
                Batch(batch_size, max_seq_len, self.vf_size,
                      self.vocab['<bos>']))
        # shuffle if necessary
        if self.flag_shuffle:
            np.random.shuffle(self._data_pointer)
        # scan data queue
        for ind_i, ind_s in self._data_pointer:
            sentence = self._data_queue[ind_i]['sentences'][ind_s]
            visual_features = self._data_queue[ind_i]['visual_features']
            if len(sentence) >= buckets[-1]:
                feed_res = batches[-1].feed_and_vomit(visual_features,
                                                      sentence)
                ind_buc = len(buckets) - 1
            else:
                for (ind_b, batch) in enumerate(batches):
                    if len(sentence) < batch.max_seq_len:
                        feed_res = batches[ind_b].feed_and_vomit(
                            visual_features, sentence)
                        ind_buc = ind_b
                        break
            if feed_res:
                yield (ind_buc, ) + feed_res
                batches[ind_buc].empty()

    def _load_data(self, verbose=True):
        logger.info('Loading data')
        vocab = self.vocab
        self._data_queue = []
        self._data_pointer = []
        ind_img = 0
        num_failed = 0
        for anno_file_path in self.anno_files_path:
            annos = np.load(anno_file_path).tolist()
            for (ind_a, anno) in enumerate(annos):
                data = {}
                # Load visual features
                feat_path = os.path.join(
                    self.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                if os.path.exists(feat_path):
                    vf = np.loadtxt(feat_path)
                else:
                    num_failed += 1
                    continue
                data['visual_features'] = vf
                # Encode sentences
                data['sentences'] = []
                for (ind_s, sentence) in enumerate(anno['sentences']):
                    sentence_encode = self.cu.encode_sentence(
                        sentence, vocab, flag_add_bos=False)
                    self._data_pointer.append((ind_img, ind_s))
                    data['sentences'].append(np.array(sentence_encode))

                self._data_queue.append(data)
                ind_img += 1
                if verbose and (ind_a + 1) % 5000 == 0:
                    logger.info('Loaded %d/%d annotations from file %s',
                                ind_a + 1, len(annos), anno_file_path)

        logger.info(
            'Loaded %d images, %d sentences from %d files, %d images failed',
            len(self._data_queue), len(self._data_pointer),
            len(self.anno_files_path), num_failed)
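A hedged usage sketch of the provider above (paths and sizes are placeholders, not real files): each yielded tuple starts with the index of the bucket the mini-batch came from, followed by whatever Batch.feed_and_vomit returned.

provider = mRNNCocoBucketDataProvider(
    ['./anno_train.npy'],      # hypothetical annotation file
    './mscoco_vocab', 10000,   # vocabulary path and its expected size
    './image_features', 2048)  # visual feature dir and dimension
for feed in provider.generate_batches(batch_size=64, buckets=[10, 20, 30]):
    ind_buc = feed[0]      # which bucket this mini-batch belongs to
    batch_data = feed[1:]  # the arrays assembled by feed_and_vomit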
Example #14
sys.path.append('./py_lib/')
from common_utils import CommonUtiler
from tf_mrnn_decoder import mRNNDecoder
from vision import ImageFeatureExtractor

# In[2]:

# set up paths
mrnn_model_path = './trained_models/coco_caption/mrnn_GRU_570K.ckpt'
mrnn_config_path = './model_conf/mrnn_GRU_conf.py'
mrnn_vocab_path = './trained_models/coco_caption/mscoco_mc3_vocab'
img_model_path = './external/tf_cnn_models/inception_v3.pb'

# initialize feature extractor and sentence decoder
cu = CommonUtiler()
config = cu.load_config(mrnn_config_path)
ife = ImageFeatureExtractor(img_model_path)
decoder = mRNNDecoder(config, 'demo', mrnn_vocab_path)

# In[3]:

demo_image_path = 'demo_image.jpg'
beam_size = 3
# extract visual feature for the image
visual_features = ife.extract_features(demo_image_path, flag_from_file=True)
# generate sentences
decoder.load_model(mrnn_model_path)
sentences = decoder.decode(visual_features, beam_size)

# In[4]:
Example #15
  def __init__(self, is_training, config, num_steps, model_name,
               flag_with_saver=False,
               model_root='./cache/models/mscoco',
               flag_reset_state=False):
    # Set up paths and dirs
    self.cu = CommonUtiler()
    self.model_dir = os.path.join(model_root, model_name)
    self.variable_dir = os.path.join(self.model_dir, 'variables')

    self.cu.create_dir_if_not_exists(self.model_dir)
    self.cu.create_dir_if_not_exists(self.variable_dir)
  
    self.batch_size = batch_size = config.batch_size
    self.num_steps = num_steps
    rnn_size = config.rnn_size
    emb_size = config.emb_size
    vocab_size = config.vocab_size
    vf_size = config.vf_size

    # Inputs to the model
    self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
    self._visual_features = tf.placeholder(tf.float32, [batch_size, vf_size])
    self._valid_flags = tf.placeholder(tf.float32, [batch_size, num_steps])
    self._seq_lens = tf.placeholder(tf.int32, [batch_size])

    # Create rnn cell
    if config.rnn_type == 'GRU':
      rnn_cell_basic = tf.nn.rnn_cell.GRUCell(rnn_size)
    elif config.rnn_type == 'LSTM':
      rnn_cell_basic = tf.nn.rnn_cell.LSTMCell(rnn_size, input_size=emb_size, 
          use_peepholes=True)
    else:
      raise NameError("Unknown rnn type %s!" % config.rnn_type)
    if is_training and config.keep_prob_rnn < 1:
      rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
          rnn_cell_basic, output_keep_prob=config.keep_prob_rnn)
    cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] * config.num_rnn_layers)
    state_size = cell.state_size
    
    # Create word embeddings
    self._embedding = embedding = tf.get_variable("embedding", 
        [vocab_size, emb_size])
    inputs = tf.nn.embedding_lookup(embedding, self._input_data)

    if is_training and config.keep_prob_emb < 1:
      inputs = tf.nn.dropout(inputs, config.keep_prob_emb)
    
    # Different ways to fuse text and visual information
    if config.multimodal_type == 'mrnn':
      mm_size = config.mm_size
      # Run RNNs
      if flag_reset_state:
        self._initial_state = initial_state = tf.placeholder(tf.float32, 
            [batch_size, state_size])
      else:
        self._initial_state = initial_state = cell.zero_state(
            batch_size, tf.float32)
      inputs = [tf.squeeze(input_, [1])
          for input_ in tf.split(1, num_steps, inputs)]
      outputs_rnn, state = tf.nn.rnn(cell, inputs, 
          initial_state=initial_state,
          sequence_length=self._seq_lens)
      self._final_state = state
      output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])
      
      # Map RNN output to multimodal space
      w_r2m = tf.get_variable("w_r2m", [rnn_size, mm_size])
      b_r2m = tf.get_variable("b_r2m", [mm_size])
      multimodal_l = tf.nn.relu(tf.matmul(output_rnn, w_r2m) + b_r2m)
      
      # Map Visual feature to multimodal space
      w_vf2m = tf.get_variable("w_vf2m", [vf_size, mm_size])
      b_vf2m = tf.get_variable("b_vf2m", [mm_size])
      mm_vf_single = tf.nn.relu(
          tf.matmul(self._visual_features, w_vf2m) + b_vf2m)
      mm_vf = tf.reshape(tf.tile(mm_vf_single, [1, num_steps]), [-1, mm_size])
      multimodal_l = multimodal_l + mm_vf
      if is_training and config.keep_prob_mm < 1:
        multimodal_l = tf.nn.dropout(multimodal_l, config.keep_prob_mm)
      
      # Map multimodal space to word space
      w_m2w = tf.get_variable("w_m2w", [mm_size, emb_size])
      b_m2w = tf.get_variable("b_m2w", [emb_size])
      output = tf.nn.relu(tf.matmul(multimodal_l, w_m2w) + b_m2w)
      
    elif config.multimodal_type == 'init':
      # Mapping visual feature to the RNN state
      w_vf2state = tf.get_variable("w_vf2state", [vf_size, state_size])
      b_vf2state = tf.get_variable("b_vf2state", [state_size])
      if flag_reset_state:
        self._initial_state = initial_state = tf.placeholder(tf.float32, 
            [batch_size, state_size])
      else:
        self._initial_state = initial_state = tf.nn.relu(
            tf.matmul(self._visual_features, w_vf2state) + b_vf2state)

      # Run RNNs
      inputs = [tf.squeeze(input_, [1])
          for input_ in tf.split(1, num_steps, inputs)]
      outputs_rnn, state = tf.nn.rnn(cell, inputs, 
          initial_state=initial_state,
          sequence_length=self._seq_lens)
      self._final_state = state
      output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])
      
      # Map multimodal space to word space
      w_m2w = tf.get_variable("w_m2w", [rnn_size, emb_size])
      b_m2w = tf.get_variable("b_m2w", [emb_size])
      output = tf.nn.relu(tf.matmul(output_rnn, w_m2w) + b_m2w)
      
    else:
      raise NameError("Unknown multimodal type %s!" % config.multimodal_type)

    # Build softmax loss
    # share the weights between embedding and softmax acc. to [2]
    w_loss = tf.transpose(embedding)
    b_loss = tf.get_variable("b_loss", [vocab_size])
    self._logit = logit = tf.matmul(output, w_loss) + b_loss
    
    target = tf.reshape(math_ops.to_int64(self._targets), [-1])
    valid_flag = tf.reshape(self._valid_flags, [-1])
    loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, target)
    self._cost = cost = tf.reduce_sum(loss * valid_flag) / (
        tf.reduce_sum(valid_flag) + 1e-12)
    
    # Create saver if necessary
    if flag_with_saver:
      self.saver = tf.train.Saver(max_to_keep=None)
    else:
      self.saver = None

    # Return the model if it is just for inference
    if not is_training:
      return

    # Create learning rate and gradients optimizer
    self._lr = tf.Variable(0.0, trainable=False)
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                      config.max_grad_norm)
    if hasattr(config, 'optimizer'):
      if config.optimizer == 'ori':
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
      elif config.optimizer == 'ada': # No GPU
        optimizer = tf.train.AdagradOptimizer(self.lr)
      elif config.optimizer == 'adam':
        optimizer = tf.train.AdamOptimizer(self.lr)
      elif config.optimizer == 'rms':
        optimizer = tf.train.RMSPropOptimizer(self.lr)
      else:
        raise NameError("Unknown optimizer type %s!" % config.optimizer)
    else:
      optimizer = tf.train.GradientDescentOptimizer(self.lr)
    self._train_op = optimizer.apply_gradients(zip(grads, tvars))
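A NumPy sketch (illustrative only) of the weight tying noted in the loss section above: the softmax projection reuses the transposed embedding matrix, so each logit is the dot product between the multimodal output and that word's embedding vector.

import numpy as np

vocab_size, emb_size = 8, 4  # toy sizes
embedding = np.random.randn(vocab_size, emb_size)
output = np.random.randn(1, emb_size)
b_loss = np.zeros(vocab_size)  # stands in for the learned bias
logit = np.dot(output, embedding.T) + b_loss  # w_loss = embedding.T
assert logit.shape == (1, vocab_size)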
Example #16
logger = logging.getLogger('ExpMscoco')
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
  # Hyperparameters
  min_count = 3
  vocab_path = './cache/dictionary/mscoco_mc%d_vocab' % min_count
  mscoco_root = './datasets/ms_coco'
  anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy']
  
  # Preparations
  cu = CommonUtiler()
  cu.create_dir_if_not_exists(os.path.dirname(vocab_path))
  
  # Scan the anno files
  vocab = {}
  for anno_file_name in anno_file_names:
    anno_path = os.path.join(mscoco_root, 'mscoco_anno_files', anno_file_name)
    annos = np.load(anno_path).tolist()
    for anno in annos:
      for sentence in anno['sentences']:
        for word in sentence:
          word = word.strip().lower()
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
Example #17
class mRNNCocoBucketDataProvider(object):
  """mRNN TensorFlow Data Provider with Buckets on MS COCO."""
  def __init__(self, anno_files_path, vocab_path, vocab_size, vf_dir, vf_size,
      flag_shuffle=True):
    self.cu = CommonUtiler()
    self.anno_files_path = anno_files_path
    self.vocab_path = vocab_path
    self.vocab, _ = self.cu.load_vocabulary(vocab_path)
    assert len(self.vocab) == vocab_size
    assert self.vocab['<pad>'] == 0
    self.vf_dir = vf_dir
    self.vf_size = vf_size
    self.flag_shuffle = flag_shuffle
    self._load_data()
      
  def generate_batches(self, batch_size, buckets):
    """Return a list generator of mini-batches of training data."""
    # create Batches
    batches = []
    for max_seq_len in buckets:
      batches.append(
          Batch(batch_size, max_seq_len, self.vf_size, self.vocab['<bos>']))
    # shuffle if necessary
    if self.flag_shuffle:
      np.random.shuffle(self._data_pointer)
    # scan data queue
    for ind_i, ind_s in self._data_pointer:
      sentence = self._data_queue[ind_i]['sentences'][ind_s]
      visual_features = self._data_queue[ind_i]['visual_features']
      if len(sentence) >= buckets[-1]:
        feed_res = batches[-1].feed_and_vomit(visual_features, sentence)
        ind_buc = len(buckets) - 1
      else:
        for (ind_b, batch) in enumerate(batches):
          if len(sentence) < batch.max_seq_len:
            feed_res = batches[ind_b].feed_and_vomit(visual_features, sentence)
            ind_buc = ind_b
            break
      if feed_res:
        yield (ind_buc,) + feed_res
        batches[ind_buc].empty()
          
  def _load_data(self, verbose=True):
    logger.info('Loading data')
    vocab = self.vocab
    self._data_queue = []
    self._data_pointer = []
    ind_img = 0
    num_failed = 0
    for anno_file_path in self.anno_files_path:
      annos = np.load(anno_file_path).tolist()
      for (ind_a, anno) in enumerate(annos):
        data = {}
        # Load visual features
        feat_path = os.path.join(self.vf_dir, anno['file_path'],
            anno['file_name'].split('.')[0] + '.txt')
        if os.path.exists(feat_path):
          vf = np.loadtxt(feat_path)
        else:
          num_failed += 1
          continue
        data['visual_features'] = vf
        # Encode sentences
        data['sentences'] = []
        for (ind_s, sentence) in enumerate(anno['sentences']):
          sentence_encode = self.cu.encode_sentence(sentence, vocab, 
              flag_add_bos=False)
          self._data_pointer.append((ind_img, ind_s))
          data['sentences'].append(np.array(sentence_encode))
          
        self._data_queue.append(data)
        ind_img += 1
        if verbose and (ind_a + 1) % 5000 == 0:
          logger.info('Loaded %d/%d annotations from file %s', ind_a + 1,
              len(annos), anno_file_path)
        
    logger.info('Loaded %d images, %d sentences from %d files, %d images failed',
        len(self._data_queue), len(self._data_pointer),
        len(self.anno_files_path), num_failed)
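generate_batches above relies on a contract that is only implied here: feed_and_vomit returns a falsy value until its batch is full, then the assembled batch. A toy stand-in written under that assumption (the real Batch class is not shown in these examples):

class ToyBatch(object):
  """Hypothetical stand-in for Batch, mirroring the implied contract."""
  def __init__(self, batch_size, max_seq_len):
    self.batch_size = batch_size
    self.max_seq_len = max_seq_len
    self.items = []
  def feed_and_vomit(self, visual_features, sentence):
    self.items.append((visual_features, sentence))
    if len(self.items) == self.batch_size:
      return tuple(self.items)  # full batch, ready to yield
    return None  # not full yet, nothing to emit
  def empty(self):
    self.items = []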
Example #18
class mRNNDecoder(object):
    """The sentence decoder (generator) for mRNNModel."""
    def __init__(self,
                 config,
                 model_name,
                 vocab_path,
                 ses_threads=2,
                 gpu_memory_fraction=1.0):
        self.cu = CommonUtiler()
        self.config = copy.deepcopy(config)
        self.config.batch_size = 1
        self.model_path = None
        self.model_name = model_name
        self.flag_load_model = False
        self.vocab_path = vocab_path
        self.vocab, self.rev_vocab = self.cu.load_vocabulary(vocab_path)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        self.session = session = tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=ses_threads, gpu_options=gpu_options))

        with tf.variable_scope("mRNNmodel", reuse=None):
            self.model_init = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=True)

        with tf.variable_scope("mRNNmodel", reuse=True):
            self.model_cont = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=False,
                                        flag_reset_state=True)

    def load_model(self, model_path):
        self.model_init.saver.restore(self.session, model_path)
        self.flag_load_model = True
        self.model_path = model_path
        logger.info('Loaded model from %s', model_path)

    def decode(self, visual_features, beam_size, max_steps=30):
        """Decode an image with a sentences."""
        assert visual_features.shape[0] == self.config.vf_size
        assert self.flag_load_model, 'Must call local_model first'
        vocab = self.vocab
        rev_vocab = self.rev_vocab

        # Initialize beam search variables
        # Each candidate is represented with a dictionary:
        #   "indexes": a list of word indexes denoting a sentence
        #   "words": words in the decoded sentence, without <bos>
        #   "score": negative log-likelihood of the sentence (lower is better)
        #   "state": RNN state when generating the last word of the candidate
        good_sentences = []  # store sentences already ended with <bos>
        cur_best_cand = []  # store current best candidates
        highest_score = 0.0  # highest score (neg. log-likelihood) among good ones

        # Get the initial logit and state
        logit_init, state_init = self.get_logit_init(visual_features)
        logit_init = np.squeeze(logit_init)
        assert logit_init.shape[0] == self.config.vocab_size and len(
            logit_init.shape) == 1
        logit_init = self.cu.softmax(logit_init)
        logit_init_order = np.argsort(-logit_init)
        for ind_b in xrange(beam_size):
            cand = {}
            cand['indexes'] = [logit_init_order[ind_b]]
            cand['score'] = -np.log(logit_init[logit_init_order[ind_b]])
            cand['state'] = state_init
            cur_best_cand.append(cand)

        # Expand the current best candidates until max_steps or no candidate
        for i in xrange(max_steps):
            # move candidates ending with <bos> to good_sentences or drop them
            cand_left = []
            for cand in cur_best_cand:
                if (len(good_sentences) > beam_size
                        and cand['score'] > highest_score):
                    continue  # No need to expand that candidate
                if cand['indexes'][-1] == vocab['<bos>']:
                    good_sentences.append(cand)
                    highest_score = max(highest_score, cand['score'])
                else:
                    cand_left.append(cand)
            cur_best_cand = cand_left
            if not cur_best_cand:
                break
            # expand the remaining candidates
            cand_pool = []
            for cand in cur_best_cand:
                logit, state = self.get_logit_cont(cand['state'],
                                                   cand['indexes'][-1],
                                                   visual_features)
                logit = np.squeeze(logit)
                logit = self.cu.softmax(logit)
                logit_order = np.argsort(-logit)
                for ind_b in xrange(beam_size):
                    cand_e = copy.deepcopy(cand)
                    cand_e['indexes'].append(logit_order[ind_b])
                    cand_e['score'] -= np.log(logit[logit_order[ind_b]])
                    cand_e['state'] = state
                    cand_pool.append(cand_e)
            # get final cand_pool
            cur_best_cand = sorted(cand_pool, key=lambda cand: cand['score'])
            cur_best_cand = self.cu.truncate_list(cur_best_cand, beam_size)

        # Add the remaining candidates in cur_best_cand to good sentences
        for cand in cur_best_cand:
            if (len(good_sentences) > beam_size
                    and cand['score'] > highest_score):
                continue
            if cand['indexes'][-1] != vocab['<bos>']:
                cand['indexes'].append(vocab['<bos>'])
            good_sentences.append(cand)
            highest_score = max(highest_score, cand['score'])

        # Sort good sentences and return the final list
        good_sentences = sorted(good_sentences, key=lambda cand: cand['score'])
        good_sentences = self.cu.truncate_list(good_sentences, beam_size)
        for sentence in good_sentences:
            sentence['words'] = self.cu.decode_sentence(
                sentence['indexes'], vocab, rev_vocab)

        return good_sentences

    def get_logit_init(self, visual_features):
        """Use the model to get initial logit"""
        m = self.model_init
        session = self.session
        vocab = self.vocab
        config = self.config

        x = np.zeros([1, 1], dtype=np.int32)
        vf = np.zeros([1, config.vf_size], dtype=np.float32)
        fg = np.ones([1, 1], dtype=np.float32)
        sl = np.ones([1], dtype=np.int32)
        vf[0, :] = visual_features
        x[0] = vocab['<bos>']

        logit, state = session.run(
            [m.logit, m.final_state], {
                m.input_data: x,
                m.visual_features: vf,
                m.valid_flags: fg,
                m.seq_lens: sl
            })

        return (logit, state)

    def get_logit_cont(self, state_prev, index_word, visual_features):
        """Use the model to get continued logit"""
        m = self.model_cont
        session = self.session
        config = self.config

        x = np.zeros([1, 1], dtype=np.int32)
        vf = np.zeros([1, config.vf_size], dtype=np.float32)
        fg = np.ones([1, 1], dtype=np.float32)
        sl = np.ones([1], dtype=np.int32)
        vf[0, :] = visual_features
        x[0] = index_word

        logit, state = session.run(
            [m.logit, m.final_state], {
                m.input_data: x,
                m.visual_features: vf,
                m.valid_flags: fg,
                m.seq_lens: sl,
                m.initial_state: state_prev
            })

        return (logit, state)
Example #19
def main(unused_args):
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Start model training
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=FLAGS.ses_threads)) as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        assert len(config.buckets) >= 1
        assert config.buckets[-1] == config.max_num_steps
        models = []
        with tf.variable_scope("mRNNmodel",
                               reuse=None,
                               initializer=initializer):
            m = mRNNModel(is_training=True,
                          num_steps=config.buckets[0],
                          config=config,
                          model_name=FLAGS.model_name,
                          flag_with_saver=True,
                          model_root=FLAGS.model_root)
            models.append(m)

        with tf.variable_scope("mRNNmodel", reuse=True):
            for bucket in config.buckets[1:]:
                m = mRNNModel(is_training=True,
                              num_steps=bucket,
                              config=config,
                              model_name=FLAGS.model_name,
                              model_root=FLAGS.model_root)
                models.append(m)

        hdlr = logging.FileHandler(os.path.join(m.model_dir, 'log.txt'))
        hdlr.setLevel(logging.INFO)
        hdlr.setFormatter(logging.Formatter(formatter_log))
        logger.addHandler(hdlr)

        if FLAGS.pre_trained_model_path:
            models[0].saver.restore(session, FLAGS.pre_trained_model_path)
            logger.info('Continue to train from %s',
                        FLAGS.pre_trained_model_path)
        else:
            tf.initialize_all_variables().run()

        iters_done = 0
        data_provider = mRNNCocoBucketDataProvider(
            FLAGS.anno_files_path.split(':'), FLAGS.vocab_path,
            config.vocab_size, FLAGS.vf_dir, config.vf_size)
        for i in range(config.num_epoch):
            train_cost, iters_done = run_epoch(session,
                                               iters_done,
                                               config,
                                               models,
                                               data_provider,
                                               verbose=True)
            logger.info("Train cost for epoch %d is %.3f" % (i, train_cost))

        # Save final copy of the model
        models[0].saver.save(
            session, os.path.join(m.variable_dir,
                                  'model_%d.ckpt' % iters_done))
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
  flag_ignore_exists = True
  # Path
  model_path = './external/tf_cnn_models/inception_v3.pb'
  mscoco_root = './datasets/ms_coco'
  anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy',
                     'anno_list_mscoco_crVal_m_RNN.npy',
                     'anno_list_mscoco_test2014.npy']
  feat_dir = './cache/mscoco_image_features/inception_v3'
  
  # Preparations
  cu = CommonUtiler()
  ife = ImageFeatureExtractor(model_path)
  cu.create_dir_if_not_exists(os.path.join(feat_dir, 'train2014'))
  cu.create_dir_if_not_exists(os.path.join(feat_dir, 'test2014'))
  cu.create_dir_if_not_exists(os.path.join(feat_dir, 'val2014'))
  
  # Extract features
  for anno_file_name in anno_file_names:
    anno_path = os.path.join(mscoco_root, 'mscoco_anno_files', anno_file_name)
    annos = np.load(anno_path).tolist()
    for (ind_a, anno) in enumerate(annos):
      image_path = os.path.join(mscoco_root, 'images', anno['file_path'],
          anno['file_name'])
      feat_path = os.path.join(feat_dir, anno['file_path'],
          anno['file_name'].split('.')[0] + '.txt')