Code example #1
    def __init__(self,
                 config,
                 model_name,
                 vocab_path,
                 ses_threads=2,
                 gpu_memory_fraction=1.0):
        self.cu = CommonUtiler()
        self.config = copy.deepcopy(config)
        self.config.batch_size = 1
        self.model_path = None
        self.model_name = model_name
        self.flag_load_model = False
        self.vocab_path = vocab_path
        self.vocab, self.rev_vocab = self.cu.load_vocabulary(vocab_path)

        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_memory_fraction)
        self.session = session = tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=ses_threads, gpu_options=gpu_options))

        with tf.variable_scope("mRNNmodel", reuse=None):
            self.model_init = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=True)

        with tf.variable_scope("mRNNmodel", reuse=True):
            self.model_cont = mRNNModel(is_training=False,
                                        num_steps=1,
                                        config=self.config,
                                        model_name=self.model_name,
                                        flag_with_saver=False,
                                        flag_reset_state=True)
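
All of these snippets target the pre-1.0 TensorFlow API (tf.nn.rnn, tf.split(dim, n, value), tf.initialize_all_variables) and Python 2 (xrange). For orientation, here is a minimal sketch of driving this decoder, using the load_model and decode methods that appear in example #2; the model name, paths, and feature file are placeholders, not names from the project:

# Hypothetical usage sketch; model name, paths, and the feature file are placeholders.
import numpy as np

cu = CommonUtiler()
config = cu.load_config('./model_conf/my_mrnn.py')
decoder = mRNNDecoder(config, 'my_mrnn', './cache/dctionary/mscoco_mc3_vocab',
                      gpu_memory_fraction=0.5)
decoder.load_model('./cache/models/mscoco/my_mrnn/variables/model_10000.ckpt')

visual_features = np.loadtxt('./features/example.txt')  # one image's feature vector
sentences = decoder.decode(visual_features, 3)          # beam search with width 3
print(' '.join(sentences[0]['words']))                  # best-scoring caption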
Code example #2
def main(unused_args):
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Evaluate trained models on val
    decoder = mRNNDecoder(config,
                          FLAGS.model_name,
                          FLAGS.vocab_path,
                          gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    for i in xrange(*[int(x) for x in FLAGS.eval_stat.split()]):
        model_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                  'variables', 'model_%d.ckpt' % i)
        while not os.path.exists(model_path):
            logger.warn('Cannot find model file; sleeping 1 hour before retrying')
            time.sleep(3600)

        decoder.load_model(model_path)

        num_decode = 0
        pred_sentences = []
        for anno_file_path in FLAGS.anno_files_path.split(':'):
            annos = np.load(anno_file_path).tolist()
            for anno in annos:
                feat_path = os.path.join(
                    FLAGS.vf_dir, anno['file_path'],
                    anno['file_name'].split('.')[0] + '.txt')
                visual_features = np.loadtxt(feat_path)
                sentences = decoder.decode(visual_features, FLAGS.beam_size)

                sentence_coco = {}
                sentence_coco['image_id'] = anno['id']
                sentence_coco['caption'] = ' '.join(sentences[0]['words'])
                pred_sentences.append(sentence_coco)
                num_decode += 1

                if num_decode % 100 == 0:
                    logger.info('%d images are decoded' % num_decode)

        pred_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                 'decode_val_result', 'generated_%d.json' % i)
        result_path = os.path.join(FLAGS.model_root, FLAGS.model_name,
                                   'decode_val_result', 'result_%d.txt' % i)
        cu.create_dir_if_not_exists(os.path.dirname(pred_path))
        with open(pred_path, 'w') as fout:
            json.dump(pred_sentences, fout)
        cu.coco_val_eval(pred_path, result_path)
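
The JSON file written above is in the standard COCO caption-results format that the evaluation code consumes: a list of records pairing an image id with a generated caption. Illustrative contents (the ids and captions here are made up):

# Illustrative shape of generated_<i>.json; image ids and captions are invented.
pred_sentences = [
    {'image_id': 184613, 'caption': 'a man riding a wave on a surfboard'},
    {'image_id': 403013, 'caption': 'a group of people sitting around a table'},
]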
Code example #3
File: tf_data_provider.py  Project: zgsxwsdxg/TF-mRNN
    def __init__(self,
                 anno_files_path,
                 vocab_path,
                 vocab_size,
                 vf_dir,
                 vf_size,
                 flag_shuffle=True):
        self.cu = CommonUtiler()
        self.anno_files_path = anno_files_path
        self.vocab_path = vocab_path
        self.vocab, _ = self.cu.load_vocabulary(vocab_path)
        assert len(self.vocab) == vocab_size
        assert self.vocab['<pad>'] == 0
        self.vf_dir = vf_dir
        self.vf_size = vf_size
        self.flag_shuffle = flag_shuffle
        self._load_data()
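
A minimal sketch of constructing this provider, using the class name from example #6; the paths and sizes are placeholders, and vocab_size/vf_size must match the vocabulary file and the feature dimensionality or the assertions above will fail:

# Hypothetical instantiation; paths and sizes are placeholders.
provider = mRNNCocoBucketDataProvider(
    ['./datasets/ms_coco/mscoco_anno_files/anno_list_mscoco_train.npy'],
    './cache/dctionary/mscoco_mc3_vocab',
    vocab_size=10000,            # must equal len(vocab) in the vocabulary file
    vf_dir='./cache/image_features',
    vf_size=2048)                # must match the visual feature dimensionality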
Code example #4
    def __init__(self,
                 is_training,
                 config,
                 num_steps,
                 model_name,
                 flag_with_saver=False,
                 model_root='./cache/models/mscoco',
                 flag_reset_state=False):
        # Set up paths and dirs
        self.cu = CommonUtiler()
        self.model_dir = os.path.join(model_root, model_name)
        self.variable_dir = os.path.join(self.model_dir, 'variables')

        self.cu.create_dir_if_not_exists(self.model_dir)
        self.cu.create_dir_if_not_exists(self.variable_dir)

        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps
        rnn_size = config.rnn_size
        emb_size = config.emb_size
        vocab_size = config.vocab_size
        vf_size = config.vf_size

        # Inputs to the model
        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._visual_features = tf.placeholder(tf.float32,
                                               [batch_size, vf_size])
        self._valid_flags = tf.placeholder(tf.float32, [batch_size, num_steps])
        self._seq_lens = tf.placeholder(tf.int32, [batch_size])

        # Create rnn cell
        if config.rnn_type == 'GRU':
            rnn_cell_basic = tf.nn.rnn_cell.GRUCell(rnn_size)
        elif config.rnn_type == 'LSTM':
            rnn_cell_basic = tf.nn.rnn_cell.LSTMCell(rnn_size,
                                                     input_size=emb_size,
                                                     use_peepholes=True)
        else:
            raise NameError("Unknown rnn type %s!" % config.rnn_type)
        if is_training and config.keep_prob_rnn < 1:
            rnn_cell_basic = tf.nn.rnn_cell.DropoutWrapper(
                rnn_cell_basic, output_keep_prob=config.keep_prob_rnn)
        cell = tf.nn.rnn_cell.MultiRNNCell([rnn_cell_basic] *
                                           config.num_rnn_layers)
        state_size = cell.state_size

        # Create word embeddings
        self._embedding = embedding = tf.get_variable("embedding",
                                                      [vocab_size, emb_size])
        inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob_emb < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob_emb)

        # Different ways to fuse text and visual information
        if config.multimodal_type == 'mrnn':
            mm_size = config.mm_size
            # Run RNNs
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = cell.zero_state(
                    batch_size, tf.float32)
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell,
                                           inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map RNN output to multimodal space
            w_r2m = tf.get_variable("w_r2m", [rnn_size, mm_size])
            b_r2m = tf.get_variable("b_r2m", [mm_size])
            multimodal_l = tf.nn.relu(tf.matmul(output_rnn, w_r2m) + b_r2m)

            # Map Visual feature to multimodal space
            w_vf2m = tf.get_variable("w_vf2m", [vf_size, mm_size])
            b_vf2m = tf.get_variable("b_vf2m", [mm_size])
            mm_vf_single = tf.nn.relu(
                tf.matmul(self._visual_features, w_vf2m) + b_vf2m)
            mm_vf = tf.reshape(tf.tile(mm_vf_single, [1, num_steps]),
                               [-1, mm_size])
            multimodal_l = multimodal_l + mm_vf
            if is_training and config.keep_prob_mm < 1:
                multimodal_l = tf.nn.dropout(multimodal_l, config.keep_prob_mm)

            # Map multimodal space to word space
            w_m2w = tf.get_variable("w_m2w", [mm_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(multimodal_l, w_m2w) + b_m2w)

        elif config.multimodal_type == 'init':
            # Map the visual feature to the initial RNN state
            w_vf2state = tf.get_variable("w_vf2state", [vf_size, state_size])
            b_vf2state = tf.get_variable("b_vf2state", [state_size])
            if flag_reset_state:
                self._initial_state = initial_state = tf.placeholder(
                    tf.float32, [batch_size, state_size])
            else:
                self._initial_state = initial_state = tf.nn.relu(
                    tf.matmul(self._visual_features, w_vf2state) + b_vf2state)

            # Run RNNs
            inputs = [
                tf.squeeze(input_, [1])
                for input_ in tf.split(1, num_steps, inputs)
            ]
            outputs_rnn, state = tf.nn.rnn(cell,
                                           inputs,
                                           initial_state=initial_state,
                                           sequence_length=self._seq_lens)
            self._final_state = state
            output_rnn = tf.reshape(tf.concat(1, outputs_rnn), [-1, rnn_size])

            # Map RNN output to word space
            w_m2w = tf.get_variable("w_m2w", [rnn_size, emb_size])
            b_m2w = tf.get_variable("b_m2w", [emb_size])
            output = tf.nn.relu(tf.matmul(output_rnn, w_m2w) + b_m2w)

        else:
            raise NameError("Unknown multimodal type %s!" %
                            config.multimodal_type)

        # Build softmax loss;
        # share the weights between embedding and softmax acc. to [2]
        w_loss = tf.transpose(embedding)
        b_loss = tf.get_variable("b_loss", [vocab_size])
        self._logit = logit = tf.matmul(output, w_loss) + b_loss

        target = tf.reshape(math_ops.to_int64(self._targets), [-1])
        valid_flag = tf.reshape(self._valid_flags, [-1])
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logit, target)
        self._cost = cost = tf.reduce_sum(
            loss * valid_flag) / (tf.reduce_sum(valid_flag) + 1e-12)

        # Create saver if necessary
        if flag_with_saver:
            self.saver = tf.train.Saver(max_to_keep=None)
        else:
            self.saver = None

        # Return the model if it is just for inference
        if not is_training:
            return

        # Create learning rate and gradients optimizer
        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        if hasattr(config, 'optimizer'):
            if config.optimizer == 'ori':
                optimizer = tf.train.GradientDescentOptimizer(self.lr)
            elif config.optimizer == 'ada':  # No GPU
                optimizer = tf.train.AdagradOptimizer(self.lr)
            elif config.optimizer == 'adam':
                optimizer = tf.train.AdamOptimizer(self.lr)
            elif config.optimizer == 'rms':
                optimizer = tf.train.RMSPropOptimizer(self.lr)
            else:
                raise NameError("Unknown optimizer type %s!" %
                                config.optimizer)
        else:
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))
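
The cost defined above is a masked average: valid_flags zeroes out padded time steps, and the 1e-12 term guards against division by zero if a batch contains no valid tokens. A NumPy sketch of the same arithmetic:

import numpy as np

# Per-token cross-entropy for four (batch * num_steps) positions;
# the last position is padding, so its flag is 0.
loss = np.array([2.0, 1.0, 3.0, 5.0])
valid_flag = np.array([1.0, 1.0, 1.0, 0.0])
cost = np.sum(loss * valid_flag) / (np.sum(valid_flag) + 1e-12)
print(cost)  # 2.0 -- the padded position is ignored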
Code example #5
logger = logging.getLogger('ExpMscoco')
logging.basicConfig(
    format="[%(asctime)s - %(filename)s:line %(lineno)4s] %(message)s",
    datefmt='%d %b %H:%M:%S')
logger.setLevel(logging.INFO)

if __name__ == '__main__':
    # Hyperparameters
    min_count = 3
    vocab_path = './cache/dctionary/mscoco_mc%d_vocab' % min_count
    mscoco_root = './datasets/ms_coco'
    anno_file_names = ['anno_list_mscoco_trainModelVal_m_RNN.npy']

    # Preparations
    cu = CommonUtiler()
    cu.create_dir_if_not_exists(os.path.dirname(vocab_path))

    # Scan the anno files
    vocab = {}
    for anno_file_name in anno_file_names:
        anno_path = os.path.join(mscoco_root, 'mscoco_anno_files',
                                 anno_file_name)
        annos = np.load(anno_path).tolist()
        for anno in annos:
            for sentence in anno['sentences']:
                for word in sentence:
                    word = word.strip().lower()
                    if word in vocab:
                        vocab[word] += 1
                    else:
                        vocab[word] = 1
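
The excerpt stops at the raw word counts; presumably the vocabulary is then thresholded by min_count and written to vocab_path. A hedged sketch of that continuation (the actual file format expected by load_vocabulary is not shown in these excerpts, and example #3 requires '<pad>' at index 0, so special tokens would be prepended first):

    # Hypothetical continuation: keep words that occur at least min_count times.
    kept_words = [w for w, c in sorted(vocab.items()) if c >= min_count]
    with open(vocab_path, 'w') as fout:
        fout.write('\n'.join(kept_words))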
Code example #6
def main(unused_args):
    # Load model configuration
    cu = CommonUtiler()
    config_path = os.path.join('./model_conf', FLAGS.model_name + '.py')
    config = cu.load_config(config_path)

    # Start model training
    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            intra_op_parallelism_threads=FLAGS.ses_threads)) as session:
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        assert len(config.buckets) >= 1
        assert config.buckets[-1] == config.max_num_steps
        models = []
        with tf.variable_scope("mRNNmodel",
                               reuse=None,
                               initializer=initializer):
            m = mRNNModel(is_training=True,
                          num_steps=config.buckets[0],
                          config=config,
                          model_name=FLAGS.model_name,
                          flag_with_saver=True,
                          model_root=FLAGS.model_root)
            models.append(m)

        with tf.variable_scope("mRNNmodel", reuse=True):
            for bucket in config.buckets[1:]:
                m = mRNNModel(is_training=True,
                              num_steps=bucket,
                              config=config,
                              model_name=FLAGS.model_name,
                              model_root=FLAGS.model_root)
                models.append(m)

        hdlr = logging.FileHandler(os.path.join(m.model_dir, 'log.txt'))
        hdlr.setLevel(logging.INFO)
        hdlr.setFormatter(logging.Formatter(formatter_log))
        logger.addHandler(hdlr)

        if FLAGS.pre_trained_model_path:
            models[0].saver.restore(session, FLAGS.pre_trained_model_path)
            logger.info('Continue to train from %s',
                        FLAGS.pre_trained_model_path)
        else:
            tf.initialize_all_variables().run()

        iters_done = 0
        data_provider = mRNNCocoBucketDataProvider(
            FLAGS.anno_files_path.split(':'), FLAGS.vocab_path,
            config.vocab_size, FLAGS.vf_dir, config.vf_size)
        for i in range(config.num_epoch):
            train_cost, iters_done = run_epoch(session,
                                               iters_done,
                                               config,
                                               models,
                                               data_provider,
                                               verbose=True)
            logger.info("Train cost for epoch %d is %.3f" % (i, train_cost))

        # Save final copy of the model
        models[0].saver.save(
            session, os.path.join(m.variable_dir,
                                  'model_%d.ckpt' % iters_done))
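
One model instance is created per bucket length, all sharing weights through the reused "mRNNmodel" variable scope, so each batch can be run through the graph sized for its sentences. The routing rule is presumably the usual one, sketched here:

# Hypothetical bucket selection: the smallest bucket that fits the sentence.
def select_bucket(sentence_len, buckets):
    for bucket in buckets:
        if sentence_len <= bucket:
            return bucket
    return buckets[-1]  # longer sentences fall into the last (max_num_steps) bucket

# e.g. with buckets = [10, 16, 30], a 13-word sentence uses the 16-step model.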