Example #1
    def beam_search(nodes,
                    graph,
                    beam_size,
                    expand_size=None,
                    compress_mask=False,
                    model=None,
                    max_calc_batch_size=4096):
        """Method to call beam search, given TSP samples and a model
        """

        assert model is not None, "Provide model"

        fixed = model.precompute_fixed(nodes, graph)

        def propose_expansions(beam):
            return model.propose_expansions(
                beam,
                fixed,
                expand_size,
                normalize=True,
                max_calc_batch_size=max_calc_batch_size)

        state = TSP.make_state(
            nodes,
            graph,
            visited_dtype=torch.int64 if compress_mask else torch.uint8)

        return beam_search(state, beam_size, propose_expansions)
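This wrapper, like several of the examples below, delegates to a framework-level `beam_search(state, beam_size, propose_expansions)` driver defined elsewhere in the repository and not shown here. As a rough, self-contained sketch of the contract such a driver fulfils (the `.score`/`.finished` interface is an assumption for illustration, not the repository's actual API):

    import heapq

    def toy_beam_search(initial_state, beam_size, propose_expansions):
        """Minimal sketch of a generic beam-search driver (hypothetical API).

        Assumes `propose_expansions(state)` yields (log_prob, next_state)
        pairs and that states carry `.score` and `.finished` attributes; the
        actual driver batches states as tensors instead.
        """
        beam, finished = [initial_state], []
        while beam:
            candidates = [(s.score + lp, nxt)
                          for s in beam
                          for lp, nxt in propose_expansions(s)]
            beam = []
            # keep only the beam_size best-scoring partial solutions
            for score, nxt in heapq.nlargest(beam_size, candidates,
                                             key=lambda c: c[0]):
                nxt.score = score
                (finished if nxt.finished else beam).append(nxt)
        return max(finished, key=lambda s: s.score, default=None)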
Example #2
    def beam_search(input,
                    beam_size,
                    expand_size=None,
                    compress_mask=False,
                    model=None,
                    max_calc_batch_size=4096):

        assert model is not None, "Provide model"

        fixed = model.precompute_fixed(input)

        state = PDP.make_state(
            input,
            # alternative: visited_dtype=torch.int64 if compress_mask else torch.bool
            visited_dtype=torch.int64 if compress_mask else torch.uint8)

        def propose_expansions(beam):
            return model.propose_expansions(
                beam,
                fixed,
                expand_size,
                normalize=True,
                max_calc_batch_size=max_calc_batch_size)

        return beam_search(state, beam_size, propose_expansions)
Example #3
    def test(self):
        p = ProgressBar()
        f = open('./captions.txt', 'w')
        for i_ in p(range(0, len(self.test_data), self.batchsize)):
            data = np.zeros((self.batchsize, self.in_channel,
                             self.input_height, self.input_width),
                            dtype=np.float32)
            t2 = np.zeros(
                (self.batchsize, self.input_height, self.input_width),
                dtype=np.int32)
            label = []
            first_words = np.zeros((self.batchsize), dtype=np.int32)
            for j in xrange(self.batchsize):
                image = self.image_hash[self.test_data[i_ + j][0]]
                image = google_prepare(image)
                data[j, :, :, :] = image
                label.append(self.test_data[i_ + j][1])
                first_words[j] = self.test_data[i_ + j][1][0]

            generated_sentence = []
            data = Variable(cuda.to_gpu(data))
            state = {
                name: Variable(
                    self.xp.zeros((data.shape[0], 1024),
                                  dtype=self.xp.float32))
                for name in ('c1', 'h1')
            }
            h = self.enc(data, train=False, test=True)

            ### first LSTM ###
            state, _ = self.dec(h, state, train=False, test=True, image=True)
            ### input <SOS> ###
            state, y = self.dec(Variable(cuda.to_gpu(first_words)),
                                state,
                                train=False,
                                test=True)

            generated_sentence_beamed = beam_search(self.dec, state, y, data,
                                                    20, self.mydict_inv)

            # maximum sentence length is 50
            for i in xrange(50):
                y = Variable(
                    self.xp.array(np.argmax(y.data.get(),
                                            axis=1)).astype(self.xp.int32))
                state, y = self.dec(y, state, train=False, test=True)
                generated_sentence.append(y.data)

            for b in range(self.batchsize):
                f.write(str(self.test_data[i_ + b][0]) + '/')
                # GT caption
                for i in range(1, len(label[b]) - 1):
                    index = label[b][i]
                    f.write(self.mydict_inv[index] + ' ')
                f.write("/")

                # Predicted caption
                for i, predicted_word in enumerate(generated_sentence):
                    index = cuda.to_cpu(predicted_word.argmax(1))[b]
                    if self.mydict_inv[index] == '<EOS>':
                        break
                    f.write(self.mydict_inv[index] + ' ')
                f.write("/")

                # beamed caption
                for i in range(len(generated_sentence_beamed[b])):
                    index = generated_sentence_beamed[b][i]
                    if self.mydict_inv[index] == '<EOS>':
                        break
                    f.write(self.mydict_inv[index] + ' ')
                f.write("\n")

        f.close()
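The `beam_search(self.dec, state, y, data, 20, self.mydict_inv)` helper used above is not shown. For intuition, here is a minimal NumPy sketch of beam search over a stepwise decoder; the single-hypothesis `step(token_id, state)` interface is an assumption for illustration (the real Chainer decoder is batched and runs on GPU):

    import numpy as np

    def toy_step_beam_search(step, init_state, first_log_probs, beam_size,
                             eos_id, max_len=50):
        """Sketch of beam search over a stepwise decoder (hypothetical API).

        Assumes `step(token_id, state) -> (new_state, log_probs)` scores one
        hypothesis at a time.
        """
        # each hypothesis: (cumulative log-prob, token ids, decoder state)
        beams = [(first_log_probs[t], [int(t)], init_state)
                 for t in np.argsort(first_log_probs)[-beam_size:]]
        done = []
        for _ in range(max_len):
            candidates = []
            for score, toks, state in beams:
                if toks[-1] == eos_id:
                    done.append((score, toks))  # hypothesis already ended
                    continue
                new_state, log_probs = step(toks[-1], state)
                for t in np.argsort(log_probs)[-beam_size:]:
                    candidates.append((score + log_probs[t], toks + [int(t)],
                                       new_state))
            if not candidates:
                break
            candidates.sort(key=lambda c: c[0], reverse=True)
            beams = candidates[:beam_size]
        done.extend((score, toks) for score, toks, _ in beams)
        return max(done, key=lambda d: d[0])[1] if done else []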
Example #4
    def testTPUBeam(self):
        batch_size = 1
        beam_size = 2
        vocab_size = 3
        decode_length = 3

        initial_ids = tf.constant([0] * batch_size)  # GO
        probabilities = tf.constant([[[0.1, 0.1, 0.8], [0.1, 0.1, 0.8]],
                                     [[0.4, 0.5, 0.1], [0.2, 0.4, 0.4]],
                                     [[0.05, 0.9, 0.05], [0.4, 0.4, 0.2]]])

        # The top beam is always selected, so we should see the top beam's state
        # at each position, i.e. the one that's getting 3 added to it each step.
        expected_states = tf.constant([[[0.], [0.]], [[3.], [3.]], [[6.], [6.]]])

        def symbols_to_logits(_, i, states, kv_encdecs):  # pylint: disable=unused-argument
            # We have to assert the values of state inline here since we can't fetch
            # them out of the loop!
            with tf.control_dependencies(
                [tf.assert_equal(states["state"], expected_states[i])]):
                logits = tf.to_float(tf.log(probabilities[i, :]))

            states["state"] += tf.constant([[3.], [7.]])
            return logits, states

        states = {
            "state": tf.zeros((batch_size, 1)),
        }
        states["state"] = tf.placeholder_with_default(states["state"],
                                                      shape=(None, 1))

        final_ids, _ = beam_search.beam_search(symbols_to_logits,
                                               initial_ids,
                                               beam_size,
                                               decode_length,
                                               vocab_size,
                                               3.5,
                                               eos_id=1,
                                               states=states)

        with self.test_session() as sess:
            # Catch and fail so that the testing framework doesn't think it's an error
            try:
                sess.run(final_ids)
            except tf.errors.InvalidArgumentError as e:
                raise AssertionError(e.message)
        self.assertAllEqual([[[0, 2, 0, 1], [0, 2, 1, 0]]], final_ids)
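The expected output makes sense once the length penalty is taken into account: with `alpha=3.5` the longer hypothesis `[0, 2, 0, 1]` outranks the shorter `[0, 2, 1]`, which hits EOS (`eos_id=1`) one step earlier and is padded with 0. A back-of-the-envelope check, assuming the GNMT-style penalty `((5 + len) / 6) ** alpha` that tensor2tensor's beam search uses (token lengths counted after the GO symbol; this is a sketch of the scoring, not the exact implementation):

    import math

    # joint probabilities of the two surviving hypotheses (tokens after GO)
    probs = {
        (2, 0, 1): 0.8 * 0.4 * 0.9,  # ends in EOS (=1) one step later
        (2, 1):    0.8 * 0.5,        # ends in EOS immediately
    }
    for alpha in (0.0, 3.5):
        scores = {seq: math.log(p) / (((5 + len(seq)) / 6.0) ** alpha)
                  for seq, p in probs.items()}
        print(alpha, max(scores, key=scores.get), scores)
    # alpha = 0 favours the shorter beam; alpha = 3.5 flips the order,
    # matching the expected [[0, 2, 0, 1], [0, 2, 1, 0]] above.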
Example #5
def main(args):
    checkpoint_path = os.path.join("saved/", args.name, args.checkpoint)
    checkpoint = torch.load(checkpoint_path)
    config = checkpoint['config']

    #if args.task.lower() == 'caption':
    embedder = eval(config['embedder']['type'])
    embedder_path = os.path.join("saved/", args.name, "embedder.pkl")
    data_loader = CaptionDataLoader(config,
                                    embedder,
                                    mode='test',
                                    path=args.data_dir,
                                    embedder_path=embedder_path)

    model = Seq2Seq(config, embedder=data_loader.embedder)
    model.load_state_dict(checkpoint['state_dict'])
    if not args.no_cuda:
        model.cuda()
    model.eval()
    model.summary()

    result = []
    for batch_idx, (in_seq, id) in enumerate(data_loader):
        in_seq = torch.FloatTensor(in_seq)
        in_seq = Variable(in_seq)
        if not args.no_cuda:
            in_seq = in_seq.cuda()
        if args.beam_size == 1:
            out_seq = model(in_seq, 24)
            out_seq = np.array([seq.data.cpu().numpy() for seq in out_seq])
            out_seq = np.transpose(out_seq, (1, 0, 2))
            out_seq = data_loader.embedder.decode_lines(out_seq)
        else:
            out_seq = beam_search(model,
                                  data_loader.embedder,
                                  in_seq,
                                  seq_len=24,
                                  beam_size=args.beam_size)
            out_seq = data_loader.embedder.decode_lines(out_seq)

        out_seq = [(str(id[0]), out_seq)]
        result.extend(out_seq)

    with open(args.output, 'w') as f:
        for video_id, caption in result:
            caption = postprocess(caption)
            f.write(video_id + ',' + caption + '\n')
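In the greedy branch above, the model's per-step outputs are stacked step-major, so `np.transpose(out_seq, (1, 0, 2))` flips them to batch-major before decoding. A quick shape check (the batch and embedding sizes are made up for illustration):

    import numpy as np

    steps = [np.zeros((8, 300), dtype=np.float32) for _ in range(24)]
    out = np.array(steps)               # (24, 8, 300): one row per decode step
    out = np.transpose(out, (1, 0, 2))  # (8, 24, 300): one row per batch item
    print(out.shape)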
Example #6
    def beam_search(input, beam_size, expand_size=None,
                    compress_mask=False, model=None, max_calc_batch_size=4096):

        assert model is not None, "Provide model"

        fixed = model.precompute_fixed(input)

        def propose_expansions(beam):
            return model.propose_expansions(
                beam, fixed, expand_size, normalize=True, max_calc_batch_size=max_calc_batch_size
            )

        # With beam search we always consider the deterministic case
        state = PCTSPDet.make_state(
            input, visited_dtype=torch.int64 if compress_mask else torch.uint8
        )

        return beam_search(state, beam_size, propose_expansions)
Example #7
    def beam_search(input,
                    beam_size,
                    expand_size=None,
                    compress_mask=False,
                    model=None,
                    max_calc_batch_size=4096):
        assert model is not None, "Provide model"
        assert not compress_mask, "SDVRP does not support compression of the mask"

        fixed = model.precompute_fixed(input)

        def propose_expansions(beam):
            return model.propose_expansions(
                beam,
                fixed,
                expand_size,
                normalize=True,
                max_calc_batch_size=max_calc_batch_size)

        state = SDVRP.make_state(input)

        return beam_search(state, beam_size, propose_expansions)
Example #8
def main(params):
    if params.input == 'GOT':
        corpus_path = "/home/luoyy/datasets_small/got"
        data_raw = data_.got_read(corpus_path)
        data, labels_arr, embed_arr, data_dict = data_.prepare_data(data_raw,
                                                                    params)
    elif params.input == 'PTB':
        # data in form [data, labels]
        train_data_raw, valid_data_raw, test_data_raw = data_.ptb_read(
            './PTB_DATA/data')
        data, labels_arr, embed_arr, data_dict = data_.prepare_data(
            train_data_raw, params)
    with tf.Graph().as_default() as graph:
        inputs = tf.placeholder(shape=[None, None], dtype=tf.int32)
        d_inputs_ps = tf.placeholder(dtype=tf.int32, shape=[None, None])
        labels = tf.placeholder(shape=[None, None], dtype=tf.int32)
        with tf.device("/cpu:0"):
            if not params.pre_trained_embed:
                embedding = tf.get_variable(
                    "embedding", [data_dict.vocab_size,
                                  params.embed_size], dtype=tf.float32)
                vect_inputs = tf.nn.embedding_lookup(embedding, inputs)
            else:
                # [data_dict.vocab_size, params.embed_size]
                embedding = tf.Variable(
                    embed_arr,
                    trainable=params.fine_tune_embed,
                    name="embedding", dtype=tf.float32)
                vect_inputs = tf.nn.embedding_lookup(embedding, inputs)
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        vocab_size = data_dict.vocab_size
        seq_length = tf.placeholder_with_default([0.0], shape=[None])
        d_seq_length = tf.placeholder(shape=[None], dtype=tf.float32)
        qz = q_net(vect_inputs, seq_length, params.batch_size)
        x_logits, _, _ = vae_lstm({'z': qz}, params.batch_size,
                                  d_seq_length, embedding,
                                  d_inputs_ps, vocab_size=vocab_size)
        # loss, masking <PAD>
        current_len = tf.placeholder_with_default(params.sent_max_size,
                                                  shape=())
        # tf.sequence_mask, tf.contrib.seq2seq.sequence_loss
        labels_flat = tf.reshape(labels, [-1])
        cross_entr = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=x_logits, labels=labels_flat)
        mask_labels = tf.sign(tf.to_float(labels_flat))
        masked_losses = mask_labels * cross_entr
        # reshape again
        masked_losses = tf.reshape(masked_losses, tf.shape(labels))
        mean_loss_by_example = tf.reduce_sum(masked_losses,
                                             reduction_indices=1) / d_seq_length
        rec_loss = tf.reduce_mean(mean_loss_by_example)
        perplexity = tf.exp(rec_loss)
        # kl divergence calculation
        kld = -0.5 * tf.reduce_mean(
                tf.reduce_sum(
                    1 + tf.log(tf.square(qz.distribution.std) + 0.0001)
                    - tf.square(qz.distribution.mean)
                    - tf.square(qz.distribution.std), 1))
        tf.summary.scalar('kl_divergence', kld)
        # kld weight annealing
        anneal = tf.placeholder(tf.int32)
        annealing = (tf.tanh((tf.to_float(anneal) - 3500)/1000) + 1)/2
        # overall loss: reconstruction loss + annealed KL regularization
        lower_bound = rec_loss + tf.multiply(
            tf.to_float(annealing), tf.to_float(kld)) / 10
        #lower_bound = rec_loss
        sm2 = [tf.summary.scalar('lower_bound', lower_bound),
               tf.summary.scalar('kld_coeff', annealing)]
        gradients = tf.gradients(lower_bound, tf.trainable_variables())
        opt = tf.train.AdamOptimizer(learning_rate=params.learning_rate)
        clipped_grad, _ = tf.clip_by_global_norm(gradients, 5)
        optimize = opt.apply_gradients(zip(clipped_grad,
                                           tf.trainable_variables()))
        #sample
        logits, states, smpl = vae_lstm({}, 1, d_seq_length, embedding,
                                        d_inputs_ps, vocab_size=vocab_size,
                                        gen_mode=True)
        init_state = states[0]
        fin_output = states[1]
        # merge summaries
        merged = tf.summary.merge_all()
        with tf.Session() as sess:
            sess.run([tf.global_variables_initializer(),
                      tf.local_variables_initializer()])
            if params.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
            summary_writer = tf.summary.FileWriter(params.LOG_DIR, sess.graph)
            summary_writer.add_graph(sess.graph)
            #ptb_data = PTBInput(params.batch_size, train_data)
            num_iters = len(data) // params.batch_size
            cur_it = 0
            iters, kld_arr, coeff = [], [], []
            for e in range(params.num_epochs):
                for it in range(num_iters):
                    params.is_training = True
                    batch = data[it * params.batch_size: (it + 1) * params.batch_size]
                    l_batch = labels_arr[it * params.batch_size:(it + 1) * params.batch_size]
                    # zero padding
                    pad = len(max(batch, key=len))
                    # not optimal!!
                    length_ = np.array([len(sent) for sent in batch]).reshape(params.batch_size)
                    # prepare encoder and decoder inputs to feed
                    batch = np.array([sent + [0] * (pad - len(sent)) for sent in batch])
                    l_batch = np.array([(sent + [0] * (pad - len(sent))) for sent in l_batch])
                    # encoder feed=[....<EOS>], decoder feed=[<BOS>....], labels=[.....<EOS>]
                    feed = {inputs: l_batch, d_inputs_ps: batch, labels: l_batch,
                            seq_length: length_, d_seq_length: length_, anneal: cur_it, current_len: pad}
                    lb, _, kld_, ann_, r_loss, perplexity_ = sess.run([lower_bound, optimize,
                                                                       kld, annealing, rec_loss, perplexity],
                                                                      feed_dict=feed)
                    cur_it += 1
                    iters.append(cur_it)
                    kld_arr.append(kld_)
                    coeff.append(ann_)
                    if cur_it % 100 == 0 and cur_it != 0:
                        print("VLB after {} ({}) iterations (epoch): {} KLD: "
                              "{} Annealing Coeff: {} CE: {}".format(
                                  cur_it, e, lb, kld_, ann_, r_loss))
                        print("Perplexity: {}".format(perplexity_))
                    if cur_it % 150 == 0:
                        if not params.beam_search:
                            params.is_training = False
                            online_inference(sess, data_dict,
                                             sample=smpl, seq=d_inputs_ps,
                                             in_state=init_state,
                                             out_state=fin_output,
                                             length=d_seq_length)
                        else:
                            gen_sentence = beam_search(sess, data_dict, states,
                                                       smpl, (d_inputs_ps,
                                                        d_seq_length), params,
                                                       beam_size=params.beam_size)
                            print(gen_sentence)
                    if cur_it % 400 == 0 and cur_it != 0:
                       # saver = tf.train.Saver()
                        summary = sess.run(merged, feed_dict=feed)
                        summary_writer.add_summary(summary)
                        # saver.save(sess, os.path.join(params.LOG_DIR, "lstmlstm_model.ckpt"), cur_it)
                    if params.visualise:
                        if cur_it % 30000 == 0 and cur_it != 0:
                            import matplotlib.pyplot as plt
                            with open("./run_kld" + str(params.dec_keep_rate), 'w') as wf:
                                _ = [wf.write(str(s) + ' ') for s in iters]
                                wf.write('\n')
                                _ = [wf.write(str(s) + ' ') for s in kld_arr]
                                wf.write('\n')
                                _ = [wf.write(str(s) + ' ') for s in coeff]
                            plt.plot(iters, kld_arr, label='KLD')
                            plt.xlabel('Iterations')
                            plt.legend(bbox_to_anchor=(1.05, 1),
                                       loc=1, borderaxespad=0.)
                            plt.show()
                            plt.plot(iters, coeff, 'r--', label='annealing')
                            plt.legend(bbox_to_anchor=(1.05, 1),
                                       loc=1, borderaxespad=0.)
                            plt.show()
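The KL weight in this example follows a tanh warm-up schedule, `(tanh((it - 3500) / 1000) + 1) / 2`, so the KL term is essentially switched off for the first couple of thousand iterations and nearly fully on after about 7000. A quick numerical check of that schedule:

    import numpy as np

    def kld_weight(it):
        return (np.tanh((it - 3500) / 1000.0) + 1) / 2

    for it in (0, 2000, 3500, 5000, 7000):
        print(it, round(float(kld_weight(it)), 4))
    # 0 0.0009, 2000 0.0474, 3500 0.5, 5000 0.9526, 7000 0.9991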
Example #9
    def generate(self, source, source_mask, k, max_len):
        return beam_search(self, source, source_mask, self.params["target_vocab"], k, max_len)
Example #10
def fast_decode_tpu(encoder_output,
                    symbols_to_logits_fn,
                    hparams,
                    decode_length,
                    vocab_size,
                    beam_size,
                    top_beams=1,
                    alpha=1.0,
                    sos_id=0,
                    eos_id=beam_search.EOS_ID,
                    batch_size=None,
                    scope_prefix="body/"):
    """Given encoder output and a symbols to logits function, does fast decoding.

  Implements beam search decoding for TPU.

  Args:
    encoder_output: A tensor, output from encoder.
    symbols_to_logits_fn: Incremental decoding, function mapping triple
      `(ids, step, cache)` to symbol logits.
    hparams: Run hyperparameters.
    decode_length: An integer, how many additional timesteps to decode.
    vocab_size: Output vocabulary size.
    beam_size: An integer, number of beams.
    top_beams: An integer, how many of the beams to return.
    alpha: A float that controls the length penalty. The larger the alpha, the
      stronger the preference for longer translations.
    sos_id: Start-of-sequence symbol.
    eos_id: End-of-sequence symbol.
    batch_size: An integer, must be passed if there is no input.
    scope_prefix: str, prefix for decoder layer variable scopes.

  Returns:
    A dict of decoding results {
        "outputs": integer `Tensor` of decoded ids of shape
            [batch_size, top_beams, <= decode_length]
        "scores": decoding log probs from the beam search.
    }.

  Raises:
    NotImplementedError: If beam size > 1 with partial targets.
  """
    if encoder_output is not None:
        batch_size = common_layers.shape_list(encoder_output)[0]

    key_channels = hparams.attention_key_channels or hparams.hidden_size
    value_channels = hparams.attention_value_channels or hparams.hidden_size
    num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers

    cache = {
        "layer_%d" % layer: {
            "k":
            tf.zeros([
                batch_size, hparams.num_heads,
                key_channels // hparams.num_heads, decode_length
            ],
                     dtype=encoder_output.dtype),
            "v":
            tf.zeros([
                batch_size, hparams.num_heads,
                value_channels // hparams.num_heads, decode_length
            ],
                     dtype=encoder_output.dtype),
        }
        for layer in range(num_layers)
    }

    kv_encdecs = {"layer_%d" % layer: {} for layer in range(num_layers)}
    if encoder_output is not None:
        for layer in range(num_layers):
            layer_name = "layer_%d" % layer
            with tf.variable_scope(
                    "%sdecoder/%s/encdec_attention/multihead_attention" %
                (scope_prefix, layer_name)):
                k_encdec = common_attention.compute_attention_component(
                    encoder_output, key_channels, hparams.num_heads, name="k")
                k_encdec = beam_search.merge_beam_dim(
                    beam_search.expand_to_beam_size(k_encdec, beam_size))
                v_encdec = common_attention.compute_attention_component(
                    encoder_output,
                    value_channels,
                    hparams.num_heads,
                    name="v")
                v_encdec = beam_search.merge_beam_dim(
                    beam_search.expand_to_beam_size(v_encdec, beam_size))
            kv_encdecs[layer_name]["k_encdec"] = k_encdec
            kv_encdecs[layer_name]["v_encdec"] = v_encdec

    initial_ids = sos_id * tf.ones([batch_size], dtype=tf.int32)
    decoded_ids, scores = beam_search.beam_search(symbols_to_logits_fn,
                                                  initial_ids,
                                                  beam_size,
                                                  decode_length,
                                                  vocab_size,
                                                  alpha,
                                                  states=cache,
                                                  kv_encdecs=kv_encdecs,
                                                  eos_id=eos_id,
                                                  stop_early=(top_beams == 1))

    if top_beams == 1:
        decoded_ids = decoded_ids[:, 0, 1:]
        scores = scores[:, 0]
    else:
        decoded_ids = decoded_ids[:, :top_beams, 1:]
        scores = scores[:, :top_beams]

    return {"outputs": decoded_ids, "scores": scores}
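The encoder-decoder keys and values above are computed once, tiled across beams with `expand_to_beam_size`, and flattened with `merge_beam_dim` so the decoder can treat `batch * beam` as one large batch. A NumPy sketch of what those two helpers do to the shapes (shapes only; this is not the tensor2tensor implementation):

    import numpy as np

    def expand_to_beam_size(t, beam_size):
        # [batch, ...] -> [batch, beam_size, ...] by tiling
        return np.tile(t[:, None], (1, beam_size) + (1,) * (t.ndim - 1))

    def merge_beam_dim(t):
        # [batch, beam_size, ...] -> [batch * beam_size, ...]
        return t.reshape((-1,) + t.shape[2:])

    k = np.zeros((2, 5, 8))  # e.g. [batch=2, length=5, depth=8]
    print(expand_to_beam_size(k, 4).shape)                  # (2, 4, 5, 8)
    print(merge_beam_dim(expand_to_beam_size(k, 4)).shape)  # (8, 5, 8)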
Example #11
    def _beam_decode(self,
                     features,
                     decode_length,
                     beam_size,
                     top_beams,
                     last_position_only,
                     alpha,
                     ensemble_num=1):
        """Beam search decoding.

    Args:
      features: a map of string to `Tensor`.
      decode_length: an integer.  How many additional timesteps to decode.
      beam_size: number of beams.
      top_beams: an integer. How many of the beams to return.
      last_position_only: a boolean, speed-up by computing last position only.
      alpha: A float that controls the length penalty. The larger the alpha,
        the stronger the preference for longer translations.

    Returns:
       samples: an integer `Tensor`. Top samples from the beam search
    """
        tf.logging.info('we use this beam_search')
        target_modality = self._hparams.problems[
            self._problem_idx].target_modality
        vocab_size = 84000  #target_modality.top_dimensionality

        def symbols_to_logits_fn(ids):
            """Go from ids to logits."""
            ids = tf.expand_dims(tf.expand_dims(ids, axis=2), axis=3)
            ids = tf.pad(ids[:, 1:], [[0, 0], [0, 1], [0, 0], [0, 0]])

            features["targets"] = ids
            self._coverage = None
            logits = tf.zeros([1, 1, 1, 1, vocab_size], dtype=tf.float32)
            for i in range(ensemble_num):
                tf.logging.info('the %dth model_fn' % (i + 1))
                #with tf.variable_scope("graph_%d" % (i+1)):
                sharded_logits, _, _ = self.model_fn(
                    features,
                    False,
                    last_position_only=last_position_only,
                    hparams=self._hparams_list[i],
                    num=i)
                # now self._coverage is a coverage tensor for the first datashard.
                # it has shape [batch_size] and contains floats between 0 and
                # source_length.
                logits += sharded_logits[0]  # Assuming we have one shard.
            logits /= ensemble_num
            if last_position_only:
                return tf.squeeze(logits, axis=[1, 2, 3])
            current_output_position = tf.shape(
                ids)[1] - 1  # -1 due to the pad above.
            logits = logits[:, current_output_position, :, :]
            return tf.squeeze(logits, axis=[1, 2])

        batch_size = tf.shape(features["inputs"])[0]
        initial_ids = tf.zeros([batch_size], dtype=tf.int32)

        inputs_old = features["inputs"]
        features["inputs"] = tf.expand_dims(features["inputs"], 1)
        if len(features["inputs"].shape) < 5:
            features["inputs"] = tf.expand_dims(features["inputs"], 4)
        # Expand the inputs in to the beam size.
        features["inputs"] = tf.tile(features["inputs"],
                                     [1, beam_size, 1, 1, 1])
        s = tf.shape(features["inputs"])
        features["inputs"] = tf.reshape(features["inputs"],
                                        [s[0] * s[1], s[2], s[3], s[4]])

        #print('the inputs of feature in beam_search is :', tf.shape(features["inputs"])[3])
        #target_modality = self._hparams.problems[self._problem_idx].target_modality
        #vocab_size = target_modality.top_dimensionality
        # Setting decode length to input length + decode_length
        decode_length = tf.shape(
            features["inputs"])[1] + tf.constant(decode_length)
        ids, scores = beam_search.beam_search(symbols_to_logits_fn,
                                              initial_ids, beam_size,
                                              decode_length, vocab_size, alpha)

        # Set inputs back to the unexpanded inputs to not to confuse the Estimator!
        features["inputs"] = inputs_old

        # Return `top_beams` decodings (also remove initial id from the beam search)
        return_scores = True  # TODO(lukaszkaiser): make it work multi-problem.
        if top_beams == 1:
            if return_scores:
                return {"outputs": ids[:, 0, 1:], "scores": scores}
            return ids[:, 0, 1:]
        else:
            if return_scores:
                return {"outputs": ids[:, :top_beams, 1:], "scores": scores}
            return ids[:, :top_beams, 1:]
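Note that the ensemble in `symbols_to_logits_fn` averages raw logits across models (`logits += sharded_logits[0]; logits /= ensemble_num`) rather than averaging probabilities; the two are not equivalent. A small illustration of the difference, with made-up logits for a 3-token vocabulary:

    import numpy as np

    def softmax(x):
        e = np.exp(x - x.max())
        return e / e.sum()

    a = np.array([2.0, 0.0, -1.0])  # model 1 logits (made-up numbers)
    b = np.array([0.0, 3.0, -1.0])  # model 2 logits

    print(np.round(softmax((a + b) / 2), 3))           # logit-space average, as above
    print(np.round((softmax(a) + softmax(b)) / 2, 3))  # probability-space average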