    def glimpse_net(self, inputs, l_sample):
        """Extract multi-scale glimpses and encode them with the location.

        Args:
            inputs: input images [batch, h, w, c]
            l_sample: sampled glimpse locations in [-1, 1], [batch, 2]
        """
        with tf.name_scope('glimpse_sensor'):
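            # crop self._g_n square patches of increasing size centered at
            # l_sample and resize each back to g_size x g_size; the image is
            # first padded by the radius of the largest patch (max_r) so
            # glimpses near the border stay inside the padded image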
            max_r = int(self._g_size * (2**(self._g_n - 2)))
            inputs_pad = tf.pad(
                inputs, [[0, 0], [max_r, max_r], [max_r, max_r], [0, 0]],
                'CONSTANT')

            # TODO: use the clipped location to compute the log-prob or not?
            l_sample = tf.clip_by_value(l_sample, -1.0, 1.0)

            # re-scale the location so that one unit of l_sample corresponds
            # to self._unit_pixel pixels in the padded image, whose half-size
            # is self._im_size / 2 + max_r (extract_glimpse below takes
            # centered, normalized offsets)
            l_sample_adj = l_sample * 1.0 * self._unit_pixel / (
                self._im_size / 2 + max_r)

            retina_reprsent = []
            for g_id in range(0, self._g_n):
                cur_size = self._g_size * (2**g_id)
                cur_glimpse = tf.image.extract_glimpse(
                    inputs_pad,
                    size=[cur_size, cur_size],
                    offsets=l_sample_adj,
                    centered=True,
                    normalized=True,
                    uniform_noise=True,
                    name='glimpse_sensor',
                )
                cur_glimpse = tf.image.resize_images(
                    cur_glimpse,
                    size=[self._g_size, self._g_size],
                    method=tf.image.ResizeMethod.BILINEAR,
                    align_corners=False,
                )
                retina_reprsent.append(cur_glimpse)
            retina_reprsent = tf.concat(retina_reprsent, axis=-1)
            self.layers['retina_reprsent'].append(retina_reprsent)

        with tf.variable_scope('glimpse_net'):
            # 'what' pathway encodes the retina representation and 'where'
            # pathway encodes the location
            out_dim = 128
            hg = L.Linear(retina_reprsent, out_dim, name='hg', nl=tf.nn.relu)
            hl = L.Linear(l_sample, out_dim, name='hl', nl=tf.nn.relu)

            # combine the two pathways: g = ReLU(Linear(hg) + Linear(hl))
            out_dim = 256
            g = tf.nn.relu(L.Linear(hg, out_dim, 'lhg') +
                           L.Linear(hl, out_dim, 'lhl'),
                           name='g')
            return g
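
    # Shape walk-through (a worked example, not part of the model): with
    # g_size=8, g_n=3, and single-channel input, the sensor crops 8x8,
    # 16x16, and 32x32 patches around the same location, resizes each back
    # to 8x8, and concatenates along channels, so retina_reprsent is
    # [batch, 8, 8, 3] and the returned g is [batch, 256].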

    def core_net(self, inputs_im):
        self.layers['loc_mean'] = []
        self.layers['loc_sample'] = []
        self.layers['rnn_outputs'] = []
        self.layers['retina_reprsent'] = []

        cell_size = 256
        batch_size = tf.shape(inputs_im)[0]

        # the first glimpse location is sampled uniformly from [-1, 1]
        init_loc_mean = tf.ones((batch_size, 2))
        loc_sample = tf.random_uniform((batch_size, 2), minval=-1, maxval=1)
        glimpse_out = self.glimpse_net(inputs_im, loc_sample)

        if self.is_training:
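            # duplicate each image self._n_l_sample times so that multiple
            # location trajectories are sampled per image (Monte Carlo
            # samples for the REINFORCE gradient estimate)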
            inputs_im = tf.tile(inputs_im, [self._n_l_sample, 1, 1, 1])
            glimpse_out = tf.tile(glimpse_out, [self._n_l_sample, 1])
            batch_size = tf.shape(glimpse_out)[0]
            init_loc_mean = tf.tile(init_loc_mean, [self._n_l_sample, 1])
            loc_sample = tf.tile(loc_sample, [self._n_l_sample, 1])

        self.layers['loc_mean'].append(init_loc_mean)
        self.layers['loc_sample'].append(loc_sample)
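
        # loc_mean and loc_sample are stored at every step so that the
        # log-probabilities of the sampled locations can be computed for
        # the REINFORCE loss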

        # RNN of core net
        h_prev = tf.zeros((batch_size, cell_size))
        for step_id in range(0, self._n_step):
            with tf.variable_scope('core_net'):
                h = tf.nn.relu(L.Linear(h_prev, cell_size, 'lh') +
                               L.Linear(glimpse_out, cell_size, 'lg'),
                               name='h')
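
            # h_t = ReLU(Linear(h_{t-1}) + Linear(g_t)); the 'core_net'
            # scope is re-entered every step, so the intent is one set of
            # recurrent weights shared across all time steps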

            # the core net is not trained through the location net
            loc_mean = self.location_net(tf.stop_gradient(h))
            # sample the next location from N(loc_mean, l_std^2); sampling is
            # used at both training and test time, and the sample is not
            # differentiated through (the location net is trained by
            # REINFORCE instead)
            loc_sample = tf.stop_gradient(
                sample_normal_single(loc_mean, stddev=self._l_std))

            glimpse_out = self.glimpse_net(inputs_im, loc_sample)
            action = self.action_net(h)

            # do not store the location of the last step (no glimpse is
            # taken after it)
            if step_id < self._n_step - 1:
                self.layers['loc_mean'].append(loc_mean)
                self.layers['loc_sample'].append(loc_sample)
            self.layers['rnn_outputs'].append(h)

            h_prev = h

        self.layers['class_logists'] = action
        self.layers['prob'] = tf.nn.softmax(logits=action, name='prob')
        self.layers['pred'] = tf.argmax(action, axis=1)
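
    # A minimal sketch of the sampling helper used above, assuming it draws
    # one Gaussian sample per coordinate (the actual helper may differ):
    #
    #   def sample_normal_single(mean, stddev=1.0):
    #       return mean + stddev * tf.random_normal(tf.shape(mean))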

    def _comp_baselines(self):
        with tf.variable_scope('baseline'):
            # the core net is not trained through the baseline loss;
            # the list of per-step outputs is packed into a single
            # [n_step, batch, cell_size] tensor by the conversion inside
            # tf.stop_gradient
            rnn_outputs = tf.stop_gradient(self.layers['rnn_outputs'])
            baselines = []
            # one scalar baseline per step, predicted from the (frozen)
            # RNN state by a single shared linear layer
            for step_id in range(0, self._n_step - 1):
                b = L.Linear(rnn_outputs[step_id], 1, name='baseline')
                b = tf.squeeze(b, axis=-1)
                baselines.append(b)

            baselines = tf.stack(baselines)  # [n_step, b_size]
            baselines = tf.transpose(baselines)  # [b_size, n_step]
            return baselines
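
    # How the baselines are typically used (a sketch, not this repo's loss
    # code): the REINFORCE term for the location net is weighted by
    # (reward - stop_gradient(baseline)), while the baseline itself is
    # regressed toward the reward with an MSE loss; this reduces the
    # variance of the policy-gradient estimate without biasing it.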

    def action_net(self, core_state):
        with tf.variable_scope('act_net'):
            act = L.Linear(core_state, self._n_class, name='act')
            return act

    def location_net(self, core_state):
        with tf.variable_scope('loc_net'):
            l_mean = L.Linear(core_state, 2, name='l_mean')
            # l_mean = tf.tanh(l_mean)
            # clip (rather than tanh-squash) the predicted mean so it stays
            # a valid location in [-1, 1]
            l_mean = tf.clip_by_value(l_mean, -1., 1.)
            return l_mean
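
    # Typical usage (a sketch; anything outside this snippet is an
    # assumption): after self.core_net(images) is called,
    # self.layers['prob'] holds per-class probabilities and
    # self.layers['pred'] the predicted labels, while 'loc_mean',
    # 'loc_sample', and 'rnn_outputs' feed the REINFORCE and baseline
    # losses.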