Example #1
 def model(self, reuse=False):
     with tf.variable_scope("G", reuse=reuse):
         z = tf.random_uniform([self.batch_size, 1024], minval=-1.0, maxval=1.0)
         
         fc1 = fc(z, 1024, 7*7*256, bn=True, activation_fn=tf.nn.relu, scope="fc1")
         fc1 = tf.reshape(fc1, [-1, 7, 7, 256])
         
         convt1 = convt(fc1,
                        kernel=[3, 3, 256, 256],
                        stride=[1, 2, 2, 1],
                        output=[self.batch_size, 14, 14, 256],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt1")
         """
         convt2 = convt(convt1,
                        kernel=[3, 3, 256, 256],
                        stride=[1, 1, 1, 1],
                        output=[self.batch_size, 14, 14, 256],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt2")
         """
         convt3 = convt(convt1,
                        kernel=[3, 3, 256, 256],
                        stride=[1, 2, 2, 1],
                        output=[self.batch_size, 28, 28, 256],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt3")
         """
         convt4 = convt(convt3,
                        kernel=[3, 3, 256, 256],
                        stride=[1, 1, 1, 1],
                        output=[self.batch_size, 28, 28, 256],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt4")
         """
         convt5 = convt(convt3,
                        kernel=[3, 3, 128, 256],
                        stride=[1, 2, 2, 1],
                        output=[self.batch_size, 56, 56, 128],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt5")
         """
         convt6 = convt(convt5,
                        kernel=[3, 3, 64, 128],
                        stride=[1, 1, 1, 1],
                        output=[self.batch_size, 56, 56, 64],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt6")
         convt7 = convt(convt6,
                        kernel=[3, 3, 3, 64],
                        stride=[1, 1, 1, 1],
                        output=[self.batch_size, 56, 56, 3],
                        activation_fn=tf.nn.tanh,
                        scope="convt7")
         
         """
         
         convt6 = convt(convt5,
                        kernel=[3, 3, 64, 128],
                        stride=[1, 2, 2, 1],
                        output=[self.batch_size, 112, 112, 64],
                        bn=True,
                        activation_fn=tf.nn.relu,
                        scope="convt6")
         convt7 = convt(convt6,
                        kernel=[3, 3, 3, 64],
                        stride=[1, 1, 1, 1],
                        output=[self.batch_size, 112, 112, 3],
                        activation_fn=tf.nn.tanh,
                        scope="convt7")
         
         return convt7
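The generator above relies on project-specific helpers fc and convt that are not shown. A minimal sketch of what they might look like, assuming a plain fully connected layer and a transposed convolution with optional batch norm (the original project's initializers and batch-norm handling may differ):

import tensorflow as tf

def fc(x, in_dim, out_dim, bn=False, activation_fn=None, scope="fc"):
    # fully connected layer: x [B, in_dim] -> [B, out_dim]
    with tf.variable_scope(scope):
        w = tf.get_variable("w", [in_dim, out_dim],
                            initializer=tf.random_normal_initializer(stddev=0.02))
        b = tf.get_variable("b", [out_dim], initializer=tf.zeros_initializer())
        out = tf.matmul(x, w) + b
        if bn:
            out = tf.layers.batch_normalization(out, training=True)
        return activation_fn(out) if activation_fn is not None else out

def convt(x, kernel, stride, output, bn=False, activation_fn=None, scope="convt"):
    # transposed convolution; kernel is [height, width, out_channels, in_channels]
    with tf.variable_scope(scope):
        w = tf.get_variable("w", kernel,
                            initializer=tf.random_normal_initializer(stddev=0.02))
        out = tf.nn.conv2d_transpose(x, w, output_shape=output,
                                     strides=stride, padding="SAME")
        if bn:
            out = tf.layers.batch_normalization(out, training=True)
        return activation_fn(out) if activation_fn is not None else out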
Example #2
    def build_graph(self, ph_ob):
        ob = ph_ob[-1]
        assert len(ob.shape.as_list()) == 4  #B, H, W, C
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob.shape.as_list()[1:3] + [1])

        ob_norm = ob[:, :, :, -1:]
        ob_norm = tf.cast(ob_norm, tf.float32)
        ob_norm = tf.clip_by_value(
            (ob_norm - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Random target network
        xr = tf.nn.leaky_relu(
            conv(ob_norm,
                 "c1r",
                 nf=self.convfeat * 1,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2)))
        xr = tf.nn.leaky_relu(
            conv(xr,
                 'c2r',
                 nf=self.convfeat * 2 * 1,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2)))
        xr = tf.nn.leaky_relu(
            conv(xr,
                 'c3r',
                 nf=self.convfeat * 2 * 1,
                 rf=3,
                 stride=1,
                 init_scale=np.sqrt(2)))
        rgbr = [to2d(xr)]
        X_r = fc(rgbr[0], 'fc1r', nh=self.rep_size, init_scale=np.sqrt(2))

        # Predictor network
        xrp = tf.nn.leaky_relu(
            conv(ob_norm,
                 'c1rp_pred',
                 nf=self.convfeat,
                 rf=8,
                 stride=4,
                 init_scale=np.sqrt(2)))
        xrp = tf.nn.leaky_relu(
            conv(xrp,
                 'c2rp_pred',
                 nf=self.convfeat * 2,
                 rf=4,
                 stride=2,
                 init_scale=np.sqrt(2)))
        xrp = tf.nn.leaky_relu(
            conv(xrp,
                 'c3rp_pred',
                 nf=self.convfeat * 2,
                 rf=3,
                 stride=1,
                 init_scale=np.sqrt(2)))
        rgbrp = to2d(xrp)

        X_r_hat = tf.nn.relu(
            fc(rgbrp,
               'fc1r_hat1_pred',
               nh=256 * self.enlargement,
               init_scale=np.sqrt(2)))
        X_r_hat = tf.nn.relu(
            fc(X_r_hat,
               'fc1r_hat2_pred',
               nh=256 * self.enlargement,
               init_scale=np.sqrt(2)))
        X_r_hat = fc(X_r_hat,
                     'fc1r_hat3_pred',
                     nh=self.rep_size,
                     init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)

        targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)

        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
        self._predictor = U.function([ob], [self.int_rew])
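This snippet (and several of the ones below) uses a to2d helper that is not shown. A minimal sketch, assuming it simply flattens everything except the batch dimension, as in the OpenAI RND code these examples resemble:

import numpy as np
import tensorflow as tf

def to2d(x):
    # flatten all non-batch dimensions: [B, d1, d2, ...] -> [B, d1*d2*...]
    size = int(np.prod(x.get_shape().as_list()[1:]))
    return tf.reshape(x, (-1, size))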
Example #3
    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False,
                 activ_fcn='relu6'):  # pylint: disable=W0613
        # this method is called with nbatch = nenvs*nsteps

        # nh, nw, nc = ob_space.shape
        # ob_shape = (nbatch, nh, nw, nc)
        # actdim = ac_space.shape[0]
        # Todo check initialization
        # Input and Output dimensions
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        with tf.variable_scope("model", reuse=reuse):
            if activ_fcn == 'relu6':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'elu':
                h1 = tf.nn.elu(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.elu(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'mixed':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))
            else:
                raise ValueError("Unknown activation function: %s" % activ_fcn)

            # The [nenvs * nsteps, h_units] output of layer 2 is reshaped to
            # [nenvs, nsteps, h_units] so the GRU can process each environment's trace.
            rnn_cell = tf.contrib.rnn.GRUCell(num_units=units_per_hlayer[1])
            rnn_input = tf.reshape(h2, shape=[nenvs, nsteps, units_per_hlayer[1]])
            rnn_state_in = rnn_cell.zero_state(batch_size=nenvs,
                                               dtype=tf.float32)  # reset the state in every training iteration
            rnn_output, rnn_state_out = tf.nn.dynamic_rnn(inputs=rnn_input,
                                                          cell=rnn_cell,
                                                          initial_state=rnn_state_in,
                                                          dtype=tf.float32,
                                                          scope="model" + '_rnn')
            # The output of the recurrent cell then needs to be reshaped to the original matrix shape.
            rnn_output = tf.reshape(rnn_output, shape=[-1, units_per_hlayer[1]])

            if activ_fcn == 'relu6':
                activ = tf.nn.relu6
            elif activ_fcn == 'elu':
                activ = tf.nn.elu
            elif activ_fcn == 'mixed':
                activ = tf.nn.tanh
            else:
                raise ValueError("Unknown activation function: %s" % activ_fcn)
            h3 = activ(fc(rnn_output, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)

            vf = fc(rnn_output, 'vf', 1)[:, 0]  # predicted value of input state

        self.pd = CategoricalPd(pi_logit)  # pdparam
        a0 = self.pd.sample()  # returns action index: 0,1
        # a0 = tf.argmax(pi_logit, axis=1)
        neglogp0 = self.pd.neglogp(a0)

        # The GRU state is a single hidden vector per environment (unlike an LSTM, there is no separate cell state).
        self.initial_state = np.zeros([nenvs, units_per_hlayer[1]])

        def step(ob, r_state, *_args, **_kwargs):
            a, pi, v, r_state_out, neglogp = sess.run([a0, pi_logit, vf, rnn_state_out, neglogp0], {X: ob, rnn_state_in: r_state})
            return a, pi, v, r_state_out, neglogp

        def value(ob, r_state,  *_args, **_kwargs):
            return sess.run(vf, {X: ob, rnn_state_in: r_state})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.rnn_state_in = rnn_state_in
        self.rnn_state_out = rnn_state_out
        self.step = step
        self.value = value
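A small self-contained check (NumPy, not part of the original) of the reshape convention used above: the flat [nenvs * nsteps, h] batch is assumed to be environment-major, so reshaping to [nenvs, nsteps, h] groups each environment's trace for the GRU, and the inverse reshape after the RNN restores the flat layout.

import numpy as np

nenvs, nsteps, h = 2, 3, 4
flat = np.arange(nenvs * nsteps * h).reshape(nenvs * nsteps, h)  # env-major rows
seq = flat.reshape(nenvs, nsteps, h)   # layout fed to dynamic_rnn
restored = seq.reshape(-1, h)          # flat layout recovered after the GRU
assert np.array_equal(flat, restored)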
Example #4
    def define_action_balance_rew(self, units, rep_size):
        logger.info(
            "Using Action Balance BONUS ****************************************************"
        )
        # (s, a) seen frequency as bonus
        with tf.variable_scope('action_balance', reuse=tf.AUTO_REUSE):
            ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
            assert ac_one_hot.get_shape().ndims == 3
            assert ac_one_hot.get_shape().as_list() == [
                None, None, self.ac_space.n
            ], ac_one_hot.get_shape().as_list()
            ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

            def cond(x):
                return tf.concat([x, ac_one_hot], 1)

            # Random target network.
            for ph in self.ph_ob.values():
                if len(ph.shape.as_list()) == 3:  # B,T,S
                    logger.info(
                        "Mlp Target: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
                    xr = ph[:, :-1]
                    xr = tf.cast(xr, tf.float32)
                    xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                    xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std,
                                          -5.0, 5.0)

                    xr = tf.nn.relu(
                        fc(cond(xr),
                           'fc_sa0_r',
                           nh=units,
                           init_scale=np.sqrt(2)))
                    xr = tf.nn.relu(
                        fc(cond(xr),
                           'fc_sa1_r',
                           nh=units,
                           init_scale=np.sqrt(2)))
                    X_r = fc(cond(xr),
                             'fc_sa2_r',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

            # Predictor network.
            for ph in self.ph_ob.values():
                if len(ph.shape.as_list()) == 3:  # B,T,S
                    logger.info(
                        "Mlp Target: using '%s' shape %s as image input" %
                        (ph.name, str(ph.shape)))
                    xrp = ph[:, :-1]
                    xrp = tf.cast(xrp, tf.float32)
                    xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                    xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                           -5.0, 5.0)

                    xrp = tf.nn.relu(
                        fc(cond(xrp),
                           'fc_sa0_pred',
                           nh=units * 2,
                           init_scale=np.sqrt(2)))
                    xrp = tf.nn.relu(
                        fc(cond(xrp),
                           'fc_sa1_pred',
                           nh=units * 2,
                           init_scale=np.sqrt(2)))
                    X_r_hat = fc(cond(xrp),
                                 'fc_sa2_pred',
                                 nh=rep_size,
                                 init_scale=np.sqrt(2))

        self.feat_var_ab = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat_ab = tf.reduce_max(tf.abs(X_r))
        self.int_rew_ab = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew_ab = tf.reshape(self.int_rew_ab,
                                     (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss_ab = tf.reduce_mean(tf.square(noisy_targets - X_r_hat),
                                          -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss_ab),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss_ab = tf.reduce_sum(mask * self.aux_loss_ab) / tf.maximum(
            tf.reduce_sum(mask), 1.)
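The last few lines implement a pattern that recurs in most of these bonus definitions: only a random fraction of samples (proportion_of_exp_used_for_predictor_update) contributes to the predictor loss, and the sum is renormalized by the number of kept samples. A hedged NumPy restatement of just that step:

import numpy as np

def masked_predictor_loss(per_sample_loss, keep_prob, rng=np.random):
    # keep each sample with probability keep_prob, then average over the kept ones
    mask = (rng.uniform(size=per_sample_loss.shape) < keep_prob).astype(np.float32)
    return float((mask * per_sample_loss).sum() / max(mask.sum(), 1.0))

losses = np.array([0.3, 0.1, 0.7, 0.2], dtype=np.float32)
print(masked_predictor_loss(losses, keep_prob=0.25))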
Example #5
    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        #Dynamics loss with random features.

        activ = tf.nn.relu
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]  # next observations (index 1 onwards)
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = activ(fc(xr, 'fc_0_r', nh=32, init_scale=np.sqrt(2)))
                xr = activ(fc(xr, 'fc_1_r', nh=32, init_scale=np.sqrt(2)))
                X_r = fc(xr, 'fc_2_r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = activ(fc(xrp, 'fc_0_pred', nh=32, init_scale=np.sqrt(2)))
                xrp = activ(fc(xrp, 'fc_1_pred', nh=32, init_scale=np.sqrt(2)))
                X_r_hat = fc(xrp,
                             'fc_2r_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
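For reference, a NumPy illustration (not part of the original) of the (s, a) input that the cond helper above builds: each flattened observation row gets the one-hot encoding of its action appended before it goes into the predictor.

import numpy as np

n_actions = 4
obs = np.random.randn(6, 8).astype(np.float32)        # 6 flattened (env, step) rows
acts = np.array([0, 2, 1, 3, 2, 0])
one_hot = np.eye(n_actions, dtype=np.float32)[acts]   # like tf.one_hot + reshape
sa = np.concatenate([obs, one_hot], axis=1)           # shape (6, 12)
print(sa.shape)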
    def apply_policy(ph_ob, ph_new, ph_istate, reuse, scope, hidsize, memsize,
                     extrahid, sy_nenvs, sy_nsteps, pdparamsize,
                     rec_gate_init):
        data_format = 'NHWC'
        ph = ph_ob
        assert len(ph.shape.as_list()) == 5  # B,T,H,W,C
        logger.info("CnnGruPolicy: using '%s' shape %s as image input" %
                    (ph.name, str(ph.shape)))
        X = tf.cast(ph, tf.float32) / 255.
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device('/gpu:0' if yes_gpu else '/cpu:0'):
            X = activ(
                conv(X,
                     'c1',
                     nf=32,
                     rf=8,
                     stride=4,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c2',
                     nf=64,
                     rf=4,
                     stride=2,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = activ(
                conv(X,
                     'c3',
                     nf=64,
                     rf=4,
                     stride=1,
                     init_scale=np.sqrt(2),
                     data_format=data_format))
            X = to2d(X)
            X = activ(fc(X, 'fc1', nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(GRUCell(memsize,
                                                 rec_gate_init=rec_gate_init),
                                         (X, ph_new[:, :, None]),
                                         dtype=tf.float32,
                                         time_major=False,
                                         initial_state=ph_istate)
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, 'fc2val', nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, 'fc2act', nh=memsize, init_scale=0.1))
            pdparam = fc(X, 'pd', nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, 'vf_int', nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, 'vf_ext', nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
    def define_dynamics_prediction_rew(self, convfeat, rep_size, enlargement):
        #Dynamics based bonus.

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        ac_one_hot = tf.one_hot(self.ph_ac, self.ac_space.n, axis=2)
        assert ac_one_hot.get_shape().ndims == 3
        assert ac_one_hot.get_shape().as_list() == [
            None, None, self.ac_space.n
        ], ac_one_hot.get_shape().as_list()
        ac_one_hot = tf.reshape(ac_one_hot, (-1, self.ac_space.n))

        def cond(x):
            return tf.concat([x, ac_one_hot], 1)

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, :-1]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))
                # ph_mean, ph_std are 84x84x1, so we subtract the average of the last channel from all channels. Is this ok?
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)

                # X_r_hat = tf.nn.relu(fc(rgb[0], 'fc1r_hat1', nh=256 * enlargement, init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(rgbrp),
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(cond(X_r_hat),
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(cond(X_r_hat),
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
 def get_pdparam(self, x):
     pdparam = fc(x, name='pd', units=self.pdparamsize, activation=None)
     vpred = fc(x, name='value_function_output', units=1, activation=None)
     return pdparam, vpred
Example #9
    def __init__(self, config, input_, is_training=True):

        batch_size = input_.batch_size
        num_steps = input_.num_steps  # for truncated backprop.
        lstm_width = config.hidden_size  # size of hidden units
        in_shape = input_.input_data.get_shape()

        initializer = tf.random_uniform_initializer(-0.08,
                                                    0.08,
                                                    dtype=tf.float32)

        enc_cell = utils.LSTM(size=lstm_width, init=initializer)
        dec_cell = utils.LSTM(size=lstm_width, init=initializer)

        # Encoder
        # ( fc > elu > lstm )
        enc_state = enc_cell.zero_state(batch_size, tf.float32)
        enc_states = []

        with tf.variable_scope("enc"):
            for i in range(num_steps):
                if i > 0: tf.get_variable_scope().reuse_variables()

                enc_inputs = input_.input_data[:, i, :]

                # 2d -> lstm width
                enc_cell_in = utils.fc(enc_inputs,
                                       enc_inputs.get_shape()[-1],
                                       lstm_width,
                                       init_w=initializer,
                                       a_fn=tf.nn.elu)
                (enc_cell_out, enc_state) = enc_cell(enc_cell_in, enc_state)

                enc_states.append(enc_state)

        # for test
        # self.enc_final_state = enc_state

        # Decoder
        # ( fc > elu > lstm > v^t tanh(W1 e + W2 d) > softmax > argmax )
        dec_state = enc_states[-1]
        dec_inputs = tf.constant(0.0, shape=[batch_size, 2],
                                 dtype=tf.float32)  # start symbol

        self.C_prob = []
        self.C_idx = []
        with tf.variable_scope("dec"):
            for i in range(num_steps):
                if i > 0: tf.get_variable_scope().reuse_variables()

                dec_cell_in = utils.fc(dec_inputs,
                                       dec_inputs.get_shape()[-1],
                                       lstm_width,
                                       init_w=initializer,
                                       a_fn=tf.nn.elu)

                (dec_cell_out, dec_state) = dec_cell(dec_cell_in, dec_state)

                # W1, W2 are square matrices (SxS)
                # where S is the size of the hidden state
                W1 = tf.get_variable("W1", [lstm_width, lstm_width],
                                     dtype=tf.float32,
                                     initializer=initializer)
                W2 = tf.get_variable("W2", [lstm_width, lstm_width],
                                     dtype=tf.float32,
                                     initializer=initializer)
                # v is a vector (S)
                v = tf.get_variable("v", [lstm_width],
                                    dtype=tf.float32,
                                    initializer=initializer)

                # W2 (SxS) d_i (S) = W2d (S)
                W2d = tf.matmul(dec_state.h, W2)

                # u_i (n)
                u_i = []

                for j in range(num_steps):
                    # W1 (SxS) e_j (S) = W1e (S)
                    # t = tanh(W1e + W2d) (S)
                    t = tf.tanh(tf.matmul(enc_states[j].h, W1) + W2d)
                    # v^T (S) t (S) = U_ij (1)
                    u_ij = tf.reduce_sum(v * t,
                                         axis=1)  # because t is actually B x S

                    u_i.append(u_ij)

                u_i = tf.stack(u_i, axis=1)  # asarray

                probs = tf.nn.softmax(u_i)

                C_i = tf.reshape(tf.cast(tf.argmax(probs, axis=1), tf.int32),
                                 shape=[batch_size, 1])

                self.C_idx.append(C_i)

                first = tf.expand_dims(tf.range(batch_size), axis=1)
                dec_inputs = tf.gather_nd(
                    input_.input_data, tf.concat(values=[first, C_i], axis=1))

                self.C_prob.append(probs)

        self.C_prob = tf.squeeze(tf.stack(self.C_prob, axis=1))
        self.C_idx = tf.squeeze(tf.stack(self.C_idx, axis=1))

        targets = tf.one_hot(input_.targets, depth=51)

        self.loss = tf.nn.l2_loss(targets - self.C_prob)

        opt = tf.train.AdadeltaOptimizer(learning_rate=0.001,
                                         rho=0.95,
                                         epsilon=1e-6)

        self.train_op = opt.minimize(self.loss)
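The decoder above scores each encoder state j with u_ij = v^T tanh(W1 e_j + W2 d_i) and points at the input with the highest softmax weight. A small NumPy sketch of that scoring step (dimensions and names are illustrative, not the original utils):

import numpy as np

S, n = 5, 4                           # hidden size, number of encoder steps
rng = np.random.RandomState(0)
enc = rng.randn(n, S)                 # encoder hidden states e_j
dec = rng.randn(S)                    # current decoder hidden state d_i
W1, W2 = rng.randn(S, S), rng.randn(S, S)
v = rng.randn(S)

u = np.array([v @ np.tanh(W1 @ enc[j] + W2 @ dec) for j in range(n)])
probs = np.exp(u - u.max()); probs /= probs.sum()   # softmax over encoder steps
pointer = int(np.argmax(probs))                     # index fed back as the next decoder input
print(pointer, probs)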
    def define_rew_discriminator_v2(self, convfeat, rep_size, use_rew=False):

        output_shape = [self.sy_nenvs * (self.sy_nsteps - 1)]

        sample_prob = tf.reshape(self.sample_agent_prob,
                                 tf.stack(output_shape))
        game_score = tf.reshape(
            self.game_score,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        rew_agent_label = tf.reshape(
            self.rew_agent_label,
            tf.stack([self.sy_nenvs * (self.sy_nsteps - 1), 1]))

        #rew_agent_label = tf.one_hot(self.rew_agent_label, self.num_agents, axis=-1)
        #rew_agent_label = tf.reshape(rew_agent_label,(-1,self.num_agents ))

        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C

                phi = ph[:, 1:]
                phi = tf.cast(phi, tf.float32)
                phi = tf.reshape(phi, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                phi = phi / 255.

                last_rew_ob = self.last_rew_ob
                last_rew_ob = tf.cast(last_rew_ob, tf.float32)
                last_rew_ob = tf.reshape(
                    last_rew_ob,
                    (-1, *last_rew_ob.shape.as_list()[-3:]))[:, :, :, -1:]
                last_rew_ob = last_rew_ob / 255.

                if use_rew:
                    phi = tf.concat([phi, last_rew_ob], axis=-1)

                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                #[20,20] [8,8]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                #[9,9] [7,7]
                phi = tf.nn.leaky_relu(
                    conv(phi,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                phi = to2d(phi)

                phi = tf.nn.relu(
                    fc(phi, 'fc1r', nh=rep_size, init_scale=np.sqrt(2)))
                phi = tf.nn.relu(
                    fc(phi, 'fc2r', nh=rep_size, init_scale=np.sqrt(2)))
                disc_logits = fc(phi,
                                 'fc3r',
                                 nh=self.num_agents,
                                 init_scale=np.sqrt(2))

        one_hot_gidx = tf.one_hot(self.ph_agent_idx, self.num_agents, axis=-1)
        one_hot_gidx = tf.reshape(one_hot_gidx, (-1, self.num_agents))

        flatten_all_div_prob = tf.nn.softmax(disc_logits, axis=-1)
        all_div_prob = tf.reshape(
            flatten_all_div_prob,
            (self.sy_nenvs, self.sy_nsteps - 1, self.num_agents))

        sp_prob = tf.reduce_sum(one_hot_gidx * flatten_all_div_prob, axis=1)
        sp_prob = tf.reshape(sp_prob, (self.sy_nenvs, self.sy_nsteps - 1))

        div_rew = -1 * tf.nn.softmax_cross_entropy_with_logits_v2(
            logits=disc_logits, labels=one_hot_gidx)
        base_rew = tf.log(0.01)
        div_rew = div_rew - tf.log(sample_prob)

        div_rew = tf.reshape(div_rew, (self.sy_nenvs, self.sy_nsteps - 1))

        disc_pdtype = CategoricalPdType(self.num_agents)
        disc_pd = disc_pdtype.pdfromflat(disc_logits)

        disc_nlp = disc_pd.neglogp(rew_agent_label)

        return disc_logits, all_div_prob, sp_prob, div_rew, disc_pd, disc_nlp
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 n_env,
                 n_steps,
                 reuse,
                 n_lstm=256,
                 scope="policy"):
        super(ErrorPredRnnPolicy,
              self).__init__(ob_space, ac_space, hidsize, ob_mean, ob_std,
                             feat_dim, layernormalize, nl, n_env, n_steps,
                             reuse, n_lstm, scope)
        with tf.variable_scope(scope):
            self.flat_masks_ph = tf.reshape(self.masks_ph,
                                            [self.n_env * self.n_steps])
            self.pred_error = tf.placeholder(
                dtype=tf.float32,
                shape=(self.n_env, self.n_steps, self.hidsize),
                name='pred_error')  # prediction error
            self.flat_pred_error = flatten_two_dims(self.pred_error)

            self.obs_pred = tf.placeholder(dtype=tf.float32,
                                           shape=(self.n_env, self.n_steps,
                                                  self.hidsize),
                                           name='obs_pred')
            self.flat_obs_pred = flatten_two_dims(self.obs_pred)

            with tf.variable_scope(scope, reuse=self.reuse):
                x = tf.concat([
                    self.flat_features, self.flat_obs_pred,
                    self.flat_pred_error
                ],
                              axis=1)

                input_sequence = batch_to_seq(x, self.n_env, self.n_steps)
                masks = batch_to_seq(self.masks_ph, self.n_env, self.n_steps)
                rnn_output, self.snew = lstm(input_sequence,
                                             masks,
                                             self.states_ph,
                                             'lstm1',
                                             n_hidden=n_lstm,
                                             layer_norm=False)
                rnn_output = seq_to_batch(rnn_output)
                rnn_output = layernorm(rnn_output)

                ## Concat
                q = self.flat_features
                q = tf.concat([q, rnn_output], axis=1)
                q = fc(q, units=hidsize, activation=activ, name="fc1")
                q = fc(q, units=hidsize, activation=activ, name="fc2")

                pdparam, vpred = self.get_pdparam(q)
            self.pdparam = pdparam = unflatten_first_dim(pdparam, self.sh)
            self.vpred = unflatten_first_dim(vpred, self.sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.proba_distribution_from_flat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
    def __init__(self, input_states, taken_actions,
                 num_actions,scope_name,  shared_network = True, layer_norm = True):
        """
            env:
                RL environment
            input_states [batch_size, obs_size]:
                Input state vectors to predict actions for
            taken_actions [batch_size, 1]:
                Actions taken by the old policy (used for training)
            num_actions (int):
                Number of discrete actions
            scope_name (string):
                scope name (i.e. policy or policy_old)
            shared_network (bool):
                Whether Actor and critic share part of network
            layer_norm(bool): 
                perform layer_norm
        """

        with tf.variable_scope(scope_name):
            # construct mlp networks
            self.policy_latent = mlp(num_layers = 2, num_hidden = 128, activation = tf.nn.relu, layer_norm = layer_norm)(input_states)
            '''
            layer = tf.layers.flatten(input_states)
            for i in range(2):
                layer = tf.layers.dense(layer, 128, activation = None, kernel_initializer = ortho_init(np.sqrt(2.0)), bias_initializer = tf.constant_initializer(0.0), name = "mlp_fc{}".format(i))
                if layer_norm:
                    layer = tf.contrib.layers.layer_norm(layer, center = True, scale = True)
                layer = tf.nn.relu(layer)
            self.policy_latent = layer
            '''
            if shared_network:
                self.value_latent = self.policy_latent
            else:
                self.value_latent = mlp(num_layers = 2, num_hidden =128, activation = tf.nn.relu, layer_norm = layer_norm)(input_states)
                '''
                v_layer = tf.layers.flatten(input_states)
                for i in range(2):
                    v_layer = tf.layers.dense(v_layer, 128, activation = None, kernel_initializer = ortho_init(np.sqrt(2.0)), bias_initializer = tf.constant_initializer(0.0),name = "mlp_fc{}".format(i))
                    if layer_norm:
                        v_layer = tf.contrib.layers.layer_norm(v_layer, center = True, scale = True)
                    v_layer = tf.nn.relu(v_layer)
                
                self.value_latent = v_layer
                '''
            # Additional flatten layers (may be redundant if the latents are already 2-D)
            self.value_latent = tf.layers.flatten(self.value_latent)
            self.policy_latent = tf.layers.flatten(self.policy_latent)
        
            # ============================   Policy Branch Pi(a_t | s_t; theta)
            # create graph for sampling actions
            # latent vector (batch_size, 128) --> fc --> logits (batch_size, num_actions) --> softmax --> action probabilities
            self.pdtype = CategoricalPdType(num_actions)

            self.pd, self.pi = self.pdtype.pdfromlatent(self.policy_latent, init_scale = 0.01)

            # Take an action from policy's distribution
            self.action = self.pd.sample()

            # ============================   Value Branch V(s_t; theta)
            # Note fc has no activation
            # Shape: [Batch_Size, 1]
            self.value = fc(self.value_latent,'v',1) 
            #self.value = tf.layers.dense(self.value_latent, 1, activation = None, kernel_initializer = ortho_init(np.sqrt(2.0)), bias_initializer = tf.constant_initializer(0.0),name = 'v')
            
            # Shape: [Batch_Size]
            self.value = self.value[:,0]

            # check numericals
            self.pi = tf.check_numerics(self.pi, "Invalid value for self.pi")
            self.value = tf.check_numerics(self.value, "Invalid value for self.value")
    def define_bottleneck_rew(self,
                              convfeat,
                              rep_size,
                              enlargement,
                              beta=1e-2,
                              rew_counter=None):
        logger.info(
            "Using Curiosity Bottleneck ****************************************************"
        )
        v_target = tf.reshape(self.ph_ret_ext, (-1, 1))

        if rew_counter is None:
            sched_coef = 1.
        else:
            sched_coef = tf.minimum(rew_counter / 1000, 1.)

        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                mu = fc(rgbr[0], 'fc_mu', nh=rep_size, init_scale=np.sqrt(2))
                sigma = tf.nn.softplus(
                    fc(rgbr[0], 'fc_sigma', nh=rep_size,
                       init_scale=np.sqrt(2)))
                z = mu + sigma * tf.random_normal(
                    tf.shape(mu), 0, 1, dtype=tf.float32)
                v = fc(z, 'value', nh=1, init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(sigma)
        self.max_feat = tf.reduce_max(tf.abs(z))

        self.kl = 0.5 * tf.reduce_sum(tf.square(mu) + tf.square(sigma) -
                                      tf.log(1e-8 + tf.square(sigma)) - 1,
                                      axis=-1,
                                      keep_dims=True)
        self.int_rew = tf.stop_gradient(self.kl)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        self.aux_loss = sched_coef * tf.square(v_target - v) + beta * self.kl
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
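The KL term above is the closed-form KL divergence between the encoder's diagonal Gaussian N(mu, sigma^2) and a standard normal prior, summed over feature dimensions. A quick NumPy restatement (not part of the original) of the same formula:

import numpy as np

def kl_to_standard_normal(mu, sigma, eps=1e-8):
    # 0.5 * sum_i (mu_i^2 + sigma_i^2 - log(sigma_i^2) - 1)
    return 0.5 * np.sum(np.square(mu) + np.square(sigma)
                        - np.log(eps + np.square(sigma)) - 1.0, axis=-1)

mu, sigma = np.zeros(4), np.ones(4)
print(kl_to_standard_normal(mu, sigma))  # ~0 when the posterior matches the prior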
Example #14
    def __init__(self, sess, ob_space, loc_space, ac_space, nbatch, nsteps, max_timesteps, reuse=False, seed=0):
        nenv = nbatch // nsteps
        self.pdtype = make_pdtype(ac_space)
        with tf.variable_scope("model", reuse=reuse):
            G = tf.placeholder(tf.float32, [nbatch, max_timesteps, loc_space])
            X = tf.placeholder(tf.float32, (nbatch, )+ob_space.shape)
            Y = tf.placeholder(tf.float32, [nbatch, loc_space])
            M = tf.placeholder(tf.float32, [nbatch])
            S = tf.placeholder(tf.float32, [nenv, 128])
            ys = batch_to_seq(Y, nenv, nsteps)
            ms = batch_to_seq(M, nenv, nsteps)

            tf.set_random_seed(seed)
            self.embed_W = tf.get_variable("embed_w", [loc_space, 64], initializer=ortho_init(1.0, seed))
            self.embed_b = tf.get_variable("embed_b", [64,])
            self.wa = tf.get_variable("wa", [128, 128], initializer=ortho_init(1.0, seed))
            self.wb = tf.get_variable("wb", [128,])
            self.ua = tf.get_variable("ua", [128, 128], initializer=ortho_init(1.0, seed))
            self.ub = tf.get_variable("ub", [128,])
            self.va = tf.get_variable("va", [128])
            self.rnn = tf.nn.rnn_cell.GRUCell(128, kernel_initializer=ortho_init(1.0, seed))
            enc_hidden = tf.zeros((nbatch, 128))
            embed_G = tf.matmul(tf.reshape(G, (-1, loc_space)),self.embed_W)+self.embed_b
            embed_G = tf.reshape(embed_G, (nbatch, max_timesteps, -1))
            enc_output, _ = tf.nn.dynamic_rnn(cell=self.rnn, inputs=embed_G, dtype=tf.float32)
            gs = batch_to_seq(enc_output, nenv, nsteps)
            dec_hidden = S
            h = []
            for idx, (y, m, g) in enumerate(zip(ys, ms, gs)):
                dec_hidden = dec_hidden*(1-m)
                embed_y = tf.matmul(y,self.embed_W)+self.embed_b
                dec_output, dec_hidden = tf.nn.dynamic_rnn(cell=self.rnn, inputs=tf.expand_dims(embed_y,axis=1), initial_state=dec_hidden)

                tmp = tf.reshape(tf.matmul(tf.reshape(g, (-1, 128)), self.ua)+self.ub,(nenv, max_timesteps, 128))
                tmp = tf.tanh(tf.expand_dims(tf.matmul(dec_hidden, self.wa)+self.wb,axis=1) + tmp)
                score = tf.reduce_sum(tmp*tf.expand_dims(tf.expand_dims(self.va, axis=0), axis=1), axis=2, keepdims=True)
                attention_weights = tf.nn.softmax(score, axis=1)
                context_vector = attention_weights * g
                context_vector = tf.reduce_sum(context_vector, axis=1)
                x = tf.concat([context_vector, dec_hidden], axis=-1)
                h.append(x)
            h = seq_to_batch(h)
            vf = fc(h, 'v', 1, seed=seed)[:,0]
            self.pd, self.pi = self.pdtype.pdfromlatent(h, seed=seed, init_scale=0.01)
        a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = np.zeros((nenv,128))

        def step(ob, loc, goal, state, mask):
            a, v, state, neglogp = sess.run([a0, vf, dec_hidden, neglogp0], {X:ob, Y:loc, G:goal, M:mask, S:state})
            return a, v, state, neglogp

        def value(ob, loc, goal, state, mask):
            return sess.run(vf, {X:ob, Y:loc, G:goal, M:mask, S:state})

        self.G = G
        self.X = X
        self.Y = Y
        self.S = S
        self.M = M
        self.vf = vf
        self.step = step
        self.value = value
Example #15
    def define_self_prediction_rew(self, width, rep_size, enlargement):
        # RND.
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B, Envs, Features
                logger.info(
                    f"FFNNTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    fc(
                        xr,
                        "fc1r",
                        nh=width * 1,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    fc(
                        xr,
                        "fc2r",
                        nh=width * 2 * 1,
                        init_scale=np.sqrt(2),
                    ))
                xr = tf.nn.leaky_relu(
                    fc(
                        xr,
                        "fc3r",
                        nh=width * 2 * 1,
                        init_scale=np.sqrt(2),
                    ))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], "fc4r", nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,Envs,Features
                logger.info(
                    f"FFNNTarget: using '{ph.name}' shape {ph.shape} as image input"
                )
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    fc(
                        xrp,
                        "fc1rp_pred",
                        nh=width,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    fc(
                        xrp,
                        "fc2rp_pred",
                        nh=width * 2,
                        init_scale=np.sqrt(2),
                    ))
                xrp = tf.nn.leaky_relu(
                    fc(
                        xrp,
                        "fc3rp_pred",
                        nh=width * 2,
                        init_scale=np.sqrt(2),
                    ))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(
                        rgbrp,
                        "fc1r_hat1_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = tf.nn.relu(
                    fc(
                        X_r_hat,
                        "fc1r_hat2_pred",
                        nh=256 * enlargement,
                        init_scale=np.sqrt(2),
                    ))
                X_r_hat = fc(
                    X_r_hat,
                    "fc1r_hat3_pred",
                    nh=rep_size,
                    init_scale=np.sqrt(2),
                )

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True,
        )
        self.int_rew = tf.reshape(
            self.int_rew,
            (self.sy_nenvs, self.sy_nsteps - 1),
        )

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(
            shape=tf.shape(self.aux_loss),
            minval=0.0,
            maxval=1.0,
            dtype=tf.float32,
        )
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.0)
 def get_pdparam(self, features, reuse):
     with tf.variable_scope(self.scope, reuse=False):
         x = fc(features, units=self.hidsize, activation=activ, reuse=True)
         x = fc(x, units=self.hidsize, activation=activ, reuse=True)
         pdparam = fc(x, name='pd', units=self.pdparamsize, activation=None, reuse=reuse)
     return pdparam
    Acc = []
    max_epoch = 20
    mini_batch = 100
    for epoch_num in range(max_epoch):
        idxs = np.random.permutation(train_size)
        for k in range(math.ceil(train_size / mini_batch)):
            start_idx = k * mini_batch
            end_idx = min((k + 1) * mini_batch, train_size)

            a, z, delta = {}, {}, {}
            batch_indices = idxs[start_idx:end_idx]
            a[1] = X_train[:, batch_indices]
            y = trainLabels[:, batch_indices]

            for l in range(1, L):
                a[l + 1], z[l + 1] = fc(w[l], a[l])

            delta[L] = (a[L] - y) * (a[L] * (1 - a[L]))
            print(delta[L])

            for l in range(L - 1, 1, -1):
                delta[l] = bc(w[l], z[l], delta[l + 1])

            for l in range(1, L):
                grad_w = np.dot(delta[l + 1], a[l].T)
                w[l] = w[l] - alpha * grad_w

            J.append(cost(a[L], y) / mini_batch)
            Acc.append(accuracy(a[L], y))

        a[1] = X_test
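This fragment uses fc, bc, cost and accuracy helpers that are not shown. A minimal sketch that is consistent with the update rule above (sigmoid activations and a squared-error cost; the original helpers may differ), with samples stored column-wise:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fc(w, a):
    # forward step: z = w a, activation a_next = sigmoid(z)
    z = np.dot(w, a)
    return sigmoid(z), z

def bc(w, z, delta_next):
    # backward step: propagate delta through w and the sigmoid derivative
    return np.dot(w.T, delta_next) * sigmoid(z) * (1.0 - sigmoid(z))

def cost(a, y):
    # squared-error cost matching delta[L] = (a - y) * a * (1 - a)
    return 0.5 * np.sum((a - y) ** 2)

def accuracy(a, y):
    # fraction of columns where the arg-max prediction matches the label
    return float(np.mean(np.argmax(a, axis=0) == np.argmax(y, axis=0)))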
Example #18
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=True):  #pylint: disable=W0613
        ob_shape = (nbatch, ) + ob_space.shape
        actdim = ac_space.shape[0]

        window_length = ob_space.shape[1] - 1

        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  #obs

        #         with tf.variable_scope("model", reuse=reuse) as scope:

        #             # policy
        #             w0 = tf.slice(X, [0,0,0,0],[-1,-1,1,1], name='pi_sl0')
        #             x = tf.slice(X, [0,0,1,0],[-1,-1,-1,-1], name='pi_sl1')
        #             x = conv(tf.cast(x, tf.float32),'c1', fh=1,fw=4,nf=3, stride=1, init_scale=np.sqrt(2))
        #             # x = tf.layers.conv2d(
        #             #     inputs=x,
        #             #     filters=3,
        #             #     kernel_size=[1, 4],
        #             #     padding="valid",
        #             #     activation=tf.nn.relu)
        #             #(1, 3, 47, 3)

        #             x = conv(x, 'c2', fh=1, fw=window_length -3, nf=20, stride= window_length -3, init_scale=np.sqrt(2))
        #             # x = tf.layers.conv2d(
        #             #     inputs=x,
        #             #     filters=20,
        #             #     kernel_size=[1, window_length -3],
        #             #     padding="valid",
        #             #     strides=(1, window_length -3),
        #             #     activation=tf.nn.relu)

        #             x = tf.concat([x, w0], 3)

        #             x = conv(x, 'c3', fh=1, fw=1, nf=1, stride= 1, init_scale=np.sqrt(2))
        #             # x = tf.layers.conv2d(
        #             #     inputs=x,
        #             #     filters=1,
        #             #     kernel_size=[1, 1],
        #             #     padding="valid",
        #             #     strides=(1, 1),
        #             #     activation=tf.nn.relu)

        #             cash_bias = tf.zeros([x.shape[0],1,1,1], tf.float32)
        #             c = tf.concat([cash_bias, x], 1)

        #             v = conv_to_fc(x)

        #             # vf = fc(v, 'v',1)[:,0]

        #             f = tf.contrib.layers.flatten(c)
        #             eps = 10e20
        #             f = tf.clip_by_value(f, -eps, eps, 'clip1')
        #             # f = tf.Print(f, [f], "concatenate")
        #             pi = tf.nn.softmax(f)
        #             # pi = tf.Print(pi,[pi], 'pi ')

        #             # f = tf.nn.relu(f)
        #             vf = fc(v, 'v',1, act=tf.nn.relu)[:,0]

        #             # vf = tf.add(tf.ones(v.shape), v)

        #             # vf = fc(v, 'v',1)[:,0]

        #             # vf = tf.add(vf, tf.ones(vf.shape, tf.float32))

        #             logstd = tf.get_variable(name="logstd", shape=[1, actdim],
        #                 initializer=tf.zeros_initializer())
        #             eps = 80
        #             logstd = tf.clip_by_value(logstd, -eps, eps, 'clip_logstd')
        #             # logstd = tf.Print(logstd,[logstd], 'logstd ')
        with tf.variable_scope("model", reuse=reuse) as scope:
            w0 = tf.slice(X, [0, 0, 0, 0], [-1, -1, 1, 1])
            x = tf.slice(X, [0, 0, 1, 0], [-1, -1, -1, -1])

            # reuse when testing

            x = conv(tf.cast(x, tf.float32),
                     'c1',
                     fh=1,
                     fw=3,
                     nf=3,
                     stride=1,
                     init_scale=np.sqrt(2))

            x = conv(x,
                     'c2',
                     fh=1,
                     fw=window_length - 2,
                     nf=20,
                     stride=window_length - 2,
                     init_scale=np.sqrt(2))

            x = tf.concat([x, w0], 3)

            x = conv(x,
                     'c3',
                     fh=1,
                     fw=1,
                     nf=1,
                     stride=1,
                     init_scale=np.sqrt(2))

            cash_bias = tf.ones([x.shape[0], 1, 1, 1], tf.float32)
            c = tf.concat([cash_bias, x], 1)

            v = conv_to_fc(x)
            vf = fc(v, 'v', 1)[:, 0]

            f = tf.contrib.layers.flatten(c)

            pi = tf.nn.softmax(f)

            logstd = tf.get_variable(
                name="logstd",
                shape=[1, actdim],
                initializer=tf.truncated_normal_initializer())
            # logstd = tf.Print(logstd,[logstd], 'logstd ')
            eps = 50
            # logstd = tf.clip_by_value(logstd, -eps, eps, 'clip_logstd')

        pdparam = tf.concat([pi, pi * 0.0 + logstd], axis=1)

        self.pdtype = make_pdtype(ac_space)
        self.pd = self.pdtype.pdfromflat(pdparam)

        a0 = self.pd.sample()
        # a0 = tf.clip_by_value(a0, -eps, eps, 'clip2')
        a0 = tf.nn.softmax(a0)

        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, v, neglogp, lst, p = sess.run([a0, vf, neglogp0, logstd, pi],
                                             {X: ob})

            # print ("logstd: "+ str(lst[0]))

            # print ("action: " + str(a))
            # print ("value: {}".format(v))
            # print ("neglogp: "+ str(neglogp))
            # print ("f:{}".format(f))
            return a, v, self.initial_state, neglogp, lst[0], p

        def value(ob, *_args, **_kwargs):
            v = sess.run(vf, {X: ob})
            # print ("vf: " + str(v))
            return v

        self.X = X
        self.pi = pi
        self.vf = vf
        self.step = step
        self.value = value
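The pdparam built above places the softmax portfolio weights next to a state-independent logstd, which make_pdtype(ac_space).pdfromflat then splits into the mean and log-scale of a diagonal Gaussian. A minimal NumPy sketch of that packing (the shapes are my assumptions):

import numpy as np

def pack_pdparam(pi, logstd):
    # pi: (batch, actdim) softmax weights used as the Gaussian mean;
    # logstd: (1, actdim) learned log standard deviation, broadcast over the batch.
    return np.concatenate([pi, np.zeros_like(pi) + logstd], axis=1)

# e.g. pack_pdparam(np.full((2, 3), 1/3), np.zeros((1, 3))) has shape (2, 6)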
    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        #RND.
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                    -1:]
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c1r',
                         nf=convfeat * 1,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c2r',
                         nf=convfeat * 2 * 1,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xr = tf.nn.leaky_relu(
                    conv(xr,
                         'c3r',
                         nf=convfeat * 2 * 1,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbr = [to2d(xr)]
                X_r = fc(rgbr[0], 'fc1r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 5:  # B,T,H,W,C
                logger.info("CnnTarget: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-3:]))[:, :, :,
                                                                      -1:]
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c1rp_pred',
                         nf=convfeat,
                         rf=8,
                         stride=4,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c2rp_pred',
                         nf=convfeat * 2,
                         rf=4,
                         stride=2,
                         init_scale=np.sqrt(2)))
                xrp = tf.nn.leaky_relu(
                    conv(xrp,
                         'c3rp_pred',
                         nf=convfeat * 2,
                         rf=3,
                         stride=1,
                         init_scale=np.sqrt(2)))
                rgbrp = to2d(xrp)
                X_r_hat = tf.nn.relu(
                    fc(rgbrp,
                       'fc1r_hat1_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = tf.nn.relu(
                    fc(X_r_hat,
                       'fc1r_hat2_pred',
                       nh=256 * enlargement,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(X_r_hat,
                             'fc1r_hat3_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        noisy_targets = tf.stop_gradient(X_r)
        self.aux_loss = tf.reduce_mean(tf.square(noisy_targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
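A small NumPy sketch of the intrinsic-reward / auxiliary-loss computation above, outside the graph; keep_prob stands in for proportion_of_exp_used_for_predictor_update and the feature arrays are placeholders:

import numpy as np

def rnd_losses(X_r, X_r_hat, keep_prob=0.25, rng=np.random):
    # Intrinsic reward: per-sample squared error between the frozen random
    # target features X_r and the trained predictor features X_r_hat.
    int_rew = np.mean((X_r - X_r_hat) ** 2, axis=-1)
    # Predictor loss: the same error, but only a random keep_prob fraction of
    # samples contributes, mirroring the mask built with tf.random_uniform above.
    mask = (rng.uniform(size=int_rew.shape) < keep_prob).astype(np.float32)
    aux_loss = np.sum(mask * int_rew) / max(mask.sum(), 1.0)
    return int_rew, aux_loss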
Beispiel #20
0
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            print('ob_mean shape: ', ob_mean.shape)
            sh = tf.shape(self.ph_ob)

            x = flatten_two_dims(self.ph_ob)
            x = tf.cast(x, dtype=tf.float32)
            l = []
            for i in range(4):
                r = tf.multiply(x[:, :, :, i * 3], 0.299)
                g = tf.multiply(x[:, :, :, i * 3 + 1], 0.587)
                b = tf.multiply(x[:, :, :, i * 3 + 2], 0.114)

                gray = r + g + b

                l.append(gray)

            x = tf.stack(l, axis=-1)
            x = tf.cast(x, dtype=tf.int32)

            l = []
            for i in range(4):
                r = ob_mean[:, :, i * 3] * 0.299
                g = ob_mean[:, :, i * 3 + 1] * 0.587
                b = ob_mean[:, :, i * 3 + 2] * 0.114

                gray = r + g + b

                l.append(gray)

            print('before obmean: ', self.ob_mean.shape)
            self.ob_mean = np.stack(l, axis=-1)
            self.ob_rgb_mean = ob_mean
            print('after obmean: ', self.ob_mean.shape)

            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x, name='pd', units=pdparamsize, activation=None)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
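The loop above converts each of the four stacked RGB frames to grayscale with the BT.601 luma weights 0.299/0.587/0.114. The same operation as a small standalone NumPy sketch (the stacked-RGB layout is my assumption):

import numpy as np

def to_gray_stack(obs, n_frames=4):
    # obs: (H, W, n_frames * 3) stacked RGB frames -> (H, W, n_frames) grayscale.
    frames = []
    for i in range(n_frames):
        r, g, b = obs[..., i * 3], obs[..., i * 3 + 1], obs[..., i * 3 + 2]
        frames.append(0.299 * r + 0.587 * g + 0.114 * b)
    return np.stack(frames, axis=-1)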
Beispiel #21
0
    def __init__(self,
                 ob_space,
                 ac_space,
                 hidsize,
                 ob_mean,
                 ob_std,
                 feat_dim,
                 layernormalize,
                 nl,
                 scope="policy"):
        if layernormalize:
            print(
                "Warning: policy is operating on top of layer-normed features. It might slow down the training."
            )
        self.layernormalize = layernormalize
        self.bool_actionclip = True  #TODO Need to make this flexible
        self.nl = nl
        self.ob_mean = ob_mean
        self.ob_std = ob_std
        #self.ac_range = ac_range
        with tf.variable_scope(scope):
            self.ob_space = ob_space
            self.ac_space = ac_space
            self.ac_pdtype = make_pdtype(ac_space)  # RS: should give a continuous pdtype, given a continuous-action env
            self.ph_ob = tf.placeholder(dtype=tf.int32,
                                        shape=(None, None) + ob_space.shape,
                                        name='ob')
            self.ph_ac = self.ac_pdtype.sample_placeholder([None, None],
                                                           name='ac')
            self.pd = self.vpred = None
            self.hidsize = hidsize
            self.feat_dim = feat_dim
            self.scope = scope
            pdparamsize = self.ac_pdtype.param_shape()[0]

            sh = tf.shape(self.ph_ob)
            x = flatten_two_dims(self.ph_ob)
            self.flat_features = self.get_features(x, reuse=False)
            self.features = unflatten_first_dim(self.flat_features, sh)

            with tf.variable_scope(scope, reuse=False):
                x = fc(self.flat_features, units=hidsize, activation=activ)
                x = fc(x, units=hidsize, activation=activ)
                pdparam = fc(x,
                             name='pd',
                             units=pdparamsize,
                             activation=tf.nn.tanh)
                vpred = fc(x,
                           name='value_function_output',
                           units=1,
                           activation=None)
            pdparam = unflatten_first_dim(pdparam, sh)
            self.vpred = unflatten_first_dim(vpred, sh)[:, :, 0]
            self.pd = pd = self.ac_pdtype.pdfromflat(pdparam)
            self.a_samp = pd.sample()
            self.a_samp = self.clip_action(
                self.a_samp) if self.bool_actionclip else self.a_samp
            self.entropy = pd.entropy()
            self.nlp_samp = pd.neglogp(self.a_samp)
            self.pd_logstd = pd.logstd
            self.pd_std = pd.std
            self.pd_mean = pd.mean
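clip_action is referenced above but not shown in this snippet. A minimal sketch of what such a helper could look like, assuming a Box action space with low/high bounds and the same TF1-style tf import as the snippets above:

def clip_action_sketch(ac_space, a):
    # Clamp sampled actions to the bounds of the (assumed) Box action space.
    low = tf.constant(ac_space.low, dtype=a.dtype)
    high = tf.constant(ac_space.high, dtype=a.dtype)
    return tf.clip_by_value(a, low, high)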
Beispiel #22
0
def _encoder(input, code_size):
    out_1_encoder = fc('out_1_encoder', input, H_SIZE)
    out_2_encoder = fc('out_2_encoder', out_1_encoder, code_size)
    out_encoder = fc('out_encoder', out_2_encoder, code_size)
    return out_encoder
Beispiel #23
0
    def define_self_prediction_rew(self, convfeat, rep_size, enlargement):
        logger.info(
            "Using RND BONUS ****************************************************"
        )
        hidden_size = convfeat * 2

        #RND bonus.

        activ = tf.nn.relu
        # Random target network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xr = ph[:, 1:]  # get next status index is 1:
                xr = tf.cast(xr, tf.float32)
                xr = tf.reshape(xr, (-1, *ph.shape.as_list()[-1:]))
                xr = tf.clip_by_value((xr - self.ph_mean) / self.ph_std, -5.0,
                                      5.0)

                xr = activ(
                    fc(xr, 'fc_0_r', nh=hidden_size, init_scale=np.sqrt(2)))
                xr = activ(
                    fc(xr, 'fc_1_r', nh=hidden_size, init_scale=np.sqrt(2)))
                X_r = fc(xr, 'fc_2_r', nh=rep_size, init_scale=np.sqrt(2))

        # Predictor network.
        for ph in self.ph_ob.values():
            if len(ph.shape.as_list()) == 3:  # B,T,S
                logger.info("Mlp Target: using '%s' shape %s as image input" %
                            (ph.name, str(ph.shape)))
                xrp = ph[:, 1:]
                xrp = tf.cast(xrp, tf.float32)
                xrp = tf.reshape(xrp, (-1, *ph.shape.as_list()[-1:]))
                xrp = tf.clip_by_value((xrp - self.ph_mean) / self.ph_std,
                                       -5.0, 5.0)

                xrp = activ(
                    fc(xrp, 'fc_0_pred', nh=hidden_size,
                       init_scale=np.sqrt(2)))
                xrp = activ(
                    fc(xrp, 'fc_1_pred', nh=hidden_size,
                       init_scale=np.sqrt(2)))
                X_r_hat = fc(xrp,
                             'fc_2_pred',
                             nh=rep_size,
                             init_scale=np.sqrt(2))

        self.feat_var = tf.reduce_mean(tf.nn.moments(X_r, axes=[0])[1])
        self.max_feat = tf.reduce_max(tf.abs(X_r))
        self.int_rew = tf.reduce_mean(
            tf.square(tf.stop_gradient(X_r) - X_r_hat),
            axis=-1,
            keep_dims=True)
        self.int_rew = tf.reshape(self.int_rew,
                                  (self.sy_nenvs, self.sy_nsteps - 1))

        targets = tf.stop_gradient(X_r)
        # self.aux_loss = tf.reduce_mean(tf.square(noisy_targets-X_r_hat))
        self.aux_loss = tf.reduce_mean(tf.square(targets - X_r_hat), -1)
        mask = tf.random_uniform(shape=tf.shape(self.aux_loss),
                                 minval=0.,
                                 maxval=1.,
                                 dtype=tf.float32)
        mask = tf.cast(mask < self.proportion_of_exp_used_for_predictor_update,
                       tf.float32)
        self.aux_loss = tf.reduce_sum(mask * self.aux_loss) / tf.maximum(
            tf.reduce_sum(mask), 1.)
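The observation whitening used by both networks above, written out as a small NumPy helper; the epsilon guard against a zero std is my addition:

import numpy as np

def normalize_obs(x, mean, std, clip=5.0, eps=1e-8):
    # Whiten with running statistics (ph_mean / ph_std above) and clip outliers.
    return np.clip((x - mean) / (std + eps), -clip, clip)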
Beispiel #24
0
def _decoder(code, code_size, out_size):
    out_1_decoder = fc('out_1_decoder', code, code_size)
    out_2_decoder = fc('out_2_decoder', out_1_decoder, H_SIZE)
    out_decoder = fc('out_decoder', out_2_decoder, out_size, act=tf.nn.sigmoid)
    return out_decoder
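A sketch of how the _encoder / _decoder pair might be wired into an autoencoder with a reconstruction loss; the loss choice and the assumption that out_size matches the input width are mine, not part of the original snippets:

def _autoencoder(input, code_size, out_size):
    # Compress to a code, reconstruct, and penalise the reconstruction error.
    code = _encoder(input, code_size)
    recon = _decoder(code, code_size, out_size)
    loss = tf.reduce_mean(tf.square(recon - input))  # sigmoid outputs vs. inputs in [0, 1]
    return code, recon, loss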
    def __init__(self,
                 sess,
                 ob_space,
                 ac_space,
                 nbatch,
                 nsteps,
                 reuse=False,
                 deterministic=False):  #pylint: disable=W0613

        # Assign action as Gaussian Distribution
        self.pdtype = make_pdtype(ac_space)
        self.num_obs = 13
        #print("action_space: {}".format(ac_space))
        with tf.variable_scope("model", reuse=reuse):
            phero_values = tf.placeholder(shape=(None, self.num_obs),
                                          dtype=tf.float32,
                                          name="phero_values")
            #velocities = tf.placeholder(shape=(None, 2), dtype=tf.float32, name="velocities")

            # Actor neural net
            pi_net = self.net(phero_values)
            # Critic neural net
            vf_h2 = self.net(phero_values)
            vf = fc(vf_h2, 'vf', 1)[:, 0]

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_net,
                                                        init_scale=0.01)

        if deterministic:
            a0 = self.pd.mode()
        else:
            a0 = self.pd.sample()
        neglogp0 = self.pd.neglogp(a0)
        self.initial_state = None
        self.phero = phero_values
        #self.velocities = velocities

        self.vf = vf

        def step(ob, *_args, **_kwargs):
            '''
            Generate action & value & log probability by inputting one observation into the policy neural net
            '''
            phero = [o for o in ob]

            # lb = [o["laser"] for o in ob]
            # rb = [o["rel_goal"] for o in ob]
            # vb = [o["velocities"] for o in ob]

            a, v, neglogp = sess.run([a0, vf, neglogp0], {self.phero: phero})
            # Action clipping (normalising actions to the range (-1, 1) for more stable training);
            # the network learns to keep its outputs in this range as training progresses.
            # for i in range(a.shape[1]):
            #     a[0][i] = min(1.0, max(-1.0, a[0][i]))
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            phero = [o for o in ob]
            # lb = [o["laser"] for o in ob]
            # rb = [o["rel_goal"] for o in ob]
            # vb = [o["velocities"] for o in ob]
            return sess.run(vf, {self.phero: phero})

        self.step = step
        self.value = value
    def forward_alexnet(self, inp, weights, reuse=False):
        # reuse is for the normalization parameters.

        conv1 = conv_block(inp,
                           weights['conv1_weights'],
                           weights['conv1_biases'],
                           stride_y=4,
                           stride_x=4,
                           groups=1,
                           reuse=reuse,
                           scope='conv1')
        norm1 = lrn(conv1, 2, 1e-05, 0.75)
        pool1 = max_pool(norm1, 3, 3, 2, 2, padding='VALID')

        # 2nd Layer: Conv (w ReLu)  -> Lrn -> Pool with 2 groups
        conv2 = conv_block(pool1,
                           weights['conv2_weights'],
                           weights['conv2_biases'],
                           stride_y=1,
                           stride_x=1,
                           groups=2,
                           reuse=reuse,
                           scope='conv2')
        norm2 = lrn(conv2, 2, 1e-05, 0.75)
        pool2 = max_pool(norm2, 3, 3, 2, 2, padding='VALID')

        # 3rd Layer: Conv (w ReLu)
        conv3 = conv_block(pool2,
                           weights['conv3_weights'],
                           weights['conv3_biases'],
                           stride_y=1,
                           stride_x=1,
                           groups=1,
                           reuse=reuse,
                           scope='conv3')

        # 4th Layer: Conv (w ReLu) split into two groups
        conv4 = conv_block(conv3,
                           weights['conv4_weights'],
                           weights['conv4_biases'],
                           stride_y=1,
                           stride_x=1,
                           groups=2,
                           reuse=reuse,
                           scope='conv4')

        # 5th Layer: Conv (w ReLu) -> Pool split into two groups
        conv5 = conv_block(conv4,
                           weights['conv5_weights'],
                           weights['conv5_biases'],
                           stride_y=1,
                           stride_x=1,
                           groups=2,
                           reuse=reuse,
                           scope='conv5')
        pool5 = max_pool(conv5, 3, 3, 2, 2, padding='VALID')

        # 6th Layer: Flatten -> FC (w ReLu) -> Dropout
        flattened = tf.reshape(pool5, [-1, 6 * 6 * 256])
        fc6 = fc(flattened,
                 weights['fc6_weights'],
                 weights['fc6_biases'],
                 activation='relu')
        dropout6 = dropout(fc6, self.KEEP_PROB)

        # 7th Layer: FC (w ReLu) -> Dropout
        fc7 = fc(dropout6,
                 weights['fc7_weights'],
                 weights['fc7_biases'],
                 activation='relu')
        dropout7 = dropout(fc7, self.KEEP_PROB)

        # 8th Layer: FC and return unscaled activations
        fc8 = fc(dropout7, weights['fc8_weights'], weights['fc8_biases'])

        return fc7, fc8
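conv_block with groups=2 reproduces AlexNet's two-tower (grouped) convolutions. A sketch of what such a grouped convolution might look like, assuming NHWC tensors and weights shaped [fh, fw, in_channels // groups, out_channels]:

def grouped_conv2d(x, w, b, groups, stride=1, padding='SAME'):
    # Split input channels and filters into `groups` parts, convolve each pair
    # independently, then concatenate along the channel axis.
    xs = tf.split(x, groups, axis=3)
    ws = tf.split(w, groups, axis=3)
    ys = [tf.nn.conv2d(xi, wi, strides=[1, stride, stride, 1], padding=padding)
          for xi, wi in zip(xs, ws)]
    return tf.nn.bias_add(tf.concat(ys, axis=3), b)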
Beispiel #27
0
    def build(self, input, is_dropout=False):  # is_dropout: whether to apply dropout
        # conv layer 1
        conv1 = convM_N(input,
                        96,
                        "conv1",
                        self.data_dict_AlexNet, [11, 11],
                        4,
                        finetune=self.finetune)
        lrn1 = tf.nn.lrn(conv1,
                         bias=1.0,
                         alpha=0.001 / 9,
                         beta=0.75,
                         name='lrn1')
        pool1 = tf.nn.max_pool(lrn1, [1, 3, 3, 1], [1, 2, 2, 1],
                               padding='VALID',
                               name='pool1')
        # conv layer 2
        conv2 = convM_N(pool1,
                        256,
                        "conv2",
                        self.data_dict_AlexNet, [5, 5],
                        1,
                        finetune=self.finetune)
        lrn2 = tf.nn.lrn(conv2,
                         bias=1.0,
                         alpha=0.001 / 9,
                         beta=0.75,
                         name='lrn2')
        pool2 = tf.nn.max_pool(lrn2, [1, 3, 3, 1], [1, 2, 2, 1],
                               padding='VALID',
                               name='pool2')
        # conv layer 3
        conv3 = conv3_3(pool2,
                        384,
                        'conv3',
                        self.data_dict_AlexNet,
                        finetune=self.finetune)
        # conv layer 4
        conv4 = conv3_3(conv3,
                        384,
                        'conv4',
                        self.data_dict_AlexNet,
                        finetune=self.finetune)
        # conv layer 5
        conv5 = conv3_3(conv4,
                        256,
                        'conv5',
                        self.data_dict_AlexNet,
                        finetune=self.finetune)
        pool3 = tf.nn.max_pool(conv5, [1, 3, 3, 1], [1, 2, 2, 1],
                               padding='VALID',
                               name='pool3')

        # fully connected layers
        flatten = tf.reshape(pool3, [self.batchsize, -1])
        fc1 = fc(flatten, 4096, 'fc1', finetune=False)
        fc1 = tf.nn.relu(fc1)
        if is_dropout: fc1 = tf.nn.dropout(fc1, 0.5)

        fc2 = fc(fc1, 4096, 'fc2', finetune=False)
        fc2 = tf.nn.relu(fc2)
        if is_dropout: fc2 = tf.nn.dropout(fc2, 0.5)

        fc3 = fc(fc2, self.n_classes, 'fc3', finetune=False)
        return fc3
Beispiel #28
0
    def apply_policy(
        ph_ob,
        ph_new,
        ph_istate,
        reuse,
        scope,
        hidsize,
        memsize,
        extrahid,
        sy_nenvs,
        sy_nsteps,
        pdparamsize,
        rec_gate_init,
    ):
        ph = ph_ob
        logger.info(
            f"CnnGruPolicy: using '{ph.name}' shape {ph.shape} as image input")
        assert len(ph.shape.as_list()) == 3  # B, Envs, Features
        X = tf.cast(ph, tf.float32) / 255.0
        X = tf.reshape(X, (-1, *ph.shape.as_list()[-3:]))

        activ = tf.nn.relu
        yes_gpu = any(get_available_gpus())

        with tf.variable_scope(
                scope,
                reuse=reuse), tf.device("/gpu:0" if yes_gpu else "/cpu:0"):
            X = activ(fc(
                X,
                "fc1",
                nh=32,
                init_scale=np.sqrt(2),
            ))
            X = activ(fc(
                X,
                "fc2",
                nh=64,
                init_scale=np.sqrt(2),
            ))
            X = activ(fc(
                X,
                "fc3",
                nh=64,
                init_scale=np.sqrt(2),
            ))
            X = to2d(X)
            X = activ(fc(X, "fc1", nh=hidsize, init_scale=np.sqrt(2)))
            X = tf.reshape(X, [sy_nenvs, sy_nsteps, hidsize])
            X, snext = tf.nn.dynamic_rnn(
                GRUCell(memsize, rec_gate_init=rec_gate_init),
                (X, ph_new[:, :, None]),
                dtype=tf.float32,
                time_major=False,
                initial_state=ph_istate,
            )
            X = tf.reshape(X, (-1, memsize))
            Xtout = X
            if extrahid:
                Xtout = X + activ(
                    fc(Xtout, "fc2val", nh=memsize, init_scale=0.1))
                X = X + activ(fc(X, "fc2act", nh=memsize, init_scale=0.1))
            pdparam = fc(X, "pd", nh=pdparamsize, init_scale=0.01)
            vpred_int = fc(Xtout, "vf_int", nh=1, init_scale=0.01)
            vpred_ext = fc(Xtout, "vf_ext", nh=1, init_scale=0.01)

            pdparam = tf.reshape(pdparam, (sy_nenvs, sy_nsteps, pdparamsize))
            vpred_int = tf.reshape(vpred_int, (sy_nenvs, sy_nsteps))
            vpred_ext = tf.reshape(vpred_ext, (sy_nenvs, sy_nsteps))
        return pdparam, vpred_int, vpred_ext, snext
Beispiel #29
0
    def __init__(self, sess, ob_space, ac_space, nenvs, nsteps, units_per_hlayer, reuse=False, activ_fcn='relu6'):  # pylint: disable=W0613
        # this method is called with nbatch = nenvs*nsteps

        # nh, nw, nc = ob_space.shape
        # ob_shape = (nbatch, nh, nw, nc)
        # actdim = ac_space.shape[0]
        # Todo check initialization
        # Input and Output dimensions
        nd, = ob_space.shape
        nbatch = nenvs * nsteps
        ob_shape = (nbatch, nd)
        nact = ac_space.n
        X = tf.placeholder(tf.float32, ob_shape, name='Ob')  # obs
        with tf.variable_scope("model", reuse=reuse):
            if activ_fcn == 'relu6':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))

                h3 = tf.nn.relu6(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'elu':
                h1 = tf.nn.elu(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  # , init_scale=np.sqrt(2)))
                h2 = tf.nn.elu(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  # , init_scale=np.sqrt(2)))

                h3 = tf.nn.elu(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))  # , init_scale=np.sqrt(2)))
            elif activ_fcn == 'mixed':
                h1 = tf.nn.relu6(fc(X, 'pi_vf_fc1', nh=units_per_hlayer[0]))  #, init_scale=np.sqrt(2)))
                h2 = tf.nn.relu6(fc(h1, 'pi_vf_fc2', nh=units_per_hlayer[1]))  #, init_scale=np.sqrt(2)))

                h3 = tf.nn.tanh(fc(h2, 'pi_fc1', nh=units_per_hlayer[2]))  #, init_scale=np.sqrt(2)))

            pi_logit = fc(h3, 'pi', nact, init_scale=0.01)
            pi = tf.nn.softmax(pi_logit)

            vf = fc(h2, 'vf', 1)[:, 0]  # predicted value of input state

        self.pd = CategoricalPd(pi_logit)  # pdparam
        a0 = self.pd.sample()  # returns action index: 0,1
        # a0 = tf.argmax(pi, axis=1)
        neglogp0 = self.pd.neglogp(a0)

        self.initial_state = None

        def step(ob, *_args, **_kwargs):
            a, pi, v, neglogp = sess.run([a0, pi_logit, vf, neglogp0], {X: ob})
            return a, pi, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            return sess.run(vf, {X: ob})

        self.X = X
        self.pi = pi
        self.pi_logit = pi_logit
        self.vf = vf
        self.ac = a0
        self.step = step
        self.value = value
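CategoricalPd.sample() above draws the action index with the Gumbel-max trick over the unnormalised logits; a NumPy restatement of that sampling step (my paraphrase, not the library code itself):

import numpy as np

def sample_from_logits(logits, rng=np.random):
    # Gumbel-max: argmax over logits perturbed by Gumbel noise is an exact
    # sample from the softmax distribution defined by the logits.
    u = rng.uniform(low=1e-10, high=1.0, size=np.shape(logits))
    return np.argmax(logits - np.log(-np.log(u)), axis=-1)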
Beispiel #30
0
# set data placeholders
x = tf.placeholder(tf.float32, shape=[None, 784], name="x")
x_image = tf.reshape(x, [-1, 28, 28, 1])
tf.summary.image('input', x_image, 3)
y = tf.placeholder(tf.float32, shape=[None, 10], name="labels")

if use_two_conv:
    conv1 = conv(x_image, 1, 32, "conv1")
    conv_out = conv(conv1, 32, 64, "conv2")
else:
    conv_out = conv(x_image, 1, 16, "conv")

flattened = tf.reshape(conv_out, [-1, 7 * 7 * 64])

if use_two_fc:
    fc1 = fc(flattened, 7 * 7 * 64, 1024, "fc1")
    relu = tf.nn.relu(fc1)
    embedding_input = relu
    tf.summary.histogram("fc1/relu", relu)
    embedding_size = 1024
    logits = fc(relu, 1024, 10, "fc2")
else:
    embedding_input = flattened
    embedding_size = 7 * 7 * 64
    logits = fc(flattened, 7 * 7 * 64, 10, "fc")

with tf.name_scope("xent"):
    xent = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=logits, labels=y),
                          name="xent")
    tf.summary.scalar("xent", xent)