Exemple #1
0
    def _init(self, ob_space, ac_space):
        """Assemble the action-distribution and value heads over an int32 observation.

        The observation placeholder is split along axis 1 into three segments
        of width 16, 12 * 6 and 12 * 6.  The first segment is one-hot encoded
        (depth 5), flattened and passed through two 12-unit leaky-ReLU layers.
        The other two segments each go through ``_grid_cnn`` inside the
        ``grids`` variable scope, the second call with ``reuse=True`` so both
        share variables.  The three feature tensors are concatenated, fed to a
        64-unit leaky-ReLU layer, and that layer feeds both the ``logits`` and
        the ``value`` dense heads.

        Sets ``self.pdtype``, ``self.pd``, ``self.vpred``, ``self.state_in``,
        ``self.state_out`` and ``self._act``.
        """
        self.pdtype = distributions.make_pdtype(ac_space)

        obs_ph = U.get_placeholder(
            name='ob', dtype=tf.int32, shape=[None] + list(ob_space.shape))
        blocks_part, own_part, rival_part = tf.split(
            obs_ph, [16, 12 * 6, 12 * 6], axis=1)

        with tf.variable_scope('next_blocks'):
            blocks_feat = U.flattenallbut0(tf.one_hot(blocks_part, depth=5))
            # Two identical 12-unit layers, named 'l1' then 'l2'.
            for layer_name in ('l1', 'l2'):
                pre_activation = tf.layers.dense(
                    blocks_feat, 12, name=layer_name,
                    kernel_initializer=U.normc_initializer(1.0))
                blocks_feat = tf.nn.leaky_relu(pre_activation, alpha=0.1)

        # First pass creates the CNN variables, second pass reuses them.
        with tf.variable_scope('grids', reuse=False):
            own_feat = _grid_cnn(own_part)
        with tf.variable_scope('grids', reuse=True):
            rival_feat = _grid_cnn(rival_part)

        merged = tf.concat([blocks_feat, own_feat, rival_feat], axis=1)
        trunk_pre = tf.layers.dense(
            merged, 64, name='lin',
            kernel_initializer=U.normc_initializer(1.0))
        trunk = tf.nn.leaky_relu(trunk_pre, alpha=0.1)

        logits = tf.layers.dense(
            trunk, self.pdtype.param_shape()[0], name='logits',
            kernel_initializer=U.normc_initializer(0.01))
        self.pd = self.pdtype.pdfromflat(logits)

        value_out = tf.layers.dense(
            trunk, 1, name='value',
            kernel_initializer=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        self.state_in = []
        self.state_out = []

        # Boolean scalar choosing between sampling and taking the mode.
        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph], [action, self.vpred])
Exemple #2
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build separate tanh MLPs for the value estimate and the policy.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each of the two MLPs.
            gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
                second half of the distribution parameters comes from a
                standalone ``logstd`` variable instead of the network output.

        Sets ``self.pdtype``, ``self.ob_rms``, ``self.vpred``, ``self.pd``,
        ``self.state_in``, ``self.state_out`` and ``self._act``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None  # first placeholder dimension is left unspecified

        obs_ph = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Standardise with the running statistics, then clip to [-5, 5].
        standardised = (obs_ph - self.ob_rms.mean) / self.ob_rms.std
        obz = tf.clip_by_value(standardised, -5.0, 5.0)

        # Value network.
        value_hidden = obz
        for layer_no in range(1, num_hid_layers + 1):
            value_hidden = tf.nn.tanh(
                U.dense(value_hidden, hid_size, "vffc%i" % layer_no,
                        weight_init=U.normc_initializer(1.0)))
        value_out = U.dense(value_hidden, 1, "vffinal",
                            weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]

        # Policy network (its own weights, same normalised input).
        policy_hidden = obz
        for layer_no in range(1, num_hid_layers + 1):
            policy_hidden = tf.nn.tanh(
                U.dense(policy_hidden, hid_size, "polfc%i" % layer_no,
                        weight_init=U.normc_initializer(1.0)))

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(policy_hidden, pdtype.param_shape()[0]//2,
                           "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0]//2],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to mean's shape.
            tiled_logstd = mean * 0.0 + logstd
            pdparam = U.concatenate([mean, tiled_logstd], axis=1)
        else:
            pdparam = U.dense(policy_hidden, pdtype.param_shape()[0],
                              "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph], [action, self.vpred])
Exemple #3
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Create the value and policy graphs from one normalised observation.

        Both networks are stacks of ``num_hid_layers`` tanh layers of width
        ``hid_size`` built with ``U.dense``; the value stack uses layer names
        ``vffc1..`` / ``vffinal`` and the policy stack ``polfc1..`` /
        ``polfinal``.  With ``gaussian_fixed_var`` set and a ``Box`` action
        space, the policy head outputs only the first half of the
        distribution parameters and the second half is a ``logstd`` variable
        broadcast over the batch.

        Sets ``self.pdtype``, ``self.ob_rms``, ``self.vpred``, ``self.pd``,
        ``self.state_in``, ``self.state_out`` and ``self._act``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(
            name="ob",
            dtype=tf.float32,
            shape=[sequence_length] + list(ob_space.shape),
        )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        obz = tf.clip_by_value(
            (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        def tanh_stack(name_pattern):
            """Run `obz` through the hidden layers named by `name_pattern`."""
            activations = obz
            for i in range(num_hid_layers):
                pre = U.dense(activations,
                              hid_size,
                              name_pattern % (i + 1),
                              weight_init=U.normc_initializer(1.0))
                activations = tf.nn.tanh(pre)
            return activations

        # Value branch.
        vf_features = tanh_stack("vffc%i")
        self.vpred = U.dense(vf_features,
                             1,
                             "vffinal",
                             weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy branch.
        pol_features = tanh_stack("polfc%i")
        fixed_variance = gaussian_fixed_var and isinstance(
            ac_space, gym.spaces.Box)
        if fixed_variance:
            mean = U.dense(pol_features,
                           pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # Adding to a zeroed copy of `mean` gives logstd mean's shape.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(pol_features,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled = self.pd.sample()
        greedy = self.pd.mode()
        ac = U.switch(stochastic, sampled, greedy)
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #4
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, rnn_hid_units, gaussian_fixed_var=True):
        """Build value and policy heads, each on top of its own ``self.rnn`` call.

        Args:
            ob_space: observation space; only ``ob_space.shape`` is read.
            ac_space: action space; must be a ``gym.spaces.Box`` (asserted).
            hid_size: width of the dense layers stacked after the rnn output.
            num_hid_layers: number of those dense layers per head.
            rnn_hid_units: forwarded to ``self.rnn``.
            gaussian_fixed_var: must be truthy (asserted); the second half of
                the distribution parameters is a free ``logstd`` variable.

        Sets ``self.pdtype``, ``self.vpred``, ``self.pd``, ``self.state_in``,
        ``self.state_out`` and ``self._act``.
        """
        # NOTE: unlike sibling policies, ob_space is not asserted to be a Box.
        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None

        obs_ph = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        # Value head: rnn output followed by dense layers.
        # NOTE(review): the dense layers have no activation between them, so
        # the stack is purely linear after the rnn -- confirm this is intended.
        with tf.variable_scope("vf"):
            vf_out = self.rnn(obs_ph, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                vf_out = U.dense(vf_out, hid_size, "vf_dense%i"%i,
                                 weight_init=U.normc_initializer(1.0))
            vf_final = U.dense(vf_out, 1, "vffinal",
                               weight_init=U.normc_initializer(1.0))
            self.vpred = vf_final[:,0]

        # Policy head: a second rnn call under its own scope.
        with tf.variable_scope("pf"):
            pf_out = self.rnn(obs_ph, ob_space.shape[0], rnn_hid_units)
            for i in range(num_hid_layers):
                pf_out = U.dense(pf_out, hid_size, "pf_dense%i"%i,
                                 weight_init=U.normc_initializer(1.0))

            # Only the fixed-variance Box case is implemented here.
            assert gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            mean = U.dense(pf_out, pdtype.param_shape()[0]//2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd", shape=[1, pdtype.param_shape()[0]//2],
                initializer=tf.zeros_initializer())
            tiled_logstd = mean * 0.0 + logstd
            pdparam = U.concatenate([mean, tiled_logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph], [action, self.vpred])
Exemple #5
0
    def _create_logit_value(self,
                            action_layer,
                            value_layer,
                            gaussian_fixed_var=False):
        """Attach the distribution-parameter head and the scalar value head.

        Args:
            action_layer: tensor feeding the ``polfinal`` dense layer.
            value_layer: tensor feeding the ``vffinal`` dense layer.
            gaussian_fixed_var: when true and ``self.ac_space`` is a
                ``gym.spaces.Box``, only half of the parameters come from
                ``action_layer``; the other half is a ``logstd`` variable.

        Reads ``self.pdtype``, ``self.ac_space`` and ``self.stochastic``;
        sets ``self.pd``, ``self.ac`` and ``self.vpred``.
        """
        # Actor head.
        fixed_variance = gaussian_fixed_var and isinstance(
            self.ac_space, gym.spaces.Box)
        if not fixed_variance:
            pdparam = U.dense(action_layer,
                              self.pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        else:
            mean = U.dense(action_layer,
                           self.pdtype.param_shape()[0] // 2, "polfinal",
                           U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, self.pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # Broadcast the single logstd row to the shape of `mean`.
            tiled_logstd = mean * 0.0 + logstd
            pdparam = U.concatenate([mean, tiled_logstd], axis=1)

        self.pd = self.pdtype.pdfromflat(pdparam)
        sampled = self.pd.sample()
        greedy = self.pd.mode()
        self.ac = U.switch(self.stochastic, sampled, greedy)

        # Critic head: one output unit, squeezed to a vector.
        value_out = U.dense(value_layer,
                            1,
                            "vffinal",
                            weight_init=U.normc_initializer(1.0))
        self.vpred = value_out[:, 0]
Exemple #6
0
    def _policy_nn(self, odim, adim, train):
        """Build the policy head on ``self.x`` and the action-sampling function.

        A ``polfinal`` dense layer produces the first half of the distribution
        parameters; the second half is a ``logstd`` variable of shape
        ``[1, k]`` broadcast to the batch.  The concatenation is turned into
        ``self.pd`` through ``self.pdtype.pdfromflat``.

        ``odim``, ``adim`` and ``train`` are accepted but not read by this
        method.  Reads ``self.x``, ``self.pdtype`` and ``self.ob``; sets
        ``self.pd``, ``self._act`` and ``self.ac``.
        """
        mean = tf.layers.dense(
            self.x,
            self.pdtype.param_shape()[0] // 2,
            name="polfinal",
            kernel_initializer=U.normc_initializer(0.01),
        )
        logstd = tf.get_variable(
            name="logstd",
            shape=[1, self.pdtype.param_shape()[0] // 2],
            initializer=tf.zeros_initializer(),
        )
        # Concatenate the mean with logstd broadcast to the same shape.
        tiled_logstd = mean * 0.0 + logstd
        pdparam = tf.concat([mean, tiled_logstd], axis=1)
        self.pd = self.pdtype.pdfromflat(pdparam)

        stochastic_ph = U.get_placeholder(dtype=tf.bool,
                                          shape=(),
                                          name="stochastic")
        # Sample when the flag is true, otherwise take the mode.
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())

        self._act = U.function([stochastic_ph, self.ob], action)
        self.ac = action
Exemple #7
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              exploration_rate,
              gaussian_fixed_var=True):
        """Build a policy whose mean network is a pre-trained Keras model.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: not read; both Keras stacks use fixed 64-unit layers.
            num_hid_layers: not read; both Keras stacks have two hidden layers.
            exploration_rate: constant used to initialise the ``logstd``
                variable.
            gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
                Keras model output is used directly as the mean and ``logstd``
                is a standalone variable; otherwise a ``polfinal`` dense layer
                on top of the Keras output produces all parameters.

        Sets ``self.pdtype``, ``self.vpred``, ``self.pd``, ``self.state_in``,
        ``self.state_out`` and ``self._act``.  Also adds an identity op named
        ``output_node`` to the graph.

        Fixes relative to the previous version:
          * the ``output_node`` slice used ``mean``, which is only bound in
            the fixed-variance branch, so the other branch raised NameError;
            it now slices ``model.output`` (the same tensor ``mean`` aliased).
          * the fallback ``tf.layers.dense`` call passed the layer name and
            the initializer positionally, where ``tf.layers.dense`` expects
            ``activation`` and ``use_bias``; they are now keyword arguments.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        # Running-mean observation normalisation is disabled in this variant;
        # the raw observation is fed to both networks.
        obz = ob

        # Value network: two tanh layers followed by a linear scalar head.
        valueFunction = Sequential()
        valueFunction.add(InputLayer(input_tensor=obz))
        valueFunction.add(Dense(64, activation='tanh'))
        valueFunction.add(Dense(64, activation='tanh'))

        self.vpred = self.dense(x=valueFunction.output,
                                size=1,
                                name="vffinal",
                                weight_init=U.normc_initializer(1.0),
                                bias=True)[:, 0]

        # Policy network. Its shape (64-64-23) is fixed, presumably to match
        # the weights stored in "neural_kick" -- verify before changing it.
        model = Sequential()
        model.add(InputLayer(input_tensor=obz))
        model.add(Dense(64, activation='tanh'))
        model.add(Dense(64, activation='tanh'))
        model.add(Dense(23))
        model.load_weights("neural_kick")

        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = model.output
            logstd = tf.get_variable(
                name="logstd",
                shape=[1, pdtype.param_shape()[0] // 2],
                initializer=tf.constant_initializer(exploration_rate))
            # Broadcast logstd to the shape of `mean` before concatenating.
            pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = tf.layers.dense(
                model.output,
                pdtype.param_shape()[0],
                name="polfinal",
                kernel_initializer=U.normc_initializer(0.01))

        # Expose the first row of the Keras output under a stable op name.
        # Built from model.output so it exists on both branches above.
        first_row = tf.strided_slice(model.output, [0], [1], [1],
                                     shrink_axis_mask=1)
        tf.identity(first_row, name='output_node')

        self.pd = pdtype.pdfromflat(pdparam)
        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        """Build value and policy heads on top of ``self.img_encoder``.

        Args:
            ob_space: observation space; must be a ``gym.spaces.box.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden dense layer.
            num_hid_layers: number of hidden dense layers per head.
            kind: forwarded unchanged to ``self.img_encoder``.

        Sets ``self.pdtype``, ``self.ob`` (a one-element list holding the
        placeholder), ``self.vpred``, ``self.pd``, ``self.state_in``,
        ``self.state_out`` and ``self._act``.

        The debug print uses the call form ``print(...)`` so the method
        parses on Python 3 as well as Python 2 (the former ``print x``
        statement is a SyntaxError on Python 3).
        """
        print(type(ob_space))
        assert isinstance(ob_space, gym.spaces.box.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))
        self.ob = [ob]

        # Scale the observation by 1/255 before encoding.
        # NOTE(review): presumably 8-bit image input -- confirm with callers.
        x = ob / 255.0

        ob_last = self.img_encoder(x, kind)

        # Value head: ReLU hidden layers, scalar output.
        with tf.variable_scope("vf"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.relu(tf.layers.dense(last_out, hid_size, name="fc%i"%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:,0]

        # Policy head: tanh hidden layers, then the flat distribution params.
        with tf.variable_scope("pol"):
            last_out = ob_last
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i'%(i+1), kernel_initializer=U.normc_initializer(1.0)))
            logits = tf.layers.dense(last_out, pdtype.param_shape()[0], name='logits', kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        # Sample when `stochastic` is true, otherwise take the mode.
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #9
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              tau,
              gaussian_fixed_var=True):
        """Build the ``adv``-prefixed value network and two action graphs.

        Two observation placeholders are created (``ob_adv`` and ``adv_ob_``).
        The value network reads the normalised first placeholder.
        ``self.build_action`` is invoked once per placeholder, the second
        time with ``reuse=True``.

        ``tau`` is accepted but not read here.  Sets ``self.ac_space``,
        ``self.hid_size``, ``self.num_hid_layers``,
        ``self.gaussian_fixed_var``, ``self.pdtype``, ``self.ob``,
        ``self.ob_``, ``self.ob_rms``, ``self.obz``, ``self.vpred``,
        ``self.pdparam``, ``self.pdparam_``, ``self.pd``, ``self.state_in``,
        ``self.state_out``, ``self.ac``, ``self.ac_`` and ``self._act``.
        """
        assert isinstance(ob_space, gym.spaces.Box)
        print('use zpmpl_Adv')
        self.ac_space = ac_space
        self.hid_size = hid_size
        self.num_hid_layers = num_hid_layers
        self.gaussian_fixed_var = gaussian_fixed_var

        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None

        def make_obs_placeholder(ph_name):
            """Float32 placeholder shaped [None] + observation shape."""
            return U.get_placeholder(
                name=ph_name,
                dtype=tf.float32,
                shape=[leading_dim] + list(ob_space.shape))

        self.ob = make_obs_placeholder("ob_adv")
        self.ob_ = make_obs_placeholder("adv_ob_")

        with tf.variable_scope("obfilter_adv"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('adv_vf'):
            standardised = (self.ob - self.ob_rms.mean) / self.ob_rms.std
            self.obz = tf.clip_by_value(standardised, -5.0, 5.0)
            hidden = self.obz
            for layer_no in range(1, self.num_hid_layers + 1):
                pre_activation = tf.layers.dense(
                    hidden,
                    self.hid_size,
                    name="adv_vffc%i" % layer_no,
                    kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(pre_activation)
            value_out = tf.layers.dense(
                hidden,
                1,
                name="adv_vffinal",
                kernel_initializer=U.normc_initializer(1.0))
            self.vpred = value_out[:, 0]

        # Same action network applied to both placeholders.
        self.pdparam = self.build_action(self.ob)
        self.pdparam_ = self.build_action(self.ob_, reuse=True)

        self.pd = pdtype.pdfromflat(self.pdparam)

        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        # A separate sample op, independent of the stochastic switch.
        self.ac = self.pd.sample()
        self.ac_, _ = self.sample_()
        self._act = U.function([stochastic_ph, self.ob],
                               [action, self.vpred])
Exemple #10
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build value (``vf``) and policy (``pol``) MLPs on the raw observation.

        ``self.ob_rms`` is still created under ``obfilter``, but the
        networks consume the placeholder directly: the clip/normalise step is
        switched off in this variant.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of tanh hidden layers per network.
            gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
                network emits only the mean (also stored as ``self.mean``)
                and the other half of the parameters is a ``logstd`` variable.

        Sets ``self.pdtype``, ``self.ob_rms``, ``self.vpred``, ``self.pd``,
        ``self.state_in``, ``self.state_out`` and ``self._act``; additionally
        ``self.mean`` on the fixed-variance path.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None

        # Placeholder for the current observation batch.
        obs_ph = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Value-function estimator.
        with tf.variable_scope('vf'):
            # Normalisation disabled: the network input is the placeholder.
            obz = obs_ph
            vf_hidden = obz
            for layer_no in range(1, num_hid_layers + 1):
                vf_pre = tf.layers.dense(
                    vf_hidden, hid_size, name="fc%i" % layer_no,
                    kernel_initializer=U.normc_initializer(1.0))
                vf_hidden = tf.nn.tanh(vf_pre)

            vf_out = tf.layers.dense(
                vf_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))
            self.vpred = vf_out[:,0]

        # Policy network.
        with tf.variable_scope('pol'):
            pol_hidden = obz
            for layer_no in range(1, num_hid_layers + 1):
                pol_pre = tf.layers.dense(
                    pol_hidden, hid_size, name='fc%i' % layer_no,
                    kernel_initializer=U.normc_initializer(1.0))
                pol_hidden = tf.nn.tanh(pol_pre)

            # Box action space with a variance that does not depend on the
            # observation: only the mean comes from the network.
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    pol_hidden, pdtype.param_shape()[0]//2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                self.mean = mean
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0]//2],
                    initializer=tf.zeros_initializer())
                tiled_logstd = mean * 0.0 + logstd
                pdparam = tf.concat([mean, tiled_logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    pol_hidden, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        # No recurrent state is exposed by this policy.
        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph], [action, self.vpred])
Exemple #11
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, activation='tanh', gaussian_fixed_var=True, keep=1.0):
        """Build value and policy MLPs with selectable activation and dropout.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers per network.
            activation: one of ``'tanh'``, ``'elu'`` or ``'lrelu'``.
            gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
                second half of the distribution parameters is a standalone
                ``logstd`` variable.
            keep: ``keep_prob`` passed to ``tf.nn.dropout`` after each hidden
                layer.

        Raises:
            NotImplementedError: for any other ``activation`` value.

        Sets ``self.pdtype``, ``self.vpred``, ``self.pd``, ``self.state_in``,
        ``self.state_out`` and ``self._act``; ``self.ob_rms`` is set only when
        the module-level ``PREPROCESS`` flag is false.

        Fix: when ``PREPROCESS`` is false the value network consumed the
        normalised, clipped observation while the policy network restarted
        from the raw placeholder, bypassing the ``obfilter`` statistics.
        Both networks now start from the same input tensor.  Behaviour with
        ``PREPROCESS`` true is unchanged.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob_shape = OBSERVATION_DIM if PREPROCESS else ob_space.shape[0]
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length, ob_shape])

        if activation == 'tanh':
            activ = tf.nn.tanh
        elif activation == 'elu':
            activ = tf.nn.elu
        elif activation == 'lrelu':
            def activ(x):
                # Leaky ReLU with slope 0.01 on the negative side.
                return tf.maximum(x, 0.01 * x)
        else:
            raise NotImplementedError("Not available activation: " + activation)

        if PREPROCESS:
            # No in-graph normalisation: the placeholder is used as-is.
            net_input = ob
        else:
            with tf.variable_scope("obfilter"):
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)
            net_input = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Value network.
        last_out = net_input
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "vffc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="vdrop%i" % (i + 1))
        self.vpred = U.dense(last_out, 1, "vffinal", weight_init=U.normc_initializer(1.0))[:, 0]

        # Policy network: same input as the value network.
        last_out = net_input
        for i in range(num_hid_layers):
            last_out = activ(U.dense(last_out, hid_size, "polfc%i" % (i + 1), weight_init=U.normc_initializer(1.0)))
            last_out = tf.nn.dropout(last_out, keep_prob=keep, name="pdrop%i" % (i + 1))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = U.dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", U.normc_initializer(0.01))
            logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                                     initializer=tf.zeros_initializer())
            # `mean * 0.0 + logstd` broadcasts logstd to the shape of mean.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
        else:
            pdparam = U.dense(last_out, pdtype.param_shape()[0], "polfinal", U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #12
0
    def __init__(self,
                 name,
                 observation_shape,
                 action_shape,
                 hid_size,
                 num_hid_layers,
                 stochastic=True):
        """Create a tanh-MLP policy with a free ``logstd`` variable under scope ``name``.

        Args:
            name: variable scope wrapping everything built here.
            observation_shape: shape of a single observation (no batch dim).
            action_shape: action shape; ``action_shape[0]`` sizes the
                ``DiagGaussianPdType``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of tanh hidden layers.
            stochastic: stored as ``self.stochastic``; the graph itself is
                switched by a separate boolean placeholder.

        Sets ``self.stochastic``, ``self.hid_size``, ``self.num_hid_layers``,
        ``self.action_shape``, ``self.observation_shape``, ``self.scope``,
        ``self.pdtype``, ``self.ob_rms``, ``self.pd`` and ``self._act``.
        """
        with tf.variable_scope(name):
            self.stochastic = stochastic
            self.hid_size = hid_size
            self.num_hid_layers = num_hid_layers
            self.action_shape = action_shape
            self.observation_shape = observation_shape
            self.scope = tf.get_variable_scope().name
            self.pdtype = DiagGaussianPdType(action_shape[0])

            obs_ph = U.get_placeholder(
                name='ob',
                dtype=tf.float32,
                shape=[None] + list(observation_shape))
            stochastic_flag = tf.placeholder(dtype=tf.bool, shape=())

            with tf.variable_scope('obfilter'):
                self.ob_rms = RunningMeanStd(shape=observation_shape)

            with tf.variable_scope('pol'):
                # Standardise with running statistics and clip to [-5, 5].
                standardised = (obs_ph - self.ob_rms.mean) / self.ob_rms.std
                hidden = tf.clip_by_value(standardised, -5.0, 5.0)
                for layer_no in range(1, num_hid_layers + 1):
                    pre_activation = tf.layers.dense(
                        hidden,
                        hid_size,
                        name='fc%i' % layer_no,
                        kernel_initializer=U.normc_initializer(1.0))
                    hidden = tf.nn.tanh(pre_activation)

                mean = tf.layers.dense(
                    hidden,
                    self.pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name='logstd',
                    shape=[1, self.pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # Broadcast logstd to mean's shape before concatenating.
                tiled_logstd = mean * 0.0 + logstd
                pdparam = tf.concat([mean, tiled_logstd], axis=1)

            self.pd = self.pdtype.pdfromflat(pdparam)

            sampled = self.pd.sample()
            greedy = self.pd.mode()
            chosen_action = U.switch(stochastic_flag, sampled, greedy)
            self._act = U.function([stochastic_flag, obs_ph], chosen_action)
Exemple #13
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build three tanh MLPs: value (``vf``), ``sigma`` and policy (``pol``).

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: either one int (used for every layer) or a sequence
                indexed per layer.
            num_hid_layers: number of hidden layers in each MLP.
            gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
                second half of the distribution parameters is a standalone
                ``logstd`` variable.

        Sets ``self.pdtype``, ``self.ob_rms``, ``self.vpred``,
        ``self.sigmapred``, ``self.pd``, ``self.state_in``,
        ``self.state_out`` and ``self._act`` (which returns the action, the
        value prediction and the sigma prediction).
        """
        assert isinstance(ob_space, (gym.spaces.Box))

        # A single width is expanded to one entry per hidden layer.
        if isinstance(hid_size,int):
            hid_size = [hid_size] * num_hid_layers

        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None

        obs_ph = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        def normalised_obs():
            """Standardise the placeholder and clip it to [-5, 5]."""
            return tf.clip_by_value(
                (obs_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        def tanh_stack(features):
            """Apply the hidden layers 'fc1', 'fc2', ... in the current scope."""
            for idx in range(num_hid_layers):
                pre_activation = tf.layers.dense(
                    features, hid_size[idx], name="fc%i"%(idx+1),
                    kernel_initializer=U.normc_initializer(1.0))
                features = tf.nn.tanh(pre_activation)
            return features

        with tf.variable_scope('vf'):
            obz = normalised_obs()
            vf_hidden = tanh_stack(obz)
            self.vpred = tf.layers.dense(
                vf_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('sigma'):
            # The normalisation ops are rebuilt inside this scope.
            obz = normalised_obs()
            sigma_hidden = tanh_stack(obz)
            self.sigmapred = tf.layers.dense(
                sigma_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            # Uses the normalised tensor created in the 'sigma' scope.
            pol_hidden = tanh_stack(obz)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    pol_hidden, pdtype.param_shape()[0]//2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0]//2],
                    initializer=tf.zeros_initializer())
                tiled_logstd = mean * 0.0 + logstd
                pdparam = tf.concat([mean, tiled_logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    pol_hidden, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph],
                               [action, self.vpred, self.sigmapred])
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True):
        """Build a one-hot-input value MLP and a policy made of four LSTM sub-policies.

        The normalised observation is cast to int32 and one-hot encoded with
        depth ``ob_space.n`` for both networks.  The policy runs the encoding
        through four single-step ``BasicLSTMCell`` sub-networks (scopes
        ``pol/0`` .. ``pol/3``), concatenates their outputs and applies a
        ``final`` dense layer with an L2 activity regularizer.

        Args:
            ob_space: observation space; ``shape`` and ``n`` are read.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of the value network's hidden layers.
            num_hid_layers: number of hidden layers in the value network.
            gaussian_fixed_var: when true and ``ac_space`` is a ``Box``, the
                second half of the distribution parameters is a standalone
                ``logstd`` variable.

        Sets ``self.pdtype``, ``self.ob_rms``, ``self.vpred``, ``self.pd``,
        ``self.state_in``, ``self.state_out`` and ``self._act``.
        """
        # NOTE(review): the observation is mean/std-normalised and clipped
        # *before* being cast to an integer index -- confirm this is intended.
        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None

        obs_ph = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value(
                (obs_ph - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            vf_hidden = tf.one_hot(
                indices=tf.cast(obz, dtype=tf.int32), depth=ob_space.n)
            for layer_no in range(1, num_hid_layers + 1):
                vf_pre = tf.layers.dense(
                    vf_hidden, hid_size, name="fc%i" % layer_no,
                    kernel_initializer=U.normc_initializer(1.0))
                vf_hidden = tf.nn.tanh(vf_pre)
            self.vpred = tf.layers.dense(
                vf_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:,0]

        with tf.variable_scope('pol'):
            pol_input = tf.one_hot(
                indices=tf.cast(obz, dtype=tf.int32), depth=ob_space.n)

            def sub_pol(input_m, scope):
                """Run `input_m` as a length-1 sequence through a fresh LSTM."""
                # Insert a time axis; the [1, 1, 1] tile leaves the tensor as is.
                state_embedding = tf.tile(
                    tf.expand_dims(input_m, axis=1), [1, 1, 1])
                rnn_cell = rnn.BasicLSTMCell(
                    num_units=pdtype.param_shape()[0])
                rnn_out, _final_state = tf.nn.dynamic_rnn(
                    cell=rnn_cell,
                    inputs=state_embedding,
                    dtype=tf.float32, scope=scope)
                # Drop the time axis again.
                return tf.squeeze(rnn_out, axis=1)

            sub_outputs = [
                sub_pol(pol_input, 'pol' + '/' + str(i)) for i in range(4)
            ]
            pol_features = tf.concat(sub_outputs, axis=1)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    pol_features, pdtype.param_shape()[0]//2, name='final',
                    kernel_initializer=U.normc_initializer(0.01),
                    activity_regularizer=tf.contrib.layers.l2_regularizer(0.01))
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0]//2],
                    initializer=tf.zeros_initializer())
                tiled_logstd = mean * 0.0 + logstd
                pdparam = tf.concat([mean, tiled_logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    pol_features, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01),
                    activity_regularizer=tf.contrib.layers.l2_regularizer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph], [action, self.vpred])
Exemple #15
0
    def _init(self, ob_space, ac_space):
        """Build the policy from ``generator_model`` and the value from ``discriminator_model``.

        The observation is standardised with ``self.ob_rms`` and clipped to
        [-5, 5].  Under scope ``vf`` it is passed (wrapped in a list) to
        ``discriminator_model`` with ``drop_rate=0.5``; under scope ``pol`` it
        is passed to ``generator_model`` together with the flat parameter
        count of the action distribution.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.

        Sets ``self.pdtype``, ``self.ob_rms``, ``self.vpred``, ``self.pd``,
        ``self.state_in``, ``self.state_out`` and ``self._act``.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        leading_dim = None

        obs_ph = U.get_placeholder(
            name="ob",
            dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape),
        )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            standardised = (obs_ph - self.ob_rms.mean) / self.ob_rms.std
            obz = tf.clip_by_value(standardised, -5.0, 5.0)
            # The value estimate is whatever discriminator_model returns.
            self.vpred = discriminator_model([obz], drop_rate=0.5)

        with tf.variable_scope('pol'):
            # The generator emits all distribution parameters directly; there
            # is no separate fixed-variance branch in this variant.
            pdparam = generator_model(
                [obz],
                pdtype.param_shape()[0],
                drop_rate=0.5,
            )

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic_ph = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic_ph, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic_ph, obs_ph], [action, self.vpred])
Exemple #16
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True, summaries=False, should_act=True):
        """Build an ELU MLP policy and, optionally, a value head plus act function.

        The observation tensor is looked up in the default graph as
        "observations:0"; if the graph holds no such tensor, a placeholder
        named "observations" is created.  Observations are fed to the networks
        without normalisation.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            gaussian_fixed_var: with a Box action space, use a
                state-independent ``logstd`` variable instead of predicting
                all distribution parameters from the last hidden layer.
            summaries: accepted but not used by this method.
            should_act: when false, only the policy network and ``self.pd``
                are built; ``ob_rms``, ``vpred``, ``state_in``/``state_out``
                and ``_act`` are not created.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        # get_tensor_by_name raises KeyError (it never returns None) when the
        # tensor is missing, so the fallback must live in an except clause.
        try:
            ob = tf.get_default_graph().get_tensor_by_name("observations:0")
        except KeyError:
            ob = U.get_placeholder(name="observations", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope('pol'):
            last_out = ob
            for i in range(num_hid_layers):
                last_out = tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(1.0))
                last_out = tf.nn.elu(last_out)
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.zeros_initializer())
                # Broadcast logstd to the shape of mean.  tf.ones_like works with
                # an unknown leading dimension, whereas tf.ones(shape=mean.shape)
                # requires a fully defined static shape.
                pdparam = tf.concat([mean, tf.ones_like(mean) * logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        with tf.variable_scope("distribution"):
            self.pd = pdtype.pdfromflat(pdparam)

        if should_act:
            with tf.variable_scope("obfilter"):
                # The filter is created, but the value network below reads the
                # raw observation tensor.
                self.ob_rms = RunningMeanStd(shape=ob_space.shape)

            with tf.variable_scope('vf'):
                last_out = ob
                for i in range(num_hid_layers):
                    last_out = tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0))
                    last_out = tf.nn.tanh(last_out)
                self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

            self.state_in = []
            self.state_out = []

            with tf.variable_scope("distribution"):
                stochastic = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic")
                ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

            self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #17
0
    def __init__(self, sess, ob_space, ac_space, nenv, nsteps, nstack, reuse=False):
        """Build a three-layer tanh network with a Gaussian policy head and a value head.

        Args:
            sess: TensorFlow session used by the ``step`` and ``value`` closures.
            ob_space: observation space; only ``ob_space.shape[0]`` is read.
            ac_space: action space; ``ac_space.shape[0]`` sets the action size.
            nenv: number of environments (batch size is ``nenv * nsteps``).
            nsteps: number of steps per environment in a batch.
            nstack: multiplier applied to the observation width.
            reuse: forwarded to every ``tf.variable_scope`` opened here.
        """
        batch_size = nenv * nsteps
        ob_shape = (batch_size, ob_space.shape[0] * nstack)
        action_dim = ac_space.shape[0]

        # Observation input.
        X = tf.placeholder(tf.float32, ob_shape)
        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        with tf.variable_scope("obfilter", reuse=reuse):
            self.ob_rms = RunningMeanStd(shape=ob_shape[1:])
        with tf.variable_scope("retfilter", reuse=reuse):
            self.ret_rms = RunningMeanStd(shape=(1,))

        # Normalise observations with the running filter and clip to [-5, 5].
        obz = tf.clip_by_value((X - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        with tf.variable_scope("model", reuse=reuse):
            hidden = obz
            for layer_name in ("fc1", "fc2", "fc3"):
                hidden = tf.nn.tanh(
                    dense(hidden, 128, layer_name,
                          weight_init=U.normc_initializer(1.0), bias_init=0.0))

            mean = dense(hidden, action_dim, "mean",
                         weight_init=U.normc_initializer(0.1), bias_init=0.0)
            logstd_var = tf.get_variable("logstd", [action_dim], tf.float32, tf.zeros_initializer())
            logstd = tf.expand_dims(logstd_var, 0)
            # mean * 0.0 + logstd broadcasts the single logstd row to mean's shape.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd], axis=1)
            vf = dense(hidden, 1, "v",
                       weight_init=U.normc_initializer(1.0), bias_init=0.0)

        v0 = vf[:, 0]
        self.pd = pdtype.pdfromflat(pdparam)
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        a0 = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # This model carries no recurrent state.
        self.initial_state = []

        def step(stoch, ob, *_args, **_kwargs):
            """Return (action, value, []) for a batch of observations."""
            action, val = sess.run([a0, v0], {stochastic: stoch, X: ob})
            return action, val, []

        def value(ob, *_args, **_kwargs):
            """Return the value prediction for a batch of observations."""
            return sess.run(v0, {X: ob})

        self.X = X
        self.vf = vf
        # Value output standardised with the running return statistics.
        self.vnorm = (self.vf - self.ret_rms.mean) / self.ret_rms.std
        self.step = step
        self.value = value
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, kind):
        """Build a policy over a (physics, frames) observation pair with two value heads.

        The physics observation is normalised with a running filter and
        clipped to [-5, 5]; the frames are divided by 255 and passed through
        ``self.img_encoder``.  Both results are concatenated on the last axis
        and fed to a ReLU value network (heads ``vf_ext`` and ``vf_int``) and
        a tanh policy network.

        Args:
            ob_space: tuple whose first item is the physics space and whose
                second item is the frame space.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            kind: forwarded unchanged to ``self.img_encoder``.
        """
        assert isinstance(ob_space, tuple)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        leading_dim = None
        physics_space, frames_space = ob_space[0], ob_space[1]

        ob_p = U.get_placeholder(
            name="ob_physics", dtype=tf.float32,
            shape=[leading_dim] + list(physics_space.shape))
        ob_f = U.get_placeholder(
            name="ob_frames", dtype=tf.float32,
            shape=[leading_dim] + list(frames_space.shape))
        self.ob = [ob_p, ob_f]

        # Physics branch: running normalisation followed by clipping.
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=physics_space.shape)
        obpz = tf.clip_by_value((ob_p - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)

        # Frame branch: scale by 1/255, then encode.
        frames = ob_f / 255.0
        encoded = self.img_encoder(frames, kind)

        ob_last = tf.concat((obpz, encoded), axis=-1)

        with tf.variable_scope("vf"):
            v_hidden = ob_last
            for layer in range(1, num_hid_layers + 1):
                v_hidden = tf.layers.dense(
                    v_hidden, hid_size, name="fc%i" % layer,
                    kernel_initializer=U.normc_initializer(1.0))
                v_hidden = tf.nn.relu(v_hidden)
            self.vpred_ext = tf.layers.dense(
                v_hidden, 1, name='vf_ext',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]
            self.vpred_int = tf.layers.dense(
                v_hidden, 1, name='vf_int',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope("pol"):
            pi_hidden = ob_last
            for layer in range(1, num_hid_layers + 1):
                pi_hidden = tf.layers.dense(
                    pi_hidden, hid_size, name='fc%i' % layer,
                    kernel_initializer=U.normc_initializer(1.0))
                pi_hidden = tf.nn.tanh(pi_hidden)
            logits = tf.layers.dense(
                pi_hidden, pdtype.param_shape()[0], name='logits',
                kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        self.state_in = []
        self.state_out = []
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function(
            [stochastic, ob_p, ob_f],
            [ac, self.vpred_ext, self.vpred_int])
Exemple #19
0
    def _init(self, ob_space, ac_space, layers_val, layers_pol, gaussian_fixed_var=True,
              dist='gaussian', ):
        """Build an MLP policy whose act function also reports distribution parameters.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            layers_val: iterable of hidden-layer widths for the value network.
            layers_pol: iterable of hidden-layer widths for the policy network.
            gaussian_fixed_var: with a Box action space, use a
                state-independent ``logstd`` variable.
            dist: distribution name forwarded to ``make_pdtype``.  ``_act`` is
                only created for ``'gaussian'`` and ``'beta'``; for any other
                value this method leaves ``self._act`` unset.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.dist = dist
        pdtype = make_pdtype(ac_space, dist=dist)
        self.pdtype = pdtype
        leading_dim = None

        ob = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Normalise with the running filter and clip to [-5, 5].
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            v_hidden = obz
            for idx, width in enumerate(layers_val, start=1):
                v_hidden = tf.layers.dense(
                    v_hidden, width, name="fc%i" % idx,
                    kernel_initializer=U.normc_initializer(1.0))
                v_hidden = tf.nn.relu(v_hidden)
            self.vpred = tf.layers.dense(
                v_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            pi_hidden = obz
            for idx, width in enumerate(layers_pol, start=1):
                pi_hidden = tf.layers.dense(
                    pi_hidden, width, name='fc%i' % idx,
                    kernel_initializer=U.normc_initializer(1.0))
                pi_hidden = tf.nn.tanh(pi_hidden)

            fixed_var = gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)
            if not fixed_var:
                pdparam = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))
            else:
                mean = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0] // 2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd", shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # mean * 0.0 + logstd broadcasts logstd to the shape of mean.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # Extra outputs depend on which distribution attributes exist.
        if dist == 'gaussian':
            extras = [self.pd.std, self.pd.mean, self.pd.logstd]
        elif dist == 'beta':
            extras = [self.pd.alpha, self.pd.beta, self.pd.alpha_beta]
        else:
            extras = None
        if extras is not None:
            self._act = U.function([stochastic, ob], [ac, self.vpred] + extras)
Exemple #20
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, exploration_rate, gaussian_fixed_var=True):
        """Build an MLP policy/value graph and expose the policy mean as "output_node".

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            exploration_rate: constant used to initialise the ``logstd``
                variable in the fixed-variance branch.
            gaussian_fixed_var: with a Box action space, use a
                state-independent ``logstd`` variable instead of predicting
                all distribution parameters from the last hidden layer.

        The "output_node" identity op (first row of the policy mean) is only
        created in the fixed-variance branch, because that is the only branch
        that defines a separate mean tensor.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[sequence_length] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Normalise with the running filter and clip to [-5, 5].
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name="fc%i" % (i + 1), kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(last_out, 1, name='final', kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Stays None when the network predicts all distribution parameters itself.
        mean = None
        with tf.variable_scope('pol'):
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(tf.layers.dense(last_out, hid_size, name='fc%i' % (i + 1), kernel_initializer=U.normc_initializer(0.01)))
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(last_out, pdtype.param_shape()[0] // 2, name='final', kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0] // 2], initializer=tf.constant_initializer(exploration_rate))
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(last_out, pdtype.param_shape()[0], name='final', kernel_initializer=U.normc_initializer(0.01))

        # Previously `mean` was referenced unconditionally, which raised
        # NameError whenever the else-branch above was taken.  The op is built
        # outside the 'pol' scope so its name stays "output_node".
        if mean is not None:
            first_row = tf.strided_slice(mean, [0], [1], [1], shrink_axis_mask=1)
            tf.identity(first_row, name='output_node')
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #21
0
    def _build(self):
        """Build the primitive policy graph: inputs, normalisation, value and policy nets.

        Reads ``self._ac_space``, ``self._num_hid_layers``, ``self._hid_size``,
        ``self._gaussian_fixed_var``, ``self._ob_shape``, ``self.ob_type`` and
        ``self._activation``.  Creates ``self._obs``, ``self.ob_rms``,
        ``self.vpred``, ``self.pdtype``, ``self.pd``, ``self.obs``,
        ``self._act`` and ``self._value``.
        """
        ac_space = self._ac_space
        n_layers = self._num_hid_layers
        width = self._hid_size
        fixed_var = self._gaussian_fixed_var

        # One placeholder per named observation.
        self._obs = {}
        for ob_name, shape in self._ob_shape.items():
            placeholder = U.get_placeholder(
                name="ob_{}_primitive".format(ob_name),
                dtype=tf.float32,
                shape=[None] + shape)
            self._obs[ob_name] = placeholder

        # One running mean/std filter per observation type in use.
        self.ob_rms = {}
        for ob_name in self.ob_type:
            with tf.variable_scope("ob_rms_{}".format(ob_name)):
                self.ob_rms[ob_name] = RunningMeanStd(shape=self._ob_shape[ob_name])

        normalised = []
        for ob_name in self.ob_type:
            rms = self.ob_rms[ob_name]
            normalised.append((self._obs[ob_name] - rms.mean) / rms.std)
        clipped = [tf.clip_by_value(item, -5.0, 5.0) for item in normalised]
        obz = tf.concat(clipped, -1)

        # Value function.
        with tf.variable_scope("vf"):
            v_hidden = obz
            for layer in range(1, n_layers + 1):
                v_dense = tf.layers.dense(
                    v_hidden, width, name="fc%i" % layer,
                    kernel_initializer=U.normc_initializer(1.0))
                v_hidden = self._activation(v_dense)
            self.vpred = tf.layers.dense(
                v_hidden, 1, name="final",
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        # Primitive policy.
        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        with tf.variable_scope("pol"):
            pi_hidden = obz
            for layer in range(1, n_layers + 1):
                pi_dense = tf.layers.dense(
                    pi_hidden, width, name="fc%i" % layer,
                    kernel_initializer=U.normc_initializer(1.0))
                pi_hidden = self._activation(pi_dense)

            if not (fixed_var and isinstance(ac_space, gym.spaces.Box)):
                pdparam = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0], name="final",
                    kernel_initializer=U.normc_initializer(0.01))
            else:
                mean = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0] // 2, name="final",
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # mean * 0.0 + logstd broadcasts logstd to the shape of mean.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        self.pd = pdtype.pdfromflat(pdparam)

        # Action selection: sample when `stochastic` is true, else take the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        self._act = U.function([stochastic] + self.obs, [ac, self.vpred])
        self._value = U.function(self.obs, self.vpred)
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build a tanh MLP value network and a linear policy on normalised observations.

        The policy has no hidden layers: the distribution parameters are a
        single dense layer applied to the clipped, normalised observation.  In
        the fixed-variance branch the log-std is a constant tensor filled with
        0.05, not a trainable variable.  The flat parameters are clipped to
        [-10, 10] before the distribution is built.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer of the value network.
            num_hid_layers: number of hidden layers of the value network.
            gaussian_fixed_var: with a Box action space, use the constant
                log-std instead of predicting all distribution parameters.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))

        self.std = tf.constant(1.0)
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Normalise with the running filter and clip to [-5, 5].
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz
            for i in range(num_hid_layers):
                last_out = tf.nn.tanh(
                    tf.layers.dense(
                        last_out,
                        hid_size,
                        name="fc%i" % (i + 1),
                        kernel_initializer=U.normc_initializer(1.0)))
            self.vpred = tf.layers.dense(
                last_out,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(0.1))[:, 0]

        with tf.variable_scope('pol'):
            # The policy reads obz directly; there are no hidden layers here.
            last_out = obz
            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                mean = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                # Constant (non-trainable) log-std of 0.05 per action dimension.
                logstd = tf.multiply(
                    tf.ones(shape=[1, pdtype.param_shape()[0] // 2]),
                    tf.constant(0.05))
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    last_out,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        pdparam = tf.clip_by_value(pdparam, -10.0, 10.0)
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #23
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build Q-function, value-function and policy networks.

        * "qf": tanh MLP over the normalised *next* observation; the action
          placeholder is concatenated to the input of the last hidden layer.
          The result is stored in ``self.qpred``.
        * "vf": tanh MLP over the normalised current observation
          (``self.vpred``).
        * "pol": tanh MLP over the same normalised current observation.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space; its shape sizes the "act" placeholder.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            gaussian_fixed_var: with a Box action space, use a
                state-independent ``logstd`` variable (see note in the body).
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        leading_dim = None

        ob = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))
        next_ob = U.get_placeholder(
            name="next_ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))
        act = U.get_placeholder(
            name="act", dtype=tf.float32,
            shape=[leading_dim] + list(ac_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('qf'):
            q_hidden = tf.clip_by_value(
                (next_ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            final_layer = num_hid_layers - 1
            for layer in range(num_hid_layers):
                # The action joins the features just before the last hidden layer.
                if layer == final_layer:
                    q_hidden = tf.concat([q_hidden, act], axis=-1)
                q_dense = tf.layers.dense(
                    q_hidden, hid_size, name="fc%i" % (layer + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                q_hidden = tf.nn.tanh(q_dense)
            self.qpred = tf.layers.dense(
                q_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('vf'):
            obz = tf.clip_by_value(
                (ob - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0)
            v_hidden = obz
            for layer in range(num_hid_layers):
                v_dense = tf.layers.dense(
                    v_hidden, hid_size, name="fc%i" % (layer + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                v_hidden = tf.nn.tanh(v_dense)
            self.vpred = tf.layers.dense(
                v_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            pi_hidden = obz
            for layer in range(num_hid_layers):
                pi_dense = tf.layers.dense(
                    pi_hidden, hid_size, name='fc%i' % (layer + 1),
                    kernel_initializer=U.normc_initializer(1.0))
                pi_hidden = tf.nn.tanh(pi_dense)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                import numpy as np
                mean = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0] // 2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # NumPy draws this scaling vector once, while the graph is
                # being built; it is a fixed constant afterwards.
                # NOTE(review): this differs from the usual
                # `mean * 0.0 + logstd` form - confirm it is intentional.
                pdparam = tf.concat(
                    [mean,
                     mean * 0.0
                     + np.random.randn(pdtype.param_shape()[0] // 2) * logstd],
                    axis=1)
            else:
                pdparam = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build tanh MLP value and policy networks on the raw observation.

        A running mean/std filter is created under "obfilter", but the
        networks are fed the unnormalised "ob" placeholder.

        Args:
            ob_space: observation space; must be a ``gym.spaces.Box``.
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each network.
            gaussian_fixed_var: with a Box action space, use a
                state-independent ``logstd`` variable instead of predicting
                all distribution parameters from the last hidden layer.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype
        leading_dim = None

        ob = U.get_placeholder(
            name="ob", dtype=tf.float32,
            shape=[leading_dim] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        with tf.variable_scope('vf'):
            # Normalisation is disabled: the network input is the raw observation.
            obz = ob
            v_hidden = obz
            for layer in range(1, num_hid_layers + 1):
                v_dense = tf.layers.dense(
                    v_hidden, hid_size, name="fc%i" % layer,
                    kernel_initializer=U.normc_initializer(1.0))
                v_hidden = tf.nn.tanh(v_dense)
            self.vpred = tf.layers.dense(
                v_hidden, 1, name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            pi_hidden = obz
            for layer in range(1, num_hid_layers + 1):
                pi_dense = tf.layers.dense(
                    pi_hidden, hid_size, name='fc%i' % layer,
                    kernel_initializer=U.normc_initializer(1.0))
                pi_hidden = tf.nn.tanh(pi_dense)

            if not (gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box)):
                pdparam = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0], name='final',
                    kernel_initializer=U.normc_initializer(0.01))
            else:
                mean = tf.layers.dense(
                    pi_hidden, pdtype.param_shape()[0] // 2, name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # mean * 0.0 + logstd broadcasts logstd to the shape of mean.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)

        # The distribution object is built from the flat parameter tensor.
        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # When `stochastic` is true the action is sampled from the
        # distribution; otherwise the distribution's mode is used.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Exemple #25
0
    def _build(self):
        """Build the meta policy graph that picks one of ``self.num_primitives`` primitives.

        Reads ``self.num_primitives``, ``self._num_hid_layers``,
        ``self._hid_size``, ``self._ob_shape``, ``self.ob_type``,
        ``self.name`` and ``self._activation``.  The network input is the
        clipped, normalised observations concatenated with a one-hot encoding
        of the previous primitive index.
        """
        n_primitives = self.num_primitives
        n_layers = self._num_hid_layers
        width = self._hid_size

        # One placeholder per named observation, plus the previous primitive index.
        self._obs = {}
        for ob_name, shape in self._ob_shape.items():
            self._obs[ob_name] = U.get_placeholder(
                name="ob_{}".format(ob_name),
                dtype=tf.float32,
                shape=[None] + shape)
        prev_primitive = U.get_placeholder(
            name="prev_primitive", dtype=tf.int32, shape=[None])
        self._prev_primitive = prev_primitive

        with tf.variable_scope(self.name):
            self._scope = tf.get_variable_scope().name

            # One running mean/std filter per observation type in use.
            self.ob_rms = {}
            for ob_name in self.ob_type:
                with tf.variable_scope("ob_rms_{}".format(ob_name)):
                    self.ob_rms[ob_name] = RunningMeanStd(
                        shape=self._ob_shape[ob_name])

            normalised = []
            for ob_name in self.ob_type:
                rms = self.ob_rms[ob_name]
                normalised.append((self._obs[ob_name] - rms.mean) / rms.std)
            clipped = [tf.clip_by_value(item, -5.0, 5.0) for item in normalised]
            obz = tf.concat(clipped, -1)

            prev_one_hot = tf.one_hot(
                prev_primitive, n_primitives, name="prev_primitive_one_hot")
            obz = tf.concat([obz, prev_one_hot], -1)

            # Value function.
            with tf.variable_scope("vf"):
                v_hidden = obz
                for layer in range(1, n_layers + 1):
                    v_dense = tf.layers.dense(
                        v_hidden, width, name="fc%d" % layer,
                        kernel_initializer=U.normc_initializer(1.0))
                    v_hidden = self._activation(v_dense)
                self.vpred = tf.layers.dense(
                    v_hidden, 1, name="vpred",
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

            # Meta policy: categorical distribution over primitives.
            with tf.variable_scope("pol"):
                pi_hidden = obz
                for layer in range(1, n_layers + 1):
                    pi_dense = tf.layers.dense(
                        pi_hidden, width, name="fc%i" % layer,
                        kernel_initializer=U.normc_initializer(1.0))
                    pi_hidden = self._activation(pi_dense)
                self.selector = tf.layers.dense(
                    pi_hidden, n_primitives, name="action",
                    kernel_initializer=U.normc_initializer(0.01))
                pdtype = CategoricalPdType(n_primitives)
                self.pdtype = pdtype
                self.pd = pdtype.pdfromflat(self.selector)

        # Action selection: sample when `stochastic` is true, else take the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self.obs = [self._obs[ob_name] for ob_name in self.ob_type]
        act_inputs = [stochastic, self._prev_primitive] + self.obs
        self._act = U.function(act_inputs, [ac, self.vpred])
    def __init__(self, observations, action_space, latent, optimizer=None, sess=None, train=True, beta=1.0,
                 l2=0., lr=0.001, init_scale=0.01, init_bias=0.0, trainable_variance=True, trainable_bias=True,
                 init_logstd=0., scope_name="pi", clip=None, state_dependent_variance=True, **tensors):
        """Build a policy head on top of a latent tensor, with gradient and diagnostic functions.

        Parameters:
        ----------
        observations    tensorflow placeholder in which the observations will be fed

        action_space    action space; passed to ``make_pdtype``

        latent          latent state from which policy distribution parameters should be inferred
                        (flattened before use)

        optimizer       optimizer to store on the instance; an Adam optimizer with
                        learning rate ``lr`` is created when None

        sess            tensorflow session to run calculations in (if None, default session is used)

        train           when true, ``self._build_train`` is called at the end of construction

        beta            forwarded to ``pdfromlatent``; when it is not 1.0, ``self.prob`` is
                        computed with ``boltzmann`` instead of a softmax

        l2              stored as ``self.gamma`` when ``train`` is true

        lr              learning rate for the default Adam optimizer

        init_scale, init_bias, trainable_variance, trainable_bias, init_logstd, clip,
        state_dependent_variance
                        forwarded unchanged to ``pdfromlatent``

        scope_name      scope used to collect the trainable variables in ``self.vars``

        **tensors       tensorflow tensors for additional attributes such as state or mask;
                        merged into the instance ``__dict__``
        """

        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        self.__dict__.update(tensors)

        latent = tf.layers.flatten(latent)

        self.action_space = action_space
        self.pdtype = make_pdtype(action_space)
        self.pd, self.pi = self.pdtype.pdfromlatent(latent, init_scale=init_scale,
                                                    init_bias=init_bias,
                                                    trainable_variance=trainable_variance,
                                                    state_dependent_variance=state_dependent_variance,
                                                    trainable_bias=trainable_bias,
                                                    init_logstd=init_logstd,
                                                    clip=clip, beta=beta)

        self.stochastic = tf.placeholder(dtype=tf.bool, shape=())
        self.action = tf_util.switch(self.stochastic, self.pd.sample(), self.pd.mode())
        self.neglogp = self.pd.neglogp(self.action)
        if beta == 1.0:
            self.prob = tf.nn.softmax(self.pd.flatparam())
        else:
            self.prob = boltzmann(self.pd.flatparam(), beta=beta)
        if optimizer is None:
            self.optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        else:
            self.optimizer = optimizer
        self.sess = sess or tf.get_default_session()
        self.vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope_name)

        # The discrete formulation is attempted first.  Any ordinary failure
        # (for example an action space without an `n` attribute) falls back to
        # the continuous formulation.  `except Exception` replaces the former
        # bare `except:` so KeyboardInterrupt/SystemExit are no longer swallowed.
        try:
            self.action_ph = tf.placeholder(tf.int64, [None], name='targets_placeholder')
            self.action_selected = tf.one_hot(self.action_ph, self.action_space.n)
            # Mean log-probability assigned to the selected actions.
            out = tf.reduce_mean(tf.log(tf.reduce_sum(self.prob * self.action_selected, axis=1)))
            gradients = tf.gradients(out, self.vars)
        except Exception:
            self.action_ph = tf.placeholder(dtype=tf.float32, shape=(None,) + action_space.shape,
                                            name='targets_placeholder')
            gradients = tf.gradients(-self.pd.neglogp(self.action_ph), self.vars)
        self.cont = not isinstance(self.action_space, Discrete)

        self.compute_gradients = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[gradients, tf.exp(- self.pd.neglogp(self.action_ph)), - self.pd.neglogp(self.action_ph),
                     self.pd.mean]
        )
        self.debug = tf_util.function(
            inputs=[self.X, self.action_ph],
            outputs=[gradients, self.prob, self.action_ph]
        )
        self.set_from_flat = tf_util.SetFromFlat(self.vars)
        if self.cont:
            # Coefficient of determination between fed actions and the policy mean.
            total_error = tf.reduce_sum(tf.square(self.action_ph - tf.reduce_mean(self.action_ph, axis=0)), axis=0)
            unexplained_error = tf.reduce_sum(tf.square(self.action_ph - self.pd.mean), axis=0)
            self.accuracy = 1 - (unexplained_error / total_error)
        else:
            # Fraction of fed actions that equal the distribution's mode.
            self.accuracy = tf.reduce_mean(tf.cast(tf.math.equal(self.pd.mode(), self.action_ph), tf.float32))
        self.entropy = tf.reduce_mean(self.pd.entropy())
        if train:
            self.gamma = l2
            self._build_train(cont=self.cont, state_dependent_variance=state_dependent_variance)
        self.pdf = tf.exp(self.pd.logp(self.action_ph))
Exemple #27
0
    def _init(self, ob_space, ac_space, hid_size, num_hid_layers, lstm_hid_size, kind):
        """Assemble a recurrent actor-critic graph over physics and frame inputs.

        Args:
            ob_space: tuple; element 0 describes the physics vector and element 1
                the image frames (only ``.shape`` of each is read here).
            ac_space: action space handed to ``make_pdtype``.
            hid_size: width of every hidden layer in the value and policy heads.
            num_hid_layers: number of hidden layers in each head.
            lstm_hid_size: number of units of the LSTM cell.
            kind: ``'small'`` or ``'large'``; selects the convolutional stack.

        Raises:
            NotImplementedError: if ``kind`` is neither ``'small'`` nor ``'large'``.
        """
        print("This is lstm policy for only sensors.")
        assert isinstance(ob_space, tuple)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        # Leading dimension is left unspecified (batch / sequence length).
        physics_in = U.get_placeholder(name="ob_physics", dtype=tf.float32,
                                       shape=[None] + list(ob_space[0].shape))
        frames_in = U.get_placeholder(name="ob_frames", dtype=tf.float32,
                                      shape=[None] + list(ob_space[1].shape))

        # Physics branch: normalise with running statistics, then clip to [-5, 5].
        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space[0].shape)
        physics_norm = (physics_in - self.ob_rms.mean) / self.ob_rms.std
        physics_feat = tf.clip_by_value(physics_norm, -5.0, 5.0)

        # Frame branch: rescale by 255, then run the selected conv stack.
        frames = frames_in / 255.0

        # Each entry is (num_filters, layer_name, kernel_size, stride).
        if kind == 'small':  # from A3C paper
            conv_layers = (
                (16, "l1", [8, 8], [4, 4]),
                (32, "l2", [4, 4], [2, 2]),
            )
            dense_units = 256
        elif kind == 'large':  # Nature DQN
            conv_layers = (
                (32, "l1", [8, 8], [4, 4]),
                (64, "l2", [4, 4], [2, 2]),
                (64, "l3", [3, 3], [1, 1]),
            )
            dense_units = 512
        else:
            raise NotImplementedError

        for n_filters, conv_name, kernel, stride in conv_layers:
            frames = tf.nn.relu(U.conv2d(frames, n_filters, conv_name, kernel, stride, pad="VALID"))
        frames = U.flattenallbut0(frames)
        frames = tf.nn.relu(
            tf.layers.dense(frames, dense_units, name='lin',
                            kernel_initializer=U.normc_initializer(1.0)))

        # LSTM layer for memory; its state is fed in and read back explicitly.
        cell = tf.contrib.rnn.BasicLSTMCell(lstm_hid_size, state_is_tuple=True, name="rnn")
        self.state_init = (
            np.zeros((1, cell.state_size.c), np.float32),
            np.zeros((1, cell.state_size.h), np.float32),
        )
        c_in = U.get_placeholder(name="state_c", dtype=tf.float32,
                                 shape=(None, cell.state_size.c))
        h_in = U.get_placeholder(name="state_h", dtype=tf.float32,
                                 shape=(None, cell.state_size.h))
        self.state_in = (c_in, h_in)

        lstm_outputs, (lstm_c, lstm_h) = cell(frames, tf.contrib.rnn.LSTMStateTuple(c_in, h_in))
        self.state_out = (lstm_c, lstm_h)

        rnn_out = tf.reshape(lstm_outputs, (-1, lstm_hid_size))

        # Join the recurrent frame features with the normalised physics vector.
        joint_feat = tf.concat((rnn_out, physics_feat), axis=-1)

        # Value head: ReLU hidden layers followed by a scalar output.
        with tf.variable_scope("vf"):
            hidden = joint_feat
            for layer_no in range(1, num_hid_layers + 1):
                pre_act = tf.layers.dense(hidden, hid_size, name="fc%i" % layer_no,
                                          kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.relu(pre_act)
            value_out = tf.layers.dense(hidden, 1, name='final',
                                        kernel_initializer=U.normc_initializer(1.0))
            self.vpred = value_out[:, 0]

        # Policy head: tanh hidden layers followed by the distribution parameters.
        with tf.variable_scope("pol"):
            hidden = joint_feat
            for layer_no in range(1, num_hid_layers + 1):
                pre_act = tf.layers.dense(hidden, hid_size, name='fc%i' % layer_no,
                                          kernel_initializer=U.normc_initializer(1.0))
                hidden = tf.nn.tanh(pre_act)
            logits = tf.layers.dense(hidden, pdtype.param_shape()[0], name='logits',
                                     kernel_initializer=U.normc_initializer(0.01))
            self.pd = pdtype.pdfromflat(logits)

        # `stochastic` picks between sampling and the distribution mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        action = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, physics_in, frames_in, c_in, h_in],
                               [action, self.vpred, lstm_c, lstm_h])
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0):
        """Build the option-based policy graph and its callable functions.

        Two observation views are used: the full normalised observation
        (``obz``) and a "pure" view that drops the trailing ``ac_space_dim``
        entries (``obz_pure``). Separate value and option-policy stacks are
        built on each view; the action policy is built on the pure view only.

        Args:
            ob_space: ``gym.spaces.Box`` observation space (asserted).
            ac_space: action space; ``ac_space.shape[0]`` is read, so it must
                have a non-empty shape.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each stack.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, use a
                state-independent ``logstd`` variable per option.
            num_options: number of options passed to ``dense3D2`` and used to
                size the ``logstd`` variable.
            dc: stored on ``self.dc``; not otherwise used in this method.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.ac_space_dim = ac_space.shape[0]
        self.ob_space_dim = ob_space.shape[0]
        self.dc = dc
        self.last_action = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.last_action_init = tf.zeros(ac_space.shape, dtype=tf.float32)
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        # create a filter for the pure shape, meaning excluding u[k-1]
        obs_shape_pure = ((self.ob_space_dim - self.ac_space_dim), )

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)
        with tf.variable_scope("obfilter_pure"):
            self.ob_rms_only = RunningMeanStd(shape=obs_shape_pure)

        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        obz_pure = tf.clip_by_value(
            (ob[:, :-self.ac_space_dim] - self.ob_rms_only.mean) /
            self.ob_rms_only.std, -5.0, 5.0)

        # --- value function: one stack per option ---------------------------
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "vffc0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "vffc1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "vfff0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "vfff1",
                            weight_init=U.normc_initializer(1.0))

        #self.vpred = dense3D2(last_out, 1, "vffinal", option, num_options=num_options, weight_init=U.normc_initializer(1.0))[:,0]
        # Only the first entry of `option` decides which value stack is used.
        # NOTE(review): assumes U.switch(cond, a, b) yields `a` when cond is
        # nonzero, i.e. option 1 -> last_out1 -- confirm against tf_util.
        self.vpred = U.switch(option[0], last_out1, last_out0)[:, 0]

        # --- policy over options: one scalar score per option, softmaxed ----
        last_out0 = obz  # for option 0
        last_out1 = obz_pure  # for option 1
        for i in range(num_hid_layers):
            last_out0 = tf.nn.tanh(
                U.dense(last_out0,
                        hid_size,
                        "oppi0%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
            last_out1 = tf.nn.tanh(
                U.dense(last_out1,
                        hid_size,
                        "oppi1%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        last_out0 = U.dense(last_out0,
                            1,
                            "oppif0",
                            weight_init=U.normc_initializer(1.0))
        last_out1 = U.dense(last_out1,
                            1,
                            "oppif1",
                            weight_init=U.normc_initializer(1.0))
        last_out = tf.concat([last_out0, last_out1], 1)
        self.op_pi = tf.nn.softmax(last_out)

        # --- termination head, fed with the (gradient-stopped) option scores -
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        # Termination is hard-wired to True; the sampled variant is kept for
        # reference:
        #termination_sample = tf.greater(self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),maxval=1.))
        termination_sample = tf.constant([True])

        # --- action policy on the pure observation --------------------------
        # define the angle
        #ctrl_in = tf.reshape([(tf.math.atan2(ob[:,1],ob[:,0])),(ob[:,2])], [-1,2])
        #last_out = ctrl_in
        last_out = obz_pure
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01),
                            bias=False)
            mean = tf.nn.tanh(mean)
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # `mean * 0.0 + ...` broadcasts the per-option logstd to the batch.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
            # `_act` below always fetches `logstd`; without a definition on
            # this path building the function raised UnboundLocalError.
            if isinstance(ac_space, gym.spaces.Box):
                # NOTE(review): assumes the flat Gaussian parameters are laid
                # out as [mean, logstd], matching the branch above -- confirm.
                logstd = pdparam[:, pdtype.param_shape()[0] // 2:]
            else:
                # No log-std exists for this distribution; return an empty
                # tensor so `_act` keeps its four outputs.
                logstd = tf.zeros([0], dtype=tf.float32)

        #self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OPfc%i"%(i+1), weight_init=U.normc_initializer(1.0)))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # Depending on option[0], either use the policy action or pass through
        # the trailing `ac_space_dim` observation entries (no gradient).
        ac = U.switch(option[0], ac,
                      tf.stop_gradient(ob[:, -self.ac_space_dim:]))
        ac = tf.clip_by_value(ac, -1.0, 1.0)
        #ac = U.switch(option[0], tf.constant(1.0), tf.constant(0.0))
        self.last_action = tf.stop_gradient(ac)
        self._act = U.function([stochastic, ob, option],
                               [ac, self.vpred, last_out, logstd])

        self._get_v = U.function([ob, option], [self.vpred])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op = U.function([ob], [self.op_pi])
Exemple #29
0
    def _build(self):
        """Build value and policy networks for every context and pick our own.

        For each of ``self._config.num_contexts`` contexts a separate set of
        observation placeholders is created, while the ``vf`` and ``pol``
        variables are shared across contexts through ``tf.AUTO_REUSE``.
        The placeholders, distribution and value prediction belonging to
        context ``self._id`` are exposed as ``self.ob``, ``self.pd`` and
        ``self.vpred``; ``self._act`` and ``self._value`` are built on them.
        """
        ac_space = self._ac_space
        num_hid_layers = self._num_hid_layers
        gaussian_fixed_var = self._gaussian_fixed_var

        # Work on a private copy: padding with `+=` below would otherwise
        # extend `self._hid_size` in place whenever it is already a list.
        hid_size = self._hid_size
        hid_size = list(hid_size) if isinstance(hid_size, list) else [hid_size]
        if len(hid_size) != num_hid_layers:
            # Repeat the last width until there is one entry per layer.
            hid_size += [hid_size[-1]] * (num_hid_layers - len(hid_size))

        self.obs = []
        self.pds = []

        for j in range(self._config.num_contexts):
            # obs: one placeholder per observation key, named per context
            _ob = {}
            for ob_name, ob_shape in self._ob_shape.items():
                _ob[ob_name] = U.get_placeholder(
                    name="ob_{}/from_{}".format(ob_name, j),
                    dtype=tf.float32,
                    shape=[None] + ob_shape)

            # obs normalization
            if self._config.obs_norm == 'learn':
                obz = [(_ob[ob_name] - self.ob_rms[ob_name].mean) /
                       self.ob_rms[ob_name].std for ob_name in self.ob_type]
            else:
                obz = [_ob[ob_name] for ob_name in self.ob_type]

            obz = [tf.clip_by_value(ob, -5.0, 5.0) for ob in obz]
            obz = tf.concat(obz, -1)

            # value function
            with tf.variable_scope('vf', reuse=tf.AUTO_REUSE):
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = self._activation(
                        tf.layers.dense(
                            last_out,
                            hid_size[i],
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))
                vpred = tf.layers.dense(
                    last_out,
                    1,
                    name="final",
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]
                if j == self._id:
                    self.vpred = vpred

            # policy
            pdtype = make_pdtype(ac_space)
            if j == self._id:
                self.pdtype = pdtype
            with tf.variable_scope('pol', reuse=tf.AUTO_REUSE):
                last_out = obz
                for i in range(num_hid_layers):
                    last_out = self._activation(
                        tf.layers.dense(
                            last_out,
                            hid_size[i],
                            name="fc%i" % (i + 1),
                            kernel_initializer=U.normc_initializer(1.0)))

                if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                    mean = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0] // 2,
                        name="final",
                        kernel_initializer=U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    # `mean * 0.0 + logstd` broadcasts logstd to the batch.
                    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                else:
                    pdparam = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0],
                        name="final",
                        kernel_initializer=U.normc_initializer(0.01))

            self.obs.append([_ob[ob_name] for ob_name in self.ob_type])
            self.pds.append(pdtype.pdfromflat(pdparam))

        self.ob = self.obs[self._id]
        self.pd = self.pds[self._id]

        # sample action
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic] + self.ob, [ac, self.vpred])
        self._value = U.function([stochastic] + self.ob, self.vpred)
Exemple #30
0
    def __init__(self, sess, ob_space, ac_space, nbatch, nsteps, reuse=False,
                 name='policy', args=None): #pylint: disable=W0613
        """Build an MLP actor-critic with optionally shared trunk layers.

        The first ``args.num_sharing_layers`` layers are shared; the remaining
        layers are duplicated for the policy, the value function and, when
        enabled, the state-dependent log-std and the prediction head.

        Args:
            sess: TensorFlow session used by the ``step``/``value`` closures.
            ob_space: observation space passed to ``observation_input``.
            ac_space: action space passed to ``make_pdtype``.
            nbatch: batch size passed to ``observation_input``.
            nsteps: unused.
            reuse: forwarded to ``tf.variable_scope``.
            name: variable scope name.
            args: configuration object; the attributes read here are
                ``policy_variance_state_dependent``, ``ac_fn``,
                ``hidden_sizes``, ``num_sharing_layers``, ``num_layers``,
                ``logstd`` and ``coef_predict_task``. Despite the ``None``
                default it is required.
        """
        policy_variance_state_dependent = args.policy_variance_state_dependent
        ac_fn = args.ac_fn
        hidden_sizes = args.hidden_sizes
        num_sharing_layers = args.num_sharing_layers
        num_layers = args.num_layers
        assert ac_fn in ['tanh', 'sigmoid', 'relu']

        # Accept either a single width (repeated num_layers times) or a list.
        if isinstance(hidden_sizes, int):
            assert num_layers is not None
            hidden_sizes = [hidden_sizes] * num_layers
        if num_layers is None:
            num_layers = len(hidden_sizes)
        assert num_layers == len(hidden_sizes)


        # print(f'Policy hidden_sizes:{hidden_sizes}')

        self.pdtype = make_pdtype(ac_space)

        with tf.variable_scope(name, reuse=reuse):
            X, processed_x = observation_input(ob_space, nbatch)

            activ = getattr( tf.nn, ac_fn )
            processed_x = tf.layers.flatten(processed_x)

            # --- share layers
            for ind_layer in range(num_sharing_layers):
                processed_x = activ( fc(processed_x, f'share_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)) )

            # --- policy
            pi_h = processed_x
            for ind_layer in range( num_sharing_layers, num_layers ):
                pi_h = activ(fc(pi_h, f'pi_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))

            from gym import spaces
            params_addtional = {}
            if policy_variance_state_dependent and isinstance( ac_space, spaces.Box ):
                latent_logstd = processed_x
                for ind_layer in range(num_sharing_layers, num_layers):
                    latent_logstd = activ(fc(latent_logstd, f'logstd_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
                params_addtional['latent_logstd'] = latent_logstd

            self.pd, self.pi = self.pdtype.pdfromlatent(pi_h, init_scale=0.01, logstd_initial=args.logstd, **params_addtional)


            # --- value function
            vf_h = processed_x
            for ind_layer in range( num_sharing_layers, num_layers ):
                vf_h = activ(fc(vf_h, f'vf_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
            vf = fc(vf_h, 'vf', 1)[:,0]



            a_sample = self.pd.sample()
            neglogp_sample = self.pd.neglogp(a_sample)
            self.initial_state = None


            # --- predict function
            # The action fed to the predictor is chosen by the string fed to
            # `self.A_type`: 'pl' (default) uses the placeholder `self.A`,
            # 'det' uses the deterministic mode, anything else the sample.
            if args.coef_predict_task > 0:
                import tensorflow.contrib.distributions as dists
                # `spaces` is the module imported above; a bare `Box` is not
                # guaranteed to be in scope here.
                assert isinstance( ac_space, spaces.Box ), 'Only Implement for Box action space'
                # placeholder_with_default takes (input, shape); the dtype is
                # inferred from the default value.
                A_type = tf.placeholder_with_default('pl', shape=())
                A_pl = self.pdtype.sample_placeholder([None])
                self.A = A_pl
                self.A_type = A_type

                A_input_1 = U.switch( tf.equal( A_type, 'det' ), self.pd.mode(), a_sample )
                A_input = U.switch( tf.equal( A_type, 'pl' ), A_pl,A_input_1)
                # Join features and action along the feature axis.
                predict_h = tf.concat( (processed_x, A_input), axis=1 )
                for ind_layer in range(num_sharing_layers, num_layers):
                    predict_h = activ(fc(predict_h, f'predict_fc{ind_layer}', nh=hidden_sizes[ind_layer], init_scale=np.sqrt(2)))
                # The output layer gets its own scope name; reusing the last
                # hidden layer's name would collide with (or, with no hidden
                # layers, depend on a stale) `ind_layer`.
                predict_mean = fc(predict_h, 'predict_mean', nh=ob_space.shape[0], init_scale=np.sqrt(2))

                # Identity initial value for the lower-triangular scale factor.
                predict_cov_init_value = np.identity( ob_space.shape[0], dtype=np.float32 )
                predict_cov = tf.get_variable( name='predict_cov', shape=predict_cov_init_value.shape, initializer=tf.constant_initializer(predict_cov_init_value) )
                predict_dist = dists.MultivariateNormalTriL( predict_mean, predict_cov )
                self.predict_dist = predict_dist

            scope_model = tf.get_variable_scope().name
            self.variables_all = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_model)
            self.variables_trainable = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope_model)


        #--- set logstd
        # if isinstance( ac_space, Box ):
        # if not policy_variance_state_dependent:
        #     logstd_pl, _ = observation_input( ac_space, batch_size=1, name='ac' )
        #     assign_logstd = tf.assign( self.pdtype.logstd, logstd_pl )
        #     set_logstd_entity = U.function([logstd_pl], assign_logstd)
        #     def set_logstd(logstd_new):
        #         # if isinstance( logstd_new, float  ):
        #         #     logstd_new = [[logstd_new] * ac_space.shape[0]]
        #         set_logstd_entity(logstd_new)
        #     self.set_logstd = set_logstd
        # self.get_logstd = U.function([], self.pdtype.logstd)

        def step(ob, *_args, **_kwargs):
            """Sample an action; return (action, value, initial_state, neglogp)."""
            a, v, neglogp = sess.run([a_sample, vf, neglogp_sample], {X:ob})
            return a, v, self.initial_state, neglogp

        def value(ob, *_args, **_kwargs):
            """Return the value prediction for `ob`."""
            return sess.run(vf, {X:ob})

        def step_policyflat(ob, *_args, **_kwargs):
            """Like `step`, additionally returning the flat distribution parameters."""
            a, v, neglogp, polciyflat = sess.run([a_sample, vf, neglogp_sample, self.pd.flatparam()], {X:ob}) #TODO: TEST flat for discrete action space
            return a, v, self.initial_state, neglogp, polciyflat

        def step_test(ob, *_args, **_kwargs):
            """Return the distribution mode, wrapped in a one-element list."""
            a = sess.run([self.pd.mode()], {X:ob})
            return a

        self.X = X
        self.vf = vf
        self.step = step
        self.step_policyflat = step_policyflat
        self.value = value
        self.step_test = step_test
Exemple #31
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True,
              num_options=2,
              dc=0,
              w_intfc=True):
        """Build the option-based policy graph with an interest function.

        Five independent tanh stacks are built on the normalised observation:
        value ("vffc"), termination ("termfc"), action policy ("polfc"),
        interest function ("intfc") and policy over options ("OP").

        Args:
            ob_space: ``gym.spaces.Box`` observation space (asserted).
            ac_space: action space passed to ``make_pdtype``.
            hid_size: width of every hidden layer.
            num_hid_layers: number of hidden layers in each stack.
            gaussian_fixed_var: when true and ``ac_space`` is a Box, use a
                state-independent ``logstd`` variable per option.
            num_options: number of options; sizes the option-dependent heads.
            dc: stored on ``self.dc``; not otherwise used in this method.
            w_intfc: stored on ``self.w_intfc``; not otherwise used in this
                method.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.w_intfc = w_intfc
        self.state_in = []
        self.state_out = []
        self.dc = dc
        self.num_options = num_options
        self.pdtype = pdtype = make_pdtype(ac_space)
        sequence_length = None

        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[sequence_length] + list(ob_space.shape))
        option = U.get_placeholder(name="option", dtype=tf.int32, shape=[None])

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        # Normalise with running statistics, then clip to [-5, 5].
        obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std, -5.0,
                               5.0)
        # --- value function ---
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "vffc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        # NOTE(review): dense3D2 presumably applies per-option weights chosen
        # by `option` -- confirm against its definition.
        self.vpred = dense3D2(last_out,
                              1,
                              "vffinal",
                              option,
                              num_options=num_options,
                              weight_init=U.normc_initializer(1.0))[:, 0]

        # --- termination function ---
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "termfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        # stop_gradient: the termination head does not back-propagate into
        # the "termfc" layers.
        self.tpred = tf.nn.sigmoid(
            dense3D2(tf.stop_gradient(last_out),
                     1,
                     "termhead",
                     option,
                     num_options=num_options,
                     weight_init=U.normc_initializer(1.0)))[:, 0]
        # True where tpred exceeds a uniform random draw in [0, 1).
        termination_sample = tf.greater(
            self.tpred, tf.random_uniform(shape=tf.shape(self.tpred),
                                          maxval=1.))

        # --- action policy ---
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "polfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
            mean = dense3D2(last_out,
                            pdtype.param_shape()[0] // 2,
                            "polfinal",
                            option,
                            num_options=num_options,
                            weight_init=U.normc_initializer(0.01))
            logstd = tf.get_variable(
                name="logstd",
                shape=[num_options, 1,
                       pdtype.param_shape()[0] // 2],
                initializer=tf.zeros_initializer())
            # Only option[0] selects the logstd row; `mean * 0.0 + ...`
            # broadcasts it to the batch.
            pdparam = U.concatenate([mean, mean * 0.0 + logstd[option[0]]],
                                    axis=1)
        else:
            pdparam = U.dense(last_out,
                              pdtype.param_shape()[0], "polfinal",
                              U.normc_initializer(0.01))
        self.pd = pdtype.pdfromflat(pdparam)
        # `stochastic` picks between sampling and the distribution mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())

        # self.op_pi = tf.nn.softmax(U.dense(tf.stop_gradient(last_out), num_options, "OP", weight_init=U.normc_initializer(1.0)))
        # pdb.set_trace()
        # self.op_pi = tf.constant(1./num_options)

        # --- interest function: one sigmoid output per option ---
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "intfc%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.intfc = tf.sigmoid(
            U.dense(last_out,
                    num_options,
                    "intfcfinal",
                    weight_init=U.normc_initializer(1.0)))

        # --- policy over options: softmax over num_options outputs ---
        last_out = obz
        for i in range(num_hid_layers):
            last_out = tf.nn.tanh(
                U.dense(last_out,
                        hid_size,
                        "OP%i" % (i + 1),
                        weight_init=U.normc_initializer(1.0)))
        self.op_pi = tf.nn.softmax(
            U.dense(last_out,
                    num_options,
                    "OPfinal",
                    weight_init=U.normc_initializer(1.0)))

        # Callable wrappers around the graph outputs built above.
        self._act = U.function([stochastic, ob, option], [ac])
        self.get_term = U.function([ob, option], [termination_sample])
        self.get_tpred = U.function([ob, option], [self.tpred])
        self.get_vpred = U.function([ob, option], [self.vpred])
        self._get_op_int = U.function([ob], [self.op_pi, self.intfc])
        self._get_intfc = U.function([ob], [self.intfc])
        self._get_op = U.function([ob], [self.op_pi])