Beispiel #1
0
    def __init__(self,
                 env,
                 observations,
                 latent,
                 estimate_q=False,
                 vf_latent=None,
                 sess=None,
                 **tensors):
        """
        Attach an action distribution and a value (or Q) head to latent features.

        Parameters:
        ----------
        env             RL environment; only env.action_space is read here

        observations    tensorflow placeholder for observations, stored as self.X

        latent          latent tensor from which the action distribution
                        parameters are computed

        estimate_q      if True, build a head named 'q' with one output per
                        discrete action and expose it as both self.q and
                        self.vf (the action space must be gym.spaces.Discrete);
                        otherwise build a single-output head named 'vf'

        vf_latent       latent tensor for the value head (latent is used when None)

        sess            tensorflow session, stored as self.sess without further
                        processing

        **tensors       additional tensors (e.g. state or mask) copied onto the
                        instance as attributes
        """
        # Defaults are set before the extra tensors are copied in, so an entry
        # in **tensors named X, state or initial_state replaces the default.
        self.X = observations
        self.state = tf.constant([])
        self.initial_state = None
        for attr_name, tensor in tensors.items():
            self.__dict__[attr_name] = tensor

        self.sess = sess

        if vf_latent is None:
            vf_latent = latent

        # The value latent is flattened first, then the policy latent; this
        # keeps the graph ops in their original creation order.
        value_features = tf.layers.flatten(vf_latent)
        policy_features = tf.layers.flatten(latent)

        # The distribution family is chosen from the action space.
        self.pdtype = make_pdtype(env.action_space)
        self.pd, self.pi = self.pdtype.pdfromlatent(policy_features,
                                                    init_scale=0.01)

        # Sampled action and its negative log-probability under self.pd.
        self.action = self.pd.sample()
        self.neglogp = self.pd.neglogp(self.action)

        if not estimate_q:
            # Single-output head; drop the trailing axis so self.vf has one
            # entry per row.
            value_out = fc(value_features, 'vf', 1)
            self.vf = value_out[:, 0]
        else:
            assert isinstance(env.action_space, gym.spaces.Discrete)
            self.q = fc(value_features, 'q', env.action_space.n)
            self.vf = self.q
Beispiel #2
0
    def _init(self,
              ob_space,
              ac_space,
              num_units,
              gaussian_fixed_var=True,
              async_update=False):
        """Build a recurrent actor-critic graph with one LSTM per branch.

        The value-function branch ('vf' scope) and the policy branch ('pol'
        scope) each own an LSTM cell whose state is supplied through
        placeholders (c_vf/h_vf and c_pol/h_pol) and returned as
        self.out_hs_vf / self.out_hs_pol, so the caller carries the recurrent
        state between calls.

        Parameters
        ----------
        ob_space : gym.spaces.Box
            Observation space; its shape defines the "ob" placeholder.
        ac_space
            Action space, passed to make_pdtype to pick the distribution type.
        num_units : int
            Number of units of each LSTM cell.
        gaussian_fixed_var : bool
            If true and ac_space is a Box, the policy outputs only the mean
            and a separate "logstd" variable is appended to form the
            distribution parameters.
        async_update : bool
            Forwarded (negated) as use_mpi to RunningMeanStd.

        Sets self.pdtype, self.ob_rms, self.input_{c,h}_{vf,pol}, self.vpred,
        self.out_hs_vf, self.out_hs_pol, self.pd, self.state_in,
        self.state_out and self._act.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(
            ac_space)  # pd: probability distribution

        # Leading dimension is left unspecified (number of observations fed).
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape,
                                         use_mpi=(not async_update))

        with tf.variable_scope('vf'):
            # Normalise observations with the running statistics and clip to
            # [-5, 5]; the same tensor is reused by the policy branch.
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell_vf',
                initializer=U.normc_initializer(1.0))

            # zero_state is only used to obtain the per-sample state shapes
            # for the placeholders below.
            init_c, init_h = lstm.zero_state(1, dtype=tf.float32)

            self.input_c_vf = U.get_placeholder(dtype=tf.float32,
                                                name="c_vf",
                                                shape=[None] +
                                                list(init_c.get_shape()[1:]))
            self.input_h_vf = U.get_placeholder(dtype=tf.float32,
                                                name="h_vf",
                                                shape=[None] +
                                                list(init_h.get_shape()[1:]))

            # The fed observations are treated as ONE sequence: add a batch
            # axis of size 1 for dynamic_rnn and remove it again afterwards.
            inpt_vf = tf.expand_dims(obz, 0)
            out_vf, (new_c, new_h) = tf.nn.dynamic_rnn(
                lstm,
                inpt_vf,
                initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                    self.input_c_vf, self.input_h_vf),
                dtype=tf.float32)
            out_vf = tf.squeeze(out_vf, axis=[0])

            # One value prediction per observation.
            self.vpred = tf.layers.dense(
                out_vf,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))[:, 0]
            self.out_hs_vf = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)

        with tf.variable_scope('pol'):

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell_pol',
                initializer=U.normc_initializer(1.0))

            init_c, init_h = lstm.zero_state(1, dtype=tf.float32)

            self.input_c_pol = U.get_placeholder(dtype=tf.float32,
                                                 name="c_pol",
                                                 shape=[None] +
                                                 list(init_c.get_shape()[1:]))
            self.input_h_pol = U.get_placeholder(dtype=tf.float32,
                                                 name="h_pol",
                                                 shape=[None] +
                                                 list(init_h.get_shape()[1:]))

            inpt_pol = tf.expand_dims(obz, 0)
            # The policy LSTM must be seeded from its OWN state placeholders;
            # seeding it from the value-function state would leave
            # c_pol/h_pol disconnected from the graph and make the state
            # returned in out_hs_pol meaningless to feed back.
            out_pol, (new_c, new_h) = tf.nn.dynamic_rnn(
                lstm,
                inpt_pol,
                initial_state=tf.nn.rnn_cell.LSTMStateTuple(
                    self.input_c_pol, self.input_h_pol),
                dtype=tf.float32)
            out_pol = tf.squeeze(out_pol, axis=[0])
            self.out_hs_pol = tf.nn.rnn_cell.LSTMStateTuple(new_c, new_h)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                # Half of the flat parameters are the mean (network output),
                # the other half a "logstd" variable that does not depend on
                # the observation.
                mean = tf.layers.dense(
                    out_pol,
                    pdtype.param_shape()[0] // 2,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(
                    name="logstd",
                    shape=[1, pdtype.param_shape()[0] // 2],
                    initializer=tf.zeros_initializer())
                # mean * 0.0 + logstd broadcasts logstd to mean's shape.
                pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    out_pol,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # `stochastic` switches between sampling and taking the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        # Inputs: flag, observations, then the four LSTM state placeholders.
        # Outputs: action, value prediction and the new state of each LSTM.
        self._act = U.function([
            stochastic, ob, self.input_c_vf, self.input_h_vf, self.input_c_pol,
            self.input_h_pol
        ], [ac, self.vpred, self.out_hs_vf, self.out_hs_pol])
Beispiel #3
0
    def _init(self,
              ob_space,
              ac_space,
              num_units,
              gaussian_fixed_var=True,
              async_update=False):
        """Build a recurrent actor-critic graph that keeps LSTM state in variables.

        Both the value-function branch ('vf' scope) and the policy branch
        ('pol' scope) store their LSTM state inside the graph:

        * v_lstm_state  - overwritten with the LSTM's final state on every run.
        * ba_state      - copy of v_lstm_state taken when full_path_is_done is
                          true at the start of a run.
        * full_path_is_done - bool variable, set on every run to whether more
                          than one observation was fed.

        The initial LSTM state is v_lstm_state when exactly one observation is
        fed and ba_state otherwise.

        Both branches use state_is_tuple=False so the state is a single tensor
        that fits in one variable and can be passed back as initial_state.
        NOTE(review): the 'vf' state variables therefore have the concatenated
        (c, h) layout, like the 'pol' ones; checkpoints that stored them in a
        different layout will not load into these variables.

        Parameters
        ----------
        ob_space : gym.spaces.Box
            Observation space; its shape defines the "ob" placeholder.
        ac_space
            Action space, passed to make_pdtype to pick the distribution type.
        num_units : int
            Number of units of each LSTM cell.
        gaussian_fixed_var : bool
            If true and ac_space is a Box, the policy outputs only the mean
            and a separate "logstd" variable is appended to form the
            distribution parameters.
        async_update : bool
            Forwarded (negated) as use_mpi to RunningMeanStd.

        Sets self.pdtype, self.ob_rms, self.vpred, self.pd, self.state_in,
        self.state_out and self._act.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        self.pdtype = pdtype = make_pdtype(
            ac_space)  # pd: probability distribution

        # Leading dimension is left unspecified (number of observations fed).
        ob = U.get_placeholder(name="ob",
                               dtype=tf.float32,
                               shape=[None] + list(ob_space.shape))

        # Shared by both branches; starts out True.
        full_path_is_done = tf.get_variable("full_path_is_done",
                                            dtype=tf.bool,
                                            initializer=True,
                                            trainable=False)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape,
                                         use_mpi=(not async_update))

        with tf.variable_scope('vf'):
            # Normalise observations with the running statistics and clip to
            # [-5, 5]; the same tensor is reused by the policy branch.
            obz = tf.clip_by_value((ob - self.ob_rms.mean) / self.ob_rms.std,
                                   -5.0, 5.0)
            last_out = obz

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell',
                initializer=U.normc_initializer(1.0),
                state_is_tuple=False)

            init_lstm_state = lstm.zero_state(1, dtype=tf.float32)
            v_lstm_state = tf.get_variable("v_lstm_state",
                                           dtype=tf.float32,
                                           initializer=init_lstm_state,
                                           trainable=False)
            ba_state = tf.get_variable("ba_state",
                                       dtype=tf.float32,
                                       initializer=init_lstm_state,
                                       trainable=False)
            # Refresh ba_state from the running state only when the flag is set.
            assign_ba_state = tf.cond(
                full_path_is_done,
                lambda: tf.assign(ba_state, v_lstm_state),  # TRUE
                lambda: tf.assign(ba_state, ba_state))  # FALSE
            # Exactly one observation -> continue from the running state;
            # otherwise start from ba_state.
            lstm_state = tf.cond(tf.equal(tf.shape(ob)[0], 1),
                                 lambda: v_lstm_state, lambda: ba_state)
            assign_fpid = tf.assign(full_path_is_done,
                                    tf.math.greater(tf.shape(ob)[0], 1))

            with tf.control_dependencies([assign_ba_state]):
                # The fed observations are treated as ONE sequence (batch 1).
                last_out = tf.expand_dims(last_out, 0)
                # Start from the stored state selected above, so the state
                # written to v_lstm_state is actually carried into later runs.
                last_out, lstm_new_state = tf.nn.dynamic_rnn(
                    lstm, last_out, initial_state=lstm_state, dtype=tf.float32)
                assign_new_state = tf.assign(v_lstm_state, lstm_new_state)
                last_out = tf.squeeze(last_out, axis=[0])

            # Evaluating vpred forces the state and flag updates to run.
            with tf.control_dependencies([assign_new_state, assign_fpid]):
                self.vpred = tf.layers.dense(
                    last_out,
                    1,
                    name='final',
                    kernel_initializer=U.normc_initializer(1.0))[:, 0]

        with tf.variable_scope('pol'):
            last_out = obz

            lstm = tf.contrib.rnn.LSTMCell(
                num_units=num_units,
                name='rnn_cell',
                initializer=U.normc_initializer(1.0),
                state_is_tuple=False)

            init_lstm_state = lstm.zero_state(1, dtype=tf.float32)
            v_lstm_state = tf.get_variable("v_lstm_state",
                                           dtype=tf.float32,
                                           initializer=init_lstm_state,
                                           trainable=False)
            ba_state = tf.get_variable("ba_state",
                                       dtype=tf.float32,
                                       initializer=init_lstm_state,
                                       trainable=False)
            assign_ba_state = tf.cond(
                full_path_is_done,
                lambda: tf.assign(ba_state, v_lstm_state),  # TRUE
                lambda: tf.assign(ba_state, ba_state))  # FALSE
            lstm_state = tf.cond(tf.equal(tf.shape(ob)[0], 1),
                                 lambda: v_lstm_state, lambda: ba_state)
            assign_fpid = tf.assign(full_path_is_done,
                                    tf.math.greater(tf.shape(ob)[0], 1))

            with tf.control_dependencies([assign_ba_state]):
                last_out = tf.expand_dims(last_out, 0)
                last_out, lstm_new_state = tf.nn.dynamic_rnn(
                    lstm, last_out, initial_state=lstm_state, dtype=tf.float32)
                assign_new_state = tf.assign(v_lstm_state, lstm_new_state)
                last_out = tf.squeeze(last_out, axis=[0])

            # Evaluating pdparam forces the state and flag updates to run.
            with tf.control_dependencies([assign_new_state, assign_fpid]):
                if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                    # Half of the flat parameters are the mean (network
                    # output), the other half a "logstd" variable that does
                    # not depend on the observation.
                    mean = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0] // 2,
                        name='final',
                        kernel_initializer=U.normc_initializer(0.01))
                    logstd = tf.get_variable(
                        name="logstd",
                        shape=[1, pdtype.param_shape()[0] // 2],
                        initializer=tf.zeros_initializer())
                    # mean * 0.0 + logstd broadcasts logstd to mean's shape.
                    pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1)
                else:
                    pdparam = tf.layers.dense(
                        last_out,
                        pdtype.param_shape()[0],
                        name='final',
                        kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # `stochastic` switches between sampling and taking the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        ac = U.switch(stochastic, self.pd.sample(), self.pd.mode())
        self._act = U.function([stochastic, ob], [ac, self.vpred])
Beispiel #4
0
    def _init(self,
              ob_space,
              ac_space,
              hid_size,
              num_hid_layers,
              gaussian_fixed_var=True):
        """Build a feed-forward actor-critic graph with separate MLP branches.

        Parameters
        ----------
        ob_space : gym.spaces.Box
            Observation space; its shape defines the "ob" placeholder.
        ac_space
            Action space, passed to make_pdtype to pick the distribution type.
        hid_size : int
            Width of every hidden layer.
        num_hid_layers : int
            Number of tanh hidden layers in each branch.
        gaussian_fixed_var : bool
            If true and ac_space is a Box, the policy outputs only the mean
            and a separate "logstd" variable is appended to form the
            distribution parameters.

        Sets self.pdtype, self.ob_rms, self.vpred, self.pd, self.state_in,
        self.state_out and self._act.
        """
        assert isinstance(ob_space, gym.spaces.Box)

        pdtype = make_pdtype(ac_space)
        self.pdtype = pdtype

        # Leading dimension is left unspecified (number of observations fed).
        ob_shape = [None] + list(ob_space.shape)
        ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=ob_shape)

        with tf.variable_scope("obfilter"):
            self.ob_rms = RunningMeanStd(shape=ob_space.shape)

        def hidden_stack(features):
            # num_hid_layers dense+tanh layers named fc1, fc2, ... created in
            # whatever variable scope is active at the call site.
            for layer_no in range(1, num_hid_layers + 1):
                pre_activation = tf.layers.dense(
                    features,
                    hid_size,
                    name="fc%i" % layer_no,
                    kernel_initializer=U.normc_initializer(1.0))
                features = tf.nn.tanh(pre_activation)
            return features

        with tf.variable_scope('vf'):
            # Normalise observations with the running statistics and clip to
            # [-5, 5]; the same tensor also feeds the policy branch.
            centered = ob - self.ob_rms.mean
            obz = tf.clip_by_value(centered / self.ob_rms.std, -5.0, 5.0)

            value_hidden = hidden_stack(obz)
            value_out = tf.layers.dense(
                value_hidden,
                1,
                name='final',
                kernel_initializer=U.normc_initializer(1.0))
            # One value prediction per observation.
            self.vpred = value_out[:, 0]

        with tf.variable_scope('pol'):
            policy_hidden = hidden_stack(obz)

            if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box):
                # Half of the flat parameters are the mean (network output),
                # the other half a "logstd" variable that does not depend on
                # the observation.
                half_dim = pdtype.param_shape()[0] // 2
                mean = tf.layers.dense(
                    policy_hidden,
                    half_dim,
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))
                logstd = tf.get_variable(name="logstd",
                                         shape=[1, half_dim],
                                         initializer=tf.zeros_initializer())
                # mean * 0.0 + logstd broadcasts logstd to mean's shape.
                tiled_logstd = mean * 0.0 + logstd
                pdparam = tf.concat([mean, tiled_logstd], axis=1)
            else:
                pdparam = tf.layers.dense(
                    policy_hidden,
                    pdtype.param_shape()[0],
                    name='final',
                    kernel_initializer=U.normc_initializer(0.01))

        self.pd = pdtype.pdfromflat(pdparam)

        self.state_in = []
        self.state_out = []

        # `stochastic` switches between sampling and taking the mode.
        stochastic = tf.placeholder(dtype=tf.bool, shape=())
        sampled_action = self.pd.sample()
        mode_action = self.pd.mode()
        ac = U.switch(stochastic, sampled_action, mode_action)
        self._act = U.function([stochastic, ob], [ac, self.vpred])