Code example #1
def test_build_mlp():
    import numpy as np
    import tensorflow as tf

    import utils
    input_layer = tf.placeholder(tf.float32, (None, 10))
    new_layer = utils.build_mlp(input_layer,
                                5,
                                "test",
                                n_layers=2,
                                hidden_dim=500,
                                activation=tf.nn.relu,
                                output_activation=None,
                                reuse=False)
    print(new_layer)
    print('---------------------')
    reuse_layer = utils.build_mlp(input_layer,
                                  5,
                                  "test",
                                  n_layers=2,
                                  hidden_dim=500,
                                  activation=tf.nn.relu,
                                  output_activation=None,
                                  reuse=True)
    print(reuse_layer)
    print(new_layer == reuse_layer)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        org, reuse = sess.run(
            [new_layer, reuse_layer],
            feed_dict={input_layer: np.atleast_2d(np.arange(0, 10))})
        print('org: ', org)
        print('reuse: ', reuse)
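
The test above exercises `utils.build_mlp` with and without variable reuse, but the helper itself is not part of this listing. The following is only a minimal sketch of a build_mlp consistent with the call signature used here (input_layer, output_dim, scope, n_layers, hidden_dim, activation, output_activation, reuse) in TensorFlow 1.x; the actual homework implementation may differ.

import tensorflow as tf


def build_mlp(input_layer,
              output_dim,
              scope,
              n_layers=1,
              hidden_dim=500,
              activation=tf.nn.relu,
              output_activation=None,
              reuse=False):
    # Sketch only: n_layers hidden layers followed by a linear output layer.
    # All variables live under `scope`, so calling again with reuse=True
    # returns a new output tensor that shares the same weights.
    layer = input_layer
    with tf.variable_scope(scope, reuse=reuse):
        for i in range(n_layers):
            layer = tf.layers.dense(layer, hidden_dim, activation=activation,
                                    name='hidden_%d' % i)
        layer = tf.layers.dense(layer, output_dim,
                                activation=output_activation, name='output')
    return layer
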
Code example #2
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        # NOTE: assign the normalized value to a new variable (e.g. 'normalized_state')
        # rather than overwriting 'state'; the original, unnormalized 'state' is still
        # needed below when the predicted delta is added back to it.
        normalized_state = utils.normalize(state, self._init_dataset.state_mean,
                                           self._init_dataset.state_std)
        normalized_action = utils.normalize(action, self._init_dataset.action_mean,
                                            self._init_dataset.action_std)
        normalized_state_dif = utils.build_mlp(
            tf.concat([normalized_state, normalized_action], axis=1),
            self._state_dim,
            scope="dynamics_func",
            n_layers=self._nn_layers,
            reuse=reuse)
        next_state_pred = state + utils.unnormalize(
            normalized_state_dif,
            self._init_dataset.delta_state_mean,
            self._init_dataset.delta_state_std)
        
        return next_state_pred
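
All of the `_dynamics_func` implementations in the examples below realize the same delta-state model described in the docstring: the MLP is fed the normalized state-action pair and predicts a normalized state difference, which is unnormalized and added back to the current state. Using the dataset statistics (mu, sigma), this is

    \hat{s}_{t+1} = s_t + \mu_{\Delta s} + \sigma_{\Delta s} \odot f_\theta\big(\big[\tfrac{s_t - \mu_s}{\sigma_s} \,;\, \tfrac{a_t - \mu_a}{\sigma_a}\big]\big)

where f_\theta is the network built by utils.build_mlp and [ · ; · ] denotes concatenation along the feature axis.
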
Code example #3
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        state_norm = utils.normalize(state, self._init_dataset.state_mean,
                                     self._init_dataset.state_std)
        action_norm = utils.normalize(action, self._init_dataset.action_mean,
                                      self._init_dataset.action_std)
        x = tf.concat([state_norm, action_norm], axis=1)
        x = utils.build_mlp(x,
                            self._state_dim,
                            'dynamic_net',
                            self._nn_layers,
                            reuse=reuse)
        next_state_pred = state + utils.unnormalize(
            x, self._init_dataset.delta_state_mean,
            self._init_dataset.delta_state_std)

        return next_state_pred
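
The `utils.normalize` and `utils.unnormalize` helpers used throughout these examples are not shown in this listing either. A plausible sketch, assuming elementwise standardization with a small epsilon (code example #8 below passes eps=1e-8 explicitly), is the following; the real helpers may differ.

def normalize(x, mean, std, eps=1e-8):
    # Sketch: elementwise standardization; eps guards against zero std.
    return (x - mean) / (std + eps)


def unnormalize(x, mean, std):
    # Sketch: inverse of normalize (the eps term is negligible).
    return x * std + mean
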
Code example #4
    def _dynamics_func(self, state, action, reuse=True):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        print("ModelBasedPolicy - before _dynamics_func")
        ### PROBLEM 1
        ##############
        ### part a ### Normalize state and action by using statistics of self._init_dataset
        ##############
        s_mean = self._init_dataset.state_mean
        a_mean = self._init_dataset.action_mean
        s_std = self._init_dataset.state_std
        a_std = self._init_dataset.action_std
        state_d_m = self._init_dataset.delta_state_mean
        state_d_s = self._init_dataset.delta_state_std
        normalize_state = utils.normalize(state, s_mean, s_std)
        normalize_action = utils.normalize(action, a_mean, a_std)

        ##############
        ### part b ### Concatenate state and action
        ##############
        s_a = tf.concat([normalize_state, normalize_action], axis=1)
        # s_a = np.concatenate((normalize_state,normalize_action),axis = None)
        # d = s_a.shape
        ##############
        ### part c ### Generate NN
        ##############
        # print ("generate the NN")
        # s_a_placeholder = tf.placeholder(tf.float32, [None,d[0]])
        print("generate the NN")
        next_state_prediction = utils.build_mlp(s_a,
                                                self._state_dim,
                                                "nn_prediciton",
                                                n_layers=self._nn_layers,
                                                reuse=reuse)

        print("finish gen NN")

        ##############
        ### pard d ###
        ##############
        delta_state = utils.unnormalize(next_state_prediction, state_d_m,
                                        state_d_s)
        next_state_pred = state + delta_state
        print("ModelBasedPolicy - after _dynamics_func")
        return next_state_pred
Code example #5
    def dynamics_func(self, state, action, reuse):
        """
        Takes a state and action, and predicts the next state

        returns:
            next_state_pred: predicted next state
        """
        state_norm = utils.normalize(state, self.init_dataset.state_mean,
                                     self.init_dataset.state_std)
        action_norm = utils.normalize(action, self.init_dataset.action_mean,
                                      self.init_dataset.action_std)

        # network input is concatenated state, action
        s_a = tf.concat([state_norm, action_norm], axis=1)
        d_next_state_norm = utils.build_mlp(s_a,
                                            self.state_dim,
                                            'prediction',
                                            self.nn_layers,
                                            reuse=reuse)

        d_next_state = utils.unnormalize(d_next_state_norm,
                                         self.init_dataset.delta_state_mean,
                                         self.init_dataset.delta_state_std)

        next_state_pred = d_next_state + state
        return next_state_pred
Code example #6
File: models.py Project: Rolight/t-rex-runner-AI
 def add_prediction_op(self):
     pred = [
         build_mlp(input_placeholder=self.train_inputs[action_id],
                   output_size=2,
                   scope='nndym-%s-action-%d' % (self.name, action_id),
                   **self.mlp_params)
         for action_id in range(self.env_conf['action_type_size'])
     ]
     return pred
Code example #7
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        # self._dynamics_func is supposed to take in a batch.  The input state is assumed to be [None, self._state_dim].
        state_mean, state_std = self._init_dataset.state_mean, self._init_dataset.state_std
        action_mean, action_std = self._init_dataset.action_mean, self._init_dataset.action_std
        normalized_state = utils.normalize(state,
                                           mean=state_mean,
                                           std=state_std)
        normalized_action = utils.normalize(action,
                                            mean=action_mean,
                                            std=action_std)
        input_nn = tf.concat(values=[normalized_state, normalized_action],
                             axis=1)

        normalized_delta_state_pred = utils.build_mlp(
            input_layer=input_nn,
            output_dim=self._state_dim,
            scope="dynamics_model",
            n_layers=self._nn_layers,
            reuse=reuse)

        delta_state_pred = utils.unnormalize(
            normalized_delta_state_pred, self._init_dataset.delta_state_mean,
            self._init_dataset.delta_state_std)
        next_state_pred = tf.add(state, delta_state_pred)

        return next_state_pred
Code example #8
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state                
        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        ## (a) Normalize both the state and action by using the statistics of self._init_dataset and
        ##             the utils.normalize function
        state_mean = self._init_dataset.state_mean
        state_std = self._init_dataset.state_std
        state_normalize = utils.normalize(state,
                                          state_mean,
                                          state_std,
                                          eps=1e-8)

        # # @@@@@@
        # print("state", tf.convert_to_tensor(state).get_shape())
        # print("action", tf.convert_to_tensor(action).get_shape())
        # state (?, 20)
        # action (?, 6)
        action_mean = self._init_dataset.action_mean
        action_std = self._init_dataset.action_std
        action_normalize = utils.normalize(action,
                                           action_mean,
                                           action_std,
                                           eps=1e-8)

        ## (b) Concatenate the normalized state and action
        concatenated = tf.concat([state_normalize, action_normalize],
                                 axis=1,
                                 name='concatenated')

        # (c) Pass the concatenated, normalized state-action tensor through a neural network with
        #     self._nn_layers number of layers using the function utils.build_mlp. The resulting output
        #     is the normalized predicted difference between the next state and the current state
        norm_delta_state = utils.build_mlp(input_layer=concatenated,
                                           output_dim=self._state_dim,
                                           scope='dynamics_func',
                                           n_layers=self._nn_layers,
                                           reuse=reuse)

        # (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
        #     the predicted next state
        delta_state_mean = self._init_dataset.delta_state_mean
        delta_state_std = self._init_dataset.delta_state_std
        next_state_pred = state + utils.unnormalize(
            norm_delta_state, delta_state_mean, delta_state_std)
        return next_state_pred
Code example #9
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        
        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        norm_state = utils.normalize(state, self._init_dataset.state_mean,
                                     self._init_dataset.state_std)
        norm_action = utils.normalize(action, self._init_dataset.action_mean,
                                      self._init_dataset.action_std)
        norm_all = tf.concat((norm_state, norm_action), axis=1)
        # st_ph,ac_ph,nx_st_ph = self._setup_placeholders()
        dy_func = utils.build_mlp(norm_all,
                                  self._state_dim,
                                  scope="DYN_FUNC",
                                  n_layers=self._nn_layers,
                                  reuse=reuse)

        dy_out = utils.unnormalize(dy_func,
                                   self._init_dataset.delta_state_mean,
                                   self._init_dataset.delta_state_std)

        next_state_pred = dy_out + state

        return next_state_pred
Code example #10
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        normalized_state = utils.normalize(state,
                                           mean=self._init_dataset.state_mean,
                                           std=self._init_dataset.state_std)
        normalized_action = utils.normalize(
            action,
            mean=self._init_dataset.action_mean,
            std=self._init_dataset.action_std)

        normalized_state_action = tf.concat(
            [normalized_state, normalized_action], axis=1)

        normalized_state_delta = utils.build_mlp(
            input_layer=normalized_state_action,
            output_dim=self._state_dim,
            scope='dynamics_func',
            n_layers=self._nn_layers,
            reuse=reuse)

        state_delta = utils.unnormalize(
            normalized_state_delta,
            mean=self._init_dataset.delta_state_mean,
            std=self._init_dataset.delta_state_std)

        next_state_pred = state + state_delta

        return next_state_pred
Code example #11
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        # convert to float32
        state = tf.dtypes.cast(state, dtype=tf.float32)
        action = tf.dtypes.cast(action, dtype=tf.float32)

        mean_state = self._init_dataset.state_mean
        std_state = self._init_dataset.state_std
        state_norm = utils.normalize(state, mean_state, std_state)
        mean_action = self._init_dataset.action_mean
        std_action = self._init_dataset.action_std
        action_norm = utils.normalize(action, mean_action, std_action)
        state_action = tf.concat([state_norm, action_norm], axis=1)
        delta_state_prediction_norm = utils.build_mlp(state_action,
                                                      self._state_dim,
                                                      'state_prediction',
                                                      n_layers=self._nn_layers,
                                                      reuse=reuse)
        delta_mean_state = self._init_dataset.delta_state_mean
        delta_std_state = self._init_dataset.delta_state_std
        delta_state_prediction = utils.unnormalize(delta_state_prediction_norm,
                                                   delta_mean_state,
                                                   delta_std_state)
        next_state_pred = state + delta_state_prediction
        return next_state_pred
Code example #12
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        #print(self._init_dataset.shape)
        # Normalize state and action
        #d_mean, d_std = np.mean(self._init_dataset), np.std(self._init_dataset)
        n_state = utils.normalize(state, self._init_dataset.state_mean,
                                  self._init_dataset.state_std)
        n_action = utils.normalize(action, self._init_dataset.action_mean,
                                   self._init_dataset.action_std)
        # Predict next state based on the current state and action
        n_sa = tf.concat((n_state, n_action), axis=1)
        n_nsp = utils.build_mlp(n_sa,
                                output_dim=self._state_dim,
                                scope="f_transition",
                                n_layers=self._nn_layers,
                                reuse=reuse)
        next_state_pred = state + utils.unnormalize(
            n_nsp, self._init_dataset.delta_state_mean,
            self._init_dataset.delta_state_std)

        return next_state_pred
Code example #13
    def _dynamics_func(self, state, action, reuse=False):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        norm_state = utils.normalize(state, self._init_dataset.state_mean,
                                     self._init_dataset.state_std)
        norm_action = utils.normalize(action, self._init_dataset.action_mean,
                                      self._init_dataset.action_std)
        norm_state_action = tf.concat([norm_state, norm_action], 1)
        norm_delta_state_pred = utils.build_mlp(norm_state_action,
                                                self._state_dim,
                                                "dynamic",
                                                n_layers=self._nn_layers,
                                                reuse=reuse)
        delta_state_pred = utils.unnormalize(
            norm_delta_state_pred, self._init_dataset.delta_state_mean,
            self._init_dataset.delta_state_std)

        next_state_pred = state + delta_state_pred

        return next_state_pred
Code example #14
    def _dynamics_func(self, state, action, reuse):
        """
            Takes as input a state and action, and predicts the next state

            returns:
                next_state_pred: predicted next state

            implementation details (in order):
                (a) Normalize both the state and action by using the statistics of self._init_dataset and
                    the utils.normalize function
                (b) Concatenate the normalized state and action
                (c) Pass the concatenated, normalized state-action tensor through a neural network with
                    self._nn_layers number of layers using the function utils.build_mlp. The resulting output
                    is the normalized predicted difference between the next state and the current state
                (d) Unnormalize the delta state prediction, and add it to the current state in order to produce
                    the predicted next state

        """
        ### PROBLEM 1
        ### YOUR CODE HERE
        state_ = utils.normalize(x=state,
                                 mean=self._init_dataset.state_mean,
                                 std=self._init_dataset.state_std)
        action_ = utils.normalize(x=action,
                                  mean=self._init_dataset.action_mean,
                                  std=self._init_dataset.action_std)
        input_ = tf.concat(values=[state_, action_], axis=-1)
        residual = utils.build_mlp(input_layer=input_,
                                   output_dim=self._state_dim,
                                   scope="dynamics",
                                   n_layers=self._nn_layers,
                                   reuse=reuse)
        residual = utils.unnormalize(x=residual,
                                     mean=self._init_dataset.delta_state_mean,
                                     std=self._init_dataset.delta_state_std)
        next_state_pred = state + residual

        return next_state_pred
Code example #15
    def _dynamics_func(self, state, action, reuse):

        # get dataset statistics
        state_std = self._init_dataset.state_std
        state_mean = self._init_dataset.state_mean

        action_std = self._init_dataset.action_std
        action_mean = self._init_dataset.action_mean

        delta_state_std = self._init_dataset.delta_state_std
        delta_state_mean = self._init_dataset.delta_state_mean

        # normalize input data
        state_norm = utils.normalize(state, state_mean, state_std)
        action_norm = utils.normalize(action, action_mean, action_std)

        # perform delta prediction
        inp = tf.concat([state_norm, action_norm], 1)
        out = utils.build_mlp(inp, self._state_dim, 'policy', reuse=reuse)

        # predict the next state
        next_state_pred = state + utils.unnormalize(out, delta_state_mean,
                                                    delta_state_std)
        return next_state_pred
Code example #16
File: ppo.py Project: xjwhy/Carla-ppo
    def __init__(self,
                 input_states,
                 taken_actions,
                 action_space,
                 scope_name,
                 initial_std=0.4,
                 initial_mean_factor=0.1,
                 pi_hidden_sizes=(500, 300),
                 vf_hidden_sizes=(500, 300)):
        """
            input_states [batch_size, width, height, depth]:
                Input images to predict actions for
            taken_actions [batch_size, num_actions]:
                Placeholder of taken actions for training
            action_space (gym.spaces.Box):
                Continuous action space of our agent
            scope_name (string):
                Variable scope name for the policy graph
            initial_std (float):
                Initial value of the std used in the gaussian policy
            initial_mean_factor (float):
                Variance scaling factor for the action mean prediction layer
            pi_hidden_sizes (list):
                List of layer sizes used to construct action predicting MLP
            vf_hidden_sizes (list):
                List of layer sizes used to construct value predicting MLP
        """

        num_actions = action_space.shape[0]
        action_min = action_space.low
        action_max = action_space.high

        with tf.variable_scope(scope_name):
            # Policy branch π(a_t | s_t; θ)
            self.pi = build_mlp(input_states,
                                hidden_sizes=pi_hidden_sizes,
                                activation=tf.nn.relu,
                                output_activation=tf.nn.relu)
            self.action_mean = tf.layers.dense(
                self.pi,
                num_actions,
                activation=tf.nn.tanh,
                kernel_initializer=tf.initializers.variance_scaling(
                    scale=initial_mean_factor),
                name="action_mean")
            self.action_mean = action_min + (
                (self.action_mean + 1) / 2) * (action_max - action_min)
            self.action_logstd = tf.Variable(np.full((num_actions),
                                                     np.log(initial_std),
                                                     dtype=np.float32),
                                             name="action_logstd")

            # Value branch V(s_t; θ)
            if vf_hidden_sizes is None:
                self.vf = self.pi  # Share features if None
            else:
                self.vf = build_mlp(input_states,
                                    hidden_sizes=vf_hidden_sizes,
                                    activation=tf.nn.relu,
                                    output_activation=tf.nn.relu)
            self.value = tf.squeeze(tf.layers.dense(self.vf,
                                                    1,
                                                    activation=None,
                                                    name="value"),
                                    axis=-1)

            # Create graph for sampling actions
            self.action_normal = tfp.distributions.Normal(
                self.action_mean,
                tf.exp(self.action_logstd),
                validate_args=True)
            self.sampled_action = tf.squeeze(self.action_normal.sample(1),
                                             axis=0)

            # Clip action space to min max
            self.sampled_action = tf.clip_by_value(self.sampled_action,
                                                   action_min, action_max)

            # Get the log probability of taken actions
            # log π(a_t | s_t; θ)
            self.action_log_prob = tf.reduce_sum(
                self.action_normal.log_prob(taken_actions),
                axis=-1,
                keepdims=True)
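
Note on the action head above: the tanh output in [-1, 1] is mapped onto the action bounds by the affine rescaling

    \mu_a = a_{\min} + \frac{\tanh(h) + 1}{2}\,(a_{\max} - a_{\min}),

which is exactly what the line `action_min + ((self.action_mean + 1) / 2) * (action_max - action_min)` computes, elementwise per action dimension.
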
Code example #17
    def build_policy_network(self):
        sy_ob_no = tf.placeholder(shape=[None, self.ob_dim],
                                  name="ob",
                                  dtype=tf.float32)
        if self.discrete:
            sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
        else:
            sy_ac_na = tf.placeholder(shape=[None, self.ac_dim],
                                      name="ac",
                                      dtype=tf.float32)

        # Define a placeholder for advantages
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)

        if self.discrete:
            # YOUR_CODE_HERE
            sy_logits_na = build_mlp(sy_ob_no,
                                     self.ac_dim,
                                     'policy_network',
                                     n_layers=self.n_layers,
                                     size=self.size)
            sy_prob_na = tf.nn.softmax(sy_logits_na)
            # print('sy_logits_na', sy_logits_na)

            sy_sampled_ac = tf.multinomial(
                sy_logits_na, 1)  # Hint: Use the tf.multinomial op
            sy_sampled_ac = tf.reshape(sy_sampled_ac, [-1])
            # print('sy_sampled_ac', sy_sampled_ac)

            sy_ac_onehot_na = tf.one_hot(sy_ac_na, depth=self.ac_dim)
            # print('sy_ac_onehot_na', sy_ac_onehot_na)
            sy_ac_responseprob_na = tf.reduce_sum(
                tf.multiply(sy_ac_onehot_na, sy_prob_na), axis=1)
            # print('sy_ac_responseprob_na', sy_ac_responseprob_na)
            sy_logprob_n = tf.log(sy_ac_responseprob_na)
            # print('sy_logprob_n', sy_logprob_n)

        else:
            # YOUR_CODE_HERE
            # Parameterize the gaussian policy by mean and std
            self.mpc_a = tf.placeholder(shape=[None, self.ac_dim],
                                        name="mpc_a",
                                        dtype=tf.float32)

            net = sy_ob_no

            net = tf.layers.dense(net, 64, activation=tf.tanh)

            net = tf.layers.dense(net, 64, activation=tf.tanh)

            net = tf.concat([net, self.mpc_a], axis=1)

            sy_mean_na = tf.layers.dense(net, self.ac_dim, activation=None)

            # gate = tf.Variable(tf.ones([1, self.ac_dim]))

            # sy_mean_na = gate *  self.mpc_a  + (1-gate) * sy_mean_na

            # sy_mean_na = self.mpc_a

            self.sy_logstd = tf.Variable(
                tf.zeros([1, self.ac_dim]),
                name='action/logstd',
                dtype=tf.float32
            )  # logstd should just be a trainable variable, not a network output.
            self.sy_std = tf.exp(self.sy_logstd)
            # print('self.sy_std', self.sy_std)

            # Sample an action
            sy_sampled_ac = sy_mean_na + self.sy_std * tf.random_normal(
                tf.shape(sy_mean_na))
            # print('sy_sampled_ac', sy_sampled_ac)

            # Log-likelihood of the chosen action
            # Hint: Use the log probability under a multivariate gaussian.
            sy_z = (sy_ac_na - sy_mean_na) / self.sy_std
            # print('sy_z', sy_z)
            sy_logprob_n = -0.5 * tf.square(sy_z) - 0.5 * tf.log(
                tf.constant(2 * np.pi)) - self.sy_logstd

            sy_logprob_n = tf.reduce_sum(sy_logprob_n, axis=1)
            # print('sy_logprob_n', sy_logprob_n)

        return sy_ob_no, sy_ac_na, sy_adv_n, sy_sampled_ac, sy_logprob_n
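
For reference, the continuous branch computes the log-density of a diagonal Gaussian policy, summed over action dimensions:

    \log \pi(a \mid s) = \sum_{i=1}^{d}\Big[-\tfrac{1}{2}\Big(\tfrac{a_i - \mu_i}{\sigma_i}\Big)^2 - \log\sigma_i - \tfrac{1}{2}\log 2\pi\Big],

which matches the per-dimension terms built from `sy_z` and `self.sy_logstd` followed by `tf.reduce_sum(..., axis=1)` in the code above.
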
Code example #18
File: ppo.py Project: bitsauce/RoadFollowing-ppo
    def __init__(self,
                 input_states,
                 taken_actions,
                 num_actions,
                 action_min,
                 action_max,
                 scope_name,
                 initial_std=0.4,
                 initial_mean_factor=0.1,
                 pi_hidden_sizes=(500, 300),
                 vf_hidden_sizes=(500, 300)):
        """
            input_states [batch_size, width, height, depth]:
                Input images to predict actions for
            taken_actions [batch_size, num_actions]:
                Actions taken by the old policy (used for training)
            num_actions (int):
                Number of continuous actions to output
            action_min [num_actions]:
                Minimum possible value for the respective action
            action_max [num_actions]:
                Maximum possible value for the respective action
            scope_name (string):
                Variable scope name for the policy graph
            initial_mean_factor (float):
                Variance scaling factor for the action mean prediction layer
        """

        with tf.variable_scope(scope_name):
            # Policy branch π(a_t | s_t; θ)
            self.pi = build_mlp(input_states,
                                hidden_sizes=pi_hidden_sizes,
                                activation=tf.nn.relu,
                                output_activation=tf.nn.relu)
            self.action_mean = tf.layers.dense(
                self.pi,
                num_actions,
                activation=tf.nn.tanh,
                kernel_initializer=tf.initializers.variance_scaling(
                    scale=initial_mean_factor),
                name="action_mean")
            self.action_mean = action_min + (
                (self.action_mean + 1) / 2) * (action_max - action_min)
            self.action_logstd = tf.Variable(np.full((num_actions),
                                                     np.log(initial_std),
                                                     dtype=np.float32),
                                             name="action_logstd")

            # Value branch V(s_t; θ)
            if vf_hidden_sizes is None:
                self.vf = self.pi  # Share features if None
            else:
                self.vf = build_mlp(input_states,
                                    hidden_sizes=vf_hidden_sizes,
                                    activation=tf.nn.relu,
                                    output_activation=tf.nn.relu)
            self.value = tf.squeeze(tf.layers.dense(self.vf,
                                                    1,
                                                    activation=None,
                                                    name="value"),
                                    axis=-1)

            # Create graph for sampling actions
            self.action_normal = tfp.distributions.Normal(
                self.action_mean,
                tf.exp(self.action_logstd),
                validate_args=True)
            self.sampled_action = tf.squeeze(self.action_normal.sample(1),
                                             axis=0)

            # Clip action space to min max
            self.sampled_action = tf.clip_by_value(self.sampled_action,
                                                   action_min, action_max)

            # Get the log probability of taken actions
            # log π(a_t | s_t; θ)
            self.action_log_prob = tf.reduce_sum(
                self.action_normal.log_prob(taken_actions),
                axis=-1,
                keepdims=True)