コード例 #1
0
 def define_forward_pass(self):
     """Build the policy network and publish its outputs as ``self.parameters``.

     For a discrete policy ``self.parameters`` is the tensor returned by
     ``build_mlp`` under the ``discrete_logits`` scope. Otherwise it is a
     ``(mean, logstd)`` pair, where ``mean`` comes from ``build_mlp`` and
     ``logstd`` is a zero-initialised ``tf.Variable`` with ``self.ac_dim``
     entries.
     """
     if not self.discrete:
         # Continuous case: the log-std is a free variable, not a network
         # output, so it does not depend on the observation.
         action_mean = build_mlp(
             self.observations_pl,
             output_size=self.ac_dim,
             scope='continuous_logits',
             n_layers=self.n_layers,
             size=self.size,
         )
         log_std = tf.Variable(tf.zeros(self.ac_dim), name='logstd')
         self.parameters = (action_mean, log_std)
         return

     # Discrete case: the network output is used directly as the logits.
     self.parameters = build_mlp(
         self.observations_pl,
         output_size=self.ac_dim,
         scope='discrete_logits',
         n_layers=self.n_layers,
         size=self.size,
     )
コード例 #2
0
ファイル: MLP_policy.py プロジェクト: furunding/RL
    def __init__(
            self,
            ac_dim,
            ob_dim,
            n_layers,
            size,
            learning_rate=1e-4,
            training=True,
            discrete=False,  # unused for now
            nn_baseline=False,  # unused for now
            **kwargs):
        """Set up the MLP policy, its regression loss and its optimizer.

        Args:
            ac_dim: Size of the network output (passed to ``build_mlp`` as
                ``output_size``).
            ob_dim: Observation dimension; stored on the instance only.
            n_layers: Number of hidden layers passed to ``build_mlp``.
            size: Hidden layer width passed to ``build_mlp``.
            learning_rate: Learning rate for the Adam optimizer.
            training: Stored on the instance as ``self.training``.
            discrete: Accepted for interface compatibility; currently unused.
            nn_baseline: Accepted for interface compatibility; currently
                unused.
            **kwargs: Forwarded to the parent class constructor.
        """
        super().__init__(**kwargs)

        # init vars
        self.ac_dim = ac_dim
        self.ob_dim = ob_dim
        self.n_layers = n_layers
        self.size = size
        self.learning_rate = learning_rate
        self.training = training
        self.model = build_mlp(output_size=self.ac_dim,
                               n_layers=self.n_layers,
                               size=self.size)

        def squared_error_loss(y_true, y_pred):
            """Return the batch mean of the per-sample summed squared error."""
            # Sum over the last (action) axis only. Summing over every axis
            # would yield a scalar, turning the following mean into a no-op
            # and making the loss scale with the batch size.
            per_sample = tf.reduce_sum(tf.square(y_true - y_pred), axis=-1)
            return tf.reduce_mean(per_sample)

        self.loss_object = squared_error_loss
        self.optimizer = keras.optimizers.Adam(
            learning_rate=self.learning_rate)
        '''
コード例 #3
0
 def build_baseline_forward_pass(self):
     """Create the baseline network and store its squeezed output.

     ``build_mlp`` is asked for a single output unit under the
     ``nn_baseline`` scope. ``tf.squeeze`` then drops every size-1
     dimension, and the result is kept as ``self.baseline_prediction``.
     """
     baseline_output = build_mlp(
         self.observations_pl,
         output_size=1,
         scope='nn_baseline',
         n_layers=self.n_layers,
         size=self.size,
     )
     self.baseline_prediction = tf.squeeze(baseline_output)
コード例 #4
0
    def define_forward_pass(self):
        """Assemble the graph that predicts the next observation.

        Observations and actions are passed through ``normalize`` with their
        respective mean/std placeholders, concatenated along axis 1 and fed
        to ``build_mlp``. The network output is treated as a normalized
        change in state (s' - s): it is passed through ``unnormalize`` with
        the delta statistics and added to the raw observations.

        Sets ``self.delta_pred_normalized``, ``self.delta_pred_unnormalized``
        and ``self.next_obs_pred`` (the latter two are unnormalized).
        """
        raw_obs = self.obs_pl
        raw_acs = self.acs_pl

        # Bring both inputs onto the scale given by the statistics
        # placeholders before they reach the network.
        scaled_obs = normalize(raw_obs, self.obs_mean_pl, self.obs_std_pl)
        scaled_acs = normalize(raw_acs, self.acs_mean_pl, self.acs_std_pl)

        # The network sees one [obs, action] row per sample.
        net_input = tf.concat([scaled_obs, scaled_acs], axis=1)

        # "delta" denotes a change in state, i.e. (s' - s).
        self.delta_pred_normalized = build_mlp(
            net_input, self.ob_dim, self.scope, self.n_layers, self.size)
        self.delta_pred_unnormalized = unnormalize(
            self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl)

        # Next observation = current observation + predicted change.
        self.next_obs_pred = raw_obs + self.delta_pred_unnormalized
コード例 #5
0
 def build_model(self):
     """Create the policy network and its log-std variable.

     Stores the result of ``build_mlp`` as ``self.model`` and a
     zero-initialised ``tf.Variable`` with ``self.ac_dim`` entries as
     ``self.logstd``.
     """
     self.model = build_mlp(
         output_size=self.ac_dim,
         n_layers=self.n_layers,
         size=self.size,
     )
     initial_logstd = tf.zeros(self.ac_dim)
     self.logstd = tf.Variable(initial_logstd, name='logstd')
コード例 #6
0
 def __init__(self, ac_dim, ob_dim, n_layers, size, **kwargs):
     """Create the policy and build its underlying network.

     Args:
         ac_dim: Output size of the network.
         ob_dim: Observation dimension; used as the 1-tuple input shape
             given to ``build_mlp``.
         n_layers: Number of hidden layers passed to ``build_mlp``.
         size: Hidden layer width passed to ``build_mlp``.
         **kwargs: Accepted but not used by this constructor.
     """
     super().__init__()
     input_shape = (ob_dim, )
     self.model = build_mlp(
         input_shape,
         output_size=ac_dim,
         n_layers=n_layers,
         size=size,
         name='model',
     )
コード例 #7
0
 def define_forward_pass(self):
     """Build the continuous policy outputs and store them as a pair.

     ``self.parameters`` becomes ``(mean, logstd)``: ``mean`` is the
     ``build_mlp`` output for ``self.observations_pl`` and ``logstd`` is a
     zero-initialised ``tf.Variable`` with ``self.ac_dim`` entries.
     """
     # TODO: implement the build_mlp function in tf_utils.
     network_kwargs = {
         'output_size': self.ac_dim,
         'scope': 'continuous_logits',
         'n_layers': self.n_layers,
         'size': self.size,
     }
     action_mean = build_mlp(self.observations_pl, **network_kwargs)
     log_std = tf.Variable(tf.zeros(self.ac_dim), name='logstd')
     self.parameters = (action_mean, log_std)
コード例 #8
0
ファイル: ff_model.py プロジェクト: JiefanYa/RL-Training
    def define_forward_pass(self):
        """Build the forward graph of the dynamics model.

        Normalized observations and actions are concatenated along axis 1
        and passed to ``build_mlp``, whose output is stored as the
        normalized predicted change in state (s' - s). That delta is then
        unnormalized and added to ``self.obs_pl`` to give
        ``self.next_obs_pred``.
        """
        # Scale each input with its own statistics placeholders.
        inputs = [
            normalize(self.obs_pl, self.obs_mean_pl, self.obs_std_pl),
            normalize(self.acs_pl, self.acs_mean_pl, self.acs_std_pl),
        ]
        network_input = tf.concat(inputs, axis=1)

        # Predicted change in observation, first normalized, then raw.
        normalized_delta = build_mlp(
            network_input, self.ob_dim, self.scope, self.n_layers, self.size)
        self.delta_pred_normalized = normalized_delta
        self.delta_pred_unnormalized = unnormalize(
            normalized_delta, self.delta_mean_pl, self.delta_std_pl)

        # The prediction is expressed on the raw observation scale.
        self.next_obs_pred = self.obs_pl + self.delta_pred_unnormalized
コード例 #9
0
    def _build(self):
        """Construct the critic graph: placeholders, network, loss, update op.

        Notation used for the attributes set here:

        * ``sy_`` prefix - symbolic tensor, as opposed to a numerical value
          computed later.
        * ``ob`` / ``ac`` - observation / action.
        * ``_no`` suffix - shape (batch size, observation dim).
        * ``_na`` suffix - shape (batch size, action dim).
        * ``_n`` suffix - shape (batch size,).

        The batch size is only known at runtime; until then that axis is
        ``None``.

        Sets ``sy_ob_no``, ``sy_ac_na``, ``sy_adv_n``, ``critic_prediction``,
        ``sy_target_n``, ``critic_loss`` and ``critic_update_op``.
        """
        placeholders = self.define_placeholders()
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n = placeholders

        # Critic network: one output unit per observation, with the size-1
        # dimensions squeezed away.
        critic_output = build_mlp(self.sy_ob_no,
                                  1,
                                  "nn_critic",
                                  n_layers=self.n_layers,
                                  size=self.size)
        self.critic_prediction = tf.squeeze(critic_output)

        # Regression targets for the critic, one value per sample.
        self.sy_target_n = tf.placeholder(shape=[None],
                                          name="critic_target",
                                          dtype=tf.float32)

        # The critic prediction regresses onto the targets placeholder.
        self.critic_loss = tf.losses.mean_squared_error(
            self.sy_target_n, self.critic_prediction)

        # Adam step that minimizes the critic loss.
        critic_optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.critic_update_op = critic_optimizer.minimize(self.critic_loss)
コード例 #10
0
    def __init__(self, hparams):
        """Copy the hyperparameters onto the instance and build the critic.

        Args:
            hparams: Mapping that must provide ``ob_dim``, ``ac_dim``,
                ``discrete``, ``size``, ``n_layers``, ``learning_rate``,
                ``num_target_updates``, ``num_grad_steps_per_target_update``
                and ``gamma``. Each is stored as an attribute of the same
                name.
        """
        super().__init__()

        # Mirror every hyperparameter as an attribute with the same name.
        hparam_names = (
            'ob_dim',
            'ac_dim',
            'discrete',
            'size',
            'n_layers',
            'learning_rate',
            'num_target_updates',
            'num_grad_steps_per_target_update',
            'gamma',
        )
        for hparam_name in hparam_names:
            setattr(self, hparam_name, hparams[hparam_name])

        # Critic network with a single output unit.
        critic_input_shape = (hparams['ob_dim'], )
        self.nn_critic = build_mlp(
            critic_input_shape,
            output_size=1,
            n_layers=hparams['n_layers'],
            size=hparams['size'],
            name='nn_critic',
        )
コード例 #11
0
    def __init__(self, env, agent_params, batch_size=500000, **kwargs):
        """Create the policy-gradient agent.

        Args:
            env: Environment handle; stored as ``self.env``.
            agent_params: Mapping with the keys ``gamma``,
                ``standardize_advantages``, ``nn_baseline``,
                ``reward_to_go``, ``discrete``, ``ac_dim``, ``ob_dim``,
                ``n_layers``, ``size`` and ``learning_rate``.
            batch_size: The replay buffer is created with twice this value.
            **kwargs: Accepted but not used by this constructor.
        """
        super(PGAgent, self).__init__()

        # init vars
        self.env = env
        self.agent_params = agent_params
        params = self.agent_params
        self.gamma = params['gamma']
        self.standardize_advantages = params['standardize_advantages']
        self.nn_baseline = params['nn_baseline']
        self.reward_to_go = params['reward_to_go']

        # actor/policy: both policy classes take the same four arguments,
        # so only the class depends on the action-space type.
        if params['discrete']:
            policy_class = DiscreteMLPPolicy
        else:
            policy_class = ContinuousMLPPolicy
        self.actor = policy_class(params['ac_dim'],
                                  params['ob_dim'],
                                  params['n_layers'],
                                  params['size'])
        self.policy_optimizer = tf.keras.optimizers.Adam(
            learning_rate=params['learning_rate'])

        # replay buffer
        self.replay_buffer = ReplayBuffer(2 * batch_size)

        # The baseline network is optional and stays None unless requested.
        self.baseline_model = None
        if not params['nn_baseline']:
            return

        baseline_input_shape = (params['ob_dim'], )
        self.baseline_model = build_mlp(
            baseline_input_shape,
            output_size=1,
            n_layers=params['n_layers'],
            size=params['size'],
            name='baseline_model')
        self.baseline_loss = tf.keras.losses.MeanSquaredError()
        self.baseline_optimizer = tf.keras.optimizers.Adam(
            learning_rate=params['learning_rate'])
        self.baseline_model.compile(optimizer=self.baseline_optimizer,
                                    loss=self.baseline_loss)
コード例 #12
0
    def __init__(
            self,
            ac_dim,
            ob_dim,
            n_layers,
            size,
            discrete=False,  # unused for now
            nn_baseline=False,  # unused for now
            **kwargs):
        """Create the policy's mean network and its noise layer.

        Args:
            ac_dim: Action dimension; output size of the mean network and
                first argument of ``GaussianNoise``.
            ob_dim: Observation dimension; used as the 1-tuple input shape
                given to ``build_mlp``.
            n_layers: Number of hidden layers passed to ``build_mlp``.
            size: Hidden layer width passed to ``build_mlp``.
            discrete: Accepted for interface compatibility; currently unused.
            nn_baseline: Accepted for interface compatibility; currently
                unused.
            **kwargs: Accepted but not used by this constructor.
        """
        super().__init__()

        # init vars
        self.ac_dim, self.ob_dim = ac_dim, ob_dim
        self.n_layers, self.size = n_layers, size

        observation_shape = (self.ob_dim, )
        self.mean = build_mlp(
            observation_shape,
            output_size=self.ac_dim,
            n_layers=self.n_layers,
            size=self.size,
        )
        self.gauss_noise = GaussianNoise(self.ac_dim, name='noise')
コード例 #13
0
ファイル: ff_model.py プロジェクト: dacphuc1993/CS285-deeprl
    def define_forward_pass(self):
        """Build the forward graph that predicts the next observation.

        Observations and actions are normalized with their statistics
        placeholders, concatenated along axis 1 and passed to ``build_mlp``.
        The network output is the normalized predicted change in state
        (s' - s); it is unnormalized with the delta statistics and added to
        the raw observations.

        Sets ``self.delta_pred_normalized``, ``self.delta_pred_unnormalized``
        and ``self.next_obs_pred`` (the latter two are unnormalized).
        """
        # normalize input data with the statistics placeholders
        obs_unnormalized = self.obs_pl
        acs_unnormalized = self.acs_pl
        obs_normalized = normalize(obs_unnormalized, self.obs_mean_pl, self.obs_std_pl)
        acs_normalized = normalize(acs_unnormalized, self.acs_mean_pl, self.acs_std_pl)

        # predicted change in obs; "delta" denotes a change in state (s' - s)
        concatenated_input = tf.concat([obs_normalized, acs_normalized], axis=1)
        self.delta_pred_normalized = build_mlp(
            concatenated_input, self.ob_dim, self.scope, self.n_layers, self.size)
        self.delta_pred_unnormalized = unnormalize(
            self.delta_pred_normalized, self.delta_mean_pl, self.delta_std_pl)

        # next observation = current observation + predicted change
        self.next_obs_pred = obs_unnormalized + self.delta_pred_unnormalized

    def define_train_op(self):
        """Define the training loss and the op that minimizes it.

        The delta labels are normalized with the delta statistics and
        compared with the network's normalized delta prediction, so both
        sides of the loss live on the same scale.

        Sets ``self.delta_labels_normalized``, ``self.loss`` (mean squared
        error) and ``self.train_op`` (an Adam minimization step).
        """
        # normalize the labels
        self.delta_labels_normalized = normalize(
            self.delta_labels, self.delta_mean_pl, self.delta_std_pl)

        # Compare predicted deltas with the labels, both normalized. The raw
        # difference against next_obs_pred mixed a normalized delta with an
        # unnormalized observation and was not a scalar loss.
        self.loss = tf.losses.mean_squared_error(
            self.delta_labels_normalized, self.delta_pred_normalized)
        self.train_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

    #############################

    def get_prediction(self, obs, acs, data_statistics):
        """Run the model and return its next-observation prediction.

        Args:
            obs: Observation array. If it has a single dimension, a leading
                batch axis is added before it is fed to the graph.
            acs: Action array; given a leading batch axis whenever ``obs``
                is.
            data_statistics: Mapping with the keys ``obs_mean``, ``obs_std``,
                ``acs_mean``, ``acs_std``, ``delta_mean`` and ``delta_std``.

        Returns:
            The value of ``self.next_obs_pred`` produced by ``self.sess.run``.
        """
        if len(obs.shape) > 1:
            observations = obs
            actions = acs
        else:
            # A single sample still needs a batch axis.
            observations = obs[None]
            actions = acs[None]

        # Feed the batched arrays, not the raw arguments, so the batch axis
        # added above actually reaches the placeholders.
        feed_dict = {
            self.obs_pl: observations,
            self.acs_pl: actions,
            self.obs_mean_pl: data_statistics["obs_mean"],
            self.acs_mean_pl: data_statistics["acs_mean"],
            self.obs_std_pl: data_statistics["obs_std"],
            self.acs_std_pl: data_statistics["acs_std"],
            self.delta_mean_pl: data_statistics["delta_mean"],
            self.delta_std_pl: data_statistics["delta_std"],
        }
        return self.sess.run(self.next_obs_pred, feed_dict=feed_dict)

    def update(self, observations, actions, next_observations, data_statistics):
        """Run one training step and return the loss on this batch.

        Args:
            observations: Batch of current observations.
            actions: Batch of actions.
            next_observations: Batch of observations that followed; must
                support elementwise subtraction with ``observations``.
            data_statistics: Mapping with the keys ``obs_mean``, ``obs_std``,
                ``acs_mean``, ``acs_std``, ``delta_mean`` and ``delta_std``.

        Returns:
            The value of ``self.loss`` evaluated by ``self.sess.run``.
        """
        # The labels placeholder holds state changes (s' - s), which are
        # later normalized with the delta statistics, so the raw next
        # observations must not be fed in directly.
        delta_labels = next_observations - observations

        feed_dict = {
            self.obs_pl: observations,
            self.acs_pl: actions,
            self.delta_labels: delta_labels,
            self.obs_mean_pl: data_statistics["obs_mean"],
            self.acs_mean_pl: data_statistics["acs_mean"],
            self.obs_std_pl: data_statistics["obs_std"],
            self.acs_std_pl: data_statistics["acs_std"],
            self.delta_mean_pl: data_statistics["delta_mean"],
            self.delta_std_pl: data_statistics["delta_std"],
        }
        # train the model and fetch the loss in the same session call
        _, loss = self.sess.run([self.train_op, self.loss], feed_dict=feed_dict)
        return loss
コード例 #14
0
ファイル: MLP_policy.py プロジェクト: qcy-l/CS285_hw_2019
 def define_forward_pass(self):
     """Build the value network and store its output as ``self.values``.

     ``build_mlp`` is called on ``self.observations_pl`` with a single
     output unit under the ``value`` scope.
     """
     value_output = build_mlp(
         self.observations_pl,
         output_size=1,
         scope='value',
         n_layers=self.n_layers,
         size=self.size,
     )
     self.values = value_output