Code Example #1
    def _build_ppo_loss(self):
        self.policy_optimizer = tf.train.AdamOptimizer(
            learning_rate=self._input_ph['lr'],
        )

        for i in range(self.params.num_subtasks):
            self._update_operator['pol_loss_unclipped_{}'.format(TASKS(i))] = \
                -self._tensor['ratio_{}'.format(TASKS(i))] * \
                tf.reshape(self._input_ph['advantage'], [-1])

            self._update_operator['pol_loss_clipped_{}'.format(TASKS(i))] = \
                -self._tensor['ratio_clipped_{}'.format(TASKS(i))] * \
                tf.reshape(self._input_ph['advantage'], [-1])

            self._update_operator['surr_loss_{}'.format(TASKS(i))] = tf.reduce_mean(
                tf.maximum(self._update_operator['pol_loss_unclipped_{}'.format(TASKS(i))],
                           self._update_operator['pol_loss_clipped_{}'.format(TASKS(i))])
            )

            self._update_operator['loss_{}'.format(TASKS(i))] = \
                self._update_operator['surr_loss_{}'.format(TASKS(i))]

            # if self.params.use_kl_penalty:
            #     self._update_operator['kl_pen'] = \
            #         self._input_ph['kl_lambda'] * \
            #         self._tensor['kl']
            #     self._update_operator['kl_loss'] = self._kl_eta * \
            #         tf.square(tf.maximum(0., self._tensor['kl'] -
            #                              2. * self.params.target_kl))
            #     self._update_operator['loss'] += \
            #         self._update_operator['kl_pen'] + \
            #         self._update_operator['kl_loss']

            if self.params.use_weight_decay:
                self._update_operator['weight_decay_loss_{}'.format(TASKS(i))] = \
                    tf_util.l2_loss(self._tensor['variable_list_{}'.format(TASKS(i))])
                self._update_operator['loss_{}'.format(TASKS(i))] += \
                    self._update_operator['weight_decay_loss_{}'.format(TASKS(i))] * \
                    self.params.weight_decay_coefficient

            self._update_operator['sparse_correlation_loss_{}'.format(TASKS(i))] = \
                tf_util.correlation_loss(self._tensor['policy_masks_{}'.format(TASKS(i))],
                    [self._tensor['policy_masks_{}'.format(TASKS(j))] \
                    for j in range(self.params.num_subtasks) if j != i], apply_sigmoid=True)

            self._update_operator['sparse_mask_loss_{}'.format(TASKS(i))] = \
                tf_util.l2_loss(self._tensor['policy_masks_{}'.format(TASKS(i))],
                apply_sigmoid=True)

            self._update_operator['surr_loss_{}'.format(TASKS(i))] += \
                self._update_operator['sparse_correlation_loss_{}'.format(TASKS(i))] * \
                self.params.correlation_coefficient + \
                self._update_operator['sparse_mask_loss_{}'.format(TASKS(i))] * \
                self.params.mask_penalty

            self._update_operator['update_op_{}'.format(TASKS(i))] = \
                self.policy_optimizer.minimize(self._update_operator['surr_loss_{}'.format(TASKS(i))])
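
The per-task regularizers above call two project helpers, tf_util.correlation_loss and tf_util.l2_loss, whose bodies are not shown here. The following is a minimal sketch of what they might compute, assuming each mask is a single tensor, correlation_loss penalizes overlap between one task's (optionally sigmoided) mask and the other tasks' masks, and l2_loss is a plain squared-magnitude penalty; the real helpers may differ.

    import tensorflow as tf  # assuming TensorFlow 1.x

    def l2_loss(tensors, apply_sigmoid=False):
        # Hypothetical helper: mean squared magnitude of a tensor or list of tensors.
        if not isinstance(tensors, (list, tuple)):
            tensors = [tensors]
        if apply_sigmoid:
            tensors = [tf.sigmoid(t) for t in tensors]
        return tf.add_n([tf.reduce_mean(tf.square(t)) for t in tensors])

    def correlation_loss(mask, other_masks, apply_sigmoid=False):
        # Hypothetical helper: penalize overlap between one task's mask and the
        # masks of the remaining tasks (large when the masks select the same weights).
        if apply_sigmoid:
            mask = tf.sigmoid(mask)
            other_masks = [tf.sigmoid(m) for m in other_masks]
        return tf.add_n([tf.reduce_mean(mask * m) for m in other_masks])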
Code Example #2
    def _build_sac_loss(self):
        self._update_operator['loss'] = \
            tf.reduce_mean(self._tensor['log_p_n'] - self._input_ph['advantage'])

        if self.params.use_weight_decay:
            self._update_operator['weight_decay_loss'] = \
                tf_util.l2_loss(self._trainable_var_list)
            self._update_operator['loss'] += \
                self._update_operator['weight_decay_loss'] * \
                self.params.weight_decay_coefficient


        self._update_operator['policy_gradients'] = {
            self._input_ph['policy_sparse_masks'][i]:
                tf.gradients(self._update_operator['loss'], self._MLP._w[i])
            for i in range(len(self._input_ph['policy_sparse_masks']))}

        self._update_operator['update_op'] = tf.train.AdamOptimizer(
            learning_rate=self._input_ph['lr'],
            # beta1=0.5, beta2=0.99, epsilon=1e-4
        ).minimize(self._update_operator['loss'])

        # the actual parameter values
        self._update_operator['get_flat_param'] = \
            tf_util.GetFlat(self._session, self._trainable_var_list)
        # deprecated, don't use
        # call this to set parameter values
        self._update_operator['set_from_flat_param'] = \
            tf_util.SetFromFlat(self._session, self._trainable_var_list)
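
tf_util.GetFlat and tf_util.SetFromFlat are also project helpers; only their call sites appear above. A minimal sketch, assuming they behave like the flat-parameter utilities common in TF1 RL codebases (concatenate the current variable values into one vector, and assign such a vector back), with the (session, var_list) constructor order taken from the calls above:

    import numpy as np
    import tensorflow as tf  # assuming TensorFlow 1.x

    class GetFlat(object):
        # Hypothetical sketch: return all variables concatenated into one flat vector.
        def __init__(self, session, var_list):
            self._session = session
            self._op = tf.concat([tf.reshape(v, [-1]) for v in var_list], axis=0)

        def __call__(self):
            return self._session.run(self._op)

    class SetFromFlat(object):
        # Hypothetical sketch: assign a flat vector back into the variable list.
        def __init__(self, session, var_list):
            self._session = session
            shapes = [v.get_shape().as_list() for v in var_list]
            sizes = [int(np.prod(s)) for s in shapes]
            self._theta = tf.placeholder(tf.float32, [sum(sizes)])
            start, assigns = 0, []
            for v, shape, size in zip(var_list, shapes, sizes):
                assigns.append(tf.assign(
                    v, tf.reshape(self._theta[start:start + size], shape)))
                start += size
            self._op = tf.group(*assigns)

        def __call__(self, theta):
            self._session.run(self._op, feed_dict={self._theta: theta})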
Code Example #3
    def _build_ppo_loss(self):
        self._update_operator['pol_loss_unclipped'] = \
            -self._tensor['ratio'] * \
            tf.reshape(self._input_ph['advantage'], [-1])

        self._update_operator['pol_loss_clipped'] = \
            -self._tensor['ratio_clipped'] * \
            tf.reshape(self._input_ph['advantage'], [-1])

        self._update_operator['surr_loss'] = tf.reduce_mean(
            tf.maximum(self._update_operator['pol_loss_unclipped'],
                       self._update_operator['pol_loss_clipped'])
        )

        self._update_operator['loss'] = self._update_operator['surr_loss']
        # if self.params.use_kl_penalty:
        #     self._update_operator['kl_pen'] = \
        #         self._input_ph['kl_lambda'] * \
        #         self._tensor['kl']
        #     self._update_operator['kl_loss'] = self._kl_eta * \
        #         tf.square(tf.maximum(0., self._tensor['kl'] -
        #                              2. * self.params.target_kl))
        #     self._update_operator['loss'] += \
        #         self._update_operator['kl_pen'] + \
        #         self._update_operator['kl_loss']

        if self.params.use_weight_decay:
            self._update_operator['weight_decay_loss'] = \
                tf_util.l2_loss(self._trainable_var_list)
            self._update_operator['loss'] += \
                self._update_operator['weight_decay_loss'] * \
                self.params.weight_decay_coefficient

        self._update_operator['update_op'] = tf.train.AdamOptimizer(
            learning_rate=self._input_ph['lr'],
            # beta1=0.5, beta2=0.99, epsilon=1e-4
        ).minimize(self._update_operator['surr_loss'])

        # the actual parameter values
        self._update_operator['get_flat_param'] = \
            tf_util.GetFlat(self._session, self._trainable_var_list)
        # call this to set parameter values
        self._update_operator['set_from_flat_param'] = \
            tf_util.SetFromFlat(self._session, self._trainable_var_list)
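
This single-task PPO loss consumes self._tensor['ratio'] and self._tensor['ratio_clipped'], which are built elsewhere in the class. For reference, a minimal sketch of the standard clipped-surrogate construction they presumably follow; the placeholder names and the clip_epsilon value are illustrative, not taken from the source.

    import tensorflow as tf  # assuming TensorFlow 1.x

    # Illustrative placeholders; the real code builds these from its policy network.
    log_p_new = tf.placeholder(tf.float32, [None], name='log_p_new')
    log_p_old = tf.placeholder(tf.float32, [None], name='log_p_old')
    advantage = tf.placeholder(tf.float32, [None, 1], name='advantage')
    clip_epsilon = 0.2  # typical PPO clipping range (assumed)

    # Probability ratio pi_new(a|s) / pi_old(a|s) and its clipped counterpart.
    ratio = tf.exp(log_p_new - log_p_old)
    ratio_clipped = tf.clip_by_value(ratio, 1.0 - clip_epsilon, 1.0 + clip_epsilon)

    # Clipped surrogate written as a loss (negated), matching the code above.
    pol_loss_unclipped = -ratio * tf.reshape(advantage, [-1])
    pol_loss_clipped = -ratio_clipped * tf.reshape(advantage, [-1])
    surr_loss = tf.reduce_mean(tf.maximum(pol_loss_unclipped, pol_loss_clipped))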
Code Example #4
    def _build_value_network_and_loss(self):
        # build the placeholder for training the value function
        self._input_ph['value_target'] = \
            tf.placeholder(tf.float32, [None, 1], name='value_target')

        self._input_ph['old_values'] = \
            tf.placeholder(tf.float32, [None, 1], name='old_value_est')

        # build the baseline-value function
        network_shape = [self._observation_size] + \
            self.params.value_network_shape + [1]
        num_layer = len(network_shape) - 1
        act_type = \
            [self.params.value_activation_type] * (num_layer - 1) + [None]
        norm_type = \
            [self.params.value_normalizer_type] * (num_layer - 1) + [None]
        init_data = []
        for _ in range(num_layer):
            init_data.append(
                {'w_init_method': 'normc', 'w_init_para': {'stddev': 1.0},
                 'b_init_method': 'constant', 'b_init_para': {'val': 0.0}}
            )

        if self.params.use_subtask_value:
            self._value_MLP = network_util.SparseMultitaskMLP(
                dims=network_shape, scope='value_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True,
                num_tasks=self.params.num_subtasks
            )

            for i in range(self.params.num_subtasks):
                self._tensor['value_weights_{}'.format(TASKS(i))] = self._value_MLP._w[i]
                self._tensor['value_masks_{}'.format(TASKS(i))] = self._value_MLP._sparse_mask[i]

            self._tensor['value_b'] = self._value_MLP._b

            output = self._value_MLP(self._tensor['net_input'])

            self.value_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.params.value_lr,
                beta1=0.5, beta2=0.99, epsilon=1e-4
            )

            for i in range(self.params.num_subtasks):
                self._tensor['pred_value_{}'.format(TASKS(i))] = output[i]

                self._update_operator['vf_loss_{}'.format(TASKS(i))] = .5 * tf.reduce_mean(
                    tf.square(
                        self._tensor['pred_value_{}'.format(TASKS(i))] - self._input_ph['value_target']
                    )
                )

                self._update_operator['sparse_value_correlation_loss_{}'.format(TASKS(i))] = \
                    tf_util.correlation_loss(self._tensor['value_masks_{}'.format(TASKS(i))],
                        [self._tensor['value_masks_{}'.format(TASKS(j))]
                         for j in range(self.params.num_subtasks) if j != i])

                self._update_operator['sparse_value_mask_loss_{}'.format(TASKS(i))] = \
                    tf_util.l2_loss(self._tensor['value_masks_{}'.format(TASKS(i))],
                        apply_sigmoid=True)

                self._update_operator['vf_loss_{}'.format(TASKS(i))] += \
                    self._update_operator['sparse_value_correlation_loss_{}'.format(TASKS(i))] * \
                    self.params.correlation_coefficient + \
                    self._update_operator['sparse_value_mask_loss_{}'.format(TASKS(i))] * \
                    self.params.mask_penalty

                self._update_operator['vf_update_op_{}'.format(TASKS(i))] = \
                    self.value_optimizer.minimize(self._update_operator['vf_loss_{}'.format(TASKS(i))])

                self._tensor['variable_list_{}'.format(TASKS(i))] = [
                    *self._tensor['policy_weights_{}'.format(TASKS(i))],
                    *self._tensor['policy_b'],
                    *self._tensor['value_weights_{}'.format(TASKS(i))],
                    *self._tensor['value_b'],
                    self._tensor['action_logstd']
                ]

        else:
            self._value_MLP = network_util.MLP(
                dims=network_shape, scope='value_mlp', train=True,
                activation_type=act_type, normalizer_type=norm_type,
                init_data=init_data, linear_last_layer=True,
            )

            self._tensor['value_weights'] = self._value_MLP._w
            self._tensor['value_b'] = self._value_MLP._b

            self._tensor['pred_value'] = self._value_MLP(self._tensor['net_input'])

            self.value_optimizer = tf.train.AdamOptimizer(
                learning_rate=self.params.value_lr,
                beta1=0.5, beta2=0.99, epsilon=1e-4
            )
            self._update_operator['vf_loss'] = .5 * tf.reduce_mean(
                tf.square(
                    self._tensor['pred_value'] - self._input_ph['value_target']
                )
            )

            self._update_operator['vf_update_op'] = \
                self.value_optimizer.minimize(self._update_operator['vf_loss'])

            for i in range(self.params.num_subtasks):
                self._tensor['variable_list_{}'.format(TASKS(i))] = [
                    *self._tensor['policy_weights_{}'.format(TASKS(i))],
                    *self._tensor['policy_b'],
                    *self._tensor['value_weights'],
                    *self._tensor['value_b'],
                    self._tensor['action_logstd']
                ]
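
network_util.SparseMultitaskMLP itself is not shown. One possible realization, consistent with how _w, _b and _sparse_mask are indexed above (per-task weights and masks, shared biases, a list of per-task outputs) but omitting the activation_type, normalizer_type, init_data and train options of the real class, might look like the sketch below; treat every detail as an assumption.

    import tensorflow as tf  # assuming TensorFlow 1.x

    class SparseMultitaskMLP(object):
        # Hypothetical sketch: per-task weight matrices gated by learned sparse
        # masks (applied through a sigmoid), shared biases, one output per task.
        def __init__(self, dims, num_tasks, scope='mlp', activation=tf.tanh):
            self._w, self._b, self._sparse_mask = [], [], []
            self._activation = activation
            with tf.variable_scope(scope):
                for task in range(num_tasks):
                    w_task, m_task = [], []
                    for layer, (n_in, n_out) in enumerate(zip(dims[:-1], dims[1:])):
                        w_task.append(tf.get_variable(
                            'w_{}_{}'.format(task, layer), [n_in, n_out],
                            initializer=tf.random_normal_initializer(stddev=0.1)))
                        m_task.append(tf.get_variable(
                            'mask_{}_{}'.format(task, layer), [n_in, n_out],
                            initializer=tf.zeros_initializer()))
                    self._w.append(w_task)
                    self._sparse_mask.append(m_task)
                # Biases are shared across tasks, one vector per layer.
                self._b = [tf.get_variable('b_{}'.format(layer), [n_out],
                                           initializer=tf.zeros_initializer())
                           for layer, n_out in enumerate(dims[1:])]

        def __call__(self, x):
            outputs = []
            for w_task, m_task in zip(self._w, self._sparse_mask):
                h = x
                for layer, (w, m, b) in enumerate(zip(w_task, m_task, self._b)):
                    h = tf.matmul(h, w * tf.sigmoid(m)) + b
                    if layer < len(w_task) - 1:  # linear last layer
                        h = self._activation(h)
                outputs.append(h)
            return outputs  # one output tensor per task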