Example #1
    def __init__(self,
                 envspec,
                 v_min=-10,
                 v_max=10,
                 atoms=51,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        assert not envspec.is_continuous, 'rainbow only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.v_min = v_min
        self.v_max = v_max
        self.atoms = atoms
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = tf.reshape(
            tf.constant(
                [self.v_min + i * self.delta_z for i in range(self.atoms)],
                dtype=tf.float32), [-1, self.atoms])  # [1, N]
        self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _create_net(name, representation_net=None):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.RAINBOW_DUELING,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      atoms=self.atoms,
                                      network_settings=network_settings))

        self.rainbow_net = _create_net('rainbow_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.rainbow_target_net = _create_net('rainbow_target_net',
                                              self._representation_target_net)
        update_target_net_weights(self.rainbow_target_net.weights,
                                  self.rainbow_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self._worker_params_dict.update(self.rainbow_net._policy_models)

        self._all_params_dict.update(self.rainbow_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
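
Every example on this page calls update_target_net_weights but none defines it. Below is a minimal sketch of what such a helper typically looks like, written as an assumption for illustration rather than the library's actual implementation: with two arguments it hard-copies the online weights into the target network, and with a third coefficient (named ployak in the calls below) it performs a Polyak-averaged soft update.

# Hypothetical stand-in for the helper used throughout these examples.
import tensorflow as tf

def update_target_net_weights(target_weights, source_weights, ployak=None):
    # target_weights / source_weights: lists of tf.Variable, e.g. model.weights.
    for target, source in zip(target_weights, source_weights):
        if ployak is None:
            target.assign(source)  # hard copy, as in Example #1
        else:
            # soft (Polyak-averaged) update, as in the examples that pass self.ployak
            target.assign(ployak * target + (1. - ployak) * source)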
Example #2
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 alpha=2,
                 ployak=0.995,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'sql only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.alpha = alpha
        self.ployak = ployak

        def _q_net():
            return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
Example #3
    def __init__(self,
                 envspec,
                 online_quantiles=8,
                 target_quantiles=8,
                 select_quantiles=32,
                 quantiles_idx=64,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'q_net': [128, 64],
                     'quantile': [128, 64],
                     'tile': [64]
                 },
                 **kwargs):
        assert not envspec.is_continuous, 'iqn only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.pi = tf.constant(np.pi)
        self.online_quantiles = online_quantiles
        self.target_quantiles = target_quantiles
        self.select_quantiles = select_quantiles
        self.quantiles_idx = quantiles_idx
        self.huber_delta = huber_delta
        self.assign_interval = assign_interval
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)

        def _create_net(name, representation_net=None):
            return ValueNetwork(name=name,
                                representation_net=representation_net,
                                value_net_type=OutputNetworkType.IQN_NET,
                                value_net_kwargs=dict(
                                    action_dim=self.a_dim,
                                    quantiles_idx=self.quantiles_idx,
                                    network_settings=network_settings))

        self.q_net = _create_net('q_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.q_target_net = _create_net('q_target_net',
                                        self._representation_target_net)
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self._worker_params_dict.update(self.q_net._policy_models)

        self._all_params_dict.update(self.q_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
Example #4
    def __init__(self,
                 envspec,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 network_settings=[32, 32],
                 **kwargs):
        assert not envspec.is_continuous, 'maxsqn only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        self.log_alpha = alpha if not auto_adaption else tf.Variable(
            initial_value=0.0,
            name='log_alpha',
            dtype=tf.float32,
            trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        def _create_net(name, representation_net=None):
            return DoubleValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
                value_net_kwargs=dict(output_shape=self.a_dim,
                                      network_settings=network_settings))

        self.critic_net = _create_net('critic_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.critic_target_net = _create_net('critic_target_net',
                                             self._representation_target_net)

        update_target_net_weights(self.critic_target_net.weights,
                                  self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])

        self._worker_params_dict.update(self.critic_net._policy_models)

        self._all_params_dict.update(self.critic_net._all_models)
        self._all_params_dict.update(optimizer_critic=self.optimizer_critic,
                                     optimizer_alpha=self.optimizer_alpha)
        self._model_post_process()
Example #5
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 online_quantiles=8,
                 target_quantiles=8,
                 select_quantiles=32,
                 quantiles_idx=64,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 hidden_units={
                     'q_net': [128, 64],
                     'quantile': [128, 64],
                     'tile': [64]
                 },
                 **kwargs):
        assert not is_continuous, 'iqn only support discrete action space'
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.pi = tf.constant(np.pi)
        self.online_quantiles = online_quantiles
        self.target_quantiles = target_quantiles
        self.select_quantiles = select_quantiles
        self.quantiles_idx = quantiles_idx
        self.huber_delta = huber_delta
        self.assign_interval = assign_interval
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)

        def _net(): return NetWork(self.feat_dim, self.a_dim, self.quantiles_idx, hidden_units)

        self.q_net = _net()
        self.q_target_net = _net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights, self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(
            model=self.q_net,
            optimizer=self.optimizer
        ))
Example #6
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 alpha=0.2,
                 beta=0.1,
                 ployak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'maxsqn only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.use_epsilon = use_epsilon
        self.ployak = ployak
        self.log_alpha = alpha if not auto_adaption else tf.Variable(
            initial_value=0.0,
            name='log_alpha',
            dtype=tf.float32,
            trainable=True)
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        def _q_net():
            return Critic(self.feat_dim, self.a_dim, hidden_units)

        self.critic_net = DoubleQ(_q_net)
        self.critic_target_net = DoubleQ(_q_net)
        self.critic_tv = self.critic_net.trainable_variables + self.other_tv
        update_target_net_weights(self.critic_target_net.weights,
                                  self.critic_net.weights)
        self.q_lr, self.alpha_lr = map(self.init_lr, [q_lr, alpha_lr])
        self.optimizer_critic, self.optimizer_alpha = map(
            self.init_optimizer, [self.q_lr, self.alpha_lr])

        self.model_recorder(
            dict(critic_net=self.critic_net,
                 optimizer_critic=self.optimizer_critic,
                 optimizer_alpha=self.optimizer_alpha))
Example #7
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 v_min=-10,
                 v_max=10,
                 atoms=51,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 hidden_units={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        assert not is_continuous, 'rainbow only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.v_min = v_min
        self.v_max = v_max
        self.atoms = atoms
        self.delta_z = (self.v_max - self.v_min) / (self.atoms - 1)
        self.z = tf.reshape(
            tf.constant(
                [self.v_min + i * self.delta_z for i in range(self.atoms)],
                dtype=tf.float32), [-1, self.atoms])  # [1, N]
        self.zb = tf.tile(self.z, tf.constant([self.a_dim, 1]))  # [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net():
            return NetWork(self.feat_dim, self.a_dim, self.atoms, hidden_units)

        self.rainbow_net = _net()
        self.rainbow_target_net = _net()
        self.critic_tv = self.rainbow_net.trainable_variables + self.other_tv
        update_target_net_weights(self.rainbow_target_net.weights,
                                  self.rainbow_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(
            dict(model=self.rainbow_net, optimizer=self.optimizer))
Example #8
    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
                self.intermediate_variable_reset()
                low_data = self.get_transitions(
                    self.data_low,
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'])
                high_data = self.get_transitions(
                    self.data_high,
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'])

                # Get the arguments that need to be passed to the train functions
                _low_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'a', 'r', 's_', 'done', 'g', 'g_'],
                    data_dict=low_data)
                _high_training_data = self.get_value_from_dict(
                    data_name_list=['s', 'r', 'a', 'g', 'done', 's_'],
                    data_dict=high_data)
                summaries = self.train_low(_low_training_data)

                self.summaries.update(summaries)
                update_target_net_weights(
                    self.low_actor_target.weights +
                    self.low_critic_target.weights,
                    self.low_actor.weights + self.low_critic.weights,
                    self.ployak)
                if self.counts % self.sub_goal_steps == 0:
                    self.counts = 0
                    high_summaries = self.train_high(_high_training_data)
                    self.summaries.update(high_summaries)
                    update_target_net_weights(
                        self.high_actor_target.weights +
                        self.high_critic_target.weights,
                        self.high_actor.weights + self.high_critic.weights,
                        self.ployak)
                self.counts += 1
                self.summaries.update(
                    dict([[
                        'LEARNING_RATE/low_actor_lr',
                        self.low_actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/low_critic_lr',
                              self.low_critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/high_actor_lr',
                              self.high_actor_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/high_critic_lr',
                              self.high_critic_lr(self.train_step)
                          ]]))
                self.write_training_summaries(self.global_step, self.summaries)
Example #9
    def __init__(self,
                 envspec,
                 nums=20,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 network_settings=[128, 128],
                 **kwargs):
        assert not envspec.is_continuous, 'qrdqn only support discrete action space'
        assert nums > 0
        super().__init__(envspec=envspec, **kwargs)
        self.nums = nums
        self.huber_delta = huber_delta
        self.quantiles = tf.reshape(
            tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums),
                        dtype=tf.float32), [-1, self.nums])  # [1, N]
        self.batch_quantiles = tf.tile(self.quantiles,
                                       [self.a_dim, 1])  # [1, N] => [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _create_net(name, representation_net=None):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.QRDQN_DISTRIBUTIONAL,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      nums=self.nums,
                                      network_settings=network_settings))

        self.q_dist_net = _create_net('q_dist_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.q_target_dist_net = _create_net('q_target_dist_net',
                                             self._representation_target_net)
        update_target_net_weights(self.q_target_dist_net.weights,
                                  self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self._worker_params_dict.update(self.q_dist_net._policy_models)

        self._all_params_dict.update(self.q_dist_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
Example #10
    def __init__(self,
                 s_dim: Union[List[int], np.ndarray],
                 a_dim: Union[List[int], np.ndarray],
                 is_continuous: Union[List[bool], np.ndarray],

                 ployak: float = 0.995,
                 actor_lr: float = 5.0e-4,
                 critic_lr: float = 1.0e-3,
                 hidden_units: Dict = {
                     'actor': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        '''
        TODO: Annotation
        '''
        assert all(is_continuous), 'maddpg only support continuous action space'
        super().__init__(
            s_dim=s_dim,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.ployak = ployak

        # self.action_noises = NormalActionNoise(mu=np.zeros(self.a_dim), sigma=1 * np.ones(self.a_dim))
        self.action_noises = {i: OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.a_dim[i]), sigma=0.2 * np.ones(self.a_dim[i])) for i in range(self.agent_sep_ctls)}

        def _actor_net(i): return ActorCts(self.s_dim[i], self.a_dim[i], hidden_units['actor'])
        self.actor_nets = {i: _actor_net(i) for i in range(self.agent_sep_ctls)}
        self.actor_target_nets = {i: _actor_net(i) for i in range(self.agent_sep_ctls)}

        def _q_net(): return Critic(self.total_s_dim, self.total_a_dim, hidden_units['q'])
        self.q_nets = {i: _q_net() for i in range(self.agent_sep_ctls)}
        self.q_target_nets = {i: _q_net() for i in range(self.agent_sep_ctls)}

        for i in range(self.agent_sep_ctls):
            update_target_net_weights(
                self.actor_target_nets[i].weights + self.q_target_nets[i].weights,
                self.actor_nets[i].weights + self.q_nets[i].weights
            )

        self.actor_lrs = {i: self.init_lr(actor_lr) for i in range(self.agent_sep_ctls)}
        self.critic_lrs = {i: self.init_lr(critic_lr) for i in range(self.agent_sep_ctls)}
        self.optimizer_actors = {i: self.init_optimizer(self.actor_lrs[i]) for i in range(self.agent_sep_ctls)}
        self.optimizer_critics = {i: self.init_optimizer(self.critic_lrs[i]) for i in range(self.agent_sep_ctls)}

        models_and_optimizers = {}
        models_and_optimizers.update({f'actor-{i}': self.actor_nets[i] for i in range(self.agent_sep_ctls)})
        models_and_optimizers.update({f'critic-{i}': self.q_nets[i] for i in range(self.agent_sep_ctls)})
        models_and_optimizers.update({f'optimizer_actor-{i}': self.optimizer_actors[i] for i in range(self.agent_sep_ctls)})
        models_and_optimizers.update({f'optimizer_critic-{i}': self.optimizer_critics[i] for i in range(self.agent_sep_ctls)})

        self.model_recorder(models_and_optimizers)
Example #11
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 nums=20,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 hidden_units=[128, 128],
                 **kwargs):
        assert not is_continuous, 'qrdqn only support discrete action space'
        assert nums > 0
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.nums = nums
        self.huber_delta = huber_delta
        self.quantiles = tf.reshape(
            tf.constant((2 * np.arange(self.nums) + 1) / (2.0 * self.nums),
                        dtype=tf.float32), [-1, self.nums])  # [1, N]
        self.batch_quantiles = tf.tile(self.quantiles,
                                       [self.a_dim, 1])  # [1, N] => [A, N]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net():
            return NetWork(self.feat_dim, self.a_dim, self.nums, hidden_units)

        self.q_dist_net = _net()
        self.q_target_dist_net = _net()
        self.critic_tv = self.q_dist_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_dist_net.weights,
                                  self.q_dist_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)
        self.model_recorder(
            dict(model=self.q_dist_net, optimizer=self.optimizer))
Example #12
    def __init__(self,
                 envspec,

                 target_k: int = 4,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        assert not envspec.is_continuous, 'dqn only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.target_k = target_k
        assert self.target_k > 0, "assert self.target_k > 0"
        self.target_nets = []
        self.current_target_idx = 0

        def _create_net(name, representation_net=None): return ValueNetwork(
            name=name,
            representation_net=representation_net,
            value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
            value_net_kwargs=dict(output_shape=self.a_dim, network_settings=network_settings)
        )
        self.q_net = _create_net('dqn_q_net', self._representation_net)

        for i in range(self.target_k):
            target_q_net = _create_net(
                'dqn_q_target_net' + str(i),
                self._create_representation_net('_representation_target_net' + str(i))
            )
            update_target_net_weights(target_q_net.weights, self.q_net.weights)
            self.target_nets.append(target_q_net)

        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self._worker_params_dict.update(self.q_net._policy_models)

        self._all_params_dict.update(self.q_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
Example #13
    def __init__(self,
                 envspec,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 network_settings=[32, 32],
                 **kwargs):
        assert not envspec.is_continuous, 'Bootstrapped DQN only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = [1. / head_num for _ in range(head_num)]
        self.now_head = 0

        def _create_net(name, representation_net=None):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.CRITIC_QVALUE_BOOTSTRAP,
                value_net_kwargs=dict(output_shape=self.a_dim,
                                      head_num=self.head_num,
                                      network_settings=network_settings))

        self.q_net = _create_net('q_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.q_target_net = _create_net('q_target_net',
                                        self._representation_target_net)
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self._worker_params_dict.update(self.q_net._policy_models)

        self._all_params_dict.update(self.q_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
Example #14
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,

                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 hidden_units={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        assert not is_continuous, 'dueling double dqn only support discrete action space'
        super().__init__(
            s_dim=s_dim,
            visual_sources=visual_sources,
            visual_resolution=visual_resolution,
            a_dim=a_dim,
            is_continuous=is_continuous,
            **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _net(): return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.dueling_net = _net()
        self.dueling_target_net = _net()
        self.critic_tv = self.dueling_net.trainable_variables + self.other_tv
        update_target_net_weights(self.dueling_target_net.weights, self.dueling_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(
            model=self.dueling_net,
            optimizer=self.optimizer
        ))
Example #15
    def __init__(self,
                 envspec,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        assert not envspec.is_continuous, 'dueling double dqn only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _create_net(name, representation_net):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.CRITIC_DUELING,
                value_net_kwargs=dict(output_shape=self.a_dim,
                                      network_settings=network_settings))

        self.dueling_net = _create_net('dueling_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.dueling_target_net = _create_net('dueling_target_net',
                                              self._representation_target_net)
        update_target_net_weights(self.dueling_target_net.weights,
                                  self.dueling_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self._worker_params_dict.update(self.dueling_net._policy_models)

        self._all_params_dict.update(self.dueling_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
Example #16
    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(memories, isw, crsty_loss, cell_state):
            td_error, summaries = self.train(memories, isw, crsty_loss,
                                             cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(
                    tf.math.log(
                        tf.cast(self.alpha_annealing(self.global_step.numpy()),
                                tf.float32)))
            return td_error, summaries

        def _pre_process(data):
            data['visual_s'] = np.transpose(data['visual_s'][:, 0].numpy(),
                                            (0, 3, 1, 2))
            data['visual_s_'] = np.transpose(data['visual_s_'][:, 0].numpy(),
                                             (0, 3, 1, 2))
            data['pos'] = self.data_convert(
                np.transpose(random_crop(data['visual_s'], self.img_size),
                             (0, 2, 3, 1)))
            data['visual_s'] = self.data_convert(
                np.transpose(random_crop(data['visual_s'], self.img_size),
                             (0, 2, 3, 1)))
            data['visual_s_'] = self.data_convert(
                np.transpose(random_crop(data['visual_s_'], self.img_size),
                             (0, 2, 3, 1)))
            return (data, )

        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    _train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.critic_target_net.weights + self.encoder_target.
                        trainable_variables, self.critic_net.weights + self.
                        encoder.trainable_variables, self.ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/alpha_lr',
                              self.alpha_lr(self.train_step)
                          ]]),
                    'train_data_list': [
                        's', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done',
                        'pos'
                    ],
                    'pre_process_function':
                    _pre_process
                })
Example #17
    def __init__(self,
                 s_dim,
                 visual_sources,
                 visual_resolution,
                 a_dim,
                 is_continuous,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 hidden_units=[32, 32],
                 **kwargs):
        assert not is_continuous, 'Bootstrapped DQN only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = [1. / head_num for _ in range(head_num)]
        self.now_head = 0

        def _q_net():
            return NetWork(self.feat_dim, self.a_dim, self.head_num,
                           hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
Example #18
    def learn(self, **kwargs) -> NoReturn:
        '''
        TODO: Annotation
        '''
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            if self.data.is_lg_batch_size:
                self.intermediate_variable_reset()
                batch_data = self.data.sample()
                done = batch_data[-1]
                s, visual_a, a, r, s_, visual_s_ = [batch_data[i:i + self.agent_sep_ctls] for i in range(0, len(batch_data) - 1, self.agent_sep_ctls)]
                target_a = [self._get_actions(i, s_[i], evaluation=True, use_target=True) for i in range(self.agent_sep_ctls)]

                s_all = np.hstack(s)
                a_all = np.hstack(a)
                s_next_all = np.hstack(s_)
                target_a_all = np.hstack(target_a)

                for i in range(self.agent_sep_ctls):
                    summary = {}
                    if i == 0:
                        al = np.full(fill_value=[], shape=(done.shape[0], 0), dtype=np.float32)
                        ar = np.hstack(a[i + 1:])
                    elif i == self.agent_sep_ctls - 1:
                        al = np.hstack(a[:i])
                        ar = np.full(fill_value=[], shape=(done.shape[0], 0), dtype=np.float32)
                    else:
                        al = np.hstack(a[:i])
                        ar = np.hstack(a[i + 1:])

                    # actor: al, ar, s(all), s
                    # critic: r, done, s_(all), target_a(all), s(all), a(all)
                    summary.update(self._train(i, s_all, a_all, s_next_all, target_a_all, r[i], done, s[i], al, ar))
                    summary.update({'LEARNING_RATE/actor_lr': self.actor_lrs[i](self.train_step), 'LEARNING_RATE/critic_lr': self.critic_lrs[i](self.train_step)})
                    self.write_training_summaries(self.global_step, summary, self.writers[i])

                self.global_step.assign_add(1)

                for i in range(self.agent_sep_ctls):
                    update_target_net_weights(
                        self.actor_target_nets[i].weights + self.q_target_nets[i].weights,
                        self.actor_nets[i].weights + self.q_nets[i].weights,
                        self.ployak)
Example #19
    def __init__(self,
                 s_dim: Union[int, np.ndarray],
                 visual_sources: Union[int, np.ndarray],
                 visual_resolution: Union[List, np.ndarray],
                 a_dim: Union[int, np.ndarray],
                 is_continuous: Union[bool, np.ndarray],
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 hidden_units: List[int] = [32, 32],
                 **kwargs):
        assert not is_continuous, 'dqn only support discrete action space'
        super().__init__(s_dim=s_dim,
                         visual_sources=visual_sources,
                         visual_resolution=visual_resolution,
                         a_dim=a_dim,
                         is_continuous=is_continuous,
                         **kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self.max_train_step)
        self.assign_interval = assign_interval

        def _q_net():
            return NetWork(self.feat_dim, self.a_dim, hidden_units)

        self.q_net = _q_net()
        self.q_target_net = _q_net()
        self.critic_tv = self.q_net.trainable_variables + self.other_tv
        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        self.model_recorder(dict(model=self.q_net, optimizer=self.optimizer))
Example #20
    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            if self.data_low.is_lg_batch_size and self.data_high.is_lg_batch_size:
                self.intermediate_variable_reset()
                low_data = self.get_transitions(self.data_low)
                high_data = self.get_transitions(self.data_high)

                summaries = self.train_low(low_data)

                self.summaries.update(summaries)
                update_target_net_weights(self.low_ac_target_net.weights,
                                          self.low_ac_net.weights, self.ployak)
                if self.counts % self.sub_goal_steps == 0:
                    self.counts = 0
                    high_summaries = self.train_high(high_data)
                    self.summaries.update(high_summaries)
                    update_target_net_weights(self.high_ac_target_net.weights,
                                              self.high_ac_net.weights,
                                              self.ployak)
                self.counts += 1
                self.summaries.update(
                    dict([[
                        'LEARNING_RATE/low_actor_lr',
                        self.low_actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/low_critic_lr',
                              self.low_critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/high_actor_lr',
                              self.high_actor_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/high_critic_lr',
                              self.high_critic_lr(self.train_step)
                          ]]))
                self.write_training_summaries(self.global_step, self.summaries)
Example #21
    def __init__(self,
                 envspec,
                 lr=5.0e-4,
                 alpha=2,
                 ployak=0.995,
                 network_settings=[32, 32],
                 **kwargs):
        assert not envspec.is_continuous, 'sql only support discrete action space'
        super().__init__(envspec=envspec, **kwargs)
        self.alpha = alpha
        self.ployak = ployak

        def _create_net(name, representation_net=None):
            return ValueNetwork(
                name=name,
                representation_net=representation_net,
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ALL,
                value_net_kwargs=dict(output_shape=self.a_dim,
                                      network_settings=network_settings))

        self.q_net = _create_net('q_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.q_target_net = _create_net('q_target_net',
                                        self._representation_target_net)
        self.lr = self.init_lr(lr)
        self.optimizer = self.init_optimizer(self.lr)

        update_target_net_weights(self.q_target_net.weights,
                                  self.q_net.weights)

        self._worker_params_dict.update(self.q_net._policy_models)

        self._all_params_dict.update(self.q_net._all_models)
        self._all_params_dict.update(optimizer=self.optimizer)
        self._model_post_process()
Example #22
    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function': self.train,
                    'update_function': lambda: update_target_net_weights(
                        self.q_target_net.weights, self.q_net.weights, self.ployak),
                    'summary_dict': dict(
                        [['LEARNING_RATE/lr', self.lr(self.train_step)]])
                })
Example #23
    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')
        for i in range(self.train_times_per_step):
            self._learn(function_dict={
                'train_function': self.train,
                'update_function': lambda: update_target_net_weights(
                    self.actor_target_net.weights + self.reward_critic_target_net.weights + self.cost_critic_target_net.weights,
                    self.actor_net.weights + self.reward_critic_net.weights + self.cost_critic_net.weights,
                    self.ployak),
                'summary_dict': dict([
                    ['LEARNING_RATE/actor_lr', self.actor_lr(self.train_step)],
                    ['LEARNING_RATE/reward_critic_lr', self.reward_critic_lr(self.train_step)],
                    ['LEARNING_RATE/cost_critic_lr', self.cost_critic_lr(self.train_step)]
                ]),
                'sample_data_list': ['s', 'visual_s', 'a', 'r', 's_', 'visual_s_', 'done', 'cost'],
                'train_data_list': ['ss', 'vvss', 'a', 'r', 'done', 'cost'],
            })
Example #24
    def learn(self, **kwargs):
        self.train_step = kwargs.get('train_step')

        def _train(memories, isw, crsty_loss, cell_state):
            if self.is_continuous or self.use_gumbel:
                td_error, summaries = self.train_persistent(
                    memories, isw, crsty_loss, cell_state)
            else:
                td_error, summaries = self.train_discrete(
                    memories, isw, crsty_loss, cell_state)
            if self.annealing and not self.auto_adaption:
                self.log_alpha.assign(
                    tf.math.log(
                        tf.cast(self.alpha_annealing(self.global_step.numpy()),
                                tf.float32)))
            return td_error, summaries

        for i in range(self.train_times_per_step):
            self._learn(
                function_dict={
                    'train_function':
                    _train,
                    'update_function':
                    lambda: update_target_net_weights(
                        self.critic_target_net.weights, self.critic_net.
                        weights, self.ployak),
                    'summary_dict':
                    dict([[
                        'LEARNING_RATE/actor_lr',
                        self.actor_lr(self.train_step)
                    ],
                          [
                              'LEARNING_RATE/critic_lr',
                              self.critic_lr(self.train_step)
                          ],
                          [
                              'LEARNING_RATE/alpha_lr',
                              self.alpha_lr(self.train_step)
                          ]])
                })
Example #25
    def __init__(self,
                 envspec,

                 ployak=0.995,
                 delay_num=2,
                 gaussian_noise_sigma=0.2,
                 gaussian_noise_bound=0.2,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.ployak = ployak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau
        self.gaussian_noise_sigma = gaussian_noise_sigma
        self.gaussian_noise_bound = gaussian_noise_bound

        if self.is_continuous:
            def _create_net(name, representation_net=None): return ADoubleCNetwork(
                name=name,
                representation_net=representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DPG,
                policy_net_kwargs=dict(output_shape=self.a_dim,
                                       network_settings=network_settings['actor_continuous']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q'])
            )
            self.noised_action = self.target_noised_action = ClippedNormalNoisedAction(sigma=self.gaussian_noise_sigma, noise_bound=self.gaussian_noise_bound)
        else:
            def _create_net(name, representation_net=None): return ADoubleCNetwork(
                name=name,
                representation_net=representation_net,
                policy_net_type=OutputNetworkType.ACTOR_DCT,
                policy_net_kwargs=dict(output_shape=self.a_dim,
                                       network_settings=network_settings['actor_discrete']),
                value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                value_net_kwargs=dict(action_dim=self.a_dim,
                                      network_settings=network_settings['q'])
            )
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        self.ac_net = _create_net('ac_net', self._representation_net)
        self._representation_target_net = self._create_representation_net('_representation_target_net')
        self.ac_target_net = _create_net('ac_target_net', self._representation_target_net)

        update_target_net_weights(self.ac_target_net.weights, self.ac_net.weights)
        self.actor_lr, self.critic_lr = map(self.init_lr, [actor_lr, critic_lr])
        self.optimizer_actor, self.optimizer_critic = map(self.init_optimizer, [self.actor_lr, self.critic_lr])

        self._worker_params_dict.update(self.ac_net._policy_models)

        self._all_params_dict.update(self.ac_net._all_models)
        self._all_params_dict.update(optimizer_actor=self.optimizer_actor,
                                     optimizer_critic=self.optimizer_critic)
        self._model_post_process()
Example #26
    def _target_params_update(self):
        update_target_net_weights(self.ac_target_net.weights, self.ac_net.weights, self.ployak)
Example #27
    def _update():
        if self.global_step % self.assign_interval == 0:
            update_target_net_weights(self.q_target_dist_net.weights, self.q_dist_net.weights)
Example #28
    def _target_params_update(self):
        if self.global_step % self.assign_interval == 0:
            update_target_net_weights(self.q_target_net.weights,
                                      self.q_net.weights)
Example #29
    def _target_params_update(self):
        if self.global_step % self.assign_interval == 0:
            update_target_net_weights(self.target_nets[self.current_target_idx].weights, self.q_net.weights)
            self.current_target_idx = (self.current_target_idx + 1) % self.target_k
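
Example #29 rotates across the target_k target networks built in Example #12, hard-copying into only one of them per interval and then advancing a round-robin index. A standalone sketch of just that rotation, using plain Python stand-ins rather than the library's classes:

# Hypothetical illustration of the round-robin rotation in Example #29.
target_k = 4
current_target_idx = 0
for update in range(6):
    # the real code would call update_target_net_weights(...) on this target here
    print(f'update {update}: refresh target net {current_target_idx}')
    current_target_idx = (current_target_idx + 1) % target_k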
Example #30
    def __init__(
            self,
            envspec,
            ployak=0.995,
            actor_lr=5.0e-4,
            reward_critic_lr=1.0e-3,
            cost_critic_lr=1.0e-3,
            lambda_lr=5.0e-4,
            discrete_tau=1.0,
            cost_constraint=1.0,
            network_settings={
                'actor_continuous': [32, 32],
                'actor_discrete': [32, 32],
                'reward': [32, 32],
                'cost': [32, 32]
            },
            **kwargs):
        super().__init__(envspec=envspec, **kwargs)
        self.ployak = ployak
        self.discrete_tau = discrete_tau
        self._lambda = tf.Variable(0.0, dtype=tf.float32)
        self.cost_constraint = cost_constraint  # long-term cost <= d

        if self.is_continuous:
            # NOTE: value_net is reward net; value_net2 is cost net.
            def _create_net(name, representation_net=None):
                return ACCNetwork(
                    name=name,
                    representation_net=representation_net,
                    policy_net_type=OutputNetworkType.ACTOR_DPG,
                    policy_net_kwargs=dict(
                        output_shape=self.a_dim,
                        network_settings=network_settings['actor_continuous']),
                    value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                    value_net_kwargs=dict(
                        action_dim=self.a_dim,
                        network_settings=network_settings['reward']),
                    value_net2_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                    value_net2_kwargs=dict(
                        action_dim=self.a_dim,
                        network_settings=network_settings['cost']))
        else:

            def _create_net(name, representation_net=None):
                return ACCNetwork(
                    name=name,
                    representation_net=representation_net,
                    policy_net_type=OutputNetworkType.ACTOR_DCT,
                    policy_net_kwargs=dict(
                        output_shape=self.a_dim,
                        network_settings=network_settings['actor_discrete']),
                    value_net_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                    value_net_kwargs=dict(
                        action_dim=self.a_dim,
                        network_settings=network_settings['reward']),
                    value_net2_type=OutputNetworkType.CRITIC_QVALUE_ONE,
                    value_net2_kwargs=dict(
                        action_dim=self.a_dim,
                        network_settings=network_settings['cost']))

        self.ac_net = _create_net('ac_net', self._representation_net)
        self._representation_target_net = self._create_representation_net(
            '_representation_target_net')
        self.ac_target_net = _create_net('ac_target_net',
                                         self._representation_target_net)

        if self.is_continuous:
            # self.action_noise = NormalActionNoise(sigma=0.2)
            self.action_noise = OrnsteinUhlenbeckActionNoise(sigma=0.2)
        else:
            self.gumbel_dist = tfp.distributions.Gumbel(0, 1)

        update_target_net_weights(self.ac_target_net.weights,
                                  self.ac_net.weights)
        self.lambda_lr = lambda_lr
        self.actor_lr, self.reward_critic_lr, self.cost_critic_lr = map(
            self.init_lr, [actor_lr, reward_critic_lr, cost_critic_lr])
        self.optimizer_actor, self.optimizer_reward_critic, self.optimizer_cost_critic = map(
            self.init_optimizer,
            [self.actor_lr, self.reward_critic_lr, self.cost_critic_lr])

        self._worker_params_dict.update(self.ac_net._policy_models)

        self._all_params_dict.update(self.ac_net._all_models)
        self._all_params_dict.update(
            optimizer_actor=self.optimizer_actor,
            optimizer_reward_critic=self.optimizer_reward_critic,
            optimizer_cost_critic=self.optimizer_cost_critic)
        self._model_post_process()