Example 1
    def __init__(self, wm_lr=1e-3, roll_out_horizon=15, **kwargs):
        super().__init__(**kwargs)
        network_settings = kwargs.get('network_settings', {})
        assert not self.obs_spec.has_visual_observation, "visual observations are not supported"
        assert self.obs_spec.has_vector_observation, "vector observations are required"

        self._wm_lr = wm_lr
        self._roll_out_horizon = roll_out_horizon
        self._forward_dynamic_model = VectorSA2S(
            self.obs_spec.vector_dims[0],
            self.a_dim,
            hidden_units=network_settings['forward_model'])
        self._reward_model = VectorSA2R(
            self.obs_spec.vector_dims[0],
            self.a_dim,
            hidden_units=network_settings['reward_model'])
        self._done_model = VectorSA2D(
            self.obs_spec.vector_dims[0],
            self.a_dim,
            hidden_units=network_settings['done_model'])
        self._wm_oplr = OPLR([
            self._forward_dynamic_model, self._reward_model, self._done_model
        ], self._wm_lr, **self._oplr_params)
        self._trainer_modules.update(
            _forward_dynamic_model=self._forward_dynamic_model,
            _reward_model=self._reward_model,
            _done_model=self._done_model,
            _wm_oplr=self._wm_oplr)
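For orientation, a minimal sketch of how the three world models wired above could drive an imagined rollout over _roll_out_horizon steps. The call signature model(state, action) and the policy interface are assumptions, not taken from the source.

def imagine_rollout(policy, dynamics, reward_model, done_model, s0, horizon):
    # Roll a policy through the learned models for `horizon` steps.
    s, trajectory = s0, []
    for _ in range(horizon):
        a = policy(s)            # assumed policy interface: state -> action
        s_next = dynamics(s, a)  # VectorSA2S: (s, a) -> next state
        r = reward_model(s, a)   # VectorSA2R: (s, a) -> reward
        d = done_model(s, a)     # VectorSA2D: (s, a) -> done probability
        trajectory.append((s, a, r, d))
        s = s_next
    return trajectory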
Example 2
 def __init__(self,
              lr: float = 5.0e-4,
              eps_init: float = 1,
              eps_mid: float = 0.2,
              eps_final: float = 0.01,
              init2mid_annealing_step: int = 1000,
              assign_interval: int = 1000,
              network_settings: List[int] = [32, 32],
              **kwargs):
     super().__init__(**kwargs)
     assert not self.is_continuous, 'DQN only supports discrete action spaces'
     self.expl_expt_mng = ExplorationExploitationClass(
         eps_init=eps_init,
         eps_mid=eps_mid,
         eps_final=eps_final,
         init2mid_annealing_step=init2mid_annealing_step,
         max_step=self._max_train_step)
     self.assign_interval = assign_interval
     self.q_net = TargetTwin(
         CriticQvalueAll(self.obs_spec,
                         rep_net_params=self._rep_net_params,
                         output_shape=self.a_dim,
                         network_settings=network_settings)).to(self.device)
     self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
     self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
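The constructor delegates epsilon scheduling to ExplorationExploitationClass. Below is a minimal sketch of the two-phase schedule its parameters suggest (eps_init to eps_mid over init2mid_annealing_step steps, then eps_mid to eps_final by max_step); the actual class may differ.

import numpy as np

def epsilon_at(step, eps_init=1.0, eps_mid=0.2, eps_final=0.01,
               init2mid_annealing_step=1000, max_step=10000):
    # Phase 1: linear anneal eps_init -> eps_mid.
    if step < init2mid_annealing_step:
        frac = step / init2mid_annealing_step
        return eps_init + frac * (eps_mid - eps_init)
    # Phase 2: linear anneal eps_mid -> eps_final over the remaining steps.
    frac = min(1.0, (step - init2mid_annealing_step) / (max_step - init2mid_annealing_step))
    return eps_mid + frac * (eps_final - eps_mid)

# is_random(step) then presumably reduces to: np.random.rand() < epsilon_at(step)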
Example 3
 def __init__(self,
              nums=20,
              huber_delta=1.,
              lr=5.0e-4,
              eps_init=1,
              eps_mid=0.2,
              eps_final=0.01,
              init2mid_annealing_step=1000,
              assign_interval=1000,
              network_settings=[128, 128],
              **kwargs):
     assert nums > 0, 'nums must be positive'
     super().__init__(**kwargs)
     assert not self.is_continuous, 'QRDQN only supports discrete action spaces'
     self.nums = nums
     self.huber_delta = huber_delta
     self.quantiles = th.tensor((2 * np.arange(self.nums) + 1) / (2.0 * self.nums)).float().to(self.device)  # [N,]
     self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                       eps_mid=eps_mid,
                                                       eps_final=eps_final,
                                                       init2mid_annealing_step=init2mid_annealing_step,
                                                       max_step=self._max_train_step)
     self.assign_interval = assign_interval
     self.q_net = TargetTwin(QrdqnDistributional(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 action_dim=self.a_dim,
                                                 nums=self.nums,
                                                 network_settings=network_settings)).to(self.device)
     self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
     self._trainer_modules.update(model=self.q_net,
                                  oplr=self.oplr)
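A quick worked example of the quantile fractions computed above: (2i + 1) / (2N) gives the midpoints of N equal-probability bins of the return distribution.

import numpy as np
print((2 * np.arange(4) + 1) / (2.0 * 4))  # [0.125 0.375 0.625 0.875] for nums=4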
Example 4
    def __init__(self,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 head_num=4,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'Bootstrapped DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.head_num = head_num
        self._probs = th.FloatTensor([1. / head_num for _ in range(head_num)])
        self.now_head = 0

        self.q_net = TargetTwin(
            CriticQvalueBootstrap(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  head_num=self.head_num,
                                  network_settings=network_settings)).to(
                                      self.device)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
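A hypothetical sketch of how the uniform _probs above is typically used in Bootstrapped DQN: one head is sampled per episode and drives action selection for the whole episode (the episode-reset hook itself is not shown in this snippet).

import torch as th

probs = th.full((4,), 1.0 / 4)  # _probs for head_num=4
now_head = th.distributions.Categorical(probs).sample().item()
# the agent would then act greedily w.r.t. the Q-values of head `now_head`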
Example 5
    def __init__(self,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'Dueling Double DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.assign_interval = assign_interval

        self.q_net = TargetTwin(CriticDueling(self.obs_spec,
                                              rep_net_params=self._rep_net_params,
                                              output_shape=self.a_dim,
                                              network_settings=network_settings)).to(self.device)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net,
                                     oplr=self.oplr)
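CriticDueling presumably combines its 'share', 'v' and 'adv' streams in the standard dueling way; a minimal sketch of that aggregation, assuming the usual mean-advantage baseline.

import torch as th

def dueling_q(v, adv):
    # v: [B, 1] state value, adv: [B, A] advantages -> q: [B, A]
    return v + adv - adv.mean(-1, keepdim=True)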
Example 6
    def __init__(
            self,
            agent_spec,
            lr=5.0e-4,
            network_settings={
                'actor_continuous': {
                    'hidden_units': [32, 32],
                    'condition_sigma': False,
                    'log_std_bound': [-20, 2]
                },
                'actor_discrete': [32, 32]
            },
            **kwargs):
        super().__init__(agent_spec=agent_spec, **kwargs)
        if self.is_continuous:
            self.net = ActorMuLogstd(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous']).to(
                    self.device)
        else:
            self.net = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete']).to(
                    self.device)
        self.oplr = OPLR(self.net, lr, **self._oplr_params)

        self._trainer_modules.update(model=self.net, oplr=self.oplr)
Example 7
    def __init__(self,
                 polyak=0.995,
                 noise_action='ou',
                 noise_params={'sigma': 0.2},
                 use_target_action_noise=False,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise

        if self.is_continuous:
            actor = ActorDPG(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous'])
            self.target_noised_action = ClippedNormalNoisedAction(
                sigma=0.2, noise_bound=0.2)
            if noise_action in ['ou', 'clip_normal']:
                self.noised_action = Noise_action_REGISTER[noise_action](
                    **noise_params)
            elif noise_action == 'normal':
                self.noised_action = self.target_noised_action
            else:
                raise ValueError(
                    f'unsupported noise action type: {noise_action}')
        else:
            actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete'])
        self.actor = TargetTwin(actor, self.polyak).to(self.device)
        self.critic = TargetTwin(
            CriticQvalueOne(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            network_settings=network_settings['q']),
            self.polyak).to(self.device)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
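TargetTwin(actor, self.polyak) presumably maintains a target copy refreshed by polyak averaging; a hedged sketch of that soft update, assuming sync() applies it parameter-wise.

import torch as th

@th.no_grad()
def soft_update(target_net, online_net, polyak=0.995):
    # target <- polyak * target + (1 - polyak) * online
    for tp, p in zip(target_net.parameters(), online_net.parameters()):
        tp.mul_(polyak).add_((1.0 - polyak) * p)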
Example 8
    def __init__(self,
                 polyak=0.995,
                 delay_num=2,
                 noise_action='clip_normal',
                 noise_params={
                     'sigma': 0.2,
                     'noise_bound': 0.2
                 },
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau

        if self.is_continuous:
            actor = ActorDPG(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous'])
            noise_cls = Noise_action_REGISTER[noise_action]
            self.noised_action = self.target_noised_action = noise_cls(**noise_params)
        else:
            actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete'])
        self.actor = TargetTwin(actor, self.polyak).to(self.device)

        self.critic = TargetTwin(
            CriticQvalueOne(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            network_settings=network_settings['q']),
            self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR([self.critic, self.critic2], critic_lr,
                                **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     critic2=self.critic2,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
Example 9
    def __init__(
            self,
            agent_spec,
            actor_step_size=0.5,
            beta=1.0e-3,
            lambda_=0.95,
            cg_iters=10,
            damping_coeff=0.1,
            epsilon=0.2,
            critic_lr=1e-3,
            train_critic_iters=10,
            network_settings={
                'actor_continuous': {
                    'hidden_units': [64, 64],
                    'condition_sigma': False,
                    'log_std_bound': [-20, 2]
                },
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(agent_spec=agent_spec, **kwargs)
        self.actor_step_size = actor_step_size
        self.beta = beta
        self.lambda_ = lambda_
        self._epsilon = epsilon
        self._cg_iters = cg_iters
        self._damping_coeff = damping_coeff
        self._train_critic_iters = train_critic_iters

        if self.is_continuous:
            self.actor = ActorMuLogstd(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous']).to(
                    self.device)
        else:
            self.actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete']).to(
                    self.device)
        self.critic = CriticValue(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            network_settings=network_settings['critic']).to(self.device)

        self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     critic_oplr=self.critic_oplr)
Example 10
    def __init__(self,
                 mixer='vdn',
                 mixer_settings={},
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 use_double=True,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 network_settings={
                     'share': [128],
                     'v': [128],
                     'adv': [128]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        assert not any(self.is_continuouss.values()), \
            'VDN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self._use_double = use_double
        self._mixer_type = mixer
        self._mixer_settings = mixer_settings

        self.q_nets = {}
        for id in set(self.model_ids):
            self.q_nets[id] = TargetTwin(
                CriticDueling(self.obs_specs[id],
                              rep_net_params=self._rep_net_params,
                              output_shape=self.a_dims[id],
                              network_settings=network_settings)).to(
                                  self.device)

        self.mixer = self._build_mixer()

        self.oplr = OPLR(
            tuple(self.q_nets.values()) + (self.mixer, ), lr,
            **self._oplr_params)
        self._trainer_modules.update(
            {f"model_{id}": self.q_nets[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(mixer=self.mixer, oplr=self.oplr)
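A hedged sketch of the 'vdn' mixer that _build_mixer presumably returns for the default setting: VDN decomposes the joint action value as a plain sum of per-agent Q values.

import torch as th
import torch.nn as nn

class VDNMixer(nn.Module):
    def forward(self, qs):
        # qs: [T, B, n_agents] per-agent chosen-action Q values
        return qs.sum(-1, keepdim=True)  # Q_tot: [T, B, 1]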
Example 11
    def __init__(self,
                 alpha=0.2,
                 beta=0.1,
                 polyak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'MaxSQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.use_epsilon = use_epsilon
        self.polyak = polyak
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        self.critic = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 output_shape=self.a_dim,
                                                 network_settings=network_settings),
                                 self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        self.critic_oplr = OPLR([self.critic, self.critic2], q_lr, **self._oplr_params)

        if self.auto_adaption:
            # construct on the target device so log_alpha stays a leaf tensor
            # that the optimizer can update
            self.log_alpha = th.tensor(0., device=self.device, requires_grad=True)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)

        self._trainer_modules.update(critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     critic_oplr=self.critic_oplr)
Example 12
    def __init__(self,
                 obs_spec,
                 rep_net_params,
                 is_continuous,
                 action_dim,
                 *,
                 eta=0.2,
                 lr=1.0e-3,
                 beta=0.2):
        """
        params:
            is_continuous: specify whether the action space is continuous (True) or discrete (False)
            action_dim: dimension of the action

            eta: weight of the intrinsic reward
            lr: learning rate of the curiosity model
            beta: weight factor balancing the losses of inverse_dynamic_net and forward_net
        """
        super().__init__()
        self.eta = eta
        self.beta = beta
        self.is_continuous = is_continuous
        self.action_dim = action_dim

        self.rep_net = RepresentationNetwork(obs_spec=obs_spec,
                                             rep_net_params=rep_net_params)

        self.feat_dim = self.rep_net.h_dim

        # S, S' => A
        self.inverse_dynamic_net = nn.Sequential(
            nn.Linear(self.feat_dim * 2, self.feat_dim * 2),
            Act_REGISTER[default_act](),
            nn.Linear(self.feat_dim * 2, action_dim))
        if self.is_continuous:
            self.inverse_dynamic_net.add_module('tanh', nn.Tanh())

        # S, A => S'
        self.forward_net = nn.Sequential(
            nn.Linear(self.feat_dim + action_dim,
                      self.feat_dim), Act_REGISTER[default_act](),
            nn.Linear(self.feat_dim, self.feat_dim))

        self.oplr = OPLR(
            models=[self.rep_net, self.inverse_dynamic_net, self.forward_net],
            lr=lr)
Example 13
    def __init__(
            self,
            agent_spec,
            beta=1.0e-3,
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            network_settings={
                'actor_continuous': {
                    'hidden_units': [64, 64],
                    'condition_sigma': False,
                    'log_std_bound': [-20, 2]
                },
                'actor_discrete': [32, 32],
                'critic': [32, 32]
            },
            **kwargs):
        super().__init__(agent_spec=agent_spec, **kwargs)
        self.beta = beta

        if self.is_continuous:
            self.actor = ActorMuLogstd(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous']).to(
                    self.device)
        else:
            self.actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete']).to(
                    self.device)
        self.critic = CriticValue(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            network_settings=network_settings['critic']).to(self.device)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)

        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
Example 14
    def __init__(self,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 use_target_action_noise=False,
                 noise_action='ou',
                 noise_params={
                     'sigma': 0.2
                 },
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.discrete_tau = discrete_tau
        self.use_target_action_noise = use_target_action_noise

        if self.is_continuous:
            self.target_noised_action = ClippedNormalNoisedAction(sigma=0.2, noise_bound=0.2)
            self.noised_action = Noise_action_REGISTER[noise_action](**noise_params)
            self.actor = ActorDPG(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  network_settings=network_settings['actor_continuous']).to(self.device)
        else:
            self.actor = ActorDct(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  network_settings=network_settings['actor_discrete']).to(self.device)

        self.critic = CriticQvalueOne(self.obs_spec,
                                      rep_net_params=self._rep_net_params,
                                      action_dim=self.a_dim,
                                      network_settings=network_settings['q']).to(self.device)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR(self.critic, critic_lr, **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
Example 15
    def __init__(self,
                 lr=5.0e-4,
                 alpha=2,
                 polyak=0.995,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'SQL only supports discrete action spaces'
        self.alpha = alpha
        self.polyak = polyak

        self.q_net = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                rep_net_params=self._rep_net_params,
                                                output_shape=self.a_dim,
                                                network_settings=network_settings),
                                self.polyak).to(self.device)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net,
                                     oplr=self.oplr)
Example 16
 def __init__(self,
              v_min=-10,
              v_max=10,
              atoms=51,
              lr=5.0e-4,
              eps_init=1,
              eps_mid=0.2,
              eps_final=0.01,
              init2mid_annealing_step=1000,
              assign_interval=2,
              network_settings={
                  'share': [128],
                  'v': [128],
                  'adv': [128]
              },
              **kwargs):
     super().__init__(**kwargs)
     assert not self.is_continuous, 'Rainbow only supports discrete action spaces'
     self._v_min = v_min
     self._v_max = v_max
     self._atoms = atoms
     self._delta_z = (self._v_max - self._v_min) / (self._atoms - 1)
     self._z = th.linspace(self._v_min, self._v_max,
                           self._atoms).float().to(self.device)  # [N,]
     self.expl_expt_mng = ExplorationExploitationClass(
         eps_init=eps_init,
         eps_mid=eps_mid,
         eps_final=eps_final,
         init2mid_annealing_step=init2mid_annealing_step,
         max_step=self._max_train_step)
     self.assign_interval = assign_interval
     self.rainbow_net = TargetTwin(
         RainbowDueling(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        action_dim=self.a_dim,
                        atoms=self._atoms,
                        network_settings=network_settings)).to(self.device)
     self.rainbow_net.target.train()  # so that NoisyLinear takes effect
     self.oplr = OPLR(self.rainbow_net, lr, **self._oplr_params)
     self._trainer_modules.update(model=self.rainbow_net, oplr=self.oplr)
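A quick numeric check of the categorical support built above: with the defaults v_min=-10, v_max=10, atoms=51, the spacing _delta_z is 20 / 50 = 0.4.

import torch as th
z = th.linspace(-10, 10, 51)
print(z[:3], z[-1], (z[1] - z[0]).item())  # tensor([-10.0000, -9.6000, -9.2000]) tensor(10.) 0.4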
Example 17
 def __init__(self,
              online_quantiles=8,
              target_quantiles=8,
              select_quantiles=32,
              quantiles_idx=64,
              huber_delta=1.,
              lr=5.0e-4,
              eps_init=1,
              eps_mid=0.2,
              eps_final=0.01,
              init2mid_annealing_step=1000,
              assign_interval=2,
              network_settings={
                  'q_net': [128, 64],
                  'quantile': [128, 64],
                  'tile': [64]
              },
              **kwargs):
     super().__init__(**kwargs)
     assert not self.is_continuous, 'IQN only supports discrete action spaces'
     self.online_quantiles = online_quantiles
     self.target_quantiles = target_quantiles
     self.select_quantiles = select_quantiles
     self.quantiles_idx = quantiles_idx
     self.huber_delta = huber_delta
     self.assign_interval = assign_interval
     self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                       eps_mid=eps_mid,
                                                       eps_final=eps_final,
                                                       init2mid_annealing_step=init2mid_annealing_step,
                                                       max_step=self._max_train_step)
     self.q_net = TargetTwin(IqnNet(self.obs_spec,
                                    rep_net_params=self._rep_net_params,
                                    action_dim=self.a_dim,
                                    quantiles_idx=self.quantiles_idx,
                                    network_settings=network_settings)).to(self.device)
     self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
     self._trainer_modules.update(model=self.q_net,
                                  oplr=self.oplr)
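quantiles_idx=64 presumably sizes the cosine embedding of sampled quantile fractions described in the IQN paper; a minimal sketch of those cosine features (the exact layout inside IqnNet is an assumption).

import torch as th

def quantile_embedding(tau, quantiles_idx=64):
    # tau: [B, N] sampled fractions -> cosine features [B, N, quantiles_idx],
    # which IQN then mixes into the state features via a linear layer + ReLU.
    i = th.arange(quantiles_idx).float()          # [K,]
    return th.cos(th.pi * i * tau.unsqueeze(-1))  # broadcast to [B, N, K]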
Example 18
    def __init__(self,
                 target_k: int = 4,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'DQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.target_k = target_k
        assert self.target_k > 0, 'target_k must be positive'
        self.current_target_idx = 0

        self.q_net = CriticQvalueAll(self.obs_spec,
                                     rep_net_params=self._rep_net_params,
                                     output_shape=self.a_dim,
                                     network_settings=network_settings).to(
                                         self.device)
        self.target_nets = []
        for _ in range(self.target_k):
            target_q_net = deepcopy(self.q_net)
            target_q_net.eval()
            sync_params(target_q_net, self.q_net)
            self.target_nets.append(target_q_net)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
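A hypothetical sketch of the round-robin target refresh that current_target_idx suggests (the _train/_after_train code is not shown here): every assign_interval steps one of the target_k copies is overwritten with the online net, and TD targets can be averaged across copies as in Averaged-DQN.

def refresh_one_target(agent):
    # agent is assumed to be the policy instance built above;
    # sync_params(dst, src) copies parameters, as in the constructor.
    sync_params(agent.target_nets[agent.current_target_idx], agent.q_net)
    agent.current_target_idx = (agent.current_target_idx + 1) % agent.target_k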
Example 19
class CuriosityModel(nn.Module):
    """
    Model of Intrinsic Curiosity Module (ICM).
    Curiosity-driven Exploration by Self-supervised Prediction, https://arxiv.org/abs/1705.05363
    """
    def __init__(self,
                 obs_spec,
                 rep_net_params,
                 is_continuous,
                 action_dim,
                 *,
                 eta=0.2,
                 lr=1.0e-3,
                 beta=0.2):
        """
        params:
            is_continuous: specify whether the action space is continuous (True) or discrete (False)
            action_dim: dimension of the action

            eta: weight of the intrinsic reward
            lr: learning rate of the curiosity model
            beta: weight factor balancing the losses of inverse_dynamic_net and forward_net
        """
        super().__init__()
        self.eta = eta
        self.beta = beta
        self.is_continuous = is_continuous
        self.action_dim = action_dim

        self.rep_net = RepresentationNetwork(obs_spec=obs_spec,
                                             rep_net_params=rep_net_params)

        self.feat_dim = self.rep_net.h_dim

        # S, S' => A
        self.inverse_dynamic_net = nn.Sequential(
            nn.Linear(self.feat_dim * 2, self.feat_dim * 2),
            Act_REGISTER[default_act](),
            nn.Linear(self.feat_dim * 2, action_dim))
        if self.is_continuous:
            self.inverse_dynamic_net.add_module('tanh', nn.Tanh())

        # S, A => S'
        self.forward_net = nn.Sequential(
            nn.Linear(self.feat_dim + action_dim,
                      self.feat_dim), Act_REGISTER[default_act](),
            nn.Linear(self.feat_dim, self.feat_dim))

        self.oplr = OPLR(
            models=[self.rep_net, self.inverse_dynamic_net, self.forward_net],
            lr=lr)

    def forward(self, BATCH):
        fs, _ = self.rep_net(BATCH.obs,
                             begin_mask=BATCH.begin_mask)  # [T, B, *]
        fs_, _ = self.rep_net(BATCH.obs_,
                              begin_mask=BATCH.begin_mask)  # [T, B, *]

        # [T, B, *] <S, A> => S'
        s_eval = self.forward_net(th.cat((fs, BATCH.action), -1))
        LF = 0.5 * (fs_ - s_eval).square().sum(-1, keepdim=True)  # [T, B, 1]
        intrinsic_reward = self.eta * LF
        loss_forward = LF.mean()  # 1

        a_eval = self.inverse_dynamic_net(th.cat((fs, fs_), -1))  # [T, B, *]
        if self.is_continuous:
            loss_inverse = 0.5 * (a_eval - BATCH.action).square().sum(-1).mean()
        else:
            idx = BATCH.action.argmax(-1)  # [T, B]
            loss_inverse = F.cross_entropy(a_eval.view(-1, self.action_dim),
                                           idx.view(-1))  # 1

        loss = (1 - self.beta) * loss_inverse + self.beta * loss_forward
        self.oplr.optimize(loss)
        summaries = {
            'LOSS/curiosity_loss': loss,
            'LOSS/forward_loss': loss_forward,
            'LOSS/inverse_loss': loss_inverse
        }
        return intrinsic_reward, summaries
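A hedged usage sketch of CuriosityModel; obs_spec, rep_net_params and BATCH are placeholders whose construction is project-specific.

# icm = CuriosityModel(obs_spec, rep_net_params,
#                      is_continuous=True, action_dim=4,
#                      eta=0.2, lr=1e-3, beta=0.2)
# intrinsic_reward, summaries = icm(BATCH)  # forward() also optimizes the model
# BATCH.reward = BATCH.reward + intrinsic_reward.detach()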
Example 20
class MAXSQN(SarlOffPolicy):
    """
    https://github.com/createamind/DRL/blob/master/spinup/algos/maxsqn/maxsqn.py
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 alpha=0.2,
                 beta=0.1,
                 polyak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'MaxSQN only supports discrete action spaces'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.use_epsilon = use_epsilon
        self.polyak = polyak
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        self.critic = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 output_shape=self.a_dim,
                                                 network_settings=network_settings),
                                 self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        self.critic_oplr = OPLR([self.critic, self.critic2], q_lr, **self._oplr_params)

        if self.auto_adaption:
            # construct on the target device so log_alpha stays a leaf tensor
            # that the optimizer can update
            self.log_alpha = th.tensor(0., device=self.device, requires_grad=True)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)

        self._trainer_modules.update(critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     critic_oplr=self.critic_oplr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    @iton
    def select_action(self, obs):
        q = self.critic(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.critic.get_rnncs()

        if self.use_epsilon and self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            cate_dist = td.Categorical(logits=(q / self.alpha))
            actions = cate_dist.sample()  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q1 = self.critic(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q2 = self.critic2(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q1_eval = (q1 * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q2_eval = (q2 * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]

        q1_log_probs = (q1 / (self.alpha + th.finfo().eps)).log_softmax(-1)  # [T, B, A]
        q1_entropy = -(q1_log_probs.exp() * q1_log_probs).sum(-1, keepdim=True).mean()  # 1

        q1_target = self.critic.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q2_target = self.critic2.t(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q1_target_max = q1_target.max(-1, keepdim=True)[0]  # [T, B, 1]
        q1_target_log_probs = (q1_target / (self.alpha + th.finfo().eps)).log_softmax(-1)  # [T, B, A]
        q1_target_entropy = -(q1_target_log_probs.exp() * q1_target_log_probs).sum(-1, keepdim=True)  # [T, B, 1]

        q2_target_max = q2_target.max(-1, keepdim=True)[0]  # [T, B, 1]
        # q2_target_log_probs = q2_target.log_softmax(-1)
        # q2_target_log_max = q2_target_log_probs.max(1, keepdim=True)[0]

        q_target = th.minimum(q1_target_max, q2_target_max) + self.alpha * q1_target_entropy  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward,
                             self.gamma,
                             BATCH.done,
                             q_target,
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error1 = q1_eval - dc_r  # [T, B, 1]
        td_error2 = q2_eval - dc_r  # [T, B, 1]
        q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean()  # 1
        q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean()  # 1
        loss = 0.5 * (q1_loss + q2_loss)
        self.critic_oplr.optimize(loss)
        summaries = {
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/loss': loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/q1_entropy': q1_entropy,
            'Statistics/q_min': th.minimum(q1, q2).mean(),
            'Statistics/q_mean': q1.mean(),
            'Statistics/q_max': th.maximum(q1, q2).mean()
        }
        if self.auto_adaption:
            alpha_loss = -(self.alpha * (self.target_entropy - q1_entropy).detach()).mean()
            self.alpha_oplr.optimize(alpha_loss)
            summaries.update({
                'LOSS/alpha_loss': alpha_loss,
                'LEARNING_RATE/alpha_lr': self.alpha_oplr.lr
            })
        return (td_error1 + td_error2) / 2, summaries

    def _after_train(self):
        super()._after_train()
        self.critic.sync()
        self.critic2.sync()
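A quick worked example of the entropy target set in the constructor: with beta=0.1 and a_dim=4, target_entropy = 0.1 * log(4) is about 0.139, i.e. a tenth of the log(4) (about 1.386) entropy of a uniform 4-way policy.

import numpy as np
print(0.1 * np.log(4))  # 0.1386...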
Example 21
class C51(SarlOffPolicy):
    """
    Categorical DQN with 51 atoms (C51), https://arxiv.org/abs/1707.06887
    No double, no dueling, no noisy net.
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 v_min=-10,
                 v_max=10,
                 atoms=51,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=1000,
                 network_settings=[128, 128],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'C51 only supports discrete action spaces'
        self._v_min = v_min
        self._v_max = v_max
        self._atoms = atoms
        self._delta_z = (self._v_max - self._v_min) / (self._atoms - 1)
        self._z = th.linspace(self._v_min, self._v_max,
                              self._atoms).float().to(self.device)  # [N,]
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(
            C51Distributional(self.obs_spec,
                              rep_net_params=self._rep_net_params,
                              action_dim=self.a_dim,
                              atoms=self._atoms,
                              network_settings=network_settings)).to(
                                  self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        feat = self.q_net(obs, rnncs=self.rnncs)  # [B, A, N]
        self.rnncs_ = self.q_net.get_rnncs()

        if self._is_train_mode and self.expl_expt_mng.is_random(
                self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            q = (self._z * feat).sum(-1)  # [B, A, N] * [N,] => [B, A]
            actions = q.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q_dist = self.q_net(BATCH.obs,
                            begin_mask=BATCH.begin_mask)  # [T, B, A, N]
        # [T, B, A, N] * [T, B, A, 1] => [T, B, A, N] => [T, B, N]
        q_dist = (q_dist * BATCH.action.unsqueeze(-1)).sum(-2)

        q_eval = (q_dist * self._z).sum(-1)  # [T, B, N] * [N,] => [T, B]

        target_q_dist = self.q_net.t(
            BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A, N]
        # [T, B, A, N] * [1, N] => [T, B, A]
        target_q = (target_q_dist * self._z).sum(-1)
        a_ = target_q.argmax(-1)  # [T, B]
        a_onehot = F.one_hot(a_, self.a_dim).float()  # [T, B, A]
        # [T, B, A, N] * [T, B, A, 1] => [T, B, A, N] => [T, B, N]
        target_q_dist = (target_q_dist * a_onehot.unsqueeze(-1)).sum(-2)

        target = n_step_return(
            BATCH.reward.repeat(1, 1, self._atoms), self.gamma,
            BATCH.done.repeat(1, 1, self._atoms), target_q_dist,
            BATCH.begin_mask.repeat(1, 1, self._atoms)).detach()  # [T, B, N]
        target = target.clamp(self._v_min, self._v_max)  # [T, B, N]
        # An amazing trick for calculating the projection gracefully.
        # ref: https://github.com/ShangtongZhang/DeepRL
        target_dist = (
            1 - (target.unsqueeze(-2) - self._z.view(1, 1, -1, 1)).abs() /
            self._delta_z).clamp(0, 1) * target_q_dist.unsqueeze(
                -2)  # [T, B, N, N]
        target_dist = target_dist.sum(-1)  # [T, B, N]

        _cross_entropy = -(target_dist * th.log(q_dist + th.finfo().eps)).sum(
            -1, keepdim=True)  # [T, B, 1]
        loss = (_cross_entropy * BATCH.get('isw', 1.0)).mean()  # 1

        self.oplr.optimize(loss)
        return _cross_entropy, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
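A small numeric demo of the projection trick referenced in _train: a single shifted atom Tz = 0.3 with mass 1.0, projected onto the support z = [0, 1] (delta_z = 1), splits its mass 0.7 / 0.3 between its two neighbors.

import torch as th

z = th.tensor([0.0, 1.0])
delta_z = 1.0
target = th.tensor([0.3])  # shifted atom(s) Tz, already clamped to [v_min, v_max]
p = th.tensor([1.0])       # probability mass on each shifted atom
w = (1 - (target.unsqueeze(-2) - z.unsqueeze(-1)).abs() / delta_z).clamp(0, 1)
print(w @ p)               # tensor([0.7000, 0.3000])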
Example 22
    def __init__(self,
                 polyak=0.995,
                 noise_action='ou',
                 noise_params={'sigma': 0.2},
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        """
        TODO: Annotation
        """
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau

        self.actors, self.critics = {}, {}
        for id in set(self.model_ids):
            if self.is_continuouss[id]:
                self.actors[id] = TargetTwin(
                    ActorDPG(
                        self.obs_specs[id],
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dims[id],
                        network_settings=network_settings['actor_continuous']),
                    self.polyak).to(self.device)
            else:
                self.actors[id] = TargetTwin(
                    ActorDct(
                        self.obs_specs[id],
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dims[id],
                        network_settings=network_settings['actor_discrete']),
                    self.polyak).to(self.device)
            self.critics[id] = TargetTwin(
                MACriticQvalueOne(list(self.obs_specs.values()),
                                  rep_net_params=self._rep_net_params,
                                  action_dim=sum(self.a_dims.values()),
                                  network_settings=network_settings['q']),
                self.polyak).to(self.device)
        self.actor_oplr = OPLR(list(self.actors.values()), actor_lr,
                               **self._oplr_params)
        self.critic_oplr = OPLR(list(self.critics.values()), critic_lr,
                                **self._oplr_params)

        # TODO: add a check on the action type
        self.noised_actions = {
            id: Noise_action_REGISTER[noise_action](**noise_params)
            for id in set(self.model_ids) if self.is_continuouss[id]
        }

        self._trainer_modules.update(
            {f"actor_{id}": self.actors[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(
            {f"critic_{id}": self.critics[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
Example 23
class MADDPG(MultiAgentOffPolicy):
    """
    Multi-Agent Deep Deterministic Policy Gradient, https://arxiv.org/abs/1706.02275
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 polyak=0.995,
                 noise_action='ou',
                 noise_params={'sigma': 0.2},
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        """
        TODO: Annotation
        """
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau

        self.actors, self.critics = {}, {}
        for id in set(self.model_ids):
            if self.is_continuouss[id]:
                self.actors[id] = TargetTwin(
                    ActorDPG(
                        self.obs_specs[id],
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dims[id],
                        network_settings=network_settings['actor_continuous']),
                    self.polyak).to(self.device)
            else:
                self.actors[id] = TargetTwin(
                    ActorDct(
                        self.obs_specs[id],
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dims[id],
                        network_settings=network_settings['actor_discrete']),
                    self.polyak).to(self.device)
            self.critics[id] = TargetTwin(
                MACriticQvalueOne(list(self.obs_specs.values()),
                                  rep_net_params=self._rep_net_params,
                                  action_dim=sum(self.a_dims.values()),
                                  network_settings=network_settings['q']),
                self.polyak).to(self.device)
        self.actor_oplr = OPLR(list(self.actors.values()), actor_lr,
                               **self._oplr_params)
        self.critic_oplr = OPLR(list(self.critics.values()), critic_lr,
                                **self._oplr_params)

        # TODO: add a check on the action type
        self.noised_actions = {
            id: Noise_action_REGISTER[noise_action](**noise_params)
            for id in set(self.model_ids) if self.is_continuouss[id]
        }

        self._trainer_modules.update(
            {f"actor_{id}": self.actors[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(
            {f"critic_{id}": self.critics[id]
             for id in set(self.model_ids)})
        self._trainer_modules.update(actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    def episode_reset(self):
        super().episode_reset()
        for noised_action in self.noised_actions.values():
            noised_action.reset()

    @iton
    def select_action(self, obs: Dict):
        acts_info = {}
        actions = {}
        for aid, mid in zip(self.agent_ids, self.model_ids):
            output = self.actors[mid](obs[aid],
                                      rnncs=self.rnncs[aid])  # [B, A]
            self.rnncs_[aid] = self.actors[mid].get_rnncs()
            if self.is_continuouss[aid]:
                mu = output  # [B, A]
                pi = self.noised_actions[mid](mu)  # [B, A]
            else:
                logits = output  # [B, A]
                mu = logits.argmax(-1)  # [B,]
                cate_dist = td.Categorical(logits=logits)
                pi = cate_dist.sample()  # [B,]
            action = pi if self._is_train_mode else mu
            acts_info[aid] = Data(action=action)
            actions[aid] = action
        return actions, acts_info

    @iton
    def _train(self, BATCH_DICT):
        """
        TODO: Annotation
        """
        summaries = defaultdict(dict)
        target_actions = {}
        for aid, mid in zip(self.agent_ids, self.model_ids):
            if self.is_continuouss[aid]:
                target_actions[aid] = self.actors[mid].t(
                    BATCH_DICT[aid].obs_,
                    begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
            else:
                target_logits = self.actors[mid].t(
                    BATCH_DICT[aid].obs_,
                    begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
                target_cate_dist = td.Categorical(logits=target_logits)
                target_pi = target_cate_dist.sample()  # [T, B]
                action_target = F.one_hot(
                    target_pi, self.a_dims[aid]).float()  # [T, B, A]
                target_actions[aid] = action_target  # [T, B, A]
        target_actions = th.cat(list(target_actions.values()),
                                -1)  # [T, B, N*A]

        qs, q_targets = {}, {}
        for mid in self.model_ids:
            qs[mid] = self.critics[mid](
                [BATCH_DICT[id].obs for id in self.agent_ids],
                th.cat([BATCH_DICT[id].action for id in self.agent_ids],
                       -1))  # [T, B, 1]
            q_targets[mid] = self.critics[mid].t(
                [BATCH_DICT[id].obs_ for id in self.agent_ids],
                target_actions)  # [T, B, 1]

        q_loss = {}
        td_errors = 0.
        for aid, mid in zip(self.agent_ids, self.model_ids):
            dc_r = n_step_return(
                BATCH_DICT[aid].reward, self.gamma, BATCH_DICT[aid].done,
                q_targets[mid],
                BATCH_DICT['global'].begin_mask).detach()  # [T, B, 1]
            td_error = dc_r - qs[mid]  # [T, B, 1]
            td_errors += td_error
            q_loss[aid] = 0.5 * td_error.square().mean()  # 1
            summaries[aid].update({
                'Statistics/q_min': qs[mid].min(),
                'Statistics/q_mean': qs[mid].mean(),
                'Statistics/q_max': qs[mid].max()
            })
        self.critic_oplr.optimize(sum(q_loss.values()))

        actor_loss = {}
        for aid, mid in zip(self.agent_ids, self.model_ids):
            if self.is_continuouss[aid]:
                mu = self.actors[mid](
                    BATCH_DICT[aid].obs,
                    begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
            else:
                logits = self.actors[mid](
                    BATCH_DICT[aid].obs,
                    begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, A]
                logp_all = logits.log_softmax(-1)  # [T, B, A]
                gumbel_noise = td.Gumbel(0,
                                         1).sample(logp_all.shape)  # [T, B, A]
                _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(
                    -1)  # [T, B, A]
                _pi_true_one_hot = F.one_hot(
                    _pi.argmax(-1), self.a_dims[aid]).float()  # [T, B, A]
                _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
                mu = _pi_diff + _pi  # [T, B, A]

            all_actions = {id: BATCH_DICT[id].action for id in self.agent_ids}
            all_actions[aid] = mu
            q_actor = self.critics[mid](
                [BATCH_DICT[id].obs for id in self.agent_ids],
                th.cat(list(all_actions.values()), -1),
                begin_mask=BATCH_DICT['global'].begin_mask)  # [T, B, 1]
            actor_loss[aid] = -q_actor.mean()  # 1

        self.actor_oplr.optimize(sum(actor_loss.values()))

        for aid in self.agent_ids:
            summaries[aid].update({
                'LOSS/actor_loss': actor_loss[aid],
                'LOSS/critic_loss': q_loss[aid]
            })
        summaries['model'].update({
            'LOSS/actor_loss': sum(actor_loss.values()),
            'LOSS/critic_loss': sum(q_loss.values())
        })
        return td_errors / self.n_agents_percopy, summaries

    def _after_train(self):
        super()._after_train()
        for actor in self.actors.values():
            actor.sync()
        for critic in self.critics.values():
            critic.sync()
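A standalone sketch of the straight-through Gumbel-softmax estimator used in the discrete branch of _train above: the forward value is a hard one-hot while gradients flow through the soft relaxation.

import torch as th
import torch.nn.functional as F

def gumbel_softmax_st(logits, tau=1.0):
    logp = logits.log_softmax(-1)
    g = th.distributions.Gumbel(0.0, 1.0).sample(logp.shape)  # Gumbel(0, 1) noise
    soft = ((logp + g) / tau).softmax(-1)                     # relaxed sample
    hard = F.one_hot(soft.argmax(-1), logits.shape[-1]).float()
    # forward pass: hard one-hot; backward pass: gradient of `soft`
    return (hard - soft).detach() + soft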
Example 24
    def __init__(self,
                 q_lr=5.0e-3,
                 intra_option_lr=5.0e-4,
                 termination_lr=5.0e-4,
                 use_eps_greedy=False,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 boltzmann_temperature=1.0,
                 options_num=4,
                 ent_coff=0.01,
                 double_q=False,
                 use_baseline=True,
                 terminal_mask=True,
                 termination_regularizer=0.01,
                 assign_interval=1000,
                 network_settings={
                     'q': [32, 32],
                     'intra_option': [32, 32],
                     'termination': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature
        self.use_eps_greedy = use_eps_greedy

        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.options_num,
                            network_settings=network_settings['q'])).to(
                                self.device)

        self.intra_option_net = OcIntraOption(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            options_num=self.options_num,
            network_settings=network_settings['intra_option']).to(self.device)
        self.termination_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.options_num,
            network_settings=network_settings['termination'],
            out_act='sigmoid').to(self.device)

        if self.is_continuous:
            # https://discuss.pytorch.org/t/valueerror-cant-optimize-a-non-leaf-tensor/21751
            # https://blog.csdn.net/nkhgl/article/details/100047276
            self.log_std = th.as_tensor(
                np.full((self.options_num, self.a_dim),
                        -0.5)).requires_grad_().to(self.device)  # [P, A]
            self.intra_option_oplr = OPLR(
                [self.intra_option_net, self.log_std], intra_option_lr,
                **self._oplr_params)
        else:
            self.intra_option_oplr = OPLR(self.intra_option_net,
                                          intra_option_lr, **self._oplr_params)
        self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
        self.termination_oplr = OPLR(self.termination_net, termination_lr,
                                     **self._oplr_params)

        self._trainer_modules.update(q_net=self.q_net,
                                     intra_option_net=self.intra_option_net,
                                     termination_net=self.termination_net,
                                     q_oplr=self.q_oplr,
                                     intra_option_oplr=self.intra_option_oplr,
                                     termination_oplr=self.termination_oplr)
        self.options = self.new_options = self._generate_random_options()
Example No. 25
class OC(SarlOffPolicy):
    """
    The Option-Critic Architecture. http://arxiv.org/abs/1609.05140
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 q_lr=5.0e-3,
                 intra_option_lr=5.0e-4,
                 termination_lr=5.0e-4,
                 use_eps_greedy=False,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 boltzmann_temperature=1.0,
                 options_num=4,
                 ent_coff=0.01,
                 double_q=False,
                 use_baseline=True,
                 terminal_mask=True,
                 termination_regularizer=0.01,
                 assign_interval=1000,
                 network_settings={
                     'q': [32, 32],
                     'intra_option': [32, 32],
                     'termination': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature
        self.use_eps_greedy = use_eps_greedy

        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.options_num,
                            network_settings=network_settings['q'])).to(
                                self.device)

        self.intra_option_net = OcIntraOption(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            options_num=self.options_num,
            network_settings=network_settings['intra_option']).to(self.device)
        self.termination_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.options_num,
            network_settings=network_settings['termination'],
            out_act='sigmoid').to(self.device)

        if self.is_continuous:
            # https://discuss.pytorch.org/t/valueerror-cant-optimize-a-non-leaf-tensor/21751
            # https://blog.csdn.net/nkhgl/article/details/100047276
            # keep .to(device) before requires_grad_() so the optimized
            # tensor stays a leaf (see the threads linked above)
            self.log_std = th.as_tensor(
                np.full((self.options_num, self.a_dim),
                        -0.5)).to(self.device).requires_grad_()  # [P, A]
            self.intra_option_oplr = OPLR(
                [self.intra_option_net, self.log_std], intra_option_lr,
                **self._oplr_params)
        else:
            self.intra_option_oplr = OPLR(self.intra_option_net,
                                          intra_option_lr, **self._oplr_params)
        self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
        self.termination_oplr = OPLR(self.termination_net, termination_lr,
                                     **self._oplr_params)

        self._trainer_modules.update(q_net=self.q_net,
                                     intra_option_net=self.intra_option_net,
                                     termination_net=self.termination_net,
                                     q_oplr=self.q_oplr,
                                     intra_option_oplr=self.intra_option_oplr,
                                     termination_oplr=self.termination_oplr)
        self.options = self.new_options = self._generate_random_options()

    def _generate_random_options(self):
        # [B,]
        return th.tensor(np.random.randint(0, self.options_num,
                                           self.n_copies)).to(self.device)

    def episode_step(self, obs: Data, env_rets: Data, begin_mask: np.ndarray):
        super().episode_step(obs, env_rets, begin_mask)
        self.options = self.new_options

    @iton
    def select_action(self, obs):
        q = self.q_net(obs, rnncs=self.rnncs)  # [B, P]
        self.rnncs_ = self.q_net.get_rnncs()
        pi = self.intra_option_net(obs, rnncs=self.rnncs)  # [B, P, A]
        beta = self.termination_net(obs, rnncs=self.rnncs)  # [B, P]
        options_onehot = F.one_hot(self.options,
                                   self.options_num).float()  # [B, P]
        options_onehot_expanded = options_onehot.unsqueeze(-1)  # [B, P, 1]
        pi = (pi * options_onehot_expanded).sum(-2)  # [B, A]
        if self.is_continuous:
            mu = pi.tanh()  # [B, A]
            log_std = self.log_std[self.options]  # [B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            actions = dist.sample().clamp(-1, 1)  # [B, A]
        else:
            pi = pi / self.boltzmann_temperature  # [B, A]
            dist = td.Categorical(logits=pi)
            actions = dist.sample()  # [B, ]
        max_options = q.argmax(-1).long()  # [B, P] => [B, ]
        if self.use_eps_greedy:
            # epsilon greedy
            if self._is_train_mode and self.expl_expt_mng.is_random(
                    self._cur_train_step):
                self.new_options = self._generate_random_options()
            else:
                self.new_options = max_options
        else:
            beta_probs = (beta * options_onehot).sum(-1)  # [B, P] => [B,]
            beta_dist = td.Bernoulli(probs=beta_probs)
            # keep the current option unless the sampled termination fires
            self.new_options = th.where(beta_dist.sample() < 1, self.options,
                                        max_options)
        return actions, Data(action=actions,
                             last_options=self.options,
                             options=self.new_options)

    def random_action(self):
        actions = super().random_action()
        self._acts_info.update(
            last_options=np.random.randint(0, self.options_num, self.n_copies),
            options=np.random.randint(0, self.options_num, self.n_copies))
        return actions

    def _preprocess_BATCH(self, BATCH):  # [T, B, *]
        BATCH = super()._preprocess_BATCH(BATCH)
        BATCH.last_options = int2one_hot(BATCH.last_options, self.options_num)
        BATCH.options = int2one_hot(BATCH.options, self.options_num)
        return BATCH

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, P]
        q_next = self.q_net.t(BATCH.obs_,
                              begin_mask=BATCH.begin_mask)  # [T, B, P]
        beta_next = self.termination_net(
            BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, P]

        qu_eval = (q * BATCH.options).sum(-1, keepdim=True)  # [T, B, 1]
        beta_s_ = (beta_next * BATCH.options).sum(-1,
                                                  keepdim=True)  # [T, B, 1]
        q_s_ = (q_next * BATCH.options).sum(-1, keepdim=True)  # [T, B, 1]
        # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L94
        if self.double_q:
            q_ = self.q_net(BATCH.obs_,
                            begin_mask=BATCH.begin_mask)  # [T, B, P]
            # [T, B, P] => [T, B] => [T, B, P]
            max_a_idx = F.one_hot(q_.argmax(-1), self.options_num).float()
            q_s_max = (q_next * max_a_idx).sum(-1, keepdim=True)  # [T, B, 1]
        else:
            q_s_max = q_next.max(-1, keepdim=True)[0]  # [T, B, 1]
        u_target = (1 - beta_s_) * q_s_ + beta_s_ * q_s_max  # [T, B, 1]
        qu_target = n_step_return(BATCH.reward, self.gamma, BATCH.done,
                                  u_target,
                                  BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = qu_target - qu_eval  # gradient : q   [T, B, 1]
        q_loss = (td_error.square() *
                  BATCH.get('isw', 1.0)).mean()  # [T, B, 1] => 1
        self.q_oplr.optimize(q_loss)

        q_s = qu_eval.detach()  # [T, B, 1]
        # https://github.com/jeanharb/option_critic/blob/5d6c81a650a8f452bc8ad3250f1f211d317fde8c/neural_net.py#L130
        if self.use_baseline:
            adv = (qu_target - q_s).detach()  # [T, B, 1]
        else:
            adv = qu_target.detach()  # [T, B, 1]
        # [T, B, P] => [T, B, P, 1]
        options_onehot_expanded = BATCH.options.unsqueeze(-1)
        pi = self.intra_option_net(BATCH.obs,
                                   begin_mask=BATCH.begin_mask)  # [T, B, P, A]
        # [T, B, P, A] => [T, B, A]
        pi = (pi * options_onehot_expanded).sum(-2)
        if self.is_continuous:
            mu = pi.tanh()  # [T, B, A]
            log_std = self.log_std[BATCH.options.argmax(-1)]  # [T, B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            log_p = dist.log_prob(BATCH.action).unsqueeze(-1)  # [T, B, 1]
            entropy = dist.entropy().unsqueeze(-1)  # [T, B, 1]
        else:
            pi = pi / self.boltzmann_temperature  # [T, B, A]
            log_pi = pi.log_softmax(-1)  # [T, B, A]
            entropy = -(log_pi.exp() * log_pi).sum(-1,
                                                   keepdim=True)  # [T, B, 1]
            log_p = (BATCH.action * log_pi).sum(-1, keepdim=True)  # [T, B, 1]
        pi_loss = -(log_p * adv + self.ent_coff * entropy).mean()  # 1

        beta = self.termination_net(BATCH.obs,
                                    begin_mask=BATCH.begin_mask)  # [T, B, P]
        beta_s = (beta * BATCH.last_options).sum(-1, keepdim=True)  # [T, B, 1]
        if self.use_eps_greedy:
            v_s = q.max(
                -1,
                keepdim=True)[0] - self.termination_regularizer  # [T, B, 1]
        else:
            v_s = (1 - beta_s) * q_s + beta_s * q.max(
                -1, keepdim=True)[0]  # [T, B, 1]
            # v_s = q.mean(-1, keepdim=True)  # [T, B, 1]
        beta_loss = beta_s * (q_s - v_s).detach()  # [T, B, 1]
        # https://github.com/lweitkamp/option-critic-pytorch/blob/0c57da7686f8903ed2d8dded3fae832ee9defd1a/option_critic.py#L238
        if self.terminal_mask:
            beta_loss *= (1 - BATCH.done)  # [T, B, 1]
        beta_loss = beta_loss.mean()  # 1

        self.intra_option_oplr.optimize(pi_loss)
        self.termination_oplr.optimize(beta_loss)

        return td_error, {
            'LEARNING_RATE/q_lr': self.q_oplr.lr,
            'LEARNING_RATE/intra_option_lr': self.intra_option_oplr.lr,
            'LEARNING_RATE/termination_lr': self.termination_oplr.lr,
            # 'Statistics/option': self.options[0],
            'LOSS/q_loss': q_loss,
            'LOSS/pi_loss': pi_loss,
            'LOSS/beta_loss': beta_loss,
            'Statistics/q_option_max': q_s.max(),
            'Statistics/q_option_min': q_s.min(),
            'Statistics/q_option_mean': q_s.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
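
Note: qu_target above is built by the library's n_step_return helper. A sketch of the 1-step case it reduces to, ignoring begin_mask (which the real helper uses to respect episode boundaries) and multi-step bootstrapping; one_step_return is an illustrative name:

def one_step_return(reward, gamma, done, next_value):
    # y_t = r_t + gamma * (1 - d_t) * U(s_{t+1}); all shapes [T, B, 1].
    return reward + gamma * (1. - done) * next_value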
Example No. 27
class TAC(SarlOffPolicy):
    """Tsallis Actor Critic, TAC with V neural Network. https://arxiv.org/abs/1902.00137
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 alpha=0.2,
                 annealing=True,
                 last_alpha=0.01,
                 polyak=0.995,
                 entropic_index=1.5,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': {
                         'share': [128, 128],
                         'mu': [64],
                         'log_std': [64],
                         'soft_clip': False,
                         'log_std_bound': [-20, 2]
                     },
                     'actor_discrete': [64, 32],
                     'q': [128, 128]
                 },
                 auto_adaption=True,
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 alpha_lr=5.0e-4,
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.discrete_tau = discrete_tau
        self.entropic_index = 2 - entropic_index
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        self.critic = TargetTwin(CriticQvalueOne(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 action_dim=self.a_dim,
                                                 network_settings=network_settings['q']),
                                 self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        if self.is_continuous:
            self.actor = ActorCts(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  network_settings=network_settings['actor_continuous']).to(self.device)
        else:
            self.actor = ActorDct(self.obs_spec,
                                  rep_net_params=self._rep_net_params,
                                  output_shape=self.a_dim,
                                  network_settings=network_settings['actor_discrete']).to(self.device)

        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else np.log(self.a_dim))

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR([self.critic, self.critic2], critic_lr, **self._oplr_params)

        if self.auto_adaption:
            # create the tensor directly on the device so it remains a leaf
            self.log_alpha = th.tensor(0., device=self.device, requires_grad=True)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha, int(1e6))

        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    @property
    def alpha(self):
        return self.log_alpha.exp()

    @iton
    def select_action(self, obs):
        if self.is_continuous:
            mu, log_std = self.actor(obs, rnncs=self.rnncs)  # [B, A]
            pi = td.Normal(mu, log_std.exp()).sample().tanh()  # [B, A]
            mu.tanh_()  # squash mu     # [B, A]
        else:
            logits = self.actor(obs, rnncs=self.rnncs)  # [B, A]
            mu = logits.argmax(-1)  # [B,]
            cate_dist = td.Categorical(logits=logits)
            pi = cate_dist.sample()  # [B,]
        self.rnncs_ = self.actor.get_rnncs()
        actions = pi if self._is_train_mode else mu
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        if self.is_continuous:
            target_mu, target_log_std = self.actor(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            dist = td.Independent(td.Normal(target_mu, target_log_std.exp()), 1)
            target_pi = dist.sample()  # [T, B, A]
            target_pi, target_log_pi = squash_action(target_pi, dist.log_prob(
                target_pi).unsqueeze(-1), is_independent=False)  # [T, B, A]
            target_log_pi = tsallis_entropy_log_q(target_log_pi, self.entropic_index)  # [T, B, 1]
        else:
            target_logits = self.actor(BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
            target_cate_dist = td.Categorical(logits=target_logits)
            target_pi = target_cate_dist.sample()  # [T, B]
            target_log_pi = target_cate_dist.log_prob(target_pi).unsqueeze(-1)  # [T, B, 1]
            target_pi = F.one_hot(target_pi, self.a_dim).float()  # [T, B, A]
        q1 = self.critic(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q2 = self.critic2(BATCH.obs, BATCH.action, begin_mask=BATCH.begin_mask)  # [T, B, 1]

        q1_target = self.critic.t(BATCH.obs_, target_pi, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q2_target = self.critic2.t(BATCH.obs_, target_pi, begin_mask=BATCH.begin_mask)  # [T, B, 1]
        q_target = th.minimum(q1_target, q2_target)  # [T, B, 1]
        dc_r = n_step_return(BATCH.reward,
                             self.gamma,
                             BATCH.done,
                             (q_target - self.alpha * target_log_pi),
                             BATCH.begin_mask).detach()  # [T, B, 1]
        td_error1 = q1 - dc_r  # [T, B, 1]
        td_error2 = q2 - dc_r  # [T, B, 1]

        q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean()  # 1
        q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean()  # 1
        critic_loss = 0.5 * q1_loss + 0.5 * q2_loss
        self.critic_oplr.optimize(critic_loss)

        if self.is_continuous:
            mu, log_std = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
            dist = td.Independent(td.Normal(mu, log_std.exp()), 1)
            pi = dist.rsample()  # [T, B, A]
            pi, log_pi = squash_action(pi, dist.log_prob(pi).unsqueeze(-1), is_independent=False)  # [T, B, A]
            log_pi = tsallis_entropy_log_q(log_pi, self.entropic_index)  # [T, B, 1]
            entropy = dist.entropy().mean()  # 1
        else:
            logits = self.actor(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
            logp_all = logits.log_softmax(-1)  # [T, B, A]
            gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape)  # [T, B, A]
            _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(-1)  # [T, B, A]
            _pi_true_one_hot = F.one_hot(_pi.argmax(-1), self.a_dim).float()  # [T, B, A]
            _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
            pi = _pi_diff + _pi  # [T, B, A]
            log_pi = (logp_all * pi).sum(-1, keepdim=True)  # [T, B, 1]
            entropy = -(logp_all.exp() * logp_all).sum(-1).mean()  # 1
        q_s_pi = th.minimum(self.critic(BATCH.obs, pi, begin_mask=BATCH.begin_mask),
                            self.critic2(BATCH.obs, pi, begin_mask=BATCH.begin_mask))  # [T, B, 1]
        actor_loss = -(q_s_pi - self.alpha * log_pi).mean()  # 1
        self.actor_oplr.optimize(actor_loss)

        summaries = {
            'LEARNING_RATE/actor_lr': self.actor_oplr.lr,
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/actor_loss': actor_loss,
            'LOSS/q1_loss': q1_loss,
            'LOSS/q2_loss': q2_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/log_alpha': self.log_alpha,
            'Statistics/alpha': self.alpha,
            'Statistics/entropy': entropy,
            'Statistics/q_min': th.minimum(q1, q2).min(),
            'Statistics/q_mean': th.minimum(q1, q2).mean(),
            'Statistics/q_max': th.maximum(q1, q2).max()
        }
        if self.auto_adaption:
            alpha_loss = -(self.alpha * (log_pi + self.target_entropy).detach()).mean()  # 1
            self.alpha_oplr.optimize(alpha_loss)
            summaries.update({
                'LOSS/alpha_loss': alpha_loss,
                'LEARNING_RATE/alpha_lr': self.alpha_oplr.lr
            })
        return (td_error1 + td_error2) / 2, summaries

    def _after_train(self):
        super()._after_train()
        self.critic.sync()
        self.critic2.sync()

        if self.annealing and not self.auto_adaption:
            self.log_alpha.copy_(self.alpha_annealing(self._cur_train_step).log())
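
Note: tsallis_entropy_log_q generalizes the entropy's log term with the Tsallis q-logarithm. A sketch under the textbook definition (the library helper is fed log-probabilities, so its exact implementation may differ); q_log is an illustrative name:

import torch as th


def q_log(p, q):
    # Tsallis q-logarithm: ln_q(p) = (p^(1-q) - 1) / (1 - q);
    # the natural log is recovered in the limit q -> 1.
    if q == 1.:
        return p.log()
    return (p.pow(1. - q) - 1.) / (1. - q)


probs = th.tensor([0.1, 0.6, 0.3])
print(q_log(probs, 0.5))  # elementwise q-log of the probabilities

The constructor's self.entropic_index = 2 - entropic_index apparently applies the q <-> 2 - q duality used in the TAC paper.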
Example No. 28
class TD3(SarlOffPolicy):
    """
    Twin Delayed Deep Deterministic Policy Gradient, https://arxiv.org/abs/1802.09477
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 polyak=0.995,
                 delay_num=2,
                 noise_action='clip_normal',
                 noise_params={
                     'sigma': 0.2,
                     'noise_bound': 0.2
                 },
                 actor_lr=5.0e-4,
                 critic_lr=1.0e-3,
                 discrete_tau=1.0,
                 network_settings={
                     'actor_continuous': [32, 32],
                     'actor_discrete': [32, 32],
                     'q': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.delay_num = delay_num
        self.discrete_tau = discrete_tau

        if self.is_continuous:
            actor = ActorDPG(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous'])
            self.noised_action = self.target_noised_action = Noise_action_REGISTER[
                noise_action](**noise_params)
        else:
            actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete'])
        self.actor = TargetTwin(actor, self.polyak).to(self.device)

        self.critic = TargetTwin(
            CriticQvalueOne(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            action_dim=self.a_dim,
                            network_settings=network_settings['q']),
            self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR([self.critic, self.critic2], critic_lr,
                                **self._oplr_params)
        self._trainer_modules.update(actor=self.actor,
                                     critic=self.critic,
                                     critic2=self.critic2,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)

    def episode_reset(self):
        super().episode_reset()
        if self.is_continuous:
            self.noised_action.reset()

    @iton
    def select_action(self, obs):
        output = self.actor(obs, rnncs=self.rnncs)  # [B, A]
        self.rnncs_ = self.actor.get_rnncs()
        if self.is_continuous:
            mu = output  # [B, A]
            pi = self.noised_action(mu)  # [B, A]
        else:
            logits = output  # [B, A]
            mu = logits.argmax(-1)  # [B,]
            cate_dist = td.Categorical(logits=logits)
            pi = cate_dist.sample()  # [B,]
        actions = pi if self._is_train_mode else mu
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        for _ in range(self.delay_num):
            if self.is_continuous:
                action_target = self.target_noised_action(
                    self.actor.t(BATCH.obs_,
                                 begin_mask=BATCH.begin_mask))  # [T, B, A]
            else:
                target_logits = self.actor.t(
                    BATCH.obs_, begin_mask=BATCH.begin_mask)  # [T, B, A]
                target_cate_dist = td.Categorical(logits=target_logits)
                target_pi = target_cate_dist.sample()  # [T, B]
                action_target = F.one_hot(target_pi,
                                          self.a_dim).float()  # [T, B, A]
            q1 = self.critic(BATCH.obs,
                             BATCH.action,
                             begin_mask=BATCH.begin_mask)  # [T, B, 1]
            q2 = self.critic2(BATCH.obs,
                              BATCH.action,
                              begin_mask=BATCH.begin_mask)  # [T, B, 1]
            q_target = th.minimum(
                self.critic.t(BATCH.obs_,
                              action_target,
                              begin_mask=BATCH.begin_mask),
                self.critic2.t(BATCH.obs_,
                               action_target,
                               begin_mask=BATCH.begin_mask))  # [T, B, 1]
            dc_r = n_step_return(BATCH.reward, self.gamma, BATCH.done,
                                 q_target,
                                 BATCH.begin_mask).detach()  # [T, B, 1]
            td_error1 = q1 - dc_r  # [T, B, 1]
            td_error2 = q2 - dc_r  # [T, B, 1]

            q1_loss = (td_error1.square() * BATCH.get('isw', 1.0)).mean()  # 1
            q2_loss = (td_error2.square() * BATCH.get('isw', 1.0)).mean()  # 1
            critic_loss = 0.5 * (q1_loss + q2_loss)
            self.critic_oplr.optimize(critic_loss)

        if self.is_continuous:
            mu = self.actor(BATCH.obs,
                            begin_mask=BATCH.begin_mask)  # [T, B, A]
        else:
            logits = self.actor(BATCH.obs,
                                begin_mask=BATCH.begin_mask)  # [T, B, A]
            logp_all = logits.log_softmax(-1)  # [T, B, A]
            gumbel_noise = td.Gumbel(0, 1).sample(logp_all.shape)  # [T, B, A]
            _pi = ((logp_all + gumbel_noise) / self.discrete_tau).softmax(
                -1)  # [T, B, A]
            _pi_true_one_hot = F.one_hot(_pi.argmax(-1),
                                         self.a_dim).float()  # [T, B, A]
            _pi_diff = (_pi_true_one_hot - _pi).detach()  # [T, B, A]
            mu = _pi_diff + _pi  # [T, B, A]
        q1_actor = self.critic(BATCH.obs, mu,
                               begin_mask=BATCH.begin_mask)  # [T, B, 1]

        actor_loss = -q1_actor.mean()  # 1
        self.actor_oplr.optimize(actor_loss)
        return (td_error1 + td_error2) / 2, {
            'LEARNING_RATE/actor_lr': self.actor_oplr.lr,
            'LEARNING_RATE/critic_lr': self.critic_oplr.lr,
            'LOSS/actor_loss': actor_loss,
            'LOSS/critic_loss': critic_loss,
            'Statistics/q_min': th.minimum(q1, q2).min(),
            'Statistics/q_mean': th.minimum(q1, q2).mean(),
            'Statistics/q_max': th.maximum(q1, q2).max()
        }

    def _after_train(self):
        super()._after_train()
        self.actor.sync()
        self.critic.sync()
        self.critic2.sync()
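
Note: TargetTwin(..., polyak) keeps a lagged copy of the wrapped network, and sync() presumably performs Polyak averaging along these lines (a sketch, not the library's actual implementation):

import torch as th


def soft_update(target_params, online_params, polyak=0.995):
    # theta_target <- polyak * theta_target + (1 - polyak) * theta_online
    with th.no_grad():
        for tp, op in zip(target_params, online_params):
            tp.mul_(polyak).add_((1. - polyak) * op)

With polyak = 0 this degenerates to a hard copy, which the DQN-style examples emulate by calling sync() only every assign_interval steps.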
Example No. 29
class IQN(SarlOffPolicy):
    """
    Implicit Quantile Networks, https://arxiv.org/abs/1806.06923
    Double DQN
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 online_quantiles=8,
                 target_quantiles=8,
                 select_quantiles=32,
                 quantiles_idx=64,
                 huber_delta=1.,
                 lr=5.0e-4,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 assign_interval=2,
                 network_settings={
                     'q_net': [128, 64],
                     'quantile': [128, 64],
                     'tile': [64]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'iqn only support discrete action space'
        self.online_quantiles = online_quantiles
        self.target_quantiles = target_quantiles
        self.select_quantiles = select_quantiles
        self.quantiles_idx = quantiles_idx
        self.huber_delta = huber_delta
        self.assign_interval = assign_interval
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.q_net = TargetTwin(IqnNet(self.obs_spec,
                                       rep_net_params=self._rep_net_params,
                                       action_dim=self.a_dim,
                                       quantiles_idx=self.quantiles_idx,
                                       network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net,
                                     oplr=self.oplr)

    @iton
    def select_action(self, obs):
        _, select_quantiles_tiled = self._generate_quantiles(  # [N*B, X]
            batch_size=self.n_copies,
            quantiles_num=self.select_quantiles
        )
        q_values = self.q_net(obs, select_quantiles_tiled, rnncs=self.rnncs)  # [N, B, A]
        self.rnncs_ = self.q_net.get_rnncs()

        if self._is_train_mode and self.expl_expt_mng.is_random(self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            # [N, B, A] => [B, A] => [B,]
            actions = q_values.mean(0).argmax(-1)
        return actions, Data(action=actions)

    def _generate_quantiles(self, batch_size, quantiles_num):
        _quantiles = th.rand([quantiles_num * batch_size, 1])  # [N*B, 1]
        _quantiles_tiled = _quantiles.repeat(1, self.quantiles_idx)  # [N*B, 1] => [N*B, X]

        # pi * i * tau [N*B, X] * [X, ] => [N*B, X]
        _quantiles_tiled = th.arange(self.quantiles_idx) * np.pi * _quantiles_tiled
        _quantiles_tiled.cos_()  # [N*B, X]

        _quantiles = _quantiles.view(batch_size, quantiles_num, 1)  # [N*B, 1] => [B, N, 1]
        return _quantiles, _quantiles_tiled  # [B, N, 1], [N*B, X]

    @iton
    def _train(self, BATCH):
        time_step = BATCH.reward.shape[0]
        batch_size = BATCH.reward.shape[1]

        quantiles, quantiles_tiled = self._generate_quantiles(  # [T*B, N, 1], [N*T*B, X]
            batch_size=time_step * batch_size,
            quantiles_num=self.online_quantiles)
        # [T*B, N, 1] => [T, B, N, 1]
        quantiles = quantiles.view(time_step, batch_size, -1, 1)
        quantiles_tiled = quantiles_tiled.view(time_step, -1, self.quantiles_idx)  # [N*T*B, X] => [T, N*B, X]

        quantiles_value = self.q_net(BATCH.obs, quantiles_tiled, begin_mask=BATCH.begin_mask)  # [T, N, B, A]
        # [T, N, B, A] => [N, T, B, A] * [T, B, A] => [N, T, B, 1]
        quantiles_value = (quantiles_value.swapaxes(0, 1) * BATCH.action).sum(-1, keepdim=True)
        q_eval = quantiles_value.mean(0)  # [N, T, B, 1] => [T, B, 1]

        _, select_quantiles_tiled = self._generate_quantiles(  # [N*T*B, X]
            batch_size=time_step * batch_size,
            quantiles_num=self.select_quantiles)
        select_quantiles_tiled = select_quantiles_tiled.view(
            time_step, -1, self.quantiles_idx)  # [N*T*B, X] => [T, N*B, X]

        q_values = self.q_net(
            BATCH.obs_, select_quantiles_tiled, begin_mask=BATCH.begin_mask)  # [T, N, B, A]
        q_values = q_values.mean(1)  # [T, N, B, A] => [T, B, A]
        next_max_action = q_values.argmax(-1)  # [T, B]
        next_max_action = F.one_hot(
            next_max_action, self.a_dim).float()  # [T, B, A]

        _, target_quantiles_tiled = self._generate_quantiles(  # [N'*T*B, X]
            batch_size=time_step * batch_size,
            quantiles_num=self.target_quantiles)
        target_quantiles_tiled = target_quantiles_tiled.view(
            time_step, -1, self.quantiles_idx)  # [N'*T*B, X] => [T, N'*B, X]
        target_quantiles_value = self.q_net.t(BATCH.obs_, target_quantiles_tiled,
                                              begin_mask=BATCH.begin_mask)  # [T, N', B, A]
        target_quantiles_value = target_quantiles_value.swapaxes(0, 1)  # [T, N', B, A] => [N', T, B, A]
        target_quantiles_value = (target_quantiles_value * next_max_action).sum(-1, keepdim=True)  # [N', T, B, 1]

        target_q = target_quantiles_value.mean(0)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward,  # [T, B, 1]
                                 self.gamma,
                                 BATCH.done,  # [T, B, 1]
                                 target_q,  # [T, B, 1]
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]

        # [N', T, B, 1] => [N', T, B]
        target_quantiles_value = target_quantiles_value.squeeze(-1)
        target_quantiles_value = target_quantiles_value.permute(
            1, 2, 0)  # [N', T, B] => [T, B, N']
        quantiles_value_target = n_step_return(BATCH.reward.repeat(1, 1, self.target_quantiles),
                                               self.gamma,
                                               BATCH.done.repeat(1, 1, self.target_quantiles),
                                               target_quantiles_value,
                                               BATCH.begin_mask.repeat(1, 1,
                                                                       self.target_quantiles)).detach()  # [T, B, N']
        # [T, B, N'] => [T, B, 1, N']
        quantiles_value_target = quantiles_value_target.unsqueeze(-2)
        quantiles_value_online = quantiles_value.permute(1, 2, 0, 3)  # [N, T, B, 1] => [T, B, N, 1]
        # [T, B, N, 1] - [T, B, 1, N'] => [T, B, N, N']
        quantile_error = quantiles_value_online - quantiles_value_target
        huber = F.huber_loss(quantiles_value_online, quantiles_value_target,
                             reduction="none", delta=self.huber_delta)  # [T, B, N, N']
        # [T, B, N, 1] - [T, B, N, N'] => [T, B, N, N']
        huber_abs = (quantiles - quantile_error.detach().le(0.).float()).abs()
        loss = (huber_abs * huber).mean(-1)  # [T, B, N, N'] => [T, B, N]
        loss = loss.sum(-1, keepdim=True)  # [T, B, N] => [T, B, 1]

        loss = (loss * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
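
Note: the weighted loss at the end of _train is the quantile Huber loss from QR-DQN/IQN. A self-contained sketch for a single time step, using the papers' sign convention u = target - prediction; quantile_huber_loss and the [B, N] shapes are illustrative:

import torch as th


def quantile_huber_loss(pred, target, taus, delta=1.0):
    # pred:   [B, N]    online quantile estimates
    # target: [B, N']   (detached) target quantile values
    # taus:   [B, N, 1] quantile fractions of the online estimates
    u = target.unsqueeze(-2) - pred.unsqueeze(-1)        # [B, N, N']
    abs_u = u.abs()
    huber = th.where(abs_u <= delta,
                     0.5 * u.square(),
                     delta * (abs_u - 0.5 * delta))      # [B, N, N']
    weight = (taus - u.detach().lt(0.).float()).abs()    # [B, N, N']
    return (weight * huber).mean(-1).sum(-1).mean()      # scalar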
Example No. 30
class DQN(SarlOffPolicy):
    """
    Deep Q-learning Network, DQN, [2013](https://arxiv.org/pdf/1312.5602.pdf), [2015](https://storage.googleapis.com/deepmind-media/dqn/DQNNaturePaper.pdf)
    DQN + LSTM, https://arxiv.org/abs/1507.06527
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'dqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.a_dim,
                            network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net)
        self._trainer_modules.update(oplr=self.oplr)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [B, *]
        self.rnncs_ = self.q_net.get_rnncs()

        if self._is_train_mode and self.expl_expt_mng.is_random(
                self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            actions = q_values.argmax(-1)  # [B,]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, A]
        q_next = self.q_net.t(BATCH.obs_,
                              begin_mask=BATCH.begin_mask)  # [T, B, A]
        q_eval = (q * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q_target = n_step_return(
            BATCH.reward,
            self.gamma,
            BATCH.done,
            q_next.max(-1, keepdim=True)[0],
            BATCH.begin_mask,
            nstep=self._n_step_value).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            self.q_net.sync()
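
Note: ExplorationExploitationClass drives the epsilon-greedy branch in select_action. A sketch of a two-phase schedule consistent with the constructor's parameter names (the library's actual decay curve may differ, e.g. it could be exponential rather than linear in the second phase):

import numpy as np


def epsilon(step, eps_init=1., eps_mid=0.2, eps_final=0.01,
            init2mid_annealing_step=1000, max_step=10000):
    # Phase 1: linear decay eps_init -> eps_mid over init2mid_annealing_step.
    # Phase 2: linear decay eps_mid -> eps_final over the remaining steps.
    if step < init2mid_annealing_step:
        frac = step / init2mid_annealing_step
        return eps_init + frac * (eps_mid - eps_init)
    frac = min(1., (step - init2mid_annealing_step)
               / max(1, max_step - init2mid_annealing_step))
    return eps_mid + frac * (eps_final - eps_mid)


def is_random(step, **kwargs):
    # Epsilon-greedy coin flip, mirroring expl_expt_mng.is_random(step).
    return np.random.rand() < epsilon(step, **kwargs)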