Example No. 1
    def __init__(self,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'dqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.a_dim,
                            network_settings=network_settings)).to(self.device)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net)
        self._trainer_modules.update(oplr=self.oplr)
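
The ExplorationExploitationClass used above is not part of this snippet. As a rough sketch (an assumption about its behavior, not the library's actual implementation), a piecewise-linear epsilon schedule driven by the same parameters could look like this:

def epsilon_at(step,
               eps_init=1.0,
               eps_mid=0.2,
               eps_final=0.01,
               init2mid_annealing_step=1000,
               max_step=10000):
    """Hypothetical piecewise-linear schedule: anneal eps_init -> eps_mid over
    init2mid_annealing_step steps, then eps_mid -> eps_final over the rest."""
    if step < init2mid_annealing_step:
        frac = step / init2mid_annealing_step
        return eps_init + frac * (eps_mid - eps_init)
    frac = min(1.0, (step - init2mid_annealing_step) /
               max(1, max_step - init2mid_annealing_step))
    return eps_mid + frac * (eps_final - eps_mid)
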
Example No. 2
    def __init__(self,
                 target_k: int = 4,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'dqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.target_k = target_k
        assert self.target_k > 0, "target_k must be greater than 0"
        self.current_target_idx = 0

        self.q_net = CriticQvalueAll(self.obs_spec,
                                     rep_net_params=self._rep_net_params,
                                     output_shape=self.a_dim,
                                     network_settings=network_settings).to(
                                         self.device)
        self.target_nets = []
        for i in range(self.target_k):
            target_q_net = deepcopy(self.q_net)
            target_q_net.eval()
            sync_params(target_q_net, self.q_net)
            self.target_nets.append(target_q_net)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
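
The target_k frozen copies built here are what gives Averaged-DQN its name: the bootstrap value is averaged over all of them. A minimal sketch of that averaging, assuming each target network maps an observation batch to [B, A] Q-values (the library's own training step appears in full in Example No. 8 below):

import torch as th

def averaged_target_q(target_nets, obs):
    """Average the Q-values of K frozen target networks (Averaged-DQN)."""
    with th.no_grad():
        q_sum = sum(net(obs) for net in target_nets)
    return q_sum / len(target_nets)  # [B, A]
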
Example No. 3
    def __init__(self,
                 alpha=0.2,
                 beta=0.1,
                 polyak=0.995,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 use_epsilon=False,
                 q_lr=5.0e-4,
                 alpha_lr=5.0e-4,
                 auto_adaption=True,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'maxsqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(eps_init=eps_init,
                                                          eps_mid=eps_mid,
                                                          eps_final=eps_final,
                                                          init2mid_annealing_step=init2mid_annealing_step,
                                                          max_step=self._max_train_step)
        self.use_epsilon = use_epsilon
        self.polyak = polyak
        self.auto_adaption = auto_adaption
        self.target_entropy = beta * np.log(self.a_dim)

        self.critic = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                 rep_net_params=self._rep_net_params,
                                                 output_shape=self.a_dim,
                                                 network_settings=network_settings),
                                 self.polyak).to(self.device)
        self.critic2 = deepcopy(self.critic)

        self.critic_oplr = OPLR([self.critic, self.critic2], q_lr, **self._oplr_params)

        if self.auto_adaption:
            # construct directly on the device so log_alpha stays a leaf tensor the optimizer can update
            self.log_alpha = th.tensor(0., requires_grad=True, device=self.device)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)

        self._trainer_modules.update(critic=self.critic,
                                     critic2=self.critic2,
                                     log_alpha=self.log_alpha,
                                     critic_oplr=self.critic_oplr)
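
TargetTwin is constructed here with a polyak coefficient, so it presumably maintains a lagging target copy via soft updates. A generic sketch of such a Polyak update (an assumption about TargetTwin's behavior, not its actual code):

import torch as th

@th.no_grad()
def soft_update(target_net, online_net, polyak=0.995):
    """Polyak averaging: target <- polyak * target + (1 - polyak) * online."""
    for t_p, o_p in zip(target_net.parameters(), online_net.parameters()):
        t_p.mul_(polyak).add_((1.0 - polyak) * o_p)
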
Example No. 4
    def __init__(self,
                 lr=5.0e-4,
                 alpha=2,
                 polyak=0.995,
                 network_settings=[32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'sql only support discrete action space'
        self.alpha = alpha
        self.polyak = polyak

        self.q_net = TargetTwin(CriticQvalueAll(self.obs_spec,
                                                rep_net_params=self._rep_net_params,
                                                output_shape=self.a_dim,
                                                network_settings=network_settings),
                                self.polyak).to(self.device)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net,
                                     oplr=self.oplr)
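
Soft Q-learning replaces the hard max of the Bellman backup with a temperature-scaled log-sum-exp, which is where the alpha parameter above enters. A sketch of that soft state value for a [B, A] Q-value tensor (illustrative, not the library's code):

import torch as th

def soft_value(q_values, alpha=2.0):
    """Soft state value V(s) = alpha * logsumexp(Q(s, .) / alpha) over the action axis."""
    return alpha * th.logsumexp(q_values / alpha, dim=-1, keepdim=True)  # [B, 1]
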
Example No. 5
    def __init__(self,
                 q_lr=5.0e-3,
                 intra_option_lr=5.0e-4,
                 termination_lr=5.0e-4,
                 use_eps_greedy=False,
                 eps_init=1,
                 eps_mid=0.2,
                 eps_final=0.01,
                 init2mid_annealing_step=1000,
                 boltzmann_temperature=1.0,
                 options_num=4,
                 ent_coff=0.01,
                 double_q=False,
                 use_baseline=True,
                 terminal_mask=True,
                 termination_regularizer=0.01,
                 assign_interval=1000,
                 network_settings={
                     'q': [32, 32],
                     'intra_option': [32, 32],
                     'termination': [32, 32]
                 },
                 **kwargs):
        super().__init__(**kwargs)
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature
        self.use_eps_greedy = use_eps_greedy

        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.options_num,
                            network_settings=network_settings['q'])).to(
                                self.device)

        self.intra_option_net = OcIntraOption(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            options_num=self.options_num,
            network_settings=network_settings['intra_option']).to(self.device)
        self.termination_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.options_num,
            network_settings=network_settings['termination'],
            out_act='sigmoid').to(self.device)

        if self.is_continuous:
            # https://discuss.pytorch.org/t/valueerror-cant-optimize-a-non-leaf-tensor/21751
            # https://blog.csdn.net/nkhgl/article/details/100047276
            # construct on the device first so log_std stays a leaf tensor the optimizer can update
            self.log_std = th.as_tensor(
                np.full((self.options_num, self.a_dim), -0.5),
                device=self.device).requires_grad_()  # [P, A]
            self.intra_option_oplr = OPLR(
                [self.intra_option_net, self.log_std], intra_option_lr,
                **self._oplr_params)
        else:
            self.intra_option_oplr = OPLR(self.intra_option_net,
                                          intra_option_lr, **self._oplr_params)
        self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
        self.termination_oplr = OPLR(self.termination_net, termination_lr,
                                     **self._oplr_params)

        self._trainer_modules.update(q_net=self.q_net,
                                     intra_option_net=self.intra_option_net,
                                     termination_net=self.termination_net,
                                     q_oplr=self.q_oplr,
                                     intra_option_oplr=self.intra_option_oplr,
                                     termination_oplr=self.termination_oplr)
        self.options = self.new_options = self._generate_random_options()
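
The `_generate_random_options()` helper called on the last line is not shown in this snippet. Judging from the inline version in Example No. 6 below, it likely samples one random option index per parallel environment copy, roughly:

import numpy as np
import torch as th

def _generate_random_options(self):
    """Hypothetical helper: one random option index for each of the n_copies environments."""
    return th.tensor(
        np.random.randint(0, self.options_num, self.n_copies)).to(self.device)
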
Example No. 6
    def __init__(
            self,
            q_lr=5.0e-3,
            intra_option_lr=5.0e-4,
            termination_lr=5.0e-4,
            interest_lr=5.0e-4,
            boltzmann_temperature=1.0,
            options_num=4,
            ent_coff=0.01,
            double_q=False,
            use_baseline=True,
            terminal_mask=True,
            termination_regularizer=0.01,
            assign_interval=1000,
            network_settings={
                'q': [32, 32],
                'intra_option': [32, 32],
                'termination': [32, 32],
                'interest': [32, 32]
            },
            **kwargs):
        super().__init__(**kwargs)
        self.assign_interval = assign_interval
        self.options_num = options_num
        self.termination_regularizer = termination_regularizer
        self.ent_coff = ent_coff
        self.use_baseline = use_baseline
        self.terminal_mask = terminal_mask
        self.double_q = double_q
        self.boltzmann_temperature = boltzmann_temperature

        self.q_net = TargetTwin(
            CriticQvalueAll(self.obs_spec,
                            rep_net_params=self._rep_net_params,
                            output_shape=self.options_num,
                            network_settings=network_settings['q'])).to(
                                self.device)

        self.intra_option_net = OcIntraOption(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            options_num=self.options_num,
            network_settings=network_settings['intra_option']).to(self.device)
        self.termination_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.options_num,
            network_settings=network_settings['termination'],
            out_act='sigmoid').to(self.device)
        self.interest_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.options_num,
            network_settings=network_settings['interest'],
            out_act='sigmoid').to(self.device)

        if self.is_continuous:
            # construct on the device first so log_std stays a leaf tensor the optimizer can update
            self.log_std = th.as_tensor(
                np.full((self.options_num, self.a_dim), -0.5),
                device=self.device).requires_grad_()  # [P, A]
            self.intra_option_oplr = OPLR(
                [self.intra_option_net, self.log_std], intra_option_lr,
                **self._oplr_params)
        else:
            self.intra_option_oplr = OPLR(self.intra_option_net,
                                          intra_option_lr, **self._oplr_params)

        self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
        self.termination_oplr = OPLR(self.termination_net, termination_lr,
                                     **self._oplr_params)
        self.interest_oplr = OPLR(self.interest_net, interest_lr,
                                  **self._oplr_params)

        self._trainer_modules.update(q_net=self.q_net,
                                     intra_option_net=self.intra_option_net,
                                     termination_net=self.termination_net,
                                     interest_net=self.interest_net,
                                     q_oplr=self.q_oplr,
                                     intra_option_oplr=self.intra_option_oplr,
                                     termination_oplr=self.termination_oplr,
                                     interest_oplr=self.interest_oplr)

        self.options = self.new_options = th.tensor(
            np.random.randint(0, self.options_num,
                              self.n_copies)).to(self.device)
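
The interest network outputs a sigmoid score per option; in the interest-option-critic formulation those scores reweight the policy over options. A rough sketch of that reweighting for [B, options_num] inputs (an illustration of the idea, not this class's actual option-selection code):

import torch as th

def option_probs_with_interest(q_options, interest, boltzmann_temperature=1.0):
    """Boltzmann policy over option values, reweighted by per-option interest
    scores and renormalized."""
    pi = th.softmax(q_options / boltzmann_temperature, dim=-1)
    weighted = pi * interest
    return weighted / weighted.sum(-1, keepdim=True)
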
Example No. 7
    def __init__(
            self,
            alpha=0.2,
            annealing=True,
            last_alpha=0.01,
            polyak=0.995,
            use_gumbel=True,
            discrete_tau=1.0,
            network_settings={
                'actor_continuous': {
                    'share': [128, 128],
                    'mu': [64],
                    'log_std': [64],
                    'soft_clip': False,
                    'log_std_bound': [-20, 2]
                },
                'actor_discrete': [64, 32],
                'q': [128, 128],
                'v': [128, 128]
            },
            actor_lr=5.0e-4,
            critic_lr=1.0e-3,
            alpha_lr=5.0e-4,
            auto_adaption=True,
            **kwargs):
        super().__init__(**kwargs)
        self.polyak = polyak
        self.use_gumbel = use_gumbel
        self.discrete_tau = discrete_tau
        self.auto_adaption = auto_adaption
        self.annealing = annealing

        self.v_net = TargetTwin(
            CriticValue(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        network_settings=network_settings['v']),
            self.polyak).to(self.device)

        if self.is_continuous:
            self.actor = ActorCts(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_continuous']).to(
                    self.device)
        else:
            self.actor = ActorDct(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['actor_discrete']).to(
                    self.device)

        # entropy = -log(1/|A|) = log |A|
        self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                      np.log(self.a_dim))

        if self.is_continuous or self.use_gumbel:
            self.q_net = CriticQvalueOne(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                action_dim=self.a_dim,
                network_settings=network_settings['q']).to(self.device)
        else:
            self.q_net = CriticQvalueAll(
                self.obs_spec,
                rep_net_params=self._rep_net_params,
                output_shape=self.a_dim,
                network_settings=network_settings['q']).to(self.device)
        self.q_net2 = deepcopy(self.q_net)

        self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
        self.critic_oplr = OPLR([self.q_net, self.q_net2, self.v_net],
                                critic_lr, **self._oplr_params)

        if self.auto_adaption:
            # construct directly on the device so log_alpha stays a leaf tensor the optimizer can update
            self.log_alpha = th.tensor(0., requires_grad=True, device=self.device)
            self.alpha_oplr = OPLR(self.log_alpha, alpha_lr,
                                   **self._oplr_params)
            self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
        else:
            self.log_alpha = th.tensor(alpha).log().to(self.device)
            if self.annealing:
                self.alpha_annealing = LinearAnnealing(alpha, last_alpha,
                                                       int(1e6))

        self._trainer_modules.update(actor=self.actor,
                                     v_net=self.v_net,
                                     q_net=self.q_net,
                                     q_net2=self.q_net2,
                                     log_alpha=self.log_alpha,
                                     actor_oplr=self.actor_oplr,
                                     critic_oplr=self.critic_oplr)
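
When auto_adaption is enabled, log_alpha is trained toward the target entropy computed above. The usual SAC temperature objective, shown here as a generic sketch rather than this class's exact update, is:

import torch as th

def alpha_loss(log_alpha, log_prob, target_entropy):
    """J(alpha) = -E[ log_alpha * (log pi(a|s) + target_entropy) ];
    log_prob is detached so only alpha receives gradients."""
    return -(log_alpha * (log_prob + target_entropy).detach()).mean()
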
Example No. 8
class AveragedDQN(SarlOffPolicy):
    """
    Averaged-DQN, http://arxiv.org/abs/1611.01929
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 target_k: int = 4,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'dqn only support discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.target_k = target_k
        assert self.target_k > 0, "target_k must be greater than 0"
        self.current_target_idx = 0

        self.q_net = CriticQvalueAll(self.obs_spec,
                                     rep_net_params=self._rep_net_params,
                                     output_shape=self.a_dim,
                                     network_settings=network_settings).to(
                                         self.device)
        self.target_nets = []
        for i in range(self.target_k):
            target_q_net = deepcopy(self.q_net)
            target_q_net.eval()
            sync_params(target_q_net, self.q_net)
            self.target_nets.append(target_q_net)

        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [B, *]
        self.rnncs_ = self.q_net.get_rnncs()

        if self._is_train_mode and self.expl_expt_mng.is_random(
                self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            for i in range(self.target_k):
                target_q_values = self.target_nets[i](obs, rnncs=self.rnncs)
                q_values += target_q_values
            actions = q_values.argmax(-1)  # averaging is optional here: argmax of the sum picks the same action  [B, ]
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, *]
        q_next = 0
        for i in range(self.target_k):
            q_next += self.target_nets[i](BATCH.obs_,
                                          begin_mask=BATCH.begin_mask)
        q_next /= self.target_k  # [T, B, *]
        q_eval = (q * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward, self.gamma, BATCH.done,
                                 q_next.max(-1, keepdim=True)[0],
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1

        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            sync_params(self.target_nets[self.current_target_idx], self.q_net)
            self.current_target_idx = (self.current_target_idx +
                                       1) % self.target_k
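
`sync_params(dst, src)` is imported from the library and not shown here; for the round-robin update above it presumably performs a hard parameter copy along these lines (a sketch, not the library's implementation):

import torch as th

@th.no_grad()
def hard_sync(target_net, source_net):
    """Copy every parameter of source_net into the corresponding parameter of target_net."""
    for t_p, s_p in zip(target_net.parameters(), source_net.parameters()):
        t_p.copy_(s_p)
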