def __init__(self,
             lr: float = 5.0e-4,
             eps_init: float = 1,
             eps_mid: float = 0.2,
             eps_final: float = 0.01,
             init2mid_annealing_step: int = 1000,
             assign_interval: int = 1000,
             network_settings: List[int] = [32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'dqn only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings)).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net)
    self._trainer_modules.update(oplr=self.oplr)
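
# --- Hedged sketch (not part of the original file): the 1-step TD target this
# DQN configuration trains toward. Plain PyTorch; `reward`, `done`, and
# `q_next_target` are illustrative stand-ins for the batch tensors produced by
# the replay buffer and the TargetTwin copy of q_net.
import torch as th


def dqn_td_target(reward, done, q_next_target, gamma=0.99):
    # r + gamma * (1 - done) * max_a' Q_target(s', a'); detach so no gradient
    # flows back through the target network output.
    return reward + gamma * (1.0 - done) * \
        q_next_target.max(-1, keepdim=True)[0].detach()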
def __init__(self,
             alpha=0.2,
             beta=0.1,
             polyak=0.995,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             use_epsilon=False,
             q_lr=5.0e-4,
             alpha_lr=5.0e-4,
             auto_adaption=True,
             network_settings=[32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'maxsqn only supports discrete action space'
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.use_epsilon = use_epsilon
    self.polyak = polyak
    self.auto_adaption = auto_adaption
    self.target_entropy = beta * np.log(self.a_dim)
    self.critic = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings),
        self.polyak).to(self.device)
    self.critic2 = deepcopy(self.critic)
    self.critic_oplr = OPLR([self.critic, self.critic2], q_lr,
                            **self._oplr_params)
    if self.auto_adaption:
        # Create log_alpha directly on the target device so it stays a leaf
        # tensor; calling .to(device) afterwards would return a non-leaf
        # tensor that the optimizer cannot update.
        self.log_alpha = th.tensor(0., requires_grad=True, device=self.device)
        self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
        self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
    else:
        self.log_alpha = th.tensor(alpha).log().to(self.device)
    self._trainer_modules.update(critic=self.critic,
                                 critic2=self.critic2,
                                 log_alpha=self.log_alpha,
                                 critic_oplr=self.critic_oplr)
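
# --- Hedged sketch (illustrative, not the repo's code): the soft state value
# implied by this max-entropy setup. For discrete actions,
# V(s) = alpha * logsumexp(Q(s, .) / alpha), which tends to max_a Q(s, a) as
# alpha -> 0; `target_entropy = beta * log|A|` above sets the entropy level
# the auto-tuned alpha chases.
import torch as th


def soft_state_value(q_values, log_alpha):
    alpha = log_alpha.exp()
    return alpha * th.logsumexp(q_values / alpha, dim=-1, keepdim=True)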
def __init__(self,
             lr=5.0e-4,
             alpha=2,
             polyak=0.995,
             network_settings=[32, 32],
             **kwargs):
    super().__init__(**kwargs)
    assert not self.is_continuous, 'sql only supports discrete action space'
    self.alpha = alpha
    self.polyak = polyak
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.a_dim,
                        network_settings=network_settings),
        self.polyak).to(self.device)
    self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
    self._trainer_modules.update(model=self.q_net, oplr=self.oplr)
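
# --- Hedged sketch: the fixed-alpha soft value that soft Q-learning variants
# like this one bootstrap from. Averaging over actions inside the log is one
# common discrete form, written here via logsumexp for numerical stability;
# names and shapes are assumptions, not the repo's API.
import math

import torch as th


def sql_soft_value(q_values, alpha=2.0):
    # alpha * log( mean_a exp(Q(s, a) / alpha) )
    n_actions = q_values.shape[-1]
    return alpha * (th.logsumexp(q_values / alpha, dim=-1, keepdim=True) -
                    math.log(n_actions))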
def __init__(self,
             q_lr=5.0e-3,
             intra_option_lr=5.0e-4,
             termination_lr=5.0e-4,
             use_eps_greedy=False,
             eps_init=1,
             eps_mid=0.2,
             eps_final=0.01,
             init2mid_annealing_step=1000,
             boltzmann_temperature=1.0,
             options_num=4,
             ent_coff=0.01,
             double_q=False,
             use_baseline=True,
             terminal_mask=True,
             termination_regularizer=0.01,
             assign_interval=1000,
             network_settings={
                 'q': [32, 32],
                 'intra_option': [32, 32],
                 'termination': [32, 32]
             },
             **kwargs):
    super().__init__(**kwargs)
    self.expl_expt_mng = ExplorationExploitationClass(
        eps_init=eps_init,
        eps_mid=eps_mid,
        eps_final=eps_final,
        init2mid_annealing_step=init2mid_annealing_step,
        max_step=self._max_train_step)
    self.assign_interval = assign_interval
    self.options_num = options_num
    self.termination_regularizer = termination_regularizer
    self.ent_coff = ent_coff
    self.use_baseline = use_baseline
    self.terminal_mask = terminal_mask
    self.double_q = double_q
    self.boltzmann_temperature = boltzmann_temperature
    self.use_eps_greedy = use_eps_greedy
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.options_num,
                        network_settings=network_settings['q'])).to(
                            self.device)
    self.intra_option_net = OcIntraOption(
        self.obs_spec,
        rep_net_params=self._rep_net_params,
        output_shape=self.a_dim,
        options_num=self.options_num,
        network_settings=network_settings['intra_option']).to(self.device)
    self.termination_net = CriticQvalueAll(
        self.obs_spec,
        rep_net_params=self._rep_net_params,
        output_shape=self.options_num,
        network_settings=network_settings['termination'],
        out_act='sigmoid').to(self.device)
    if self.is_continuous:
        # https://discuss.pytorch.org/t/valueerror-cant-optimize-a-non-leaf-tensor/21751
        # https://blog.csdn.net/nkhgl/article/details/100047276
        # Move to the device first, then flag requires_grad, so log_std stays
        # a leaf tensor that the optimizer can update.
        self.log_std = th.as_tensor(
            np.full((self.options_num, self.a_dim),
                    -0.5)).to(self.device).requires_grad_()  # [P, A]
        self.intra_option_oplr = OPLR([self.intra_option_net, self.log_std],
                                      intra_option_lr, **self._oplr_params)
    else:
        self.intra_option_oplr = OPLR(self.intra_option_net, intra_option_lr,
                                      **self._oplr_params)
    self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
    self.termination_oplr = OPLR(self.termination_net, termination_lr,
                                 **self._oplr_params)
    self._trainer_modules.update(q_net=self.q_net,
                                 intra_option_net=self.intra_option_net,
                                 termination_net=self.termination_net,
                                 q_oplr=self.q_oplr,
                                 intra_option_oplr=self.intra_option_oplr,
                                 termination_oplr=self.termination_oplr)
    self.options = self.new_options = self._generate_random_options()
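
# --- Hedged sketch of how a new option could be drawn from the option-value
# head above: epsilon-greedy over Q(s, option) when use_eps_greedy is set,
# otherwise a Boltzmann draw using boltzmann_temperature. `q_options` of shape
# [B, options_num] is assumed; the repo's actual selection logic may differ.
import torch as th
import torch.distributions as td


def pick_option(q_options, epsilon=0.1, use_eps_greedy=False, temperature=1.0):
    if use_eps_greedy:
        greedy = q_options.argmax(-1)                        # [B]
        random = th.randint(q_options.shape[-1], greedy.shape)
        explore = th.rand(greedy.shape) < epsilon
        return th.where(explore, random, greedy)
    return td.Categorical(logits=q_options / temperature).sample()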
def __init__(self,
             q_lr=5.0e-3,
             intra_option_lr=5.0e-4,
             termination_lr=5.0e-4,
             interest_lr=5.0e-4,
             boltzmann_temperature=1.0,
             options_num=4,
             ent_coff=0.01,
             double_q=False,
             use_baseline=True,
             terminal_mask=True,
             termination_regularizer=0.01,
             assign_interval=1000,
             network_settings={
                 'q': [32, 32],
                 'intra_option': [32, 32],
                 'termination': [32, 32],
                 'interest': [32, 32]
             },
             **kwargs):
    super().__init__(**kwargs)
    self.assign_interval = assign_interval
    self.options_num = options_num
    self.termination_regularizer = termination_regularizer
    self.ent_coff = ent_coff
    self.use_baseline = use_baseline
    self.terminal_mask = terminal_mask
    self.double_q = double_q
    self.boltzmann_temperature = boltzmann_temperature
    self.q_net = TargetTwin(
        CriticQvalueAll(self.obs_spec,
                        rep_net_params=self._rep_net_params,
                        output_shape=self.options_num,
                        network_settings=network_settings['q'])).to(
                            self.device)
    self.intra_option_net = OcIntraOption(
        self.obs_spec,
        rep_net_params=self._rep_net_params,
        output_shape=self.a_dim,
        options_num=self.options_num,
        network_settings=network_settings['intra_option']).to(self.device)
    self.termination_net = CriticQvalueAll(
        self.obs_spec,
        rep_net_params=self._rep_net_params,
        output_shape=self.options_num,
        network_settings=network_settings['termination'],
        out_act='sigmoid').to(self.device)
    self.interest_net = CriticQvalueAll(
        self.obs_spec,
        rep_net_params=self._rep_net_params,
        output_shape=self.options_num,
        network_settings=network_settings['interest'],
        out_act='sigmoid').to(self.device)
    if self.is_continuous:
        # Move to the device first, then flag requires_grad, so log_std stays
        # a leaf tensor that the optimizer can update.
        self.log_std = th.as_tensor(
            np.full((self.options_num, self.a_dim),
                    -0.5)).to(self.device).requires_grad_()  # [P, A]
        self.intra_option_oplr = OPLR([self.intra_option_net, self.log_std],
                                      intra_option_lr, **self._oplr_params)
    else:
        self.intra_option_oplr = OPLR(self.intra_option_net, intra_option_lr,
                                      **self._oplr_params)
    self.q_oplr = OPLR(self.q_net, q_lr, **self._oplr_params)
    self.termination_oplr = OPLR(self.termination_net, termination_lr,
                                 **self._oplr_params)
    self.interest_oplr = OPLR(self.interest_net, interest_lr,
                              **self._oplr_params)
    self._trainer_modules.update(q_net=self.q_net,
                                 intra_option_net=self.intra_option_net,
                                 termination_net=self.termination_net,
                                 interest_net=self.interest_net,
                                 q_oplr=self.q_oplr,
                                 intra_option_oplr=self.intra_option_oplr,
                                 termination_oplr=self.termination_oplr,
                                 interest_oplr=self.interest_oplr)
    self.options = self.new_options = th.tensor(
        np.random.randint(0, self.options_num,
                          self.n_copies)).to(self.device)
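
# --- Hedged sketch of interest-weighted option selection, following the
# interest-option-critic idea: the sigmoid interest head gates the base option
# policy, pi(o|s) proportional to interest(s, o) * pi_Omega(o|s). Tensor names
# and shapes are illustrative assumptions, not the repo's API.
import torch as th
import torch.nn.functional as F


def interest_option_probs(q_options, interest, temperature=1.0):
    pi_omega = F.softmax(q_options / temperature, dim=-1)  # base option policy
    weighted = interest * pi_omega                         # gate by interest
    return weighted / weighted.sum(-1, keepdim=True)       # renormalize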
def __init__(self,
             alpha=0.2,
             annealing=True,
             last_alpha=0.01,
             polyak=0.995,
             use_gumbel=True,
             discrete_tau=1.0,
             network_settings={
                 'actor_continuous': {
                     'share': [128, 128],
                     'mu': [64],
                     'log_std': [64],
                     'soft_clip': False,
                     'log_std_bound': [-20, 2]
                 },
                 'actor_discrete': [64, 32],
                 'q': [128, 128],
                 'v': [128, 128]
             },
             actor_lr=5.0e-4,
             critic_lr=1.0e-3,
             alpha_lr=5.0e-4,
             auto_adaption=True,
             **kwargs):
    super().__init__(**kwargs)
    self.polyak = polyak
    self.use_gumbel = use_gumbel
    self.discrete_tau = discrete_tau
    self.auto_adaption = auto_adaption
    self.annealing = annealing
    self.v_net = TargetTwin(
        CriticValue(self.obs_spec,
                    rep_net_params=self._rep_net_params,
                    network_settings=network_settings['v']),
        self.polyak).to(self.device)
    if self.is_continuous:
        self.actor = ActorCts(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            network_settings=network_settings['actor_continuous']).to(
                self.device)
    else:
        self.actor = ActorDct(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            network_settings=network_settings['actor_discrete']).to(
                self.device)
    # Discrete case: maximum entropy is -log(1/|A|) = log|A|, and the target
    # is set slightly below it; continuous case uses the common -dim(A)
    # heuristic.
    self.target_entropy = 0.98 * (-self.a_dim if self.is_continuous else
                                  np.log(self.a_dim))
    if self.is_continuous or self.use_gumbel:
        self.q_net = CriticQvalueOne(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            action_dim=self.a_dim,
            network_settings=network_settings['q']).to(self.device)
    else:
        self.q_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            network_settings=network_settings['q']).to(self.device)
    self.q_net2 = deepcopy(self.q_net)
    self.actor_oplr = OPLR(self.actor, actor_lr, **self._oplr_params)
    self.critic_oplr = OPLR([self.q_net, self.q_net2, self.v_net], critic_lr,
                            **self._oplr_params)
    if self.auto_adaption:
        # Create log_alpha directly on the device so it remains a leaf tensor
        # the optimizer can update; .to(device) afterwards would break that.
        self.log_alpha = th.tensor(0., requires_grad=True, device=self.device)
        self.alpha_oplr = OPLR(self.log_alpha, alpha_lr, **self._oplr_params)
        self._trainer_modules.update(alpha_oplr=self.alpha_oplr)
    else:
        self.log_alpha = th.tensor(alpha).log().to(self.device)
    if self.annealing:
        self.alpha_annealing = LinearAnnealing(alpha, last_alpha, int(1e6))
    self._trainer_modules.update(actor=self.actor,
                                 v_net=self.v_net,
                                 q_net=self.q_net,
                                 q_net2=self.q_net2,
                                 log_alpha=self.log_alpha,
                                 actor_oplr=self.actor_oplr,
                                 critic_oplr=self.critic_oplr)
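
# --- Hedged sketch of the temperature objective that the auto_adaption branch
# enables in standard SAC: J(alpha) = E[ -log_alpha * (log_pi + H_target) ],
# whose gradient raises alpha whenever policy entropy (-log_pi) drops below
# target_entropy. Illustrative only; the repo's loss may differ in detail.
import torch as th


def alpha_loss(log_alpha, log_pi, target_entropy):
    # Detach the entropy term: only log_alpha receives gradient here.
    return -(log_alpha * (log_pi + target_entropy).detach()).mean()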
class AveragedDQN(SarlOffPolicy):
    """
    Averaged-DQN, http://arxiv.org/abs/1611.01929
    """
    policy_mode = 'off-policy'

    def __init__(self,
                 target_k: int = 4,
                 lr: float = 5.0e-4,
                 eps_init: float = 1,
                 eps_mid: float = 0.2,
                 eps_final: float = 0.01,
                 init2mid_annealing_step: int = 1000,
                 assign_interval: int = 1000,
                 network_settings: List[int] = [32, 32],
                 **kwargs):
        super().__init__(**kwargs)
        assert not self.is_continuous, 'dqn only supports discrete action space'
        self.expl_expt_mng = ExplorationExploitationClass(
            eps_init=eps_init,
            eps_mid=eps_mid,
            eps_final=eps_final,
            init2mid_annealing_step=init2mid_annealing_step,
            max_step=self._max_train_step)
        self.assign_interval = assign_interval
        self.target_k = target_k
        assert self.target_k > 0, "target_k must be greater than 0"
        self.current_target_idx = 0
        self.q_net = CriticQvalueAll(
            self.obs_spec,
            rep_net_params=self._rep_net_params,
            output_shape=self.a_dim,
            network_settings=network_settings).to(self.device)
        self.target_nets = []
        for i in range(self.target_k):
            target_q_net = deepcopy(self.q_net)
            target_q_net.eval()
            sync_params(target_q_net, self.q_net)
            self.target_nets.append(target_q_net)
        self.oplr = OPLR(self.q_net, lr, **self._oplr_params)
        self._trainer_modules.update(model=self.q_net, oplr=self.oplr)

    @iton
    def select_action(self, obs):
        q_values = self.q_net(obs, rnncs=self.rnncs)  # [B, *]
        self.rnncs_ = self.q_net.get_rnncs()
        if self._is_train_mode and self.expl_expt_mng.is_random(
                self._cur_train_step):
            actions = np.random.randint(0, self.a_dim, self.n_copies)
        else:
            for i in range(self.target_k):
                target_q_values = self.target_nets[i](obs, rnncs=self.rnncs)
                q_values += target_q_values
            # Dividing by K+1 is unnecessary: argmax is invariant to it. [B, ]
            actions = q_values.argmax(-1)
        return actions, Data(action=actions)

    @iton
    def _train(self, BATCH):
        q = self.q_net(BATCH.obs, begin_mask=BATCH.begin_mask)  # [T, B, *]
        q_next = 0
        for i in range(self.target_k):
            q_next += self.target_nets[i](BATCH.obs_,
                                          begin_mask=BATCH.begin_mask)
        q_next /= self.target_k  # [T, B, *]
        q_eval = (q * BATCH.action).sum(-1, keepdim=True)  # [T, B, 1]
        q_target = n_step_return(BATCH.reward,
                                 self.gamma,
                                 BATCH.done,
                                 q_next.max(-1, keepdim=True)[0],
                                 BATCH.begin_mask).detach()  # [T, B, 1]
        td_error = q_target - q_eval  # [T, B, 1]
        q_loss = (td_error.square() * BATCH.get('isw', 1.0)).mean()  # 1
        self.oplr.optimize(q_loss)
        return td_error, {
            'LEARNING_RATE/lr': self.oplr.lr,
            'LOSS/loss': q_loss,
            'Statistics/q_max': q_eval.max(),
            'Statistics/q_min': q_eval.min(),
            'Statistics/q_mean': q_eval.mean()
        }

    def _after_train(self):
        super()._after_train()
        if self._cur_train_step % self.assign_interval == 0:
            sync_params(self.target_nets[self.current_target_idx], self.q_net)
            self.current_target_idx = (self.current_target_idx +
                                       1) % self.target_k
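
# --- Hedged sketch of what sync_params is assumed to do here: a hard copy of
# the online network's parameters into one target network. With the round-robin
# index in _after_train, the K targets end up holding snapshots of the online
# net spaced assign_interval steps apart, which is what Averaged-DQN averages
# over. The repo's own helper may differ in detail.
import torch as th


@th.no_grad()
def hard_sync(target_net, online_net):
    for t_param, s_param in zip(target_net.parameters(),
                                online_net.parameters()):
        t_param.copy_(s_param)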