def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
    AgentBase.__init__(self)
    self.explore_noise = 0.05  # standard deviation of explore noise

    self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
    self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
    self.act_target.load_state_dict(self.act.state_dict())

    self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
    self.cri_target = Critic(net_dim, state_dim, action_dim).to(self.device)
    self.cri_target.load_state_dict(self.cri.state_dict())

    self.criterion = nn.MSELoss()
    self.optimizer = torch.optim.Adam(
        [{'params': self.act.parameters(), 'lr': learning_rate},
         {'params': self.cri.parameters(), 'lr': learning_rate}],
        lr=learning_rate)
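This `__init__`, and the full agent classes further down, lean on two pieces that are not shown in this section: an `AgentBase` parent that provides `self.device` plus the `obj_a`/`obj_c` logging fields, and a module-level `soft_target_update` helper that Polyak-averages the target networks. The sketch below is a minimal stand-in under those assumptions (including the default `tau`), not the original implementation.

import torch


class AgentBase:
    def __init__(self):
        # Pick the GPU when available; the agents move every network with .to(self.device).
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.obj_a = 0.0  # last actor objective, read for logging
        self.obj_c = 0.0  # last critic objective, read for logging


def soft_target_update(target, online, tau=5e-3):  # tau value is an assumption
    """Polyak averaging: target <- tau * online + (1 - tau) * target."""
    for tgt_param, src_param in zip(target.parameters(), online.parameters()):
        tgt_param.data.copy_(tau * src_param.data + (1.0 - tau) * tgt_param.data)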
def __init__(self, cfg):
    self.critic_eval = Critic(cfg.n_state, cfg.n_action, cfg.mid_critic)
    self.critic_pred = Critic(cfg.n_state, cfg.n_action, cfg.mid_critic)
    self.actor_eval = Actor(cfg.n_state, cfg.n_action, cfg.mid_actor)
    self.actor_pred = Actor(cfg.n_state, cfg.n_action, cfg.mid_actor)
    hard_update(self.actor_pred, self.actor_eval)    # copy the "eval" weights into the "pred" (target) networks
    hard_update(self.critic_pred, self.critic_eval)

    self.noise = OUANoise()  # Ornstein-Uhlenbeck action noise for exploration
    self.cfg = cfg
    self.epsilon = cfg.epsilon
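This cfg-driven DDPG variant comes from a different codebase: it keeps eval/pred network pairs, copies weights with `hard_update`, and explores with an `OUANoise` process, none of which are defined in this section. A minimal sketch of both helpers follows; the argument order of `hard_update` (target first, source second), the Ornstein-Uhlenbeck parameters, and the `sample()` interface are assumptions.

import numpy as np


def hard_update(target, source):
    """Copy the source network's weights into the target network verbatim."""
    target.load_state_dict(source.state_dict())


class OUANoise:
    """Ornstein-Uhlenbeck action noise, the classic DDPG exploration process."""

    def __init__(self, size=1, mu=0.0, theta=0.15, sigma=0.2, dt=1e-2):
        self.size = size
        self.mu, self.theta, self.sigma, self.dt = mu, theta, sigma, dt
        self.reset()

    def reset(self):
        self.x = np.full(self.size, self.mu, dtype=np.float32)

    def sample(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, 1)
        dx = (self.theta * (self.mu - self.x) * self.dt
              + self.sigma * np.sqrt(self.dt) * np.random.randn(self.size))
        self.x = self.x + dx
        return self.x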
class AgentDDPG(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.05  # standard deviation of explore noise

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target.load_state_dict(self.act.state_dict())

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = self.act(states)
        actions = (actions + torch.randn_like(actions) * self.explore_noise).clamp(-1, 1)
        return actions.detach().cpu().numpy()

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        critic_obj = actor_obj = None  # placeholders, only read for logging below
        for _ in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, state, action, next_state = buffer.random_sample(batch_size)
                next_a = self.act_target(next_state)
                next_q = self.cri_target(next_state, next_a)
                q_label = reward + mask * next_q  # Q target; mask folds in the discount / terminal flag

            q_value = self.cri(state, action)
            critic_obj = self.criterion(q_value, q_label)

            action_pg = self.act(state)  # action for the deterministic policy gradient
            actor_obj = -self.cri_target(state, action_pg).mean()

            united_obj = actor_obj + critic_obj  # single objective for the shared optimizer
            self.optimizer.zero_grad()
            united_obj.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)

        self.obj_a = actor_obj.item()
        self.obj_c = critic_obj.item()
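`AgentDDPG` (and `AgentTD3` below) assume an `Actor` that maps states to actions in [-1, 1] and a `Critic` that maps a (state, action) pair to a scalar Q value, with constructors taking `(net_dim, state_dim, action_dim)`. The sketch below matches those calls; the hidden-layer layout, the `tanh` output, and the ±0.5 clipping of the smoothing noise in `get_action` are assumptions, not the networks used by the original code. (The cfg-based snippet earlier uses a different constructor order, `(n_state, n_action, mid_dim)`.)

import torch
import torch.nn as nn


class Actor(nn.Module):
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, action_dim))

    def forward(self, state):
        return self.net(state).tanh()  # actions bounded to [-1, 1]

    def get_action(self, state, noise_std):
        # Used by TD3 for target policy smoothing: add clipped Gaussian noise.
        # The +/-0.5 clip range is the usual TD3 choice, assumed here.
        action = self.net(state).tanh()
        noise = (torch.randn_like(action) * noise_std).clamp(-0.5, 0.5)
        return (action + noise).clamp(-1.0, 1.0)


class Critic(nn.Module):
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim + action_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, 1))

    def forward(self, state, action):
        return self.net(torch.cat((state, action), dim=1))  # Q(s, a)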
class AgentTD3(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2   # standard deviation of policy (target smoothing) noise
        self.update_freq = 2      # delay update frequency, for soft target update

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target.load_state_dict(self.act.state_dict())

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = CriticTwin(net_dim, state_dim, action_dim).to(self.device)  # target must be a twin critic, matching self.cri
        self.cri_target.load_state_dict(self.cri.state_dict())

        self.criterion = nn.MSELoss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}],
            lr=learning_rate)

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        critic_obj = actor_obj = None  # placeholders, only read for logging below
        for i in range(int(max_step * repeat_times)):
            with torch.no_grad():
                reward, mask, state, action, next_s = buffer.random_sample(batch_size)
                next_a = self.act_target.get_action(next_s, self.policy_noise)  # target policy smoothing noise
                next_q = torch.min(*self.cri_target.get__q1_q2(next_s, next_a))  # twin critics: take the smaller Q
                q_label = reward + mask * next_q

            q1, q2 = self.cri.get__q1_q2(state, action)
            critic_obj = self.criterion(q1, q_label) + self.criterion(q2, q_label)  # twin critic loss

            action_pg = self.act(state)  # action for the deterministic policy gradient
            actor_obj = -self.cri_target(state, action_pg).mean()

            united_obj = actor_obj + critic_obj  # single objective for the shared optimizer
            self.optimizer.zero_grad()
            united_obj.backward()
            self.optimizer.step()

            if i % self.update_freq == 0:  # delayed (less frequent) target update
                soft_target_update(self.cri_target, self.cri)
                soft_target_update(self.act_target, self.act)

        self.obj_a = actor_obj.item()
        self.obj_c = critic_obj.item()
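TD3's update relies on a `CriticTwin` exposing both a single-Q `forward` (used for the actor objective) and `get__q1_q2` returning both heads for the clipped double-Q target. The sketch below is one way to satisfy that interface; the layer sizes are assumptions, and returning the first head from `forward` (rather than, say, the minimum of the two) is a common choice, not necessarily the original one.

import torch
import torch.nn as nn


class CriticTwin(nn.Module):
    """Two independent Q heads over the same (state, action) input, as TD3's clipped double-Q needs."""

    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()

        def build_q_net():
            return nn.Sequential(nn.Linear(state_dim + action_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, 1))

        self.q1_net = build_q_net()
        self.q2_net = build_q_net()

    def forward(self, state, action):
        # Single-Q view, used when computing the actor objective.
        return self.q1_net(torch.cat((state, action), dim=1))

    def get__q1_q2(self, state, action):
        # Both heads, used for the critic loss and the clipped double-Q target.
        sa = torch.cat((state, action), dim=1)
        return self.q1_net(sa), self.q2_net(sa)

Taking `torch.min` over the two heads in the target, as `update_policy` does above, is what curbs the Q-value overestimation that plain DDPG suffers from; the delayed target update (`update_freq = 2`) and the smoothing noise added in `get_action` are the other two TD3 ingredients.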