class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()

        # local and target actor/critic networks
        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        # exploration noise: plain random noise instead of an Ornstein-Uhlenbeck process
        # self.noise = OUNoise(out_actor, scale=1.0)
        self.noise = RNoise(out_actor, 0.5)

        # epsilon schedule used to scale the exploration noise
        self.epsilon = 1.
        self.epsilon_decay_rate = 0.999
        self.epsilon_min = 0.2

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=0.0)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0)

    def epsilon_decay(self):
        self.epsilon = max(self.epsilon_decay_rate * self.epsilon, self.epsilon_min)

    def act(self, obs, rand=0., add_noise=True):
        if np.random.random() < rand:
            # with probability `rand`, take a random action (assumes a 2-dimensional action space)
            action = np.random.randn(2)
            action = np.clip(action, -1, 1)  # all actions between -1 and 1
            action = torch.tensor(action, dtype=torch.float)
        else:
            obs = obs.to(device)
            self.actor.eval()
            with torch.no_grad():
                action = self.actor(obs)
            self.actor.train()
        if add_noise:
            action += self.epsilon * self.noise.noise()
        action = action.squeeze(0)
        return action

    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
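
# The agent above relies on a module-level hard_update helper, an RNoise
# exploration-noise class, a global `device`, and a Network class, none of which
# are defined in this snippet. Below is a minimal sketch of the first three,
# assuming RNoise(size, sigma) draws zero-mean Gaussian noise scaled by sigma;
# the internals are illustrative assumptions, not the original implementation.
import numpy as np
import torch
from torch.optim import Adam

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


def hard_update(target, source):
    # copy every source parameter into the target network
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


class RNoise:
    """Simple Gaussian exploration noise (assumed interface: noise(), reset())."""

    def __init__(self, size, sigma=0.5):
        self.size = size
        self.sigma = sigma

    def noise(self):
        # zero-mean Gaussian sample scaled by sigma, returned as a float tensor
        return torch.tensor(self.sigma * np.random.randn(self.size), dtype=torch.float)

    def reset(self):
        pass  # stateless noise has nothing to reset
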
class DDPGAgent:
    def __init__(self, state_size, action_size, hidden_layers, gamma, tau,
                 lr_actor, lr_critic, weight_decay, seed):
        """Initialize DDPG agent."""
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau

        # the actor sees a single agent's observation; the centralized critic
        # sees the concatenated observations and actions of both agents
        self.actor = Network(state_size, hidden_layers[0], hidden_layers[1], action_size, seed, actor=True).to(device)
        self.critic = Network(2 * (state_size + action_size), hidden_layers[2], hidden_layers[3], 1, seed).to(device)
        self.target_actor = Network(state_size, hidden_layers[0], hidden_layers[1], action_size, seed, actor=True).to(device)
        self.target_critic = Network(2 * (state_size + action_size), hidden_layers[2], hidden_layers[3], 1, seed).to(device)

        self.noise = OUNoise(action_size, seed, scale=1.0)

        # initialize targets same as original networks
        # self.hard_update(self.target_actor, self.actor)
        # self.hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay)

    def act(self, state):
        """Calculate actions under current policy for a specific agent."""
        state = torch.from_numpy(state).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        action += self.noise.noise()
        return np.clip(action, -1, 1)

    def step_learn(self, experiences):
        """Update actor and critic using sampled experiences.

        states_list:      list (length num_agents) of 2D tensors (batch_size x state_size)
        actions_list:     list (length num_agents) of 2D tensors (batch_size x action_size)
        rewards:          2D tensor (batch_size x num_agents)
        next_states_list: list (length num_agents) of 2D tensors (batch_size x state_size)
        dones:            2D tensor (batch_size x num_agents)
        """
        states_list, actions_list, rewards, next_states_list, dones = experiences

        next_full_states = torch.cat(next_states_list, dim=1).to(device)  # (batch_size x (num_agents*state_size))
        full_states = torch.cat(states_list, dim=1).to(device)            # (batch_size x (num_agents*state_size))
        full_actions = torch.cat(actions_list, dim=1).to(device)          # (batch_size x (num_agents*action_size))

        # ---------------- update critic ----------------
        # target actions come from the target actor applied to the *next* observations
        next_actions_list = [self.target_actor(next_states) for next_states in next_states_list]
        next_full_actions = torch.cat(next_actions_list, dim=1).to(device)
        Q_target_next = self.target_critic(next_full_states, next_full_actions)  # (batch_size x 1)
        # per-agent (idx_agent) alternative:
        # Q_target = rewards[:, idx_agent].view(-1, 1) + \
        #     self.gamma * Q_target_next * (1.0 - dones[:, idx_agent].view(-1, 1))
        Q_target = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        Q_predict = self.critic(full_states, full_actions)  # (batch_size x 1)

        critic_loss = F.mse_loss(Q_predict, Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------- update actor ----------------
        # per-agent (idx_agent) alternative:
        # action_pred = self.actor(states_list[idx_agent])         # (batch_size x action_size)
        # actions_list_update = actions_list.copy()
        # actions_list_update[idx_agent] = action_pred
        # full_actions_update = torch.cat(actions_list_update, dim=1).to(device)
        # actor_loss = -self.critic(full_states, full_actions_update).mean()
        actions_pred = [self.actor(states) for states in states_list]
        actions_pred_tensor = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic(full_states, actions_pred_tensor).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # soft update target networks
        self.soft_update(self.target_critic, self.critic, self.tau)
        self.soft_update(self.target_actor, self.actor, self.tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def reset(self):
        self.noise.reset()
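
# step_learn above expects `experiences` as (states_list, actions_list, rewards,
# next_states_list, dones), with per-agent lists of batched tensors. A small,
# hypothetical driver that fabricates a batch in that layout just to exercise the
# interface; the sizes and hyperparameters below are assumptions, and Network,
# OUNoise, F, and device are taken to be defined elsewhere.
import torch


def make_dummy_batch(batch_size=8, num_agents=2, state_size=24, action_size=2):
    states_list = [torch.rand(batch_size, state_size) for _ in range(num_agents)]
    actions_list = [torch.rand(batch_size, action_size) * 2 - 1 for _ in range(num_agents)]
    rewards = torch.rand(batch_size, num_agents)
    next_states_list = [torch.rand(batch_size, state_size) for _ in range(num_agents)]
    dones = torch.zeros(batch_size, num_agents)
    return states_list, actions_list, rewards, next_states_list, dones


# agent = DDPGAgent(state_size=24, action_size=2, hidden_layers=[256, 128, 256, 128],
#                   gamma=0.99, tau=1e-3, lr_actor=1e-4, lr_critic=1e-3,
#                   weight_decay=0.0, seed=0)
# agent.step_learn(make_dummy_batch())
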
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-4, lr_critic=1.0e-4):
        super(DDPGAgent, self).__init__()
        self.state_size = in_actor
        self.action_size = out_actor

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)
        # self.noise = OUNoise(action_size)  # single agent only
        self.noise_scale = NOISE_START

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    # def act(self, obs, noise=0.0):
    #     obs = obs.to(device)
    #     action = self.actor(obs) + noise * self.noise.noise().to(device)
    #     return action

    def act(self, states, i_episode, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        # exponential noise-decay schedule tied to the episode number
        if self.noise_scale > NOISE_END:
            # self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING)
        # else keep the previous value

        # use a local scale so evaluation calls (add_noise=False) do not permanently zero the schedule
        noise_scale = self.noise_scale if add_noise else 0.0

        # states = torch.from_numpy(states).float().to(DEVICE)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)  # .cpu().data.numpy()
        self.actor.train()

        # add simple random noise; works much better here than the OU noise process
        actions += noise_scale * self.add_noise2()
        # actions += noise_scale * self.noise.noise()
        return actions

    def add_noise2(self):
        # zero-mean Gaussian noise with sigma 0.5 (a sigma of 1 would leave a lot of actions just clipped)
        noise = 0.5 * torch.randn(self.action_size).to(device)
        return noise

    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise().to(device)
        return action
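
# The act method above depends on module-level noise-schedule constants that are
# not shown in this snippet. The values below are hypothetical placeholders (not
# the original ones) used only to illustrate how the exponential schedule
# noise_scale = NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING) behaves.
NOISE_START = 1.0
NOISE_END = 0.1
NOISE_REDUCTION = 0.99
EPISODES_BEFORE_TRAINING = 300

for i_episode in (300, 400, 500):
    print(i_episode, NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING))
# 300 -> 1.0, 400 -> ~0.366, 500 -> ~0.134
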
class DDPGAgent:
    def __init__(self, state_size, action_size, hidden_in_dim, hidden_out_dim,
                 extrem_out=64, num_agents=2, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()

        # the centralized critic sees the observations and actions of all agents
        critic_state_size = state_size * num_agents
        critic_action_size = action_size * num_agents

        self.actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                             hidden_extrem_out=extrem_out, actor=True).to(device)
        self.critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                              hidden_extrem_out=extrem_out).to(device)
        self.target_actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                                    hidden_extrem_out=extrem_out, actor=True).to(device)
        self.target_critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                                     hidden_extrem_out=extrem_out).to(device)

        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)

        print("critic", self.critic, self.target_critic, "optim", self.critic_optimizer)
        print("actor", self.actor, self.target_actor, "optim", self.actor_optimizer)

    def act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.actor.eval()
        act = self.actor(obs, batch=batch).cpu().data
        no = noise * self.noise.noise()
        # print("act", act, "noise", no)
        action = act + no
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.target_actor.eval()
        action = self.target_actor(obs, batch=batch).cpu().data + noise * self.noise.noise()
        self.target_actor.train()
        return np.clip(action, -1, 1)
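
# A hypothetical usage sketch for the agent above in a two-agent, Tennis-style
# setting. The sizes (state_size=24, action_size=2), hidden dimensions, and
# noise level are assumptions; Network, OUNoise, hard_update, and device must
# already be defined for this to run.
import torch

agents = [DDPGAgent(state_size=24, action_size=2, hidden_in_dim=256, hidden_out_dim=128)
          for _ in range(2)]

obs = torch.rand(2, 24)  # one observation row per agent
actions = [agent.act(obs[i].unsqueeze(0), noise=0.1) for i, agent in enumerate(agents)]
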