class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.noise = OUNoise(out_actor, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
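# Shared context assumed by every snippet in this section but not shown in any of them;
# a minimal sketch of the imports and device handle these classes rely on. The exact
# device string is an assumption.
import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import Adam

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")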
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor, lr_actor=1.0e-3):
    super(DDPGAgent, self).__init__()
    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.noise = OUNoise(out_actor, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
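# hard_update / soft_update are called throughout these snippets but defined elsewhere.
# A minimal sketch under the usual DDPG target-network convention: a hard copy at
# initialization and Polyak averaging with rate tau during training; it mirrors the
# method versions in the step_learn agent further down.
def hard_update(target, source):
    """Copy the source network's parameters into the target network verbatim."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    """Blend the source into the target: theta_target <- tau*theta + (1 - tau)*theta_target."""
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)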
class DDPGAgent:
    def __init__(self, in_actor, out_actor, n_filt_actor, kernel_size_actor, stride_actor, fc_units_actor,
                 in_critic, n_filt_critic, kernel_size_critic, stride_critic, fc_units_critic,
                 lr_actor=1.0e-3, lr_critic=1.0e-5):  # 1e-5 was getting to 0.4 score (sporadically)
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, out_actor, n_filt_actor, kernel_size_actor, stride_actor,
                             fc_units_actor, actor=True).to(device)
        self.critic = Network(in_critic, 1, n_filt_critic, kernel_size_critic, stride_critic,
                              fc_units_critic).to(device)
        self.target_actor = Network(in_actor, out_actor, n_filt_actor, kernel_size_actor, stride_actor,
                                    fc_units_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, 1, n_filt_critic, kernel_size_critic, stride_critic,
                                     fc_units_critic).to(device)
        self.noise = OUNoise(out_actor, scale=.1)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-3)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs) + noise * self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
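# OUNoise appears in almost every variant but its definition is not part of this section.
# A minimal sketch of an Ornstein-Uhlenbeck process exposing the noise()/reset() interface
# used above; mu, theta and sigma are illustrative defaults, and the extra seed argument
# some variants pass is omitted here.
import numpy as np
import torch


class OUNoise:
    def __init__(self, action_dimension, scale=0.1, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dimension = action_dimension
        self.scale = scale
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # return the internal state to the mean
        self.state = np.ones(self.action_dimension) * self.mu

    def noise(self):
        # one Euler step of the OU process, returned as a scaled tensor
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dimension)
        self.state = x + dx
        return torch.tensor(self.state * self.scale).float()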
def __init__(self, discount_factor=0.95, tau=0.1):
    super(MADDPG, self).__init__()

    # DDPGAgent used only to train independent actors
    self.maddpg_agent = [
        DDPGAgent(24, 256, 128, 2),
        DDPGAgent(24, 256, 128, 2)
    ]

    # Shared critic trained for both agents
    # critic input = obs_full + actions = 48+2+2=52
    self.critic = Network(52, 256, 128, 1).to(device)
    self.target_critic = Network(52, 256, 128, 1).to(device)

    # initialize targets same as original networks
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=1.0e-3, weight_decay=0.0)

    self.discount_factor = discount_factor
    self.tau = tau
    self.iter = 0
def __init__(self, state_size, action_size, hidden_in_dim, hidden_out_dim, extrem_out=64,
             num_agents=2, lr_actor=1.0e-4, lr_critic=1.0e-3):
    super(DDPGAgent, self).__init__()
    critic_state_size = state_size * num_agents
    critic_action_size = action_size * num_agents

    self.actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                         hidden_extrem_out=extrem_out, actor=True).to(device)
    self.critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                          hidden_extrem_out=extrem_out).to(device)
    self.target_actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                                hidden_extrem_out=extrem_out, actor=True).to(device)
    self.target_critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                                 hidden_extrem_out=extrem_out).to(device)
    self.noise = OUNoise(action_size, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)

    print("critic", self.critic, self.target_critic, "optim", self.critic_optimizer)
    print("actor", self.actor, self.target_actor, "optim", self.actor_optimizer)
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-2, lr_critic=1.0e-2):
        super(DDPGAgent, self).__init__()
        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        # self.noise = OUNoise(out_actor, scale=1.0)
        self.noise = RNoise(out_actor, 0.5)
        self.epsilon = 1.
        self.epsilon_decay_rate = 0.999
        self.epsilon_min = 0.2

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=0.0)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0)

    def epsilon_decay(self):
        self.epsilon = max(self.epsilon_decay_rate * self.epsilon, self.epsilon_min)

    def act(self, obs, rand=0., add_noise=True):
        if np.random.random() < rand:
            action = np.random.randn(2)      # select an action (for each agent)
            action = np.clip(action, -1, 1)  # all actions between -1 and 1
            action = torch.tensor(action, dtype=torch.float)
        else:
            obs = obs.to(device)
            self.actor.eval()
            with torch.no_grad():
                action = self.actor(obs)
            self.actor.train()
            if add_noise:
                action += self.epsilon * self.noise.noise()
            action = action.squeeze(0)
        return action

    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise()
        return action
def __init__(self, in_actor, out_actor, in_critic, lr_actor=1.0e-4, lr_critic=1.0e-3):
    super(DDPGAgent, self).__init__()
    hidden_in_actor = 64
    hidden_out_actor = 128
    hidden_in_critic = hidden_in_actor
    hidden_out_critic = hidden_out_actor

    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1, out_actor, actor=False).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1, out_actor, actor=False).to(device)

    self.noise = OUNoise(out_actor, scale=0.9)  # scale 1.0
    self.noise_shape = out_actor

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    WD = 1e-5
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=WD)
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic, hidden_in_critic, hidden_out_critic,
             lr_actor=1.0e-2, lr_critic=1.0e-2):
    super(DDPGAgent, self).__init__()
    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

    # self.noise = OUNoise(out_actor, scale=1.0)
    self.noise = RNoise(out_actor, 0.5)
    self.epsilon = 1.
    self.epsilon_decay_rate = 0.999
    self.epsilon_min = 0.2

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=0.0)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0.0)
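# RNoise, used by the two variants above, is not defined in this section either. A
# hypothetical stand-in, assuming it is a plain zero-mean Gaussian noise source with the
# same noise()/reset() interface and a (dimension, sigma) constructor; the real class may
# differ.
import torch


class RNoise:
    def __init__(self, action_dimension, sigma=0.5):
        self.action_dimension = action_dimension
        self.sigma = sigma

    def noise(self):
        # fresh zero-mean Gaussian sample, scaled by sigma
        return self.sigma * torch.randn(self.action_dimension)

    def reset(self):
        # stateless noise source, nothing to reset
        pass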
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic, hidden_in_critic, hidden_out_critic,
             lr_actor=1.0e-4, lr_critic=1.0e-4):
    super(DDPGAgent, self).__init__()
    self.state_size = in_actor
    self.action_size = out_actor

    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

    self.noise = OUNoise(out_actor, scale=1.0)
    # self.noise = OUNoise(action_size)  # single agent only
    self.noise_scale = NOISE_START

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)
def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
             in_critic, hidden_in_critic, hidden_out_critic,
             lr_actor=3.0e-5, lr_critic=1.0e-5):
    super(DDPGAgent, self).__init__()
    self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
    self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
    self.noise = OUNoise(out_actor, scale=1.0)

    # initialize targets same as original networks
    hard_update(self.target_actor, self.actor)
    hard_update(self.target_critic, self.critic)

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor, weight_decay=1.e-5)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)
def __init__(self, state_size, action_size, hidden_layers, gamma, tau,
             lr_actor, lr_critic, weight_decay, seed):
    """Initialize DDPG agent."""
    self.state_size = state_size
    self.action_size = action_size
    self.gamma = gamma
    self.tau = tau

    self.actor = Network(state_size, hidden_layers[0], hidden_layers[1], action_size, seed, actor=True).to(device)
    self.critic = Network(2 * (state_size + action_size), hidden_layers[2], hidden_layers[3], 1, seed).to(device)
    self.target_actor = Network(state_size, hidden_layers[0], hidden_layers[1], action_size, seed, actor=True).to(device)
    self.target_critic = Network(2 * (state_size + action_size), hidden_layers[2], hidden_layers[3], 1, seed).to(device)

    self.noise = OUNoise(action_size, seed, scale=1.0)

    '''
    # initialize targets same as original networks
    self.hard_update(self.target_actor, self.actor)
    self.hard_update(self.target_critic, self.critic)
    '''

    self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay)
class MADDPG:
    def __init__(self, discount_factor=0.95, tau=0.1):
        super(MADDPG, self).__init__()

        # DDPGAgent used only to train independent actors
        self.maddpg_agent = [
            DDPGAgent(24, 256, 128, 2),
            DDPGAgent(24, 256, 128, 2)
        ]

        # Shared critic trained for both agents
        # critic input = obs_full + actions = 48+2+2=52
        self.critic = Network(52, 256, 128, 1).to(device)
        self.target_critic = Network(52, 256, 128, 1).to(device)

        # initialize targets same as original networks
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=1.0e-3, weight_decay=0.0)

        self.discount_factor = discount_factor
        self.tau = tau
        self.iter = 0

    def get_actors(self):
        """get actors of all the agents in the MADDPG object"""
        actors = [ddpg_agent.actor for ddpg_agent in self.maddpg_agent]
        return actors

    def get_target_actors(self):
        """get target_actors of all the agents in the MADDPG object"""
        target_actors = [ddpg_agent.target_actor for ddpg_agent in self.maddpg_agent]
        return target_actors

    def act(self, obs_all_agents, noise=0.0):
        """get actions from all agents in the MADDPG object"""
        actions = [agent.act(obs, noise) for agent, obs in zip(self.maddpg_agent, obs_all_agents)]
        return actions

    def target_act(self, obs_all_agents_list, noise=0.0):
        """get target network actions from all the agents in the MADDPG object"""
        target_actions_list = []
        for obs_all_agents in obs_all_agents_list:
            target_actions = []
            for ddpg_agent, obs in zip(self.maddpg_agent, obs_all_agents):
                target_actions.append(ddpg_agent.target_act(obs, noise))
            target_actions_list.append(torch.stack(target_actions))
        return target_actions_list

    def act_on_list(self, obs_all_agents_list, agent_number):
        actions_list = []
        for obs_all_agents in obs_all_agents_list:
            actions = []
            for i in range(len(self.maddpg_agent)):
                if i == agent_number:
                    actions.append(self.maddpg_agent[i].actor(obs_all_agents[i]))
                else:
                    actions.append(self.maddpg_agent[i].actor(obs_all_agents[i]).detach())
            actions_list.append(torch.stack(actions))
        return actions_list

    @staticmethod
    def convert_samples_to_tensor(samples):
        obs, actions, rewards, next_obs, dones = [], [], [], [], []
        for sample in samples:
            obs.append(torch.tensor(sample[0], dtype=torch.float))
            actions.append(torch.tensor(sample[1], dtype=torch.float))
            rewards.append(torch.tensor(sample[2], dtype=torch.float))
            next_obs.append(torch.tensor(sample[3], dtype=torch.float))
            dones.append(torch.tensor(sample[4], dtype=torch.float))
        return obs, actions, rewards, next_obs, dones

    def update(self, samples, agent_number):
        """update the critics and actors of all the agents"""
        obs_full, actions, rewards, next_obs_full, dones = self.convert_samples_to_tensor(samples)

        obs_full_s = torch.stack(obs_full)
        next_obs_full_s = torch.stack(next_obs_full)
        obs_full_c = torch.reshape(obs_full_s, (len(samples), -1))
        next_obs_full_c = torch.reshape(next_obs_full_s, (len(samples), -1))

        agent = self.maddpg_agent[agent_number]
        self.critic_optimizer.zero_grad()

        # critic loss = batch mean of (y - Q(s,a) from target network)^2
        # y = reward of this timestep + discount * Q(st+1, at+1) from target network
        target_actions = self.target_act(next_obs_full_s)
        target_actions_s = torch.stack(target_actions)
        target_actions_c = torch.reshape(target_actions_s, (len(samples), -1))

        target_critic_input = torch.cat((next_obs_full_c, target_actions_c), dim=1).to(device)

        with torch.no_grad():
            q_next = self.target_critic(target_critic_input)

        rewards_s = torch.stack(rewards)
        dones_s = torch.stack(dones)
        y = rewards_s[:, agent_number].view(-1, 1) + \
            self.discount_factor * q_next * (1 - dones_s[:, agent_number].view(-1, 1))

        actions_s = torch.stack(actions)
        actions_c = torch.reshape(actions_s, (len(samples), -1))
        critic_input = torch.cat((obs_full_c, actions_c), dim=1).to(device)
        q = self.critic(critic_input)

        huber_loss = torch.nn.SmoothL1Loss()
        critic_loss = huber_loss(q, y.detach())
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        # update actor network using policy gradient
        agent.actor_optimizer.zero_grad()
        q_input = self.act_on_list(obs_full_s, agent_number)
        q_input_s = torch.stack(q_input)
        q_input_c = torch.reshape(q_input_s, (len(samples), -1))

        # combine all the actions and observations for input to critic
        q_input2 = torch.cat((obs_full_c, q_input_c), dim=1)

        # get the policy gradient
        actor_loss = -self.critic(q_input2).mean()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.actor.parameters(), 1)
        agent.actor_optimizer.step()

        al = actor_loss.cpu().detach().item()
        cl = critic_loss.cpu().detach().item()

    def update_targets(self):
        """soft update targets"""
        self.iter += 1
        for ddpg_agent in self.maddpg_agent:
            soft_update(ddpg_agent.target_actor, ddpg_agent.actor, self.tau)
        soft_update(self.target_critic, self.critic, self.tau)
class DDPGAgent:
    def __init__(self, state_size, action_size, hidden_layers, gamma, tau,
                 lr_actor, lr_critic, weight_decay, seed):
        """Initialize DDPG agent."""
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma
        self.tau = tau

        self.actor = Network(state_size, hidden_layers[0], hidden_layers[1], action_size, seed, actor=True).to(device)
        self.critic = Network(2 * (state_size + action_size), hidden_layers[2], hidden_layers[3], 1, seed).to(device)
        self.target_actor = Network(state_size, hidden_layers[0], hidden_layers[1], action_size, seed, actor=True).to(device)
        self.target_critic = Network(2 * (state_size + action_size), hidden_layers[2], hidden_layers[3], 1, seed).to(device)

        self.noise = OUNoise(action_size, seed, scale=1.0)

        '''
        # initialize targets same as original networks
        self.hard_update(self.target_actor, self.actor)
        self.hard_update(self.target_critic, self.critic)
        '''

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=weight_decay)

    def act(self, state):
        """Calculate actions under current policy for a specific agent."""
        state = torch.from_numpy(state).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().data.numpy()
        self.actor.train()
        action += self.noise.noise()
        return np.clip(action, -1, 1)

    def step_learn(self, experiences):
        """Update actor and critic using sampled experiences."""
        # states_list: list (length num_agents) of 2D tensors (batch_size * state_size)
        # actions_list: list (length num_agents) of 2D tensors (batch_size * action_size)
        # rewards: 2D tensor (batch_size * num_agents)
        # next_states_list: list (length num_agents) of 2D tensors (batch_size * state_size)
        # dones: 2D tensor (batch_size * num_agents)
        states_list, actions_list, rewards, next_states_list, dones = experiences

        next_full_states = torch.cat(next_states_list, dim=1).to(device)  # (batch_size, num_agents*state_size)
        full_states = torch.cat(states_list, dim=1).to(device)            # (batch_size, num_agents*state_size)
        full_actions = torch.cat(actions_list, dim=1).to(device)          # (batch_size, num_agents*action_size)

        # update critic
        # target actions are evaluated at the next states
        next_actions_list = [self.target_actor(next_states) for next_states in next_states_list]
        next_full_actions = torch.cat(next_actions_list, dim=1).to(device)
        Q_target_next = self.target_critic(next_full_states, next_full_actions)  # (batch_size, 1)
        '''
        Q_target = rewards[:, idx_agent].view(-1, 1) + \
            self.gamma * Q_target_next * (1.0 - dones[:, idx_agent].view(-1, 1))
        '''
        Q_target = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        Q_predict = self.critic(full_states, full_actions)  # (batch_size, 1)

        critic_loss = F.mse_loss(Q_predict, Q_target)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        '''
        action_pred = self.actor(states_list[idx_agent])  # (batch_size, action_size)
        actions_list_update = actions_list.copy()
        actions_list_update[idx_agent] = action_pred
        full_actions_update = torch.cat(actions_list_update, dim=1).to(device)  # (batch_size, num_agents*action_size)
        '''
        actions_pred = [self.actor(states) for states in states_list]
        actions_pred_tensor = torch.cat(actions_pred, dim=1).to(device)
        # actor_loss = -self.critic(full_states, full_actions_update).mean()
        actor_loss = -self.critic(full_states, actions_pred_tensor).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # soft update target networks
        self.soft_update(self.target_critic, self.critic, self.tau)
        self.soft_update(self.target_actor, self.actor, self.tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def reset(self):
        self.noise.reset()
class DDPGAgent:
    def __init__(self, in_actor, hidden_in_actor, hidden_out_actor, out_actor,
                 in_critic, hidden_in_critic, hidden_out_critic,
                 lr_actor=1.0e-4, lr_critic=1.0e-4):
        super(DDPGAgent, self).__init__()
        self.state_size = in_actor
        self.action_size = out_actor

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1).to(device)

        self.noise = OUNoise(out_actor, scale=1.0)
        # self.noise = OUNoise(action_size)  # single agent only
        self.noise_scale = NOISE_START

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1.e-5)

    # def act(self, obs, noise=0.0):
    #     obs = obs.to(device)
    #     action = self.actor(obs) + noise*self.noise.noise().to(device)
    #     return action

    def act(self, states, i_episode, add_noise=True):
        """Returns actions for given state as per current policy."""
        if self.noise_scale > NOISE_END:
            # self.noise_scale *= NOISE_REDUCTION
            self.noise_scale = NOISE_REDUCTION ** (i_episode - EPISODES_BEFORE_TRAINING)
        # else keep the previous value

        if not add_noise:
            self.noise_scale = 0.0

        # states = torch.from_numpy(states).float().to(DEVICE)
        self.actor.eval()
        with torch.no_grad():
            actions = self.actor(states)  # .cpu().data.numpy()
        self.actor.train()

        # add noise
        actions += self.noise_scale * self.add_noise2()  # works much better than OU Noise process
        # actions += self.noise_scale * self.noise.noise()
        return actions

    def add_noise2(self):
        # noise = 0.5*np.random.randn(self.action_size)  # sigma of 0.5, as sigma of 1 will have a lot of actions just clipped
        noise = 0.5 * torch.rand(self.action_size).to(device)
        return noise

    def reset(self):
        self.noise.reset()

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs) + noise * self.noise.noise().to(device)
        return action
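# NOISE_START, NOISE_END, NOISE_REDUCTION and EPISODES_BEFORE_TRAINING are module-level
# hyperparameters the two noise-decay variants above expect to exist; they are not
# defined anywhere in this section. Purely illustrative values:
NOISE_START = 1.0
NOISE_END = 0.1
NOISE_REDUCTION = 0.999
EPISODES_BEFORE_TRAINING = 300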
class DDPGAgent:
    def __init__(self, in_actor, out_actor, in_critic, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        hidden_in_actor = 64
        hidden_out_actor = 128
        hidden_in_critic = hidden_in_actor
        hidden_out_critic = hidden_out_actor

        self.actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1, out_actor, actor=False).to(device)
        self.target_actor = Network(in_actor, hidden_in_actor, hidden_out_actor, out_actor, actor=True).to(device)
        self.target_critic = Network(in_critic, hidden_in_critic, hidden_out_critic, 1, out_actor, actor=False).to(device)

        self.noise = OUNoise(out_actor, scale=0.9)  # scale 1.0
        self.noise_shape = out_actor

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        WD = 1e-5
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=WD)

    def reset(self):
        self.noise.reset()

    def noisef(self, mean=0, sigma=0.08):
        return np.random.normal(mean, sigma, self.noise_shape)

    def act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.actor(obs).cpu().data.numpy() + noise * self.noisef()  # self.noise.noise()
        return action

    def target_act(self, obs, noise=0.0):
        obs = obs.to(device)
        action = self.target_actor(obs).cpu()  # + noise * self.noisef()  # self.noise.noise()
        return action
class DDPGAgent:
    def __init__(self, state_size, action_size, hidden_in_dim, hidden_out_dim, extrem_out=64,
                 num_agents=2, lr_actor=1.0e-4, lr_critic=1.0e-3):
        super(DDPGAgent, self).__init__()
        critic_state_size = state_size * num_agents
        critic_action_size = action_size * num_agents

        self.actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                             hidden_extrem_out=extrem_out, actor=True).to(device)
        self.critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                              hidden_extrem_out=extrem_out).to(device)
        self.target_actor = Network(state_size, action_size, hidden_in_dim, hidden_out_dim,
                                    hidden_extrem_out=extrem_out, actor=True).to(device)
        self.target_critic = Network(critic_state_size, critic_action_size, hidden_in_dim, hidden_out_dim,
                                     hidden_extrem_out=extrem_out).to(device)
        self.noise = OUNoise(action_size, scale=1.0)

        # initialize targets same as original networks
        hard_update(self.target_actor, self.actor)
        hard_update(self.target_critic, self.critic)

        self.actor_optimizer = Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=0)

        print("critic", self.critic, self.target_critic, "optim", self.critic_optimizer)
        print("actor", self.actor, self.target_actor, "optim", self.actor_optimizer)

    def act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.actor.eval()
        act = self.actor(obs, batch=batch).cpu().data
        no = noise * self.noise.noise()
        # print("act", act, "noise", no)
        action = act + no
        self.actor.train()
        return np.clip(action, -1, 1)

    def target_act(self, obs, noise=0.0, batch=True):
        obs = obs.to(device)
        self.target_actor.eval()
        action = self.target_actor(obs, batch=batch).cpu().data + noise * self.noise.noise()
        self.target_actor.train()  # switch back to training mode
        return np.clip(action, -1, 1)
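# Network is instantiated throughout this section but never shown. A minimal sketch
# matching the most common constructor used above, (input_dim, hidden_in_dim,
# hidden_out_dim, output_dim, actor=...); the variants that pass a seed, convolutional
# parameters, hidden_extrem_out, or separate state/action inputs to the critic assume
# different definitions and are not covered by this sketch.
import torch
import torch.nn as nn
import torch.nn.functional as F


class Network(nn.Module):
    def __init__(self, input_dim, hidden_in_dim, hidden_out_dim, output_dim, actor=False):
        super(Network, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_in_dim)
        self.fc2 = nn.Linear(hidden_in_dim, hidden_out_dim)
        self.fc3 = nn.Linear(hidden_out_dim, output_dim)
        self.actor = actor

    def forward(self, x):
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        if self.actor:
            # actor head: bounded continuous actions in [-1, 1]
            return torch.tanh(self.fc3(h2))
        # critic head: unbounded scalar Q-value
        return self.fc3(h2)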