class Agent():
    '''This agent interacts with the environment to learn a policy that yields
    the highest cumulative reward. The agent uses the Deep Deterministic Policy
    Gradient (DDPG) algorithm.'''

    def __init__(self, state_size, action_size, seed=0):
        '''Initialize the Agent.

        Parameters
        ----------
        state_size : int
            The dimension of each state.
        action_size : int
            The dimension of each action.
        seed : int
            The random seed used to generate random numbers.
        '''
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)

        # The actor gives the best action for a given state
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)

        # The critic evaluates the action
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)

        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=ACTOR_LEARNING_RATE)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=CRITIC_LEARNING_RATE,
                                           weight_decay=WEIGHT_DECAY)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)

        # Noise process
        self.noise = OUNoise(action_size, seed)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        '''Instructs the agent to take a step in the environment.

        Executes each time the agent takes a step in the environment.
        The observed (state, action, reward, next_state, done) tuple is saved
        in the replay buffer. Once enough experiences have been captured, the
        model is trained.

        Parameters
        ----------
        state : array_like
            The current state.
        action : array_like
            The action that was taken.
        reward : float
            The reward that was received.
        next_state : array_like
            The next state.
        done : bool
            True if the episode is complete, else False.
        '''
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.train_model_parameters(experiences)

    def get_action(self, state, epsilon=0, add_noise=True):
        '''Gets the action for the given state under the current policy.

        To explore in the continuous action space, noise is added to the action.

        Parameters
        ----------
        state : array_like
            The current state.
        epsilon : float
            The epsilon value used for epsilon-greedy action selection.
        add_noise : bool
            Add noise to the action to encourage exploration.

        Returns
        -------
        action : array_like
            The action to take. Each value is between -1 and 1.
        '''
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def train_model_parameters(self, experiences):
        '''Update the model parameters using the given batch of experience tuples.

        The models are trained via the actor-critic paradigm. The next action
        is obtained from the target actor and passed to the target critic to
        obtain the Q-value of the next state. The Q-value target for the
        current state is then calculated via the Bellman equation, and the
        local critic is updated towards it. The local actor predicts the
        actions for the current states, and its loss is the negative mean of
        the local critic's Q-value estimate for those predicted actions.

        Parameters
        ----------
        experiences : Tuple[torch.Tensor]
            A tuple of (state, action, reward, next_state, done) batches.
        '''
        states, actions, rewards, next_states, dones = experiences

        # Update critic
        next_actions = self.actor_target(next_states)
        Q_next_states = self.critic_target(next_states, next_actions)
        Q_states = rewards + GAMMA * Q_next_states * (1 - dones)
        Q_states_estimated = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_states_estimated, Q_states)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self._update_model_parameters(self.critic_local, self.critic_target)
        self._update_model_parameters(self.actor_local, self.actor_target)

    def _update_model_parameters(self, local_network, target_network):
        '''Soft-update the target network towards the learned local network.

        The target parameters are moved a fraction TAU towards the learned
        local parameters. This reduces harmful correlations by letting the
        learning target move slowly instead of jumping to the latest weights.
        '''
        for target_param, local_param in zip(target_network.parameters(), local_network.parameters()):
            target_param.data.copy_(TAU * local_param.data + (1 - TAU) * target_param.data)
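
# The OUNoise process used by the agents in this section is referenced but never
# defined. Below is a minimal sketch of a standard Ornstein-Uhlenbeck noise
# process, assuming the OUNoise(size, seed) constructor seen above and the
# mu=0, theta=0.15, sigma=0.2 defaults from the DDPG paper; the original
# implementation may differ.
import copy
import random
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process by dx = theta*(mu - x) + sigma*N(0, 1) and return the state."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state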
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done): """Save experience in replay memory, and use random sample from buffer to learn.""" # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, add_noise=True): """Returns actions for given state as per current policy.""" state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() logging.warning(action) return np.clip(action, 0.0000001, 7.0) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """Update policy and value parameters using given batch of experience tuples. Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
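
# The ReplayBuffer these agents depend on is likewise assumed rather than shown.
# A minimal sketch matching the ReplayBuffer(action_size, buffer_size, batch_size,
# seed) constructor and the add()/sample()/__len__() calls used above; the field
# order, dtypes, and the module-level `device` are assumptions carried over from
# the agents, not the source's implementation.
import random
from collections import deque, namedtuple
import numpy as np
import torch

class ReplayBuffer:
    """Fixed-size buffer that stores experience tuples and samples them uniformly."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Append a new experience tuple to the buffer."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Draw a uniform random batch and stack it into torch tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        """Current number of stored experiences."""
        return len(self.memory)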
class DDPG_Agent():

    def __init__(self, state_size, action_size, num_agents):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 1
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Noise process
        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

        # Critic Networks
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    def step(self, states, actions, rewards, next_states, dones):
        """
        Add each agent's experience to the replay buffer, then sample randomly
        from that buffer to learn (the random sampling breaks the correlation
        between sequential experiences).
        """
        # Save experience
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])

        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        """Returns actions for the given states as per the current policy."""
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                # Populate the list of actions one state at a time
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            # We add noise for exploration purposes
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """
        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        ### Update critic
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Calculate Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Gradient clipping to stabilize learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        ### Update actor
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        ### Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, regular_model, target_model, tau):
        """
        regular_model: the most up-to-date model, as it is the one used for training
        target_model: the most stable model; we soft-copy the weights of the
            regular model to it
        tau (float): interpolation parameter
        """
        for target_param, regular_param in zip(target_model.parameters(), regular_model.parameters()):
            target_param.data.copy_(tau * regular_param.data + (1.0 - tau) * target_param.data)
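
# A hypothetical episode loop for the multi-agent DDPG_Agent above, in the style
# its per-agent interface suggests. `env_reset` and `env_step` stand in for
# whatever returns per-agent observations, rewards, and done flags; they and the
# step limit below are assumptions for illustration, not the source's driver.
import numpy as np

def run_episode(agent, env_reset, env_step, max_t=1000):
    states = env_reset()                               # shape: (num_agents, state_size)
    agent.reset()                                      # reset the OU noise each episode
    scores = np.zeros(agent.num_agents)
    for t in range(max_t):
        actions = agent.act(states)                    # (num_agents, action_size), clipped to [-1, 1]
        next_states, rewards, dones = env_step(actions)
        agent.step(states, actions, rewards, next_states, dones)  # one buffer add per agent
        states = next_states
        scores += rewards
        if np.any(dones):
            break
    return scores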
class Agent: """ Interacts with and learns from the environment. """ def __init__(self, state_size, action_size, num_agents, random_seed): """ Initialize an Agent Params ====== state_size (int): state dimension action_size (int): action dimension num_agents (int): simultaneous running agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents random.seed(random_seed) # Actor Network and its target network self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network and its target network self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # Noise object self.noise = OUNoise((num_agents, action_size), random_seed) # Replay Memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, device, random_seed) # Initialize time step (for updating every UPDATE_NN_EVERY steps) self.t_step_nn = 0 # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps) self.t_step_mem_par = 0 # Initialize time step (for updating every UPDATE_MEM_EVERY steps) self.t_step_mem = 0 def step(self, state, action, reward, next_state, done): """ Save experience in replay memory, and use prioritized sample from buffer to learn. """ # Save memory for i in range(self.num_agents): self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i]) # Learn every UPDATE_NN_EVERY time steps. self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY if self.t_step_mem_par == 0: self.memory.update_parameters() if self.t_step_nn == 0: # Learn from memory if enough samples exist if self.memory.experience_count > EXPERIENCES_PER_SAMPLING: experiences = self.memory.sample() self.learn(experiences, GAMMA) if self.t_step_mem == 0: self.memory.update_memory_sampling() def act(self, states, add_noise=True): """ Returns actions for given state as per current policy. """ states = torch.from_numpy(states).float().to(device) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): action = self.actor_local(state).cpu().data.numpy() actions[i, :] = action self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones, indices = experiences # update Critic # Get next predicted state, actions, and Q values actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current state Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute Critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Update Actor actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update target networks self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # Update priorities delta = abs(Q_targets - Q_expected).detach().numpy() self.memory.update_priorities(delta, indices) @staticmethod def soft_update(local_model, target_model, tau): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_model_param, local_model_param in zip( target_model.parameters(), local_model.parameters()): target_model_param.data.copy_(tau * local_model_param.data + (1. - tau) * target_model_param.data)
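
# The prioritized buffer used by the agent above (update_priorities,
# update_memory_sampling, update_parameters) is not shown. A minimal sketch of
# the proportional prioritization its |TD-error| deltas imply: priorities are
# |delta| + eps, and sampling probabilities are p_i**alpha normalized over the
# buffer. The function name and the alpha/eps values are assumptions, not the
# source's implementation.
import numpy as np

def sampling_probabilities(td_errors, alpha=0.6, eps=1e-5):
    """Convert absolute TD errors into prioritized sampling probabilities."""
    priorities = (np.abs(td_errors) + eps) ** alpha
    return priorities / priorities.sum()

# Example: experiences with larger TD error are sampled more often.
probs = sampling_probabilities(np.array([0.1, 0.5, 2.0]))
indices = np.random.choice(len(probs), size=2, p=probs)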
class Actor_Critic:

    def __init__(self, n_features, action_bounds):
        self.n_features = n_features
        self.action_bounds = action_bounds

        self.eval_actor_net = Actor(n_features, action_bounds)
        self.load_weights(self.eval_actor_net)
        self.eval_actor_net.train()
        self.target_actor_net = Actor(n_features, action_bounds)
        self.target_actor_net.eval()

        self.eval_critic_net = Critic(n_features, action_bounds)
        self.load_weights(self.eval_critic_net)
        self.eval_critic_net.train()
        self.target_critic_net = Critic(n_features, action_bounds)
        self.target_critic_net.eval()

        self.memory = Memory(Config.MEMORY_CAPACITY)
        self.batch_size = Config.BATCH_SIZE
        self.tau = Config.REPLACEMENT_SOFT_TAU

        # We need a good teacher, so the critic (the teacher) should learn faster than the actor
        self.optimizer_actor = torch.optim.Adam(self.eval_actor_net.parameters(), Config.LR_ACTOR, (0.9, 0.99))
        self.optimizer_critic = torch.optim.Adam(self.eval_critic_net.parameters(), Config.LR_CRITIC, (0.9, 0.99))
        self.gamma = Config.REWARD_DECAY

    def load_weights(self, net):
        # Parameter names in net.state_dict() look like 'layers.1.weight'
        for m in net.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 1)
                nn.init.constant_(m.bias, 0.1)

    def store_transition(self, s, a, r, s_):
        self.memory.store([s, a, r, s_])

    def chose_action(self, s):
        s = torch.Tensor(np.expand_dims(s, axis=0))
        action = self.eval_actor_net(s).detach().squeeze(dim=0)
        return action

    def learn(self):
        # Equivalent soft-update variants, kept for reference:
        # for x in self.Actor_target.state_dict().keys():
        #     eval('self.Actor_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Actor_target.' + x + '.data.add_(TAU*self.Actor_eval.' + x + '.data)')
        # for x in self.Critic_target.state_dict().keys():
        #     eval('self.Critic_target.' + x + '.data.mul_((1-TAU))')
        #     eval('self.Critic_target.' + x + '.data.add_(TAU*self.Critic_eval.' + x + '.data)')
        # for target_param, param in zip(net_target.parameters(), net.parameters()):
        #     target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

        # Soft-update the target networks towards the eval networks
        for k, v in self.eval_critic_net.state_dict().items():
            self.target_critic_net.state_dict()[k].copy_(
                self.tau * v + (1 - self.tau) * self.target_critic_net.state_dict()[k])
        for k, v in self.eval_actor_net.state_dict().items():
            self.target_actor_net.state_dict()[k].copy_(
                self.tau * v + (1 - self.tau) * self.target_actor_net.state_dict()[k])

        batch_data = self.memory.sample(self.batch_size)
        s0, a0, r1, s1 = zip(*batch_data)
        s0 = torch.tensor(s0, dtype=torch.float)
        a0 = torch.tensor(a0, dtype=torch.float).view(self.batch_size, len(self.action_bounds))
        r1 = torch.tensor(r1, dtype=torch.float).view(self.batch_size, -1)
        s1 = torch.tensor(s1, dtype=torch.float)

        # Input (s, a), output q
        q_s0_a0 = self.eval_critic_net(s0, a0)
        # Input (s_, a_), output q_ for q_target; a_ comes from the target actor
        a1 = self.target_actor_net(s1).detach()
        q_s1_a1 = self.target_critic_net(s1, a1).detach()
        q_target = r1 + self.gamma * q_s1_a1
        loss_critic = nn.MSELoss()(q_s0_a0, q_target)

        # Critic learning step:
        # td_error = R + GAMMA * ct(bs_, at(bs_)) - ce(s, ba) updates ce.
        # Here ba is the remembered action from memory; we push the Q produced
        # by ce towards Q_target so the evaluation becomes more accurate.
        # loss = (Q(st, at) - (rt + r*Q'(st+1, u'(st+1))))**2
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # Actor learning step:
        # https://zhuanlan.zhihu.com/p/84321382
        actor_a = self.eval_actor_net(s0)
        critic_q = self.eval_critic_net(s0, actor_a)
        # loss = -q = -ce(s, ae(s)) updates ae, with ae(s)=a and ae(s_)=a_.
        # If a is a good action, its Q should be closer to 0.
        loss_actor = -torch.mean(critic_q)
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        return loss_critic, loss_actor

    def draw_curve(self, loss):
        x = np.arange(1, len(loss) + 1)
        plt.title("cost curve")
        plt.xlabel("train step")
        plt.ylabel("cost")
        plt.plot(x, loss)
        plt.show()
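
# A hypothetical driver for the Actor_Critic class above, in the Pendulum style
# its (s, a, r, s_) transitions suggest. The classic gym API (env.reset()
# returning an observation, env.step() returning a 4-tuple), the action_bounds
# argument, and the warm-up threshold are all assumptions for illustration.
import gym

env = gym.make("Pendulum-v1")
agent = Actor_Critic(n_features=env.observation_space.shape[0],
                     action_bounds=[env.action_space.high[0]])
losses, steps = [], 0
for episode in range(100):
    s = env.reset()
    for t in range(200):
        a = agent.chose_action(s).numpy()
        s_, r, done, info = env.step(a)
        agent.store_transition(s, a, r, s_)
        steps += 1
        if steps > Config.BATCH_SIZE:          # learn once the buffer has warmed up
            loss_critic, _ = agent.learn()
            losses.append(loss_critic.item())
        s = s_
agent.draw_curve(losses)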
class DDPG_Agent():

    def __init__(self, state_size, action_size, num_agents):
        """
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents in the environment
        """
        random_seed = 10
        self.state_size = state_size
        self.action_size = action_size
        # Note: random.seed() returns None, so the seed value itself is stored
        # and passed on (the original assigned the None return value).
        self.random_seed = random_seed
        random.seed(random_seed)
        self.num_agents = num_agents

        # Replay memory
        self.memory = ReplayBuf(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Actor Networks
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Make sure the Actor Target Network has the same weight values as the Local Network
        for target, local in zip(self.actor_target.parameters(), self.actor_local.parameters()):
            target.data.copy_(local.data)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device)
        self.critic_target = Critic(state_size * num_agents, action_size * num_agents, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)
        # Single-agent alternative, kept for reference:
        # self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        # self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
        #                                    weight_decay=WEIGHT_DECAY)

        # Make sure the Critic Target Network has the same weight values as the Local Network
        for target, local in zip(self.critic_target.parameters(), self.critic_local.parameters()):
            target.data.copy_(local.data)

        self.noise = Ornstein_Uhlenbeck_Noise(action_size, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, state, noise=0.0):
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if ADD_NOISE:
            action += self.noise.sample() * noise
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        # Used only for DDPG (use madddpg.maddpg_learn() for MADDPG)
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
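
# Every snippet in this section assumes module-level hyperparameters defined
# elsewhere. The values below are illustrative, in the range commonly used for
# DDPG projects of this kind; the originals may differ.
import torch

BUFFER_SIZE = int(1e6)     # replay buffer size
BATCH_SIZE = 128           # minibatch size
GAMMA = 0.99               # discount factor
TAU = 1e-3                 # soft-update interpolation factor
LR_ACTOR = 1e-4            # actor learning rate (ACTOR_LEARNING_RATE in the first agent)
LR_CRITIC = 1e-3           # critic learning rate (CRITIC_LEARNING_RATE in the first agent)
WEIGHT_DECAY = 0           # L2 weight decay for the critic optimizer
UPDATE_EVERY = 20          # learn every N environment steps (first agent)
ADD_NOISE = True           # exploration toggle (last agent)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")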