import copy

import numpy as np
import torch
import torch.nn.functional as F

# BaseAgent, GaussianActor, DeterministicActor, SquashedGaussianActor,
# QvalueCritic, DoubleQvalueCritic, and ReplayBuffer are assumed to be
# provided by the project's own modules (not shown in this excerpt).


class Custom(BaseAgent):
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=5000,
                 expl_noise=0.1,
                 batch_size=128,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):
        super(Custom, self).__init__(env, device)

        self.actor = GaussianActor(self.obs_dim, self.act_dim,
                                   self.act_limit, **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.behavior = GaussianActor(self.obs_dim, self.act_dim,
                                      self.act_limit, **kwargs).to(self.device)
        self.behavior_optimizer = torch.optim.Adam(self.behavior.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0
        self.c_loss, self.a_loss = [], []

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs, True).cpu().data.numpy().flatten()

    def behavior_init(self, iteration=1000):
        # NOTE: behavior-policy pretraining is not implemented here;
        # this currently only draws a batch from the replay buffer.
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)
        self.total_it += 1
        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        c_loss = critic_loss.item()
        self.critic_optimizer.step()

        a_loss = 0
        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Freeze critic parameters to avoid unnecessary gradients
            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            a_loss = actor_loss.item()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

        return c_loss, a_loss

    def step(self, t):
        c, a = self.train()
        self.c_loss.append(c)
        self.a_loss.append(a)
        if t % 100 == 0:
            # self.evaluate(self.env)
            # a_loss is zero on the delayed (non-update) iterations, so scale
            # the mean by policy_freq to report the average over actual actor updates.
            print(f'Iteration {t}: Critic Loss: {np.mean(self.c_loss)}, '
                  f'Actor Loss: {np.mean(self.a_loss) * self.policy_freq}')
            self.c_loss, self.a_loss = [], []
        self.episode_timesteps += 1
class DDPG(BaseAgent):
    def __init__(self,
                 env,
                 lr=1e-3,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=1000,
                 expl_noise=0.1,
                 batch_size=256,
                 device=None):
        super(DDPG, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim,
                                        self.act_limit).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = QvalueCritic(self.obs_dim, self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(self.batch_size)

        # Compute the target Q value
        target_Q = self.critic_target(next_obs, self.actor_target(next_obs))
        target_Q = reward + (1 - done) * self.gamma * target_Q.detach()

        # Get current Q estimate
        current_Q = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Compute actor loss
        actor_loss = -self.critic(obs, self.actor(obs)).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the frozen target models
        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

        for param, target_param in zip(self.actor.parameters(),
                                       self.actor_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (self.actor.act(
                torch.tensor(self.obs, dtype=torch.float32,
                             device=self.device)) +
                      np.random.normal(0, self.act_limit * self.expl_noise,
                                       size=self.act_dim)).clip(
                                           -self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
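# Custom, DDPG, and TD3 above rely on a ReplayBuffer(obs_dim, act_dim, buffer_size)
# whose sample() returns device-resident tensors in the order
# (obs, action, reward, next_obs, done); the class itself is not shown in this
# excerpt (and SAC below constructs its buffer differently). The following is a
# minimal sketch matching that interface; the storage layout, the `device`
# argument, and the uniform sampling are assumptions, not the project's code.
import numpy as np
import torch


class ReplayBufferSketch:
    """Minimal FIFO replay buffer sketch (layout and device handling assumed)."""

    def __init__(self, obs_dim, act_dim, max_size=int(1e6), device='cpu'):
        self.max_size = max_size
        self.ptr, self.size = 0, 0
        self.device = device
        self.obs = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.action = np.zeros((max_size, act_dim), dtype=np.float32)
        self.reward = np.zeros((max_size, 1), dtype=np.float32)
        self.next_obs = np.zeros((max_size, obs_dim), dtype=np.float32)
        self.done = np.zeros((max_size, 1), dtype=np.float32)

    def add(self, obs, action, reward, next_obs, done):
        # Overwrite the oldest transition once the buffer is full
        self.obs[self.ptr] = obs
        self.action[self.ptr] = action
        self.reward[self.ptr] = reward
        self.next_obs[self.ptr] = next_obs
        self.done[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample(self, batch_size):
        # Uniformly sample transition indices and return torch tensors
        idx = np.random.randint(0, self.size, size=batch_size)
        to_tensor = lambda x: torch.tensor(x[idx], dtype=torch.float32,
                                           device=self.device)
        return (to_tensor(self.obs), to_tensor(self.action),
                to_tensor(self.reward), to_tensor(self.next_obs),
                to_tensor(self.done))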
class TD3(BaseAgent):
    def __init__(self,
                 env,
                 lr=3e-4,
                 gamma=0.99,
                 tau=0.005,
                 buffer_size=int(1e6),
                 start_timesteps=1000,
                 expl_noise=0.1,
                 batch_size=100,
                 policy_noise=0.2,
                 noise_clip=0.5,
                 policy_freq=2,
                 device=None,
                 **kwargs):
        super(TD3, self).__init__(env, device)

        self.actor = DeterministicActor(self.obs_dim, self.act_dim,
                                        self.act_limit, **kwargs).to(self.device)
        self.actor_target = copy.deepcopy(self.actor)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim,
                                         **kwargs).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        self.replay_buffer = ReplayBuffer(self.obs_dim, self.act_dim, buffer_size)

        self.start_timesteps = start_timesteps
        self.expl_noise = expl_noise
        self.batch_size = batch_size
        self.lr = lr
        self.gamma = gamma
        self.tau = tau
        self.policy_noise = policy_noise
        self.noise_clip = noise_clip
        self.policy_freq = policy_freq
        self.total_it = 0

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        return self.actor(obs).cpu().data.numpy().flatten()

    def train(self):
        obs, action, reward, next_obs, done = self.replay_buffer.sample(
            self.batch_size)
        self.total_it += 1
        cur_action = self.actor(obs)

        with torch.no_grad():
            # Select action according to policy and add clipped noise
            noise = (torch.randn_like(action) * self.policy_noise).clamp(
                -self.noise_clip, self.noise_clip)
            next_action = (self.actor_target(next_obs) + noise).clamp(
                -self.act_limit, self.act_limit)

            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * target_Q

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Delayed policy updates
        if self.total_it % self.policy_freq == 0:
            # Freeze critic parameters to avoid unnecessary gradient computation
            for param in self.critic.parameters():
                param.requires_grad = False

            # Compute actor loss
            actor_loss = -self.critic.Q1(obs, cur_action).mean()
            # target = 1 / (2 * 0.2) * grad(self.critic.Q1(obs, cur_action).mean(), cur_action)[0].detach() + self.actor_target(obs).detach()
            # actor_loss = F.mse_loss(target, cur_action)

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            for param in self.critic.parameters():
                param.requires_grad = True

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(),
                                           self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

            for param, target_param in zip(self.actor.parameters(),
                                           self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data +
                                        (1 - self.tau) * target_param.data)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        if t < self.start_timesteps:
            action = self.env.action_space.sample()
        else:
            action = (self.actor.act(
                torch.tensor(self.obs, dtype=torch.float32,
                             device=self.device)) +
                      np.random.normal(0, self.act_limit * self.expl_noise,
                                       size=self.act_dim)).clip(
                                           -self.act_limit, self.act_limit)

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, reward, next_obs, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data
        # TODO: extra training to compensate for init_timesteps?
        if t > self.start_timesteps:
            self.train()

        if done:
            self.episode_end_handle(t)
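# TD3 and Custom call the critic both as critic(obs, action), expecting a pair
# (Q1, Q2), and as critic.Q1(obs, action) for the delayed actor update. The
# project's DoubleQvalueCritic is not shown; below is a minimal sketch of a twin
# Q-network with that interface. The hidden sizes and activations are assumptions.
import torch
import torch.nn as nn


class DoubleQvalueCriticSketch(nn.Module):
    """Twin Q-networks: forward() returns (Q1, Q2); Q1() returns only the first head."""

    def __init__(self, obs_dim, act_dim, hidden=256):
        super().__init__()
        self.q1 = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))
        self.q2 = nn.Sequential(
            nn.Linear(obs_dim + act_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1))

    def forward(self, obs, action):
        # Concatenate state and action and evaluate both heads
        sa = torch.cat([obs, action], dim=-1)
        return self.q1(sa), self.q2(sa)

    def Q1(self, obs, action):
        # Single-head evaluation used for the (cheaper) actor update
        return self.q1(torch.cat([obs, action], dim=-1))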
class SAC(BaseAgent):
    def __init__(self,
                 env,
                 buffer_size=int(1e6),
                 gamma=0.99,
                 tau=0.005,
                 lr=1e-3,
                 start_timesteps=1000,
                 actor_train_freq=2,
                 batch_size=128,
                 init_temperature=0.1,
                 device=None):
        super(SAC, self).__init__(env, device)

        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=lr)

        self.critic = DoubleQvalueCritic(self.obs_dim, self.act_dim).to(self.device)
        self.critic_target = copy.deepcopy(self.critic)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=lr)

        # Adjustable alpha (entropy temperature)
        self.log_alpha = torch.tensor(np.log(init_temperature),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        self.replay_buffer = ReplayBuffer(buffer_size)

        self.start_timesteps = start_timesteps
        self.tau = tau
        self.gamma = gamma
        self.alpha = self.log_alpha.exp()
        self.actor_train_freq = actor_train_freq
        self.batch_size = batch_size

    def offline_initialize(self, replay_buffer, epoch=1):
        conf = 2  # number of standard deviations used for the support bounds

        # PPO-style mini-batch training
        critic_losses, actor_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        print(replay_buffer.size)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                # Slice the shuffled indices for this mini-batch
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done, next_action = replay_buffer.sample(
                    self.batch_size, True, idx)

                # SARSA-style policy evaluation
                # with torch.no_grad():
                #     # Compute the target Q value
                #     target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                #     target_Q = torch.min(target_Q1, target_Q2)
                #     target_Q = reward + (1 - done) * self.gamma * target_Q
                # # Get current Q estimates
                # current_Q1, current_Q2 = self.critic(obs, action)
                # # Compute critic loss
                # critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)
                # critic_losses.append(critic_loss.item())
                # # Optimize the critic
                # self.critic_optimizer.zero_grad()
                # critic_loss.backward()
                # self.critic_optimizer.step()

                # Behavior cloning under entropy regularization
                _, logprob = self.actor(obs)
                # atanh: recover the pre-squash action from the stored (tanh-ed) action
                _action = 0.5 * torch.log((1 + action) / (1 - action))
                # actor_loss = (self.alpha * logprob - self.actor.logprob(obs, _action)).mean()
                actor_loss = -self.actor.logprob(obs, _action).mean()
                # print(action, _action)
                actor_losses.append(actor_loss.item())
                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                self.actor_optimizer.step()

                # alpha_loss = (self.log_alpha * (-logprob - self.target_entropy).detach()).mean()
                # self.alpha_optimizer.zero_grad()
                # alpha_loss.backward()
                # self.alpha_optimizer.step()
                # self.alpha = self.log_alpha.exp()

                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

            print(f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, '
                  f'Actor Loss: {np.mean(actor_losses)}')
            critic_losses, actor_losses = [], []

        # Approximate the support with the learned policy
        self.lower_bound = np.zeros((replay_buffer.size, self.act_dim))
        self.upper_bound = np.zeros((replay_buffer.size, self.act_dim))
        idxes = np.arange(replay_buffer.size)
        for _ in range(epoch):
            for i in range(int(np.ceil(replay_buffer.size / self.batch_size))):
                idx = idxes[i * self.batch_size:(i + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                mu, std = self.actor.mu_std(obs)
                self.lower_bound[i * self.batch_size:(i + 1) * self.batch_size] = mu - conf * std
                self.upper_bound[i * self.batch_size:(i + 1) * self.batch_size] = mu + conf * std

    def offline_improve(self, replay_buffer, epoch=10):
        self.actor = SquashedGaussianActor(self.obs_dim, self.act_dim,
                                           self.act_limit).to(self.device)
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=1e-3)
        self.actor_target = copy.deepcopy(self.actor)

        # Adjustable alpha
        self.log_alpha = torch.tensor(np.log(0.1),
                                      requires_grad=True,
                                      device=self.device)
        self.target_entropy = -torch.prod(
            torch.Tensor(self.env.action_space.shape).to(self.device)).item()
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha],
                                                lr=1e-4,
                                                betas=(0.5, 0.999))

        actor_losses, critic_losses = [], []
        idxes = np.arange(replay_buffer.size - 1)
        for i in range(epoch):
            np.random.shuffle(idxes)
            for j in range(replay_buffer.size // self.batch_size):
                # Slice the shuffled indices for this mini-batch
                idx = idxes[j * self.batch_size:(j + 1) * self.batch_size]
                obs, action, reward, next_obs, done = replay_buffer.sample(
                    self.batch_size, with_idxes=idx)
                if j % 100 == 0:
                    self.evaluate(self.env)

                # SARSA-style policy evaluation
                with torch.no_grad():
                    # No constraint
                    # next_action, logprob = self.actor(next_obs)
                    # target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    # target_Q = torch.min(target_Q1, target_Q2)
                    # target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Probabilistic constraint
                    # mu, std = self.actor.mu_std(next_obs)
                    # a, b = (self.lower_bound[idx + 1] - mu) / std, (self.upper_bound[idx + 1] - mu) / std
                    # dist = truncnorm(a, b, loc=mu, scale=std)
                    # next_action = torch.tensor(dist.rvs(), dtype=torch.float32, device=self.device)
                    # logprob = self.actor.logprob(next_obs, next_action)
                    # target_Q1, target_Q2 = self.critic_target(next_obs, torch.tanh(next_action))
                    # target_Q = torch.min(target_Q1, target_Q2)
                    # target_Q = reward + (1 - done) * self.gamma * (target_Q - self.alpha * logprob)

                    # Q-learning constraint
                    mu, std = self.actor_target.mu_std(next_obs)
                    next_action = mu  # np.clip(mu, self.lower_bound[idx + 1], self.upper_bound[idx + 1])
                    next_action = torch.tensor(self.act_limit * np.tanh(next_action),
                                               dtype=torch.float32,
                                               device=self.device)
                    target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
                    target_Q = torch.min(target_Q1, target_Q2)
                    target_Q = reward + (1 - done) * self.gamma * target_Q  # (target_Q - self.alpha * logprob)

                # Get current Q estimates
                current_Q1, current_Q2 = self.critic(obs, action)

                # Compute critic loss
                critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
                    current_Q2, target_Q)
                critic_losses.append(critic_loss.item())

                # Optimize the critic
                self.critic_optimizer.zero_grad()
                critic_loss.backward()
                self.critic_optimizer.step()

                # Behavior cloning under entropy regularization
                # Freeze critic parameters to prevent unnecessary backpropagation
                for param in self.critic.parameters():
                    param.requires_grad = False

                cur_action, _ = self.actor.mu_std(obs, False)
                cur_action = torch.tanh(cur_action)
                current_Q1, current_Q2 = self.critic(obs, cur_action)
                current_Q = torch.min(current_Q1, current_Q2)
                actor_loss = -current_Q.mean()
                # cur_action, logprob = self.actor(obs, detach=True)
                # current_Q1, current_Q2 = self.critic(obs, cur_action)
                # current_Q = torch.min(current_Q1, current_Q2)
                # actor_std_loss = (self.alpha * logprob - current_Q).mean()
                # actor_loss = actor_std_loss + actor_mu_loss
                # cur_action, logprob = self.actor(obs, detach=True)
                # current_Q1, current_Q2 = self.critic(obs, cur_action)
                # current_Q = torch.min(current_Q1, current_Q2)
                # actor_loss = (self.alpha * logprob - current_Q).mean()

                self.actor_optimizer.zero_grad()
                actor_loss.backward()
                actor_losses.append(-current_Q.mean().item())
                self.actor_optimizer.step()

                for param in self.critic.parameters():
                    param.requires_grad = True

                # alpha_loss = (self.log_alpha * (-logprob - 3 * self.target_entropy).detach()).mean()
                # self.alpha_optimizer.zero_grad()
                # alpha_loss.backward()
                # self.alpha_optimizer.step()
                # self.alpha = self.log_alpha.exp()

                for param, target_param in zip(self.critic.parameters(),
                                               self.critic_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

                for param, target_param in zip(self.actor.parameters(),
                                               self.actor_target.parameters()):
                    target_param.data.copy_(self.tau * param.data +
                                            (1 - self.tau) * target_param.data)

            print(f'Epoch {i} Critic Loss: {np.mean(critic_losses)}, '
                  f'Actor Loss: {np.mean(actor_losses)}')
            critic_losses, actor_losses = [], []

    def train(self, obs, action, next_obs, reward, done):
        with torch.no_grad():
            next_action, logprob = self.actor(next_obs)
            # Compute the target Q value
            target_Q1, target_Q2 = self.critic_target(next_obs, next_action)
            target_Q = torch.min(target_Q1, target_Q2)
            target_Q = reward + (1 - done) * self.gamma * (
                target_Q - self.alpha * logprob)

        # Get current Q estimates
        current_Q1, current_Q2 = self.critic(obs, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(
            current_Q2, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        cur_action, logprob = self.actor(obs)

        # Freeze critic parameters to prevent unnecessary backpropagation
        for param in self.critic.parameters():
            param.requires_grad = False

        current_Q1, current_Q2 = self.critic(obs, cur_action)
        current_Q = torch.min(current_Q1, current_Q2)
        actor_loss = (self.alpha * logprob - current_Q).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        for param in self.critic.parameters():
            param.requires_grad = True

        alpha_loss = (self.log_alpha *
                      (-logprob - self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        alpha_loss.backward()
        self.alpha_optimizer.step()
        self.alpha = self.log_alpha.exp()

        for param, target_param in zip(self.critic.parameters(),
                                       self.critic_target.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def act(self, obs):
        obs = torch.tensor(obs, dtype=torch.float32, device=self.device)
        obs = obs.reshape(1, -1)
        return self.actor.act(obs, deterministic=True)

    def step(self, t):
        self.episode_timesteps += 1

        # Select action randomly or according to policy
        # if t < self.start_timesteps:  # or t > self.start_timesteps:
        #     action = self.env.action_space.sample()
        # else:
        #     action = self.actor.act(torch.tensor(self.obs, dtype=torch.float32, device=self.device))
        action = self.actor.act(
            torch.tensor(self.obs, dtype=torch.float32, device=self.device))

        # Perform action
        next_obs, reward, done, _ = self.env.step(action)
        done_bool = float(done)  # if self.episode_timesteps < self.env._max_episode_steps else 0

        # Store data in replay buffer
        self.replay_buffer.add(copy.deepcopy(self.obs), action, next_obs, reward, done_bool)
        self.obs = next_obs
        self.episode_reward += reward

        # Train agent after collecting sufficient data; extra training
        # iterations are run once start_timesteps is first reached
        if t == self.start_timesteps:
            for _ in range(self.start_timesteps):
                batch = self.replay_buffer.sample(self.batch_size)
                self.train(*batch)
        elif t > self.start_timesteps:
            batch = self.replay_buffer.sample(self.batch_size)
            self.train(*batch)

        if done:
            self.episode_end_handle(t)
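# SAC above expects a SquashedGaussianActor whose forward() returns
# (action, log_prob) and whose act() returns a numpy action (optionally
# deterministic). The project's actor is not included here; the sketch below
# covers only that part of the interface (mu_std() and logprob() are omitted),
# and the hidden sizes, log-std clamping, and act_limit scaling are assumptions.
import torch
import torch.nn as nn
from torch.distributions import Normal


class SquashedGaussianActorSketch(nn.Module):
    """Tanh-squashed Gaussian policy sketch with the change-of-variables
    log-prob correction used by SAC-style critics."""

    def __init__(self, obs_dim, act_dim, act_limit, hidden=256):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(obs_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU())
        self.mu_layer = nn.Linear(hidden, act_dim)
        self.log_std_layer = nn.Linear(hidden, act_dim)
        self.act_limit = act_limit

    def forward(self, obs):
        h = self.net(obs)
        mu = self.mu_layer(h)
        log_std = self.log_std_layer(h).clamp(-20, 2)
        dist = Normal(mu, log_std.exp())
        pre_tanh = dist.rsample()  # reparameterized sample
        action = torch.tanh(pre_tanh)
        # Gaussian log-prob plus the tanh change-of-variables correction
        logprob = dist.log_prob(pre_tanh).sum(-1, keepdim=True)
        logprob -= torch.log(1 - action.pow(2) + 1e-6).sum(-1, keepdim=True)
        return self.act_limit * action, logprob

    @torch.no_grad()
    def act(self, obs, deterministic=False):
        if deterministic:
            mu = self.mu_layer(self.net(obs))
            return (self.act_limit * torch.tanh(mu)).cpu().numpy().flatten()
        action, _ = self.forward(obs)
        return action.cpu().numpy().flatten()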
        memory.merge(batch)
        actor_infos[actor_id].append(ainfo)
        log("receive replay - memory size {} elapse {:.2f}".format(
            len(memory), time.time() - st))

    # If the learner requested a batch, send one
    payload = async_recv(learner)
    if payload is not None:
        st = time.time()
        # Batch and error info sent back to the learner
        if len(actor_infos) > 0:
            ainfos = average_actor_info(actor_infos)
        else:
            ainfos = None

        if len(memory) < START_SIZE:
            payload = b'not enough'
            log("not enough data - memory size {}".format(len(memory)))
        else:
            # Enough data: sample a batch and send it
            binfo = BufferInfo(len(memory))
            batch = memory.sample(NUM_BATCH)
            payload = pickle.dumps((batch, ainfos, binfo))

        # Send to the learner
        learner.send(payload)
        log("send batch elapse {:.2f}".format(time.time() - st))
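# async_recv is not defined in this excerpt. Given the learner.send(payload)
# call above, it presumably wraps a non-blocking receive on a ZeroMQ socket and
# returns None when no request is pending; a sketch under that assumption:
import zmq


def async_recv_sketch(sock):
    """Non-blocking receive: return the pending message, or None if the socket
    has nothing to deliver right now (hypothetical helper)."""
    try:
        return sock.recv(flags=zmq.NOBLOCK)
    except zmq.Again:
        return None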
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)

        # Keep track of time step
        self.t = 0

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Update time step
        self.t += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > BATCH_SIZE:
            if self.t % UPDATE_EVERY == 0:
                for _ in range(NUM_UPDATES):
                    experiences = self.memory.sample()
                    self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
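# The OUNoise process used by Agent is not included in this excerpt. Below is a
# common Ornstein-Uhlenbeck implementation matching the OUNoise(action_size, seed)
# constructor and the reset()/sample() calls above; the mu, theta, and sigma
# defaults are assumptions.
import copy
import random

import numpy as np


class OUNoiseSketch:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.seed = random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as noise."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state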