import numpy as np
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class TD3:
    def __init__(self, Actor, Critic, action_space, replay_size=1000000,
                 critic_lr=1e-3, training=True, actor_lr=1e-3, gamma=0.99,
                 batch_size=100, tau=5e-3, update_freq=2, alpha=0.5, beta=0.5,
                 noise_std=0.1, noise_clip=0.5, seed=0):
        # Seed both torch and numpy for reproducibility.
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.critic1 = Critic().to(device)
        self.critic2 = Critic().to(device)
        self.actor = Actor().to(device)
        self.critic_target1 = Critic().to(device)
        self.critic_target2 = Critic().to(device)
        self.actor_target = Actor().to(device)
        self.critic_optim1 = torch.optim.Adam(self.critic1.parameters(), critic_lr)
        self.critic_optim2 = torch.optim.Adam(self.critic2.parameters(), critic_lr)
        self.actor_optim = torch.optim.Adam(self.actor.parameters(), actor_lr)
        self.gamma = gamma
        self.batch_size = batch_size
        self.action_size = action_space.shape[0]
        self.high = action_space.high
        self.low = action_space.low
        # PrioritizedReplayBuffer is assumed defined elsewhere in this repo;
        # a minimal interface sketch follows this class.
        self.replay = PrioritizedReplayBuffer(replay_size, batch_size, alpha)
        # Hard-copy the online weights into the target networks.
        for target_param, critic_param in zip(self.critic_target1.parameters(),
                                              self.critic1.parameters()):
            target_param.data.copy_(critic_param.data)
        for target_param, critic_param in zip(self.critic_target2.parameters(),
                                              self.critic2.parameters()):
            target_param.data.copy_(critic_param.data)
        for target_param, actor_param in zip(self.actor_target.parameters(),
                                             self.actor.parameters()):
            target_param.data.copy_(actor_param.data)
        self.noise_std = noise_std
        self.noise_clip = noise_clip
        self.beta = beta
        self.update_freq = update_freq
        self.tau = tau
        self.training = training

    def soft_update(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        for target_param, critic_param in zip(self.critic_target1.parameters(),
                                              self.critic1.parameters()):
            target_param.data.copy_(self.tau * critic_param.data +
                                    (1 - self.tau) * target_param.data)
        for target_param, critic_param in zip(self.critic_target2.parameters(),
                                              self.critic2.parameters()):
            target_param.data.copy_(self.tau * critic_param.data +
                                    (1 - self.tau) * target_param.data)
        for target_param, actor_param in zip(self.actor_target.parameters(),
                                             self.actor.parameters()):
            target_param.data.copy_(self.tau * actor_param.data +
                                    (1 - self.tau) * target_param.data)

    def store(self, state, action, reward, next_state, done):
        self.replay.store(state, action, reward, next_state, done)

    def train(self, t):
        if len(self.replay) < self.batch_size:
            return
        states, actions, rewards, next_states, dones, probs, indices = \
            self.replay.sample()
        # Importance-sampling weights for prioritized replay, normalized by the max.
        weights = (probs * len(self.replay)) ** (-self.beta)
        weights = weights / np.max(weights)
        weights = torch.tensor(weights, device=device, dtype=torch.float32)
        n = rewards.shape[1]  # n-step return horizon
        states = torch.tensor(states, device=device, dtype=torch.float32).unsqueeze(1)
        next_states = torch.tensor(next_states, device=device,
                                   dtype=torch.float32).unsqueeze(1)
        rewards = torch.tensor(rewards, device=device, dtype=torch.float32)
        gammas = torch.tensor([self.gamma ** i for i in range(n)],
                              dtype=torch.float32, device=device)
        dones = torch.tensor(dones, device=device, dtype=torch.float32)
        # TD3 actions are continuous, so they are kept as floats.
        actions = torch.tensor(actions, device=device, dtype=torch.float32)

        with torch.no_grad():
            # Target policy smoothing: perturb the target action with clipped
            # Gaussian noise, then clip back into the valid action range.
            target_action = self.actor_target(next_states)
            action_noise = (torch.randn(self.batch_size, self.action_size,
                                        device=device) * self.noise_std) \
                .clamp(-self.noise_clip, self.noise_clip)
            target_action = (target_action + action_noise).cpu().numpy()
            target_action = torch.tensor(np.clip(target_action, self.low, self.high),
                                         device=device, dtype=torch.float32)
            # Clipped double-Q: bootstrap from the smaller of the two target
            # critics, discounted past the n accumulated reward steps.
            q_next = torch.min(self.critic_target1(next_states, target_action),
                               self.critic_target2(next_states, target_action))
            target = (torch.sum(rewards * gammas, dim=1, keepdim=True) +
                      (self.gamma ** n) * (1 - dones).unsqueeze(1) * q_next)

        # Critic updates: importance-weighted MSE against the shared target.
        # The target is computed under no_grad, so the two critic losses can
        # be backpropagated independently.
        td_error1 = target - self.critic1(states, actions)
        loss1 = (weights * td_error1.squeeze(1) ** 2).mean()
        # New priorities are the absolute TD errors plus a small constant.
        new_priorities = torch.abs(td_error1).detach().cpu().numpy() + 0.001
        self.replay.store_priorities(indices, new_priorities.squeeze(1))
        self.critic_optim1.zero_grad()
        loss1.backward()
        self.critic_optim1.step()

        td_error2 = target - self.critic2(states, actions)
        loss2 = (weights * td_error2.squeeze(1) ** 2).mean()
        self.critic_optim2.zero_grad()
        loss2.backward()
        self.critic_optim2.step()

        # Delayed policy updates: train the actor and sync the targets only
        # every update_freq critic updates.
        if t % self.update_freq == 0:
            policy_loss = -(weights *
                            self.critic1(states, self.actor(states)).squeeze(1)).mean()
            self.actor_optim.zero_grad()
            policy_loss.backward()
            self.actor_optim.step()
            self.soft_update()

    def choose_action(self, state):
        state = torch.tensor(state.copy(), dtype=torch.float32).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().numpy()
        self.actor.train()
        if self.training:
            # Add Gaussian exploration noise only while training.
            action += np.random.normal(0, self.noise_std, self.action_size)
        return np.clip(action, self.low, self.high)
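
# ---------------------------------------------------------------------------
# Both agents depend on a PrioritizedReplayBuffer that is not shown in this
# section. The class below is a minimal, unoptimized sketch of the interface
# they rely on (store / sample / store_priorities / __len__), not the repo's
# actual implementation: it samples proportionally to priority**alpha with a
# flat array, and it returns rewards shaped [batch, 1] so the agents' n-step
# code runs with n = 1. A production buffer would use a sum-tree and store
# true multi-step transitions.
# ---------------------------------------------------------------------------
class PrioritizedReplayBuffer:
    def __init__(self, size, batch_size, alpha):
        self.size = size
        self.batch_size = batch_size
        self.alpha = alpha
        self.data = []
        self.priorities = np.zeros(size, dtype=np.float32)
        self.pos = 0

    def __len__(self):
        return len(self.data)

    def store(self, state, action, reward, next_state, done):
        # New transitions get the current max priority so each is seen at least once.
        max_prio = self.priorities[:len(self.data)].max() if self.data else 1.0
        if len(self.data) < self.size:
            self.data.append((state, action, reward, next_state, done))
        else:
            self.data[self.pos] = (state, action, reward, next_state, done)
        self.priorities[self.pos] = max_prio
        self.pos = (self.pos + 1) % self.size

    def sample(self):
        # Sampling probabilities proportional to priority**alpha.
        prios = self.priorities[:len(self.data)] ** self.alpha
        probs = prios / prios.sum()
        indices = np.random.choice(len(self.data), self.batch_size, p=probs)
        batch = [self.data[i] for i in indices]
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        rewards = rewards.reshape(self.batch_size, 1)  # [batch, n] with n = 1
        return (states.astype(np.float32), actions, rewards.astype(np.float32),
                next_states.astype(np.float32), dones.astype(np.float32),
                probs[indices], indices)

    def store_priorities(self, indices, priorities):
        self.priorities[indices] = priorities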
class DQN:
    def __init__(self, Model, minibatch_size=64, replay_memory_size=1000000,
                 gamma=0.99, learning_rate=5e-4, tau=1e-4, param_noise=0.1,
                 max_distance=0.2, alpha=0.5, beta=0.5):
        self.minibatch_size = minibatch_size
        self.replay_memory_size = replay_memory_size
        self.gamma = gamma
        self.learning_rate = learning_rate
        self.tau = tau
        self.value = Model().to(device)
        self.target1 = Model().to(device)
        self.copy_weights()
        self.replay = PrioritizedReplayBuffer(replay_memory_size,
                                              minibatch_size, alpha)
        self.param_noise = param_noise
        self.max_distance = max_distance
        self.optimizer = torch.optim.Adam(self.value.parameters(),
                                          lr=self.learning_rate)
        self.beta = beta

    def copy_weights(self):
        # Hard-copy the online weights into the target network and keep the
        # target in eval mode (it is never trained directly).
        for target_param, value_param in zip(self.target1.parameters(),
                                             self.value.parameters()):
            target_param.data.copy_(value_param.data)
        self.target1.eval()

    def soft_update(self):
        # Polyak averaging: target <- tau * online + (1 - tau) * target.
        for target_param, value_param in zip(self.target1.parameters(),
                                             self.value.parameters()):
            target_param.data.copy_(value_param.data * self.tau +
                                    target_param.data * (1 - self.tau))

    def choose_action(self, state):
        # Greedy action from the online network; expects a batched state.
        self.value.eval()
        state = torch.from_numpy(state).to(device, torch.float32)
        with torch.no_grad():
            action = torch.argmax(self.value(state), dim=1).cpu().numpy()
        self.value.train()
        return action

    def store(self, state, action, reward, next_state, done):
        self.replay.store(state, action, reward, next_state, done)

    def train(self):
        if len(self.replay) < self.minibatch_size:
            return
        states, actions, rewards, next_states, dones, probs, indices = \
            self.replay.sample()
        # Importance-sampling weights for prioritized replay, normalized by the max.
        weights = (probs * len(self.replay)) ** (-self.beta)
        weights = weights / np.max(weights)
        weights = torch.tensor(weights, device=device, dtype=torch.float32)
        n = rewards.shape[1]  # n-step return horizon
        states = torch.from_numpy(states).to(device, torch.float32).unsqueeze(1)
        next_states = torch.from_numpy(next_states).to(device,
                                                       torch.float32).unsqueeze(1)
        rewards = torch.from_numpy(rewards).to(device, torch.float32)
        gammas = torch.tensor([self.gamma ** i for i in range(n)],
                              dtype=torch.float32, device=device)
        dones = torch.from_numpy(dones).to(device, torch.float32)
        actions = torch.from_numpy(actions).to(device, torch.long).unsqueeze(1)
        # Double-DQN n-step target: the online network selects the greedy
        # next action, the target network evaluates it.
        next_actions = torch.argmax(self.value(next_states).detach(),
                                    dim=1).unsqueeze(1)
        q_next = self.target1(next_states).detach().gather(1, next_actions).squeeze(1)
        target = (torch.sum(rewards * gammas, dim=1) +
                  (self.gamma ** n) * q_next * (1 - dones)).unsqueeze(1)
        expected = self.value(states).gather(1, actions)
        loss = (weights * ((target - expected) ** 2).squeeze(1)).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        # New priorities are the absolute TD errors plus a small constant.
        updated_priorities = torch.abs(target - expected).detach().cpu().numpy() + 0.001
        self.replay.store_priorities(indices, updated_priorities.squeeze(1))
        return loss.detach().cpu().item()
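
# ---------------------------------------------------------------------------
# A minimal usage sketch for the TD3 agent, under stated assumptions: the
# Actor and Critic network classes are defined elsewhere in this repo, and the
# environment follows the classic Gym API (reset() -> state, step() ->
# (state, reward, done, info)). The environment name and episode count are
# placeholders; a DQN loop would look the same, minus the continuous actions.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-v1")  # placeholder environment
    agent = TD3(Actor, Critic, env.action_space)
    for episode in range(100):
        state = env.reset()
        done = False
        t = 0
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.store(state, action, reward, next_state, done)
            agent.train(t)  # critics update every step; actor every update_freq steps
            state = next_state
            t += 1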