import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Network, ActorCritic, Actor, Critic, and NumpyReplay are defined elsewhere in this repo.


class DDQN:
    def __init__(self, observation_space, action_space, lr=7e-4, gamma=0.99, tau=0.01):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.memory = NumpyReplay(100000, observation_space.shape[0], self.device)
        self.action_space = action_space
        self.epsilon = 0.5
        self.epsilon_decay = 0.9995
        self.min_epsilon = 0.02
        self.tau = tau
        self.dqn = Network(observation_space.shape[0], action_space.n).to(self.device)
        self.dqn_target = Network(observation_space.shape[0], action_space.n).to(self.device)
        self.dqn_target.load_state_dict(self.dqn.state_dict())
        self.dqn_target.eval()
        self.optimizer = optim.Adam(self.dqn.parameters(), lr=lr)

    def act(self, state):
        # Epsilon-greedy action selection over a batch of vectorized-env states.
        self.epsilon = max(self.epsilon * self.epsilon_decay, self.min_epsilon)
        if np.random.random() < self.epsilon:
            return [self.action_space.sample() for _ in range(len(state))]
        state = torch.FloatTensor(state).to(self.device)
        action = self.dqn(state).argmax(dim=-1)
        return action.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        for i in range(len(state)):
            self.memory.update(state[i], action[i], reward[i], new_state[i], done[i])

    def train(self, batch_size=128, epochs=1):
        if batch_size > self.memory.size:
            return
        for _ in range(epochs):
            states, actions, rewards, next_states, dones = self.memory.sample(batch_size)
            actions = actions.unsqueeze(-1)
            rewards = rewards.unsqueeze(-1)
            dones = dones.unsqueeze(-1)
            q = self.dqn(states).gather(-1, actions.long())
            # Double DQN target: the online network selects the next action,
            # the target network evaluates it.
            with torch.no_grad():
                a2 = self.dqn(next_states).argmax(dim=-1, keepdim=True)
                q2 = self.dqn_target(next_states).gather(-1, a2)
                target = rewards + (1 - dones) * self.gamma * q2
            loss = F.mse_loss(q, target)
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.update_target()

    def update_target(self):
        # Polyak (soft) update of the target network towards the online network.
        with torch.no_grad():
            for target_param, param in zip(self.dqn_target.parameters(), self.dqn.parameters()):
                target_param.data.mul_(1 - self.tau)
                target_param.data.add_(param.data, alpha=self.tau)
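# --- Illustrative training-loop sketch for the DDQN agent above (not part of the repo). ---
# Assumes the classic `gym` vectorized API in which `reset()` returns observations and
# `step()` returns (obs, reward, done, info); adjust for gymnasium's 5-tuple step API.
if __name__ == "__main__":
    import gym

    num_envs = 4
    env = gym.vector.SyncVectorEnv(
        [lambda: gym.make("CartPole-v1") for _ in range(num_envs)])
    agent = DDQN(env.single_observation_space, env.single_action_space)

    states = env.reset()
    for step in range(10000):
        actions = agent.act(states)
        next_states, rewards, dones, infos = env.step(actions)
        agent.remember(states, actions, rewards, next_states, dones)
        agent.train(batch_size=128)
        states = next_states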
class SAC:
    def __init__(self, observation_space, action_space, alpha=0.2, gamma=0.99, tau=0.01,
                 p_lr=7e-4, q_lr=7e-4, a_lr=3e-4, policy_freq=1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.state_shape = observation_space.shape[0]
        self.action_shape = action_space.shape[0]
        self.action_range = [action_space.low, action_space.high]
        self.alpha = alpha
        self.gamma = gamma
        self.tau = tau
        self.memory = NumpyReplay(1000000, observation_space.shape[0], self.device)
        self.count = 0
        self.policy_freq = policy_freq
        self.actor = Actor(self.state_shape, self.action_shape).to(self.device)
        self.critic1 = Critic(self.state_shape, self.action_shape).to(self.device)
        self.target_critic1 = Critic(self.state_shape, self.action_shape).to(self.device)
        self.critic2 = Critic(self.state_shape, self.action_shape).to(self.device)
        self.target_critic2 = Critic(self.state_shape, self.action_shape).to(self.device)
        # Initialize the target critics as copies of the online critics.
        self.target_critic1.load_state_dict(self.critic1.state_dict())
        self.target_critic2.load_state_dict(self.critic2.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=p_lr)
        self.critic_optimizer1 = optim.Adam(self.critic1.parameters(), lr=q_lr)
        self.critic_optimizer2 = optim.Adam(self.critic2.parameters(), lr=q_lr)
        # Learned entropy temperature; target entropy is -dim(action space).
        self.target_entropy = -torch.prod(
            torch.Tensor(action_space.shape).to(self.device)).item()
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=a_lr)

    def act(self, state, noise=True):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action, _ = self.actor.sample(state)
        action = action.cpu().detach().squeeze(0).numpy()
        return self.rescale_action(action)

    def rescale_action(self, action):
        # Map actions from [-1, 1] to the environment's action range.
        return action * (self.action_range[1] - self.action_range[0]) / 2.0 + \
            (self.action_range[1] + self.action_range[0]) / 2.0

    def remember(self, state, action, reward, new_state, done):
        for i in range(len(state)):
            self.memory.update(state[i], action[i], reward[i], new_state[i], done[i])

    def train(self, batch_size=64):
        if batch_size > self.memory.size:
            return
        self.count += 1
        states, actions, rewards, next_states, dones = self.memory.sample(batch_size)
        actions = actions.unsqueeze(-1)
        rewards = rewards.unsqueeze(-1)
        dones = dones.unsqueeze(-1)
        self._train_critics(states, actions, rewards, next_states, dones)
        # Delayed policy and target updates, controlled by policy_freq.
        if self.count % self.policy_freq == 0:
            self._train_actor(states, actions, rewards, next_states, dones)
            self.update_target()

    def _train_critics(self, states, actions, rewards, next_states, dones):
        # Clipped double-Q target with entropy bonus, computed outside the graph.
        with torch.no_grad():
            next_actions, next_log_pi = self.actor.sample(next_states)
            target_Q1 = self.target_critic1(next_states, next_actions)
            target_Q2 = self.target_critic2(next_states, next_actions)
            target_Q = torch.min(target_Q1, target_Q2) - self.alpha * next_log_pi
            target_Q = rewards + (1 - dones) * self.gamma * target_Q

        current_Q1 = self.critic1(states, actions)
        loss_Q1 = F.mse_loss(current_Q1, target_Q)
        self.critic_optimizer1.zero_grad()
        loss_Q1.backward()
        self.critic_optimizer1.step()

        current_Q2 = self.critic2(states, actions)
        loss_Q2 = F.mse_loss(current_Q2, target_Q)
        self.critic_optimizer2.zero_grad()
        loss_Q2.backward()
        self.critic_optimizer2.step()

    def _train_actor(self, states, actions, rewards, next_states, dones):
        new_actions, log_pi = self.actor.sample(states)
        min_q = torch.min(self.critic1(states, new_actions),
                          self.critic2(states, new_actions))
        # Policy loss: maximize Q while keeping entropy high.
        policy_loss = (self.alpha * log_pi - min_q).mean()
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Temperature update towards the target entropy.
        alpha_loss = (self.log_alpha * (-log_pi - self.target_entropy).detach()).mean()
        self.alpha_optim.zero_grad()
        alpha_loss.backward()
        self.alpha_optim.step()
        self.alpha = self.log_alpha.exp().detach()

    def update_target(self):
        # Polyak (soft) update of both target critics.
        with torch.no_grad():
            for target_param, param in zip(self.target_critic1.parameters(),
                                           self.critic1.parameters()):
                target_param.data.mul_(1 - self.tau)
                target_param.data.add_(param.data, alpha=self.tau)
            for target_param, param in zip(self.target_critic2.parameters(),
                                           self.critic2.parameters()):
                target_param.data.mul_(1 - self.tau)
                target_param.data.add_(param.data, alpha=self.tau)
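# --- Illustrative usage sketch for the SAC agent above (not part of the repo). ---
# Assumes a single continuous-control env with a 1-D action space and the classic
# `gym` step API; `remember` expects batched transitions, so singles are wrapped in lists.
if __name__ == "__main__":
    import gym

    env = gym.make("Pendulum-v1")
    agent = SAC(env.observation_space, env.action_space)

    state = env.reset()
    for step in range(100000):
        action = agent.act(state)
        next_state, reward, done, info = env.step(action)
        agent.remember([state], [action], [reward], [next_state], [done])
        agent.train(batch_size=64)
        state = env.reset() if done else next_state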
class SharedPPO:
    def __init__(self, env_num, observation_space, action_space, lr=7e-4, steps=64,
                 gamma=0.99, lam=0.95, entropy_coef=0.005, clip=0.1):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.gamma = gamma
        self.lam = lam
        self.entropy_coef = entropy_coef
        self.clip = clip
        self.steps = steps
        self.memory = NumpyReplay(steps, env_num, observation_space.shape[0], self.device)
        self.actorcritic = ActorCritic(observation_space.shape[0], action_space.n).to(self.device)
        self.actorcritic_optimizer = optim.Adam(self.actorcritic.parameters(), lr=lr, eps=1e-6)
        # A frozen copy of the policy collects rollouts while the online copy is optimized.
        self.target_actorcritic = ActorCritic(observation_space.shape[0], action_space.n).to(self.device)
        self.target_actorcritic.load_state_dict(self.actorcritic.state_dict())

    def act(self, state):
        state = torch.FloatTensor(state).to(self.device)
        probs, _ = self.target_actorcritic(state)
        action = probs.sample()
        return action.cpu().detach().numpy()

    def remember(self, state, action, reward, new_state, done):
        # Log-probabilities are computed with the online network at storage time.
        state_torch = torch.FloatTensor(state).to(self.device)
        probs, _ = self.actorcritic(state_torch)
        action_torch = torch.LongTensor(action).to(self.device)
        log_probs = probs.log_prob(action_torch)
        self.memory.update(state, action, log_probs, reward, new_state, done)

    def compute_gae(self, values, dones, rewards):
        # Generalized Advantage Estimation over the stored rollout.
        returns = torch.zeros_like(rewards).to(self.device)
        advantages = torch.zeros_like(rewards).to(self.device)
        # Bootstrap the final step with the last value estimate.
        returns[-1] = rewards[-1] + self.gamma * (1 - dones[-1]) * values[-1]
        advantages[-1] = returns[-1] - values[-1]
        for i in reversed(range(len(rewards) - 1)):
            delta = rewards[i] + self.gamma * (1 - dones[i]) * values[i + 1] - values[i]
            advantages[i] = delta + self.gamma * self.lam * (1 - dones[i]) * advantages[i + 1]
            returns[i] = advantages[i] + values[i]
        # Normalize advantages for stability.
        return returns, (advantages - advantages.mean()) / (advantages.std() + 1e-10)

    def compute_loss(self, states, actions, logp, advantages, returns):
        new_probs, v = self.actorcritic(states)
        new_logprobs = new_probs.log_prob(actions)
        entropy = new_probs.entropy().mean()
        # Clipped surrogate objective (PPO-Clip).
        ratios = torch.exp(new_logprobs.unsqueeze(-1) - logp.unsqueeze(-1).detach())
        surr1 = ratios * advantages.detach()
        surr2 = torch.clamp(ratios, 1 - self.clip, 1 + self.clip) * advantages.detach()
        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = 0.5 * F.mse_loss(v, returns.detach())
        entropy_loss = -self.entropy_coef * entropy
        return policy_loss, value_loss, entropy_loss

    def train(self, epochs=8):
        if self.memory.length < self.steps:
            return
        states, actions, log_probs, rewards, next_states, dones = self.memory.sample()
        rewards = rewards.unsqueeze(-1)
        dones = dones.unsqueeze(-1)
        log_probs = log_probs.detach()
        with torch.no_grad():
            _, v = self.actorcritic(states)
        returns, advantages = self.compute_gae(v, dones, rewards)
        for _ in range(epochs):
            self.actorcritic_optimizer.zero_grad()
            policy_loss, value_loss, entropy_loss = self.compute_loss(
                states, actions, log_probs, advantages, returns)
            total_loss = policy_loss + value_loss + entropy_loss
            total_loss.backward()
            self.actorcritic_optimizer.step()
        # Sync the rollout policy with the freshly optimized weights.
        self.target_actorcritic.load_state_dict(self.actorcritic.state_dict())
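# --- Illustrative rollout/update loop for SharedPPO above (not part of the repo). ---
# Assumes the classic vectorized `gym` API and that NumpyReplay holds exactly `steps`
# transitions per environment before `train()` consumes them.
if __name__ == "__main__":
    import gym

    num_envs = 8
    env = gym.vector.SyncVectorEnv(
        [lambda: gym.make("CartPole-v1") for _ in range(num_envs)])
    agent = SharedPPO(num_envs, env.single_observation_space, env.single_action_space)

    states = env.reset()
    for update in range(1000):
        for _ in range(agent.steps):
            actions = agent.act(states)
            next_states, rewards, dones, infos = env.step(actions)
            agent.remember(states, actions, rewards, next_states, dones)
            states = next_states
        agent.train(epochs=8)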