def train_agent(path, env, agent, seed=0, num_episodes=100, num_steps=100,
                batch_size=128, replay_buffer_size=1000000):
    # Create the output directory and make it the working directory
    if not os.path.isdir(path):
        os.makedirs(path)
    os.chdir(path)

    env.seed(seed)
    random.seed(seed)

    # Save the untrained policy for later comparison
    pickle.dump(agent.policy_net, open('first_policy.pickle', 'wb'))

    replay_buffer = ReplayBuffer(replay_buffer_size)
    rewards = []
    max_angle = []
    ave_angle = []

    for episode in range(num_episodes):
        state = env.reset()
        episode_reward = 0
        max_th = 0
        ave_th = 0

        for step in range(num_steps):
            action = agent.policy_net.get_action(state) + np.array([0.0])
            next_state, reward, done, _ = env.step(action)
            replay_buffer.push(state, action, reward, next_state, done)

            # Start learning once enough transitions are stored
            if len(replay_buffer) > batch_size:
                agent.train_step(replay_buffer=replay_buffer, batch_size=batch_size)

            state = next_state
            episode_reward += reward

            # Pendulum angle recovered from the (cos th, sin th) observation
            th = np.arccos(state[0]) * np.sign(state[1])
            max_th = max(max_th, abs(th))
            ave_th += abs(th)

        rewards.append(episode_reward)
        max_angle.append(max_th)
        ave_angle.append(ave_th / num_steps)

    # Save the trained policy and the training statistics
    pickle.dump(agent.policy_net, open('last_policy.pickle', 'wb'))
    pickle.dump(rewards, open('rewards.pickle', 'wb'))
    pickle.dump(max_angle, open('max_angle.pickle', 'wb'))
    pickle.dump(ave_angle, open('ave_angle.pickle', 'wb'))

    plt.figure(figsize=(10, 6))
    plt.plot(rewards)
    plt.title('Reward vs Episode')
    plt.savefig('rewards.png', dpi=100)
    plt.close()
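# train_agent assumes a ReplayBuffer with push(state, action, reward, next_state, done),
# sample(batch_size), and __len__. The real class is defined elsewhere in the repo;
# the sketch below is only a minimal, assumed implementation of that interface.
import random
from collections import deque

import numpy as np


class ReplayBuffer:
    def __init__(self, capacity):
        # deque drops the oldest transitions automatically once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        # Uniform random minibatch, stacked field-by-field into arrays
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.stack, zip(*batch))
        return state, action, reward, next_state, done

    def __len__(self):
        return len(self.buffer)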
class DQNAgent(nn.Module):
    def __init__(self, state_dim: int, action_dim: int,
                 hidden_sizes: list = [128, 128], activation=nn.ReLU,
                 buffer_size: int = 1000000, batch_size: int = 32,
                 lr: float = 1e-4, gamma: float = 0.95, theta: float = 0.05):
        super(DQNAgent, self).__init__()
        # Online Q-network and frozen target network
        self.q_net = mlp([state_dim] + hidden_sizes + [action_dim], activation=activation)
        self.target_net = mlp([state_dim] + hidden_sizes + [action_dim], activation=activation)
        self.target_net.load_state_dict(self.q_net.state_dict())
        # Replay buffer
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        # Learner
        self.optimizer = Adam(self.q_net.parameters(), lr=lr)
        self.gamma = gamma
        # Polyak averaging parameter
        self.theta = theta

    def forward(self, x):
        return self.q_net(x)

    def save_memory(self, ex):
        self.buffer.push(ex)

    def train(self, k=4, max_norm=5.):
        losses = []
        # The Q-network is updated k times per training step;
        # gradient clipping keeps the updates stable.
        for _ in range(k):
            experiences = self.buffer.sample(self.batch_size)
            s, a, r, t, mask = get_batch(experiences)
            # One-step TD target built from the frozen target network
            next_q = self.target_net(t).max(-1, keepdim=True)[0]
            target = r + self.gamma * mask * next_q.detach()
            pred = self.q_net(s).gather(-1, a)
            loss = F.mse_loss(pred, target)
            self.optimizer.zero_grad()
            loss.backward()
            clip_grad_norm_(self.q_net.parameters(), max_norm)
            self.optimizer.step()
            losses.append(loss.item())
        # Soft (Polyak) update of the target network
        self.target_update()
        return np.mean(losses)

    def train_start(self):
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        for target, param in zip(self.target_net.parameters(), self.q_net.parameters()):
            target.data = (1 - self.theta) * target.data + self.theta * param.data

#%%
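# DQNAgent (and DDPGAgent below) rely on an mlp builder and a get_batch helper
# defined elsewhere in the repo. The sketches below are plausible versions that
# are consistent with how they are called here; exact signatures and dtypes are
# assumptions, not the repo's actual implementation.
import numpy as np
import torch
import torch.nn as nn


def mlp(sizes, activation=nn.ReLU, output_activation=nn.Identity):
    # Stack Linear layers of the given sizes, with `activation` between hidden
    # layers and `output_activation` after the last layer.
    layers = []
    for i in range(len(sizes) - 1):
        act = activation if i < len(sizes) - 2 else output_activation
        layers += [nn.Linear(sizes[i], sizes[i + 1]), act()]
    return nn.Sequential(*layers)


def get_batch(experiences):
    # Convert a list of (s, a, r, s', done) transitions into batched tensors.
    # `mask` is 0 for terminal transitions so the bootstrap term is dropped.
    # Discrete-action agents (DQN) need int64 actions for gather(); a
    # continuous-action agent (DDPG) would need float32 actions instead.
    s, a, r, t, done = zip(*experiences)
    s = torch.as_tensor(np.array(s), dtype=torch.float32)
    a = torch.as_tensor(np.array(a), dtype=torch.int64).reshape(-1, 1)
    r = torch.as_tensor(np.array(r), dtype=torch.float32).reshape(-1, 1)
    t = torch.as_tensor(np.array(t), dtype=torch.float32)
    mask = 1.0 - torch.as_tensor(np.array(done), dtype=torch.float32).reshape(-1, 1)
    return s, a, r, t, mask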
num_frames = 100000
batch_size = 32
gamma = 0.99

losses = []
all_rewards = []
episode_reward = 0

# current_model = torch.load("data/rainbow.pt")

state = env.reset()
for frame_idx in range(1, num_frames + 1):
    action = current_model.act(state)

    next_state, reward, done, _ = env.step(action)
    replay_buffer.push(state, action, reward, next_state, done)

    state = next_state
    episode_reward += reward

    if done:
        state = env.reset()
        all_rewards.append(episode_reward)
        episode_reward = 0

    # Start learning once enough transitions have been collected
    if len(replay_buffer) > batch_size:
        loss = compute_td_loss(batch_size)
        losses.append(loss.item())

    if frame_idx % 200 == 0:
        plot(frame_idx, all_rewards, losses)
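# The loop above assumes a compute_td_loss(batch_size) that samples from the
# module-level replay_buffer, computes a one-step TD loss for current_model,
# and steps a module-level optimizer. The sketch below shows the usual form of
# that function; the global names (replay_buffer, current_model, optimizer,
# gamma) and the exact loss are assumptions taken from context.
import numpy as np
import torch
import torch.nn.functional as F


def compute_td_loss(batch_size):
    # Sample a minibatch and compute the one-step TD error for the online model
    state, action, reward, next_state, done = replay_buffer.sample(batch_size)

    state = torch.as_tensor(np.float32(state))
    next_state = torch.as_tensor(np.float32(next_state))
    action = torch.as_tensor(action, dtype=torch.int64)
    reward = torch.as_tensor(np.float32(reward))
    done = torch.as_tensor(np.float32(done))

    q_values = current_model(state)
    next_q_values = current_model(next_state)

    # Q(s, a) for the actions actually taken
    q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
    # Bootstrap target; terminal states contribute only the reward
    next_q_value = next_q_values.max(1)[0]
    expected_q_value = reward + gamma * next_q_value * (1 - done)

    loss = F.mse_loss(q_value, expected_q_value.detach())

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss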
class DDPGAgent(nn.Module):
    def __init__(self, state_dim: int, action_dim: int,
                 action_min: float, action_max: float,
                 q_hidden_sizes: list = [128, 128], p_hidden_sizes: list = [128, 128],
                 activation=nn.ReLU, buffer_size: int = 1000000, batch_size: int = 32,
                 q_lr: float = 1e-4, p_lr: float = 1e-3, gamma: float = 0.95,
                 theta: float = 0.01, eps: float = 0.3):
        super(DDPGAgent, self).__init__()
        # Actor, frozen actor
        # Bound the action to the [minimum, maximum] range using tanh
        loc = (action_min + action_max) / 2
        scale = (action_min - action_max) / 2
        self.policy = Actor(loc, scale, state_dim, action_dim, p_hidden_sizes,
                            activation=activation, bn=True)
        self.policy_target = Actor(loc, scale, state_dim, action_dim, p_hidden_sizes,
                                   activation=activation, bn=True)
        self.policy_target.load_state_dict(self.policy.state_dict())
        # Critic, frozen critic
        self.q_net = Critic(state_dim, action_dim, q_hidden_sizes, activation)
        self.q_target = Critic(state_dim, action_dim, q_hidden_sizes, activation)
        self.q_target.load_state_dict(self.q_net.state_dict())
        # Replay buffer
        self.buffer = ReplayBuffer(buffer_size)
        self.batch_size = batch_size
        # Learner
        self.q_optimizer = Adam(self.q_net.parameters(), lr=q_lr)
        self.policy_optimizer = Adam(self.policy.parameters(), lr=p_lr)
        self.gamma = gamma
        # Polyak averaging parameter
        self.theta = theta
        # Exploration coefficient
        self.eps = eps
        # Keep batchnorm in eval mode while acting
        self.policy.eval()
        self.policy_target.eval()

    # Get action: returns the deterministic action and an exploration action
    def forward(self, x, step):
        x = self.policy(x)
        # Exploration: add Gaussian noise scaled by eps
        x_exp = x.clone() + self.eps * torch.randn(x.shape)
        return x, x_exp

    def save_memory(self, ex):
        self.buffer.push(ex)

    def train(self, k=1, q_max_norm=5., policy_max_norm=5.):
        q_losses = []
        # Q update
        # To stabilize the learning, the critic is updated several times per
        # training step and gradient clipping is also used
        for _ in range(k):
            experiences = self.buffer.sample(self.batch_size)
            s, a, r, t, mask = get_batch(experiences)
            mu_t = self.policy_target(t)
            next_q = self.q_target(t, mu_t)
            target = r + self.gamma * mask * next_q.detach()
            pred = self.q_net(s, a)
            q_loss = F.mse_loss(pred, target)
            self.q_optimizer.zero_grad()
            q_loss.backward()
            clip_grad_norm_(self.q_net.parameters(), q_max_norm)
            self.q_optimizer.step()
            q_losses.append(q_loss.item())

        # Policy update
        # To stabilize the learning, batchnorm and gradient clipping are used
        self.policy.train()
        mu = self.policy(s)
        policy_loss = torch.mean(-self.q_net(s, mu))
        self.policy_optimizer.zero_grad()
        policy_loss.backward()
        clip_grad_norm_(self.policy.parameters(), policy_max_norm)
        self.policy_optimizer.step()

        # Polyak averaging
        self.target_update()
        # Put batchnorm back into eval mode for acting
        self.policy.eval()
        return np.mean(q_losses), policy_loss.item()

    def train_start(self):
        return len(self.buffer) >= self.batch_size

    def target_update(self):
        for target, param in zip(self.q_target.parameters(), self.q_net.parameters()):
            target.data = (1 - self.theta) * target.data + self.theta * param.data
        for target, param in zip(self.policy_target.parameters(), self.policy.parameters()):
            target.data = (1 - self.theta) * target.data + self.theta * param.data
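# DDPGAgent builds an Actor(loc, scale, ...) whose output is tanh-squashed and
# rescaled to the action bounds, and a Critic(state_dim, action_dim, ...) that
# scores state-action pairs. The real classes live elsewhere in the repo; the
# sketch below is only one layout consistent with the constructor calls above
# (the bn flag handling and hidden-layer arrangement are assumptions).
import torch
import torch.nn as nn


class Actor(nn.Module):
    # Deterministic policy: MLP output squashed with tanh, then rescaled so the
    # action lies in [loc - |scale|, loc + |scale|].
    def __init__(self, loc, scale, state_dim, action_dim, hidden_sizes,
                 activation=nn.ReLU, bn=False):
        super().__init__()
        self.loc, self.scale = loc, scale
        layers = []
        in_dim = state_dim
        for h in hidden_sizes:
            layers.append(nn.Linear(in_dim, h))
            if bn:
                layers.append(nn.BatchNorm1d(h))
            layers.append(activation())
            in_dim = h
        layers.append(nn.Linear(in_dim, action_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, state):
        return self.loc + self.scale * torch.tanh(self.net(state))


class Critic(nn.Module):
    # Q(s, a): state and action are concatenated and fed through an MLP
    def __init__(self, state_dim, action_dim, hidden_sizes, activation=nn.ReLU):
        super().__init__()
        layers = []
        in_dim = state_dim + action_dim
        for h in hidden_sizes:
            layers += [nn.Linear(in_dim, h), activation()]
            in_dim = h
        layers.append(nn.Linear(in_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, state, action):
        return self.net(torch.cat([state, action], dim=-1))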