import os
from typing import List, Tuple

import gym
import numpy as np
import pkbar
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import wandb
from torch.optim import Adam

# Actor, Critic, ReplayBuffer, Experience, OUNoise and to_var are
# project-specific helpers assumed to be defined or imported elsewhere.


class DDPGHedgingAgent:
    """DDPG agent interacting with the environment.

    Attributes:
        env (gym.Env): OpenAI Gym environment
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to predict next actions
        actor_optimizer (Optimizer): optimizer for training actor
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        critic_optimizer (Optimizer): optimizer for training critic
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        gamma (float): discount factor
        tau (float): parameter for soft target update
        initial_random_episode (int): number of initial episodes with random actions
        noise (OUNoise): noise generator for exploration
        device (torch.device): cpu / gpu
        transition (list): temporary storage for the most recent transition
        total_step (int): total number of steps taken
        is_test (bool): flag to show the current mode (train / test)
    """

    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 10_000,
                 name_cases='myproject'):
        """Initialize."""
        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # networks
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # transition to store in memory
        self.transition = list()

        # total steps count
        self.total_step = 0

        # mode: train / test
        self.is_test = False

        self.populate(self.initial_random_episode)

    def populate(self, eps: int = 100) -> None:
        """Carry out several episodes of random actions in the environment to
        initially fill up the replay buffer with experiences.

        Args:
            eps: number of random episodes to populate the buffer with
        """
        if not self.is_test:
            print("Populate Replay Buffer...")
") kbar = pkbar.Kbar(target=eps, width=20) state = self.env.reset() for i in range(eps): while True: # Get action from sample space selected_action = self.env.action_space.sample() # selected_action = 0 noise = self.noise.sample() selected_action = np.clip(selected_action + noise, -1.0, 1.0) next_state, reward, done, _ = self.env.step( selected_action) self.transition = [ state, selected_action, reward, next_state, int(done) ] self.memory.append(Experience(*self.transition)) state = next_state if done: state = self.env.reset() break kbar.add(1) # self.scaler = self.memory.standar_scaler() @torch.no_grad() def select_action(self, state: np.ndarray) -> np.ndarray: """Select an action from the input state.""" state_s = self.scaler.transform([state]) selected_action = self.actor( torch.FloatTensor(state_s).to(self.device)).item() # add noise for exploration during training if not self.is_test: noise = self.noise.sample() selected_action = np.clip(selected_action + noise, -1.0, 1.0) self.transition = [state, selected_action] return selected_action def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]: """Take an action and return the response of the env.""" next_state, reward, done, _ = self.env.step(action) if not self.is_test: self.transition += [reward, next_state, int(done)] self.memory.append(Experience(*self.transition)) return next_state, reward, done def update_model(self) -> torch.Tensor: """ Update the model by gradient descent. Change the loss in to mean variance optimization """ device = self.device # for shortening the following lines state, action, reward, next_state, done = self.memory.sample( self.batch_size, self.device) state = torch.FloatTensor(self.scaler.transform(state)).to(device) next_state = torch.FloatTensor( self.scaler.transform(next_state)).to(device) # state = state.to(device) # next_state = next_state.to(device) action = action.to(device) reward = reward.to(device) done = done.to(device) masks = 1 - done next_action = self.actor_target(next_state) next_value = self.critic_target(next_state, next_action) curr_return = reward.reshape( -1, 1) + self.gamma * next_value * masks.reshape(-1, 1) # train critic values = self.critic(state, action) critic_loss = F.mse_loss(values, curr_return) self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. 
        for p in self.critic.parameters():
            p.requires_grad = False

        # train actor
        q_values = self.critic(state, self.actor(state))
        actor_loss = -q_values.mean()
        # actor_loss = 0.5 * q_values.std() ** 2

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Unfreeze the critic for the next update
        for p in self.critic.parameters():
            p.requires_grad = True

        # target update
        self._target_soft_update()

        return actor_loss.data, critic_loss.data

    def train(self, num_frames: int, plotting_interval: int = 200):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        actor_losses = []
        critic_losses = []
        scores = []
        score = 0

        print("Training...")
        kbar = pkbar.Kbar(target=num_frames, width=20)

        for self.total_step in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # if episode ends
            if done:
                state = self.env.reset()
                scores.append(score)
                score = 0
                self._plot(
                    self.total_step,
                    scores,
                    actor_losses,
                    critic_losses,
                )

            # if training is ready
            if len(self.memory) >= self.batch_size:
                actor_loss, critic_loss = self.update_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            kbar.add(1)

        self.env.close()

    def test(self):
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

        self.env.close()
        return score

    def _target_soft_update(self):
        """Soft-update: target = tau * local + (1 - tau) * target."""
        tau = self.tau

        for t_param, l_param in zip(self.actor_target.parameters(),
                                    self.actor.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

        for t_param, l_param in zip(self.critic_target.parameters(),
                                    self.critic.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

    def _plot(
        self,
        frame_idx: int,
        scores: List[float],
        actor_losses: List[float],
        critic_losses: List[float],
    ):
        """Log the training progress to wandb."""
        self.wandb.log({
            'frame': frame_idx,
            'score': scores[-1],
            'actor_loss': actor_losses[-1],
            'critic_loss': critic_losses[-1]
        })
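# A minimal usage sketch for DDPGHedgingAgent (illustrative, not from the project):
# the environment name, hyperparameter values and wandb project name below are
# assumptions. Actor, Critic, ReplayBuffer, OUNoise and Experience must be provided
# by the project, and select_action expects self.scaler to be set (see the
# commented-out standard-scaler line in populate()).
env = gym.make("Pendulum-v1")          # any continuous-action Gym env

agent = DDPGHedgingAgent(
    env,
    memory_size=100_000,               # assumed replay-buffer capacity
    batch_size=128,                    # assumed batch size
    ou_noise_theta=0.15,               # common OU-noise defaults
    ou_noise_sigma=0.2,
    initial_random_episode=100,        # warm-up episodes with random actions
    name_cases="ddpg-hedging-demo",    # hypothetical wandb project name
)

agent.train(num_frames=50_000)         # number of environment steps
print("Test episode return:", agent.test())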
class DDPG:
    def __init__(self, n_state, n_action, a_limit, model_folder=None,
                 memory_size=10000, batch_size=32, tau=0.01, gamma=0.99,
                 var=3.0):
        # Record the parameters
        self.n_state = n_state
        self.n_action = n_action
        self.a_limit = a_limit
        self.memory_size = memory_size
        self.model_folder = model_folder
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.var = var

        # Create the networks and related objects
        self.memory = np.zeros(
            [self.memory_size, 2 * self.n_state + self.n_action + 1],
            dtype=np.float32)
        self.memory_counter = 0
        self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
        self.eval_critic = Critic(self.n_state, self.n_action)
        self.target_actor = Actor(self.n_state, self.n_action, self.a_limit,
                                  trainable=False)
        self.target_critic = Critic(self.n_state, self.n_action,
                                    trainable=False)

        self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
        self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
        self.criterion = nn.MSELoss()

        # Make sure the target-network parameters start out identical to the
        # evaluate networks
        self.hardCopy()

    def load(self):
        if os.path.exists(self.model_folder):
            self.eval_actor.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'actor.pth')))
            self.eval_critic.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'critic.pth')))
        self.hardCopy()

    def save(self):
        if not os.path.exists(self.model_folder):
            os.mkdir(self.model_folder)
        torch.save(self.eval_actor.state_dict(),
                   os.path.join(self.model_folder, 'actor.pth'))
        torch.save(self.eval_critic.state_dict(),
                   os.path.join(self.model_folder, 'critic.pth'))

    def chooseAction(self, s):
        """Given the input state, output a real-valued action in [-1, 1]
        through the evaluate actor."""
        s = to_var(s)
        a = self.eval_actor(s)
        a = a.cpu().data.numpy()
        if self.var > 0:
            # Add Gaussian exploration noise and clip to the valid action range
            a = np.clip(np.random.normal(a, self.var), -2, 2)
        return a

    def store_path(self, s, a, r, s_):
        """Store the information of one state transition."""
        transition = np.hstack((s, a, [r], s_))
        idx = self.memory_counter % self.memory_size
        self.memory[idx, :] = transition
        self.memory_counter += 1

    def softCopy(self):
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_((1.0 - self.tau) * ta.data + self.tau * ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_((1.0 - self.tau) * tc.data + self.tau * ec.data)

    def hardCopy(self):
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_(ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_(ec.data)

    def update(self):
        # Skip the update if too few transitions have been stored
        if self.memory_counter <= 5000:
            return

        # Softly copy the evaluate-network parameters into the target networks
        self.softCopy()

        # Decide which indices to sample for the batch
        if self.memory_counter > self.memory_size:
            sample_idx = np.random.choice(self.memory_size,
                                          size=self.batch_size)
        else:
            sample_idx = np.random.choice(self.memory_counter,
                                          size=self.batch_size)

        # Retrieve the training data from the replay memory
        batch_data = self.memory[sample_idx, :]
        batch_s = batch_data[:, :self.n_state]
        batch_a = batch_data[:, self.n_state:self.n_state + self.n_action]
        batch_r = batch_data[:, -self.n_state - 1:-self.n_state]
        batch_s_ = batch_data[:, -self.n_state:]

        # Convert to PyTorch tensors
        batch_s = to_var(batch_s)
        batch_a = to_var(batch_a)
        batch_r = to_var(batch_r)
        batch_s_ = to_var(batch_s_)

        # Compute the target Q value with the target networks
        next_q_target = self.target_critic(batch_s_,
                                           self.target_actor(batch_s_))
        q_target = batch_r + self.gamma * next_q_target

        # Update the critic
        self.critic_optimizer.zero_grad()
        q_batch = self.eval_critic(batch_s, batch_a)
        value_loss = F.mse_loss(input=q_batch, target=q_target)
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor
        self.actor_optimizer.zero_grad()
        policy_loss = -self.eval_critic(batch_s,
                                        self.eval_actor(batch_s)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Decay the scale of the random action exploration
        self.var *= .9995
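# A minimal, hypothetical training-loop sketch for the DDPG class above. The
# environment, episode count and checkpoint folder are illustrative assumptions:
# Pendulum-v1 is used because its +/-2 action range matches the clipping in
# chooseAction, and the old 4-tuple gym step API is assumed, as elsewhere in
# this file. Actor, Critic and to_var must be provided by the project.
env = gym.make("Pendulum-v1")
n_state = env.observation_space.shape[0]
n_action = env.action_space.shape[0]
a_limit = float(env.action_space.high[0])

ddpg = DDPG(n_state, n_action, a_limit, model_folder="./ddpg_ckpt")

for episode in range(200):                   # assumed number of episodes
    s = env.reset()
    episode_reward = 0.0
    while True:
        a = ddpg.chooseAction(s)
        s_, r, done, _ = env.step(a)
        ddpg.store_path(s, a, r, s_)         # store the transition
        ddpg.update()                        # no-op until 5000+ transitions are stored
        episode_reward += r
        s = s_
        if done:
            break
    print(f"episode {episode}: reward {episode_reward:.1f}, var {ddpg.var:.3f}")

ddpg.save()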