import gym
import numpy as np
import pkbar
import torch
import torch.nn.functional as F
import torch.optim as optim
import wandb
from typing import List, Tuple

# The Actor, Critic, ReplayBuffer, Experience and OUNoise helpers are assumed
# to be defined elsewhere in the project.


class DDPGHedgingAgent:
    """DDPG agent interacting with the environment.

    Attributes:
        env (gym.Env): OpenAI Gym environment
        actor (nn.Module): actor model to select actions
        actor_target (nn.Module): target actor model to predict next actions
        actor_optimizer (Optimizer): optimizer for training actor
        critic (nn.Module): critic model to predict state values
        critic_target (nn.Module): target critic model to predict state values
        critic_optimizer (Optimizer): optimizer for training critic
        memory (ReplayBuffer): replay memory to store transitions
        batch_size (int): batch size for sampling
        gamma (float): discount factor
        tau (float): parameter for soft target update
        initial_random_episode (int): number of random episodes used to pre-fill the buffer
        noise (OUNoise): noise generator for exploration
        device (torch.device): cpu / gpu
        transition (list): temporary storage for the most recent transition
        total_step (int): total step count
        is_test (bool): flag for the current mode (train / test)
    """

    def __init__(self,
                 env: gym.Env,
                 memory_size: int,
                 batch_size: int,
                 ou_noise_theta: float,
                 ou_noise_sigma: float,
                 gamma: float = 0.99,
                 tau: float = 5e-3,
                 initial_random_episode: int = 10_000,
                 name_cases: str = 'myproject'):
        """Initialize."""
        # Logger
        self.wandb = wandb.init(project=name_cases)

        obs_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]

        self.env = env
        self.memory = ReplayBuffer(memory_size)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.initial_random_episode = initial_random_episode

        # Exploration noise
        self.noise = OUNoise(
            action_dim,
            theta=ou_noise_theta,
            sigma=ou_noise_sigma,
        )

        # Device: cpu / gpu
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print(self.device)

        # Networks and their target copies
        self.actor = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target = Actor(obs_dim, action_dim).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())

        self.critic = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target = Critic(obs_dim + action_dim).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())

        # Optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=3e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=1e-3)

        # Transition to store in memory
        self.transition = list()

        # Total step count
        self.total_step = 0

        # Mode: train / test
        self.is_test = False

        self.populate(self.initial_random_episode)

    def populate(self, eps: int = 100) -> None:
        """Carry out several random episodes in the environment to initially
        fill up the replay buffer with experiences.

        Args:
            eps: number of random episodes to populate the buffer with
        """
        if not self.is_test:
            print("Populate Replay Buffer...")
            kbar = pkbar.Kbar(target=eps, width=20)
            state = self.env.reset()

            for i in range(eps):
                while True:
                    # Sample a random action from the action space and perturb it
                    selected_action = self.env.action_space.sample()
                    noise = self.noise.sample()
                    selected_action = np.clip(selected_action + noise, -1.0, 1.0)

                    next_state, reward, done, _ = self.env.step(selected_action)
                    self.transition = [
                        state, selected_action, reward, next_state, int(done)
                    ]
                    self.memory.append(Experience(*self.transition))

                    state = next_state
                    if done:
                        state = self.env.reset()
                        break
                kbar.add(1)

            # Fit the state scaler on the collected experiences; it is needed
            # by select_action() and update_model().
            self.scaler = self.memory.standar_scaler()

    @torch.no_grad()
    def select_action(self, state: np.ndarray) -> np.ndarray:
        """Select an action from the input state."""
        state_s = self.scaler.transform([state])
        selected_action = self.actor(
            torch.FloatTensor(state_s).to(self.device)).item()

        # Add noise for exploration during training
        if not self.is_test:
            noise = self.noise.sample()
            selected_action = np.clip(selected_action + noise, -1.0, 1.0)

        self.transition = [state, selected_action]
        return selected_action

    def step(self, action: np.ndarray) -> Tuple[np.ndarray, np.float64, bool]:
        """Take an action and return the response of the env."""
        next_state, reward, done, _ = self.env.step(action)

        if not self.is_test:
            self.transition += [reward, next_state, int(done)]
            self.memory.append(Experience(*self.transition))

        return next_state, reward, done

    def update_model(self) -> Tuple[torch.Tensor, torch.Tensor]:
        """Update the model by gradient descent.

        TODO: change the loss to a mean-variance objective.
        """
        device = self.device  # for shortening the following lines

        state, action, reward, next_state, done = self.memory.sample(
            self.batch_size, self.device)

        state = torch.FloatTensor(self.scaler.transform(state)).to(device)
        next_state = torch.FloatTensor(
            self.scaler.transform(next_state)).to(device)
        action = action.to(device)
        reward = reward.to(device)
        done = done.to(device)

        # Bootstrapped TD target computed with the target networks
        masks = 1 - done
        next_action = self.actor_target(next_state)
        next_value = self.critic_target(next_state, next_action)
        curr_return = reward.reshape(
            -1, 1) + self.gamma * next_value * masks.reshape(-1, 1)

        # Train the critic
        values = self.critic(state, action)
        critic_loss = F.mse_loss(values, curr_return)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Freeze the critic so you don't waste computational effort
        # computing gradients for it during the policy learning step.
        for p in self.critic.parameters():
            p.requires_grad = False

        # Train the actor
        q_values = self.critic(state, self.actor(state))
        actor_loss = -q_values.mean()
        # actor_loss = 0.5 * q_values.std() ** 2  # mean-variance alternative

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Unfreeze the critic for the next critic update
        for p in self.critic.parameters():
            p.requires_grad = True

        # Soft-update the target networks
        self._target_soft_update()

        return actor_loss.data, critic_loss.data

    def train(self, num_frames: int, plotting_interval: int = 200):
        """Train the agent."""
        self.is_test = False

        state = self.env.reset()
        actor_losses = []
        critic_losses = []
        scores = []
        score = 0

        print("Training...")
        kbar = pkbar.Kbar(target=num_frames, width=20)

        for self.total_step in range(1, num_frames + 1):
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

            # If the episode ends, log it and reset
            if done:
                state = self.env.reset()
                scores.append(score)
                score = 0
                self._plot(
                    self.total_step,
                    scores,
                    actor_losses,
                    critic_losses,
                )

            # If enough transitions are stored, train the networks
            if len(self.memory) >= self.batch_size:
                actor_loss, critic_loss = self.update_model()
                actor_losses.append(actor_loss)
                critic_losses.append(critic_loss)

            kbar.add(1)

        self.env.close()

    def test(self):
        """Test the agent."""
        self.is_test = True

        state = self.env.reset()
        done = False
        score = 0

        while not done:
            action = self.select_action(state)
            next_state, reward, done = self.step(action)

            state = next_state
            score += reward

        self.env.close()
        return score

    def _target_soft_update(self):
        """Soft-update: target = tau * local + (1 - tau) * target."""
        tau = self.tau

        for t_param, l_param in zip(self.actor_target.parameters(),
                                    self.actor.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

        for t_param, l_param in zip(self.critic_target.parameters(),
                                    self.critic.parameters()):
            t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)

    def _plot(
        self,
        frame_idx: int,
        scores: List[float],
        actor_losses: List[float],
        critic_losses: List[float],
    ):
        """Log the training progress to Weights & Biases."""
        log = {'frame': frame_idx, 'score': scores[-1]}
        # The loss lists may still be empty if no update has happened yet
        if actor_losses:
            log['actor_loss'] = actor_losses[-1]
        if critic_losses:
            log['critic_loss'] = critic_losses[-1]
        self.wandb.log(log)
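A minimal driver sketch for the agent above. This is only a sketch: the environment id, hyperparameter values, and frame count are placeholders, and it assumes an environment with a single action normalized to [-1, 1] plus the project's Actor / Critic / ReplayBuffer / OUNoise helpers.

# Hypothetical usage sketch; "HedgingEnv-v0" stands in for whatever Gym
# environment the agent is trained on, and all numbers are placeholders.
import gym

env = gym.make("HedgingEnv-v0")          # placeholder environment id
agent = DDPGHedgingAgent(
    env,
    memory_size=100_000,
    batch_size=64,
    ou_noise_theta=0.15,
    ou_noise_sigma=0.2,
    initial_random_episode=100,          # episodes used to pre-fill the buffer
    name_cases="ddpg-hedging-demo",
)
agent.train(num_frames=50_000)           # online training loop
print("test episode return:", agent.test())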
# Assumes the following imports and project-local helpers: os, time, numpy as np,
# torch, deepcopy (from copy), MPI (from mpi4py), the Actor, Critic, ReplayMemory,
# Normalizer, sync_networks and sync_grads helpers, and the NUM_EPOCHS, NUM_CYCLES,
# ROLLOUT_PER_WORKER, OPTIMIZATION_STEPS and NUM_TEST constants.


class DDPG:
    """Deep Deterministic Policy Gradient (DDPG) helper class with HER support."""

    def __init__(self, env, act_dim, state_dim, goal_dim, act_range,
                 buffer_size=int(1e6), gamma=0.98, lr=0.001, tau=0.95):
        """Initialization."""
        # Environment and DDPG parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = state_dim + goal_dim
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.env = env

        # Create the actor and critic networks and their target copies
        self.actor_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network = Actor(self.env_dim, act_dim, act_range)
        self.actor_target_network.load_state_dict(
            self.actor_network.state_dict())

        self.critic_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network = Critic(self.env_dim, act_dim, act_range)
        self.critic_target_network.load_state_dict(
            self.critic_network.state_dict())

        sync_networks(self.actor_network)
        sync_networks(self.critic_network)

        # Optimizers
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(),
                                            lr=lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(),
                                             lr=lr)

        # Replay buffer
        self.buffer = ReplayMemory(buffer_size)

        # Normalizers (normalized values are clipped to [-5, 5])
        self.goal_normalizer = Normalizer(goal_dim, default_clip_range=5)
        self.state_normalizer = Normalizer(state_dim, default_clip_range=5)

    def policy_action(self, s, g):
        """Use the actor to predict an action for a (state, goal) pair."""
        inputs = self.preprocess_inputs(s, g)
        return self.actor_network(inputs)

    def memorize(self, experiences):
        """Store experiences in the replay buffer."""
        for exp in experiences:
            self.buffer.push(exp)

    def sample_batch(self, batch_size):
        return deepcopy(self.buffer.sample(batch_size))

    def clip_states_goals(self, state, goal):
        state = np.clip(state, -200, 200)
        goal = np.clip(goal, -200, 200)
        return state, goal

    def preprocess_inputs(self, state, goal):
        """Normalize and concatenate state and goal."""
        # state, goal = self.clip_states_goals(state, goal)
        state_norm = self.state_normalizer.normalize(state)
        goal_norm = self.goal_normalizer.normalize(goal)
        inputs = np.concatenate([state_norm, goal_norm])
        return torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)

    def select_actions(self, pi):
        """Perturb the policy output with Gaussian noise and occasional uniform
        random actions for exploration."""
        action = pi.cpu().numpy().squeeze()
        # Add Gaussian noise
        action += 0.2 * self.act_range * np.random.randn(*action.shape)
        action = np.clip(action, -self.act_range, self.act_range)

        # With probability 0.3, replace the action with a uniformly random one
        random_actions = np.random.uniform(low=-self.act_range,
                                           high=self.act_range,
                                           size=self.act_dim)
        action += np.random.binomial(1, 0.3, 1)[0] * (random_actions - action)
        action = np.clip(action, -self.act_range, self.act_range)
        return action

    def update_network(self, batch_size):
        s, actions, rewards, ns, _, g = self.sample_batch(batch_size)

        # Clip and normalize the (state, goal) inputs
        states, goals = self.clip_states_goals(s, g)
        new_states, new_goals = self.clip_states_goals(ns, g)

        norm_states = self.state_normalizer.normalize(states)
        norm_goals = self.goal_normalizer.normalize(goals)
        inputs_norm = np.concatenate([norm_states, norm_goals], axis=1)

        norm_new_states = self.state_normalizer.normalize(new_states)
        norm_new_goals = self.goal_normalizer.normalize(new_goals)
        inputs_next_norm = np.concatenate([norm_new_states, norm_new_goals],
                                          axis=1)

        # To tensors
        inputs_norm_tensor = torch.tensor(inputs_norm, dtype=torch.float32)
        inputs_next_norm_tensor = torch.tensor(inputs_next_norm,
                                               dtype=torch.float32)
        actions_tensor = torch.tensor(actions, dtype=torch.float32)
        r_tensor = torch.tensor(rewards, dtype=torch.float32)

        with torch.no_grad():
            # TD target computed with the target networks
            actions_next = self.actor_target_network(inputs_next_norm_tensor)
            q_next_value = self.critic_target_network(inputs_next_norm_tensor,
                                                      actions_next)
            target_q_value = r_tensor + self.gamma * q_next_value
            # Clip the target: with sparse rewards in {-1, 0}, the return is
            # bounded below by -1 / (1 - gamma) and above by 0
            clip_return = 1 / (1 - self.gamma)
            target_q_value = torch.clamp(target_q_value, -clip_return, 0)

        # Critic loss
        real_q_value = self.critic_network(inputs_norm_tensor, actions_tensor)
        critic_loss = (target_q_value - real_q_value).pow(2).mean()

        # Actor loss, plus an L2 penalty on the scaled actions
        actions_real = self.actor_network(inputs_norm_tensor)
        actor_loss = -self.critic_network(inputs_norm_tensor,
                                          actions_real).mean()
        actor_loss += 1.0 * (actions_real / self.act_range).pow(2).mean()

        # Update the actor network
        self.actor_optim.zero_grad()
        actor_loss.backward()
        sync_grads(self.actor_network)
        self.actor_optim.step()

        # Update the critic network
        self.critic_optim.zero_grad()
        critic_loss.backward()
        sync_grads(self.critic_network)
        self.critic_optim.step()

    def soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_((1 - self.tau) * param.data +
                                    self.tau * target_param.data)

    def train(self, args):
        if MPI.COMM_WORLD.Get_rank() == 0:
            self.create_save_dir(args["save_dir"], args["env_name"],
                                 args["HER_strat"])

        success_rates = []
        for ep_num in range(NUM_EPOCHS):
            start = time.time()
            for _ in range(NUM_CYCLES):
                for _ in range(ROLLOUT_PER_WORKER):
                    # Reset the episode
                    observation = self.env.reset()
                    current_state = observation['observation']
                    goal = observation['desired_goal']
                    old_achieved_goal = observation['achieved_goal']
                    episode_exp = []
                    episode_exp_her = []

                    for _ in range(self.env._max_episode_steps):
                        if args['render']:
                            self.env.render()
                        with torch.no_grad():
                            pi = self.policy_action(current_state, goal)
                        action = self.select_actions(pi)
                        obs, reward, _, _ = self.env.step(action)
                        new_state = obs['observation']
                        new_achieved_goal = obs['achieved_goal']

                        # Add the transition to the episode buffer
                        episode_exp.append([
                            current_state, action, reward, new_state,
                            old_achieved_goal, goal
                        ])

                        if reward == 0:  # the sparse reward signals success
                            break
                        old_achieved_goal = new_achieved_goal
                        current_state = new_state

                    if args["HER_strat"] == "final":
                        experience = episode_exp[-1]
                        # Set g' to the achieved goal of the final transition
                        experience[-1] = np.copy(experience[-2])
                        # Recompute the reward of the relabelled transition
                        reward = self.env.compute_reward(
                            experience[-2], experience[-1], None)
                        experience[2] = reward
                        episode_exp_her.append(experience)

                    elif args["HER_strat"] in ["future", "episode"]:
                        # For each transition of the episode trajectory
                        for t in range(len(episode_exp)):
                            # Add K goals sampled from the same episode
                            for _ in range(args["HER_k"]):
                                if args["HER_strat"] == "future":
                                    # Select a future transition from the same episode
                                    selected = np.random.randint(
                                        t, len(episode_exp))
                                elif args["HER_strat"] == "episode":
                                    # Select any transition from the same episode
                                    selected = np.random.randint(
                                        0, len(episode_exp))
                                # Take the achieved goal of the selected transition
                                ag_selected = np.copy(episode_exp[selected][4])
                                s, a, _, ns, ag, _ = episode_exp[t]
                                r = self.env.compute_reward(
                                    ag_selected, ag, None)
                                # New transition where the selected achieved goal
                                # replaces the desired goal
                                her_transition = [s, a, r, ns, ag, ag_selected]
                                episode_exp_her.append(her_transition)

                    self.memorize(deepcopy(episode_exp))
                    self.memorize(deepcopy(episode_exp_her))
                    # Update the normalizers with the observations of this episode
                    self.update_normalizers(deepcopy(episode_exp),
                                            deepcopy(episode_exp_her))

                for _ in range(OPTIMIZATION_STEPS):
                    # Sample experience from the buffer and train the networks
                    self.update_network(args["batch_size"])

                # Soft update of the target networks
                self.soft_update_target_network(self.actor_target_network,
                                                self.actor_network)
                self.soft_update_target_network(self.critic_target_network,
                                                self.critic_network)

            success_rate = self.eval()
            success_rates.append(success_rate)
            if MPI.COMM_WORLD.Get_rank() == 0:
                print("Epoch:", ep_num + 1,
                      " -- success rate:", success_rates[-1],
                      " -- duration:", time.time() - start)
                torch.save([
                    self.state_normalizer.mean, self.state_normalizer.std,
                    self.goal_normalizer.mean, self.goal_normalizer.std,
                    self.actor_network.state_dict()
                ], self.model_path + '/model.pt')

        return success_rates

    def create_save_dir(self, save_dir, env_name, her_strat):
        # Build the directory tree in which the model is saved
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        subdir = os.path.join(save_dir, env_name)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        self.model_path = os.path.join(save_dir, env_name, her_strat)
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    def update_normalizers(self, episode_exp, episode_exp_her):
        """Update the state and goal normalizers with this episode's data."""
        episode_exp_states = np.vstack(np.array(episode_exp)[:, 0])
        episode_exp_goals = np.vstack(np.array(episode_exp)[:, 5])

        if len(episode_exp_her) != 0:
            episode_exp_her_states = np.vstack(np.array(episode_exp_her)[:, 0])
            episode_exp_her_goals = np.vstack(np.array(episode_exp_her)[:, 5])
            states = np.concatenate(
                [episode_exp_states, episode_exp_her_states])
            goals = np.concatenate([episode_exp_goals, episode_exp_her_goals])
        else:
            states = np.copy(episode_exp_states)
            goals = np.copy(episode_exp_goals)

        states, goals = self.clip_states_goals(states, goals)
        self.state_normalizer.update(deepcopy(states))
        self.goal_normalizer.update(deepcopy(goals))
        self.state_normalizer.recompute_stats()
        self.goal_normalizer.recompute_stats()

    def eval(self):
        """Evaluate the current policy and return the MPI-averaged success rate."""
        total_success_rate = []
        for _ in range(NUM_TEST):
            per_success_rate = []
            observation = self.env.reset()
            state = observation['observation']
            goal = observation['desired_goal']
            for _ in range(self.env._max_episode_steps):
                with torch.no_grad():
                    inputs = self.preprocess_inputs(state, goal)
                    pi = self.actor_network(inputs)
                action = pi.detach().cpu().numpy().squeeze()
                new_observation, _, _, info = self.env.step(action)
                state = new_observation['observation']
                per_success_rate.append(info['is_success'])
            total_success_rate.append(per_success_rate)

        total_success_rate = np.array(total_success_rate)
        local_success_rate = np.mean(total_success_rate[:, -1])
        global_success_rate = MPI.COMM_WORLD.allreduce(local_success_rate,
                                                       op=MPI.SUM)
        return global_success_rate / MPI.COMM_WORLD.Get_size()
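A sketch of how the HER-enabled class above might be driven, assuming a goal-based Gym robotics environment. The environment id and every value in the args dictionary are placeholders; only the dictionary keys are the ones train() actually reads.

# Hypothetical entry point; values are placeholders, keys match train()/eval().
import gym

env = gym.make("FetchReach-v1")          # placeholder goal-based environment
obs = env.reset()
agent = DDPG(env,
             act_dim=env.action_space.shape[0],
             state_dim=obs['observation'].shape[0],
             goal_dim=obs['desired_goal'].shape[0],
             act_range=env.action_space.high[0])
success_rates = agent.train({
    "save_dir": "saved_models",
    "env_name": "FetchReach-v1",
    "HER_strat": "future",               # one of "final", "future", "episode"
    "HER_k": 4,
    "batch_size": 256,
    "render": False,
})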
# Demo script: roll out the trained policy in the environment.
# Assumes `args` (with .env and .her_strat), DEMO_LENGHT, Actor and
# process_inputs are defined elsewhere in the project.
if __name__ == '__main__':
    # Load the saved model parameters and normalizer statistics
    model_path = 'saved_models/%s/%s/model.pt' % (args.env, args.her_strat)
    o_mean, o_std, g_mean, g_std, model = torch.load(
        model_path, map_location=lambda storage, loc: storage)

    # Create the environment
    env = gym.make(args.env)
    obs = env.reset()

    # Read the environment parameters
    act_dim = env.action_space.shape[0]
    env_dim = obs['observation'].shape[0] + obs['desired_goal'].shape[0]
    act_range = env.action_space.high[0]

    # Create the actor network and load the trained weights
    actor_network = Actor(env_dim, act_dim, act_range)
    actor_network.load_state_dict(model)
    actor_network.eval()

    for i in range(DEMO_LENGHT):
        observation = env.reset()
        # Start the demo episode
        obs = observation['observation']
        g = observation['desired_goal']
        for t in range(env._max_episode_steps):
            env.render()
            inputs = process_inputs(obs, g, o_mean, o_std, g_mean, g_std)
            with torch.no_grad():
                pi = actor_network(inputs)
            action = pi.detach().numpy().squeeze()
            # Put the action into the environment
            observation_new, reward, done, info = env.step(action)
            obs = observation_new['observation']
            if info['is_success']:
                # Stop the episode once the goal has been reached
                break
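The demo script calls process_inputs(), which is not part of this listing. Below is a sketch of what it presumably does, mirroring the clipping and normalization used during training; the clip values are assumptions.

# Assumed shape of the missing helper; clip_obs / clip_range are guesses that
# mirror the clip_states_goals() and default_clip_range=5 settings above.
import numpy as np
import torch


def process_inputs(o, g, o_mean, o_std, g_mean, g_std,
                   clip_obs=200, clip_range=5):
    # Clip the raw observation and goal, normalize them with the saved
    # statistics, clip again, and concatenate into a single network input.
    o = np.clip(o, -clip_obs, clip_obs)
    g = np.clip(g, -clip_obs, clip_obs)
    o_norm = np.clip((o - o_mean) / o_std, -clip_range, clip_range)
    g_norm = np.clip((g - g_mean) / g_std, -clip_range, clip_range)
    inputs = np.concatenate([o_norm, g_norm])
    return torch.tensor(inputs, dtype=torch.float32).unsqueeze(0)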
# Assumes: os, numpy as np, torch, torch.nn as nn, torch.nn.functional as F,
# Adam (from torch.optim), plus the project-local Actor, Critic and to_var helpers.


class DDPG:
    def __init__(self, n_state, n_action, a_limit, model_folder=None,
                 memory_size=10000, batch_size=32, tau=0.01, gamma=0.99,
                 var=3.0):
        # Record the parameters
        self.n_state = n_state
        self.n_action = n_action
        self.a_limit = a_limit
        self.memory_size = memory_size
        self.model_folder = model_folder
        self.batch_size = batch_size
        self.tau = tau
        self.gamma = gamma
        self.var = var

        # Create the replay memory, the networks and related objects
        self.memory = np.zeros(
            [self.memory_size, 2 * self.n_state + self.n_action + 1],
            dtype=np.float32)
        self.memory_counter = 0
        self.eval_actor = Actor(self.n_state, self.n_action, self.a_limit)
        self.eval_critic = Critic(self.n_state, self.n_action)
        self.target_actor = Actor(self.n_state, self.n_action, self.a_limit,
                                  trainable=False)
        self.target_critic = Critic(self.n_state, self.n_action,
                                    trainable=False)
        self.actor_optimizer = Adam(self.eval_actor.parameters(), lr=0.001)
        self.critic_optimizer = Adam(self.eval_critic.parameters(), lr=0.002)
        self.criterion = nn.MSELoss()

        # Make sure the target network parameters match the evaluation networks
        self.hardCopy()

    def load(self):
        if os.path.exists(self.model_folder):
            self.eval_actor.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'actor.pth')))
            self.eval_critic.load_state_dict(
                torch.load(os.path.join(self.model_folder, 'critic.pth')))
            self.hardCopy()

    def save(self):
        if not os.path.exists(self.model_folder):
            os.mkdir(self.model_folder)
        torch.save(self.eval_actor.state_dict(),
                   os.path.join(self.model_folder, 'actor.pth'))
        torch.save(self.eval_critic.state_dict(),
                   os.path.join(self.model_folder, 'critic.pth'))

    def chooseAction(self, s):
        """Given an input state, use the evaluation actor to output a
        real-valued action in [-1, 1] (scaled by the action limit)."""
        s = to_var(s)
        a = self.eval_actor(s)
        a = a.cpu().data.numpy()
        if self.var > 0:
            # Add Gaussian exploration noise and clip to the action range
            a = np.clip(np.random.normal(a, self.var), -2, 2)
        return a

    def store_path(self, s, a, r, s_):
        """Store one state transition in the replay memory."""
        transition = np.hstack((s, a, [r], s_))
        idx = self.memory_counter % self.memory_size
        self.memory[idx, :] = transition
        self.memory_counter += 1

    def softCopy(self):
        # Soft update: target = (1 - tau) * target + tau * eval
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_((1.0 - self.tau) * ta.data + self.tau * ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_((1.0 - self.tau) * tc.data + self.tau * ec.data)

    def hardCopy(self):
        # Hard copy: target <- eval
        for ta, ea in zip(self.target_actor.parameters(),
                          self.eval_actor.parameters()):
            ta.data.copy_(ea.data)
        for tc, ec in zip(self.target_critic.parameters(),
                          self.eval_critic.parameters()):
            tc.data.copy_(ec.data)

    def update(self):
        # Skip the update if too few transitions have been stored
        if self.memory_counter <= 5000:
            return

        # Softly copy the evaluation network parameters into the target networks
        self.softCopy()

        # Decide which indices form the training batch
        if self.memory_counter > self.memory_size:
            sample_idx = np.random.choice(self.memory_size,
                                          size=self.batch_size)
        else:
            sample_idx = np.random.choice(self.memory_counter,
                                          size=self.batch_size)

        # Slice the sampled transitions out of the replay memory
        batch_data = self.memory[sample_idx, :]
        batch_s = batch_data[:, :self.n_state]
        batch_a = batch_data[:, self.n_state:self.n_state + self.n_action]
        batch_r = batch_data[:, -self.n_state - 1:-self.n_state]
        batch_s_ = batch_data[:, -self.n_state:]

        # Convert to PyTorch tensors
        batch_s = to_var(batch_s)
        batch_a = to_var(batch_a)
        batch_r = to_var(batch_r)
        batch_s_ = to_var(batch_s_)

        # Compute the target Q value with the target networks
        next_q_target = self.target_critic(batch_s_,
                                           self.target_actor(batch_s_))
        q_target = batch_r + self.gamma * next_q_target

        # Update the critic
        self.critic_optimizer.zero_grad()
        q_batch = self.eval_critic(batch_s, batch_a)
        value_loss = F.mse_loss(input=q_batch, target=q_target)
        value_loss.backward()
        self.critic_optimizer.step()

        # Update the actor
        self.actor_optimizer.zero_grad()
        policy_loss = -self.eval_critic(batch_s,
                                        self.eval_actor(batch_s)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Decay the exploration noise scale
        self.var *= .9995
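A minimal training-loop sketch for the class above, assuming a Pendulum-style environment with a one-dimensional action in [-2, 2] and the project's to_var helper; the environment id, episode counts and folder name are placeholders.

# Hypothetical training loop; all ids and counts are placeholders.
import gym

env = gym.make("Pendulum-v1")
agent = DDPG(n_state=env.observation_space.shape[0],
             n_action=env.action_space.shape[0],
             a_limit=float(env.action_space.high[0]),
             model_folder="./ddpg_pendulum")

for episode in range(200):
    s = env.reset()
    for _ in range(200):
        a = agent.chooseAction(s)
        s_, r, done, _ = env.step(a)
        agent.store_path(s, a, r, s_)   # fill the flat numpy replay memory
        agent.update()                  # no-op until 5000 transitions are stored
        s = s_
        if done:
            break
agent.save()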