class PPO:
    def __init__(self,
                 env_id,
                 render=False,
                 num_process=4,
                 min_batch_size=2048,
                 lr_p=3e-4,
                 lr_v=3e-4,
                 gamma=0.99,
                 tau=0.95,
                 clip_epsilon=0.2,
                 ppo_epochs=10,
                 ppo_mini_batch_size=64,
                 seed=1,
                 model_path=None):
        self.env_id = env_id
        self.gamma = gamma
        self.tau = tau
        self.ppo_epochs = ppo_epochs
        self.ppo_mini_batch_size = ppo_mini_batch_size
        self.clip_epsilon = clip_epsilon
        self.render = render
        self.num_process = num_process
        self.lr_p = lr_p
        self.lr_v = lr_v
        self.min_batch_size = min_batch_size
        self.model_path = model_path
        self.seed = seed
        self._init_model()

    def _init_model(self):
        """init model from parameters"""
        self.env, env_continuous, self.num_states, self.num_actions = get_env_info(
            self.env_id)

        # seeding
        torch.manual_seed(self.seed)
        self.env.seed(self.seed)

        self.policy_net = Policy(self.num_states, self.num_actions).to(device)
        self.value_net = Value(self.num_states).to(device)
        self.running_state = ZFilter((self.num_states,), clip=5)

        if self.model_path:
            print("Loading saved model {}_ppo.p from {}/{}_ppo.p".format(
                self.env_id, self.model_path, self.env_id))
            # the checkpoint is a (policy_net, value_net, running_state) tuple,
            # matching what save() writes below
            self.policy_net, self.value_net, self.running_state = pickle.load(
                open('{}/{}_ppo.p'.format(self.model_path, self.env_id), "rb"))

        self.collector = MemoryCollector(self.env,
                                         self.policy_net,
                                         render=self.render,
                                         running_state=self.running_state,
                                         num_process=self.num_process)

        self.optimizer_p = optim.Adam(self.policy_net.parameters(), lr=self.lr_p)
        self.optimizer_v = optim.Adam(self.value_net.parameters(), lr=self.lr_v)

    def choose_action(self, state):
        """select action"""
        state = FLOAT(state).unsqueeze(0).to(device)
        with torch.no_grad():
            action, log_prob = self.policy_net.get_action_log_prob(state)
        action = action.cpu().numpy()[0]
        return action

    def eval(self, i_iter, render=False):
        """run one test episode with the current policy"""
        state = self.env.reset()
        test_reward = 0
        while True:
            if render:
                self.env.render()
            state = self.running_state(state)
            action = self.choose_action(state)
            state, reward, done, _ = self.env.step(action)
            test_reward += reward
            if done:
                break
        print(f"Iter: {i_iter}, test reward: {test_reward}")
        self.env.close()

    def learn(self, writer, i_iter):
        """learn model"""
        memory, log = self.collector.collect_samples(self.min_batch_size)

        print(
            f"Iter: {i_iter}, num steps: {log['num_steps']}, total reward: {log['total_reward']: .4f}, "
            f"min reward: {log['min_episode_reward']: .4f}, max reward: {log['max_episode_reward']: .4f}, "
            f"average reward: {log['avg_reward']: .4f}, sample time: {log['sample_time']: .4f}"
        )

        # record reward information
        writer.add_scalar("rewards/total_reward", log['total_reward'], i_iter)
        writer.add_scalar("rewards/average_reward", log['avg_reward'], i_iter)
        writer.add_scalar("rewards/min_reward", log['min_episode_reward'], i_iter)
        writer.add_scalar("rewards/max_reward", log['max_episode_reward'], i_iter)
        writer.add_scalar("rewards/num_steps", log['num_steps'], i_iter)

        # sample all items in memory
        # ('state', 'action', 'reward', 'next_state', 'mask', 'log_prob')
        batch, permuted_batch = memory.sample()
        batch_state = FLOAT(batch.state).to(device)
        batch_action = FLOAT(batch.action).to(device)
        batch_reward = FLOAT(batch.reward).to(device)
        batch_next_state = FLOAT(batch.next_state).to(device)
        batch_mask = FLOAT(batch.mask).to(device)
        batch_log_prob = FLOAT(batch.log_prob).to(device)

        with torch.no_grad():
            batch_value = self.value_net(batch_state)

        # GAE advantages and discounted returns
        batch_advantage, batch_return = estimate_advantages(
            batch_reward, batch_mask, batch_value, self.gamma, self.tau)

        alg_step_stats = {}
        if self.ppo_mini_batch_size:
            batch_size = batch_state.shape[0]
            mini_batch_num = int(math.ceil(batch_size / self.ppo_mini_batch_size))

            # update with mini-batches
            for _ in range(self.ppo_epochs):
                index = torch.randperm(batch_size)

                for i in range(mini_batch_num):
                    ind = index[slice(i * self.ppo_mini_batch_size,
                                      min(batch_size, (i + 1) * self.ppo_mini_batch_size))]
                    state, action, returns, advantages, old_log_pis = \
                        batch_state[ind], batch_action[ind], batch_return[ind], \
                        batch_advantage[ind], batch_log_prob[ind]

                    alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                              self.optimizer_p, self.optimizer_v, 1,
                                              state, action, returns, advantages,
                                              old_log_pis, self.clip_epsilon, 1e-3)
        else:
            # update with the full batch
            for _ in range(self.ppo_epochs):
                alg_step_stats = ppo_step(self.policy_net, self.value_net,
                                          self.optimizer_p, self.optimizer_v, 1,
                                          batch_state, batch_action, batch_return,
                                          batch_advantage, batch_log_prob,
                                          self.clip_epsilon, 1e-3)

        return alg_step_stats

    def save(self, save_path):
        """save model"""
        check_path(save_path)
        pickle.dump((self.policy_net, self.value_net, self.running_state),
                    open('{}/{}_ppo.p'.format(save_path, self.env_id), 'wb'))
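# A minimal training-loop sketch, not part of the original class. The environment
# id, iteration counts and output directory below are illustrative assumptions;
# the only interface taken from the code above is that learn() expects a writer
# with an add_scalar() method, e.g. a TensorBoard SummaryWriter.
if __name__ == '__main__':
    from torch.utils.tensorboard import SummaryWriter

    agent = PPO("Hopper-v2", num_process=4, min_batch_size=2048)
    writer = SummaryWriter()
    for i_iter in range(1, 501):
        agent.learn(writer, i_iter)     # collect min_batch_size steps, run one PPO update
        if i_iter % 50 == 0:
            agent.eval(i_iter)          # roll out a single test episode
            agent.save("./model_pkl")   # hypothetical output directory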
class Learner:
    def __init__(self, learning_rate=0.01, FILE="Model/goodPolicy.pth"):
        self.FILE = FILE
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.policy = Policy().to(self.device)
        self.policy.load_state_dict(torch.load(self.FILE))
        self.policy.eval()
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=self.learning_rate)

    def simulate(self, episode: int, policyPercent: float, show=False):
        """
        Simulate the CartPole process.

        :param episode: number of episodes to simulate
        :param policyPercent: probability of using the learned policy on a step
            (otherwise a random action is taken)
        :param show: render the environment while simulating
        :return: list of ([trajectory of actions], [trajectory of observations], totalReward)
        """
        env = gym.make('CartPole-v0')
        result = []
        for i_episode in range(episode):
            actions = []
            observations = []
            totalReward = 500  # kept at the cap if the episode never fails
            observation = env.reset()
            for t in range(500):
                if show:
                    env.render()
                # convert the numpy observation to a float32 tensor on the right device
                observationTensor = torch.tensor(observation, dtype=torch.float32).to(self.device)
                observations.append(observation.tolist())
                if random.random() <= policyPercent:  # mix the policy with random choices
                    with torch.no_grad():
                        action = torch.max(self.policy(observationTensor), 0)[1].item()  # 0 or 1
                else:
                    action = random.randint(0, 1)
                actions.append(action)
                observation, reward, done, info = env.step(action)
                if done:
                    totalReward = t + 1
                    # print(f"Episode finished after {t + 1} timesteps")
                    break
            result.append((actions, observations, totalReward))
        env.close()
        return result

    def trainPolicy(self, episodes, policyPercent=0.8):
        """Train the policy on trajectories that score above the average reward."""
        # First play several times to determine the average reward.
        trajectoriesForAvgRwd = self.simulate(20, 1)
        averageReward = sum([i[2] for i in trajectoriesForAvgRwd]) / len(trajectoriesForAvgRwd)
        print(averageReward)

        trajectoriesForTrain = self.simulate(episodes, policyPercent)
        for trainTrajectory in trajectoriesForTrain:
            if trainTrajectory[2] > averageReward:
                # forward
                predictAction = self.policy(torch.tensor(trainTrajectory[1]).to(self.device))
                loss = self.criterion(predictAction,
                                      torch.tensor(trainTrajectory[0]).to(self.device))
                # backward
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
        torch.save(self.policy.state_dict(), self.FILE)
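# A minimal usage sketch, not part of the original class. It assumes a pretrained
# checkpoint already exists at Model/goodPolicy.pth, since the Learner constructor
# loads it before any further training; the round and episode counts are arbitrary.
if __name__ == '__main__':
    learner = Learner(learning_rate=0.01)
    for _ in range(10):
        learner.trainPolicy(episodes=50, policyPercent=0.8)  # clone above-average trajectories
    learner.simulate(5, policyPercent=1.0, show=True)        # watch the policy with no random actions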
class Agent():
    def __init__(self, params):
        self.params = params
        self.__state_dim = params['state_dim']
        self.__action_dim = params['action_dim']
        self.__buffer_size = params['buffer_size']
        self.__batch_size = params['batch_size']
        self.__gamma = params['gamma']
        self.__tau = params['tau']
        self.__lr = params['lr']
        self.__update_every = params['update_every']
        eps = params['eps']
        eps_decay = params['eps_decay']
        min_eps = params['min_eps']
        seed = params['seed']
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

        # Q-Network (critic)
        critic_params = dict()
        critic_params['seed'] = seed
        critic_params['arch_params'] = params['arch_params_critic']
        self.critic_local = QNetwork(critic_params).to(device)
        self.critic_target = QNetwork(critic_params).to(device)
        self.optimizer_critic = optim.Adam(self.critic_local.parameters(), lr=self.__lr)

        # Policy (actor)
        actor_params = dict()
        actor_params['seed'] = seed
        actor_params['arch_params'] = params['arch_params_actor']
        actor_params['noise_type'] = params['noise_type']
        actor_params['eps'] = eps
        actor_params['eps_decay'] = eps_decay
        actor_params['min_eps'] = min_eps
        self.actor_local = Policy(actor_params).to(device)
        self.actor_target = Policy(actor_params).to(device)
        self.optimizer_actor = optim.Adam(self.actor_local.parameters(), lr=self.__lr)

        self.__memory = ReplayBuffer(self.__buffer_size, self.__batch_size)
        self.__t_step = 0

    def memorize_experience(self, state, action, reward, next_state, done):
        self.__memory.add(state, action, reward, next_state, done)
        self.__t_step = self.__t_step + 1

    def choose_action(self, state):
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        state = torch.from_numpy(state.astype(np.float32)).to(device)
        action, action_perturbed = self.actor_local(state)
        return action, action_perturbed

    def learn_from_past_experiences(self):
        if self.__t_step % self.__update_every == 0:
            if len(self.__memory) > self.__batch_size:
                experiences = self.__memory.sample()
                self.update_Qnet_and_policy(experiences)

    def update_Qnet_and_policy(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        # critic update: one-step TD target from the target networks
        next_actions, next_actions_perturbed = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, next_actions)
        # if done == True: second term is equal to 0
        Q_targets = rewards + self.__gamma * Q_targets_next * (1 - dones)
        Q_expected = self.critic_local(states, actions)

        loss_func = nn.MSELoss()
        loss_critic = loss_func(Q_expected, Q_targets.detach())
        self.optimizer_critic.zero_grad()
        loss_critic.backward()
        self.optimizer_critic.step()

        # actor update: new predicted actions, not the ones stored in the buffer
        predicted_actions, predicted_actions_perturbed = self.actor_local(states)

        if self.params['noise_type'] == 'parameter':
            # if the distance between predicted_actions and predicted_actions_perturbed
            # is too big (>= 0.3), shrink the parameter noise; otherwise grow it
            if (predicted_actions - predicted_actions_perturbed).pow(2).mean() >= 0.3:
                self.actor_local.eps /= 1.01
                self.actor_target.eps /= 1.01
            else:
                self.actor_local.eps *= 1.01
                self.actor_target.eps *= 1.01

        loss_actor = -self.critic_local(states, predicted_actions).mean()
        self.optimizer_actor.zero_grad()
        loss_actor.backward()
        self.optimizer_actor.step()

        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def update_eps(self):
        self.actor_local.eps = max(
            self.actor_local.eps * self.actor_local.eps_decay,
            self.actor_local.min_eps)
        self.actor_target.eps = max(
            self.actor_target.eps * self.actor_target.eps_decay,
            self.actor_target.min_eps)
    def soft_update(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.__tau * local_param.data +
                                    (1.0 - self.__tau) * target_param.data)

    def save_weights(self, save_to):
        actor_params = {
            'actor_params': self.actor_local.policy_params,
            'state_dict': self.actor_local.state_dict()
        }
        critic_params = {
            'critic_params': self.critic_local.qnet_params,
            'state_dict': self.critic_local.state_dict()
        }
        file = dict()
        file['critic_params'] = critic_params
        file['actor_params'] = actor_params
        torch.save(file, open(save_to, 'wb'))

    def load_weights(self, load_from):
        checkpoint = torch.load(load_from)
        qnet_params = checkpoint['critic_params']
        policy_params = checkpoint['actor_params']
        self.actor_local = Policy(policy_params['actor_params'])
        self.actor_local.load_state_dict(checkpoint['actor_params']['state_dict'])
        self.critic_local = QNetwork(qnet_params['critic_params'])
        self.critic_local.load_state_dict(checkpoint['critic_params']['state_dict'])
        return self
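# A minimal interaction-loop sketch, not part of the original class. The
# environment, the params dictionary values and the loop lengths are illustrative
# assumptions; in particular 'arch_params_actor' and 'arch_params_critic' depend
# on the Policy and QNetwork implementations, which are not shown here.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v0')
    params = {
        'state_dim': env.observation_space.shape[0],
        'action_dim': env.action_space.shape[0],
        'buffer_size': int(1e5), 'batch_size': 128,
        'gamma': 0.99, 'tau': 1e-3, 'lr': 1e-3, 'update_every': 1,
        'eps': 0.1, 'eps_decay': 0.999, 'min_eps': 0.01, 'seed': 0,
        'noise_type': 'parameter',
        'arch_params_actor': {},    # placeholder architecture spec
        'arch_params_critic': {},   # placeholder architecture spec
    }
    agent = Agent(params)
    for episode in range(200):
        state = env.reset()
        for t in range(300):
            # act with the perturbed (exploratory) action, store it, and learn
            _, action_perturbed = agent.choose_action(state)
            action_np = action_perturbed.detach().cpu().numpy()
            next_state, reward, done, _ = env.step(action_np)
            agent.memorize_experience(state, action_np, reward, next_state, done)
            agent.learn_from_past_experiences()
            state = next_state
            if done:
                break
        agent.update_eps()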
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = True


seed_everything(seed)

env = LunarLander()
policy = Policy(env.observation_dim, env.action_dim)
optimizer = optim.Adam(policy.parameters(), lr=lr)
eps = np.finfo(np.float32).eps.item()


def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item()


def finish_episode():
    R = 0
    policy_loss = []