class PPO(object):
    """Main PPO class"""

    def __init__(self, args):
        """Constructor initializing the attributes of the class from the parsed arguments."""
        self.args = args
        self.random_seed()

        # Check if a GPU is available via the CUDA driver
        self.use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.use_cuda else "cpu")

        # Initialize the actor-critic network
        self.actor_critic = ActorCritic(self.args.nb_states,
                                        self.args.nb_actions,
                                        self.args.hidden_layer_size).to(self.device)

        # Define the optimizer used for the optimization of the surrogate loss
        self.optimizer = self.args.optimizer(self.actor_critic.parameters(), self.args.lr)

        # For training, multiple instances of the env are needed (shoulder model)
        self.envs = [self.make_env() for _ in range(self.args.num_envs)]
        self.envs = SubprocVecEnv(self.envs)

        # To validate the intermediate learning process, one test env is needed
        self.env_test = self.args.env
        self.env_test.seed(self.args.seed)
        self.env_test.set_scaling(self.args.output_scaling)

        # Lists for TensorBoard to visualize the learning process during training
        self.test_rewards = []
        self.loss = []
        self.lr = []
        self.actor_grad_weight = []
        self.action_bang_bang = []
        self.lr.append(self.args.lr)

        # Output directory for dumped networks and TensorBoard logs (training mode only)
        if self.args.play is False:
            self.output_path = "trained_models" + '/PPO_{}'.format(
                datetime.now().strftime('%Y%b%d_%H%M%S')) + "/"
            os.mkdir(self.output_path)
            self.writer = SummaryWriter(self.output_path)
        # self.delta = (self.args.lr - self.args.lr_end) / 1e6

    def train(self):
        """Main training function"""
        frame_idx = 0
        state = self.envs.reset()
        mean_100_reward = -np.inf
        self.info()

        while frame_idx < self.args.max_frames:
            log_probs = []
            values = []
            states = []
            actions = []
            rewards = []
            masks = []
            entropy = self.args.entropy

            # Collect nb_steps transitions from the parallel environments
            for _ in range(self.args.nb_steps):
                state = torch.FloatTensor(state).to(self.device)
                dist, value = self.actor_critic(state)
                action = dist.sample()
                # Make sure the action is moved to the CPU (not GPU) before stepping the envs
                next_state, reward, done, _ = self.envs.step(action.cpu().numpy())
                log_prob = dist.log_prob(action)
                entropy += dist.entropy().mean()

                log_probs.append(log_prob)
                values.append(value)
                rewards.append(torch.FloatTensor(reward).unsqueeze(1).to(self.device))
                masks.append(torch.FloatTensor(1 - done).unsqueeze(1).to(self.device))
                states.append(state)
                actions.append(action)

                state = next_state
                frame_idx += 1
                # self.scheduler()

                # Evaluate the training process and write data to TensorBoard
                if frame_idx % 1000 == 0:
                    test_reward = np.mean(
                        [self.test_env(self.args.vis) for _ in range(10)])
                    self.test_rewards.append(test_reward)
                    if self.args.play is False:
                        print("Mean reward: ",
                              np.round(np.mean(self.test_rewards[-101:-1]), 0))
                        if mean_100_reward < np.round(np.mean(self.test_rewards[-101:-1]), 0):
                            mean_100_reward = np.round(np.mean(self.test_rewards[-101:-1]), 0)
                            self.save_network(mean_100_reward)
                        if len(self.test_rewards) >= 10:
                            self.writer.add_scalar(
                                'data/reward',
                                np.mean(self.test_rewards[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/ppo_loss',
                                np.mean(self.loss[-11:-1]),
                                frame_idx * self.args.num_envs)
                            self.writer.add_scalar(
                                'data/nb_actions_outside_range',
                                np.mean(self.action_bang_bang[-11:-1]),
                                frame_idx * self.args.num_envs)
                    # if test_reward > threshold_reward: early_stop = True

            next_state = torch.FloatTensor(next_state).to(self.device)
            _, next_value = self.actor_critic(next_state)
            returns = self.calc_gae(next_value, rewards, masks, values,
                                    self.args.gamma, self.args.tau)

            # detach() removes these tensors from the graph, i.e. they are
            # ignored during gradient calculation
            returns = torch.cat(returns).detach()
            log_probs = torch.cat(log_probs).detach()
            values = torch.cat(values).detach()
            states = torch.cat(states)
            actions = torch.cat(actions)
            advantage = returns - values

            self.ppo_update(self.args.ppo_epochs, self.args.mini_batch_size,
                            states, actions, log_probs, returns, advantage,
                            self.args.clip)

    def make_env(self):
        # Private trunk function for calling the SubprocVecEnv class
        def _trunk():
            # In this simple case the TestEnv() class is used (see OpenAI Gym for more envs)
            env = self.args.env
            env.seed(self.args.seed)
            env.set_scaling(self.args.output_scaling)
            return env

        return _trunk

    def test_env(self, vis=False):
        state = self.env_test.reset()
        if vis:
            self.env_test.render()
        done = False
        total_reward = 0
        action_bang_bang = 0
        step = 0
        while not done:
            step += 1
            state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
            dist, _ = self.actor_critic(state)
            action = dist.sample().cpu().numpy()[0]
            force = action * self.args.output_scaling
            next_state, reward, done, _ = self.env_test.step(action)
            # Count actions whose scaled force leaves the [-0.5, 0.5] range
            if force > 0.5 or force < -0.5:
                action_bang_bang += 1
            state = next_state
            if vis:
                self.env_test.render()
            total_reward += reward
        self.action_bang_bang.append(action_bang_bang / step)
        return total_reward

    # Plain functions, except that one can call them from an instance or from the class
    @staticmethod
    def calc_gae(next_value, rewards, masks, values, gamma=0.99, tau=0.95):
        values = values + [next_value]
        gae = 0
        returns = []
        for step in reversed(range(len(rewards))):
            delta = rewards[step] + gamma * values[step + 1] * masks[step] - values[step]
            gae = delta + gamma * tau * masks[step] * gae
            returns.insert(0, gae + values[step])
        return returns

    @staticmethod
    def ppo_iter(mini_batch_size, states, actions, log_probs, returns, advantage):
        batch_size = states.size(0)
        for _ in range(batch_size // mini_batch_size):
            rand_ids = np.random.randint(0, batch_size, mini_batch_size)
            yield (states[rand_ids, :], actions[rand_ids, :], log_probs[rand_ids, :],
                   returns[rand_ids, :], advantage[rand_ids, :])

    def ppo_update(self, ppo_epochs, mini_batch_size, states, actions, log_probs,
                   returns, advantages, clip_param=0.2):
        for _ in range(ppo_epochs):
            for state, action, old_log_probs, return_, advantage in self.ppo_iter(
                    mini_batch_size, states, actions, log_probs, returns, advantages):
                dist, value = self.actor_critic(state)
                entropy = dist.entropy().mean()
                new_log_probs = dist.log_prob(action)

                ratio = (new_log_probs - old_log_probs).exp()
                surr1 = ratio * advantage
                surr2 = torch.clamp(ratio, 1.0 - clip_param, 1.0 + clip_param) * advantage

                actor_loss = -torch.min(surr1, surr2).mean()
                critic_loss = (return_ - value).pow(2).mean()
                loss = 0.5 * critic_loss + actor_loss - 0.001 * entropy
                self.loss.append(loss.item())

                # Important step:
                self.optimizer.zero_grad()
                # pdb.set_trace()
                loss.backward()
                if self.args.grad_norm is not None:
                    nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                             self.args.grad_norm)
                self.optimizer.step()

    def save_network(self, reward):
        network_path = self.output_path + "/network" + str(reward)
        with open(network_path, "wb") as f:
            pickle.dump(self.actor_critic.state_dict(), f)

    def load_network(self, path):
        with open(path, "rb") as f:
            network_new = pickle.load(f)
        self.actor_critic.load_state_dict(network_new)

    def random_seed(self):
        torch.manual_seed(self.args.seed)
        random.seed(self.args.seed)
        np.random.seed(self.args.seed)

    def scheduler(self):
        # Linear learning-rate decay; relies on self.delta, which is currently
        # commented out in __init__ (this method is not called during training).
        for g in self.optimizer.param_groups:
            lr = g["lr"]
            if self.args.lr_end > lr:
                lr = self.args.lr_end
            else:
                lr -= self.delta
            self.lr.append(lr)
            g["lr"] = lr

    def info(self):
        fhandler = logging.FileHandler(filename=self.output_path + '/mylog.log', mode='a')
        logger.addHandler(fhandler)
        logger.info("--- INFO ---")
        logger.info("args: {}".format(self.args))
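# ---------------------------------------------------------------------------
# Usage sketch (illustration only, not part of the original file). It assumes
# an argparse-style namespace carrying the attributes that __init__ and
# train() read; the attribute values below (TestEnv, Adam, the numeric
# hyper-parameters) are assumptions chosen for this sketch, not values
# prescribed by the repository.
# ---------------------------------------------------------------------------
def example_training_run():
    import torch.optim as optim
    from argparse import Namespace

    args = Namespace(
        nb_states=4, nb_actions=1, hidden_layer_size=64,   # network sizes (assumed)
        optimizer=optim.Adam, lr=3e-4, lr_end=1e-5,        # optimizer class + learning rates
        num_envs=4, env=TestEnv(), seed=0,                 # TestEnv() is the repo's toy env (assumed ctor)
        output_scaling=1.0, play=False, vis=False,
        max_frames=100000, nb_steps=128, entropy=0.0,
        gamma=0.99, tau=0.95, ppo_epochs=4,
        mini_batch_size=64, clip=0.2, grad_norm=0.5,
    )
    agent = PPO(args)   # builds the actor-critic, the vectorized envs and the TensorBoard writer
    agent.train()       # rollout collection + GAE + clipped-surrogate updates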
                    help='rewards discount factor')
parser.add_argument('--entropy_weight', default=0.0001, type=float)
parser.add_argument('--alpha', default=0.95, type=float)
parser.add_argument('--type', default='notrpo', type=str, help='iftrpo')
parser.add_argument('--render', action='store_true', help='render')
args = parser.parse_args()
# print(args)

torch.manual_seed(args.seed)

env = gym.make("CartPole-v0")
replay_buffer = ReplayBuffer(args.capacity, args.max_episode_length)
model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda()
average_model = ActorCritic(env.observation_space.shape[0], env.action_space.n).cuda()
optimizer = optim.Adam(model.parameters())

frame_idx = 0
test_rewards = []
episode_count = 0
step_count = 0
state = env.reset()
running_rew = 0
plotcount = 0

while frame_idx < args.max_frames:
    policies = []
    average_policies = []
    actions = []
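# ---------------------------------------------------------------------------
# Sketch (not in the original file): the script above keeps an `average_model`
# next to the online `model`. In ACER the average policy network is usually
# refreshed after every optimizer step as a Polyak average of the online
# parameters, with --alpha as the smoothing factor. The helper name below is
# hypothetical; it only illustrates that update rule.
# ---------------------------------------------------------------------------
def update_average_model(model, average_model, alpha=0.95):
    """theta_avg <- alpha * theta_avg + (1 - alpha) * theta (no gradients needed)."""
    with torch.no_grad():
        for avg_p, p in zip(average_model.parameters(), model.parameters()):
            avg_p.mul_(alpha).add_(p, alpha=1.0 - alpha)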
class Learner(object):
    def __init__(self, opt, q_batch):
        self.opt = opt
        self.q_batch = q_batch
        self.network = ActorCritic(opt).to(device)
        self.optimizer = Adam(self.network.parameters(), lr=opt.lr)
        self.network.share_memory()

    def learning(self):
        torch.manual_seed(self.opt.seed)
        coef_hat = torch.Tensor([[self.opt.coef_hat]]).to(device)
        rho_hat = torch.Tensor([[self.opt.rho_hat]]).to(device)
        while True:
            # Batch of traces:
            # s[batch, n_step+1, 3, width, height]
            # a[batch, n_step, a_space]
            # rew[batch, n_step]
            # a_prob[batch, n_step, a_space]
            s, a, rew, prob = self.q_batch.get(block=True)

            ###########################
            # variables we need later #
            ###########################
            v, coef, rho, entropies, log_prob = [], [], [], [], []
            cx = torch.zeros(self.opt.batch_size, 256).to(device)
            hx = torch.zeros(self.opt.batch_size, 256).to(device)
            for step in range(s.size(1)):
                # value[batch]
                # logit[batch, 12]
                value, logit, (hx, cx) = self.network((s[:, step, ...], (hx, cx)))
                v.append(value)
                if step >= a.size(1):
                    # Note that s has n_step+1 entries but a only n_step;
                    # the loop runs once more because v at n_step+1 is needed.
                    break

                # π/μ[batch]
                # TODO: cumprod might produce runtime problems
                logit_a = a[:, step, :] * logit.detach() + \
                    (1 - a[:, step, :]) * (1 - logit.detach())
                prob_a = a[:, step, :] * prob[:, step, :] + \
                    (1 - a[:, step, :]) * (1 - prob[:, step, :])
                is_rate = torch.cumprod(logit_a / (prob_a + 1e-6), dim=1)[:, -1]
                coef.append(torch.min(coef_hat, is_rate))
                rho.append(torch.min(rho_hat, is_rate))

                # enpy_aspace[batch, 12]
                # Calculating the entropy[batch, 1]: there are [a_space] entropy
                # terms per batch element, summed over here.
                # Note: do *not* use detach here.
                enpy_aspace = -torch.log(logit) * logit - torch.log(1 - logit) * (1 - logit)
                enpy = enpy_aspace.sum(dim=1, keepdim=True)
                entropies.append(enpy)

                # Calculating the probability that the action is taken by the
                # target policy: prob_pi_a[batch, 12] and log_prob[batch, 1].
                # Note: do *not* use detach here.
                prob_pi_a = (a[:, step, :] * logit) + (1 - a[:, step, :]) * (1 - logit)
                log_prob_pi_a = torch.log(prob_pi_a).sum(dim=1, keepdim=True)
                log_prob.append(log_prob_pi_a)
                # prob_pi_a = torch.cumprod(prob_pi_a, dim=1)[:, -1:]
                # log_prob_pi_a = torch.log(prob_pi_a)

            ####################
            # calculating loss #
            ####################
            policy_loss = 0
            value_loss = 0
            # gae = torch.zeros(self.opt.batch_size, 1)
            for rev_step in reversed(range(s.size(1) - 1)):
                # Compute v_(s+1)[batch] for the policy gradient
                fix_vp = rew[:, rev_step] + self.opt.gamma * (v[rev_step + 1] + value_loss) - v[rev_step]

                # value_loss[batch]
                td = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                value_loss = self.opt.gamma * coef[rev_step] * value_loss + rho[rev_step] * td

                # policy_loss = policy_loss - log_probs[i] * Variable(gae)
                # The td must be detached from the value network:
                # delta_t[batch]
                # delta_t = rew[:, rev_step] + self.opt.gamma * v[rev_step + 1] - v[rev_step]
                # gae = gae * self.opt.gamma + delta_t.detach()
                policy_loss = policy_loss \
                    - rho[rev_step] * log_prob[rev_step] * fix_vp.detach() \
                    - self.opt.entropy_coef * entropies[rev_step]

            self.optimizer.zero_grad()
            policy_loss = policy_loss.sum()
            value_loss = value_loss.sum()
            loss = policy_loss + self.opt.value_loss_coef * value_loss
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.network.parameters(), self.opt.max_grad_norm)
            print("v_loss {:.3f} p_loss {:.3f}".format(value_loss.item(), policy_loss.item()))
            self.optimizer.step()
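# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original file): the clipped importance
# weights computed above follow the V-trace idea from IMPALA, i.e.
#   rho_t = min(rho_hat, pi(a_t|s_t) / mu(a_t|s_t))   (weights the TD error)
#   c_t   = min(c_hat,   pi(a_t|s_t) / mu(a_t|s_t))   (discounts the trace)
# The function name and the dummy inputs below are hypothetical; they only
# isolate that clipping step.
# ---------------------------------------------------------------------------
def clipped_importance_weights(pi_a, mu_a, rho_hat=1.0, c_hat=1.0, eps=1e-6):
    """Per-sample truncated importance ratios rho_t and c_t."""
    is_rate = pi_a / (mu_a + eps)             # pi / mu, shape [batch]
    rho = torch.clamp(is_rate, max=rho_hat)   # truncated weight for the TD / value target
    coef = torch.clamp(is_rate, max=c_hat)    # truncated trace coefficient
    return rho, coef

# Example with dummy per-step action probabilities under the target (pi) and
# behaviour (mu) policies:
# pi_a = torch.tensor([0.9, 0.2, 0.6])
# mu_a = torch.tensor([0.5, 0.4, 0.6])
# rho, coef = clipped_importance_weights(pi_a, mu_a, rho_hat=1.0, c_hat=1.0)
# -> rho == coef ≈ tensor([1.0, 0.5, 1.0])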