import time

import numpy as np
import tensorflow as tf


def pd_test(env_fn, policy, load_path):
    """Roll out one episode and print the policy's action distribution at each step."""
    env = env_fn()
    actions = env.unwrapped.action_list
    env._seed(int(time.time()))
    obs = env.reset()
    obs = np.expand_dims(obs, axis=0)
    action_list = []

    with tf.Session() as sess:
        # ActorCritic comes from the project's model code.
        actor_critic = ActorCritic(sess, policy, env.observation_space.shape,
                                   env.action_space, 1, 5)
        if load_path:
            actor_critic.load(load_path)
        else:
            sess.run(tf.global_variables_initializer())
            print('WARNING: No Model Loaded!')

        print(env.unwrapped.scramble_current)

        d = False
        while not d:
            print('-------------------------------------------------')
            print('Current Observation')
            env.render()

            a, v, neg = actor_critic.act(obs, stochastic=True)
            print('')
            print('action: ', actions[a[0]])
            print('value: ', v)
            print('neglogp: ', neg)
            print('pd: ')
            for ac, pd in zip(actions, actor_critic.step_model.logits(obs)[0][0]):
                print('\t', ac, pd)

            obs, r, d, _ = env.step(a[0])
            print('r: ', r)
            obs = np.expand_dims(obs, axis=0)
            env.render()

    env.close()
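# A minimal sketch of how pd_test might be invoked. The environment id
# ('cube-x2-v0'), the CnnPolicy class, and the checkpoint path are placeholders
# for whatever the surrounding project actually defines; any env exposing
# unwrapped.action_list and unwrapped.scramble_current would work.
import gym


def make_env():
    return gym.make('cube-x2-v0')  # hypothetical id, for illustration only


pd_test(make_env, CnnPolicy, load_path='./checkpoints/model.ckpt')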
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim


class PPO:
    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        # self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        self.policy_old = ActorCritic(state_dim, action_dim, action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        if np.any(np.isnan(state)):
            print('in select_action: state is nan', state)
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of the discounted returns:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalize the returns:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # Convert lists to tensors:
        old_states_ = torch.squeeze(torch.stack(memory.states).to(self.device)).detach()
        old_actions_ = torch.squeeze(torch.stack(memory.actions).to(self.device)).detach()
        old_logprobs_ = torch.squeeze(torch.stack(memory.logprobs)).to(self.device).detach()

        batch_size = old_states_.shape[0]
        mini_batch_size = batch_size // 8  # 64

        # Optimize the policy for K epochs over random mini-batches:
        for _ in range(self.K_epochs):
            for _ in range(batch_size // mini_batch_size):
                rand_ids = np.random.randint(0, batch_size, mini_batch_size)
                old_states = old_states_[rand_ids, :]
                old_actions = old_actions_[rand_ids, :]
                old_logprobs = old_logprobs_[rand_ids, :]
                rewards_batch = rewards[rand_ids]

                # Evaluate old actions and values:
                logprobs, state_values, dist_entropy = self.policy.evaluate(
                    old_states, old_actions)

                # Probability ratio (pi_theta / pi_theta_old):
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # Surrogate loss; the clipped term from the PPO paper,
                # torch.clamp(ratios, 1 - self.eps_clip, 1 + self.eps_clip),
                # is replaced here by a constant 1:
                advantages = rewards_batch - state_values.detach()
                advantages = advantages.reshape((advantages.shape[0], 1))
                surr1 = ratios * advantages
                surr2 = 1 * advantages
                surr = -torch.min(surr1, surr2).mean()

                # Actor loss plus (unweighted) critic loss; the entropy bonus
                # (- 0.01 * dist_entropy) is left out:
                w_crit_loss = 1
                loss = surr + w_crit_loss * (rewards_batch - state_values).pow(2).mean()

                # Take a gradient step:
                self.optimizer.zero_grad()
                loss.mean().backward()
                self.optimizer.step()

        # Copy the new weights into the old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
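# PPO.update expects a rollout buffer exposing states, actions, logprobs,
# rewards and is_terminals, and select_action assumes ActorCritic.act pushes
# states/actions/logprobs into that buffer itself. A minimal sketch of such a
# buffer and a driver loop; the Pendulum-v0 env and all hyperparameters below
# are illustrative assumptions, not values taken from the original code.
import gym
import torch


class Memory:
    """Rollout buffer with the fields PPO.update expects."""

    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
env = gym.make('Pendulum-v0')  # any continuous-action env
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

ppo = PPO(device, state_dim, action_dim, action_std=0.5, lr=3e-4,
          betas=(0.9, 0.999), gamma=0.99, K_epochs=10, eps_clip=0.2)
memory = Memory()

update_timestep = 4000  # environment steps between PPO updates
timestep = 0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        timestep += 1
        action = ppo.select_action(state, memory)  # act() fills states/actions/logprobs
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        if timestep % update_timestep == 0:
            ppo.update(memory)
            memory.clear()
            timestep = 0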
""" scaled = scaler.transform([state]) featurized = featurizer.transform(scaled) return featurized[0] ac = ActorCritic(featurize_state(observation_space).shape[0]) while True: done = False s = env.reset() reward = [] while not done: s = featurize_state(s) #env.render() action = ac.act(s) s_prime, r, done, _ = env.step([action]) reward.append(r) value_next = ac.value_estimate(featurize_state(s_prime)).detach() td_target = r + gamma * value_next td_error = td_target - ac.value_estimate(s).detach() ac.update(s, td_target, td_error, action) s = s_prime print('Avg reward:', np.mean(reward), np.max(reward)) ac.plot()