import numpy as np
import torch

import pong_utils

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995, epsilon=0.1, beta=0.01):
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    rewards = torch.tensor(rewards, dtype=torch.float, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # discounted cumulative reward
    R_future = discounted_future_rewards(rewards, discount)

    # subtract baseline (= mean of reward)
    R_mean = torch.mean(R_future)
    R_future -= R_mean

    ratio = new_probs / (old_probs + 1e-6)
    ratio_clamped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)

    # policy gradient maximization target: PPO takes the minimum of the
    # unclipped and clipped *objectives* (not of the ratios), so the clip
    # also binds when the advantage is negative
    surrogates = torch.min(ratio * R_future, ratio_clamped * R_future).mean()

    # include a regularization term:
    # this steers new_policy towards 0.5, which prevents the policy from
    # becoming exactly 0 or 1 and helps with exploration;
    # add 1.e-10 to avoid log(0), which gives nan
    # entropy = -(new_probs * torch.log(old_probs + 1.e-10)
    #             + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))
    # surrogates += torch.mean(beta * entropy)

    return surrogates
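# The helper `discounted_future_rewards` is called above but not defined in
# this snippet. A minimal sketch of what it presumably computes (an
# assumption, not the original helper), given `rewards` as a
# (steps, agents) float tensor:
def discounted_future_rewards(rewards, discount=0.995):
    """Per-step discounted future return: R_t = sum_{k >= t} discount**(k - t) * r_k."""
    R_future = torch.zeros_like(rewards)
    running = torch.zeros(rewards.shape[1], device=rewards.device)
    # walk backwards through time, accumulating the discounted tail
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + discount * running
        R_future[t] = running
    return R_future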
def surrogate(policy, old_probs, states, actions, rewards,
              discount=0.995, beta=0.01):
    discount = discount ** np.arange(len(rewards))
    rewards = np.asarray(rewards) * discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    # normalize rewards across agents at each time step
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    rewards_normalized = (rewards_future - mean[:, np.newaxis]) / std[:, np.newaxis]

    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # include a regularization term:
    # this steers new_policy towards 0.5, which prevents the policy from
    # becoming exactly 0 or 1 and helps with exploration;
    # add 1.e-10 to avoid log(0), which gives nan
    entropy = -(new_probs * torch.log(old_probs + 1.e-10)
                + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(torch.log(new_probs) * rewards + beta * entropy)
def clipped_surrogate_PPO(policy, old_probs, states, actions, rewards,
                          gamma=0.995, epsilon=0.1, beta=0.01):
    # number of parallel agents = number of trajectories
    num_trajectories = len(states[0])

    actions = torch.tensor(actions, dtype=torch.int8, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # discounted future rewards, normalized across agents
    rewards_future = get_future_rewards(rewards, num_trajectories, gamma)
    R = torch.tensor(reward_normalization(rewards_future)).float().to(device)

    # ratio for clipping
    old_probs = torch.tensor(old_probs).to(device)
    ratio = new_probs / old_probs

    # clipped surrogate: minimum of the unclipped and clipped objectives
    clip = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    clipped_surrogate = torch.min(ratio * R, clip * R)

    return torch.sum(clipped_surrogate) / num_trajectories
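# `get_future_rewards` and `reward_normalization` are used above (and again in
# `calculate_loss_REINFORCE` below) but not shown. A minimal sketch under the
# assumption that both take and return NumPy arrays of shape
# (steps, num_trajectories):
def get_future_rewards(rewards, num_trajectories, gamma):
    """Discounted future reward at every step."""
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1, num_trajectories)
    discounted = rewards * (gamma ** np.arange(len(rewards)))[:, np.newaxis]
    # a reversed cumulative sum turns per-step rewards into future returns
    return np.ascontiguousarray(discounted[::-1].cumsum(axis=0)[::-1])


def reward_normalization(rewards_future):
    """Normalize across agents at each time step (zero mean, unit std)."""
    mean = rewards_future.mean(axis=1, keepdims=True)
    std = rewards_future.std(axis=1, keepdims=True) + 1.0e-10
    return (rewards_future - mean) / std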
def surrogate(policy, old_probs, states, actions, rewards,
              discount=0.995, beta=0.01, epsilon=0.1, use_ppo_clip=False):
    norm_rewards = disc_rewards(rewards, discount)

    t_actions = torch.tensor(actions, dtype=torch.int8, device=device)
    t_old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    t_norm_rewards = torch.tensor(norm_rewards, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    t_new_probs = pong_utils.states_to_prob(policy, states)
    t_new_probs = torch.where(t_actions == pong_utils.RIGHT, t_new_probs, 1.0 - t_new_probs)

    # we could work with either log(t_new_probs) or the ratio
    # t_new_probs / t_old_probs directly (t_old_probs is fixed);
    # only t_new_probs is differentiable
    t_rap = t_new_probs / t_old_probs

    if use_ppo_clip:
        t_rap_clip = torch.clamp(t_rap, 1 - epsilon, 1 + epsilon)
        t_main_loss = torch.min(t_norm_rewards * t_rap, t_norm_rewards * t_rap_clip)
    else:
        t_main_loss = t_norm_rewards * t_rap

    # include a regularization term:
    # this steers new_policy towards 0.5, which prevents the policy from
    # becoming exactly 0 or 1 and helps with exploration;
    # add 1.e-10 to avoid log(0), which gives nan
    entropy = -(t_new_probs * torch.log(t_old_probs + 1.e-10)
                + (1.0 - t_new_probs) * torch.log(1.0 - t_old_probs + 1.e-10))

    return torch.mean(t_main_loss + beta * entropy)
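# `disc_rewards` is likewise not shown. Judging from its name and how its
# output is used, a plausible sketch (an assumption) that mirrors the inline
# NumPy computation in the other `surrogate` variants:
def disc_rewards(rewards, discount):
    """Discounted future rewards, normalized across agents per time step."""
    rewards = np.asarray(rewards)
    rewards = rewards * (discount ** np.arange(len(rewards)))[:, np.newaxis]
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    return (rewards_future - mean[:, np.newaxis]) / std[:, np.newaxis]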
def calculate_loss_REINFORCE(policy, old_probs, states, actions, rewards,
                             gamma=0.995, beta=0.01):
    # credit assignment: only future rewards count, discounted with gamma;
    # noise reduction: rewards are normalized across agents

    # number of parallel agents = number of trajectories
    num_trajectories = len(states[0])

    # actions: 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5
    actions = torch.tensor(actions, dtype=torch.int8, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # rewards: discounted future rewards, then normalized
    # rewards_future = get_future_rewards_recursive(rewards, gamma)  # recursive alternative
    rewards_future = get_future_rewards(rewards, num_trajectories, gamma)
    R_np = reward_normalization(rewards_future)
    with torch.no_grad():
        # move the (fixed) reward tensor to the same device as the policy
        R = torch.from_numpy(R_np).float().to(device)

    # policy objective: log-prob times reward, element-wise per step
    policy_loss = []
    for i, prob in enumerate(new_probs):
        log_prob = torch.log(prob)
        policy_loss.append(torch.mul(log_prob, R[i]))
    policy_loss = torch.cat(policy_loss)  # concatenate into a single 1-D tensor
    policy_loss = policy_loss.sum(dim=0)  # sum all values
    policy_loss /= num_trajectories  # gradient estimate: divide by the number of trajectories

    return policy_loss
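# Note that `calculate_loss_REINFORCE`, like most of the functions above,
# returns the objective to be *maximized*, so a training step minimizes its
# negation. A hypothetical step, assuming an `optimizer` built over
# policy.parameters() and a collected batch (old_probs, states, actions, rewards):
L = -calculate_loss_REINFORCE(policy, old_probs, states, actions, rewards)
optimizer.zero_grad()
L.backward()
optimizer.step()
del L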
def surrogate(policy, old_probs, states, actions, rewards,
              discount=0.995, beta=0.01):
    discount = discount ** np.arange(len(rewards))
    rewards = np.asarray(rewards) * discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    # normalize rewards across agents at each time step
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    rewards_normalized = (rewards_future - mean[:, np.newaxis]) / std[:, np.newaxis]

    # convert everything into pytorch tensors and move to gpu if available
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    # old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)
    log_probs_new = torch.log(new_probs)

    # REINFORCE loss: negated log-prob-times-reward objective
    log_prob_actions_v = rewards * log_probs_new
    loss_policy_v = -log_prob_actions_v.mean()

    # entropy bonus, subtracted from the loss to encourage exploration
    entropy_v = -(new_probs * log_probs_new).sum(dim=1).mean()
    entropy_loss_v = -beta * entropy_v

    loss_v = loss_policy_v + entropy_loss_v
    return loss_v
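# This variant, by contrast, already returns a loss (the negated objective
# plus an entropy penalty), so it is minimized directly. Hypothetical step,
# same assumed names as above:
loss = surrogate(policy, old_probs, states, actions, rewards)
optimizer.zero_grad()
loss.backward()
optimizer.step()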
def surrogate(policy, old_probs, states, actions, rewards,
              discount=0.995, beta=0.01, epsilon=0.1):
    discount = discount ** np.arange(len(rewards))
    rewards = np.asarray(rewards) * discount[:, np.newaxis]

    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]

    # normalize rewards across agents at each time step
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    rewards_normalized = (rewards_future - mean[:, np.newaxis]) / std[:, np.newaxis]

    # convert everything into pytorch tensors and move to gpu if available
    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)
    rewards = torch.tensor(rewards_normalized, dtype=torch.float, device=device)

    # convert states to policy (or probability)
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # clipped surrogate: minimum of the unclipped and clipped objectives
    reweighting_factor = new_probs / old_probs
    clipped = torch.clamp(reweighting_factor, 1 - epsilon, 1 + epsilon)
    clipped_surrogate = torch.min(reweighting_factor * rewards, clipped * rewards)

    # entropy regularization, steering the policy away from exactly 0 or 1
    entropy = -(new_probs * torch.log(old_probs + 1.e-10)
                + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(clipped_surrogate + beta * entropy)
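# The point of the clipped surrogate is that one batch of trajectories can be
# reused for several gradient steps while `old_probs` stays fixed. A sketch of
# that outer loop; `pong_utils.collect_trajectories`, `envs`, `SGD_epoch`, and
# the decay schedule are assumptions modeled on the usual Pong setup:
old_probs, states, actions, rewards = pong_utils.collect_trajectories(envs, policy, tmax=320)

for _ in range(SGD_epoch):
    # gradient ascent on the surrogate = gradient descent on its negation
    L = -surrogate(policy, old_probs, states, actions, rewards,
                   epsilon=epsilon, beta=beta)
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
    del L

# common schedule: tighten the clip and decay the entropy bonus over training
epsilon *= 0.999
beta *= 0.995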