import numpy as np
import torch
import torch.optim as optim
import gym
import time

import pong_utils

# check which device is being used.
# I recommend disabling the gpu until you've made sure that the code runs
device = pong_utils.device
print("using device: ", device)

# PongDeterministic does not contain random frameskip,
# so it is faster to train than the vanilla Pong-v4 environment
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

policy = pong_utils.Policy().to(device)

# we use the Adam optimizer with learning rate 1e-4
# optim.SGD is also possible
optimizer = optim.Adam(policy.parameters(), lr=1e-4)


def discounted_future_rewards(rewards, ratio=0.999):
    # rewards is expected to be a tensor of shape (n_trajectories, n_steps)
    n = rewards.shape[1]

    # step[i, j] = i - j; the discount matrix is lower triangular,
    # with discount[i, j] = ratio**(i - j) for i >= j and 0 otherwise
    step = torch.arange(n)[:, None] - torch.arange(n)[None, :]
    ones = torch.ones_like(step)
    zeros = torch.zeros_like(step)
    target = torch.where(step >= 0, ones, zeros)
    step = torch.where(step >= 0, step, zeros)
    discount = target * (ratio**step)

    # multiplying the rewards by the discount matrix gives, for every time
    # step, the discounted sum of all future rewards in that trajectory
    discounted_rewards = rewards @ discount.to(rewards.dtype).to(rewards.device)
    return discounted_rewards
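
# Optional sanity check for discounted_future_rewards: compare the
# matrix-based result against a naive per-step loop on a small random batch.
# The helper name and the 2 x 5 shape below are arbitrary, for illustration only.
def _check_discounted_future_rewards(ratio=0.999):
    rewards = torch.randn(2, 5)
    fast = discounted_future_rewards(rewards, ratio=ratio)

    slow = torch.zeros_like(rewards)
    for t in range(rewards.shape[1]):
        for k in range(t, rewards.shape[1]):
            slow[:, t] += (ratio ** (k - t)) * rewards[:, k]

    assert torch.allclose(fast, slow, atol=1e-5)
    return fast

# _check_discounted_future_rewards()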
def main(loop):
    beta = .01
    tmax = int(250 / Env.vw)
    SGD_epoch = 4
    epsilon = 0.1
    episode = 500

    # Env is an external helper (not defined in this snippet) that builds
    # the batch of parallel Pong environments
    envs = Env.envs()

    # check which device is being used.
    # I recommend disabling the gpu until you've made sure that the code runs
    device = pong_utils.device
    print("using device: ", device)

    # keep track of progress
    mean_rewards = []

    policy = pong_utils.Policy().to(device)

    # we use the Adam optimizer with learning rate 1e-4
    # optim.SGD is also possible
    optimizer = optim.Adam(policy.parameters(), lr=1e-4)

    for e in range(episode):

        # collect trajectories
        old_probs, states, actions, rewards = \
            pong_utils.collect_trajectories(envs, policy, tmax=tmax)

        total_rewards = np.sum(rewards, axis=0)

        # gradient ascent step
        for _ in range(SGD_epoch):

            # uncomment to utilize your own clipped function!
            # L = -clipped_surrogate(policy, old_probs, states, actions,
            #                        rewards, epsilon=epsilon, beta=beta)
            L = -pong_utils.clipped_surrogate(policy, old_probs, states, actions,
                                              rewards, epsilon=epsilon, beta=beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            del L

        # the clipping parameter reduces as time goes on
        epsilon *= .999

        # the regularization term also reduces;
        # this reduces exploration in later runs
        beta *= .995

        # get the average reward of the parallel environments
        mean_rewards.append(np.mean(total_rewards))

        # display some progress every 20 iterations
        if (e + 1) % 20 == 0:
            print("Episode: {0:d}, score: {1:f}".format(
                e + 1, np.mean(total_rewards)))
            print(total_rewards)

    env = envs.ps[0]

    mean_rewards = np.array(mean_rewards)
    np.savetxt('data_{}.csv'.format(loop), mean_rewards, newline='\n')
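
# Below is a minimal sketch of the clipped-surrogate objective that the
# commented-out line inside main() refers to ("utilize your own clipped
# function"). It assumes the pong_utils conventions: old_probs, actions and
# rewards are per-step lists with one entry per parallel environment,
# pong_utils.states_to_prob(policy, states) returns the probability of
# moving RIGHT for every state, and pong_utils.RIGHT is the matching action
# id. Treat those names as assumptions and adapt them if your utilities differ.
def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995, epsilon=0.1, beta=0.01):
    device = pong_utils.device

    # discounted future rewards for every time step, computed backwards in time
    rewards = np.asarray(rewards, dtype=np.float32)
    future_rewards = np.zeros_like(rewards)
    running = np.zeros(rewards.shape[1], dtype=np.float32)
    for t in reversed(range(rewards.shape[0])):
        running = rewards[t] + discount * running
        future_rewards[t] = running

    # normalize across the parallel environments to reduce variance
    mean = future_rewards.mean(axis=1, keepdims=True)
    std = future_rewards.std(axis=1, keepdims=True) + 1.0e-10
    advantages = torch.tensor((future_rewards - mean) / std,
                              dtype=torch.float, device=device)

    actions = torch.tensor(np.asarray(actions), dtype=torch.long, device=device)
    old_probs = torch.tensor(np.asarray(old_probs), dtype=torch.float, device=device)

    # probability of the actions that were actually taken, under the current policy
    new_probs = pong_utils.states_to_prob(policy, states)   # assumed helper
    new_probs = torch.where(actions == pong_utils.RIGHT,    # assumed constant
                            new_probs, 1.0 - new_probs)

    # PPO clipped ratio: keep the pessimistic (minimum) surrogate
    ratio = new_probs / old_probs
    clipped_ratio = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    surrogate = torch.min(ratio * advantages, clipped_ratio * advantages)

    # entropy-like bonus that discourages the policy from collapsing too early
    entropy = -(new_probs * torch.log(old_probs + 1.0e-10) +
                (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.0e-10))

    return torch.mean(surrogate + beta * entropy)

# To try it, swap this function in for pong_utils.clipped_surrogate inside
# the SGD_epoch loop of main().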