                                   epsilon=epsilon, beta=beta)
    optimizer.zero_grad()
    L.backward()
    optimizer.step()
    del L

    # the clipping parameter reduces as time goes on
    epsilon *= .999

    # the regularization term also reduces
    # this reduces exploration in later runs
    beta *= .995

    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))

    # display some progress every 20 iterations
    if (e + 1) % 20 == 0:
        print("Episode: {0:d}, score: {1:f}".format(e + 1, np.mean(total_rewards)))
        print(total_rewards)

    # update progress widget bar
    timer.update(e + 1)

timer.finish()

pong_utils.play(env, policy, time=200)

torch.save(policy, 'PPO.policy')
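The loss L minimized above is a negated PPO surrogate; only the closing epsilon=epsilon, beta=beta) arguments of that call are visible here. The sketch below shows one common way a clipped-surrogate objective with a beta-weighted regularization term can be written. The function name, signature, and the assumption that probabilities and rewards arrive as (T, n_envs) arrays are illustrative, not taken from pong_utils.

import numpy as np
import torch

def clipped_surrogate_loss(old_probs, new_probs, rewards,
                           discount=0.995, epsilon=0.1, beta=0.01):
    """Illustrative PPO clipped-surrogate objective (maximized, so negate before minimizing).

    old_probs, new_probs: (T, n_envs) probabilities of the actions actually taken,
    under the old (behavior) policy and the current policy respectively.
    rewards: (T, n_envs) per-step rewards from the parallel environments.
    """
    # discounted rewards-to-go, normalized across the parallel environments
    rewards = np.asarray(rewards, dtype=np.float32)
    discounts = discount ** np.arange(len(rewards))
    rewards_future = (rewards * discounts[:, None])[::-1].cumsum(axis=0)[::-1] / discounts[:, None]
    mean = rewards_future.mean(axis=1, keepdims=True)
    std = rewards_future.std(axis=1, keepdims=True) + 1e-10
    advantages = torch.from_numpy(((rewards_future - mean) / std).astype(np.float32))

    old_probs = torch.as_tensor(old_probs, dtype=torch.float32)
    new_probs = torch.as_tensor(new_probs, dtype=torch.float32)

    # clipped probability ratio: the core of the PPO objective
    ratio = new_probs / (old_probs + 1e-10)
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    surrogate = torch.min(ratio * advantages, clipped * advantages)

    # entropy-like regularization term, weighted by beta, to keep exploration alive
    entropy = -(new_probs * torch.log(old_probs + 1e-10)
                + (1.0 - new_probs) * torch.log(1.0 - old_probs + 1e-10))

    return torch.mean(surrogate + beta * entropy)

In a loop like the one above, one would minimize L = -clipped_surrogate_loss(...), which matches the L.backward() / optimizer.step() pattern, and decay epsilon and beta between iterations exactly as shown.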
import pong_utils
device = pong_utils.device
print("using device: ", device)

import gym

env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())

# The actions 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5 make the game restart if done

import matplotlib.pyplot as plt

from agent import Policy

agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs, agent, tmax=100)
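Before wiring the collected trajectories into a surrogate loss, a quick sanity check of what collect_trajectories returns is useful. The layout assumed below (one list entry per time step, with one value per parallel environment) is an assumption about the helper, so adjust if it returns something different.

import numpy as np

# assumed layout: each returned list has one entry per time step (<= tmax),
# and each entry holds one value per parallel environment (n=4 here)
print("steps collected:", len(reward))
print("probs per step:", np.shape(prob[0]))
print("rewards per step:", np.shape(reward[0]))

# total (undiscounted) score obtained by each of the 4 parallel environments
total_rewards = np.sum(np.asarray(reward, dtype=np.float32), axis=0)
print("total rewards per environment:", total_rewards)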