Example #1
                                          epsilon=epsilon,
                                          beta=beta)
        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L

    # the clipping parameter reduces as time goes on
    epsilon *= .999

    # the regularization term also reduces
    # this reduces exploration in later runs
    beta *= .995

    # get the average reward of the parallel environments
    mean_rewards.append(np.mean(total_rewards))

    # display some progress every 20 iterations
    if (e + 1) % 20 == 0:
        print("Episode: {0:d}, score: {1:f}".format(e + 1,
                                                    np.mean(total_rewards)))
        print(total_rewards)

    # update progress widget bar
    timer.update(e + 1)

timer.finish()

pong_utils.play(env, policy, time=200)
torch.save(policy, 'PPO.policy')
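Because torch.save above pickles the whole policy object, it can be reloaded later for evaluation. A minimal sketch of restoring it, assuming the Policy class is still importable and env / pong_utils are set up as in Example #2:

import torch
import pong_utils

# torch.save pickled the full module, so the Policy class must be importable
# from the same location when loading; on recent PyTorch versions loading a
# full object may additionally require passing weights_only=False
policy = torch.load('PPO.policy')
policy.eval()

# replay the reloaded policy in the environment
pong_utils.play(env, policy, time=200)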
Example #2
import pong_utils
device = pong_utils.device
print("using device: ", device)

import gym
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())
# The actions 'RIGHTFIRE' = 4 and 'LEFTFIRE" = 5 makes the game restarts if done

import matplotlib.pyplot as plt

from agent import Policy
agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs,
                                                              agent,
                                                              tmax=100)
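Each value returned by collect_trajectories is assumed to be a list with one entry per collected time step, where every entry covers all 4 parallel environments. A rough, purely illustrative sketch of summarising the collected rewards:

import numpy as np

# reward is assumed to hold, for each time step, one reward per parallel env
rewards = np.asarray(reward)              # shape: (steps, n_envs)
print("steps collected:", rewards.shape[0])
print("total reward per env:", rewards.sum(axis=0))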