Example #1
import numpy as np
import torch
import torch.optim as optim
import gym
import time

import pong_utils  # helper module providing device, Policy, collect_trajectories, clipped_surrogate

# check which device is being used.
# I recommend disabling the GPU until you've made sure that the code runs
device = pong_utils.device
print("using device: ", device)

# PongDeterministic does not contain random frameskip
# so it is faster to train than the vanilla Pong-v4 environment
env = gym.make('PongDeterministic-v4')
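# (in the classic gym Atari registrations the Deterministic variants use a
#  fixed frameskip of 4, while plain Pong-v4 samples its frameskip at random;
#  adjust the id if your gym/ALE version names these environments differently)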

print("List of available actions: ", env.unwrapped.get_action_meanings())

policy = pong_utils.Policy().to(device)

# we use the Adam optimizer with learning rate 1e-4
# optim.SGD is also possible (see the example below)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)
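# a plain-SGD alternative would look like this (hypothetical settings, not tuned):
# optimizer = optim.SGD(policy.parameters(), lr=1e-4, momentum=0.9)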


def discounted_future_rewards(rewards, ratio=0.999):
    """Discounted future rewards R_t = sum_{k >= t} ratio**(k - t) * r_k,
    computed for a float tensor `rewards` of shape (n_envs, n_steps).
    """
    n = rewards.shape[1]
    step = torch.arange(n)[:, None] - torch.arange(n)[None, :]
    ones = torch.ones_like(step)
    zeros = torch.zeros_like(step)

    # lower-triangular discount matrix: discount[i, j] = ratio**(i - j) for i >= j
    target = torch.where(step >= 0, ones, zeros)
    step = torch.where(step >= 0, step, zeros)
    discount = target * (ratio ** step)

    # the matrix product sums the discounted future rewards for every time step
    return torch.mm(rewards, discount.to(rewards))
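

# ----------------------------------------------------------------------
# A minimal, hand-rolled sketch of the PPO clipped surrogate, in case you
# want to swap it in for pong_utils.clipped_surrogate inside main() below.
# It is written against the shapes produced by pong_utils.collect_trajectories
# (time-major: one row per step, one column per parallel environment) and it
# ASSUMES a helper pong_utils.states_to_prob(policy, states) that returns the
# probability of the RIGHT action for every state, plus the action constant
# pong_utils.RIGHT; adapt those names if your copy of pong_utils differs.
def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995, epsilon=0.1, beta=0.01):
    # discounted future rewards, normalized across the parallel environments
    # at each time step (a simple advantage estimate)
    rewards = torch.tensor(rewards, dtype=torch.float, device=device)
    R = discounted_future_rewards(rewards.t(), ratio=discount).t()
    R = (R - R.mean(dim=1, keepdim=True)) / (R.std(dim=1, keepdim=True) + 1.e-10)

    actions = torch.tensor(actions, dtype=torch.int8, device=device)
    old_probs = torch.tensor(old_probs, dtype=torch.float, device=device)

    # probability of the action that was actually taken, under the current policy
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == pong_utils.RIGHT, new_probs, 1.0 - new_probs)

    # PPO clipped objective: take the more pessimistic of the raw and clipped ratio
    ratio = new_probs / old_probs
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon)
    clipped_objective = torch.min(ratio * R, clipped * R)

    # entropy-style regularization term, scaled by beta, to keep exploring
    entropy = -(new_probs * torch.log(old_probs + 1.e-10) +
                (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(clipped_objective + beta * entropy)
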
def main(loop):
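    """Train the policy with the PPO-style clipped surrogate and save progress.

    `Env` is assumed to be a project-specific helper: Env.envs() should return
    a batch of parallel Pong environments and Env.vw is used to scale tmax.
    The mean reward per episode is written to data_{loop}.csv at the end.
    """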
    beta = .01
    tmax = int(250 / Env.vw)
    SGD_epoch = 4
    epsilon = 0.1
    episode = 500
    envs = Env.envs()
    # check which device is being used.
    # I recommend disabling the GPU until you've made sure that the code runs
    device = pong_utils.device
    print("using device: ", device)
    # keep track of progress
    mean_rewards = []
    policy = pong_utils.Policy().to(device)

    # we use the Adam optimizer with learning rate 1e-4
    # optim.SGD is also possible
    optimizer = optim.Adam(policy.parameters(), lr=1e-4)
    for e in range(episode):
        # collect trajectories
        old_probs, states, actions, rewards = \
            pong_utils.collect_trajectories(envs, policy, tmax=tmax)

        total_rewards = np.sum(rewards, axis=0)
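        # (rewards is time-major, so axis=0 sums over the time steps and
        #  total_rewards holds one score per parallel environment)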

        # gradient ascent step
        for _ in range(SGD_epoch):
            # uncomment to use your own clipped function
            # (e.g. the clipped_surrogate sketch defined above main):
            # L = -clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)

            L = -pong_utils.clipped_surrogate(policy,
                                              old_probs,
                                              states,
                                              actions,
                                              rewards,
                                              epsilon=epsilon,
                                              beta=beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            del L

        # the clipping parameter reduces as time goes on
        epsilon *= .999

        # the regularization term also shrinks,
        # which reduces exploration in later runs
        beta *= .995
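        # e.g. after the full 500 episodes these schedules give roughly
        # epsilon ~ 0.1 * 0.999**500 ~ 0.061 and beta ~ 0.01 * 0.995**500 ~ 8e-4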

        # get the average reward of the parallel environments
        mean_rewards.append(np.mean(total_rewards))

        # display some progress every 20 iterations
        if (e + 1) % 20 == 0:
            print("Episode: {0:d}, score: {1:f}".format(
                e + 1, np.mean(total_rewards)))
            print(total_rewards)

    env = envs.ps[0]
    mean_rewards = np.array(mean_rewards)
    np.savetxt('data_{}.csv'.format(loop), mean_rewards, newline='\n')
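

# example invocation; the `loop` argument only tags the output csv filename
# (0 here is an arbitrary choice)
if __name__ == '__main__':
    main(0)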