Code example #1
    def train(self):
        for e in range(self.episode):

            # collect trajectories
            old_probs, states, actions, rewards = pong_utils.collect_trajectories(
                self.envs, self.pong_agent.policy, tmax=self.tmax)

            total_rewards = np.sum(rewards, axis=0)

            self.pong_agent.train(self.epoch,
                                  old_probs,
                                  states,
                                  actions,
                                  rewards,
                                  epsilon=self.epsilon,
                                  beta=self.beta)
            # the clipping parameter reduces as time goes on
            self.epsilon *= .999
            # the regularization term also reduces
            # this reduces exploration in later runs
            self.beta *= .995
            # get the average reward of the parallel environments
            self.mean_rewards.append(np.mean(total_rewards))
            self.time_display.display(e, total_rewards)
        self.time_display.timer.finish()
        torch.save(self.pong_agent.policy, 'PongAgent.policy')
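Since train() stores the entire policy object with torch.save, it can be restored later with torch.load. A minimal usage sketch, reusing the file name from the snippet above:

import torch

# reload the policy module saved at the end of train()
# (the Policy class must be importable when unpickling a full module;
#  on recent PyTorch versions you may need weights_only=False)
policy = torch.load('PongAgent.policy')
policy.eval()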
Code example #2
    surrogates = (R_future * ratio_PPO).mean()

    # include a regularization term
    # this steers new_policy towards 0.5
    # which prevents the policy from becoming exactly 0 or 1
    # this helps with exploration
    # add in 1.e-10 to avoid log(0) which gives nan
    # entropy = -(new_probs*torch.log(old_probs+1.e-10) + (1.0-new_probs)*torch.log(1.0-old_probs+1.e-10))
    # surrogates += torch.mean(beta*entropy)

    return surrogates
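The snippet above shows only the tail of the surrogate function. For reference, here is a minimal sketch of a complete PPO clipped surrogate with the entropy regularization described in the comments; it assumes Udacity-style pong_utils helpers (states_to_prob, the RIGHT/LEFT action codes) and is illustrative rather than the exact function used in these examples.

import numpy as np
import torch
import pong_utils

RIGHT, LEFT = 4, 5

def clipped_surrogate(policy, old_probs, states, actions, rewards,
                      discount=0.995, epsilon=0.1, beta=0.01):
    device = pong_utils.device

    # discounted future rewards, normalized per time step
    discounts = discount ** np.arange(len(rewards))
    rewards = np.asarray(rewards) * discounts[:, np.newaxis]
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10
    rewards_normalized = (rewards_future - mean[:, np.newaxis]) / std[:, np.newaxis]

    actions = torch.tensor(np.asarray(actions), dtype=torch.int8, device=device)
    old_probs = torch.tensor(np.asarray(old_probs), dtype=torch.float, device=device)
    R_future = torch.tensor(rewards_normalized, dtype=torch.float, device=device)

    # probability of the taken action under the current policy
    new_probs = pong_utils.states_to_prob(policy, states)
    new_probs = torch.where(actions == RIGHT, new_probs, 1.0 - new_probs)

    # PPO clipped objective: take the more pessimistic of the two ratios
    ratio_PPO = new_probs / old_probs
    clipped_ratio = torch.clamp(ratio_PPO, 1 - epsilon, 1 + epsilon)
    clipped = torch.min(R_future * ratio_PPO, R_future * clipped_ratio)

    # entropy-style regularization term (see the comments above)
    entropy = -(new_probs * torch.log(old_probs + 1.e-10) +
                (1.0 - new_probs) * torch.log(1.0 - old_probs + 1.e-10))

    return torch.mean(clipped + beta * entropy)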


envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs,
                                                              policy,
                                                              tmax=100)
Lsur = clipped_surrogate(policy, prob, state, action, reward)
print(Lsur)

from parallelEnv import parallelEnv
import numpy as np

# keep track of how long training takes
# WARNING: running the full training loop (several hundred episodes) takes roughly 30-45 minutes

# training loop max iterations
episode = 500

# widget bar to display progress
import progressbar as pb
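
The progressbar import is what builds the widget and timer that code example #5 starts with via pb.ProgressBar(...). A typical setup, assuming the classic progressbar widget API used in these snippets:

# label, completion percentage, bar, and estimated time remaining
widget = ['training loop: ', pb.Percentage(), ' ', pb.Bar(), ' ', pb.ETA()]
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

# update once per episode inside the training loop, e.g. timer.update(e + 1),
# and call timer.finish() when the loop ends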
Code example #3
File: ppo_pong.py  Project: jsrimr/ppo_implement
model = ActorCritic().to(device)  # returns (dist, v)
if args.load_weight:
    model.load_state_dict(
        torch.load(f'PongDeterministic-v4_{load_weight_n}.pth'))
optimizer = optim.Adam(model.parameters(), lr=lr)

f1 = envs.reset()
f2 = envs.step([0] * num_envs)

if __name__ == "__main__":
    while not early_stop and frame_idx < max_frames:
        frame_idx += 1
        print(frame_idx)
        if frame_idx % 100 == 0:
            num_steps += args.additional_num_step
        log_probs, states, actions, rewards, next_state, masks, values = collect_trajectories(
            envs, model, num_steps)
        scores = np.asarray(rewards).sum(axis=0)
        scores_list.append(scores.mean())
        print("Mean:", scores.mean(), "\nRaw:", scores)

        # stop if any of the trajectories is done
        # we want all the lists to be rectangular

        for _ in range(n_updates):

            # uncomment to utilize your own clipped function!
            # raise Exception(type(states), states[0].size())
            if args.beta_decay and beta > 0.01:
                beta *= discount
            L = -clipped_surrogate(model, log_probs, states, actions, rewards,
                                   discount, epsilon, beta)
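            # presumed continuation (not shown in this excerpt): the update step,
            # following the same pattern as the other examples in this collection
            optimizer.zero_grad()
            L.backward()
            optimizer.step()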
Code example #4
# envs = parallelEnv('PongDeterministic-v4', n=8, seed=1234)

discount_rate = .99
beta = .01
tmax = 200
SGD_epoch = 4
epsilon = 0.1
max_num_ac = 5
min_num_ac = 5

# keep track of progress
mean_rewards = []
for e in range(episode):
    # collect trajectories
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)
    if len(envs.ps[0].aircrafts) > max_num_ac:
        max_num_ac = len(envs.ps[0].aircrafts)
    if len(envs.ps[0].aircrafts) < min_num_ac:
        min_num_ac = len(envs.ps[0].aircrafts)

    total_rewards = np.sum(rewards, axis=0)

    # gradient ascent step
    for _ in range(SGD_epoch):
        # uncomment to utilize your own clipped function!
        # L = -clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)

        L = -pong_utils.clipped_surrogate(policy,
                                          old_probs,
                                          states,
                                          actions,
                                          rewards,
                                          epsilon=epsilon,
                                          beta=beta)
        optimizer.zero_grad()
        L.backward()
        optimizer.step()
        del L
Code example #5
timer = pb.ProgressBar(widgets=widget, maxval=episode).start()

discount_rate = .99
beta = .01
value_coef = .5
tmax = 200
SGD_epoch = 4
epsilon = 0.1

# keep track of progress
mean_rewards = []

for e in range(episode):
    # collect trajectories
    old_probs, states, actions, rewards = \
        pong_utils.collect_trajectories(envs, policy, tmax=tmax)

    total_rewards = np.sum(rewards, axis=0)

    # gradient ascent step
    for _ in range(SGD_epoch):
        L = -pong_utils.clipped_surrogate(policy,
                                          old_probs,
                                          states,
                                          actions,
                                          rewards,
                                          epsilon=epsilon,
                                          beta=beta,
                                          value_coef=value_coef)
        optimizer.zero_grad()
        L.backward()
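        # presumed continuation, as in code example #6: apply the update
        optimizer.step()
        del L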
Code example #6
def main(loop):
    beta = .01
    tmax = int(250 / Env.vw)
    SGD_epoch = 4
    epsilon = 0.1
    episode = 500
    envs = Env.envs()
    # check which device is being used.
    # I recommend disabling gpu until you've made sure that the code runs
    device = pong_utils.device
    print("using device: ", device)
    # keep track of progress
    mean_rewards = []
    policy = pong_utils.Policy().to(device)

    # we use the Adam optimizer with learning rate 1e-4
    # optim.SGD is also possible
    optimizer = optim.Adam(policy.parameters(), lr=1e-4)
    for e in range(episode):
        # collect trajectories
        old_probs, states, actions, rewards = \
            pong_utils.collect_trajectories(envs, policy, tmax=tmax)

        total_rewards = np.sum(rewards, axis=0)

        # gradient ascent step
        for _ in range(SGD_epoch):
            # uncomment to utilize your own clipped function!
            # L = -clipped_surrogate(policy, old_probs, states, actions, rewards, epsilon=epsilon, beta=beta)

            L = -pong_utils.clipped_surrogate(policy,
                                              old_probs,
                                              states,
                                              actions,
                                              rewards,
                                              epsilon=epsilon,
                                              beta=beta)
            optimizer.zero_grad()
            L.backward()
            optimizer.step()
            del L

        # the clipping parameter reduces as time goes on
        epsilon *= .999

        # the regularization term also reduces
        # this reduces exploration in later runs
        beta *= .995

        # get the average reward of the parallel environments
        mean_rewards.append(np.mean(total_rewards))

        # display some progress every 20 iterations
        if (e + 1) % 20 == 0:
            print("Episode: {0:d}, score: {1:f}".format(
                e + 1, np.mean(total_rewards)))
            print(total_rewards)

    env = envs.ps[0]
    mean_rewards = np.array(mean_rewards)
    np.savetxt('data_{}.csv'.format(loop), mean_rewards, newline='\n')
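
To inspect the saved learning curve afterwards, the CSV written by main() can be loaded and plotted. A small illustrative snippet; the loop index 0 in the file name is hypothetical:

import numpy as np
import matplotlib.pyplot as plt

mean_rewards = np.loadtxt('data_0.csv')  # e.g. the output of main(0)
plt.plot(mean_rewards)
plt.xlabel('episode')
plt.ylabel('mean reward over parallel environments')
plt.show()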
Code example #7
import pong_utils
device = pong_utils.device
print("using device: ", device)

import gym
env = gym.make('PongDeterministic-v4')
print("List of available actions: ", env.unwrapped.get_action_meanings())
# The actions 'RIGHTFIRE' = 4 and 'LEFTFIRE' = 5 restart the game when it is done

import matplotlib.pyplot as plt

from agent import Policy
agent = Policy()
agent = agent.to(device)

pong_utils.play(env, agent, time=100)

envs = pong_utils.parallelEnv('PongDeterministic-v4', n=4, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs,
                                                              agent,
                                                              tmax=100)
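
collect_trajectories is called in every example here but never shown. Below is a rough sketch of what the Udacity-style helper does, written from the way it is used above; preprocess_batch, the FIRE/NOOP warm-up, and the exact return shapes are assumptions, not a verified copy of pong_utils.

import numpy as np
import pong_utils

RIGHT, LEFT = 4, 5

def collect_trajectories(envs, policy, tmax=200, nrand=5):
    n = len(envs.ps)  # number of parallel environments
    prob_list, state_list, action_list, reward_list = [], [], [], []

    envs.reset()
    envs.step([1] * n)  # FIRE to start every game

    # a few random steps so the parallel agents do not all start identically
    for _ in range(nrand):
        fr1, re1, _, _ = envs.step(np.random.choice([RIGHT, LEFT], n))
        fr2, re2, _, _ = envs.step([0] * n)

    for _ in range(tmax):
        # two consecutive frames let the policy see the ball's velocity
        batch_input = pong_utils.preprocess_batch([fr1, fr2])
        probs = policy(batch_input).squeeze().cpu().detach().numpy()

        # sample RIGHTFIRE/LEFTFIRE and record the probability of the chosen action
        action = np.where(np.random.rand(n) < probs, RIGHT, LEFT)
        probs = np.where(action == RIGHT, probs, 1.0 - probs)

        fr1, re1, is_done, _ = envs.step(action)
        fr2, re2, is_done, _ = envs.step([0] * n)  # NOOP so frames come in pairs

        prob_list.append(probs)
        state_list.append(batch_input)
        action_list.append(action)
        reward_list.append(re1 + re2)

        # stop as soon as any game ends so all lists stay rectangular
        if is_done.any():
            break

    return prob_list, state_list, action_list, reward_list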
Code example #8
        x = self.conv_2(x)
        x = self.relu(x)
        x = self.maxpool(x)
        
        # MLP        
        x = x.view(-1,9248) # flatten the tensor
        
        return self.sig(self.fc(x))  # P(left) = 1-P(right)
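
The forward() fragment above implies a flattened feature size of 9248 (32 x 17 x 17). One hypothetical reconstruction of the surrounding Policy class that produces exactly that shape from a 2x80x80 input is sketched below; the real Policy may use different layer sizes.

import torch.nn as nn

class Policy(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv_1 = nn.Conv2d(2, 16, kernel_size=5, stride=2)   # 2x80x80 -> 16x38x38
        self.conv_2 = nn.Conv2d(16, 32, kernel_size=5, stride=1)  # 16x38x38 -> 32x34x34
        self.maxpool = nn.MaxPool2d(2)                            # 32x34x34 -> 32x17x17
        self.relu = nn.ReLU()
        self.fc = nn.Linear(9248, 1)
        self.sig = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.conv_1(x))
        x = self.conv_2(x)
        x = self.relu(x)
        x = self.maxpool(x)

        # MLP head
        x = x.view(-1, 9248)         # flatten the tensor
        return self.sig(self.fc(x))  # P(left) = 1 - P(right)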


policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-4)

#%% Trajectories rollout
envs = pong_utils.parallelEnv('PongDeterministic-v4', n=8, seed=12345)
prob, state, action, reward = pong_utils.collect_trajectories(envs, policy, tmax=100)


#%% Function Definitions
def surrogate(policy, old_probs, states, actions, rewards,
              discount = 0.995, beta=0.01):

    discount = discount**np.arange(len(rewards))
    rewards = np.asarray(rewards)*discount[:,np.newaxis]
    
    # convert rewards to future rewards
    rewards_future = rewards[::-1].cumsum(axis=0)[::-1]
    
    # Normalize rewards
    mean = np.mean(rewards_future, axis=1)
    std = np.std(rewards_future, axis=1) + 1.0e-10