import gym
import torch
import torch.optim as optim

# SinglePlayerReturn, DiscreteActionBotSim, ActorCritic, and test() are
# assumed to be defined elsewhere in the project.


def train_single_player_return():
    max_episodes = 50000
    episodes_per_update = 10
    gamma = 0.99
    lr = 0.005
    betas = (0.9, 0.999)
    path = 'models/single_player.pkl'

    env = SinglePlayerReturn(DiscreteActionBotSim())
    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)

    r = []       # sliding window of final episode rewards
    v = 0        # convergence counter (was previously used uninitialized)
    i_episode = 0
    while i_episode < max_episodes:
        state = env.reset()
        done = False
        while not done:
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.temp_rewards.append(reward)
        i_episode += 1
        r.append(reward)

        # Fold this episode's rewards into the policy's memory every episode,
        # then only step the optimizer every `episodes_per_update` episodes.
        # (The original hardcoded 10 and skipped updateMemory() on the update
        # episode, silently dropping that episode's rewards.)
        policy.updateMemory(gamma)
        if i_episode % episodes_per_update != 0:
            continue

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss()
        loss.backward()
        policy.clip_grads()
        optimizer.step()
        policy.clearMemory()

        if i_episode % 500 == 0:
            if len(r) > 50:
                print('Episode', i_episode, ': avg reward', sum(r) / len(r))
            else:
                print('Episode', i_episode, ': avg reward n/a')
        if i_episode % 1000 == 0:
            torch.save(policy, path)

        # Keep the reward window at 100 episodes; stop once the average
        # final reward exceeds 0.8.
        if len(r) > 100:
            r.pop(0)
            if sum(r) / len(r) > 0.8:
                v += 1
                print('CONVERGED v', v)
                break

    torch.save(policy, path)
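# For reference, a minimal sketch of the memory interface that
# train_single_player_return() drives: updateMemory() folds each finished
# episode's rewards into a buffer of discounted returns, which calculateLoss()
# later consumes. Only the method and attribute names come from the trainer
# above; the architecture, the Categorical policy head, and all internals
# below are illustrative assumptions, not the project's actual ActorCritic
# (hence the distinct class name).
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical


class ActorCriticSketch(nn.Module):
    def __init__(self, state_dim=8, action_dim=4, hidden=128):
        super().__init__()
        self.affine = nn.Linear(state_dim, hidden)
        self.action_head = nn.Linear(hidden, action_dim)  # actor: action logits
        self.value_head = nn.Linear(hidden, 1)            # critic: state value
        self.logprobs = []      # log pi(a|s) for every step since the last update
        self.state_values = []  # V(s) estimates for the same steps
        self.temp_rewards = []  # raw rewards of the episode in progress
        self.returns = []       # discounted returns accumulated across episodes

    def forward(self, state):
        x = F.relu(self.affine(torch.as_tensor(state, dtype=torch.float32)))
        dist = Categorical(logits=self.action_head(x))
        action = dist.sample()
        self.logprobs.append(dist.log_prob(action))
        self.state_values.append(self.value_head(x))
        return action.item()

    def updateMemory(self, gamma):
        # Convert the finished episode's rewards into discounted returns
        # and append them to the cross-episode buffer.
        R = 0.0
        discounted = []
        for reward in reversed(self.temp_rewards):
            R = reward + gamma * R
            discounted.insert(0, R)
        self.returns.extend(discounted)
        self.temp_rewards = []

    def calculateLoss(self):
        # Policy-gradient term weighted by the advantage, plus a value
        # regression term for the critic.
        loss = 0.0
        for logprob, value, R in zip(self.logprobs, self.state_values, self.returns):
            advantage = R - value.item()
            loss = loss - logprob * advantage
            loss = loss + F.smooth_l1_loss(value.squeeze(), torch.tensor(R))
        return loss

    def clip_grads(self, max_norm=0.5):
        nn.utils.clip_grad_norm_(self.parameters(), max_norm)

    def clearMemory(self):
        del self.logprobs[:]
        del self.state_values[:]
        del self.temp_rewards[:]
        del self.returns[:]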
def train():
    # Default parameters:
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543
    render = False

    torch.manual_seed(random_seed)
    env = gym.make('LunarLander-v2')
    env.seed(random_seed)

    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)

    running_reward = 0
    for i_episode in range(10000):
        state = env.reset()
        for t in range(10000):
            action = policy(state)
            state, reward, done, _ = env.step(action)
            policy.rewards.append(reward)
            running_reward += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break

        # Updating the policy:
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()

        # Save and stop once the reward accumulated over the current
        # 20-episode window exceeds 4000, i.e. an average above 200,
        # LunarLander-v2's "solved" threshold.
        if running_reward > 4000:
            name = 'LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1])
            torch.save(policy.state_dict(), './preTrained/' + name)
            print("########## Solved! ##########")
            test(name=name)
            break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(i_episode, t, running_reward))
            running_reward = 0
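# Hypothetical entry point, for illustration only; the project may invoke
# these trainers differently.
if __name__ == '__main__':
    # Swap in train_single_player_return() to train on the bot sim instead.
    train()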