def train(args):
    ############## Hyperparameters ##############
    env_name = "GridExplore-v0"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space[0].shape[0]
    action_dim = 5
    model = ConvNet(action_dim).to(device)  # note: created here but not used in this loop
    render = False
    solved_reward = 50          # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 10000        # max training episodes
    max_timesteps = 500         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 2400      # update policy every n timesteps
    lr = 0.0001
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 2                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    mini_batch_size = 32
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    writer = SummaryWriter("logs")
    memory = Memory()
    q = queue(20)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1
            state = np.array([state])
            outputs = torch.from_numpy(state).float().to(device)

            # Running policy_old:
            action = ppo.policy_old.act(outputs, memory)
            state, reward, done, _ = env.step([action])

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.dones.append(done[0])

            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward[0]
            if render:
                env.render()
            if all(done):
                break

        avg = q.push(running_reward)
        avg_length += t
        writer.add_scalar('i_episode/avg_reward', avg, i_episode)
        grid = torchvision.utils.make_grid(torch.tensor(env.grid))
        writer.add_image('images', grid, max_timesteps)

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            torch.save(ppo.policy.state_dict(),
                       './savedmodels/PPO_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)
            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

    writer.close()
    torch.save(ppo.policy.state_dict(), './PPO_NOTSOLVED_{}.pth'.format(env_name))
    torch.save(ppo.policy.state_dict(),
               './savedmodels/PPO_NOTSOLVED_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
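
# NOTE (assumption, not part of the original sources): `queue(20)` above is not a
# standard-library class. From its use (`avg = q.push(running_reward)` feeding a
# TensorBoard scalar) it appears to be a fixed-size buffer whose push() returns the
# rolling mean of the last N episode rewards. A minimal sketch consistent with that
# usage, under those assumptions:
from collections import deque


class queue:
    def __init__(self, maxlen):
        # keep only the most recent `maxlen` episode rewards
        self.buf = deque(maxlen=maxlen)

    def push(self, value):
        # append the newest reward and return the rolling average
        self.buf.append(value)
        return sum(self.buf) / len(self.buf)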
def main():
    ############## Hyperparameters ##############
    env_name = "SuperMarioBros-v3"
    # creating environment
    env = gym_super_mario_bros.make(env_name)
    state_dim = env.observation_space.shape[2]
    action_dim = 4
    render = 'render' in sys.argv
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 1            # print avg reward in the interval
    max_episodes = 20           # max training episodes
    max_timesteps = 50          # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 256       # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1
            # Running policy_old:
            action = ppo.policy_old.act(state.copy(), memory)
            state, reward, done, _ = env.step(action.cpu())

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                state = env.reset()

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './saved_models/PPO_{}.pth'.format(env_name))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
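
# NOTE (assumption, not part of the original sources): PPO.update() is defined
# elsewhere in this repo and is not shown here. For reference only, a minimal
# sketch of the standard PPO clipped surrogate loss it is assumed to optimize
# (with eps_clip = 0.2 as configured above); `ppo_clip_loss` is a hypothetical
# helper name introduced for illustration:
import torch


def ppo_clip_loss(logprobs, old_logprobs, advantages, eps_clip=0.2):
    # probability ratio between the current policy and the policy that collected the data
    ratios = torch.exp(logprobs - old_logprobs.detach())
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
    # maximize the clipped surrogate, i.e. minimize its negative
    return -torch.min(surr1, surr2).mean()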
# Excerpt from the environment-interaction loop of a script-style training run;
# `policy`, `memory`, `log_f`, `env`, `args`, `Reward`, and the step counters are
# defined earlier in that script.
env.episode_num += 1
env.episode_reward = 0
episode_timesteps = 0

# action selected based on pure policy
action = policy.select_action(state, memory)
log_f.write('action based on policy:{}\n'.format(action))

# Perform action
new_state, reward, done = env.step(action)
done_bool = 0 if episode_timesteps + 1 == env.max_time else float(done)
env.episode_reward += reward

# Saving reward:
memory.rewards.append(reward)

state = new_state
episode_timesteps += 1
env.total_timesteps += 1
timesteps_since_eval += 1

# update if it's time
if time_step % args.update_timestep == 0:
    policy.update(memory)
    memory.clear_memory()
    time_step = 0

plt.plot(range(len(Reward)), np.array(Reward), 'b')
plt.savefig('./results/episode reward.png')
def adv_training():
    ############## Hyperparameters ##############
    # creating environment
    env = TronEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    solved_reward = 230         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 20000        # max training episodes
    max_timesteps = 300         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 2000      # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 4                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    h_memory = Memory()
    a_memory = Memory()
    hero = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    adv = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        p_state, e_state = env.reset()
        for t in range(max_timesteps):
            timestep += 1
            # Running policy_old for both the hero and the adversary:
            p_action = hero.policy_old.act(np.array(p_state), h_memory)
            e_action = adv.policy_old.act(np.array(e_state), a_memory)
            p_state, h_reward1, a_reward1, h_done, _ = env.step(p_action, 0)
            e_state, h_reward2, a_reward2, a_done, _ = env.step(e_action, 1)

            # Saving reward and is_terminal for the hero:
            h_memory.rewards.append(h_reward1 + h_reward2)
            h_memory.is_terminals.append(h_done)

            # Saving reward and is_terminal for the adversary:
            a_memory.rewards.append(a_reward1 + a_reward2)
            a_memory.is_terminals.append(a_done)

            # update if it's time
            if timestep % update_timestep == 0:
                hero.update(h_memory)
                h_memory.clear_memory()
                adv.update(a_memory)
                a_memory.clear_memory()
                timestep = 0

            running_reward += (h_reward1 + h_reward2)
            if render:
                env.render()
            if h_done or a_done:
                break

        avg_length += t

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int(running_reward / log_interval)
            env.render()
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

    adv.save_all()
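
# NOTE (assumption, not part of the original sources): the Memory rollout buffer
# used by every training loop above is defined elsewhere in the repo. Judging from
# the attributes accessed (rewards, is_terminals -- called `dones` in train() --
# clear_memory(), plus whatever states/actions/logprobs policy_old.act() appends),
# a minimal sketch of such a buffer could look like:
class Memory:
    def __init__(self):
        self.states = []        # observations fed to the policy
        self.actions = []       # actions sampled from policy_old
        self.logprobs = []      # log-probabilities of those actions
        self.rewards = []       # rewards returned by the environment
        self.is_terminals = []  # episode-termination flags

    def clear_memory(self):
        # drop the collected rollout after each PPO update
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]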