Example no. 1
import time

import gym
import numpy as np
import torch
import torchvision
from torch.utils.tensorboard import SummaryWriter

# ConvNet, PPO, Memory, queue and device are assumed to be provided by the surrounding project.


def train(args):
    ############## Hyperparameters ##############
    env_name = "GridExplore-v0"
    # creating environment
    env = gym.make(env_name)
    state_dim = env.observation_space[0].shape[0]

    action_dim = 5
    model = ConvNet(action_dim).to(device)  # note: `model` is never used below in this function

    render = False
    solved_reward = 50         # stop training if avg_reward > solved_reward
    log_interval = 20           # print avg reward in the interval
    max_episodes = 10000        # max training episodes
    max_timesteps = 500         # max timesteps in one episode
    n_latent_var = 64           # number of variables in hidden layer
    update_timestep = 2400      # update policy every n timesteps
    lr = 0.0001
    betas = (0.9, 0.999)
    gamma = 0.99                # discount factor
    K_epochs = 2                # update policy for K epochs
    eps_clip = 0.2              # clip parameter for PPO
    random_seed = None
    mini_batch_size = 32
    #############################################
    
    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)
    
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs, eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    writer = SummaryWriter("logs")

    memory = Memory()
    q = queue(20)  # project-local helper: fixed-size window whose push() returns a running average

    # training loop
    for i_episode in range(1, max_episodes+1):
        state = env.reset()
        
        # print("length of state arr is : " ,type(state))
        for t in range(max_timesteps):
            timestep += 1
            # add a batch dimension and convert the observation to a tensor on the right device
            state_tensor = torch.from_numpy(np.array([state])).float().to(device)

            # Running policy_old:
            action = ppo.policy_old.act(state_tensor, memory)
            state, reward, done, _ = env.step([action])

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.dones.append(done[0])
            
            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0
            
            running_reward += reward[0]
            if render:
                env.render()
            if all(done):
                break
        
        avg = q.push(running_reward)  # running average over the window of recent values
        avg_length += t

        writer.add_scalar('i_episode/avg_reward', avg, i_episode)

        # log the current grid as an image for this episode
        grid = torchvision.utils.make_grid(torch.tensor(env.grid))
        writer.add_image('images', grid, i_episode)


        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval*solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            torch.save(ppo.policy.state_dict(), './savedmodels/PPO_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
            break
            
        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length/log_interval)
            running_reward = int((running_reward/log_interval))
            
            print('Episode {} \t avg length: {} \t reward: {}'.format(i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

    writer.close()
    torch.save(ppo.policy.state_dict(), './PPO_NOTSOLVED_{}.pth'.format(env_name))
    torch.save(ppo.policy.state_dict(), './savedmodels/PPO_NOTSOLVED_{}.pth'.format(time.strftime("%Y%m%d-%H%M%S")))
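
The snippet above uses a project-local queue(20) helper whose push() returns a running average; its definition is not part of the example. A minimal sketch that matches how it is called here (the project's real implementation may differ):

class queue:
    """Fixed-size window of recent values; push() returns the window average (illustrative sketch)."""

    def __init__(self, maxlen):
        self.maxlen = maxlen
        self.items = []

    def push(self, value):
        # append the newest value, drop the oldest once the window is full,
        # and return the current window average
        self.items.append(value)
        if len(self.items) > self.maxlen:
            self.items.pop(0)
        return sum(self.items) / len(self.items)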
Example no. 2
import sys

import torch
import gym_super_mario_bros

# PPO and Memory are assumed to be provided by the surrounding project.


def main():

    ############## Hyperparameters ##############
    env_name = "SuperMarioBros-v3"
    # creating environment
    env = gym_super_mario_bros.make(env_name)
    state_dim = env.observation_space.shape[2]
    # print('state_dim:', state_dim)
    action_dim = 4
    render = 'render' in sys.argv
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 1  # print avg reward in the interval
    max_episodes = 20  # max training episodes
    max_timesteps = 50  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 256  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(state.copy(), memory)
            state, reward, done, _ = env.step(action.cpu())

            # Saving reward and is_terminal:
            memory.rewards.append(reward)
            memory.is_terminals.append(done)

            # update if it's time
            if timestep % update_timestep == 0:
                ppo.update(memory)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            if render:
                env.render()
            if done:
                state = env.reset()

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            torch.save(ppo.policy.state_dict(),
                       './saved_models/PPO_{}.pth'.format(env_name))
            break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
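
Both examples above hand a Memory buffer to ppo.policy_old.act() and ppo.update(), but the class itself is not shown. Judging from how it is used (rewards and terminal flags are appended in the loop, act() is expected to store the rest, clear_memory() empties everything after an update), a minimal sketch could look like this; the project's actual buffer may carry more fields:

class Memory:
    """Rollout buffer for one PPO update cycle (illustrative sketch)."""

    def __init__(self):
        self.states = []
        self.actions = []
        self.logprobs = []
        self.rewards = []
        self.is_terminals = []

    def clear_memory(self):
        # drop all stored transitions once the policy has been updated
        del self.states[:]
        del self.actions[:]
        del self.logprobs[:]
        del self.rewards[:]
        del self.is_terminals[:]

(Example no. 1 names the terminal-flag list dones instead of is_terminals.)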
Example no. 3
            env.episode_num += 1
            env.episode_reward = 0
            episode_timesteps = 0
        """ action selected based on pure policy """
        action = policy.select_action(state, memory)
        log_f.write('action based on policy:{}\n'.format(action))

        # Perform action
        new_state, reward, done = env.step(action)

        done_bool = 0 if episode_timesteps + 1 == env.max_time else float(done)
        env.episode_reward += reward
        # Saving reward:
        memory.rewards.append(reward)

        state = new_state

        episode_timesteps += 1
        env.total_timesteps += 1
        timesteps_since_eval += 1

        # update if it's time
        if time_step % args.update_timestep == 0:
            policy.update(memory)
            memory.clear_memory()
            time_step = 0

    plt.plot(range(len(Reward)), np.array(Reward), 'b')
    plt.savefig('./results/episode reward.png')
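
This fragment plots a Reward list that is never populated in the lines shown; the append presumably happens in the truncated head of the if-done block whose tail opens the excerpt. A minimal sketch of that missing piece, assuming Reward collects the return of each finished episode:

        if done:
            Reward.append(env.episode_reward)  # record the finished episode's return
            state = env.reset()
            # ... followed by the counter resets shown at the top of the excerpt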
Example no. 4
import numpy as np
import torch

# TronEnv, PPO and Memory are assumed to be provided by the surrounding project.


def adv_training():
    ############## Hyperparameters ##############
    # creating environment
    env = TronEnv()
    state_dim = env.observation_space.shape[0]
    action_dim = 4
    render = False
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 20  # print avg reward in the interval
    max_episodes = 20000  # max training episodes
    max_timesteps = 300  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 2000  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    h_memory = Memory()
    a_memory = Memory()
    hero = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
               eps_clip)
    adv = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0

    # training loop
    for i_episode in range(1, max_episodes + 1):
        p_state, e_state = env.reset()
        for t in range(max_timesteps):
            timestep += 1

            # Running policy_old:
            p_action = hero.policy_old.act(np.array(p_state), h_memory)
            e_action = adv.policy_old.act(np.array(e_state), a_memory)

            p_state, h_reward1, a_reward1, h_done, _ = env.step(p_action, 0)
            e_state, h_reward2, a_reward2, a_done, _ = env.step(e_action, 1)
            # Saving reward and is_terminal:
            h_memory.rewards.append(h_reward1 + h_reward2)
            h_memory.is_terminals.append(h_done)

            # Saving reward and is_terminal:
            a_memory.rewards.append(a_reward1 + a_reward2)
            a_memory.is_terminals.append(a_done)

            # update if it's time
            if timestep % update_timestep == 0:
                hero.update(h_memory)
                h_memory.clear_memory()

                adv.update(a_memory)
                a_memory.clear_memory()

                timestep = 0

            running_reward += (h_reward1 + h_reward2)
            if render:
                env.render()
            if h_done or a_done:
                break

        avg_length += t

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))
            env.render()
            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0
    adv.save_all()
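
None of the four examples shows the PPO class whose update() they all call with gamma, K_epochs and eps_clip. For orientation, the loss that a standard clipped-surrogate PPO update minimizes over the buffered batch looks roughly like the sketch below; the function name, the 0.5 value-loss coefficient and the omitted entropy bonus are illustrative choices, not taken from this code:

import torch
import torch.nn.functional as F

def clipped_ppo_loss(logprobs, old_logprobs, advantages, state_values, returns, eps_clip=0.2):
    # probability ratio between the current policy and the policy that collected the data
    ratios = torch.exp(logprobs - old_logprobs.detach())
    # clipped surrogate objective: take the pessimistic (minimum) of the two surrogates
    surr1 = ratios * advantages
    surr2 = torch.clamp(ratios, 1 - eps_clip, 1 + eps_clip) * advantages
    policy_loss = -torch.min(surr1, surr2).mean()
    # value-function regression towards the discounted returns
    value_loss = F.mse_loss(state_values, returns)
    return policy_loss + 0.5 * value_loss

Typically, update() would build discounted returns from memory.rewards and the terminal flags using gamma, normalize them into advantages, and take K_epochs optimizer steps on a loss of this shape before the caller clears the buffer.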