# Example #1
        env.reset()

        S = env.s

        finished = False
        while not finished:

            A = epsilon_greedy_action(env, Q, S)

            next_S, R, finished, _ = env.step(A)

            # Q-learning update: bootstrap from the greedy (max-value) action in next_S
            Q[S][A] = Q[S][A] + alpha * (R + gamma * np.max(Q[next_S]) -
                                         Q[S][A])

            S = next_S

    return Q


# In[28]:

# display the learned Q-table (values can be rounded to two decimal places for readability)
Q = Q_Learning(env, 10000)
print(Q)

env.close()
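
# In[ ]:

# Sanity check (a sketch added for illustration, not part of the original notebook):
# roll out the greedy policy implied by the learned Q-table and report the average
# return. evaluate_greedy_policy, n_eval_episodes and max_steps are illustrative
# names, and the same gym-style FrozenLakeEnv API used above is assumed.
def evaluate_greedy_policy(env, Q, n_eval_episodes=100, max_steps=100):
    total_return = 0.0
    for _ in range(n_eval_episodes):
        env.reset()
        S = env.s
        for _ in range(max_steps):  # step cap in case the greedy policy cycles
            A = np.argmax(Q[S])     # purely greedy: no epsilon-exploration
            S, R, finished, _ = env.step(A)
            total_return += R
            if finished:
                break
    return total_return / n_eval_episodes

#print(evaluate_greedy_policy(FrozenLakeEnv(), Q))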

# In[ ]:
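# game() below relies on a RunningMeanStd helper that is defined elsewhere in this
# notebook. For reference only, a minimal baselines-style sketch exposing the same
# update / update_from_moments / mean / var interface could look like the class
# below (named RunningMeanStdSketch so it does not shadow the real implementation):
class RunningMeanStdSketch:
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype=np.float64)
        self.var = np.ones(shape, dtype=np.float64)
        self.count = epsilon

    def update(self, x):
        # x: a batch of samples stacked along the first axis
        x = np.asarray(x, dtype=np.float64)
        self.update_from_moments(x.mean(axis=0), x.var(axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        # combine the running and batch moments (parallel-variance formula)
        delta = batch_mean - self.mean
        tot_count = self.count + batch_count
        new_mean = self.mean + delta * batch_count / tot_count
        m2 = (self.var * self.count + batch_var * batch_count +
              delta**2 * self.count * batch_count / tot_count)
        self.mean, self.var, self.count = new_mean, m2 / tot_count, tot_count
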
def game(N_episodes, AI_type, Intrinsic_type):
    ############## Hyperparameters ##############
    env = FrozenLakeEnv()
    # environment / state-action dimensions
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    render = False
    solved_reward = 230  # reward threshold for the "Solved!" message (early stopping below is disabled)
    log_interval = 20  # print avg reward in the interval
    max_episodes = N_episodes  # max training episodes
    max_timesteps = 100  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 200  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    samp_rewards = []
    avg_rewards = []
    best_avg_reward = -np.inf
    n_agents = 1
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    avg_reward = 0
    ppo.memcount.delete()
    state_size = env.observation_space.n
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd()
    norm_steps = 5000
    # Pre-run: take random actions so obs_rms starts from sensible statistics
    next_obs = []
    env.reset()
    for _ in range(norm_steps):
        action_norm = np.random.randint(0, action_dim)
        state_norm, reward_norm, done_norm, _ = env.step(action_norm)
        state_norm = to_categorical(state_norm, state_size)  # one-hot encode the discrete state
        next_obs.append(state_norm)
        if done_norm:
            env.reset()
    obs_rms.update(next_obs)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        state = to_categorical(state, state_size)
        done = False
        t = 0
        episode_reward = 0
        intrinsic_rewards = 0
        reward = 0
        for t in range(1, max_timesteps + 1):
            timestep += 1

            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            state = to_categorical(state, state_size)

            #========================================================
            # intrinsic (curiosity) reward for the selected variant
            if AI_type in ("PPO", "A2C") and Intrinsic_type == "1":
                intrinsic_rewards = get_intrinsic_rewards(
                    AI_type, state, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
            elif AI_type in ("PPO", "A2C") and Intrinsic_type == "2":
                intrinsic_rewards = get_intrinsic_rewards2(
                    AI_type, state, action, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
            elif AI_type in ("PPO", "A2C") and Intrinsic_type == "3":
                intrinsic_rewards = get_intrinsic_rewards3(
                    AI_type, state, action, ppo, n_agents, reward, 1)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
            elif AI_type in ("PPO", "A2C") and Intrinsic_type == "4":
                intrinsic_rewards = get_intrinsic_rewards4(
                    AI_type, state, action, ppo, n_agents, reward * 10, t, 100,
                    0.99)
            elif AI_type in ("PPO", "A2C") and Intrinsic_type == "5":
                intrinsic_rewards = get_intrinsic_rewards5(
                    AI_type, state, ppo, n_agents, 1, 16)
            else:
                intrinsic_rewards = 0
            # only the extrinsic reward goes into memory.rewards; the intrinsic
            # reward is stored separately and used as a normalized bonus in ppo.update
            reward_sum = reward
            #===========================================================
            # Save reward, intrinsic reward and terminal flag:
            memory.rewards.append(reward_sum)
            memory.intrinsic_rewards.append(intrinsic_rewards)
            memory.is_terminals.append(done)
            """
            try:
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (memory.intrinsic_rewards-reward_rms.mean)/np.sqrt(reward_rms.var)
            except:
                adv_int = 0
            """
            """
            print(temp_int.data.numpy())
            mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
            reward_rms.update_from_moments(mean1, std1 ** 2, count1)
            adv_int = (memory.intrinsic_rewards-reward_rms.mean)/np.sqrt(reward_rms.var)
            """

            # update PPO every `update_timestep` environment steps
            if timestep % update_timestep == 0:
                temp_int = np.asarray(memory.intrinsic_rewards)
                mean1, std1, count1 = temp_int.mean(), temp_int.std(), len(
                    temp_int)
                reward_rms.update_from_moments(mean1, std1**2, count1)
                # RND-style scaling: divide by the running std of intrinsic rewards
                adv_int = temp_int / np.sqrt(reward_rms.var)
                ppo.update(memory, adv_int)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t

        # announce when the running reward crosses the threshold
        # (training continues; no early stop is performed here)
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        samp_rewards.append(episode_reward)
        if i_episode >= 100:
            # average reward over the last 100 episodes
            avg_reward = np.mean(samp_rewards[-100:])
            # append to the list of running averages
            avg_rewards.append(avg_reward)
            # update the best 100-episode average seen so far
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("Total reward in episode {} = {}".format(i_episode,
                                                       episode_reward))
        print("Best_avg_reward =", np.round(best_avg_reward, 3),
              "Average_rewards =", np.round(avg_reward, 3))
    #env.save_replay()
    env.close()

    return avg_rewards, best_avg_reward, samp_rewards, "0"
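
# In[ ]:

# Example invocation (a sketch, not from the original notebook: the episode count
# and the "PPO"/"1" argument pair are illustrative choices for game() above):
avg_rewards, best_avg_reward, samp_rewards, _ = game(1000, "PPO", "1")
print("Best 100-episode average reward:", best_avg_reward)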