Example #1
def main():
    new_map = ["SFFF", "FHFH", "FFFH", "HFFG"]
    env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
    env = env.unwrapped
    succeed_episode = 0

    for i_episode in range(1000000):

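        # Every 10 episodes, optionally swap in a freshly generated random map.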
        if use_random_map and i_episode % 10 == 0:
            env.close()
            new_map = random_map(HOLE_NUM)
            env = FrozenLakeEnv(desc=new_map, is_slippery=IS_SLIPPERY)
            env = env.unwrapped

        pos = env.reset()
        state = encode_state(new_map, pos)

        ep_r = 0

        while True:
            a = select_action(state)

            pos_next, r, done, info = env.step(a)
            ep_r += r
            #state_next = encode_state(new_map, pos_next)

            if args.render:
                env.render()
            model.rewards.append(r)

            if done:
                break

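        # finish_episode() (defined elsewhere) consumes model.rewards to update the policy.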
        finish_episode()

        episode_durations.append(ep_r)

        if ep_r > 0:
            # EPSILON = 1 - 1. / ((i_episode / 500) + 10)
            succeed_episode += 1

        if i_episode % 1000 == 1:
            print('EP: {:d} success rate {:.4f}'.format(i_episode,
                                                         succeed_episode / 1000))
            succeed_episode = 0

        if i_episode % 5000 == 1:
            plot_durations()
Example #2
    def test_expected(self):
        env = FrozenLakeEnv(is_slippery=False)
        policy = UserInputPolicy(env)

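        # Feed a fixed action sequence through the mocked input() call and
        # follow it to the goal on the default 4x4 non-slippery map.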
        s = env.reset()
        env.render()

        for i in [RIGHT, RIGHT, DOWN, DOWN, DOWN, RIGHT]:
            with MockInputFunction(return_value=i):
                a = policy(s)

            s, r, done, info = env.step(a)
            env.render()

            if done:
                break
Example #3
def update_Q_table(Q_table, state, action, reward, new_state, new_action,
                   is_done):
    # SARSA temporal-difference error: r + gamma * Q(s', a') - Q(s, a),
    # dropping the bootstrap term on terminal transitions.
    delta = (reward + gamma * (1 - is_done) * Q_table[new_state, new_action] -
             Q_table[state, action])

    Q_table[state, action] += learning_rate * delta


reward_list = []
for k in range(N_trial + N_trial_test):

    acc_reward = 0  # Init the accumulated reward
    observation = env.reset()  # Init the state
    action = policy(Q_table, observation, epsilon)  # Init the first action

    for t in range(trial_duration):
        if render: env.render()

        new_observation, reward, done, info = env.step(
            action)  # Take the action
        new_action = policy(Q_table, new_observation, epsilon)
        update_Q_table(Q_table=Q_table,
                       state=observation,
                       action=action,
                       reward=reward,
                       new_state=new_observation,
                       new_action=new_action,
                       is_done=done)

        observation = new_observation  # Pass the new state to the next step
        action = new_action  # Pass the new action to the next step
        acc_reward += reward  # Accumulate the reward
        if done:
            break  # Stop the trial when you fall in a hole or when you find the goal
Example #4
import gym
import random
import numpy as np
from gym.envs.toy_text.frozen_lake import FrozenLakeEnv

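# Build a random 4x4 map: S at the start, G at the end, and two holes dropped
# onto random non-terminal tiles (the two may coincide, leaving a single hole).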
char_list = list('SFFFFFFFFFFFFFFG')
for i in range(2):
    char_list[random.randint(1, 14)] = 'H'
my_map = [''.join(char_list[i:i + 4]) for i in [0, 4, 8, 12]]
env = FrozenLakeEnv(desc=np.asarray(my_map, dtype='c'), is_slippery=False)
env = env.unwrapped

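# Repeatedly take action 1 and print the (state, reward, done, info) tuple
# returned by env.step().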
for i in range(10):
    env.render()
    a = env.step(1)
    print(a)
Example #5
        # 3. Choose an action a in the current world state (s)
        ## First we randomize a number
        exp_exp_tradeoff = random.uniform(0, 1)
        # print(exp_exp_tradeoff,epsilon)

        ## If this number > epsilon --> exploitation (take the action with the biggest Q value for this state)
        if exp_exp_tradeoff > epsilon:
            action = np.argmax(qtable[state, :])
            # print("action",action)

        # Else doing a random choice --> exploration
        else:
            action = env.action_space.sample()

        # Take the action (a) and observe the outcome state(s') and reward (r)
        new_state, reward, done, info = env.step(action)

        if reward > 0:
            print(episode, qtable)

        # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
        # qtable[new_state,:] : all the actions we can take from new state
        qtable[state, action] = qtable[state, action] + learning_rate * (
            reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])


        # print("qvalue",qtable[state, action])

        total_rewards += reward

        # Our new state becomes the current state
        state = new_state
Example #6
agent = QAgent(num_states, num_actions)

sum_reward = 0

for episode in range(NUM_EPISODES):
    done = False
    last_state = environment.reset()
    last_reward = None
    # Number of steps taken. A bit of a safeguard...
    num_steps = 0
    while not done:
        # Epsilon-greedy policy
        action = agent.get_action(last_state, environment)

        state, reward, done, info = environment.step(action)

        # A crude timeout: If we play too long without
        # completing the level, kill the game
        num_steps += 1
        if num_steps > 1000:
            print(
                "Episode timeout! Could not finish in 1000 steps. Check your actions!"
            )
            done = True

        # Update Q-table if we have one whole experience of
        # s, a, r, s', t'
        if last_state is not None:
            agent.update(
                last_state,
Example #7
    return a


averageepisodelength = []
for i in range(num_episodes):
    episodelength = 0
    state = env.reset()
    totalreward = 0

    rand = np.random.randn(1, env.action_space.n)
    #action = random.randint(0,num_actions-1)
    done = False
    action = epsilon_policy(state, Q, epsilon)
    #print(state,action)
    while not done:
        newstate, reward, done, q = env.step(action)
        #print(newstate, reward, done , q)
        newaction = epsilon_policy(newstate, Q, epsilon)
        #newaction = np.argmax(Q[newstate, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)) )
        #print("A:",newaction)
        Q[state, action] = Q[state, action] + alpha * (
            reward + gamma * Q[newstate, newaction] - Q[state, action])
        totalreward += reward

        state = newstate
        action = newaction
        episodelength += 1
    rewardvector.append(totalreward)
    averageepisodelength.append(episodelength)
    if i % 500 == 0 and i != 0:
        print("Average episode length", np.mean(averageepisodelength))
Example #8
cache = km.caching.MonteCarloCache(env, gamma=0.99)


# static parameters
num_episodes = 250
num_steps = 30


# train
for ep in range(num_episodes):
    s = env.reset()
    cache.reset()

    for t in range(num_steps):
        a = pi(s)
        s_next, r, done, info = env.step(a)

        # small incentive to keep moving
        if np.array_equal(s_next, s):
            r = -0.1

        cache.add(s, a, r, done)

        if done:
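            # Drain the cache: each pop() yields a batch of states, actions and
            # discounted returns for the finished episode.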
            while cache:
                S, A, G = cache.pop()
                pi.batch_update(S, A, G)
            break

        s = s_next
Example #9
SHOW_EVERY_EPISODES = 100

environment = FrozenLakeEnv(is_slippery=False)

num_states = environment.observation_space.n

# Create a tabular record of values
vtable = VTable(num_states)

for episode in range(NUM_EPISODES):
    done = False
    state = environment.reset()
    # Keep track of visited states and rewards
    # obtained
    states = []
    rewards = []
    while not done:
        # Store state
        states.append(state)
        # Take random action
        state, reward, done, info = environment.step(
            environment.action_space.sample())
        # Store reward
        rewards.append(reward)

    # Update v-estimate with the played game
    vtable.process_trajectory(states, rewards)

    if ((episode + 1) % SHOW_EVERY_EPISODES) == 0:
        vtable.visualize_v((4, 4))