Example #1
def main():
    env = None

    # environment selection
    if args.lake:
        env = FrozenLakeEnv(map_name='8x8')
        if args.size > 0:
            env = FrozenLakeEnv(desc=None, map_name=None, size=args.size)
    if args.tower:
        rings = tuple(range(args.rings - 1, -1, -1))
        print(rings)
        init = (rings, (), ())
        goal = ((), (), rings)
        env = TohEnv(initial_state=init, goal_state=goal, noise=args.noise)

    print('> env number of states: {}'.format(env.nS))
    print('> noise factor: {}'.format(args.noise))

    # solver selection
    discount = args.discount
    print('> discount factor: {}'.format(discount))
    if args.vi:
        vi_policy = value_iteration(env, discount=discount)
        policy = vi_policy
        print_policy(vi_policy)
    if args.pi:
        pi_policy = policy_iteration(env, discount=discount)
        # Reshape the (nS, nA) policy matrix into a 1-D array of greedy actions.
        pi_policy = np.reshape(np.argmax(pi_policy, axis=1), [env.nS])
        policy = pi_policy
        print_policy(pi_policy)
    if args.vi and args.pi:
        # Compare the two policies
        diffs = policy_differences(vi_policy, pi_policy)
        print(diffs)
        print('VI and PI policy differences: {}'.format(sum(diffs.values())))
    if args.q:
        Q = q_learning(env, total_episodes=args.episodes)
        # The learned policy is greedy with respect to Q: take the argmax action in each
        # state (epsilon-greedy exploration, i.e. argmax with probability 1 - epsilon,
        # is only used during training).
        q_policy = np.reshape(np.argmax(Q, axis=1), [env.nS]).tolist()
        policy = q_policy
        print_policy(q_policy)

    print('Scoring the policy...')
    if args.lake:
        score_frozen_lake(env, policy)
    if args.tower:
        score_tower_of_hanoi(env, policy)
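
# A hypothetical invocation (flag names inferred from the argparse options referenced in
# main(); the actual script filename is an assumption):
#   python main.py --lake --size 8 --discount 0.95 --vi --pi --q --episodes 5000
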
def get_environment(ENV_NAME):
    env_kwargs = {
        'map_name': ENV_NAME,
        'slip_rate': .2,
        'rewards': (-0.01, -1, 1)
    }
    env = FrozenLakeEnv(**env_kwargs)
    env = env.unwrapped
    return env
from utils import *
from methods import *
from joblib import Parallel, delayed
from MC import *
from true_online_GTD import *
import numpy as np
import numpy.matlib
import os
from frozen_lake import FrozenLakeEnv

unit = 1
env = FrozenLakeEnv(None, '4x4', True, unit)
N = env.observation_space.n

runtimes = 10
mc_episodes = int(1e7)
gamma = lambda x: 0.95
runtime = 0
target_policy = np.matlib.repmat(
    np.ones((1, env.action_space.n)) / env.action_space.n,
    env.observation_space.n, 1)
# per-state ground-truth statistics (N = env.observation_space.n)
true_expectations = np.zeros((runtimes, N))
true_variances = np.zeros((runtimes, N))
stationary_dists = np.zeros((runtimes, N))

cumulative_expectation = np.zeros((1, N))
cumulative_variance = np.zeros((1, N))
cumulative_distribution = np.zeros((1, N))
count = 0

directory = 'frozenlake'
filelist = os.listdir(directory)
for filename in filelist:
Example #4
import gym   # install with "pip install gym"
import itertools
import matplotlib.style
import sys
import numpy as np
import plotting

matplotlib.style.use('ggplot')

if "../" not in sys.path:
  sys.path.append("../") 

from collections import defaultdict
from frozen_lake import FrozenLakeEnv

env = FrozenLakeEnv()


def make_epsilon_greedy_policy(Q, epsilon, num_actions):
    """
    Creates an epsilon-greedy policy based
    on a given Q-function and epsilon.

    Returns a function that takes the state
    as an input and returns the probabilities
    for each action in the form of a numpy array
    of length of the action space(set of possible actions).
    """

    def policyFunction(state):
        Action_probabilities = np.ones(num_actions,
                                       dtype=float) * epsilon / num_actions
        # Put the remaining (1 - epsilon) probability mass on the greedy action.
        best_action = np.argmax(Q[state])
        Action_probabilities[best_action] += (1.0 - epsilon)
        return Action_probabilities

    return policyFunction
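
# A brief usage sketch (an illustrative assumption, not part of the original example):
# build an epsilon-greedy policy over a tabular Q and sample an action for state 0.
Q = defaultdict(lambda: np.zeros(env.action_space.n))
policy = make_epsilon_greedy_policy(Q, 0.1, env.action_space.n)
action_probs = policy(0)
action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
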
Example #5
def count_different_entries(a, b):
    assert a.size == b.size, 'Arrays need to be the same size'
    return a.size - np.sum(np.isclose(a, b))
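
# A quick worked example (illustrative only): two arrays that differ in exactly one entry,
#   count_different_entries(np.array([0, 1, 2, 3]), np.array([0, 1, 2, 0]))   # -> 1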

if __name__ == '__main__':
    
    for ENV_NAME in ENV_NAMES:
        gamma = 0.9
        theta = 0.0001        
        env_kwargs = {
            'map_name': ENV_NAME,
            'slip_rate': .2,
            'rewards': (-0.1, -1, 1)
        }
        print(ENV_NAME)
        pi_env = FrozenLakeEnv(**env_kwargs)
        pi_env = pi_env.unwrapped
        print('policy iteration begin')
        pi_policy, pi_V, pi_iter, pi_time = policy_iteration(pi_env, discount_factor=gamma, theta=theta)
        print('policy iteration end')
        visualize_policy(pi_policy, ENV_NAME, pi_env.desc.shape,'pi', 'Policy Iteration - Optimal Policy {} Iterations'.format(pi_iter))
        visualize_value(pi_V, ENV_NAME, pi_env.desc.shape,'pi', 'Policy Iteration - Estimated Value of each State')


    for ENV_NAME in ENV_NAMES:
        gamma = 0.85
        theta = 0.001        
        env_kwargs = {
            'map_name': ENV_NAME,
            'slip_rate': .2,
            'rewards': (-0.1, -1, 1)
Example #6
import numpy as np
import sys
import tensorflow as tf
import collections
import matplotlib.style
from frozen_lake import FrozenLakeEnv

if "../" not in sys.path:
  sys.path.append("../") 
from lib.envs.cliff_walking import CliffWalkingEnv
from lib import plotting

matplotlib.style.use('ggplot')

#env = CliffWalkingEnv()
#env = gym.make('FrozenLake-v0')
env = FrozenLakeEnv(is_slippery=False)

class PolicyEstimator():
    """
    Policy Function approximator. 
    """
    
    def __init__(self, learning_rate=0.001, scope="policy_estimator"):
        with tf.variable_scope(scope):
            self.state = tf.placeholder(tf.int32, [], "state")
            self.action = tf.placeholder(dtype=tf.int32, name="action")
            self.target = tf.placeholder(dtype=tf.float32, name="target")

            # This is just table lookup estimator
            state_one_hot = tf.one_hot(self.state, int(env.observation_space.n))
            self.output_layer = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(state_one_hot, 0),
                num_outputs=env.action_space.n,
                activation_fn=None,  # linear outputs; these arguments are an assumed completion of the truncated call
                weights_initializer=tf.zeros_initializer())
Example #7
#!/usr/bin/env python
# coding: utf-8

# In[11]:

from frozen_lake import FrozenLakeEnv
import numpy as np
import sys

# In[12]:

env = FrozenLakeEnv(map_name="4x4", is_slippery=False)

# Access the state space of the environment:
nS = env.observation_space
print("State space of the Env: ", nS)
# or access the number of states directly via env.nS:
nS = env.nS
print("State space of the Env by accessing env.nS: ", nS)

# Action space of the agent:
nA = env.nA
print("Action space of the Env: ", nA)

# In[13]:
"""
For policy iteration, you would need to access
State(s), Action(a), Next State(ns), Reward(r), episode ended? (is_done) tuples.

Note that in this environment, the orientation of the agent does not matter.
No matter what direction the agent is facing, if a left action is performed, 
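
# A short sketch of how those tuples are typically exposed, assuming this
# frozen_lake module follows the gym FrozenLake convention where env.P[s][a]
# is a list of (probability, next_state, reward, done) tuples:
for prob, ns, r, is_done in env.P[0][0]:
    print("P(ns={} | s=0, a=0) = {}, reward = {}, done = {}".format(ns, prob, r, is_done))
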
def main(t_expert=1e-2,
         t_irl=1e-2,
         gamma=1,
         h=10,
         n_traj=200,
         traj_len=10,
         learning_rate=0.01,
         epochs=300):
    '''
    Demonstrates the usage of the implemented MaxCausalEnt IRL algorithm. 
    
    First, a number of expert trajectories is generated using the true reward,
    which gives rise to a Boltzmann rational expert policy with temperature
    t_expert.

    Then the max_causal_ent_irl() function is used to find a reward vector that
    maximizes the log likelihood of the generated expert trajectories, modelling
    the expert as a Boltzmann rational agent with temperature t_irl.
    
    Parameters
    ----------
    t_expert : float >= 0
        The temperature parameter for computing V, Q and policy of the 
        Boltzmann rational expert: p(a|s) is proportional to exp(Q/t_expert);
        the closer temperature is to 0 the more rational the expert is.
    t_irl : float
        Temperature of the Boltzmann rational policy the IRL algorithm assumes
        the expert followed when generating the trajectories.
    gamma : float 
        Discount factor; 0<=gamma<=1.
    h : int
        Horizon for the finite horizon version of value iteration subroutine of
        MaxCausalEnt IRL algorithm.
    n_traj : int
        Number of expert trajectories generated.
    traj_len : int
        Number of timesteps in each of the expert trajectories.
    learning_rate : float
        Learning rate for gradient descent in the MaxCausalEnt IRL algorithm.
    epochs : int
        Number of gradient descent steps in the MaxCausalEnt IRL algorithm.
    '''
    np.random.seed(0)
    mdp = MDPOneTimeR(FrozenLakeEnv(is_slippery=False))    

    # Features
    feature_matrix = np.eye(mdp.nS)
    # Add dummy feature to show that features work
    if False:
        feature_matrix = np.concatenate((feature_matrix, np.ones((mdp.nS,1))), 
                                        axis=1)
    
    # The true reward weights and the reward
    theta_expert = np.zeros(feature_matrix.shape[1])
    theta_expert[24] = 1
    r_expert = np.dot(feature_matrix, theta_expert)
    
    # Compute the Boltzmann rational expert policy from the given true reward.
    if t_expert>0:
        V, Q, policy_expert = vi_boltzmann(mdp, gamma, r_expert, h, t_expert)
    if t_expert==0:
        V, Q, policy_expert = vi_rational(mdp, gamma, r_expert, h)
        
    # Generate expert trajectories using the given expert policy.
    trajectories = generate_trajectories(mdp, policy_expert, traj_len, n_traj)
    
    # Compute and print the stats of the generated expert trajectories.
    sa_visit_count, _ = compute_s_a_visitations(mdp, gamma, trajectories)
    log_likelihood = np.sum(sa_visit_count * (Q - V))
    print('Generated {} traj of length {}'.format(n_traj, traj_len))
    print('Log likelihood of all traj under the policy generated ', 
          'from the true reward: {}, \n average per traj step: {}'.format(
           log_likelihood, log_likelihood / (n_traj * traj_len)))
    print('Average return per expert trajectory: {} \n'.format(
            np.sum(np.sum(sa_visit_count, axis=1)*r_expert) / n_traj))

    # Find a reward vector that maximizes the log likelihood of the generated 
    # expert trajectories.
    theta = max_causal_ent_irl(mdp, feature_matrix, trajectories, gamma, h, 
                               t_irl, epochs, learning_rate)
    print('Final reward weights: ', theta)
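
# A minimal sketch (an illustration, not the vi_boltzmann implementation used above) of
# the Boltzmann rational policy described in the docstring, p(a|s) proportional to
# exp(Q(s,a)/t), assuming Q is an (nS, nA) numpy array and t > 0:
def boltzmann_policy_from_q(Q, t):
    logits = Q / t
    # Subtract the per-state maximum before exponentiating for numerical stability.
    unnormalized = np.exp(logits - logits.max(axis=1, keepdims=True))
    return unnormalized / unnormalized.sum(axis=1, keepdims=True)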
Example #9

if __name__ == "__main__":
    args = init_args()
    skip_render = args.no_render
    map_name = args.map_name
    is_slippery = args.slippery
    gamma = args.gamma
    tol = args.tol

    # comment/uncomment these lines to switch between deterministic/stochastic environments
    # env = gym.make("Deterministic-4x4-FrozenLake-v0")
    # env = gym.make("Stochastic-4x4-FrozenLake-v0")

    # using local customized env
    env = FrozenLakeEnv(map_name=map_name, is_slippery=is_slippery)

    print("\n" + "-" * 25 + "\nBeginning Policy Iteration\n" + "-" * 25)

    V_pi, p_pi = policy_iteration(env.P, env.nS, env.nA, gamma=gamma, tol=tol)
    print('# policy evaluations:', len(policy_eval_iter_count), ' : ',
          policy_eval_iter_count)
    print('Optimal policy:')
    print_policy(env, p_pi, V_pi)
    if not skip_render:
        render_single(env, p_pi, 100)

    print("\n" + "-" * 25 + "\nBeginning Value Iteration\n" + "-" * 25)

    V_vi, p_vi = value_iteration(env.P, env.nS, env.nA, gamma=gamma, tol=tol)
    print('# value iteration:', n_value_iter)
Example #10
            received_bits = received_bits + str(
                send_receive(int(bit), quantum_engine))
        received_bytes_list.append(received_bits)

    binary_to_string = ''.join([chr(int(x, 2)) for x in received_bytes_list])
    #print('Received Binary message: ', received_bytes_list)
    #print('Received message: ', binary_to_string)
    return binary_to_string


quantum_engine = MainEngine()
#message = 'DataEspresso'
#send_full_message(message=message,quantum_engine=quantum_engine)

#env = gym.make('FrozenLake-v0')
env = FrozenLakeEnv(is_slippery=False)

Q = np.zeros([env.observation_space.n, env.action_space.n])
lr = .8    # learning rate
y = .95    # discount factor
num_episodes = 2000
#jList = []
rList = []
for i in range(num_episodes):
    #Reset environment and get first new observation
    s = env.reset()
    rAll = 0
    d = False
    j = 0
    #The Q-Table learning algorithm
    while j < 99:
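        # A sketch of a typical body for this tabular Q-learning loop (an assumed
        # continuation in the standard pattern, not the original author's code):
        j += 1
        # Pick the greedy action with decaying random noise for exploration.
        a = np.argmax(Q[s, :] + np.random.randn(1, env.action_space.n) * (1. / (i + 1)))
        s1, r, d, _ = env.step(a)
        # Temporal-difference update of the Q-table.
        Q[s, a] = Q[s, a] + lr * (r + y * np.max(Q[s1, :]) - Q[s, a])
        rAll += r
        s = s1
        if d:
            break
    rList.append(rAll)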
Example #11
#!/usr/bin/env python
# coding: utf-8

# In[27]:

import numpy as np
from frozen_lake import FrozenLakeEnv
import random

env = FrozenLakeEnv()


def epsilon_greedy_action(env, Q, state, epsilon=0.3):
    n = random.uniform(0, 1)
    if n <= epsilon:
        return np.random.randint(env.action_space.n)
    else:
        return np.argmax(Q[state])


def Q_Learning(env, episodes=1000, gamma=0.91, alpha=0.1):
    Q = np.zeros([env.nS, env.nA])

    for i in range(episodes):
        finished = False

        env.reset()

        S = env.s

        while not finished:
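            # A sketch of a typical inner loop for this Q-learning routine (an assumed
            # continuation, reusing the epsilon-greedy helper defined above):
            A = epsilon_greedy_action(env, Q, S)
            S_next, R, finished, _ = env.step(A)
            # Q-learning update: bootstrap from the greedy value of the next state.
            Q[S, A] += alpha * (R + gamma * np.max(Q[S_next]) - Q[S, A])
            S = S_next

    return Q
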
def game(N_episodes, AI_type, Intrinsic_type):
    ############## Hyperparameters ##############
    env = FrozenLakeEnv()
    #memory = Memory(max_size=300)
    ppo = 0
    #n_episodes = number_of_episodes
    #n_actions = env.action_space.n
    #intrinsic = intrinsic
    #print(n_actions)
    #n_agents = 1
    #n_episodes = number_of_episodes
    #state_size = env.observation_space.n

    #env_name = "LunarLander-v2"
    # creating environment
    state_dim = env.observation_space.n
    action_dim = env.action_space.n
    render = False
    solved_reward = 230  # stop training if avg_reward > solved_reward
    log_interval = 20  # print avg reward in the interval
    max_episodes = N_episodes  # max training episodes
    max_timesteps = 100  # max timesteps in one episode
    n_latent_var = 64  # number of variables in hidden layer
    update_timestep = 200  # update policy every n timesteps
    lr = 0.002
    betas = (0.9, 0.999)
    gamma = 0.99  # discount factor
    K_epochs = 4  # update policy for K epochs
    eps_clip = 0.2  # clip parameter for PPO
    random_seed = None
    samp_rewards = []
    avg_rewards = []
    best_avg_reward = -np.inf
    n_agents = 1
    #############################################

    if random_seed:
        torch.manual_seed(random_seed)
        env.seed(random_seed)

    memory = Memory()
    ppo = PPO(state_dim, action_dim, n_latent_var, lr, betas, gamma, K_epochs,
              eps_clip)
    print(lr, betas)

    # logging variables
    running_reward = 0
    avg_length = 0
    timestep = 0
    avg_reward = 0
    ppo.memcount.delete()
    state_size = env.observation_space.n
    reward_rms = RunningMeanStd()
    obs_rms = RunningMeanStd()
    norm_step = 5000
    # Pre-run: take random actions to gather observations for initializing the
    # observation normalizer (obs_rms).
    next_obs = []
    for _ in range(norm_step):
        action_norm = np.random.randint(0, action_dim)
        state_norm, reward_norm, done_norm, _ = env.step(action_norm)
        state_norm = to_categorical(state_norm, state_size)  #optional
        next_obs.append(state_norm)
    obs_rms.update(next_obs)
    #print(obs_rms.mean)

    # training loop
    for i_episode in range(1, max_episodes + 1):
        state = env.reset()
        state = to_categorical(state, state_size)
        done = False
        t = 0
        episode_reward = 0
        intrinsic_rewards = 0
        reward = 0
        for t in range(max_timesteps):
            #while not done:
            timestep += 1
            t += 1

            # Running policy_old:
            action = ppo.policy_old.act(state, memory)
            state, reward, done, _ = env.step(action)
            state = to_categorical(state, state_size)

            #========================================================
            if ((AI_type == "PPO" or AI_type == "A2C")
                    and Intrinsic_type == "1"):
                intrinsic_rewards = get_intrinsic_rewards(
                    AI_type, state, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards1",intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "2"):
                intrinsic_rewards = get_intrinsic_rewards2(
                    AI_type, state, action, ppo, n_agents, 10)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards2",intrinsic_rewards)

            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "3"):
                intrinsic_rewards = get_intrinsic_rewards3(
                    AI_type, state, action, ppo, n_agents, reward, 1)
                intrinsic_rewards = intrinsic_rewards.data.numpy()
                #print("intrinsic_rewards3",intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "4"):
                intrinsic_rewards = get_intrinsic_rewards4(
                    AI_type, state, action, ppo, n_agents, reward * 10, t, 100,
                    0.99)
                #print("intrinsic_rewards---",intrinsic_rewards)
            elif ((AI_type == "PPO" or AI_type == "A2C")
                  and Intrinsic_type == "5"):
                intrinsic_rewards = get_intrinsic_rewards5(
                    AI_type, state, ppo, n_agents, 1, 16)

                #print("intrinsic_rewards5",intrinsic_rewards)
            else:
                intrinsic_rewards = 0
            #reward_sum = reward + intrinsic_rewards
            reward_sum = reward
            #===========================================================
            # Saving reward and is_terminal:
            memory.rewards.append(reward_sum)
            #temp_int = memory.intrinsic_rewards.data.numpy()
            #temp_int = memory.intrinsic_rewards
            #print(temp_int)
            memory.intrinsic_rewards.append(intrinsic_rewards)
            memory.is_terminals.append(done)
            """
            try:
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
                reward_rms.update_from_moments(mean1, std1 ** 2, count1)
                adv_int = (memory.intrinsic_rewards-reward_rms.mean)/np.sqrt(reward_rms.var)
            except:
                adv_int = 0
            """
            """
            print(temp_int.data.numpy())
            mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(temp_int)
            reward_rms.update_from_moments(mean1, std1 ** 2, count1)
            adv_int = (memory.intrinsic_rewards-reward_rms.mean)/np.sqrt(reward_rms.var)
            """

            # update the policy every update_timestep steps
            if timestep % update_timestep == 0:
                temp_int = memory.intrinsic_rewards
                mean1, std1, count1 = np.mean(temp_int), np.std(temp_int), len(
                    temp_int)
                reward_rms.update_from_moments(mean1, std1**2, count1)
                adv_int = (temp_int) / np.sqrt(reward_rms.var)
                ppo.update(memory, adv_int)
                memory.clear_memory()
                timestep = 0

            running_reward += reward
            episode_reward += reward
            if render:
                env.render()
            if done:
                break

        avg_length += t

        # stop training if avg_reward > solved_reward
        if running_reward > (log_interval * solved_reward):
            print("########## Solved! ##########")
            #torch.save(ppo.policy.state_dict(), './PPO_{}.pth'.format(env_name))
            #break

        # logging
        if i_episode % log_interval == 0:
            avg_length = int(avg_length / log_interval)
            running_reward = int((running_reward / log_interval))

            print('Episode {} \t avg length: {} \t reward: {}'.format(
                i_episode, avg_length, running_reward))
            running_reward = 0
            avg_length = 0

        samp_rewards.append(episode_reward)
        if (i_episode >= 100):
            # get average reward from last 100 episodes
            avg_reward = np.mean(samp_rewards[-100:])
            # append to deque
            avg_rewards.append(avg_reward)
            # update best average reward
            if avg_reward > best_avg_reward:
                best_avg_reward = avg_reward

        print("Total reward in episode {} = {}".format(i_episode,
                                                       episode_reward))
        print("Best_avg_reward =", np.round(best_avg_reward, 3),
              "Average_rewards =", np.round(avg_reward, 3))
    #env.save_replay()
    env.close()

    return avg_rewards, best_avg_reward, samp_rewards, "0"
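
# A brief usage sketch (hypothetical parameter values; the AI_type / Intrinsic_type
# strings match those checked inside game()):
#   avg_rewards, best_avg_reward, samp_rewards, _ = game(500, "PPO", "1")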
Example #13
    plt.title("Epsilon-greedy with decay (epsilon=%.1f, decay=%.3f)" % (epsilon, decay))
    plt.xlabel('Episode')
    plt.legend(loc='best')
    file_name = '{}/{}/{}_epsilondecay.png'.format(FIGURES_DIRECTORY, ENV_NAME, 'ql')
    plt.savefig(file_name, format='png', dpi=150) 
    plt.close()

if __name__ == '__main__':


    ENV_NAMES = [FL4x4, FL8x8, FL20x20]

    for ENV_NAME in ENV_NAMES:
        env = FrozenLakeEnv(
            map_name=ENV_NAME,
            rewards=(-0.01, -1, 1), # living, hole, goal
            slip_rate=0.2
        )
        env = env.unwrapped
        # Tunables
        method='greedy'
        n_episodes = 10000
        gamma = 0.90
        alpha = 0.75
        epsilon = 1.0
        decay = 0.999
        Ne = 10
        start = time()
        q, stats, Nsa, policy = q_learning(
            env=env,
            method=method,
Example #14
import argparse
import numpy as np
import numpy.matlib
from frozen_lake import FrozenLakeEnv

parser = argparse.ArgumentParser(description='')
parser.add_argument('--N', type=int, default=4, help='')
parser.add_argument('--alpha', type=float, default=0.05, help='')
parser.add_argument('--beta', type=float, default=0.05, help='')
parser.add_argument('--kappa', type=float, default=0.01, help='')
parser.add_argument('--episodes', type=int, default=int(1e7), help='')
parser.add_argument('--runtimes', type=int, default=16, help='')
parser.add_argument('--off_policy', type=int, default=0, help='')
args = parser.parse_args()

unit = 1.0
# experiment Preparation
env = FrozenLakeEnv(None, '%dx%d' % (args.N, args.N), True, unit)
runtimes, episodes, gamma = args.runtimes, args.episodes, lambda x: 0.95

target_policy = np.matlib.repmat(
    np.array([0.2, 0.3, 0.3, 0.2]).reshape(1, 4), env.observation_space.n, 1)
if args.off_policy == 0:
    behavior_policy = target_policy
else:
    behavior_policy = np.matlib.repmat(
        np.array([0.25, 0.25, 0.25, 0.25]).reshape(1, 4),
        env.observation_space.n, 1)
alpha, beta, kappa = args.alpha, args.beta, args.kappa

# get ground truth expectation, variance and stationary distribution
filename = 'frozenlake_truths_%dx%d.npz' % (args.N, args.N)
loaded = np.load(filename)
Example #15
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 14:59:23 2017

@author: wsn
"""
from frozen_lake import FrozenLakeEnv
env = FrozenLakeEnv()
print(env.__doc__)

# Some basic imports and setup
import numpy as np, numpy.random as nr, gym
np.set_printoptions(precision=3)
def begin_grading():
    print("\x1b[43m")


def end_grading():
    print("\x1b[0m")


# Seed RNGs so you get the same printouts as me
env.seed(0)
from gym.spaces import prng
prng.seed(10)
# Generate the episode
env.reset()
for t in range(100):
    env.render()
    a = env.action_space.sample()
    ob, rew, done, _ = env.step(a)
    if done:
        break
assert done
env.render();

class MDP(object):
    def __init__(self, P, nS, nA, desc=None):
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 15 16:47:07 2017

@author: wsn
"""

# -*- coding: utf-8 -*-
"""
Created on Tue Mar 14 14:59:23 2017

@author: wsn
"""
from frozen_lake import FrozenLakeEnv
env = FrozenLakeEnv()
print(env.__doc__)

# Some basic imports and setup
import numpy as np, numpy.random as nr, gym
np.set_printoptions(precision=3)


def begin_grading():
    print("\x1b[43m")


def end_grading():
    print("\x1b[0m")


# Seed RNGs so you get the same printouts as me
Example #17
#!/usr/bin/env python
# coding: utf-8

# In[5]:

import numpy as np
from frozen_lake import FrozenLakeEnv

environment = FrozenLakeEnv()
epochs = 1000
if_break = True


def Func(alpha, gamma):
    V = np.zeros(16)

    for epoch in range(epochs):
        state = 0  # initial state of each episode
        if_break = True
        while if_break:
            random_action = np.random.randint(4)
            # environment.P[state][action] is a list of (prob, next_state, reward, done)
            # tuples; this takes the next state of the first listed outcome.
            tupl = environment.P[state][random_action]
            next_state = tupl[0][1]

            if next_state == 15:
                R = 1
            else:
                R = 0

            V[state] = (V[state] + alpha *
                        (R + gamma * V[next_state] - V[state]))