def main():
    """Run Gambler's Problem and Frozen Lake value-iteration experiments."""
    if not os.path.exists('output'):
        os.makedirs('output')

    # Gambler's Problem: sweep the coin's heads probability.
    for p in [0.4, 0.5, 0.6]:
        GamblerValueIteration(GamblerEnv(p_heads=p), "GamblerHProb", p)

    # Gambler's Problem: sweep the maximum capital.
    for max_cap in [512, 777, 1023, 1024, 1025]:
        GamblerCapitalValue(GamblerEnv(max_capital=max_cap), "GamblerCapVal", max_cap)

    GamblerTime("GamblerTime")

    # Frozen Lake: run Q-learning, policy iteration, and value iteration in
    # parallel for each (map size, frozen-tile probability) combination.
    processes = []
    for size, p in [(4, 0.5), (4, 0.8), (8, 0.5), (8, 0.8)]:
        env = gym.make("FrozenLake-v0",
                       desc=frozen_lake.generate_random_map(size=size, p=p))
        processes += [
            multiprocessing.Process(target=runQ, args=(env, 5, size, p, 20000, 1000)),
            multiprocessing.Process(target=runP, args=(env, 5, size, p, 20000)),
            multiprocessing.Process(target=runV, args=(env, 5, size, p, 20000)),
        ]
    for proc in processes:
        proc.start()
    for proc in processes:
        proc.join()
def find_good_maps(map_p=0.8):
    sizes = MAP_SIZES
    # sizes = [4, 8]
    seeds = range(20)
    best_maps = {}
    for size in sizes:
        smallest_lost_games_perc = float('inf')
        best_map = None
        for seed in seeds:
            print(f'Finding best maps with size {size} (seed {seed})...')
            np.random.seed(seed)
            lake_map = generate_random_map(size=size, p=map_p)
            env = FrozenLakeEnv(desc=lake_map)
            optimal_policy, optimal_value_function = value_iteration(
                env, theta=0.0000001, discount_factor=0.999)
            optimal_policy_flat = np.where(optimal_policy == 1)[1]
            mean_number_of_steps, lost_games_perc = score_frozen_lake(
                env, optimal_policy_flat)
            if lost_games_perc < smallest_lost_games_perc:
                smallest_lost_games_perc = lost_games_perc
                best_map = lake_map
        best_maps[size] = {
            'lost_games_perc': smallest_lost_games_perc,
            'map': best_map
        }
    with open(f'best_maps_{map_p}.json', "wb") as f:
        f.write(json.dumps(best_maps).encode("utf-8"))
    return best_maps
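# `score_frozen_lake` is not defined in this snippet. A minimal sketch of what
# such a scorer might look like, assuming it rolls out the flat
# (state -> action) policy and returns (mean steps per episode, percentage of
# lost games); the episode count and step cap are illustrative assumptions,
# not taken from the original source.
import numpy as np

def score_frozen_lake(env, policy, n_episodes=1000, max_steps=1000):
    steps_per_episode, lost_games = [], 0
    for _ in range(n_episodes):
        state = env.reset()
        for step in range(1, max_steps + 1):
            state, reward, done, _ = env.step(policy[state])
            if done:
                steps_per_episode.append(step)
                if reward == 0:  # ended without reaching the goal
                    lost_games += 1
                break
    return np.mean(steps_per_episode), 100.0 * lost_games / n_episodes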
def get_env(self):
    random_map = generate_random_map(size=self.size, p=0.8)
    env = gym.make("FrozenLake-v0", desc=random_map)
    env.reset()
    env.render()
    return env
def test_frozenlake_dfs_map_generation():
    def frozenlake_dfs_path_exists(res):
        # Depth-first search from the start tile; succeed on reaching "G".
        frontier, discovered = [], set()
        frontier.append((0, 0))
        while frontier:
            r, c = frontier.pop()
            if (r, c) not in discovered:
                discovered.add((r, c))
                directions = [(1, 0), (0, 1), (-1, 0), (0, -1)]
                for x, y in directions:
                    r_new = r + x
                    c_new = c + y
                    if r_new < 0 or r_new >= size or c_new < 0 or c_new >= size:
                        continue
                    if res[r_new][c_new] == "G":
                        return True
                    if res[r_new][c_new] not in "#H":
                        frontier.append((r_new, c_new))
        return False

    map_sizes = [5, 10, 200]
    for size in map_sizes:
        new_frozenlake = generate_random_map(size)
        assert len(new_frozenlake) == size
        assert len(new_frozenlake[0]) == size
        assert frozenlake_dfs_path_exists(new_frozenlake)
def SecondGridWorld():
    # Create a Frozen Lake env from a 30x30 random map.
    rand_map = generate_random_map(size=30, p=.8)
    env = gym.make("FrozenLake-v0", desc=rand_map)
    env.reset()
    nA, nS = env.nA, env.nS
    # Convert the env's dict-of-lists transition model into (T, R) matrices.
    T = np.zeros([nA, nS, nS])
    R = np.zeros([nS, nA])
    for s in range(nS):
        for a in range(nA):
            transitions = env.P[s][a]
            for p_trans, next_s, rew, done in transitions:
                T[a, s, next_s] += p_trans
                R[s, a] = rew
            T[a, s, :] /= np.sum(T[a, s, :])
    q = mdp.QLearning(T, R, .98)
    q.run()
    qdf = pd.DataFrame(q.run_stats)
    qdf.to_csv(
        "C:/Users/wtomjack/.spyder/CS-7641-/ReinforcementLearning/frozenQ.csv")
    pi = mdp.PolicyIteration(T, R, .98)
    pi.run()
    print(len(pi.policy))
    vi = mdp.ValueIteration(T, R, .98)
    vi.run()
    print(len(vi.policy))
def init_envs(self, env):
    if env == 'TH':
        self.key = 'TH'
        self.noise = 0.0
        # self.N_range = list(range(2, 6))
        print("initiating envs: \n")
        for N in tqdm.tqdm(self.N_range):
            state_N = tuple(range(N, -1, -1))
            # env = TohEnv(initial_state=((3, 2, 1, 0), (), ()), goal_state=((), (), (3, 2, 1, 0)), noise=0)
            env = TohEnv(initial_state=(state_N, (), ()),
                         goal_state=((), (), state_N), noise=self.noise)
            self.env_list.append(env)
        self.env = self.env_list[0]
    elif env == 'FL':
        self.key = 'FL'
        self.FL_maps = {}
        # self.N_range = list(range(4,20))
        print("initiating envs: \n")
        for N in tqdm.tqdm(self.N_range):
            np.random.seed(777)
            self.noise = 0.8
            self.FL_maps[N] = generate_random_map(size=N, p=self.noise)
            self.env_list.append(
                gym.make("FrozenLake-v0", desc=self.FL_maps[N]))
        self.env = self.env_list[0]
    else:
        raise KeyError
def __init__(self, desc=None, map_name="4x4", is_slippery=True): if desc is None and map_name is None: desc = generate_random_map() elif desc is None: desc = MAPS[map_name] self.desc = desc = np.asarray(desc, dtype='c') self.nrow, self.ncol = nrow, ncol = desc.shape self.reward_range = (0, 1) nA = 4 nS = nrow * ncol isd = np.array(desc == b'S').astype('float64').ravel() isd /= isd.sum() P = {s: {a: [] for a in range(nA)} for s in range(nS)} def to_s(row, col): return row * ncol + col def inc(row, col, a): if a == LEFT: col = max(col - 1, 0) elif a == DOWN: row = min(row + 1, nrow - 1) elif a == RIGHT: col = min(col + 1, ncol - 1) elif a == UP: row = max(row - 1, 0) return row, col for row in range(nrow): for col in range(ncol): s = to_s(row, col) for a in range(4): li = P[s][a] letter = desc[row, col] if letter in b'GH': li.append((1.0, s, 0, True)) else: newrow, newcol = inc(row, col, a) newstate = to_s(newrow, newcol) newletter = desc[newrow, newcol] done = bytes(newletter) in b'GH' rew = float(newletter == b'G') li.append((1.0, newstate, rew, done)) self.P = P self.isd = isd self.lastaction = None # for rendering self.nS = nS self.nA = nA self.action_space = spaces.Discrete(self.nA) self.observation_space = spaces.Discrete(self.nS) self.seed() self.s = categorical_sample(self.isd, self.np_random) super(FL, self).__init__(nS, nA, P, isd)
def __init__(self, size=10, p=0.8):
    self.name = 'frozenlake'
    self.size = size
    random_map = generate_random_map(size=size, p=p)
    self.env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=True)
    self.env.seed(123)
    self.env.action_space.np_random.seed(123)
    self.env._max_episode_steps = 20000
    self.prob = self.probability_matrix()
    self.rewards = self.rewards_matrix()
    self.env.render()
def get_env(size, p=(1 - 1.0 / 40), one_hot_obs=True, neg_dead_rew=True,
            is_slippery=True):
    random_map = generate_random_map(size=size, p=p)
    env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=is_slippery)
    if neg_dead_rew:
        env = NegativeOnDeadWrapper(env)
    if one_hot_obs:
        env = Int2OneHotWrapper(env)
    return env
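# NegativeOnDeadWrapper and Int2OneHotWrapper are not defined in this snippet.
# Minimal sketches of what such wrappers might look like, assuming the first
# turns a zero-reward terminal step into a -1 penalty and the second one-hot
# encodes the integer state; the details are assumptions, not the original
# implementations.
import numpy as np
import gym

class NegativeOnDeadWrapper(gym.Wrapper):
    def step(self, action):
        obs, reward, done, info = self.env.step(action)
        if done and reward == 0:  # episode ended without reaching the goal
            reward = -1.0
        return obs, reward, done, info

class Int2OneHotWrapper(gym.ObservationWrapper):
    def __init__(self, env):
        super().__init__(env)
        n = env.observation_space.n
        self.observation_space = gym.spaces.Box(0.0, 1.0, (n,), dtype=np.float32)

    def observation(self, obs):
        one_hot = np.zeros(self.observation_space.shape, dtype=np.float32)
        one_hot[obs] = 1.0
        return one_hot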
def __init__(self):
    random_map = generate_random_map(size=8, p=0.8)
    self.env = gym.make("FrozenLake-v0", is_slippery=True, desc=random_map)
    self.env.reset()
    self.epsilon = INITIAL_EPSILON
    self.learning_rate = INITIAL_LEARNING_RATE
    self.action_space = self.env.action_space.n
    self.state_space = [self.env.observation_space.n]
    # Q-table of shape (n_states, n_actions), initialized uniformly in [-2, 0).
    self.q_table = np.random.uniform(low=-2, high=0,
                                     size=(self.state_space + [self.action_space]))
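# A minimal sketch (not from the original source) of the epsilon-greedy action
# choice and tabular Q-update an agent like the one above typically performs;
# the function and parameter names are illustrative assumptions.
import numpy as np

def choose_action(env, q_table, state, epsilon):
    # Explore with probability epsilon, otherwise act greedily w.r.t. Q.
    if np.random.random() < epsilon:
        return env.action_space.sample()
    return int(np.argmax(q_table[state]))

def q_update(q_table, state, action, reward, next_state, lr, discount=0.99):
    # Q-learning target: r + gamma * max_a' Q(s', a').
    target = reward + discount * np.max(q_table[next_state])
    q_table[state, action] += lr * (target - q_table[state, action])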
def __init__(self, desc=None, map_name="4x4", is_slippery=True): if desc is None and map_name is None: desc = generate_random_map() elif desc is None: desc = copy.deepcopy(MAPS[map_name]) self.initial_desc = copy.deepcopy(desc) self.map_name = map_name self.desc = desc = np.asarray(desc, dtype='c') self.nrow, self.ncol = desc.shape self.reward_range = (0, 1) self.max_episode_steps = 100 if map_name == '4x4' else 1000 self.num_steps = 0 self.done = False self.successful_attack = False self.lastaction_a = None self.lastaction_d = None self.lastplayer = None self.nA_a = 4 self.nS_a = self.nrow * self.ncol self.goal = np.nan self.holes = [] for row in range(self.nrow): for col in range(self.ncol): s = self.to_s(row, col) letter = desc[row, col] if letter in b'H': self.holes.append(s) if letter in b'G': self.goal = s # Basically max_position**num_hole self.nS_d = np.sum([((self.ncol * self.nrow) - 1)**(j + 1) for j in range(len(self.holes))]) self.nA_d = len(self.holes) * 4 self.action_space_a = spaces.Discrete(self.nA_a) self.observation_space_a = spaces.Discrete(self.nS_a) self.action_space_d = spaces.Discrete(self.nA_d) self.observation_space_d = spaces.Discrete(self.nS_d) self.seed() self.s_a = 0 self.s_d = self.to_s_d()
def run_experiments_part1():
    try:
        os.makedirs('report/images', exist_ok=True)
        print("Directory created successfully.")
    except OSError as error:
        print(f"Directory 'report/images' can not be created: {error}")
    print('STARTING EXPERIMENTS')
    env = gym.make("FrozenLake-v0")
    Frozen_Lake_Experiments(env=env, environment="FrozenLake-v0")
    random_map = generate_random_map(size=40, p=0.8)
    env = gym.make("FrozenLake-v0", desc=random_map)
    Frozen_Lake_Experiments(env=env, environment="FrozenLake-40x40")
    Taxi_Experiments()
    print('END OF EXPERIMENTS')
def custom_frozen_lake(size=8, p=0.8, nondeterministic=False):
    """
    Create a custom-sized frozen-lake environment.

    :param size: size x size lake
    :param p: probability of creating a frozen tile
    :param nondeterministic: whether the lake is slippery
    :return: environment
    based on: https://reinforcementlearning4.fun/2019/06/24/create-frozen-lake-random-maps/
    """
    random_map = generate_random_map(size=size, p=p)
    fl = gym.make('FrozenLake-v0', desc=random_map,
                  is_slippery=nondeterministic)
    return fl
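# A minimal usage sketch of the factory above (illustrative, not from the
# original source): roll one random episode on a slippery 12x12 lake.
env = custom_frozen_lake(size=12, p=0.9, nondeterministic=True)
obs = env.reset()
done = False
while not done:
    obs, reward, done, info = env.step(env.action_space.sample())
env.render()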
def __init__(self):
    random_map = generate_random_map(size=6, p=0.8)
    self.env = gym.make("FrozenLake-v0", is_slippery=True, desc=random_map)
    self.env.reset()
    self.state_space = self.env.observation_space.n
    self.action_space = self.env.action_space.n
    self.epsilon = INITIAL_EPSILON

    # Main model
    self.model = self._create_model()

    # Target network
    self.target_model = self._create_model()
    self.target_model.set_weights(self.model.get_weights())

    # An array with the last n steps for training
    self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

    # Used to count when to update the target network with the main network's weights
    self.target_update_counter = 0
def myFrozenLake(size=8, randomMap=True, slippery=False, rewarding=False,
                 equiProbable=True, frozenProb=0.9, seed=RS):
    setMySeed(seed)
    if randomMap:
        lake_map = generate_random_map(size, frozenProb)
        env_name = "MyFrozenLakeMap_size_{}_seed_{}-v0".format(size, seed)
        deleteEnvironment(env_name)
        joblib.dump(lake_map, env_name)
        gym.envs.register(id=env_name,
                          entry_point='gymEnvs:MyFrozenLakeEnv',
                          kwargs={
                              'desc': lake_map,
                              'is_slippery': slippery,
                              'rewarding': rewarding,
                              'equiProbable': equiProbable
                          },
                          max_episode_steps=size**4,
                          reward_threshold=0.78)
        return gym.make(env_name)
    else:
        env_name = "MyFrozenLakeMapCustom-v0"
        deleteEnvironment(env_name)
        gym.envs.register(id=env_name,
                          entry_point='gymEnvs:MyFrozenLakeEnv',
                          kwargs={
                              'map_name': '20x20',
                              'is_slippery': slippery,
                              'rewarding': rewarding,
                              'equiProbable': equiProbable
                          },
                          max_episode_steps=1000,
                          reward_threshold=0.78)
        return gym.make(env_name)
def getEnv(env_id='default', rH=0, rG=1, rF=0, size=4, map_name='4x4',
           is_slippery=True, render_initial=True, desc=None):
    if env_id in gym.envs.registry.env_specs:
        del gym.envs.registry.env_specs[env_id]
    map_desc = frozen_lake.generate_random_map(size) if not desc else desc
    register(
        id=env_id,  # name given to this new environment
        entry_point='my_env:CustomizedFrozenLake',  # env entry point
        kwargs={
            'rH': rH,
            'rG': rG,
            'rF': rF,
            'desc': map_desc,
            'map_name': map_name,
            'is_slippery': is_slippery
        }  # arguments passed to the env
    )
    this_env = make(env_id)
    this_env.seed(5)
    if render_initial:
        print('--Board--')
        this_env.render()
        print('\n--Actions for Position to the Left of the Goal--')
        pprint(this_env.P[this_env.nS - 2])
    return this_env
def __init__(self, map_size=30, map_prob=0.9, is_slippery=True, alt_reward=True):
    desc = generate_random_map(size=map_size, p=map_prob)
    self.desc = desc = np.asarray(desc, dtype='c')
    self.nrow, self.ncol = nrow, ncol = desc.shape
    self.actions_symbols = {0: "◄", 1: "▼", 2: "►", 3: "▲"}
    self.actions_symbols2 = {0: "←", 1: "↓", 2: "→", 3: "↑"}
    self.actions_text = {0: "left", 1: "down", 2: "right", 3: "up"}
    if alt_reward:
        self.reward_range = (-100, 100)
    else:
        self.reward_range = (0, 1)
    nA = 4
    nS = nrow * ncol
    isd = np.array(desc == b'S').astype('float64').ravel()
    isd /= isd.sum()
    P = {s: {a: [] for a in range(nA)} for s in range(nS)}

    def to_s(row, col):
        return row * ncol + col

    def inc(row, col, a):
        if a == LEFT:
            col = max(col - 1, 0)
        elif a == DOWN:
            row = min(row + 1, nrow - 1)
        elif a == RIGHT:
            col = min(col + 1, ncol - 1)
        elif a == UP:
            row = max(row - 1, 0)
        return row, col

    for row in range(nrow):
        for col in range(ncol):
            s = to_s(row, col)
            for a in range(4):
                li = P[s][a]
                letter = desc[row, col]
                if letter in b'GH':
                    li.append((1.0, s, 0, True))
                else:
                    if is_slippery:
                        # Slip to one of the three directions around the
                        # intended one, each with probability 1/3.
                        for b in [(a - 1) % 4, a, (a + 1) % 4]:
                            newrow, newcol = inc(row, col, b)
                            newstate = to_s(newrow, newcol)
                            newletter = desc[newrow, newcol]
                            done = bytes(newletter) in b'GH'
                            if newletter == b'G':
                                rew = 100.0 if alt_reward else 1.0
                            elif newletter == b'H':
                                rew = -100.0 if alt_reward else 0.0
                            else:
                                rew = -1.0 if alt_reward else 0.0
                            li.append((1.0 / 3.0, newstate, rew, done))
                    else:
                        newrow, newcol = inc(row, col, a)
                        newstate = to_s(newrow, newcol)
                        newletter = desc[newrow, newcol]
                        done = bytes(newletter) in b'GH'
                        if newletter == b'G':
                            rew = 100.0 if alt_reward else 1.0
                        elif newletter == b'H':
                            rew = -100.0 if alt_reward else 0.0
                        else:
                            rew = -1.0 if alt_reward else 0.0
                        li.append((1.0, newstate, rew, done))

    super(FrozenLakeModified, self).__init__(nS, nA, P, isd)
plt.ylabel('Average Reward')
if episode is None:
    f_name = 'base.png'
else:
    f_name = 'output/{}-{}-episode.png'.format(step, episode)
plt.savefig(f_name, format='png')
plt.clf()

start = time.time()

# Environment initialization
folder = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'q_learning')
folder2 = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'output')
random_map = generate_random_map(size=64, p=0.95)
env = gym.wrappers.Monitor(
    gym.make('FrozenLake8x8-v0', desc=random_map, map_name=None),
    folder, force=True)

# Q and rewards
if parent_step is None:
    # Q = np.zeros((env.observation_space.n, env.action_space.n))
    Q = np.random.rand(env.observation_space.n, env.action_space.n) * 0.0001
else:
    Q = np.loadtxt("output/{}.csv".format(parent_step), delimiter=',')
rewards = []
iterations = []
def learning_experiments():
    policy_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.PolicyIteration(P, R, gamma, max_iter=10000)
            pi.run()
            policy_iteration_times[states, i] = pi.time
            n_iterations[states, i] = pi.iter
    np.save(f'{PATH}/policy_iteration_times_forest.npy', policy_iteration_times)
    np.save(f'{PATH}/policy_iteration_n_iter_forest.npy', n_iterations)

    value_iteration_times = np.zeros((1000, 10, 10))
    n_iterations = np.zeros((1000, 10, 10))
    for j, epsilon in enumerate(np.linspace(0.1, 0.99, 10)):
        for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
            for states in range(2, 1000):
                P, R = example.forest(S=states)
                pi = mdp.mdp.ValueIteration(P, R, discount=gamma,
                                            max_iter=10000, epsilon=epsilon)
                pi.run()
                value_iteration_times[states, i, j] = pi.time
                n_iterations[states, i, j] = pi.iter
    np.save(f'{PATH}/value_iteration_times_forest.npy', value_iteration_times)
    np.save(f'{PATH}/value_iteration_n_iter_forest.npy', n_iterations)

    Q_iteration_times = np.zeros((1000, 10))
    n_iterations = np.zeros((1000, 10))
    for i, gamma in enumerate(np.linspace(0.1, 0.99, 10)):
        for states in range(2, 1000):
            P, R = example.forest(S=states)
            pi = mdp.mdp.QLearning(P, R, discount=gamma, n_iter=10000)
            pi.run()
            Q_iteration_times[states, i] = pi.time
            n_iterations[states, i] = pi.mean_discrepancy
    np.save(f'{PATH}/Q_iteration_times_forest.npy', Q_iteration_times)
    np.save(f'{PATH}/Q_iteration_n_iter_forest.npy', n_iterations)

    # MDP 2: FrozenLake
    from gym.envs.toy_text.frozen_lake import generate_random_map

    Q_iteration_times = np.zeros((100, 10, 10))
    Q_rewards = np.zeros((100, 10, 10))
    value_n_iterations = np.zeros((100, 10, 10))
    policy_n_iterations = np.zeros((100, 10, 10))
    total_states = np.zeros(100)
    for size in range(2, 100, 5):
        for i, gamma in enumerate(np.linspace(0, 1, 10)):
            for j, epsilon in enumerate(np.linspace(0, 1, 10)):
                random_map = generate_random_map(size=size, p=0.8)
                environment = gym.make('FrozenLake-v0', desc=random_map)
                test = QLearner(0.1, gamma, epsilon, verbose=False)
                start = time.time()
                n = test.learn(50)
                Q_iteration_times[size, i, j] = time.time() - start
                Q_rewards[size, i, j] = n[-1]
    np.save(f'{PATH}/Q_iteration_times_grid.npy', Q_iteration_times)
    np.save(f'{PATH}/Q_iteration_rewards_grid.npy', Q_rewards)

    value_iteration_times = np.zeros((100, 10))
    policy_iteration_times = np.zeros((100, 10))
    value_n_iterations = np.zeros((100, 10))
    policy_n_iterations = np.zeros((100, 10))
    total_states = np.zeros(100)
    for size in range(2, 100, 5):
        for i, gamma in enumerate(np.linspace(0, 1, 10)):
            random_map = generate_random_map(size=size, p=0.8)
            environment = gym.make('FrozenLake-v0', desc=random_map)
            total_states[size] = environment.nS
            agent = BasicLearner(environment, environment.nS, environment.nA,
                                 5000, gamma)
            start = time.time()
            opt_v2, opt_policy2, value_iter = agent.value_iteration()
            value_iteration_times[size, i] = time.time() - start
            value_n_iterations[size, i] = value_iter
            start = time.time()
            opt_v2, opt_policy2, policy_iter = agent.policy_iteration()
            policy_iteration_times[size, i] = time.time() - start
            policy_n_iterations[size, i] = policy_iter
    np.save(f'{PATH}/num_states_grid.npy', total_states)
    np.save(f'{PATH}/policy_iteration_times_grid.npy', policy_iteration_times)
    np.save(f'{PATH}/value_iteration_times_grid.npy', value_iteration_times)
    np.save(f'{PATH}/value_iteration_n_iter_grid.npy', value_n_iterations)
    np.save(f'{PATH}/policy_iteration_n_iter_grid.npy', policy_n_iterations)
import time

import numpy as np
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
from hiive.mdptoolbox import mdp, example
import matplotlib
matplotlib.use("TKAgg")
import matplotlib.pyplot as plt
import re

if __name__ == '__main__':
    start = time.time()
    np.random.seed(0)
    # TODO: probably should make multiple sizes; it will make tuning QL a
    # pain because so many iterations will be needed.
    # Grid world, frozen lake, small: 225 states.
    lake_size = 15
    random_map = generate_random_map(size=lake_size, p=0.8)
    env = gym.make("FrozenLake-v0", desc=random_map)
    env.reset()
    env.render()
    num_states = len(env.env.P)
    num_actions = len(env.env.P[0])
    transitions = np.zeros((num_actions, num_states, num_states))
    rewards = np.zeros((num_states, num_actions))
    # Convert the transition-matrix dict of dicts of lists into a rewards
    # matrix. Frozen lake has a mostly-zero matrix; it might be worth looking
    # at a sparse representation if it gets really big.
    for state in env.env.P:
        for action in env.env.P[state]:
# "FFFFFHFFFFFFFFFFHHFFFHFFFFFFFF", # "FFFFFFFFFFFHHFFFFFHHHFFHHFFFFF", # "HHHHFHFFFFFFFFFFHHFFFFFFFFFFFF", # "FFFFFHFFFFHHHFFFFFFFFFFFHFFFFF", # "FFFFFFFFFFFFFFFFHHFFFHFFFFFFFF", # "FFFFFHFFFFFFHFFFHHFFFFHHFFFFFF", # "FFFFFHFFFFFFFFFFHHFFFFFFFFHFFF", # "FFFFFFFFFFFHFFFFFFFFFFFFFFFFFF", # "FHHFFFHFFFFHFFFFFHFFFFHHFFFFFF", # "FHHFHFHFFFFFFFFFFFFFFFFFFFFFFF", # "FFFHFFFFFHFFFFHHFHFHFFFHHHHFFG" #] #env = gym.make('FrozenLake-v0') custom_map = generate_random_map(size=30, p=0.8) env = gym.make("FrozenLake-v0", desc=custom_map) # from https://learning.oreilly.com/library/view/hands-on-reinforcement-learning/9781788836524/e8ad36d5-21fe-442f-8133-3cee6bf31b2e.xhtml def value_iteration(env, gamma=1.0): aa = [] value_table = np.zeros(env.observation_space.n) no_of_iterations = 100000 threshold = 1e-10 for i in range(no_of_iterations): print(i) updated_value_table = np.copy(value_table)
def run_fl(size):
    seed_val = 42
    np.random.seed(seed_val)
    random.seed(seed_val)
    if size == 4:
        env = gym.make("FrozenLake-v0")
    else:
        seed_val = 58
        np.random.seed(seed_val)
        random.seed(seed_val)
        dim = size
        random_map = generate_random_map(size=dim, p=0.8)
        env = gym.make("FrozenLake-v0", desc=random_map)
    env.seed(seed_val)
    env.reset()
    # env.render()
    # env = gym.make('FrozenLake8x8-v0')
    env = env.unwrapped
    nA = env.action_space.n
    nS = env.observation_space.n
    best_V = ''
    best_won = -1
    best_policy = []
    gammas = [0.1, 0.3, 0.4, 0.7, 0.9, 0.99]
    epsilons = [0.1, 0.01, 0.001, 0.0001, 0.00001, 0.000001, 0.0000001,
                0.00000001, 0.000000001, 0.0000000001]
    # Overridden here for a single-configuration run.
    gammas = [0.3]
    epsilons = [0.0001]
    per_won_hm = np.zeros((len(gammas), len(epsilons)))
    iters_hm = np.zeros((len(gammas), len(epsilons)))
    time_hm = np.zeros((len(gammas), len(epsilons)))
    g_cnt = 0
    e_cnt = 0
    best_e = 0
    best_g = 0
    for g in gammas:
        e_cnt = 0
        for e in epsilons:
            if g >= 0.99 and e <= 0.001:
                per_won_hm[g_cnt][e_cnt] = 0
                iters_hm[g_cnt][e_cnt] = 0
                time_hm[g_cnt][e_cnt] = 0
            else:
                start = time.time()
                V = np.zeros(nS)
                policy = np.zeros(nS)
                policy_stable = False
                it = 0
                while not policy_stable:
                    policy_evaluation(env, V, policy, nS, e, g)
                    policy_stable = policy_improvement(env, V, policy, nA, nS, g)
                    it += 1  # count policy-iteration sweeps
                run_time = time.time() - start
                per_won = run_pi_episodes(env, V, policy, 10)
                per_won_hm[g_cnt][e_cnt] = per_won
                iters_hm[g_cnt][e_cnt] = it
                time_hm[g_cnt][e_cnt] = run_time * 1000
                print(g, e, it, per_won)
                if per_won > best_won:
                    best_e = e
                    best_g = g
                    best_V = V
                    best_policy = policy
                    best_won = per_won
            e_cnt += 1
        g_cnt += 1

    # Plot Percent Games Won heatmap
    fig, ax = plt.subplots()
    im, cbar = heatmap(per_won_hm, gammas, epsilons, ax=ax, cmap="YlGn",
                       cbarlabel="% Games Won")
    texts = annotate_heatmap(im, valfmt="{x:.2f}")
    fig.tight_layout()
    plt.savefig('Images\\PI-FL-Per_Heatmap' + str(size) + '.png')
    plt.show()

    # Plot Iterations heatmap
    fig, ax = plt.subplots()
    im, cbar = heatmap(iters_hm, gammas, epsilons, ax=ax, cmap="YlGn",
                       cbarlabel="# of Iterations to Convergence")
    texts = annotate_heatmap(im, valfmt="{x:.0f}")
    fig.tight_layout()
    plt.savefig('Images\\PI-FL-Iter_Heatmap' + str(size) + '.png')
    plt.show()

    # Plot Runtime heatmap
    fig, ax = plt.subplots()
    im, cbar = heatmap(time_hm, gammas, epsilons, ax=ax, cmap="YlGn",
                       cbarlabel="Runtime (ms)")
    texts = annotate_heatmap(im, valfmt="{x:.0f}")
    fig.tight_layout()
    plt.savefig('Images\\PI-FL-Time_Heatmap' + str(size) + '.png')
    plt.show()

    # Plot optimal state values with directions
    plot_values(V, best_policy, size)
    print(best_V.reshape((size, size)))
    print(best_policy.reshape((size, size)))
    print(best_e, best_g, best_won)
    return policy, V_ARR, V_SUM


np.random.seed(1111)

# Different sizes
N_ITERS = []
SIZE = np.arange(10, 40, 1)
TIME_ARR = []
for size in SIZE:
    print(size)
    np.random.seed(1111)
    random_map = generate_random_map(size=size, p=0.85)
    env = gym.make("FrozenLake-v0", desc=random_map, is_slippery=False)
    env.reset()
    time0 = time.time()
    policy, V_ARR, V_SUM = policy_improvement(env, discount_factor=0.99)
    time1 = time.time()
    N_ITERS.append(len(V_SUM))
    TIME_ARR.append(time1 - time0)

fig, ax = plt.subplots()
ax.plot(SIZE, N_ITERS, color='red', label="Number of iterations",
        linewidth=2.0, linestyle='-')
import gym
from gym import utils
import sys
import numpy as np
from gym.envs.toy_text.frozen_lake import generate_random_map

m = generate_random_map(8)

m_small = [
    "SHFHFHFF",
    "FFHFFFFF",
    "FFFFFFFF",
    "FFHFFFFF",
    "FFHHHFHF",
    "FFFFFFHF",
    "HHFFFFHF",
    "FHFHFFFG",
]

m_large = [
    "SFHFFFFFFFFFFFHFHFFFFFFFFFHFFFFF",
    "FFFFFFHFFFFFFFFFHFFHFFFFFFFFFFHF",
    "HFFFFFFFHFFFFFHHFHFFFFFFFFFFFFHF",
    "FFFFFFFFFFFFFFHFFFFFFFFHFFFHHFFF",
    "HFFFFHFFFFFFHFHFFFHFHFFFFFFFHFFF",
    "HFFFFFFHFFHFFFFFFFFFFFFFFFFFFFFF",
    "FFFFFFFFFFFHHHFFHFFFFFFFFFFFFFHF",
    "FFFFFFFHHFFFFFHFFFFFFFFFFHFFFHFF",
    "FFFFFFFFFFFFHFFFFHFFFHFFFFFFFFHH",
    "HFFFFFFFFFFFFFFFHFHFFFFFFFFFFFHF",
    "HFFFFFFFFFFFFHFFHFFHFFFFFFFFHHHF",
import mdp_copy
import numpy as np
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
from matplotlib import pyplot as plt
import seaborn as sns

ACTION_MAP = ['<', 'V', '>', '^']

if __name__ == '__main__':
    np.random.seed(300)
    grid_size = 30
    random_map = generate_random_map(size=grid_size)
    env = gym.make("FrozenLake-v0", desc=random_map)
    action_space = env.action_space.n
    observation_space = env.observation_space.n
    T = np.zeros((action_space, observation_space, observation_space))
    R = np.zeros((observation_space))
    for state in env.env.P.keys():
        choices = env.env.P[state]
        for action in choices.keys():
            outcomes = choices[action]
            for outcome in outcomes:
                prob, next_state, reward, terminal = outcome
                T[action][state][next_state] += prob
                if not terminal or state != next_state:
                    R[next_state] = reward
    # np.reshape returns a new array; assign the result, or the call is a no-op.
    R = R.reshape((grid_size, grid_size))

    # 0.9 discount
                        size=14)
            else:
                ax.text(j, i, mapping[actions[i, j]], ha='center',
                        va='center', color='k', size=12)
    fig.tight_layout()
    plt.show()


if __name__ == '__main__':
    env_name = 'FrozenLake8x8-v0'
    new_map = generate_random_map(size=30, p=0.9)
    conveged_at_determined = []
    policy_average_score_determined = []
    conveged_at_stochastic = []
    policy_average_score_stochastic = []
    gamma_list = []
    gamma_range = np.arange(0.1, 1.0, 0.1)
    for i in gamma_range:
        gamma = 1 - i / 100
        gamma_list.append(gamma)

        """ deterministic """
        env1 = gym.make(env_name, is_slippery=False)
        env1.seed(3006)
def sixteen_by_sixteen_map():
    return frozen_lake.generate_random_map(size=16)
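# A minimal usage sketch (illustrative, not from the original source): build a
# FrozenLake env from the 16x16 map above.
import gym

env = gym.make("FrozenLake-v0", desc=sixteen_by_sixteen_map())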
import numpy as np
import gym
from gym import wrappers
from gym.envs.toy_text.frozen_lake import generate_random_map
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
import random
import pandas as pd
import time

random_map = generate_random_map(size=20, p=0.8)


def run_episode(env, policy, gamma=1.0, render=False):
    """Run a single episode and return the total (discounted) reward."""
    obs = env.reset()
    total_reward = 0
    step_idx = 0
    while True:
        if render:
            env.render()
        obs, reward, done, _ = env.step(int(policy[obs]))
        # print("Reward received for each action:", reward)
        total_reward += (gamma ** step_idx * reward)
        # print("Total reward inside while loop:", total_reward)
        step_idx += 1
        if done:
            break
    return total_reward
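# A minimal usage sketch (illustrative, not from the original source): average
# run_episode over many rollouts to score a policy, assuming `policy` is an
# array mapping each state index to an action.
def evaluate_policy(env, policy, gamma=1.0, n=100):
    scores = [run_episode(env, policy, gamma) for _ in range(n)]
    return np.mean(scores)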
import statistics
import random
import numpy as np
import pandas as pd
import gym
from gym.envs.toy_text.frozen_lake import generate_random_map
import matplotlib.pyplot as plt

seed = 0

# Init PRNG
random.seed(seed)
np.random.seed(seed)

evnt = 'FrozenLake-v0'
size = 8
rndm = generate_random_map(size)
env = gym.make(evnt, desc=rndm)
env.seed(0)

import tensorflow as tf
tf.random.set_seed(seed)

from collections import deque
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam


def epsilon_greedy(model, state, epsilon, num_actions):
    greedy = np.random.uniform(0, 1)
import gym
import numpy as np
import tensorflow as tf
from gym.envs.toy_text.frozen_lake import generate_random_map

size = 40
random_map = generate_random_map(size=size, p=0.88)
env = gym.make("FrozenLake-v0", desc=random_map)

# Reward list, used to evaluate the agent's performance.
rList = []

# Used for experience replay.
experiences = []


def add_state_to_memory(state, reward):
    mem = {'state': state, 'reward': reward}
    experiences.append(mem)


def shape_reward(current_reward, current_state, done):
    # Keep the +1 goal reward, turn a zero-reward terminal step (falling
    # into a hole) into -1, and leave everything else unchanged.
    if current_reward == 1:
        return current_reward
    if done and current_reward == 0:
        return -1.0
    return current_reward
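# A minimal usage sketch (illustrative, not from the original source) showing
# where shape_reward and add_state_to_memory would sit in a rollout loop; the
# random action is a stand-in for the agent's policy.
state = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    next_state, reward, done, _ = env.step(action)
    reward = shape_reward(reward, next_state, done)
    add_state_to_memory(next_state, reward)
    state = next_state
rList.append(reward)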