import math

from pyrlap.domains.gridworld import GridWorld


class StandardPlanningModel(object):
    def __init__(self, true_mdp_code, do_discount, do_randchoose, do_temp):
        danger_r = -2
        goal_reward = 10
        init_ground = (0, 2)
        goal_s = (5, 2)
        fr_vals = [{'x': danger_r, 'o': 0}[v] for v in true_mdp_code]
        feature_rewards = dict(zip('opc', fr_vals))
        feature_rewards['y'] = goal_reward
        params = {
            'gridworld_array': ['.oooo.',
                                '.oppp.',
                                '.opccy',
                                '.oppc.',
                                '.cccc.'],
            'feature_rewards': feature_rewards,
            'absorbing_states': [goal_s, ],
            'init_state': init_ground,
            'wall_action': False,
            'step_cost': 0,
            'wait_action': False,
        }
        self.gw = GridWorld(**params)
        self.planner = self.gw.solve(discount_rate=do_discount,
                                     softmax_temp=do_temp,
                                     randchoose=do_randchoose)

    def trajectory_loglikelihood(self, wtraj):
        logl = 0
        for s, a in wtraj:
            adist = self.planner.act_dist(s)
            logl += math.log(adist[a])
        return logl
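
# A minimal usage sketch, not from the original source. The MDP code is a
# three-character string over 'x'/'o' assigning danger_r or 0 to the 'o',
# 'p', and 'c' features. The trajectory below is hypothetical; any
# (state, action) pairs using the arrow actions from the tests would do.
model = StandardPlanningModel('xox',
                              do_discount=.99,
                              do_randchoose=.1,
                              do_temp=.5)
demo_traj = [((0, 2), '>'), ((1, 2), '>')]
print(model.trajectory_loglikelihood(demo_traj))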
def test_qlearner_on_simple_deterministic_gridworld(self):
    gw = GridWorld(
        gridworld_array=['...........',
                         '.xxxxxxxxxy',
                         '.xxxxxxxxxx'],
        absorbing_states=[(10, 1), ],
        init_state=(0, 1),
        feature_rewards={'.': -1, 'x': -10, 'y': 100})
    np.random.seed(123)
    params = {
        'learning_rate': 1,
        'eligibility_trace_decay': .8,
        'initial_qvalue': 100
    }
    qlearn = Qlearning(gw, softmax_temp=1, discount_rate=.99, **params)
    qlearn.train(episodes=100, max_steps=100)
    test = qlearn.run(softmax_temp=0.0, randchoose=0.0, max_steps=50)
    totr = sum([r for s, a, ns, r in test])
    self.assertEqual(totr, 89)
def test_discretizedobmdp_stochastic_gridworld(self):
    seed_trajs = []
    non_std_t_features = {'g': {'2forward': .5, 'forward': .5}}
    exp_mdp = GridWorld(
        gridworld_array=[['w', 'y', 'w'],
                         ['w', 'w', 'w'],
                         ['w', 'w', 'w'],
                         ['g', 'w', 'g'],
                         ['w', 'w', 'w']],
        init_state=(1, 0),
        feature_rewards={'w': 0, 'r': 0, 'g': 0, 'y': 5},
        absorbing_states=[(1, 4), ],
        include_intermediate_terminal=True,
        non_std_t_features=non_std_t_features
    )
    exp_planner = exp_mdp.solve(discount_rate=.99,
                                randchoose=.5,
                                softmax_temp=0.0)
    for _ in range(50):
        traj = exp_planner.run()
        seed_trajs.append(traj)
    dobmdp = DiscretizedObserverBeliefMDPApproximation(
        planners=self.sto_planners,
        true_planner_name='s',
        belief_reward=5,
        belief_reward_type='true_gain',
        n_probability_bins=5,
        seed_trajs=seed_trajs
    )
    dobmdp_planner = dobmdp.solve(discount_rate=.99,
                                  softmax_temp=0.0,
                                  randchoose=0.0)
    traj = dobmdp_planner.run()
    s_i = dobmdp.planner_order.index('s')
    s_ismax = max_index(traj[-1][0][0]) == s_i
    self.assertTrue(s_ismax)
def setUp(self):
    det_planners = {}
    for mdpc in product('xo', repeat=2):
        mdpc = ''.join(mdpc)
        rewards = [{'x': -1, 'o': 0}[c] for c in mdpc]
        feature_rewards = dict(zip('pc', rewards))
        feature_rewards['w'] = 0
        feature_rewards['y'] = 5
        mdp = GridWorld(
            gridworld_array=[['w', 'p', 'y'],
                             ['w', 'c', 'w']],
            feature_rewards=feature_rewards,
            init_state=(0, 0),
            absorbing_states=[(2, 1)],
            include_intermediate_terminal=True
        )
        planner = mdp.solve(discount_rate=.99, softmax_temp=.5, randchoose=.1)
        det_planners[mdpc] = planner
    self.det_planners = det_planners

    sto_planners = {}
    for mdpc in 'sw':
        forward2_prob = {'s': .7, 'w': .3}[mdpc]
        non_std_t_features = {'g': {
            '2forward': forward2_prob,
            'forward': 1 - forward2_prob
        }}
        mdp = GridWorld(
            gridworld_array=[['w', 'y', 'w'],
                             ['w', 'w', 'r'],
                             ['r', 'w', 'w'],
                             ['g', 'w', 'g'],
                             ['w', 'w', 'w']],
            init_state=(1, 0),
            feature_rewards={'w': 0, 'r': -1, 'g': 0, 'y': 5},
            absorbing_states=[(1, 4), ],
            include_intermediate_terminal=True,
            non_std_t_features=non_std_t_features
        )
        planner = mdp.solve(discount_rate=.99, softmax_temp=.5, randchoose=.1)
        sto_planners[mdpc] = planner
    self.sto_planners = sto_planners
def __init__(self, true_mdp_code, do_discount, do_randchoose, do_temp,
             show_discount, show_reward, show_randchoose, show_temp,
             n_bins=8, seed_trajs=None, disc_tf=None, solved_planner=None):
    self.show_discount = show_discount
    self.show_randchoose = show_randchoose
    self.show_temp = show_temp

    #=============================#
    # If a solved planner is provided, there is nothing
    # else to compute.
    if solved_planner is not None:
        self.obmdp_planner = solved_planner
        self.obmdp = solved_planner.mdp
        return

    #=============================#
    #   Build set of ground MDPs  #
    #=============================#
    danger_r = -2
    goal_reward = 10
    init_ground = (0, 2)
    goal_s = (5, 2)

    feature_rewards = []
    for rs in product([0, danger_r], repeat=3):
        feature_rewards.append(dict(zip('opc', rs)))

    mdp_codes = []
    for fr in feature_rewards:
        rfc = ['o' if fr[f] == 0 else 'x' for f in 'opc']
        rfc = ''.join(rfc)
        mdp_codes.append(rfc)
        fr['y'] = goal_reward
        fr['.'] = 0

    planners = {}
    for mdpc, frewards in zip(mdp_codes, feature_rewards):
        params = {
            'gridworld_array': ['.oooo.',
                                '.oppp.',
                                '.opccy',
                                '.oppc.',
                                '.cccc.'],
            'feature_rewards': frewards,
            'absorbing_states': [goal_s, ],
            'init_state': init_ground,
            'wall_action': False,
            'step_cost': 0,
            'wait_action': False,
            'include_intermediate_terminal': True
        }
        mdp = GridWorld(**params)
        planner = mdp.solve(softmax_temp=do_temp,
                            randchoose=do_randchoose,
                            discount_rate=do_discount)
        planners[mdpc] = planner

    #===========================================#
    #   Build Observer Belief MDP and support   #
    #===========================================#
    obmdp = DiscretizedObserverBeliefMDPApproximation(
        n_probability_bins=n_bins,
        seed_trajs=seed_trajs,
        branch_steps=0,
        discretized_tf=disc_tf,
        planners=planners,
        true_planner_name=true_mdp_code,
        belief_reward_type='true_gain',
        only_belief_reward=False,
        belief_reward=show_reward,
        update_includes_intention=True)
    self.obmdp = obmdp
    self.obmdp_planner = None
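
# A minimal usage sketch, not from the original source. The class name
# ShowPlanningModel is hypothetical (only this __init__ appears here); the
# solve() call on the observer-belief MDP mirrors how the discretized
# observer-belief MDP is solved in the test above.
show_model = ShowPlanningModel(
    true_mdp_code='xox',
    do_discount=.99, do_randchoose=.1, do_temp=.5,
    show_discount=.99, show_reward=10, show_randchoose=0.0, show_temp=0.0)
show_model.obmdp_planner = show_model.obmdp.solve(
    discount_rate=show_model.show_discount,
    softmax_temp=show_model.show_temp,
    randchoose=show_model.show_randchoose)
showing_traj = show_model.obmdp_planner.run()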
import time

import numpy as np
import pandas as pd
import seaborn as sns

from pyrlap.domains.gridworld import GridWorld
from pyrlap.algorithms.qlearning import Qlearning
from pyrlap.domains.gridworld.gridworldvis import visualize_trajectory

# %%
gw = GridWorld(gridworld_array=['...........',
                                '.xxxxxxxxxy',
                                '.xxxxxxxxxx'],
               absorbing_states=[(10, 1), ],
               init_state=(0, 1),
               feature_rewards={'.': -1, 'x': -10, 'y': 100})
s_features = gw.state_features

# %%
np.random.seed(1234)
all_run_data = []

# %%
start = time.time()
for i in range(20):
    params = {
        'learning_rate': 1,
        # NOTE: the script is truncated after 'learning_rate' in the source;
        # everything from here to the end of the loop is an assumed
        # completion mirroring the unit test that trains a Q-learner on
        # this same grid world.
        'eligibility_trace_decay': .8,
        'initial_qvalue': 100
    }
    qlearn = Qlearning(gw, softmax_temp=1, discount_rate=.99, **params)
    qlearn.train(episodes=100, max_steps=100)
    all_run_data.append(qlearn.run(softmax_temp=0.0, randchoose=0.0))
def test_slip_state_world(self):
    state_features = [['w', 'x', 'x', 'x', 'x', 'w'],
                      ['w', 'a', 'a', 'a', 'a', 'y'],
                      ['w', 'x', 'x', 'x', 'x', 'w']]
    w = len(state_features[0])
    h = len(state_features)
    state_features = {(x, y): state_features[h - 1 - y][x]
                      for x, y in product(range(w), range(h))}
    absorbing_states = [(5, 1), ]
    feature_rewards = {'a': 0, 'b': 0, 'x': -1, 'y': 5, 'w': 0}
    slip_features = {'a': {'forward': .6, 'side': .4, 'back': 0}}
    params = {
        'width': w,
        'height': h,
        'state_features': state_features,
        'feature_rewards': feature_rewards,
        'absorbing_states': absorbing_states,
        'slip_features': slip_features,
        'init_state': (0, 1),
        'include_intermediate_terminal': True
    }
    gw = GridWorld(**params)
    planner = gw.solve(discount_rate=.99)

    true_policy = {
        (-2, -2): '%', (-1, -1): '%',
        (0, 0): '^', (0, 1): '>', (0, 2): 'v',
        (1, 0): '^', (1, 1): '>', (1, 2): 'v',
        (2, 0): '^', (2, 1): '>', (2, 2): 'v',
        (3, 0): '>', (3, 1): '>', (3, 2): '>',
        (4, 0): '>', (4, 1): '>', (4, 2): '>',
        (5, 0): '^', (5, 1): '%', (5, 2): 'v'
    }
    self.assertEqual(planner.optimal_policy, true_policy)

    np.random.seed(2223124)
    traj = []
    s = gw.get_init_state()
    for _ in range(20):
        a = planner.optimal_policy[s]
        ns = gw.transition(s, a)
        r = gw.reward(s, a, ns)
        traj.append((s, a, ns, r))
        s = ns
        if s in gw.absorbing_states:
            break
    true_traj = [((0, 1), '>', (1, 1), 0),
                 ((1, 1), '>', (1, 0), -1),
                 ((1, 0), '^', (1, 1), 0),
                 ((1, 1), '>', (2, 1), 0),
                 ((2, 1), '>', (3, 1), 0),
                 ((3, 1), '>', (3, 0), -1),
                 ((3, 0), '>', (4, 0), -1),
                 ((4, 0), '>', (5, 0), 0),
                 ((5, 0), '^', (5, 1), 5)]
    self.assertEqual(traj, true_traj)
def test_simple_deterministic_world(self):
    state_features = [['w', 'c', 'c', 'c', 'c', 'w'],
                      ['w', 'c', 'c', 'c', 'c', 'w'],
                      ['w', 'a', 'a', 'a', 'a', 'y'],
                      ['w', 'c', 'c', 'c', 'c', 'w'],
                      ['w', 'b', 'b', 'b', 'b', 'w'],
                      ['w', 'c', 'c', 'c', 'c', 'w']]
    w = len(state_features[0])
    h = len(state_features)
    state_features = {(x, y): state_features[h - 1 - y][x]
                      for x, y in product(range(w), range(h))}
    absorbing_states = [(5, 3), ]
    feature_rewards = {'a': -2, 'b': 0, 'c': -1, 'y': 1, 'w': 0}
    params = {
        'width': w,
        'height': h,
        'state_features': state_features,
        'feature_rewards': feature_rewards,
        'absorbing_states': absorbing_states,
        'init_state': (0, 3),
        'include_intermediate_terminal': True
    }
    gw = GridWorld(**params)
    planner = gw.solve(discount_rate=.99)

    true_policy = {
        (-2, -2): '%', (-1, -1): '%',
        (0, 0): '^', (0, 1): '>', (0, 2): 'v', (0, 3): 'v', (0, 4): 'v', (0, 5): 'v',
        (1, 0): '^', (1, 1): '>', (1, 2): 'v', (1, 3): '<', (1, 4): '<', (1, 5): '<',
        (2, 0): '^', (2, 1): '>', (2, 2): 'v', (2, 3): 'v', (2, 4): '<', (2, 5): '<',
        (3, 0): '^', (3, 1): '>', (3, 2): 'v', (3, 3): 'v', (3, 4): '>', (3, 5): '>',
        (4, 0): '>', (4, 1): '>', (4, 2): '>', (4, 3): '>', (4, 4): '>', (4, 5): '>',
        (5, 0): '^', (5, 1): '^', (5, 2): '^', (5, 3): '%', (5, 4): 'v', (5, 5): 'v'
    }
    self.assertEqual(planner.optimal_policy, true_policy)

    traj = []
    s = gw.get_init_state()
    while s not in gw.absorbing_states:
        a = planner.optimal_policy[s]
        ns = gw.transition(s, a)
        r = gw.reward(s, a, ns)
        traj.append((s, a, ns, r))
        s = ns
        if len(traj) > 100:
            break
    true_traj = [((0, 3), 'v', (0, 2), 0),
                 ((0, 2), 'v', (0, 1), 0),
                 ((0, 1), '>', (1, 1), 0),
                 ((1, 1), '>', (2, 1), 0),
                 ((2, 1), '>', (3, 1), 0),
                 ((3, 1), '>', (4, 1), 0),
                 ((4, 1), '>', (5, 1), 0),
                 ((5, 1), '^', (5, 2), 0),
                 ((5, 2), '^', (5, 3), 1)]
    self.assertEqual(traj, true_traj)
def get_gridworld(self, absorbing_states):
    # Build a grid world over this layout with the given absorbing states
    # (the argument was previously accepted but silently dropped).
    return GridWorld(width=self.width,
                     height=self.height,
                     walls=self.walls,
                     absorbing_states=absorbing_states)
from itertools import product

from pyrlap.hierarchicalrl.ham.ham import AbstractMachine, \
    HierarchyOfAbstractMachines
from pyrlap.domains.gridworld import GridWorld
from pyrlap.domains.taxicab.utils import get_building_walls

gw = GridWorld(width=7,
               height=7,
               walls=get_building_walls(corners=[(1, 1), (6, 1),
                                                 (6, 6), (1, 6)],
                                        exits=[((1, 5), '^'),
                                               ((5, 1), '>')]),
               absorbing_states=[(0, 6), ],
               reward_dict={(0, 6): 100},
               step_cost=-1,
               init_state=(4, 1))

outside_states = {
    (0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0),
    (0, 6), (1, 6), (2, 6), (3, 6), (4, 6), (5, 6), (6, 6),
    (0, 1), (0, 2), (0, 3), (0, 4), (0, 5),
    (6, 1), (6, 2), (6, 3), (6, 4), (6, 5)
}
inside_states = [s for s in product(range(7), range(7))
                 if s not in outside_states]


class Root(AbstractMachine):
def __init__(self,
             gw: GridWorld,
             tile_colors: dict = None,
             feature_colors: dict = None,
             ax: plt.Axes = None,
             figsize: tuple = None,
             title: str = None):
    default_feature_colors = {
        'a': 'orange',
        'b': 'purple',
        'c': 'cyan',
        'x': 'red',
        'p': 'pink',
        '.': 'white',
        'y': 'yellow',
        'g': 'yellow',
        'n': 'white',
        '#': 'black',
        'j': 'lightgreen'
    }
    if feature_colors is None:
        feature_colors = default_feature_colors
    else:
        temp_fcolors = copy.deepcopy(default_feature_colors)
        temp_fcolors.update(feature_colors)
        feature_colors = temp_fcolors

    if tile_colors is None:
        tile_colors = {}
    else:
        tile_colors = copy.copy(tile_colors)

    states_to_plot = []
    for s in gw.states:
        if gw.is_any_terminal(s):
            continue
        states_to_plot.append(s)
        if s in tile_colors:
            continue
        if (s, None) in gw.walls:
            continue
        f = gw.state_features.get(s, '.')
        tile_colors[s] = feature_colors.get(f, 'grey')

    if figsize is None:
        figsize = (5, 5)
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)
    if title is not None:
        ax.set_title(title)

    self.gw = gw
    self.feature_colors = feature_colors
    self.tile_colors = tile_colors
    self.ax = ax
    self.states_to_plot = states_to_plot
    self.annotations = {}
    self.trajectories = {}
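
# A minimal usage sketch, not from the original source. The class name
# GridWorldPlotter is hypothetical (only this __init__ appears here), and
# the sketch assumes the module-level imports this method depends on
# (copy and matplotlib.pyplot as plt).
import copy

import matplotlib.pyplot as plt

from pyrlap.domains.gridworld import GridWorld

gw = GridWorld(gridworld_array=['..y',
                                '.x.'],
               init_state=(0, 0),
               absorbing_states=[(2, 1), ],
               feature_rewards={'.': 0, 'x': -1, 'y': 1})
plotter = GridWorldPlotter(gw, title='toy 2x3 grid')
plt.show()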