def get_gridworld():
    gridworld = EnvBaseline(name='Sample Grid World',
                            s_hash_rowL=s_hash_rowL,
                            row_tickL=row_tickL, x_axis_label=x_axis_label,
                            col_tickL=col_tickL, y_axis_label=y_axis_label,
                            colorD={'Goal': 'g', 'Pit': 'r', 'Start': 'b'},
                            basic_color='skyblue')
    gridworld.set_info('Sample Grid World showing basic MDP creation.')

    # add actions from each state
    # (note: a_prob will be normalized within add_action_dict)
    gridworld.add_action_dict(actionD)

    # for each action, define the next state and transition probability
    # (here we use the layout definition to aid the logic)
    for s_hash, aL in actionD.items():
        for a_desc in aL:
            sn_hash = get_next_state(s_hash, a_desc)
            reward = rewardD.get(sn_hash, 0.0)

            # for a deterministic MDP, use t_prob=1.0
            gridworld.add_transition(s_hash, a_desc, sn_hash,
                                     t_prob=1.0, reward_obj=reward)

    # after the "add" commands, send all states and actions to the environment
    # (any required normalization is done here as well)
    gridworld.define_env_states_actions()

    # If there is a start state, define it here.
    gridworld.start_state_hash = 'Start'

    # If a limited number of start states are desired, define them here.
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # If a default policy is desired, define it as a dict.
    gridworld.default_policyD = {(0, 0): 'R', (1, 0): 'U', (0, 1): 'R',
                                 (0, 2): 'R', (1, 2): 'U', 'Start': 'U',
                                 (2, 2): 'U', (2, 1): 'R', (2, 3): 'L'}

    return gridworld

def get_env():
    env = EnvBaseline(name='Simple Six State World')
    env.set_info('Simple Six State World')

    actionD = {'A': ('U',),
               'B': ('ur', 'D'),
               '<C>': ('ur', 'dl'),
               'D': ('ur', 'ul')}

    rewardD = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    add_event('A', 'U', 'B')
    #add_event('A', 'Te', 'E')
    add_event('B', 'D', 'A')
    add_event('B', 'ur', '<C>')
    add_event('<C>', 'dl', 'B')
    add_event('<C>', 'ur', 'D')
    add_event('D', 'ur', 'F')
    add_event('D', 'ul', 'E')

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = [('*', 'E', '*', 'F'),
                   ('*', '*', 'D', '*'),
                   ('*', '<C>', '*', '*'),
                   ('B', '*', '*', '*'),
                   ('A', '*', '*', '*')]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = '<C>'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['B'] = 1
    policyD['<C>'] = 1
    policyD['D'] = 1
    env.default_policyD = policyD

    return env

def get_random_walk():
    env = EnvBaseline(name='Random Walk MRP')  # GenericLayout set below
    env.set_info('Random Walk MRP')

    actionD = {'A': ('L', 'R'),
               'B': ('L', 'R'),
               'C': ('L', 'R'),
               'D': ('L', 'R'),
               'E': ('L', 'R')}

    rewardD = {'Win': 1.0, 'Lose': 0.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash', s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------
    s_hash_rowL = [mrpL]
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['A'] = ('L', 'R')
    policyD['B'] = ('L', 'R')
    policyD['C'] = ('L', 'R')
    policyD['D'] = ('L', 'R')
    policyD['E'] = ('L', 'R')
    env.default_policyD = policyD

    return env

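# ----------------------------------------------------------------------
# Under the equiprobable L/R policy the true values of states A..E are
# 1/6, 2/6, 3/6, 4/6, 5/6 (Sutton & Barto, Example 6.2).  The sketch below
# is a quick first-visit Monte Carlo check of that fact, independent of
# EnvBaseline; the episode count and seed are arbitrary choices.
import random

def mc_random_walk_values(n_episodes=20000, seed=1):
    """Estimate v(A)..v(E) for the 5-state random walk by simulation."""
    random.seed(seed)
    chain = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
    returns = {s: [] for s in 'ABCDE'}
    for _ in range(n_episodes):
        pos = 3                     # every episode starts at 'C'
        visited = set()
        while chain[pos] not in ('Win', 'Lose'):
            visited.add(chain[pos])
            pos += random.choice((-1, 1))
        g = 1.0 if chain[pos] == 'Win' else 0.0   # undiscounted return
        for s in visited:
            returns[s].append(g)
    return {s: sum(gL) / len(gL) for s, gL in returns.items()}
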
def get_gambler(prob_heads=0.4):
    gambler = EnvBaseline(name='Gamblers Coin Flip Problem',
                          s_hash_rowL=s_hash_rowL,
                          colorD={100: 'g', 0: 'r'},
                          basic_color='skyblue')
    gambler.set_info('Example 4.3 from Sutton & Barto 2nd Edition page 84.')

    # define all possible actions (stake 1 up to min(s, 100-s))
    for s in range(1, 100):  # 1 to 99
        s_max = min(s, 100 - s)
        for a_desc in range(1, s_max + 1):
            gambler.add_action(s, a_desc, a_prob=1.0)

    # define reward for all states (only reaching 100 pays off)
    def get_reward(sn):
        if sn == 100:
            return 1.0
        else:
            return 0.0

    # define all possible transitions
    for s in range(1, 100):  # 1 to 99
        s_max = min(s, 100 - s)
        for a_desc in range(1, s_max + 1):
            # tails: lose the stake
            sn_hash = s - a_desc
            rval = get_reward(sn_hash)
            gambler.add_transition(s, a_desc, sn_hash,
                                   t_prob=1.0 - prob_heads, reward_obj=rval)
            # heads: win the stake
            sn_hash = s + a_desc
            rval = get_reward(sn_hash)
            gambler.add_transition(s, a_desc, sn_hash,
                                   t_prob=prob_heads, reward_obj=rval)

    gambler.define_env_states_actions()  # send all states and actions to environment

    # If there is a start state, define it here.
    gambler.start_state_hash = 50

    # define default policy (if any)
    gambler.default_policyD = {}

    return gambler

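# ----------------------------------------------------------------------
# A minimal, self-contained value-iteration sketch for the same gambler's
# problem (Sutton & Barto, Example 4.3), shown only to illustrate what the
# environment above encodes.  It does not use EnvBaseline; the names V and
# theta are illustrative, not part of the library.
def gambler_value_iteration(prob_heads=0.4, theta=1.0e-9):
    V = [0.0] * 101        # V[0] and V[100] are terminal and stay at 0.0
    while True:
        delta = 0.0
        for s in range(1, 100):
            best = 0.0
            for a in range(1, min(s, 100 - s) + 1):
                # heads: capital grows by a (reaching 100 pays 1.0); tails: it shrinks by a
                win = 1.0 if s + a == 100 else 0.0
                q = prob_heads * (win + V[s + a]) + (1.0 - prob_heads) * V[s - a]
                best = max(best, q)
            delta = max(delta, abs(best - V[s]))
            V[s] = best
        if delta < theta:
            return V
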
def get_gridworld(step_reward=-0.04):
    gridworld = EnvBaseline(name='Sutton Ex4.1 5x5 Grid World',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Sutton 5x5 Gridworld
        Book Answer from page 65 (linear eqn solve) for gamma=0.9:
        22.0  24.4  22.0  19.4  17.5
        19.8  22.0  19.8  17.8  16.0
        17.8  19.8  17.8  16.0  14.4
        16.0  17.8  16.0  14.4  13.0
        14.4  16.0  14.4  13.0  11.7
        =================================================""")

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0
        if action == 'N':
            di = 1
        elif action == 'S':
            di = -1
        elif action == 'E':
            dj = 1
        elif action == 'W':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if (i == 4) and (j == 1):      # the +10 jump state (state A in the book's figure)
            i_next = 0
            j_next = 1
            reward = 10
        elif (i == 4) and (j == 3):    # the +5 jump state (state B in the book's figure)
            i_next = 2
            j_next = 3
            reward = 5
        elif (i_next < 0) or (i_next > 4) or (j_next < 0) or (j_next > 4):
            # moves off the grid leave the state unchanged and cost -1
            i_next = i
            j_next = j
            reward = -1

        state_next_hash = (i_next, j_next)
        return reward, state_next_hash

    # define default policy (equiprobable actions in every state)
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(5):
        for j in range(5):
            s_hash = (i, j)
            gridworld.default_policyD[s_hash] = ('N', 'S', 'E', 'W')

            for a_desc in ['N', 'S', 'E', 'W']:
                gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized
                reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)
                # add each event to transitions object
                gridworld.add_transition(s_hash, a_desc, sn_hash,
                                         t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.start_state_hash = (0, 0)
    return gridworld

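# ----------------------------------------------------------------------
# The "Book Answer" quoted in set_info() comes from solving the Bellman
# equations of the equiprobable policy directly: v = (I - gamma*P)^-1 r.
# A minimal numpy sketch of that solve, re-implementing the transition
# logic locally so it runs without EnvBaseline (function and variable
# names here are illustrative only):
def solve_5x5_equiprobable(gamma=0.9):
    import numpy as np

    def step(i, j, action):
        di, dj = {'N': (1, 0), 'S': (-1, 0), 'E': (0, 1), 'W': (0, -1)}[action]
        if (i, j) == (4, 1):            # the +10 jump state
            return 10.0, (0, 1)
        if (i, j) == (4, 3):            # the +5 jump state
            return 5.0, (2, 3)
        i2, j2 = i + di, j + dj
        if 0 <= i2 <= 4 and 0 <= j2 <= 4:
            return 0.0, (i2, j2)
        return -1.0, (i, j)             # off-grid moves cost -1 and stay put

    idx = {(i, j): i * 5 + j for i in range(5) for j in range(5)}
    P = np.zeros((25, 25))
    r = np.zeros(25)
    for (i, j), k in idx.items():
        for a in 'NSEW':
            reward, sn = step(i, j, a)
            P[k, idx[sn]] += 0.25
            r[k] += 0.25 * reward
    v = np.linalg.solve(np.eye(25) - gamma * P, r)
    # the 25 values match the quoted table; the printed row order depends on
    # the external s_hash_rowL layout convention
    return v.reshape(5, 5)
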
def get_gridworld(step_reward=-1, height=7, goal=(3, 7),
                  windT=(0, 0, 0, 1, 1, 1, 2, 2, 1, 0)):
    """
    Windy Gridworld with (0,0) at lower left.
    Width is defined by the length of the windT tuple.
    """
    gridworld = EnvBaseline(name='Windy Kings Gridworld')  # GenericLayout set below
    gridworld.set_info("""""")

    width = len(windT)

    def get_action_snext(s_hash, action):
        """returns state_next_hash"""
        di = 0
        dj = 0
        if 'N' in action:
            di = 1
        elif 'S' in action:
            di = -1

        if 'E' in action:
            dj = 1
        elif 'W' in action:
            dj = -1

        (i, j) = s_hash
        wind_di = windT[j]

        i_next = i + di
        # constrain basic move to be inside the grid
        i_next = max(0, min(height - 1, i_next))
        i_next += wind_di  # add wind to constrained move
        j_next = j + dj

        # constrain next position to be inside the grid
        i_next = max(0, min(height - 1, i_next))
        j_next = max(0, min(width - 1, j_next))

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            state_next_hash = 'Goal'

        return state_next_hash

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash == goal:
                pass  # the goal position becomes the terminal state 'Goal'
            else:
                gridworld.default_policyD[s_hash] = ('N', 'S', 'E', 'W',
                                                     'NE', 'SE', 'SW', 'NW')
                for a_desc in ['N', 'S', 'E', 'W', 'NE', 'SE', 'SW', 'NW']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized
                    sn_hash = get_action_snext(s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=step_reward)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for 2D output --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):
        rowL = []
        for j in range(width):
            s_hash = (i, j)
            if s_hash == goal:
                s_hash = 'Goal'
            rowL.append(s_hash)
        # use insert to put (0,0) at lower left (append would put it at upper left)
        s_hash_rowL.insert(0, rowL)

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     col_tickL=windT,
                                     x_axis_label='Upward Wind Speed')

    gridworld.start_state_hash = (3, 0)
    return gridworld

def get_gridworld(step_reward=0.0):
    gridworld = EnvBaseline(name='Simple Grid World')  # GenericLayout set below
    gridworld.set_info('Simple Grid World Example.')

    actionD = {(0, 0): ('D', 'R'),
               (0, 1): ('L', 'R'),
               (0, 2): ('L', 'D', 'R'),
               (1, 0): ('U', 'D'),
               (1, 2): ('U', 'D', 'R'),
               (2, 0): ('U', 'R'),
               (2, 1): ('L', 'R'),
               (2, 2): ('L', 'R', 'U'),
               (2, 3): ('L', 'U')}

    rewardD = {(0, 3): 1, (1, 3): -1}

    for state_hash, actionL in actionD.items():
        for action_desc in actionL:
            gridworld.add_action(state_hash, action_desc, a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash
            if a == 'U':
                state_next_hash = (s[0] - 1, s[1])
            elif a == 'D':
                state_next_hash = (s[0] + 1, s[1])
            elif a == 'R':
                state_next_hash = (s[0], s[1] + 1)
            elif a == 'L':
                state_next_hash = (s[0], s[1] - 1)

            reward_val = rewardD.get(state_next_hash, step_reward)

            gridworld.add_transition(state_hash, action_desc, state_next_hash,
                                     t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.layout = GenericLayout(gridworld)  # uses default "get_layout_row_col_of_state"

    # If there is a start state, define it here.
    gridworld.start_state_hash = (2, 0)
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # define default policy (if any)
    # Policy Dictionary for: GridWorld
    policyD = {}  # index=state_hash, value=action_desc
    # Vpi shown for gamma=0.9
    policyD[(0, 0)] = 'R'  # Vpi=0.81
    policyD[(1, 0)] = 'U'  # Vpi=0.729
    policyD[(0, 1)] = 'R'  # Vpi=0.9
    policyD[(0, 2)] = 'R'  # Vpi=1.0
    policyD[(1, 2)] = 'U'  # Vpi=0.9
    policyD[(2, 0)] = 'U'  # Vpi=0.6561
    policyD[(2, 2)] = 'U'  # Vpi=0.81
    policyD[(2, 1)] = 'R'  # Vpi=0.729
    policyD[(2, 3)] = 'L'  # Vpi=0.729
    gridworld.default_policyD = policyD

    return gridworld

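# ----------------------------------------------------------------------
# The Vpi comments above follow directly from gamma=0.9 and the default
# step_reward=0.0: along the deterministic path to the +1 terminal at (0, 3),
# a state that is k steps away from collecting the reward has value
# 0.9**(k - 1).  A quick check of the quoted numbers:
for k in range(1, 5):
    print(k, 'step(s) from the +1 reward:', 0.9 ** (k - 1))
# -> 1.0, 0.9, 0.81, 0.729 ... matching the Vpi values noted in policyD
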
def get_robot(step_reward=-0.04):
    gridworld = EnvBaseline(name='Slippery Cleaning Robot',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Example taken from "Dissecting Reinforcement Learning-Part 1"
        Dec 9, 2016, by Massimiliano Patacchiola
        https://mpatacchiola.github.io/blog/2016/12/09/dissecting-reinforcement-learning.html
        """)

    def get_right_angle_list(a):
        """returns the two actions at right angles to action a"""
        if a == 'U':
            raL = ['L', 'R']
        elif a == 'D':
            raL = ['L', 'R']
        elif a == 'R':
            raL = ['U', 'D']
        elif a == 'L':
            raL = ['U', 'D']
        return raL

    def get_move_s_next(a, s):
        """returns the next state for action a taken from state s"""
        sn = s
        if a == 'U':
            sn = (s[0] + 1, s[1])
        elif a == 'D':
            sn = (s[0] - 1, s[1])
        elif a == 'R':
            sn = (s[0], s[1] + 1)
        elif a == 'L':
            sn = (s[0], s[1] - 1)

        if sn == (2, 2):  # can't move into the block in the middle
            sn = s

        # limit moves to inside the edges
        sn_hash = (clamp(sn[0], 1, 3), clamp(sn[1], 1, 4))
        return sn_hash

    non_termL = [(3, 1), (3, 2), (3, 3),
                 (2, 1), (2, 3),
                 (1, 1), (1, 2), (1, 3), (1, 4)]
    rewardD = {(3, 4): 1, (2, 4): -1}

    # each action moves as intended 80% of the time and slips to each
    # right-angle neighbor 10% of the time
    for s_hash in non_termL:
        for a_desc in ['U', 'D', 'L', 'R']:
            gridworld.add_action(s_hash, a_desc, a_prob=0.25)

            # intended move (80%)
            sn_hash = get_move_s_next(a_desc, s_hash)
            reward_val = rewardD.get(sn_hash, step_reward)
            gridworld.add_transition(s_hash, a_desc, sn_hash,
                                     t_prob=0.8, reward_obj=reward_val)

            # right-angle slips (10% each)
            right_angL = get_right_angle_list(a_desc)
            for ar_desc in right_angL:
                sn_hash = get_move_s_next(ar_desc, s_hash)
                reward_val = rewardD.get(sn_hash, step_reward)
                gridworld.add_transition(s_hash, a_desc, sn_hash,
                                         t_prob=0.1, reward_obj=reward_val)

    gridworld.define_env_states_actions()

    # If there is a start state, define it here.
    gridworld.start_state_hash = (1, 1)

    # define default policy (if any)
    policyD = {}  # index=s_hash, value=a_desc
    policyD[(3, 1)] = 'R'
    policyD[(3, 3)] = 'R'
    policyD[(3, 2)] = 'R'
    policyD[(2, 1)] = 'U'
    policyD[(2, 3)] = 'U'
    policyD[(1, 1)] = 'U'
    policyD[(1, 2)] = 'L'
    policyD[(1, 3)] = 'L'
    policyD[(1, 4)] = 'L'
    gridworld.default_policyD = policyD

    return gridworld

def get_gridworld(step_reward=0.0, width=9, height=6,
                  goal=(0, 8), start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(name='Sutton Ex8.1 Dyna Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0
        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        # stay on the grid
        if j_next >= width:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height:
            i_next = i
        elif i_next < 0:
            i_next = i

        # walls block movement
        if (i_next, j_next) in wallL:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash != goal:
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')

                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized
                    reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for 2D output --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):  # append keeps (0,0) at the upper left
        rowL = []
        for j in range(width):
            s = (i, j)
            if s in wallL:
                rowL.append('"Wall"')
            else:
                rowL.append(s)
        s_hash_rowL.append(rowL)

    named_s_hashD = {start: 'Start', goal: 'Goal'}

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start
    return gridworld

def get_robot():
    robot = EnvBaseline(name='Slow-Fast Fallen Robot',
                        s_hash_rowL=s_hash_rowL)
    robot.set_info("""
        Sample 3 State (Fallen, Standing, Moving) Robot.
        https://sandipanweb.wordpress.com/2017/03/23/some-reinforcement-learning-using-policy-value-iteration-and-q-learning-for-a-markov-decision-process-in-python-and-r/
        Some Reinforcement Learning: Using Policy & Value Iteration and Q-learning
        for a Markov Decision Process in Python and R
        """)

    robot.add_action('Fallen', 'Slow', a_prob=1.0)
    robot.add_action('Standing', 'Slow', a_prob=1.0)
    robot.add_action('Moving', 'Slow', a_prob=1.0)
    robot.add_action('Standing', 'Fast', a_prob=1.0)
    robot.add_action('Moving', 'Fast', a_prob=1.0)

    robot.add_transition('Fallen', 'Slow', 'Fallen', t_prob=0.6, reward_obj=-1.0)
    robot.add_transition('Fallen', 'Slow', 'Standing', t_prob=0.4, reward_obj=1.0)

    robot.add_transition('Standing', 'Slow', 'Moving', t_prob=1.0, reward_obj=1.0)
    robot.add_transition('Moving', 'Slow', 'Moving', t_prob=1.0, reward_obj=1.0)

    robot.add_transition('Standing', 'Fast', 'Moving', t_prob=0.6, reward_obj=2.0)
    robot.add_transition('Standing', 'Fast', 'Fallen', t_prob=0.4, reward_obj=-1.0)

    robot.add_transition('Moving', 'Fast', 'Moving', t_prob=0.8, reward_obj=2.0)
    robot.add_transition('Moving', 'Fast', 'Fallen', t_prob=0.2, reward_obj=-1.0)

    robot.define_env_states_actions()  # send all states and actions to environment

    robot.start_state_hash = 'Standing'

    # define default policy (if any)
    policyD = {}  # index=state_hash, value=action_desc
    policyD['Standing'] = 'Slow'
    policyD['Fallen'] = 'Slow'
    policyD['Moving'] = 'Slow'
    robot.default_policyD = policyD

    return robot

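# ----------------------------------------------------------------------
# Because this 3-state MDP is fully specified by the numbers above, the
# default all-'Slow' policy can be evaluated with a few lines of plain
# Python.  A minimal sketch; gamma and the tolerance are illustrative
# choices, not values taken from the source.
def evaluate_slow_policy(gamma=0.9, tol=1.0e-8):
    # (state, action) -> list of (next_state, probability, reward)
    T = {('Fallen', 'Slow'):   [('Fallen', 0.6, -1.0), ('Standing', 0.4, 1.0)],
         ('Standing', 'Slow'): [('Moving', 1.0, 1.0)],
         ('Moving', 'Slow'):   [('Moving', 1.0, 1.0)]}
    V = {'Fallen': 0.0, 'Standing': 0.0, 'Moving': 0.0}
    while True:
        delta = 0.0
        for s in V:
            v_new = sum(p * (r + gamma * V[sn]) for sn, p, r in T[(s, 'Slow')])
            delta = max(delta, abs(v_new - V[s]))
            V[s] = v_new
        if delta < tol:
            return V
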
def get_gridworld(step_reward=0.0,
                  N_mult=1,  # N_mult must be an integer.
                  width=9, height=6,
                  goal=(0, 8), start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(name='Sutton Ex8.4 Priority Sweep Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.4 Priority Sweep Maze (the Ex8.1 Dyna Maze, optionally scaled up by N_mult).""")

    width_big = width * N_mult
    height_big = height * N_mult

    gridworld.characteristic_dim = width_big + height_big * 2

    # get relaxed optimal length from Zhang.
    gridworld.optimal_path_len = int(14 * N_mult * 1.2) + 1

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        di = 0
        dj = 0
        reward = 0
        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        # stay on the grid
        if j_next >= width_big:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height_big:
            i_next = i
        elif i_next < 0:
            i_next = i

        # walls block movement
        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        if state_next_hash in goal_set:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    def make_big_set(pos):
        """Take an (i,j) position, pos, and expand it to the new, big size in x and y"""
        pos_set = set()
        ip, jp = pos
        ip *= N_mult
        jp *= N_mult
        for ixn in range(N_mult):
            for jxn in range(N_mult):
                pos_set.add((ip + ixn, jp + jxn))
        return pos_set

    # define default policy
    gridworld.default_policyD = {}  # index=s_hash, value=list of equiprobable actions

    # redefine start
    istart, jstart = start
    start = (istart * N_mult, jstart * N_mult)

    # make goal set
    goal_set = make_big_set(goal)

    # make wall set
    wall_set = set()
    for wall in wallL:
        wall_set.update(make_big_set(wall))

    # create state hash entries
    for i in range(height_big):
        for j in range(width_big):
            s_hash = (i, j)
            if (s_hash not in wall_set) and (s_hash not in goal_set):
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')

                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(s_hash, a_desc, a_prob=1.0)  # a_prob will be normalized
                    reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash, a_desc, sn_hash,
                                             t_prob=1.0, reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for 2D output --------------------
    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height_big):  # append keeps (0,0) at the upper left
        rowL = []
        for j in range(width_big):
            s = (i, j)
            if s in wall_set:
                rowL.append('"Wall"')
            else:
                rowL.append(s)
        s_hash_rowL.append(rowL)

    named_s_hashD = {}
    named_s_hashD[start] = 'Start'
    for g in goal_set:
        named_s_hashD[g] = 'Goal'

    gridworld.layout = GenericLayout(gridworld, s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start
    return gridworld

def get_gridworld(step_reward=-0.04):
    gridworld = EnvBaseline(name='Sutton Ex4.1 Grid World',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Example 4.1 grid.
        Label for the blank (terminal) space is "0"; both blanks are the same
        actual state (i.e. the upper left corner and the lower right corner
        are both state "0").
        """)

    for state_hash in range(1, 15):  # states are numbered 1-14
        for action_desc in ['U', 'D', 'R', 'L']:
            gridworld.add_action(state_hash, action_desc, a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash
            if a == 'U':
                sn = s - 4
            elif a == 'D':
                sn = s + 4
            elif a == 'R':
                if s not in [3, 7, 11]:
                    sn = s + 1
                else:
                    sn = s
            elif a == 'L':
                if s not in [4, 8, 12]:
                    sn = s - 1
                else:
                    sn = s

            if sn < 0:
                sn = s
            elif sn > 15:
                sn = s
            elif sn == 15:
                sn = 0

            gridworld.add_transition(state_hash, action_desc, sn,
                                     t_prob=1.0, reward_obj=-1.0)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.start_state_hash = 12

    # define default policy (if any)
    policyD = {}  # index=state_hash, value=action_desc
    for (s_hash, a_desc) in gridworld.iter_state_hash_action_desc():
        if s_hash not in policyD:
            policyD[s_hash] = []
        policyD[s_hash].append((a_desc, 0.25))

    # make policyD entries hashable for later use (i.e. tuple, not list)
    for s_hash, aL in policyD.items():
        policyD[s_hash] = tuple(aL)

    gridworld.default_policyD = policyD

    return gridworld

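# ----------------------------------------------------------------------
# With the dynamics above, iterative policy evaluation of the equiprobable
# random policy reproduces the converged values shown in Sutton & Barto
# Figure 4.1 (-14, -20, -22, ... for states 1-14).  A standalone sketch of
# that sweep, re-implementing the same next-state logic so it runs without
# EnvBaseline:
def ex4_1_random_policy_values(tol=1.0e-6):
    def next_state(s, a):
        if a == 'U':
            sn = s - 4
        elif a == 'D':
            sn = s + 4
        elif a == 'R':
            sn = s + 1 if s not in [3, 7, 11] else s
        else:  # 'L'
            sn = s - 1 if s not in [4, 8, 12] else s
        if sn < 0 or sn > 15:
            sn = s
        elif sn == 15:
            sn = 0
        return sn

    V = [0.0] * 15                      # V[0] is the terminal state, fixed at 0.0
    while True:
        delta = 0.0
        for s in range(1, 15):
            v_new = sum(0.25 * (-1.0 + V[next_state(s, a)]) for a in 'UDRL')
            delta = max(delta, abs(v_new - V[s]))
            V[s] = v_new
        if delta < tol:
            return V
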
def get_env(): env = EnvBaseline( name="Jacks Car Rental (const rtn)" ) # GenericLayout set below simplified_str ="""Shangtong Zhang's simplified model such that the # of cars returned in daytime becomes constant rather than a random value from poisson distribution, which will reduce calculation time and leave the optimal policy/value state matrix almost the same""" env.set_info( 'Example 4.2 from Sutton & Barto 2nd Edition page 81.\n' + simplified_str ) # define all possible actions. saL = [] # a list of (s1, s2, adesc) s_hash_rowL = [] # layout rows for makeing 2D output for s1 in range( MAX_CARS + 1 ): # 20 cars max rowL = [] # row of s_hash_rowL for s2 in range( MAX_CARS + 1 ): # 20 cars max s_hash = (s1, s2) rowL.append( s_hash ) for a_desc in range(-5, 6): # -5 moves 5 cars from 2nd to 1st. +5 from 1st to 2nd. if a_desc < 0: # can only move cars if they are present if (abs(a_desc) <= s2): env.add_action( s_hash, a_desc, a_prob=1.0 ) saL.append( (s1, s2, a_desc) ) else: if (a_desc <= s1): # can only move cars if they are present env.add_action( s_hash, a_desc, a_prob=1.0 ) saL.append( (s1, s2, a_desc) ) # use insert to put (0,0) at lower left s_hash_rowL.insert(0, rowL )# layout rows for makeing 2D output # ------------------------------ # figure out transition probabilities and rewards for s1 in range( MAX_CARS + 1 ): for s2 in range( MAX_CARS + 1 ): for a_desc in range( -5, 6 ): get_prob_reward( s1, s2, a_desc) # ------------------------------ print('\nStarting to define car rental transitions') # with all the probability figured out, define all transitions for (s1, s2, a_desc, sn_hash), t_prob in total_probD.items(): txr = sum_prob_x_rewardD[ (s1, s2, a_desc, sn_hash) ] rval = txr / t_prob env.add_transition( (s1,s2), a_desc, sn_hash, t_prob=t_prob, reward_obj=rval) #if s1==10 and s2==10: # print('for (10,10) a_desc=',a_desc,' sn_hash=',sn_hash,' t_prob=',t_prob,' rval=',rval) print('Calling: env.define_env_states_actions') env.define_env_states_actions() # send all states and actions to environment print('Environment Ready.') # If there is a start state, define it here. env.start_state_hash = (10,10) # define default policy (if any) env.default_policyD = {} # -------------------- # define layout for output env.layout = GenericLayout( env, s_hash_rowL=s_hash_rowL, x_axis_label='#Cars at Second Location', y_axis_label='#Cars at First Location') return env
def get_random_walk(Nside_states=9, win_reward=1.0, lose_reward=-1.0,
                    step_reward=0.0):

    Nstates = 2 * Nside_states + 1
    s = '(L%i, R%i)' % (Nside_states, Nside_states)

    env = EnvBaseline(name='%i State Random Walk MRP' % Nstates + s)  # GenericLayout set below
    env.set_info('%i State Random Walk MRP' % Nstates + s)

    RstateL = ['R+%i' % i for i in range(1, Nside_states + 1)]
    LstateL = list(reversed([s.replace('R+', 'L-') for s in RstateL]))

    actionD = {}
    for s in LstateL:
        actionD[s] = ('L', 'R')
    actionD['C'] = ('L', 'R')
    for s in RstateL:
        actionD[s] = ('L', 'R')

    rewardD = {'Win': win_reward, 'Lose': lose_reward}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash', s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, step_reward)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose'] + LstateL + ['C'] + RstateL + ['Win']
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for printing ------------------
    s_hash_rowL = [mrpL]
    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc
    policyD['C'] = ('L', 'R')
    for s in LstateL:
        policyD[s] = ('L', 'R')
    for s in RstateL:
        policyD[s] = ('L', 'R')
    env.default_policyD = policyD

    return env

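# ----------------------------------------------------------------------
# For this symmetric walk the true values have a closed form: a state k
# positions to the right of 'Lose' (k = 1..Nstates) reaches 'Win' with
# probability k/(Nstates + 1), so assuming gamma=1, step_reward=0 and the
# equiprobable policy,
#     v(k) = win_reward * k/(Nstates+1) + lose_reward * (1 - k/(Nstates+1))
# A quick check for the default 19-state walk with rewards +1/-1:
N_CHECK = 19
for k in range(1, N_CHECK + 1):
    p_win = k / (N_CHECK + 1.0)
    print(k, round(1.0 * p_win + (-1.0) * (1.0 - p_win), 2))
# -> -0.9, -0.8, ..., 0.0, ..., 0.8, 0.9
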