コード例 #1
0
ファイル: chk_gen_layout_v2.py プロジェクト: sonofeft/IntroRL
def get_env():
    """Build and return the 'Simple Six State World' environment.

    The function registers the actions available in states A, B, <C> and D,
    adds one deterministic transition per (state, action) pair, attaches a
    GenericLayout for 2D output, sets '<C>' as the start state and stores a
    default policy dictionary on the environment.

    Returns:
        The configured EnvBaseline instance.
    """
    env = EnvBaseline(name='Simple Six State World')
    env.set_info('Simple Six State World')

    # Actions that may be taken from each non-terminal state.
    moves_by_state = {
        'A': ('U', ),
        'B': ('ur', 'D'),
        '<C>': ('ur', 'dl'),
        'D': ('ur', 'ul')
    }
    for state, moves in moves_by_state.items():
        for move in moves:
            env.add_action(state, move, a_prob=1.0)

    # Reward is keyed on the state being entered; any other state yields 0.0.
    arrival_reward = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    # (state, action, next state) for every deterministic move.
    edges = (
        ('A', 'U', 'B'),
        ('B', 'D', 'A'),
        ('B', 'ur', '<C>'),
        ('<C>', 'dl', 'B'),
        ('<C>', 'ur', 'D'),
        ('D', 'ur', 'F'),
        ('D', 'ul', 'E'),
    )
    for state, move, landing in edges:
        env.add_transition(state,
                           move,
                           landing,
                           t_prob=1.0,
                           reward_obj=arrival_reward.get(landing, 0.0))

    # send all states and actions to environment
    env.define_env_states_actions()

    # Rows of state hashes used for 2D output; '*' marks cells with no state.
    layout_rows = [
        ('*', 'E', '*', 'F'),
        ('*', '*', 'D', '*'),
        ('*', '<C>', '*', '*'),
        ('B', '*', '*', '*'),
        ('A', '*', '*', '*'),
    ]
    env.layout = GenericLayout(env, s_hash_rowL=layout_rows)

    env.start_state_hash = '<C>'

    # Default policy, index=state_hash.
    # NOTE(review): the values are the integer 1 rather than an action
    # description such as 'ur', and state 'A' has no entry -- confirm that
    # this is intended.
    env.default_policyD = {'B': 1, '<C>': 1, 'D': 1}

    return env
コード例 #2
0
ファイル: random_walk_mrp.py プロジェクト: sonofeft/IntroRL
def get_random_walk():
    """Build and return the five-state 'Random Walk MRP' environment.

    States A..E sit in a row between the end states 'Lose' (left) and 'Win'
    (right). Each state offers actions 'L' and 'R', which move one position
    along the row. Entering 'Win' yields reward 1.0; every other transition
    yields 0.0. The start state is 'C' and the default policy lists both
    actions for every interior state.

    Returns:
        The configured EnvBaseline instance.
    """
    env = EnvBaseline(name='Random Walk MRP')  # GenericLayout set below
    env.set_info('Random Walk MRP')

    both_ways = ('L', 'R')
    mrpL = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
    interior = mrpL[1:-1]  # every state except the two end states

    for state in interior:
        for move in both_ways:
            env.add_action(state, move, a_prob=1.0)

    # Reward is keyed on the state being entered.
    arrival_reward = {'Win': 1.0, 'Lose': 0.0}

    # Pair each interior state with its left and right neighbours.
    for left, state, right in zip(mrpL, interior, mrpL[2:]):
        for move, landing in (('L', left), ('R', right)):
            env.add_transition(state,
                               move,
                               landing,
                               t_prob=1.0,
                               reward_obj=arrival_reward.get(landing, 0.0))

    # send all states and actions to environment
    env.define_env_states_actions()

    # The whole chain is a single layout row.
    env.layout = GenericLayout(env, s_hash_rowL=[mrpL])

    env.start_state_hash = 'C'

    # default policy: index=state_hash, value=tuple of action descriptions
    env.default_policyD = {state: ('L', 'R') for state in interior}

    return env
コード例 #3
0
def get_gridworld(step_reward=-1, height=7, goal=(3,7),
            windT=(0,0,0,1,1,1,2,2,1,0)):
    """
    Build a windy gridworld with eight-direction moves; (0,0) is lower left.

    The grid width equals len(windT). State hashes are (i, j) tuples where
    i is the row and j the column; the goal cell is represented by the
    string 'Goal' when it appears as a next state or in the layout.

    Args:
        step_reward: reward attached to every transition.
        height: number of grid rows.
        goal: (i, j) position of the goal cell.
        windT: per-column upward push added to the row index after a move.

    Returns:
        The configured EnvBaseline instance.
    """

    gridworld = EnvBaseline( name='Windy Kings Gridworld' ) # GenericLayout set below
    gridworld.set_info( """""" )

    width = len( windT )
    top_row = height - 1
    last_col = width - 1
    compass_moves = ('N','S','E','W', 'NE','SE','SW','NW')

    def clamp( value, upper ):
        """Limit value to the closed range 0..upper."""
        return max(0, min(upper, value))

    def get_action_snext( s_hash, action):
        """returns state_next_hash"""
        (row, col) = s_hash

        # Direction letters inside the action name set the row/column steps.
        row_step = 1 if 'N' in action else (-1 if 'S' in action else 0)
        col_step = 1 if 'E' in action else (-1 if 'W' in action else 0)

        # The basic move is clamped first, then the wind of the departure
        # column is added, then the result is clamped again.
        blown_row = clamp( row + row_step, top_row ) + windT[ col ]
        landing = (clamp( blown_row, top_row ), clamp( col + col_step, last_col ))

        return 'Goal' if landing == goal else landing

    # default policy: index=s_hash, value=tuple of equiprobable actions
    gridworld.default_policyD = {}

    for row in range(height):
        for col in range(width):
            cell = (row, col)
            if cell == goal:
                continue  # the goal cell gets no actions

            gridworld.default_policyD[ cell ] = compass_moves
            for move in compass_moves:
                # original comment: a_prob will be normalized
                gridworld.add_action( cell, move, a_prob=1.0 )
                gridworld.add_transition( cell, move, get_action_snext( cell, move ),
                                          t_prob=1.0, reward_obj=step_reward)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    # Layout rows for 2D output, highest row index first so that (0,0) ends
    # up in the last row (lower left).
    s_hash_rowL = [
        [ 'Goal' if (row, col) == goal else (row, col) for col in range(width) ]
        for row in reversed(range(height))
    ]

    gridworld.layout = GenericLayout( gridworld, s_hash_rowL=s_hash_rowL,
                                      col_tickL=windT,
                                      x_axis_label='Upward Wind Speed'  )

    gridworld.start_state_hash =  (3,0)

    return gridworld
コード例 #4
0
def get_gridworld(step_reward=0.0):
    """Build and return the 'Simple Grid World' environment.

    States are (row, col) tuples. Each listed state has a fixed set of
    moves ('U', 'D', 'L', 'R'); every move is deterministic. Entering
    (0, 3) yields reward 1, entering (1, 3) yields -1, and any other
    transition yields step_reward.

    Args:
        step_reward: reward for transitions that do not enter (0, 3) or
            (1, 3).

    Returns:
        The configured EnvBaseline instance.
    """
    gridworld = EnvBaseline(
        name='Simple Grid World')  # GenericLayout set below
    gridworld.set_info('Simple Grid World Example.')

    # Moves available from each state.
    moves_by_cell = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U')
    }

    # (row, col) offset produced by each move; 'U' decreases the row index.
    offsets = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}

    # Rewards keyed on the state being entered.
    arrival_reward = {(0, 3): 1, (1, 3): -1}

    for cell, moves in moves_by_cell.items():
        row, col = cell
        for move in moves:
            # original comment: a_prob will be normalized
            gridworld.add_action(cell, move, a_prob=1.0)

            d_row, d_col = offsets[move]
            landing = (row + d_row, col + d_col)

            gridworld.add_transition(cell,
                                     move,
                                     landing,
                                     t_prob=1.0,
                                     reward_obj=arrival_reward.get(
                                         landing, step_reward))

    # send all states and actions to environment
    gridworld.define_env_states_actions()

    # uses default "get_layout_row_col_of_state"
    gridworld.layout = GenericLayout(gridworld)

    # Start state, plus the restricted list of allowed start states.
    gridworld.start_state_hash = (2, 0)
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # Default policy: index=state_hash, value=action_desc.
    # Trailing comments carry the Vpi values quoted in the original for
    # gamma=0.9.
    gridworld.default_policyD = {
        (0, 0): 'R',  # Vpi=0.81
        (1, 0): 'U',  # Vpi=0.729
        (0, 1): 'R',  # Vpi=0.9
        (0, 2): 'R',  # Vpi=1.0
        (1, 2): 'U',  # Vpi=0.9
        (2, 0): 'U',  # Vpi=0.6561
        (2, 2): 'U',  # Vpi=0.81
        (2, 1): 'R',  # Vpi=0.729
        (2, 3): 'L',  # Vpi=0.729
    }

    return gridworld
コード例 #5
0
def get_gridworld(step_reward=0.0,
                  width=9,
                  height=6,
                  goal=(0, 8),
                  start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4,
                                                                          5))):
    """Build and return the 'Sutton Ex8.1 Dyna Maze' environment.

    States are (i, j) tuples with i the row and j the column; (0, 0) is the
    upper-left cell of the layout. Moves 'U', 'D', 'R', 'L' are
    deterministic; a move that would leave the grid or enter a wall leaves
    the agent where it is.

    Args:
        step_reward: reward for every transition that does not land on
            goal. (Earlier versions accepted this argument but always used
            0.0; the default keeps that behavior.)
        width: number of grid columns.
        height: number of grid rows.
        goal: (i, j) of the goal cell; landing on it yields reward 1.0.
        start: (i, j) of the start state.
        wallL: iterable of (i, j) wall cells.

    Returns:
        The configured EnvBaseline instance.
    """
    gridworld = EnvBaseline(
        name='Sutton Ex8.1 Dyna Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    # Set membership is O(1); tuple() lets callers pass walls as lists too.
    wall_set = {tuple(wall) for wall in wallL}

    # (row, col) offset produced by each move; unknown actions do not move.
    offsets = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}
    moves = ('U', 'D', 'R', 'L')

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        (i, j) = s_hash
        di, dj = offsets.get(action, (0, 0))
        i_next = i + di
        j_next = j + dj

        # A move off the grid edge is cancelled along that axis.
        if not 0 <= j_next < width:
            j_next = j
        if not 0 <= i_next < height:
            i_next = i

        # A move into a wall is cancelled entirely.
        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        reward = 1.0 if state_next_hash == goal else step_reward
        return reward, state_next_hash

    # default policy: index=s_hash, value=tuple of equiprobable actions
    gridworld.default_policyD = {}

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            # Walls can never be entered, so they are not states; this
            # matches the layout below, which draws them as '"Wall"'.
            if s_hash == goal or s_hash in wall_set:
                continue

            gridworld.default_policyD[s_hash] = moves
            for a_desc in moves:
                # original comment: a_prob will be normalized
                gridworld.add_action(s_hash, a_desc, a_prob=1.0)

                reward_val, sn_hash = get_action_snext_reward(
                    s_hash, a_desc)
                # add each event to transitions object
                gridworld.add_transition(s_hash,
                                         a_desc,
                                         sn_hash,
                                         t_prob=1.0,
                                         reward_obj=reward_val)

    # send all states and actions to environment
    gridworld.define_env_states_actions()

    # --------------------

    # Layout rows for 2D output. Rows are appended in increasing i, so
    # (0,0) is at the upper left.
    s_hash_rowL = [[
        '"Wall"' if (i, j) in wall_set else (i, j) for j in range(width)
    ] for i in range(height)]

    named_s_hashD = {start: 'Start', goal: 'Goal'}
    gridworld.layout = GenericLayout(gridworld,
                                     s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
コード例 #6
0
def get_gridworld(
    step_reward=0.0,
    N_mult=1,  # N_mult must be an integer.
    width=9,
    height=6,
    goal=(0, 8),
    start=(2, 0),
    wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):
    """Build and return the 'Sutton Ex8.4 Priority Sweep Maze' environment.

    The base maze (width x height) is scaled by N_mult in both directions:
    each base cell, including the goal and every wall, becomes an
    N_mult x N_mult patch of cells. States are (i, j) tuples with i the row
    and j the column; (0, 0) is the upper-left cell of the layout. Moves
    'U', 'D', 'R', 'L' are deterministic; a move that would leave the grid
    or enter a wall leaves the agent where it is.

    Args:
        step_reward: reward for every transition that does not land on a
            goal cell. (Earlier versions accepted this argument but always
            used 0.0; the default keeps that behavior.)
        N_mult: integer scale factor applied to the base maze.
        width: number of columns in the base maze.
        height: number of rows in the base maze.
        goal: (i, j) of the goal cell in the base maze.
        start: (i, j) of the start cell in the base maze.
        wallL: iterable of (i, j) wall cells in the base maze.

    Returns:
        The configured EnvBaseline instance, which also carries the
        attributes characteristic_dim and optimal_path_len.
    """
    gridworld = EnvBaseline(
        name='Sutton Ex8.4 Priority Sweep Maze')  # GenericLayout set below
    # NOTE(review): the info text says Ex8.1 while the name says Ex8.4 --
    # looks copy-pasted; left unchanged pending confirmation.
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    width_big = width * N_mult
    height_big = height * N_mult

    gridworld.characteristic_dim = width_big + height_big * 2
    # get relaxed optimal length from Zhang.
    gridworld.optimal_path_len = int(14 * N_mult * 1.2) + 1

    def make_big_set(pos):
        """Take an (i,j) position, pos, and expand to new, big size in x and y"""
        ip, jp = pos
        ip *= N_mult
        jp *= N_mult
        return {(ip + ixn, jp + jxn)
                for ixn in range(N_mult)
                for jxn in range(N_mult)}

    # redefine start in scaled coordinates (upper-left cell of its patch)
    istart, jstart = start
    start = (istart * N_mult, jstart * N_mult)

    # scaled goal and wall cells
    goal_set = make_big_set(goal)
    wall_set = set()
    for wall in wallL:
        wall_set.update(make_big_set(wall))

    # (row, col) offset produced by each move; unknown actions do not move.
    offsets = {'U': (-1, 0), 'D': (1, 0), 'R': (0, 1), 'L': (0, -1)}
    moves = ('U', 'D', 'R', 'L')

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""
        (i, j) = s_hash
        di, dj = offsets.get(action, (0, 0))
        i_next = i + di
        j_next = j + dj

        # A move off the grid edge is cancelled along that axis.
        if not 0 <= j_next < width_big:
            j_next = j
        if not 0 <= i_next < height_big:
            i_next = i

        # A move into a wall is cancelled entirely.
        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)
        reward = 1.0 if state_next_hash in goal_set else step_reward
        return reward, state_next_hash

    # default policy: index=s_hash, value=tuple of equiprobable actions
    gridworld.default_policyD = {}

    # create state hash entries for every cell that is neither wall nor goal
    for i in range(height_big):
        for j in range(width_big):
            s_hash = (i, j)
            if s_hash in wall_set or s_hash in goal_set:
                continue

            gridworld.default_policyD[s_hash] = moves
            for a_desc in moves:
                # original comment: a_prob will be normalized
                gridworld.add_action(s_hash, a_desc, a_prob=1.0)

                reward_val, sn_hash = get_action_snext_reward(
                    s_hash, a_desc)
                # add each event to transitions object
                gridworld.add_transition(s_hash,
                                         a_desc,
                                         sn_hash,
                                         t_prob=1.0,
                                         reward_obj=reward_val)

    # send all states and actions to environment
    gridworld.define_env_states_actions()

    # --------------------

    # Layout rows for 2D output. Rows are appended in increasing i, so
    # (0,0) is at the upper left.
    s_hash_rowL = [[
        '"Wall"' if (i, j) in wall_set else (i, j)
        for j in range(width_big)
    ] for i in range(height_big)]

    named_s_hashD = {}
    named_s_hashD[start] = 'Start'
    for g in goal_set:
        named_s_hashD[g] = 'Goal'

    gridworld.layout = GenericLayout(gridworld,
                                     s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
コード例 #7
0
def get_env():
    """Build and return the "Jacks Car Rental (const rtn)" environment.

    States are (s1, s2) tuples with each count in 0..MAX_CARS. Actions are
    the integers -5..5: a negative action moves cars from the second
    location to the first, a positive action moves cars from the first to
    the second, and an action is only registered when the source location
    holds enough cars.

    Transition probabilities and rewards are read from the module-level
    dictionaries total_probD and sum_prob_x_rewardD after get_prob_reward
    has been called for every (s1, s2, action) combination.
    NOTE(review): get_prob_reward is presumed to fill those dictionaries;
    its definition is not visible here -- confirm.

    Returns:
        The configured EnvBaseline instance.
    """
    env = EnvBaseline( name="Jacks Car Rental (const rtn)" ) # GenericLayout set below

    simplified_str ="""Shangtong Zhang's simplified model such that
the # of cars returned in daytime becomes constant
rather than a random value from poisson distribution, which will reduce calculation time
and leave the optimal policy/value state matrix almost the same"""

    env.set_info( 'Example 4.2 from Sutton & Barto 2nd Edition page 81.\n' + simplified_str )

    # define all possible actions.
    s_hash_rowL = [] # layout rows for making 2D output

    for s1 in range( MAX_CARS + 1 ):
        rowL = [] # row of s_hash_rowL

        for s2 in range( MAX_CARS + 1 ):
            s_hash = (s1, s2)
            rowL.append( s_hash )

            for a_desc in range(-5, 6): # -5 moves 5 cars from 2nd to 1st. +5 from 1st to 2nd.
                # can only move cars if they are present at the source location
                cars_at_source = s2 if a_desc < 0 else s1
                if abs(a_desc) <= cars_at_source:
                    env.add_action( s_hash, a_desc, a_prob=1.0 )

        # use insert to put (0,0) at lower left
        s_hash_rowL.insert(0, rowL )

    # ------------------------------
    # figure out transition probabilities and rewards
    for s1 in range( MAX_CARS + 1 ):
        for s2 in range( MAX_CARS + 1 ):
            for a_desc in range( -5, 6 ):
                get_prob_reward( s1, s2, a_desc)

    # ------------------------------

    print('\nStarting to define car rental transitions')
    # with all the probability figured out, define all transitions
    for (s1, s2, a_desc, sn_hash), t_prob in total_probD.items():
        txr = sum_prob_x_rewardD[ (s1, s2, a_desc, sn_hash) ]
        # sum(prob * reward) divided by total prob gives the reward value
        # stored for this transition
        rval = txr / t_prob
        env.add_transition( (s1,s2), a_desc, sn_hash, t_prob=t_prob, reward_obj=rval)

    print('Calling: env.define_env_states_actions')
    env.define_env_states_actions()  # send all states and actions to environment
    print('Environment Ready.')

    # If there is a start state, define it here.
    env.start_state_hash = (10,10)

    # define default policy (if any)
    env.default_policyD = {}

    # --------------------
    # define layout for output

    env.layout = GenericLayout( env, s_hash_rowL=s_hash_rowL,
                                x_axis_label='#Cars at Second Location',
                                y_axis_label='#Cars at First Location')

    return env
コード例 #8
0
def get_random_walk(Nside_states=9,
                    win_reward=1.0,
                    lose_reward=-1.0,
                    step_reward=0.0):
    """Build and return a random-walk MRP with 2*Nside_states + 1 states.

    The chain runs 'Lose', 'L-N' .. 'L-1', 'C', 'R+1' .. 'R+N', 'Win'.
    Every state between the two ends offers actions 'L' and 'R', which
    move one position along the chain. The start state is 'C' and the
    default policy lists both actions for every interior state.

    Args:
        Nside_states: number of states on each side of the centre 'C'.
        win_reward: reward for entering 'Win'.
        lose_reward: reward for entering 'Lose'.
        step_reward: reward for every other transition.

    Returns:
        The configured EnvBaseline instance.
    """
    Nstates = 2 * Nside_states + 1

    side_tag = '(L%i, R%i)' % (Nside_states, Nside_states)
    title = '%i State Random Walk MRP' % Nstates + side_tag
    env = EnvBaseline(name=title)  # GenericLayout set below
    env.set_info(title)

    both_ways = ('L', 'R')
    RstateL = ['R+%i' % k for k in range(1, Nside_states + 1)]
    # Left states mirror the right ones, listed outermost first.
    LstateL = ['L-%i' % k for k in range(Nside_states, 0, -1)]

    for state in LstateL + ['C'] + RstateL:
        for move in both_ways:
            env.add_action(state, move, a_prob=1.0)

    # Rewards keyed on the state being entered.
    arrival_reward = {'Win': win_reward, 'Lose': lose_reward}

    mrpL = ['Lose'] + LstateL + ['C'] + RstateL + ['Win']

    # Pair each interior state with its left and right neighbours.
    for left, state, right in zip(mrpL, mrpL[1:-1], mrpL[2:]):
        for move, landing in (('L', left), ('R', right)):
            env.add_transition(state,
                               move,
                               landing,
                               t_prob=1.0,
                               reward_obj=arrival_reward.get(
                                   landing, step_reward))

    # send all states and actions to environment
    env.define_env_states_actions()

    # -------------------- make layout for printing ------------------

    # The whole chain is a single layout row.
    env.layout = GenericLayout(env, s_hash_rowL=[mrpL])

    env.start_state_hash = 'C'

    # default policy: index=state_hash, value=tuple of action descriptions
    env.default_policyD = {
        state: ('L', 'R')
        for state in ['C'] + LstateL + RstateL
    }

    return env