Example 1
def get_gridworld():
    gridworld = EnvBaseline(name='Sample Grid World',
                            s_hash_rowL=s_hash_rowL,
                            row_tickL=row_tickL,
                            x_axis_label=x_axis_label,
                            col_tickL=col_tickL,
                            y_axis_label=y_axis_label,
                            colorD={
                                'Goal': 'g',
                                'Pit': 'r',
                                'Start': 'b'
                            },
                            basic_color='skyblue')

    gridworld.set_info('Sample Grid World showing basic MDP creation.')

    # add actions from each state
    #   (note: a_prob will be normalized within add_action_dict)
    gridworld.add_action_dict(actionD)

    # for each action, define the next state and transition probability
    # (here we use the layout definition to aid the logic)
    for s_hash, aL in actionD.items():
        for a_desc in aL:
            sn_hash = get_next_state(s_hash, a_desc)
            reward = rewardD.get(sn_hash, 0.0)

            # for deterministic MDP, use t_prob=1.0
            gridworld.add_transition(s_hash,
                                     a_desc,
                                     sn_hash,
                                     t_prob=1.0,
                                     reward_obj=reward)

    # after the "add" commands, send all states and actions to environment
    # (any required normalization is done here as well.)
    gridworld.define_env_states_actions()

    # If there is a start state, define it here.
    gridworld.start_state_hash = 'Start'

    # If a limited number of start states are desired, define them here.
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # if a default policy is desired, define it as a dict.
    gridworld.default_policyD = {
        (0, 0): 'R',
        (1, 0): 'U',
        (0, 1): 'R',
        (0, 2): 'R',
        (1, 2): 'U',
        'Start': 'U',
        (2, 2): 'U',
        (2, 1): 'R',
        (2, 3): 'L'
    }

    return gridworld
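
Example 1 refers to module-level objects (s_hash_rowL, row_tickL, col_tickL, the axis labels, actionD, rewardD and get_next_state) that are not shown above. Purely as a hedged illustration, they could look like the sketch below, which reuses the 3x4 layout of Example 7 with (2,0), (0,3) and (1,3) renamed 'Start', 'Goal' and 'Pit'; every name and value here is an assumption, not part of the original source.

# Hypothetical module-level data assumed by Example 1 (illustration only).
s_hash_rowL = [[(0, 0), (0, 1), (0, 2), 'Goal'],
               [(1, 0), '*',    (1, 2), 'Pit'],
               ['Start', (2, 1), (2, 2), (2, 3)]]
row_tickL = [0, 1, 2]
col_tickL = [0, 1, 2, 3]
x_axis_label = 'Column'
y_axis_label = 'Row'

actionD = {(0, 0): ('D', 'R'), (0, 1): ('L', 'R'), (0, 2): ('L', 'D', 'R'),
           (1, 0): ('U', 'D'), (1, 2): ('U', 'D', 'R'),
           'Start': ('U', 'R'), (2, 1): ('L', 'R'),
           (2, 2): ('L', 'R', 'U'), (2, 3): ('L', 'U')}

rewardD = {'Goal': 1.0, 'Pit': -1.0}

def get_next_state(s_hash, a_desc):
    """Apply one move on the 3x4 grid, translating named states to/from (row, col)."""
    nameD = {'Start': (2, 0), 'Goal': (0, 3), 'Pit': (1, 3)}
    i, j = nameD.get(s_hash, s_hash)
    if a_desc == 'U':
        i -= 1
    elif a_desc == 'D':
        i += 1
    elif a_desc == 'R':
        j += 1
    elif a_desc == 'L':
        j -= 1
    posD = {pos: name for name, pos in nameD.items()}  # reverse lookup of named cells
    return posD.get((i, j), (i, j))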
Example 2
def get_env():

    env = EnvBaseline(name='Simple Six State World')
    env.set_info('Simple Six State World')

    actionD = {
        'A': ('U', ),
        'B': ('ur', 'D'),
        '<C>': ('ur', 'dl'),
        'D': ('ur', 'ul')
    }

    rewardD = {'A': -1.0, 'E': 0.5, 'F': 1.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    add_event('A', 'U', 'B')
    #add_event( 'A', 'Te', 'E' )
    add_event('B', 'D', 'A')
    add_event('B', 'ur', '<C>')
    add_event('<C>', 'dl', 'B')
    add_event('<C>', 'ur', 'D')
    add_event('D', 'ur', 'F')
    add_event('D', 'ul', 'E')

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    s_hash_rowL = [('*', 'E', '*', 'F'), ('*', '*', 'D', '*'),
                   ('*', '<C>', '*', '*'), ('B', '*', '*', '*'),
                   ('A', '*', '*', '*')]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = '<C>'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc

    policyD['B'] = 1
    policyD['<C>'] = 1
    policyD['D'] = 1

    env.default_policyD = policyD

    return env
Example 3
def get_random_walk():

    env = EnvBaseline(name='Random Walk MRP')  # GenericLayout set below
    env.set_info('Random Walk MRP')

    actionD = {
        'A': ('L', 'R'),
        'B': ('L', 'R'),
        'C': ('L', 'R'),
        'D': ('L', 'R'),
        'E': ('L', 'R')
    }

    rewardD = {'Win': 1.0, 'Lose': 0.0}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash',s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, 0.0)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose', 'A', 'B', 'C', 'D', 'E', 'Win']
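    # enumerate() over mrpL[1:-1] starts i at 0 for state 'A', so mrpL[i] is
    # the left neighbor and mrpL[i + 2] the right neighbor of state ci.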
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    s_hash_rowL = [mrpL]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc

    policyD['A'] = ('L', 'R')
    policyD['B'] = ('L', 'R')
    policyD['C'] = ('L', 'R')
    policyD['D'] = ('L', 'R')
    policyD['E'] = ('L', 'R')

    env.default_policyD = policyD

    return env
Example 4
def get_gambler(prob_heads=0.4):

    gambler = EnvBaseline( name='Gamblers Coin Flip Problem',
                           s_hash_rowL=s_hash_rowL,
                           colorD={100:'g', 0:'r'},
                           basic_color='skyblue' )
    gambler.set_info( 'Example 4.3 from Sutton & Barto 2nd Edition page 84.' )

    for s in range(1, 100): # 1 to 99
        s_max = min(s, 100-s)
        for a_desc in range(1, s_max + 1):
            gambler.add_action( s, a_desc, a_prob=1.0 )

    # define reward for all states
    def get_reward( sn ):
        if sn==100:
            return 1.0
        else:
            return 0.0

    # define all possible transitions.
    for s in range(1, 100): # 1 to 99
        s_max = min(s, 100-s)
        for a_desc in range(1, s_max + 1):
            sn_hash = s - a_desc
            rval = get_reward( sn_hash )
            gambler.add_transition( s, a_desc, sn_hash, t_prob=1.0-prob_heads, reward_obj=rval)

            sn_hash = s + a_desc
            rval = get_reward( sn_hash )
            gambler.add_transition( s, a_desc, sn_hash, t_prob=prob_heads, reward_obj=rval)
            
    gambler.define_env_states_actions()  # send all states and actions to environment

    # If there is a start state, define it here.
    gambler.start_state_hash = 50

    # define default policy (if any)
    gambler.default_policyD = {}
    
    return gambler
Example 5
def get_gridworld(step_reward=-0.04):

    gridworld = EnvBaseline(name='Sutton Ex3.5 5x5 Grid World',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
           Sutton 5x5 Gridworld
        Book Answer from page 65 (linear eqn solve) for gamma=0.9
         22.0     24.4      22.0      19.4      17.5
         19.8     22.0      19.8      17.8      16.0
         17.8     19.8      17.8      16.0      14.4
         16.0     17.8      16.0      14.4      13.0
         14.4     16.0      14.4      13.0      11.7
    =================================================    """)

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""

        di = 0
        dj = 0
        reward = 0

        if action == 'N':
            di = 1
        elif action == 'S':
            di = -1
        elif action == 'E':
            dj = 1
        elif action == 'W':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

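        # special jump states: every action from (4,1) goes to (0,1) with reward +10,
        # and from (4,3) to (2,3) with reward +5 (Sutton's A/A' and B/B' states).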
        if (i == 4) and (j == 1):
            i_next = 0
            j_next = 1
            reward = 10
        elif (i == 4) and (j == 3):
            i_next = 2
            j_next = 3
            reward = 5
        elif (i_next < 0) or (i_next > 4) or (j_next < 0) or (j_next > 4):
            i_next = i
            j_next = j
            reward = -1

        state_next_hash = (i_next, j_next)
        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {
    }  #index=s_hash, value=list of equiprobable actions

    for i in range(5):
        for j in range(5):
            s_hash = (i, j)

            gridworld.default_policyD[s_hash] = ('N', 'S', 'E', 'W')
            for a_desc in ['N', 'S', 'E', 'W']:
                gridworld.add_action(s_hash, a_desc,
                                     a_prob=1.0)  # a_prob will be normalized

                reward_val, sn_hash = get_action_snext_reward(s_hash, a_desc)
                # add each event to transitions object
                gridworld.add_transition(s_hash,
                                         a_desc,
                                         sn_hash,
                                         t_prob=1.0,
                                         reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.start_state_hash = (0, 0)

    return gridworld
Example 6
def get_gridworld(step_reward=-1, height=7, goal=(3,7),
            windT=(0,0,0,1,1,1,2,2,1,0)):
    """
    Windy Gridworld with (0,0) at lower left
    width is defined by length of windT tuple.
    """

    gridworld = EnvBaseline( name='Windy Kings Gridworld' ) # GenericLayout set below
    gridworld.set_info( """Windy Gridworld with King's (8-direction) moves; (0,0) at lower left; width set by the length of windT.""" )

    width = len( windT )

    def get_action_snext( s_hash, action):
        """returns state_next_hash"""

        di = 0
        dj = 0

        if 'N' in action:
            di = 1
        elif 'S' in action:
            di = -1
            
        if 'E' in action:
            dj = 1
        elif 'W' in action:
            dj = -1

        (i,j) = s_hash
        wind_di = windT[ j ]

        i_next = i + di
        # constrain basic move to be inside the grid
        i_next = max(0, min(height-1, i_next))

        i_next += wind_di # add wind to constrained move.
        j_next = j + dj

        # constrain next position to be inside the grid
        i_next = max(0, min(height-1, i_next))
        j_next = max(0, min(width-1, j_next))

        state_next_hash = (i_next, j_next)
        if state_next_hash == goal:
            state_next_hash = 'Goal'
        return state_next_hash


    # define default policy
    gridworld.default_policyD = {} #index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i,j)
            if s_hash == goal:
                pass  # the goal position is the terminal state 'Goal'; no actions added
            else:
                gridworld.default_policyD[ s_hash ] = ('N','S','E','W', 'NE','SE','SW','NW')
                for a_desc in ['N','S','E','W', 'NE','SE','SW','NW']:
                    gridworld.add_action( s_hash, a_desc, a_prob=1.0 ) # a_prob will be normalized

                    sn_hash = get_action_snext( s_hash, a_desc )
                    # add each event to transitions object
                    gridworld.add_transition( s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=step_reward)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    s_hash_rowL = [] # layout rows for making 2D output
    for i in range(height): # put (0,0) at upper left
        rowL = []
        for j in range(width):
            s_hash = (i,j)
            if s_hash == goal:
                s_hash = 'Goal'

            rowL.append( s_hash )

        # use insert to put (0,0) at lower left, append for upper left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output

    gridworld.layout = GenericLayout( gridworld, s_hash_rowL=s_hash_rowL,
                                      col_tickL=windT,
                                      x_axis_label='Upward Wind Speed'  )


    gridworld.start_state_hash =  (3,0)

    return gridworld
Example 7
def get_gridworld(step_reward=0.0):
    gridworld = EnvBaseline(
        name='Simple Grid World')  # GenericLayout set below
    gridworld.set_info('Simple Grid World Example.')

    actionD = {
        (0, 0): ('D', 'R'),
        (0, 1): ('L', 'R'),
        (0, 2): ('L', 'D', 'R'),
        (1, 0): ('U', 'D'),
        (1, 2): ('U', 'D', 'R'),
        (2, 0): ('U', 'R'),
        (2, 1): ('L', 'R'),
        (2, 2): ('L', 'R', 'U'),
        (2, 3): ('L', 'U')
    }

    rewardD = {(0, 3): 1, (1, 3): -1}

    for state_hash, actionL in actionD.items():

        for action_desc in actionL:
            gridworld.add_action(state_hash, action_desc,
                                 a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash

            if a == 'U':
                state_next_hash = (s[0] - 1, s[1])
            elif a == 'D':
                state_next_hash = (s[0] + 1, s[1])
            elif a == 'R':
                state_next_hash = (s[0], s[1] + 1)
            elif a == 'L':
                state_next_hash = (s[0], s[1] - 1)

            reward_val = rewardD.get(state_next_hash, step_reward)

            gridworld.add_transition(state_hash,
                                     action_desc,
                                     state_next_hash,
                                     t_prob=1.0,
                                     reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.layout = GenericLayout(gridworld)  # uses default "get_layout_row_col_of_state"

    # If there is a start state, define it here.
    gridworld.start_state_hash = (2, 0)
    gridworld.define_limited_start_state_list([(2, 0), (2, 2)])

    # define default policy (if any)
    # Policy Dictionary for: GridWorld

    policyD = {}  # index=state_hash, value=action_desc

    #                 Vpi shown for gamma=0.9
    policyD[(0, 0)] = 'R'  # Vpi=0.81
    policyD[(1, 0)] = 'U'  # Vpi=0.729
    policyD[(0, 1)] = 'R'  # Vpi=0.9
    policyD[(0, 2)] = 'R'  # Vpi=1.0
    policyD[(1, 2)] = 'U'  # Vpi=0.9
    policyD[(2, 0)] = 'U'  # Vpi=0.6561
    policyD[(2, 2)] = 'U'  # Vpi=0.81
    policyD[(2, 1)] = 'R'  # Vpi=0.729
    policyD[(2, 3)] = 'L'  # Vpi=0.729

    gridworld.default_policyD = policyD

    return gridworld
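Example 8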
def get_robot(step_reward=-0.04):

    gridworld = EnvBaseline(name='Slippery Cleaning Robot',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Example taken from "Dissecting Reinforcement Learning-Part 1" 
        Dec 9, 2016   Massimiliano Patacchiola
        https://mpatacchiola.github.io/blog/2016/12/09/dissecting-reinforcement-learning.html
        """)

    def get_right_angle_list(a):

        if a == 'U':
            raL = ['L', 'R']
        elif a == 'D':
            raL = ['L', 'R']
        elif a == 'R':
            raL = ['U', 'D']
        elif a == 'L':
            raL = ['U', 'D']

        return raL

    def get_move_s_next(a, s):

        sn = s
        if a == 'U':
            sn = (s[0] + 1, s[1])
        elif a == 'D':
            sn = (s[0] - 1, s[1])
        elif a == 'R':
            sn = (s[0], s[1] + 1)
        elif a == 'L':
            sn = (s[0], s[1] - 1)

        if sn == (2, 2):  # can't move into block in the middle.
            sn = s

        # limit moves to inside the edges.
        sn_hash = (clamp(sn[0], 1, 3), clamp(sn[1], 1, 4))

        return sn_hash

    non_termL = [(3, 1), (3, 2), (3, 3), (2, 1), (2, 3), (1, 1), (1, 2),
                 (1, 3), (1, 4)]

    rewardD = {(3, 4): 1, (2, 4): -1}

    # each action: 80% chance of the intended move, 10% for each right-angle move
    for s_hash in non_termL:
        for a_desc in ['U', 'D', 'L', 'R']:  # normal move
            gridworld.add_action(s_hash, a_desc, a_prob=0.25)

            # 80%
            sn_hash = get_move_s_next(a_desc, s_hash)
            reward_val = rewardD.get(sn_hash, step_reward)

            gridworld.add_transition(s_hash,
                                     a_desc,
                                     sn_hash,
                                     t_prob=0.8,
                                     reward_obj=reward_val)

            # both 10%
            right_angL = get_right_angle_list(a_desc)
            for ar_desc in right_angL:
                sn_hash = get_move_s_next(ar_desc, s_hash)
                reward_val = rewardD.get(sn_hash, step_reward)

                gridworld.add_transition(s_hash,
                                         a_desc,
                                         sn_hash,
                                         t_prob=0.1,
                                         reward_obj=reward_val)
    gridworld.define_env_states_actions()

    # If there is a start state, define it here.
    gridworld.start_state_hash = (1, 1)

    # define default policy (if any)
    policyD = {}  # index=s_hash, value=a_desc

    policyD[(3, 1)] = 'R'
    policyD[(3, 3)] = 'R'
    policyD[(3, 2)] = 'R'

    policyD[(2, 1)] = 'U'
    policyD[(2, 3)] = 'U'

    policyD[(1, 1)] = 'U'
    policyD[(1, 2)] = 'L'
    policyD[(1, 3)] = 'L'
    policyD[(1, 4)] = 'L'

    gridworld.default_policyD = policyD

    return gridworld
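
get_move_s_next above calls a clamp helper that is not defined in this listing; it is assumed to be a simple range limiter along the lines below (name and signature inferred only from the call site, not from the original source).

def clamp(value, low, high):
    """Assumed helper: limit value to the inclusive range [low, high]."""
    return max(low, min(high, value))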
Example 9
def get_gridworld(step_reward=0.0,
                  width=9,
                  height=6,
                  goal=(0, 8),
                  start=(2, 0),
                  wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7),
                         (4, 5))):

    gridworld = EnvBaseline(
        name='Sutton Ex8.1 Dyna Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""

        di = 0
        dj = 0
        reward = 0

        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wallL:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)

        if state_next_hash == goal:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    # define default policy
    gridworld.default_policyD = {
    }  #index=s_hash, value=list of equiprobable actions

    for i in range(height):
        for j in range(width):
            s_hash = (i, j)
            if s_hash != goal:
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')
                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(
                        s_hash, a_desc,
                        a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(
                        s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash,
                                             a_desc,
                                             sn_hash,
                                             t_prob=1.0,
                                             reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height):  # put (0,0) at upper left
        rowL = []
        for j in range(width):
            s = (i, j)
            if s in wallL:
                rowL.append('"Wall"')
            else:
                rowL.append(s)

        # append keeps row i=0 first, so (0,0) ends up at the upper left
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {start: 'Start', goal: 'Goal'}
    gridworld.layout = GenericLayout(gridworld,
                                     s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
Example 10
def get_robot():

    robot = EnvBaseline(name='Slow-Fast Fallen Robot', s_hash_rowL=s_hash_rowL)
    robot.set_info("""
        Sample 3 State Fallen, Standing, Moving Robot.
        https://sandipanweb.wordpress.com/2017/03/23/some-reinforcement-learning-using-policy-value-iteration-and-q-learning-for-a-markov-decision-process-in-python-and-r/
        Some Reinforcement Learning: Using Policy & Value Iteration and Q-learning for a Markov Decision Process in Python and R
        """)

    robot.add_action('Fallen', 'Slow', a_prob=1.0)
    robot.add_action('Standing', 'Slow', a_prob=1.0)
    robot.add_action('Moving', 'Slow', a_prob=1.0)

    robot.add_action('Standing', 'Fast', a_prob=1.0)
    robot.add_action('Moving', 'Fast', a_prob=1.0)

    robot.add_transition('Fallen',
                         'Slow',
                         'Fallen',
                         t_prob=0.6,
                         reward_obj=-1.0)
    robot.add_transition('Fallen',
                         'Slow',
                         'Standing',
                         t_prob=0.4,
                         reward_obj=1.0)

    robot.add_transition('Standing',
                         'Slow',
                         'Moving',
                         t_prob=1.0,
                         reward_obj=1.0)
    robot.add_transition('Moving',
                         'Slow',
                         'Moving',
                         t_prob=1.0,
                         reward_obj=1.0)

    robot.add_transition('Standing',
                         'Fast',
                         'Moving',
                         t_prob=0.6,
                         reward_obj=2.0)
    robot.add_transition('Standing',
                         'Fast',
                         'Fallen',
                         t_prob=0.4,
                         reward_obj=-1.0)

    robot.add_transition('Moving',
                         'Fast',
                         'Moving',
                         t_prob=0.8,
                         reward_obj=2.0)
    robot.add_transition('Moving',
                         'Fast',
                         'Fallen',
                         t_prob=0.2,
                         reward_obj=-1.0)

    robot.define_env_states_actions()  # send all states and actions to environment

    robot.start_state_hash = 'Standing'

    # define default policy (if any)
    policyD = {}  # index=state_hash, value=action_desc

    policyD['Standing'] = 'Slow'
    policyD['Fallen'] = 'Slow'
    policyD['Moving'] = 'Slow'
    robot.default_policyD = policyD

    return robot
Example 11
def get_gridworld(
    step_reward=0.0,
    N_mult=1,  # N_mult must be an integer.
    width=9,
    height=6,
    goal=(0, 8),
    start=(2, 0),
    wallL=((1, 2), (2, 2), (3, 2), (0, 7), (1, 7), (2, 7), (4, 5))):

    gridworld = EnvBaseline(
        name='Sutton Ex8.4 Priority Sweep Maze')  # GenericLayout set below
    gridworld.set_info("""Sutton Ex8.1 Dyna Maze""")

    width_big = width * N_mult
    height_big = height * N_mult

    gridworld.characteristic_dim = width_big + height_big * 2
    # get relaxed optimal length from Zhang.
    gridworld.optimal_path_len = int(14 * N_mult * 1.2) + 1

    def get_action_snext_reward(s_hash, action):
        """returns reward and state_next_hash"""

        di = 0
        dj = 0
        reward = 0

        if action == 'U':
            di = -1
        elif action == 'D':
            di = 1
        elif action == 'R':
            dj = 1
        elif action == 'L':
            dj = -1

        (i, j) = s_hash
        i_next = i + di
        j_next = j + dj

        if j_next >= width_big:
            j_next = j
        elif j_next < 0:
            j_next = j

        if i_next >= height_big:
            i_next = i
        elif i_next < 0:
            i_next = i

        if (i_next, j_next) in wall_set:
            i_next, j_next = i, j

        state_next_hash = (i_next, j_next)

        if state_next_hash in goal_set:
            reward = 1.0
        else:
            reward = 0.0

        return reward, state_next_hash

    def make_big_set(pos):
        """Take an (i,j) position, pos, and expand to new, big size in x and y"""
        pos_set = set()
        ip, jp = pos
        ip *= N_mult
        jp *= N_mult
        for ixn in range(N_mult):
            for jxn in range(N_mult):
                pos_set.add((ip + ixn, jp + jxn))
        return pos_set

    # define default policy
    gridworld.default_policyD = {
    }  #index=s_hash, value=list of equiprobable actions

    # redefine start
    istart, jstart = start
    start = (istart * N_mult, jstart * N_mult)

    # make goal set
    goal_set = make_big_set(goal)

    # make wall set
    wall_set = set()
    for wall in wallL:
        wall_set.update(make_big_set(wall))

    # create state hash entries
    for i in range(height_big):
        for j in range(width_big):
            s_hash = (i, j)
            if (s_hash not in wall_set) and (s_hash not in goal_set):
                gridworld.default_policyD[s_hash] = ('U', 'D', 'R', 'L')
                for a_desc in ['U', 'D', 'R', 'L']:
                    gridworld.add_action(
                        s_hash, a_desc,
                        a_prob=1.0)  # a_prob will be normalized

                    reward_val, sn_hash = get_action_snext_reward(
                        s_hash, a_desc)
                    # add each event to transitions object
                    gridworld.add_transition(s_hash,
                                             a_desc,
                                             sn_hash,
                                             t_prob=1.0,
                                             reward_obj=reward_val)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    # --------------------

    s_hash_rowL = []  # layout rows for making 2D output
    for i in range(height_big):  # put (0,0) at upper left
        rowL = []
        for j in range(width_big):
            s = (i, j)
            if s in wall_set:
                rowL.append('"Wall"')
            else:
                rowL.append(s)

        # append keeps row i=0 first, so (0,0) ends up at the upper left
        s_hash_rowL.append(rowL)  # layout rows for making 2D output

    named_s_hashD = {}
    named_s_hashD[start] = 'Start'
    for g in goal_set:
        named_s_hashD[g] = 'Goal'

    gridworld.layout = GenericLayout(gridworld,
                                     s_hash_rowL=s_hash_rowL,
                                     named_s_hashD=named_s_hashD)

    gridworld.start_state_hash = start

    return gridworld
Example 12
def get_gridworld(step_reward=-0.04):

    gridworld = EnvBaseline(name='Sutton Ex4.1 Grid World',
                            s_hash_rowL=s_hash_rowL)
    gridworld.set_info("""
        Example 4.1 grid 
        Label for blank space is "0" (both blanks are the same actual state)
        (i.e. upper left corner and lower right corner are state "0")
        """)

    for state_hash in range(1, 15):  # states are numbered 1-14
        for action_desc in ['U', 'D', 'R', 'L']:
            gridworld.add_action(state_hash, action_desc,
                                 a_prob=1.0)  # a_prob will be normalized

            a = action_desc
            s = state_hash

            if a == 'U':
                sn = s - 4
            elif a == 'D':
                sn = s + 4
            elif a == 'R':
                if s not in [3, 7, 11]:
                    sn = s + 1
                else:
                    sn = s
            elif a == 'L':
                if s not in [4, 8, 12]:
                    sn = s - 1
                else:
                    sn = s

            if sn < 0:
                sn = s
            elif sn > 15:
                sn = s
            elif sn == 15:
                sn = 0

            gridworld.add_transition(state_hash,
                                     action_desc,
                                     sn,
                                     t_prob=1.0,
                                     reward_obj=-1.0)

    gridworld.define_env_states_actions()  # send all states and actions to environment

    gridworld.start_state_hash = 12

    # define default policy (if any)
    policyD = {}  # index=state_hash, value=action_desc

    for (s_hash, a_desc) in gridworld.iter_state_hash_action_desc():
        if s_hash not in policyD:
            policyD[s_hash] = []
        policyD[s_hash].append((a_desc, 0.25))

    # make policyD entries hashable for later use (i.e. tuple, not list)
    for s_hash, aL in policyD.items():
        policyD[s_hash] = tuple(aL)

    gridworld.default_policyD = policyD

    return gridworld
Example 13
def get_env():

    env = EnvBaseline( name="Jacks Car Rental (const rtn)" ) # GenericLayout set below
    
    simplified_str ="""Shangtong Zhang's simplified model such that
the # of cars returned in daytime becomes constant
rather than a random value from poisson distribution, which will reduce calculation time
and leave the optimal policy/value state matrix almost the same"""    
    
    env.set_info( 'Example 4.2 from Sutton & Barto 2nd Edition page 81.\n' + simplified_str )

    
    # define all possible actions.
    saL = [] # a list of (s1, s2, adesc)
    s_hash_rowL = [] # layout rows for making 2D output

    
    for s1 in range( MAX_CARS + 1 ): # 20 cars max
        rowL = [] # row of s_hash_rowL
        
        for s2 in range( MAX_CARS + 1 ): # 20 cars max
            s_hash = (s1, s2)
            rowL.append( s_hash )
            
            for a_desc in range(-5, 6): # -5 moves 5 cars from 2nd to 1st. +5 from 1st to 2nd.
                
                if a_desc < 0: # can only move cars if they are present
                    if (abs(a_desc) <= s2):
                        env.add_action( s_hash, a_desc, a_prob=1.0 )
                        saL.append( (s1, s2, a_desc) )
                else:
                    if (a_desc <= s1): # can only move cars if they are present
                        env.add_action( s_hash, a_desc, a_prob=1.0 )
                        saL.append( (s1, s2, a_desc) )
        
        # use insert to put (0,0) at lower left
        s_hash_rowL.insert(0, rowL)  # layout rows for making 2D output
    
    # ------------------------------
    # figure out transition probabilities and rewards
    for s1 in range( MAX_CARS + 1 ):
        for s2 in range( MAX_CARS + 1 ):
            for a_desc in range( -5, 6 ):
                get_prob_reward( s1, s2, a_desc)

    # ------------------------------                                                
        
    print('\nStarting to define car rental transitions')
    # with all the probability figured out, define all transitions
    for (s1, s2, a_desc, sn_hash), t_prob in total_probD.items():
        # expected reward for this transition = (sum of prob * reward) / (total prob)
        txr = sum_prob_x_rewardD[ (s1, s2, a_desc, sn_hash) ]
        rval = txr / t_prob
        env.add_transition( (s1,s2), a_desc, sn_hash, t_prob=t_prob, reward_obj=rval)
    
        #if s1==10 and s2==10:
        #    print('for (10,10) a_desc=',a_desc,' sn_hash=',sn_hash,'  t_prob=',t_prob,'  rval=',rval)
    
    print('Calling: env.define_env_states_actions')
    env.define_env_states_actions()  # send all states and actions to environment
    print('Environment Ready.')

    # If there is a start state, define it here.
    env.start_state_hash = (10,10)

    # define default policy (if any)
    env.default_policyD = {}


    # --------------------
    # define layout for output

    env.layout = GenericLayout( env, s_hash_rowL=s_hash_rowL, 
                                x_axis_label='#Cars at Second Location',
                                y_axis_label='#Cars at First Location')
    
    return env
Example 14
def get_random_walk(Nside_states=9,
                    win_reward=1.0,
                    lose_reward=-1.0,
                    step_reward=0.0):

    Nstates = 2 * Nside_states + 1

    s = '(L%i, R%i)' % (Nside_states, Nside_states)
    env = EnvBaseline(name='%i State Random Walk MRP' % Nstates +
                      s)  # GenericLayout set below
    env.set_info('%i State Random Walk MRP' % Nstates + s)

    RstateL = ['R+%i' % i for i in range(1, Nside_states + 1)]
    LstateL = list(reversed([s.replace('R+', 'L-') for s in RstateL]))

    actionD = {}
    for s in LstateL:
        actionD[s] = ('L', 'R')
    actionD['C'] = ('L', 'R')
    for s in RstateL:
        actionD[s] = ('L', 'R')

    rewardD = {'Win': win_reward, 'Lose': lose_reward}

    for (s_hash, moveL) in actionD.items():
        for a_desc in moveL:
            env.add_action(s_hash, a_desc, a_prob=1.0)

    def add_event(s_hash, a_desc, sn_hash):
        #print('s_hash, a_desc, sn_hash',s_hash, a_desc, sn_hash)
        r = rewardD.get(sn_hash, step_reward)
        env.add_transition(s_hash, a_desc, sn_hash, t_prob=1.0, reward_obj=r)

    mrpL = ['Lose'] + LstateL + ['C'] + RstateL + ['Win']
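    # as in Example 3: enumerating mrpL[1:-1] makes mrpL[i] the left neighbor
    # and mrpL[i + 2] the right neighbor of interior state ci.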
    for i, ci in enumerate(mrpL[1:-1]):
        add_event(ci, 'L', mrpL[i])
        add_event(ci, 'R', mrpL[i + 2])

    env.define_env_states_actions()  # send all states and actions to environment

    # -------------------- make layout for printing ------------------

    s_hash_rowL = [mrpL]

    env.layout = GenericLayout(env, s_hash_rowL=s_hash_rowL)

    env.start_state_hash = 'C'

    # define default_policyD
    policyD = {}  # index=state_hash, value=action_desc

    policyD['C'] = ('L', 'R')
    for s in LstateL:
        policyD[s] = ('L', 'R')
    for s in RstateL:
        policyD[s] = ('L', 'R')

    env.default_policyD = policyD

    return env
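
Each factory above returns a configured EnvBaseline object. A minimal, hedged usage sketch, touching only attributes that the examples themselves assign (here Example 14's get_random_walk), might look like this:

env = get_random_walk(Nside_states=3)   # 7-state walk L-3..C..R+3 plus Win/Lose terminals
print(env.start_state_hash)             # 'C'
print(env.default_policyD['R+1'])       # ('L', 'R')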