Example #1
    def __init__(self, true_mdp_code, do_discount, do_randchoose, do_temp):
        danger_r = -2
        goal_reward = 10
        init_ground = (0, 2)
        goal_s = (5, 2)

        fr_vals = [{'x': danger_r, 'o': 0}[v] for v in true_mdp_code]
        feature_rewards = dict(zip('opc', fr_vals))
        feature_rewards['y'] = goal_reward
        params = {
            'gridworld_array':
            ['.oooo.', '.oppp.', '.opccy', '.oppc.', '.cccc.'],
            'feature_rewards': feature_rewards,
            'absorbing_states': [
                goal_s,
            ],
            'init_state': init_ground,
            'wall_action': False,
            'step_cost': 0,
            'wait_action': False,
        }
        self.gw = GridWorld(**params)
        self.planner = self.gw.solve(discount_rate=do_discount,
                                     softmax_temp=do_temp,
                                     randchoose=do_randchoose)
Example #2
class StandardPlanningModel(object):
    def __init__(self, true_mdp_code, do_discount, do_randchoose, do_temp):
        danger_r = -2
        goal_reward = 10
        init_ground = (0, 2)
        goal_s = (5, 2)

        fr_vals = [{'x': danger_r, 'o': 0}[v] for v in true_mdp_code]
        feature_rewards = dict(zip('opc', fr_vals))
        feature_rewards['y'] = goal_reward
        params = {
            'gridworld_array':
            ['.oooo.', '.oppp.', '.opccy', '.oppc.', '.cccc.'],
            'feature_rewards': feature_rewards,
            'absorbing_states': [
                goal_s,
            ],
            'init_state': init_ground,
            'wall_action': False,
            'step_cost': 0,
            'wait_action': False,
        }
        self.gw = GridWorld(**params)
        self.planner = self.gw.solve(discount_rate=do_discount,
                                     softmax_temp=do_temp,
                                     randchoose=do_randchoose)

    def trajectory_loglikelihood(self, wtraj):
        logl = 0
        for s, a in wtraj:
            adist = self.planner.act_dist(s)
            logl += math.log(adist[a])
        return logl
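
A minimal usage sketch (not part of the original listing): it assumes StandardPlanningModel is defined as above with math and pyrlap's GridWorld imported, that true_mdp_code is a three-character string over 'x'/'o' (the code 'xoo' below is a hypothetical choice marking only the 'o' feature as dangerous), and that planner.run() returns (state, action, next_state, reward) tuples as in the other examples in this listing.

model = StandardPlanningModel(true_mdp_code='xoo',  # hypothetical reward code
                              do_discount=.99,
                              do_randchoose=.1,
                              do_temp=.5)
traj = model.planner.run()  # assumed to yield [(s, a, ns, r), ...]
state_actions = [(s, a) for s, a, ns, r in traj]
print(model.trajectory_loglikelihood(state_actions))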
Example #3
    def test_qlearner_on_simple_deterministic_gridworld(self):
        gw = GridWorld(
            gridworld_array=['...........', '.xxxxxxxxxy', '.xxxxxxxxxx'],
            absorbing_states=[
                (10, 1),
            ],
            init_state=(0, 1),
            feature_rewards={
                '.': -1,
                'x': -10,
                'y': 100
            })

        np.random.seed(123)
        params = {
            'learning_rate': 1,
            'eligibility_trace_decay': .8,
            'initial_qvalue': 100
        }
        qlearn = Qlearning(gw, softmax_temp=1, discount_rate=.99, **params)
        qlearn.train(episodes=100, max_steps=100)
        test = qlearn.run(softmax_temp=0.0, randchoose=0.0, max_steps=50)
        totr = sum([r for s, a, ns, r in test])

        # Greedy test run: up to the top row of '.', ten steps right, then
        # down onto 'y': eleven -1 steps plus the +100 goal reward gives 89.
        self.assertEqual(totr, 89)
Example #4
    def test_discretizedobmdp_stochastic_gridworld(self):
        seed_trajs = []
        non_std_t_features = {'g': {
            '2forward': .5,
            'forward': .5
        }}
        exp_mdp = GridWorld(
            gridworld_array=[['w', 'y', 'w'],
                             ['w', 'w', 'w'],
                             ['w', 'w', 'w'],
                             ['g', 'w', 'g'],
                             ['w', 'w', 'w']],
            init_state=(1, 0),
            feature_rewards={'w': 0, 'r': 0, 'g': 0, 'y': 5},
            absorbing_states=[(1, 4), ],
            include_intermediate_terminal=True,
            non_std_t_features=non_std_t_features
        )
        exp_planner = exp_mdp.solve(discount_rate=.99,
                                    randchoose=.5,
                                    softmax_temp=0.0)
        for _ in range(50):
            traj = exp_planner.run()
            seed_trajs.append(traj)

        dobmdp = DiscretizedObserverBeliefMDPApproximation(
            planners=self.sto_planners,
            true_planner_name='s',
            belief_reward=5,
            belief_reward_type='true_gain',
            n_probability_bins=5,
            seed_trajs=seed_trajs
        )
        dobmdp_planner = dobmdp.solve(discount_rate=.99,
                                      softmax_temp=0.0,
                                      randchoose=0.0)
        traj = dobmdp_planner.run()

        s_i = dobmdp.planner_order.index('s')
        s_ismax = max_index(traj[-1][0][0]) == s_i
        self.assertTrue(s_ismax)
Example #5
    def setUp(self):
        det_planners = {}
        for mdpc in product('xo', repeat=2):
            mdpc = ''.join(mdpc)

            rewards = [{'x': -1, 'o': 0}[c] for c in mdpc]
            feature_rewards = dict(zip('pc', rewards))
            feature_rewards['w'] = 0
            feature_rewards['y'] = 5
            mdp = GridWorld(
                gridworld_array=[['w', 'p', 'y'],
                                 ['w', 'c', 'w']],
                feature_rewards=feature_rewards,
                init_state=(0, 0),
                absorbing_states=[(2, 1)],
                include_intermediate_terminal=True
            )
            planner = mdp.solve(discount_rate=.99,
                                softmax_temp=.5,
                                randchoose=.1)
            det_planners[mdpc] = planner
        self.det_planners = det_planners

        sto_planners = {}
        for mdpc in 'sw':
            p_2forward = {'s': .7, 'w': .3}[mdpc]
            non_std_t_features = {'g': {
                '2forward': p_2forward,
                'forward': 1 - p_2forward
            }}
            mdp = GridWorld(
                gridworld_array=[['w', 'y', 'w'],
                                 ['w', 'w', 'r'],
                                 ['r', 'w', 'w'],
                                 ['g', 'w', 'g'],
                                 ['w', 'w', 'w']],
                init_state=(1, 0),
                feature_rewards={'w': 0, 'r': -1, 'g': 0, 'y': 5},
                absorbing_states=[(1, 4),],
                include_intermediate_terminal=True,
                non_std_t_features=non_std_t_features
            )
            planner = mdp.solve(discount_rate=.99,
                                softmax_temp=.5,
                                randchoose=.1)
            sto_planners[mdpc] = planner
        self.sto_planners = sto_planners
Example #6
    def __init__(self,
                 true_mdp_code,
                 do_discount, 
                 do_randchoose,
                 do_temp,
                 show_discount,
                 show_reward,
                 show_randchoose,
                 show_temp,
                 n_bins=8,
                 seed_trajs=None,
                 disc_tf=None, 
                 solved_planner=None):
        self.show_discount = show_discount
        self.show_randchoose = show_randchoose
        self.show_temp = show_temp
        
        #=============================#

        # If a solved planner is provided, there is no
        # need to compute anything else.
        if solved_planner is not None:
            self.obmdp_planner = solved_planner
            self.obmdp = solved_planner.mdp
            return

        #=============================#
        #   Build set of ground MDPs  #
        #=============================#
        danger_r = -2
        goal_reward = 10
        init_ground = (0, 2)
        goal_s = (5, 2)
        
        mdp_params = []
        feature_rewards = []
        for rs in product([0, danger_r], repeat=3):
            feature_rewards.append(dict(zip('opc', rs)))
            
        mdp_codes = []
        for fr in feature_rewards:
            rfc = ['o' if fr[f] == 0 else 'x' for f in 'opc']
            rfc = ''.join(rfc)
            mdp_codes.append(rfc)
            fr['y'] = goal_reward
            fr['.'] = 0
        
        planners = {}
        for mdpc, frewards in zip(mdp_codes, feature_rewards):
            params = {
                'gridworld_array': ['.oooo.',
                                    '.oppp.',
                                    '.opccy',
                                    '.oppc.',
                                    '.cccc.'],
                'feature_rewards': frewards,
                'absorbing_states': [goal_s, ],
                'init_state': init_ground,
                'wall_action': False,
                'step_cost': 0,
                'wait_action': False,
                'include_intermediate_terminal': True
            }
            mdp = GridWorld(**params)
            planner = mdp.solve(
                softmax_temp=do_temp, 
                randchoose=do_randchoose, 
                discount_rate=do_discount)
            planners[mdpc] = planner
            
        #===========================================#
        #   Build Observer Belief MDP and support   #
        #===========================================#
        obmdp = DiscretizedObserverBeliefMDPApproximation(
            n_probability_bins=n_bins,
            seed_trajs=seed_trajs,
            branch_steps=0,
            discretized_tf=disc_tf,
            planners=planners,
            true_planner_name=true_mdp_code,
            belief_reward_type='true_gain',
            only_belief_reward=False,
            belief_reward=show_reward,
            update_includes_intention=True)
        self.obmdp = obmdp
        self.obmdp_planner = None
Example #7
import time

import numpy as np
import pandas as pd
import seaborn as sns

from pyrlap.domains.gridworld import GridWorld
from pyrlap.algorithms.qlearning import Qlearning
from pyrlap.domains.gridworld.gridworldvis import visualize_trajectory

# %%
gw = GridWorld(gridworld_array=['...........', '.xxxxxxxxxy', '.xxxxxxxxxx'],
               absorbing_states=[
                   (10, 1),
               ],
               init_state=(0, 1),
               feature_rewards={
                   '.': -1,
                   'x': -10,
                   'y': 100
               })
s_features = gw.state_features

# %%
np.random.seed(1234)
all_run_data = []

# %%
start = time.time()
for i in range(20):
    params = {
        'learning_rate': 1,
Example #8
    def test_slip_state_world(self):
        state_features = [['w', 'x', 'x', 'x', 'x', 'w'],
                          ['w', 'a', 'a', 'a', 'a', 'y'],
                          ['w', 'x', 'x', 'x', 'x', 'w']]
        w = len(state_features[0])
        h = len(state_features)
        state_features = {(x, y): state_features[h - 1 - y][x]
                          for x, y in product(range(w), range(h))}
        absorbing_states = [
            (5, 1),
        ]
        feature_rewards = {'a': 0, 'b': 0, 'x': -1, 'y': 5, 'w': 0}

        slip_features = {'a': {'forward': .6, 'side': .4, 'back': 0}}

        params = {
            'width': w,
            'height': h,
            'state_features': state_features,
            'feature_rewards': feature_rewards,
            'absorbing_states': absorbing_states,
            'slip_features': slip_features,
            'init_state': (0, 1),
            'include_intermediate_terminal': True
        }

        gw = GridWorld(**params)
        planner = gw.solve(discount_rate=.99)

        true_policy = {
            (-2, -2): '%',
            (-1, -1): '%',
            (0, 0): '^',
            (0, 1): '>',
            (0, 2): 'v',
            (1, 0): '^',
            (1, 1): '>',
            (1, 2): 'v',
            (2, 0): '^',
            (2, 1): '>',
            (2, 2): 'v',
            (3, 0): '>',
            (3, 1): '>',
            (3, 2): '>',
            (4, 0): '>',
            (4, 1): '>',
            (4, 2): '>',
            (5, 0): '^',
            (5, 1): '%',
            (5, 2): 'v'
        }

        self.assertEqual(planner.optimal_policy, true_policy)

        np.random.seed(2223124)
        traj = []
        s = gw.get_init_state()
        for _ in range(20):
            a = planner.optimal_policy[s]
            ns = gw.transition(s, a)
            r = gw.reward(s, a, ns)
            traj.append((s, a, ns, r))
            s = ns
            if s in gw.absorbing_states:
                break
        true_traj = [((0, 1), '>', (1, 1), 0), ((1, 1), '>', (1, 0), -1),
                     ((1, 0), '^', (1, 1), 0), ((1, 1), '>', (2, 1), 0),
                     ((2, 1), '>', (3, 1), 0), ((3, 1), '>', (3, 0), -1),
                     ((3, 0), '>', (4, 0), -1), ((4, 0), '>', (5, 0), 0),
                     ((5, 0), '^', (5, 1), 5)]
        self.assertEqual(traj, true_traj)
Example #9
    def test_simple_deterministic_world(self):
        state_features = [['w', 'c', 'c', 'c', 'c', 'w'],
                          ['w', 'c', 'c', 'c', 'c', 'w'],
                          ['w', 'a', 'a', 'a', 'a', 'y'],
                          ['w', 'c', 'c', 'c', 'c', 'w'],
                          ['w', 'b', 'b', 'b', 'b', 'w'],
                          ['w', 'c', 'c', 'c', 'c', 'w']]
        w = len(state_features[0])
        h = len(state_features)
        state_features = {(x, y): state_features[h - 1 - y][x]
                          for x, y in product(range(w), range(h))}
        absorbing_states = [
            (5, 3),
        ]
        feature_rewards = {'a': -2, 'b': 0, 'c': -1, 'y': 1, 'w': 0}
        states = list(state_features.keys())

        params = {
            'width': w,
            'height': h,
            'state_features': state_features,
            'feature_rewards': feature_rewards,
            'absorbing_states': absorbing_states,
            'init_state': (0, 3),
            'include_intermediate_terminal': True
        }

        gw = GridWorld(**params)
        planner = gw.solve(discount_rate=.99)

        true_policy = {
            (-2, -2): '%',
            (-1, -1): '%',
            (0, 0): '^',
            (0, 1): '>',
            (0, 2): 'v',
            (0, 3): 'v',
            (0, 4): 'v',
            (0, 5): 'v',
            (1, 0): '^',
            (1, 1): '>',
            (1, 2): 'v',
            (1, 3): '<',
            (1, 4): '<',
            (1, 5): '<',
            (2, 0): '^',
            (2, 1): '>',
            (2, 2): 'v',
            (2, 3): 'v',
            (2, 4): '<',
            (2, 5): '<',
            (3, 0): '^',
            (3, 1): '>',
            (3, 2): 'v',
            (3, 3): 'v',
            (3, 4): '>',
            (3, 5): '>',
            (4, 0): '>',
            (4, 1): '>',
            (4, 2): '>',
            (4, 3): '>',
            (4, 4): '>',
            (4, 5): '>',
            (5, 0): '^',
            (5, 1): '^',
            (5, 2): '^',
            (5, 3): '%',
            (5, 4): 'v',
            (5, 5): 'v'
        }

        self.assertEqual(planner.optimal_policy, true_policy)

        traj = []
        s = gw.get_init_state()
        while s not in gw.absorbing_states:
            a = planner.optimal_policy[s]
            ns = gw.transition(s, a)
            r = gw.reward(s, a, ns)
            traj.append((s, a, ns, r))
            s = ns
            if len(traj) > 100:
                break
        true_traj = [((0, 3), 'v', (0, 2), 0), ((0, 2), 'v', (0, 1), 0),
                     ((0, 1), '>', (1, 1), 0), ((1, 1), '>', (2, 1), 0),
                     ((2, 1), '>', (3, 1), 0), ((3, 1), '>', (4, 1), 0),
                     ((4, 1), '>', (5, 1), 0), ((5, 1), '^', (5, 2), 0),
                     ((5, 2), '^', (5, 3), 1)]
        self.assertEqual(traj, true_traj)
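
The greedy rollout loop above appears in both of these tests; as a usage sketch (not part of the original tests), it can be factored into a helper that relies only on the GridWorld and planner calls already shown (get_init_state, transition, reward, absorbing_states, optimal_policy).

def rollout_greedy(gw, planner, max_steps=100):
    # Follow the planner's optimal policy until an absorbing state is
    # reached or max_steps transitions have been taken.
    traj = []
    s = gw.get_init_state()
    for _ in range(max_steps):
        a = planner.optimal_policy[s]
        ns = gw.transition(s, a)
        traj.append((s, a, ns, gw.reward(s, a, ns)))
        s = ns
        if s in gw.absorbing_states:
            break
    return traj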
Example #10
    def get_gridworld(self, absorbing_states):
        # Pass the requested absorbing states through to the GridWorld.
        return GridWorld(width=self.width,
                         height=self.height,
                         walls=self.walls,
                         absorbing_states=absorbing_states)
Example #11
from itertools import product

from pyrlap.hierarchicalrl.ham.ham import AbstractMachine, \
    HierarchyOfAbstractMachines
from pyrlap.domains.gridworld import GridWorld
from pyrlap.domains.taxicab.utils import get_building_walls

gw = GridWorld(width=7,
               height=7,
               walls=get_building_walls(corners=[(1, 1), (6, 1), (6, 6),
                                                 (1, 6)],
                                        exits=[((1, 5), '^'), ((5, 1), '>')]),
               absorbing_states=[
                   (0, 6),
               ],
               reward_dict={(0, 6): 100},
               step_cost=-1,
               init_state=(4, 1))

outside_states = {
    (0, 0), (1, 0), (2, 0), (3, 0), (4, 0), (5, 0), (6, 0), (0, 6), (1, 6),
    (2, 6), (3, 6), (4, 6), (5, 6), (6, 6), (0, 1), (0, 2), (0, 3), (0, 4),
    (0, 5), (6, 1), (6, 2), (6, 3), (6, 4), (6, 5)
}

inside_states = [
    s for s in product(range(7), range(7)) if s not in outside_states
]


class Root(AbstractMachine):
Example #12
    def __init__(self,
                 gw: GridWorld,
                 tile_colors: dict = None,
                 feature_colors: dict = None,
                 ax: plt.Axes = None,
                 figsize: tuple = None,
                 title: str = None):
        default_feature_colors = {
            'a': 'orange',
            'b': 'purple',
            'c': 'cyan',
            'x': 'red',
            'p': 'pink',
            '.': 'white',
            'y': 'yellow',
            'g': 'yellow',
            'n': 'white',
            '#': 'black',
            'j': 'lightgreen'
        }
        if feature_colors is None:
            feature_colors = default_feature_colors
        else:
            temp_fcolors = copy.deepcopy(default_feature_colors)
            temp_fcolors.update(feature_colors)
            feature_colors = temp_fcolors

        if tile_colors is None:
            tile_colors = {}
        else:
            tile_colors = copy.copy(tile_colors)

        states_to_plot = []
        for s in gw.states:
            if gw.is_any_terminal(s):
                continue
            states_to_plot.append(s)
            if s in tile_colors:
                continue
            if (s, None) in gw.walls:
                continue
            f = gw.state_features.get(s, '.')
            tile_colors[s] = feature_colors.get(f, 'grey')

        if figsize is None:
            figsize = (5, 5)

        if ax is None:
            fig, ax = plt.subplots(1, 1, figsize=figsize)

        if title is not None:
            ax.set_title(title)

        self.gw = gw
        self.feature_colors = feature_colors
        self.tile_colors = tile_colors
        self.ax = ax

        self.states_to_plot = states_to_plot
        self.annotations = {}
        self.trajectories = {}