Example #1
def info_sa_visualize_abstr(mdp, demo_policy_lambda, beta=2.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool)
        is_agent_in_control (bool)

    Summary:
        Visualizes the state abstraction found by info_sa using pygame.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

    vi = ValueIteration(mdp)
    print "\t|S|", vi.get_num_states()
    print "\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states()

    from simple_rl.abstraction.state_abs.sa_helpers import visualize_state_abstr_grid
    visualize_state_abstr_grid(mdp, crisp_s_phi)
def main():
    ap_map = {'a': (2, 2), 'b': (6, 3), 'c': (5, 3), 'd': (4, 2)}
    ltlformula = 'F (b & Fa)'
    # Setup MDP, Agents.
    mdp = LTLGridWorldMDP(ltltask=ltlformula,
                          ap_map=ap_map,
                          width=6,
                          height=6,
                          goal_locs=[(6, 6)],
                          slip_prob=0.2)
    mdp.automata.subproblem_flag = 0
    mdp.automata.subproblem_stay = 1
    mdp.automata.subproblem_goal = 0
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])

    print(ltlformula)
    f = open('/Users/romapatel/Desktop/actions.tsv', 'w+')
    for item in state_seq:
        f.write(str(item) + '\n')

    f.close()
    model = None
    ltl_visualiser(model)
Example #3
class StochasticSAPolicy(object):
    def __init__(self, state_abstr, mdp):
        self.state_abstr = state_abstr
        self.mdp = mdp
        self.vi = ValueIteration(mdp)
        self.vi.run_vi()

    def policy(self, state):
        '''
        Args:
            (simple_rl.State)

        Returns:
            (str): An action

        Summary:
            Chooses an action among the optimal actions in the cluster. That is, roughly:

                \pi(a \mid s_a) \sim Pr_{s_g \in s_a} (a = a^*(s_a))
        '''

        abstr_state = self.state_abstr.phi(state)
        ground_states = self.state_abstr.get_ground_states_in_abs_state(
            abstr_state)

        action_distr = defaultdict(float)
        for s in ground_states:
            a = self.vi.policy(s)
            action_distr[a] += 1.0 / len(ground_states)

        actions = list(action_distr.keys())
        probs = list(action_distr.values())

        # Sample one action index from the multinomial over cluster actions.
        sampled_distr = np.random.multinomial(1, probs).tolist()
        indices = [i for i, x in enumerate(sampled_distr) if x > 0]

        return actions[indices[0]]
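The sampling at the end of policy() simply draws one action from the empirical distribution of optimal actions across the cluster's ground states. A minimal self-contained sketch of that step with a made-up distribution (the action names below are illustrative, not part of simple_rl):

import numpy as np

# Hypothetical action distribution within one abstract state.
action_distr = {"up": 0.5, "left": 0.25, "right": 0.25}
actions = list(action_distr.keys())
probs = list(action_distr.values())

# Draw a single multinomial sample; the index of the 1 picks the action.
sample = np.random.multinomial(1, probs).tolist()
print(actions[sample.index(1)])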
Example #4
def main():
    ap_map = {'a': (2, 2), 'b': (6, 3), 'c': (5, 3), 'd': (4, 2)}
    print('Atomic propositions:', ap_map)
    ltlformula = 'F (b & Fa)'
    print('LTL formula:', ltlformula)
    # Setup MDP, Agents.
    print('Translating the LTL formula to an automaton...')
    a = spot.translate('(a U b) & GFc & GFd', 'BA', 'complete')
    a.show("v")

    return  # Early return: the planning code below is never reached in this example.
    mdp = LTLGridWorldMDP(ltltask=ltlformula,
                          ap_map=ap_map,
                          width=6,
                          height=6,
                          goal_locs=[(6, 6)],
                          slip_prob=0.2)

    mdp.automata.subproblem_flag = 0
    mdp.automata.subproblem_stay = 1
    mdp.automata.subproblem_goal = 0
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    print('Value iteration')
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])
    def update_policy(self):
        avg_mdp_vi = ValueIteration(compute_avg_mdp(self.active_mdp_distr),
                                    delta=0.0001,
                                    max_iterations=1000,
                                    sample_rate=5)
        avg_mdp_vi.run_vi()
        self.policy = avg_mdp_vi.policy
def main():

    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)
    viz = parse_args()

    # Choose viz type.
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
Example #7
def main():
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5) 
    rm_agent = RMaxAgent(mdp.get_actions())
    viz = parse_args()
    viz = "learning"

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
Example #8
    def __init__(self,
                 mdp,
                 lower_values_init,
                 upper_values_init,
                 tau=10.,
                 name='BRTDP'):
        '''
        Args:
            mdp (MDP): underlying MDP to plan in
            lower_values_init (defaultdict): lower bound initialization on the value function
            upper_values_init (defaultdict): upper bound initialization on the value function
            tau (float): scaling factor to help determine when the bounds on the value function are tight enough
            name (str): Name of the planner
        '''
        Planner.__init__(self, mdp, name)
        self.lower_values = lower_values_init
        self.upper_values = upper_values_init

        # Using the value iteration class for accessing the matrix of transition probabilities
        vi = ValueIteration(mdp, sample_rate=1000)
        self.states = vi.get_states()
        vi._compute_matrix_from_trans_func()
        self.trans_dict = vi.trans_dict

        self.max_diff = (self.upper_values[self.mdp.init_state] -
                         self.lower_values[self.mdp.init_state]) / tau
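For context: max_diff is the bound gap at the initial state scaled by tau. Roughly following the BRTDP paper (McMahan et al., 2005), a sampled trajectory is cut off once the gap at the current state falls below this threshold,

    V_U(s) - V_L(s) < (V_U(s_0) - V_L(s_0)) / tau

so larger values of tau keep rollouts going until the bounds are tighter.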
Example #9
def get_optimal_policies(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list)
    '''

    # Make State Abstraction
    approx_qds_test = get_sa(environment,
                             indic_func=ind_funcs._q_eps_approx_indicator,
                             epsilon=0.05)

    # True Optimal
    true_opt_vi = ValueIteration(environment)
    true_opt_vi.run_vi()
    opt_agent = FixedPolicyAgent(true_opt_vi.policy, "$\pi^*$")

    # Optimal Abstraction
    opt_det_vi = AbstractValueIteration(environment,
                                        state_abstr=approx_qds_test,
                                        sample_rate=30)
    opt_det_vi.run_vi()
    opt_det_agent = FixedPolicyAgent(opt_det_vi.policy, name="$\pi_{\phi}^*$")

    stoch_policy_obj = StochasticSAPolicy(approx_qds_test, environment)
    stoch_agent = FixedPolicyAgent(stoch_policy_obj.policy,
                                   "$\pi(a \mid s_\phi )$")

    ql_agents = [opt_agent, stoch_agent, opt_det_agent]

    return ql_agents
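The returned fixed-policy agents are typically handed to simple_rl's run_agents_on_mdp, as in other examples on this page. A short usage sketch (the parameter values are illustrative):

agents = get_optimal_policies(environment)
run_agents_on_mdp(agents, environment, instances=5, episodes=100, steps=200)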
Example #10
def main():
    
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.2) 
    viz = parse_args()

    # Choose viz type.
    viz = "value"

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
Example #11
def main():
    # Setup MDP, Agents.
    size = 5
    agent = {
        "x": 1,
        "y": 1,
        "dx": 1,
        "dy": 0,
        "dest_x": size,
        "dest_y": size,
        "has_block": 0
    }
    blocks = [{"x": size, "y": 1}]
    lavas = [{
        "x": x,
        "y": y
    } for x, y in ((z + 1, (size + 1) // 2) for z in range(size))]

    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    # run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)

    vi = ValueIteration(mdp, delta=0.0001, max_iterations=5000)
    iters, val = vi.run_vi()
    print " done."
    states = vi.get_states()
    num_states = len(states)
    print num_states, states
Example #12
def get_l1_policy(start_room=None,
                  goal_room=None,
                  mdp=None,
                  starting_items=None,
                  goal_items=None,
                  actions=None,
                  doors=None,
                  rooms=None):
    if mdp is None:
        mdp = FourRoomL1MDP(start_room,
                            goal_room,
                            starting_items=starting_items,
                            goal_items=goal_items,
                            actions=actions,
                            doors=doors,
                            rooms=rooms)
    vi = ValueIteration(mdp)
    vi.run_vi()

    policy = defaultdict()
    action_seq, state_seq = vi.plan(mdp.init_state)

    print('Plan for {}:'.format(mdp))
    for i in range(len(action_seq)):
        print("\tpi[{}] -> {}".format(state_seq[i], action_seq[i]))
        policy[state_seq[i]] = action_seq[i]
    return policy
Example #13
    def compute_value_iteration_results(self, sample_rate):

        # If value iteration was run previously, don't re-run it
        if self.value_iter is None or self._policy_invalidated == True:
            self.value_iter = ValueIteration(self, sample_rate=sample_rate)
            _ = self.value_iter.run_vi()
            self._policy_invalidated = False
        return self.value_iter
    def __init__(self, mdp, name='MonotoneUpperBound'):
        relaxed_mdp = MonotoneLowerBound._construct_deterministic_relaxation_mdp(
            mdp)

        Planner.__init__(self, relaxed_mdp, name)
        self.vi = ValueIteration(relaxed_mdp)
        self.states = self.vi.get_states()
        self.vi._compute_matrix_from_trans_func()
        self.vi.run_vi()
        self.lower_values = self._construct_lower_values()
Example #15
def main():
    mdp1 = GridWorldMDP(width=2,
                        height=1,
                        init_loc=(1, 1),
                        goal_locs=[(2, 1)],
                        slip_prob=0.5,
                        gamma=0.5)

    vi = ValueIteration(mdp1)
    iters, value = vi.run_vi()
    print("value=", value)
Example #16
    def run_value_iteration(self):
        """Runs value iteration (if needed).

        Returns:
            ValueIteration object.
        """
        # If value iteration was run previously, don't re-run it
        if self._policy_invalidated == True:
            self.value_iter = ValueIteration(self, sample_rate=1)
            _ = self.value_iter.run_vi()
            self._policy_invalidated = False
        return self.value_iter
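Both this method and compute_value_iteration_results above implement the same memoization idea: keep one ValueIteration instance and re-plan only after a _policy_invalidated flag has been raised. A standalone sketch of that pattern, decoupled from any particular MDP class (the class and method names here are illustrative assumptions):

from simple_rl.planning.ValueIterationClass import ValueIteration

class CachedPlanner(object):
    '''Sketch: re-run value iteration only when the wrapped model has changed.'''

    def __init__(self, mdp, sample_rate=5):
        self.mdp = mdp
        self.sample_rate = sample_rate
        self.value_iter = None
        self._policy_invalidated = True

    def invalidate(self):
        # Call whenever the MDP's rewards or transitions change.
        self._policy_invalidated = True

    def get_planner(self):
        if self.value_iter is None or self._policy_invalidated:
            self.value_iter = ValueIteration(self.mdp, sample_rate=self.sample_rate)
            self.value_iter.run_vi()
            self._policy_invalidated = False
        return self.value_iter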
Example #17
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])
    def get_l1_policy(domain):
        vi = ValueIteration(domain, sample_rate=1)
        vi.run_vi()

        policy = defaultdict()
        action_seq, state_seq = vi.plan(domain.init_state)

        print('Plan for {}:'.format(domain))
        for i in range(len(action_seq)):
            print("\tpi[{}] -> {}\n".format(state_seq[i], action_seq[i]))
            policy[state_seq[i]] = action_seq[i]

        return policy
Example #19
    def get_l1_policy(start_room=None, goal_room=None, mdp=None):
        if mdp is None:
            mdp = CubeL1MDP(start_room, goal_room)
        vi = ValueIteration(mdp)
        vi.run_vi()

        policy = defaultdict()
        action_seq, state_seq = vi.plan(mdp.init_state)

        print('Plan for {}:'.format(mdp))
        for i in range(len(action_seq)):
            print("\tpi[{}] -> {}".format(state_seq[i], action_seq[i]))
            policy[state_seq[i]] = action_seq[i]
        return policy
def main():
    # Grab experiment params.
    # Switch between Upworld and Trench
    mdp_class = "upworld"
    # mdp_class = "trench"
    grid_lim = 20 if mdp_class == 'upworld' else 7
    gamma = 0.95
    vanilla_file = "vi.csv"
    sa_file = "vi-$\phi_{Q_d^*}.csv"
    file_prefix = "results/planning-" + mdp_class + "/"
    clear_files(dir_name=file_prefix)

    for grid_dim in range(3, grid_lim):
        # ======================
        # == Make Environment ==
        # ======================
        environment = make_mdp.make_mdp(mdp_class=mdp_class, grid_dim=grid_dim)
        environment.set_gamma(gamma)

        # =======================
        # == Make Abstractions ==
        # =======================
        sa_qds = get_sa(environment,
                        indic_func=ind_funcs._q_disc_approx_indicator,
                        epsilon=0.01)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(environment, delta=0.0001, sample_rate=15)
        sa_vi = AbstractValueIteration(ground_mdp=environment,
                                       state_abstr=sa_qds)

        print "Running VIs."
        start_time = time.clock()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.clock() - start_time, 2)

        start_time = time.clock()
        sa_iters, sa_val = sa_vi.run_vi()
        sa_time = round(time.clock() - start_time, 2)

        print "vanilla", vanilla_iters, vanilla_val, vanilla_time
        print "sa:", sa_iters, sa_val, sa_time

        write_datum(file_prefix + "iters/" + vanilla_file, vanilla_iters)
        write_datum(file_prefix + "iters/" + sa_file, sa_iters)

        write_datum(file_prefix + "times/" + vanilla_file, vanilla_time)
        write_datum(file_prefix + "times/" + sa_file, sa_time)
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)

    value_iter = ValueIteration(mdp, sample_rate=5)
    mcts = MCTS(mdp, num_rollouts_per_step=50)
    # _, val = value_iter.run_vi()

    # Value Iteration.
    vi_action_seq, vi_state_seq = value_iter.plan(mdp.get_init_state())
    mcts_action_seq, mcts_state_seq = mcts.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(mcts_action_seq)):
        print("\t", mcts_action_seq[i], mcts_state_seq[i])
Example #22
    def get_l1_policy(oomdp=None):
        if oomdp is None:
            oomdp = TaxiL1OOMDP()
        vi = ValueIteration(oomdp, sample_rate=1)
        vi.run_vi()

        policy = defaultdict()
        action_seq, state_seq = vi.plan(oomdp.init_state)

        print('Plan for {}:'.format(oomdp))
        for i in range(len(action_seq)):
            print("\tpi[{}] -> {}\n".format(state_seq[i], action_seq[i]))
            policy[state_seq[i]] = action_seq[i]

        return policy
Example #23
def main():
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)

    value_iter = ValueIteration(mdp, sample_rate=5)
    mcts = MCTS(mdp, num_rollouts_per_step=50)
    # _, val = value_iter.run_vi()

    # Value Iteration.
    vi_action_seq, vi_state_seq = value_iter.plan(mdp.get_init_state())
    mcts_action_seq, mcts_state_seq = mcts.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(mcts_action_seq)):
        print("\t", mcts_action_seq[i], mcts_state_seq[i])
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        Policy
    '''
    # Solve the MDP defined by the terminal abstract state.
    mini_mdp_vi = ValueIteration(mini_mdp, delta=0.005, max_iterations=500, sample_rate=20)
    iters, val = mini_mdp_vi.run_vi()

    o_policy_dict = make_dict_from_lambda(mini_mdp_vi.policy, mini_mdp_vi.get_states())
    o_policy = PolicyFromDict(o_policy_dict)

    return o_policy.get_action, mini_mdp_vi
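make_dict_from_lambda is not shown here; judging by the call site, it presumably tabulates the planner's policy over the enumerated states so the option policy can be looked up without re-planning. A plausible sketch (the name and behavior are assumptions inferred from how it is used above):

def make_dict_from_lambda(policy_func, states):
    # Tabulate a policy lambda into a {state: action} dictionary.
    return {s: policy_func(s) for s in states}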
Example #25
def make_near_optimal_phi_relative_options(mdp,
                                           state_abstr,
                                           method='optimal',
                                           num_rand_opts=0,
                                           **kwargs):
    """
    Args:
        mdp
        state_abstr
        method
        num_rand_opts

    Returns:
        (list)
    """
    # Get the optimal Q function
    from planning.OptionsMDPValueIterationClass import OptionsMDPValueIteration
    from data_structs.OptionsMDPClass import OptionsMDP

    if isinstance(mdp, OptionsMDP):
        value_iter = OptionsMDPValueIteration(mdp, sample_rate=20)
    else:
        value_iter = ValueIteration(mdp, sample_rate=10)

    value_iter.run_vi()

    options = []
    optimal_options = []
    for s_phi in state_abstr.get_abs_states():
        init_predicate = EqPredicate(y=s_phi, func=state_abstr.phi)
        term_predicate = NeqPredicate(y=s_phi, func=state_abstr.phi)
        o_star = Option(init_predicate=init_predicate,
                        term_predicate=term_predicate,
                        policy=lambda s: value_iter.policy(s))

        if method == 'optimal':
            options.append(o_star)
        elif method == 'eps-greedy':
            eps = kwargs['eps']

            eps_greedy_policy = get_eps_greedy_policy(eps, value_iter.policy,
                                                      mdp.get_actions())

            o_eps = Option(init_predicate=init_predicate,
                           term_predicate=term_predicate,
                           policy=eps_greedy_policy)

            for _ in range(num_rand_opts):
                o_rand = Option(
                    init_predicate=init_predicate,
                    term_predicate=term_predicate,
                    policy=lambda x: random.choice(mdp.get_actions()))
                options.append(o_rand)

            options.append(o_eps)
        else:
            options.append(o_star)

    return options, optimal_options
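get_eps_greedy_policy is referenced but not defined in this snippet. A minimal sketch of what such a wrapper might look like (an assumption, not the library's actual implementation): with probability eps it acts uniformly at random, otherwise it defers to the base policy.

import random

def get_eps_greedy_policy(eps, base_policy, actions):
    # With probability eps act uniformly at random, otherwise follow base_policy.
    def policy(state):
        if random.random() < eps:
            return random.choice(actions)
        return base_policy(state)
    return policy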
Example #26
def main():
    ap_map = {'a': (2,2),'b': (6,3), 'c': (5,3), 'd': (4,2)}
    ltlformula = 'F (b & Fa)'
    # Setup MDP, Agents.
    mdp = LTLGridWorldMDP(ltltask=ltlformula, ap_map=ap_map, width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    mdp.automata.subproblem_flag = 0
    mdp.automata.subproblem_stay = 1
    mdp.automata.subproblem_goal = 0
    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])
Example #27
def get_l1_policy(start_room=None, goal_room=None, mdp=None):
    if mdp is None:
        mdp = FourRoomL1MDP(start_room,
                            goal_room,
                            starting_items=[2, 0],
                            goal_items=[2,
                                        1])  #room 2, light off =0, light on =1
    vi = ValueIteration(mdp)
    vi.run_vi()

    policy = defaultdict()
    action_seq, state_seq = vi.plan(mdp.init_state)

    print('Plan for {}:'.format(mdp))
    for i in range(len(action_seq)):
        print("\tpi[{}] -> {}".format(state_seq[i], action_seq[i]))
        policy[state_seq[i]] = action_seq[i]
    return policy
Example #28
    def __init__(self, mdp, name='MonotoneUpperBound'):
        relaxed_mdp = MonotoneLowerBound._construct_deterministic_relaxation_mdp(mdp)

        Planner.__init__(self, relaxed_mdp, name)
        self.vi = ValueIteration(relaxed_mdp)
        self.states = self.vi.get_states()
        self.vi._compute_matrix_from_trans_func()
        self.vi.run_vi()
        self.lower_values = self._construct_lower_values()
Example #29
def generate_agent(mdp_class, data_loc, mdp_parameters, visualize=False):
    try:
        with open('models/' + data_loc + '/vi_agent.pickle', 'rb') as f:
            mdp_agent, vi_agent = pickle.load(f)
    except:
        mdp_agent = make_mdp.make_custom_mdp(mdp_class, mdp_parameters)
        vi_agent = ValueIteration(mdp_agent, sample_rate=1)
        vi_agent.run_vi()

        with open('models/' + data_loc + '/vi_agent.pickle', 'wb') as f:
            pickle.dump((mdp_agent, vi_agent), f)

    # Visualize agent
    if visualize:
        fixed_agent = FixedPolicyAgent(vi_agent.policy)
        mdp_agent.visualize_agent(fixed_agent)
        mdp_agent.reset()  # reset the current state to the initial state
        mdp_agent.visualize_interaction()
Example #30
def main():

    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc,
                       args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(),
                              epsilon=args.epsilon,
                              alpha=args.alpha,
                              explore=args.explore,
                              anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        mdp.visualize_policy_values(
            (lambda state: value_iter.policy(state)),
            (lambda state: value_iter.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent],
                          mdp,
                          open_plot=True,
                          episodes=60,
                          steps=200,
                          instances=5,
                          success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent,
                               delay=0.005,
                               num_ep=500,
                               num_steps=200)
Example #31
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy":lambda_abstr_policy, "name":"$\\pi_\\phi$"}, name_ext="")
    
    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)


    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]
    # Print state space sizes.
    demo_vi = ValueIteration(mdp)
    print "\nState Spaces Sizes:"
    print "\t|S| =", demo_vi.get_num_states()
    print "\tH(S_\\phi) =", entropy(pmf_s_phi)
    print "\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states()
    print "\tdelta_min =", min(non_zero_abstr_states)
    print "\tnum non zero states =", len(non_zero_abstr_states)
    print
    def __init__(self, mdp, lower_values_init, upper_values_init, tau=10., name='BRTDP'):
        '''
        Args:
            mdp (MDP): underlying MDP to plan in
            lower_values_init (defaultdict): lower bound initialization on the value function
            upper_values_init (defaultdict): upper bound initialization on the value function
            tau (float): scaling factor to help determine when the bounds on the value function are tight enough
            name (str): Name of the planner
        '''
        Planner.__init__(self, mdp, name)
        self.lower_values = lower_values_init
        self.upper_values = upper_values_init

        # Using the value iteration class for accessing the matrix of transition probabilities
        vi = ValueIteration(mdp, sample_rate=1000)
        self.states = vi.get_states()
        vi._compute_matrix_from_trans_func()
        self.trans_dict = vi.trans_dict

        self.max_diff = (self.upper_values[self.mdp.init_state] - self.lower_values[self.mdp.init_state]) / tau
Example #33
class MonotoneUpperBound(Planner):
    def __init__(self, mdp, name='MonotoneUpperBound'):
        Planner.__init__(self, mdp, name)
        self.vi = ValueIteration(mdp)
        self.states = self.vi.get_states()
        self.upper_values = self._construct_upper_values()

    def _construct_upper_values(self):
        values = defaultdict()
        for state in self.states:
            values[state] = 1. / (1. - self.gamma)
        return values
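The constant used for the upper bound is the usual geometric-series bound on the discounted return, under the assumption (not enforced in the code) that per-step rewards lie in [0, 1]:

    V^*(s) <= \sum_{t >= 0} \gamma^t R_max = R_max / (1 - \gamma), which equals 1 / (1 - \gamma) when R_max = 1.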
Example #35
class MonotoneLowerBound(Planner):
    def __init__(self, mdp, name='MonotoneUpperBound'):
        relaxed_mdp = MonotoneLowerBound._construct_deterministic_relaxation_mdp(mdp)

        Planner.__init__(self, relaxed_mdp, name)
        self.vi = ValueIteration(relaxed_mdp)
        self.states = self.vi.get_states()
        self.vi._compute_matrix_from_trans_func()
        self.vi.run_vi()
        self.lower_values = self._construct_lower_values()

    @staticmethod
    def _construct_deterministic_relaxation_mdp(mdp):
        relaxed_mdp = copy.deepcopy(mdp)
        relaxed_mdp.set_slip_prob(0.0)
        return relaxed_mdp

    def _construct_lower_values(self):
        values = defaultdict()
        for state in self.states:
            values[state] = self.vi.get_value(state)
        return values
Example #36
    def __init__(self, mdp, name='MonotoneUpperBound'):
        Planner.__init__(self, mdp, name)
        self.vi = ValueIteration(mdp)
        self.states = self.vi.get_states()
        self.upper_values = self._construct_upper_values()
def draw_state(screen,
               cleanup_mdp,
               state,
               policy=None,
               action_char_dict={},
               show_value=False,
               agent=None,
               draw_statics=False,
               agent_shape=None):
    '''
    Args:
        screen (pygame.Surface)
        grid_mdp (MDP)
        state (State)
        show_value (bool)
        agent (Agent): Used to show value, by default uses VI.
        draw_statics (bool)
        agent_shape (pygame.rect)

    Returns:
        (pygame.Shape)
    '''
    # Make value dict.
    val_text_dict = defaultdict(lambda: defaultdict(float))
    if show_value:
        if agent is not None:
            # Use agent value estimates.
            for s in agent.q_func.keys():
                val_text_dict[s.x][s.y] = agent.get_value(s)
        else:
            # Use Value Iteration to compute value.
            vi = ValueIteration(cleanup_mdp)
            vi.run_vi()
            for s in vi.get_states():
                val_text_dict[s.x][s.y] = vi.get_value(s)

    # Make policy dict.
    policy_dict = defaultdict(lambda: defaultdict(str))
    if policy:
        vi = ValueIteration(cleanup_mdp)
        vi.run_vi()
        for s in vi.get_states():
            policy_dict[s.x][s.y] = policy(s)

    # Prep some dimensions to make drawing easier.
    scr_width, scr_height = screen.get_width(), screen.get_height()
    width_buffer = scr_width / 10.0
    height_buffer = 30 + (scr_height / 10.0)  # Add 30 for title.

    width = cleanup_mdp.width
    height = cleanup_mdp.height

    cell_width = (scr_width - width_buffer * 2) / width
    cell_height = (scr_height - height_buffer * 2) / height
    # goal_locs = grid_mdp.get_goal_locs()
    # lava_locs = grid_mdp.get_lavacc_locs()
    font_size = int(min(cell_width, cell_height) / 4.0)
    reg_font = pygame.font.SysFont("CMU Serif", font_size)
    cc_font = pygame.font.SysFont("Courier", font_size * 2 + 2)

    # room_locs = [(x + 1, y + 1) for room in cleanup_mdp.rooms for (x, y) in room.points_in_room]
    door_locs = set([(door.x + 1, door.y + 1) for door in state.doors])

    # Draw the static entities.
    # print(draw_statics)
    # draw_statics = True
    # if draw_statics:
        # For each row:
    for i in range(width):
        # For each column:
        for j in range(height):

            top_left_point = width_buffer + cell_width * i, height_buffer + cell_height * j
            r = pygame.draw.rect(screen, (46, 49, 49), top_left_point + (cell_width, cell_height), 3)

            # if policy and not grid_mdp.is_wall(i+1, height - j):
            if policy and (i + 1, height - j) in cleanup_mdp.legal_states:
                a = policy_dict[i + 1][height - j]
                if a not in action_char_dict:
                    text_a = a
                else:
                    text_a = action_char_dict[a]
                text_center_point = int(top_left_point[0] + cell_width / 2.0 - 10), int(
                    top_left_point[1] + cell_height / 3.0)
                text_rendered_a = cc_font.render(text_a, True, (46, 49, 49))
                screen.blit(text_rendered_a, text_center_point)

            # if show_value and not grid_mdp.is_wall(i+1, grid_mdp.height - j):
            if show_value and (i + 1, height - j) in cleanup_mdp.legal_states:
                # Draw the value.
                val = val_text_dict[i + 1][height - j]
                color = mdpv.val_to_color(val)
                pygame.draw.rect(screen, color, top_left_point + (cell_width, cell_height), 0)
                # text_center_point = int(top_left_point[0] + cell_width/2.0 - 10), int(top_left_point[1] + cell_height/7.0)
                # text = str(round(val,2))
                # text_rendered = reg_font.render(text, True, (46, 49, 49))
                # screen.blit(text_rendered, text_center_point)

            # if grid_mdp.is_wall(i+1, grid_mdp.height - j):
            if (i + 1, height - j) not in cleanup_mdp.legal_states:
                # Draw the walls.
                top_left_point = width_buffer + cell_width * i + 5, height_buffer + cell_height * j + 5
                pygame.draw.rect(screen, (94, 99, 99), top_left_point + (cell_width - 10, cell_height - 10), 0)

            if (i + 1, height - j) in door_locs:
                # Draw door
                # door_color = (66, 83, 244)
                door_color = (0, 0, 0)
                top_left_point = width_buffer + cell_width * i + 5, height_buffer + cell_height * j + 5
                pygame.draw.rect(screen, door_color, top_left_point + (cell_width - 10, cell_height - 10), 0)

            else:
                room = cleanup_mdp.check_in_room(state.rooms, i + 1 - 1, height - j - 1)  # Minus 1 for inconsistent x, y
                if room:
                    top_left_point = width_buffer + cell_width * i + 5, height_buffer + cell_height * j + 5
                    room_rgb = _get_rgb(room.color)
                    pygame.draw.rect(screen, room_rgb, top_left_point + (cell_width - 10, cell_height - 10), 0)

            block = cleanup_mdp.find_block(state.blocks, i + 1 - 1, height - j - 1)
            # print(state)
            # print(block)
            if block:
                circle_center = int(top_left_point[0] + cell_width / 2.0), int(top_left_point[1] + cell_height / 2.0)
                block_rgb = _get_rgb(block.color)
                pygame.draw.circle(screen, block_rgb, circle_center, int(min(cell_width, cell_height) / 4.0))

            # Current state.
            if not show_value and (i + 1, height - j) == (state.x + 1, state.y + 1) and agent_shape is None:
                tri_center = int(top_left_point[0] + cell_width / 2.0), int(top_left_point[1] + cell_height / 2.0)
                agent_shape = _draw_agent(tri_center, screen, base_size=min(cell_width, cell_height) / 2.5 - 8)

    if agent_shape is not None:
        # Clear the old shape.
        pygame.draw.rect(screen, (255, 255, 255), agent_shape)
        top_left_point = width_buffer + cell_width * ((state.x + 1) - 1), height_buffer + cell_height * (
                height - (state.y + 1))
        tri_center = int(top_left_point[0] + cell_width / 2.0), int(top_left_point[1] + cell_height / 2.0)

        # Draw new.
        # if not show_value or policy is not None:
        agent_shape = _draw_agent(tri_center, screen, base_size=min(cell_width, cell_height) / 2.5 - 16)

    pygame.display.flip()

    return agent_shape
Example #38
def visualize_options_grid(grid_mdp, action_abstr, scr_width=720, scr_height=720):
    '''
    Args:
        grid_mdp (GridWorldMDP)
        action_abstr (ActionAbstraction)
    '''
    pygame.init()
    title_font = pygame.font.SysFont("CMU Serif", 32)
    small_font = pygame.font.SysFont("CMU Serif", 22)

    if len(action_abstr.get_actions()) == 0:
        print("Options Error: 0 options found. Can't visualize.")
        sys.exit(0)

    if isinstance(grid_mdp, MDPDistribution):
        goal_locs = set([])
        for m in grid_mdp.get_all_mdps():
            for g in m.get_goal_locs():
                goal_locs.add(g)
        grid_mdp = grid_mdp.sample()
    else:
        goal_locs = grid_mdp.get_goal_locs()

    # Pygame init.  
    screen = pygame.display.set_mode((scr_width, scr_height))
    pygame.init()
    screen.fill((255, 255, 255))
    pygame.display.update()
    mdp_visualizer._draw_title_text(grid_mdp, screen)
    option_text_point = scr_width / 2.0 - (14*7), 18*scr_height / 20.0

    # Setup states to compute option init/term funcs.
    state_dict = defaultdict(lambda : defaultdict(None))
    vi = ValueIteration(grid_mdp)
    state_space = vi.get_states()
    for s in state_space:
        state_dict[s.x][s.y] = s

    # Draw inital option.
    option_index = 0
    opt_str = "Option " + str(option_index + 1) + " of " + str(len(action_abstr.get_actions())) # + ":" + str(next_option)
    option_text = title_font.render(opt_str, True, (46, 49, 49))
    screen.blit(option_text, option_text_point)
    next_option = action_abstr.get_actions()[option_index]
    visualize_option(screen, grid_mdp, state_dict, option=next_option)

    # Initiation rect and text.
    option_text = small_font.render("Init: ", True, (46, 49, 49))
    screen.blit(option_text, (40, option_text_point[1]))
    pygame.draw.rect(screen, colors[0], (90, option_text_point[1]) + (24, 24))

    # Terminal rect and text.
    option_text = small_font.render("Term: ", True, (46, 49, 49))
    screen.blit(option_text, (scr_width - 150, option_text_point[1]))
    pygame.draw.rect(screen, colors[1], (scr_width - 80, option_text_point[1]) + (24, 24))
    pygame.display.flip()

    # Keep updating options every space press.
    done = False
    while not done:
        # Check for key presses.
        for event in pygame.event.get():
            if event.type == QUIT or (event.type == KEYDOWN and event.key == K_ESCAPE):
                # Quit.
                pygame.quit()
                sys.exit()
            if event.type == KEYDOWN and event.key == K_RIGHT:
                # Toggle to the next option.
                option_index = (option_index + 1) % len(action_abstr.get_actions())
            elif event.type == KEYDOWN and event.key == K_LEFT:
                # Go to the previous option.
                option_index = (option_index - 1) % len(action_abstr.get_actions())
                if option_index < 0:
                    option_index = len(action_abstr.get_actions()) - 1

            next_option = action_abstr.get_actions()[option_index]
            visualize_option(screen, grid_mdp, state_dict, option=next_option, goal_locs=goal_locs)
            pygame.draw.rect(screen, (255, 255, 255), (130, option_text_point[1]) + (scr_width-290 , 50))
            opt_str = "Option " + str(option_index + 1) + " of " + str(len(action_abstr.get_actions())) # + ":" + str(next_option)
            option_text = title_font.render(opt_str, True, (46, 49, 49))
            screen.blit(option_text, option_text_point)