def info_sa_visualize_abstr(mdp, demo_policy_lambda, beta=2.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float): Rate-distortion trade-off parameter passed to info_sa.
        is_deterministic_ib (bool): If True, run the deterministic IB variant.
        is_agent_in_control (bool): If True, the agent (not the demonstrator) controls the MDP.

    Summary:
        Visualizes the state abstraction found by info_sa using pygame.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

    vi = ValueIteration(mdp)

    # BUG FIX: converted Python 2 print statements (a SyntaxError under
    # Python 3, and inconsistent with the print() calls elsewhere in the file).
    print("\t|S|", vi.get_num_states())
    print("\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states())

    from simple_rl.abstraction.state_abs.sa_helpers import visualize_state_abstr_grid
    visualize_state_abstr_grid(mdp, crisp_s_phi)
def main():
    '''Solve an LTL grid world task with VI, print the plan, and dump visited states to a TSV file.'''
    # Atomic propositions mapped to grid coordinates, and the LTL task.
    ap_map = {'a': (2, 2), 'b': (6, 3), 'c': (5, 3), 'd': (4, 2)}
    ltlformula = 'F (b & Fa)'

    # Setup MDP, Agents.
    mdp = LTLGridWorldMDP(ltltask=ltlformula, ap_map=ap_map, width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    mdp.automata.subproblem_flag = 0
    mdp.automata.subproblem_stay = 1
    mdp.automata.subproblem_goal = 0

    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])
    print(ltlformula)

    # BUG FIX: file handle was opened and closed manually; `with` guarantees
    # the handle is closed even if a write raises.
    # NOTE(review): hard-coded user-specific output path — consider parameterizing.
    with open('/Users/romapatel/Desktop/actions.tsv', 'w+') as f:
        for item in state_seq:
            f.write(str(item) + '\n')

    model = None
    ltl_visualiser(model)
class StochasticSAPolicy(object):
    '''Policy that, in each abstract state, samples among the ground-optimal actions
    in proportion to how many ground states in the cluster prefer each action.'''

    def __init__(self, state_abstr, mdp):
        '''
        Args:
            state_abstr (StateAbstraction)
            mdp (simple_rl.MDP)
        '''
        self.state_abstr = state_abstr
        self.mdp = mdp
        self.vi = ValueIteration(mdp)
        self.vi.run_vi()

    def policy(self, state):
        '''
        Args:
            (simple_rl.State)

        Returns:
            (str): An action

        Summary:
            Chooses an action among the optimal actions in the cluster. That is, roughly:

                \pi(a \mid s_a) \sim Pr_{s_g \in s_a} (a = a^*(s_a))
        '''
        abstr_state = self.state_abstr.phi(state)
        ground_states = self.state_abstr.get_ground_states_in_abs_state(abstr_state)

        # Empirical distribution over each ground state's VI-optimal action.
        action_distr = defaultdict(float)
        for s in ground_states:
            a = self.vi.policy(s)
            action_distr[a] += 1.0 / len(ground_states)

        # BUG FIX: dict views are not indexable under Python 3
        # (`action_distr.keys()[...]` raised TypeError); materialize both
        # views as lists so keys and probabilities stay aligned.
        actions = list(action_distr.keys())
        probs = list(action_distr.values())
        sampled_distr = np.random.multinomial(1, probs).tolist()
        indices = [i for i, x in enumerate(sampled_distr) if x > 0]

        return actions[indices[0]]
def main():
    # Atomic propositions mapped to grid coordinates, and the LTL task.
    ap_map = {'a': (2, 2), 'b': (6, 3), 'c': (5, 3), 'd': (4, 2)}
    print('Automic propositions, ', ap_map)
    ltlformula = 'F (b & Fa)'
    print('LTL Formula, ', ltlformula)

    # Setup MDP, Agents.
    print('translatinggg')
    # Translate a formula with Spot (presumably to a Buechi automaton, per the
    # 'BA' argument) and display it; "v" "" concatenates to the string "v".
    a = spot.translate('(a U b) & GFc & GFd', 'BA', 'complete')
    a.show("v" "")
    return
    # NOTE(review): everything below is unreachable because of the `return`
    # above — it looks like a debugging early-exit left in place; confirm
    # whether the planning code should be restored or deleted.
    mdp = LTLGridWorldMDP(ltltask=ltlformula, ap_map=ap_map, width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    mdp.automata.subproblem_flag = 0
    mdp.automata.subproblem_stay = 1
    mdp.automata.subproblem_goal = 0

    value_iter = ValueIteration(mdp, sample_rate=5)
    value_iter.run_vi()

    # Value Iteration.
    print('Value iteration')
    action_seq, state_seq = value_iter.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for i in range(len(action_seq)):
        print("\t", action_seq[i], state_seq[i])
def update_policy(self):
    '''Recompute this agent's policy by running value iteration on the
    average MDP of the currently active MDP distribution.'''
    average_mdp = compute_avg_mdp(self.active_mdp_distr)
    planner = ValueIteration(average_mdp, delta=0.0001, max_iterations=1000, sample_rate=5)
    planner.run_vi()
    self.policy = planner.policy
def main():
    '''Visualize an 11x11 Four Rooms MDP in the mode chosen on the command line.'''
    # Setup MDP, Agents.
    mdp = FourRoomMDP(11, 11, goal_locs=[(11, 11)], gamma=0.9, step_cost=0.0)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.4)

    # Choose viz type.
    # BUG FIX: the original immediately overwrote `viz` with the hard-coded
    # value "learning", silently discarding the command-line choice from
    # parse_args(). The override (a likely debugging leftover) is removed.
    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
    elif viz == "interactive":
        mdp.visualize_interaction()
def main():
    '''Visualize a 5x5 Four Rooms MDP in the mode chosen on the command line.'''
    # Setup MDP, Agents.
    mdp = FourRoomMDP(5, 5, goal_locs=[(5, 5)], gamma=0.99, step_cost=0.01)
    # mdp = make_grid_world_from_file("octogrid.txt", num_goals=12, randomize=False)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.5)
    rm_agent = RMaxAgent(mdp.get_actions())  # NOTE(review): constructed but never used below.

    # BUG FIX: the original overwrote `viz` with the hard-coded "learning",
    # discarding the command-line choice from parse_args(); the override
    # (a likely debugging leftover) is removed.
    viz = parse_args()

    if viz == "value":
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # Solve problem and show agent interaction.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # Run experiment and make plot.
        mdp.visualize_learning(ql_agent)
def __init__(self, mdp, lower_values_init, upper_values_init, tau=10., name='BRTDP'):
    '''
    Args:
        mdp (MDP): underlying MDP to plan in
        lower_values_init (defaultdict): lower bound initialization on the value function
        upper_values_init (defaultdict): upper bound initialization on the value function
        tau (float): scaling factor used to decide when the value-function bounds are tight enough
        name (str): name of the planner
    '''
    Planner.__init__(self, mdp, name)

    self.lower_values = lower_values_init
    self.upper_values = upper_values_init

    # Borrow the ValueIteration machinery solely to enumerate states and
    # build the matrix of transition probabilities.
    value_iter = ValueIteration(mdp, sample_rate=1000)
    self.states = value_iter.get_states()
    value_iter._compute_matrix_from_trans_func()
    self.trans_dict = value_iter.trans_dict

    # Gap between the bounds at the initial state, scaled down by tau.
    initial_gap = self.upper_values[self.mdp.init_state] - self.lower_values[self.mdp.init_state]
    self.max_diff = initial_gap / tau
def get_optimal_policies(environment):
    '''
    Args:
        environment (simple_rl.MDPDistribution)

    Returns:
        (list): FixedPolicyAgents — the true optimal policy, the stochastic
        abstract policy, and the optimal deterministic abstract policy.
    '''
    # Build the approximate-Q state abstraction.
    approx_qds_test = get_sa(environment, indic_func=ind_funcs._q_eps_approx_indicator, epsilon=0.05)

    # True optimal policy on the ground environment.
    ground_vi = ValueIteration(environment)
    ground_vi.run_vi()
    opt_agent = FixedPolicyAgent(ground_vi.policy, "$\pi^*$")

    # Optimal deterministic policy over the abstraction.
    abstr_vi = AbstractValueIteration(environment, state_abstr=approx_qds_test, sample_rate=30)
    abstr_vi.run_vi()
    opt_det_agent = FixedPolicyAgent(abstr_vi.policy, name="$\pi_{\phi}^*$")

    # Stochastic policy induced by the abstraction.
    stoch_policy_obj = StochasticSAPolicy(approx_qds_test, environment)
    stoch_agent = FixedPolicyAgent(stoch_policy_obj.policy, "$\pi(a \mid s_\phi )$")

    return [opt_agent, stoch_agent, opt_det_agent]
def main():
    '''Visualize a small lava grid world in the mode chosen on the command line.'''
    # Setup MDP, Agents.
    mdp = GridWorldMDP(width=4, height=3, init_loc=(1, 1), goal_locs=[(4, 3)], lava_locs=[(4, 2)], gamma=0.95, walls=[(2, 2)], slip_prob=0.1)
    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=0.2, alpha=0.2)

    # Choose viz type.
    # BUG FIX: the original immediately overwrote `viz` with the hard-coded
    # value "value", silently discarding the command-line choice from
    # parse_args(). The override (a likely debugging leftover) is removed.
    viz = parse_args()

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Viz policy
        value_iter = ValueIteration(mdp)
        value_iter.run_vi()
        policy = value_iter.policy
        mdp.visualize_policy(policy)
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        run_single_agent_on_mdp(ql_agent, mdp, episodes=500, steps=200)
        mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
    elif viz == "interactive":
        # Press <1>, <2>, <3>, and so on to execute action 1, action 2, etc.
        mdp.visualize_interaction()
def main():
    '''Build a Trench OOMDP, solve it with value iteration, and print the state count.'''
    # Setup MDP, Agents.
    size = 5
    agent = {"x": 1, "y": 1, "dx": 1, "dy": 0, "dest_x": size, "dest_y": size, "has_block": 0}
    blocks = [{"x": size, "y": 1}]
    # BUG FIX (py3 port): `xrange` -> `range`, and `(size + 1) / 2` -> `//`
    # to keep the Python 2 integer-division semantics (a float y-coordinate
    # would not match integer grid cells under true division).
    lavas = [{"x": x, "y": y} for x, y in map(lambda z: (z + 1, (size + 1) // 2), range(size))]
    mdp = TrenchOOMDP(size, size, agent, blocks, lavas)
    # NOTE(review): `QLearnerAgent` is the legacy class name (elsewhere the
    # file uses QLearningAgent); kept as-is since the import is out of view.
    ql_agent = QLearnerAgent(actions=mdp.get_actions())
    rand_agent = RandomAgent(actions=mdp.get_actions())

    # Run experiment and make plot.
    # run_agents_on_mdp([ql_agent, rand_agent], mdp, instances=30, episodes=250, steps=250)
    vi = ValueIteration(mdp, delta=0.0001, max_iterations=5000)
    iters, val = vi.run_vi()
    # BUG FIX: converted Python 2 print statements to print() calls.
    print(" done.")
    states = vi.get_states()
    num_states = len(states)
    print(num_states, states)
def get_l1_policy(start_room=None, goal_room=None, mdp=None, starting_items=None, goal_items=None, actions=None, doors=None, rooms=None):
    '''
    Args:
        start_room, goal_room, starting_items, goal_items, actions, doors, rooms:
            Used to construct a FourRoomL1MDP when mdp is None.
        mdp: Optional pre-built L1 MDP.

    Returns:
        (defaultdict): Maps each state on the VI plan to the planned action.
    '''
    if mdp is None:
        mdp = FourRoomL1MDP(start_room, goal_room, starting_items=starting_items, goal_items=goal_items, actions=actions, doors=doors, rooms=rooms)

    vi = ValueIteration(mdp)
    vi.run_vi()

    policy = defaultdict()
    action_seq, state_seq = vi.plan(mdp.init_state)

    # BUG FIX: converted Python 2 print statements (inconsistent with the
    # print() calls used elsewhere in the file, SyntaxError under Python 3).
    print('Plan for {}:'.format(mdp))
    for i in range(len(action_seq)):
        print("\tpi[{}] -> {}".format(state_seq[i], action_seq[i]))
        policy[state_seq[i]] = action_seq[i]
    return policy
def compute_value_iteration_results(self, sample_rate):
    '''
    Args:
        sample_rate (int): Number of transition samples per (s, a) used by ValueIteration.

    Returns:
        (ValueIteration): A planner that has already been run on this MDP.

    Summary:
        Lazily (re)runs value iteration: only when no cached planner exists
        or the cached policy has been invalidated.
    '''
    # If value iteration was run previously, don't re-run it.
    # IDIOM FIX: `== True` comparison replaced with a direct truth test.
    if self.value_iter is None or self._policy_invalidated:
        self.value_iter = ValueIteration(self, sample_rate=sample_rate)
        _ = self.value_iter.run_vi()
        self._policy_invalidated = False
    return self.value_iter
def __init__(self, mdp, name='MonotoneUpperBound'):
    '''
    Args:
        mdp (MDP): MDP to relax and solve.
        name (str): Planner name.
            NOTE(review): default is 'MonotoneUpperBound' even though this
            initializer builds lower-bound values — looks like a copy-paste
            default; confirm before changing.
    '''
    # Solve a deterministic relaxation of the MDP with value iteration.
    deterministic_mdp = MonotoneLowerBound._construct_deterministic_relaxation_mdp(mdp)
    Planner.__init__(self, deterministic_mdp, name)
    self.vi = ValueIteration(deterministic_mdp)
    self.states = self.vi.get_states()
    self.vi._compute_matrix_from_trans_func()
    self.vi.run_vi()
    self.lower_values = self._construct_lower_values()
def main():
    '''Solve a tiny 2x1 slippery grid world with VI and report the converged value.'''
    grid = GridWorldMDP(width=2, height=1, init_loc=(1, 1), goal_locs=[(2, 1)], slip_prob=0.5, gamma=0.5)
    planner = ValueIteration(grid)
    iters, value = planner.run_vi()
    print("value=", value)
def run_value_iteration(self):
    """Runs value iteration (if needed).

    Returns:
        ValueIteration object that has been run on this MDP.
    """
    # If value iteration was run previously, don't re-run it.
    # IDIOM FIX: `== True` comparison replaced with a direct truth test.
    if self._policy_invalidated:
        self.value_iter = ValueIteration(self, sample_rate=1)
        _ = self.value_iter.run_vi()
        self._policy_invalidated = False
    return self.value_iter
def main():
    '''Solve a 6x6 slippery grid world with value iteration and print the plan.'''
    # Setup MDP.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)

    # Value Iteration.
    planner = ValueIteration(mdp, sample_rate=5)
    planner.run_vi()

    # Plan from the initial state and print each (action, state) step.
    action_seq, state_seq = planner.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for idx, act in enumerate(action_seq):
        print("\t", act, state_seq[idx])
def get_l1_policy(domain):
    '''
    Args:
        domain: L1 MDP to solve.

    Returns:
        (defaultdict): Maps each state on the VI plan to its planned action.
    '''
    planner = ValueIteration(domain, sample_rate=1)
    planner.run_vi()

    policy = defaultdict()
    plan_actions, plan_states = planner.plan(domain.init_state)

    print('Plan for {}:'.format(domain))
    for step, act in enumerate(plan_actions):
        print("\tpi[{}] -> {}\n".format(plan_states[step], act))
        policy[plan_states[step]] = act
    return policy
def get_l1_policy(start_room=None, goal_room=None, mdp=None):
    '''
    Args:
        start_room, goal_room: Used to build a CubeL1MDP when mdp is None.
        mdp: Optional pre-built L1 MDP.

    Returns:
        (defaultdict): Maps each state on the VI plan to its planned action.
    '''
    if mdp is None:
        mdp = CubeL1MDP(start_room, goal_room)

    solver = ValueIteration(mdp)
    solver.run_vi()

    policy = defaultdict()
    plan_actions, plan_states = solver.plan(mdp.init_state)

    print('Plan for {}:'.format(mdp))
    for idx, act in enumerate(plan_actions):
        print("\tpi[{}] -> {}".format(plan_states[idx], act))
        policy[plan_states[idx]] = act
    return policy
def main():
    '''Time vanilla VI against abstract VI over growing grid sizes and log the results.'''
    # Grab experiment params.

    # Switch between Upworld and Trench
    mdp_class = "upworld"
    # mdp_class = "trench"
    grid_lim = 20 if mdp_class == 'upworld' else 7
    gamma = 0.95
    vanilla_file = "vi.csv"
    sa_file = "vi-$\phi_{Q_d^*}.csv"
    file_prefix = "results/planning-" + mdp_class + "/"
    clear_files(dir_name=file_prefix)

    # BUG FIX (py3 port): `xrange` -> `range`.
    for grid_dim in range(3, grid_lim):
        # ======================
        # == Make Environment ==
        # ======================
        environment = make_mdp.make_mdp(mdp_class=mdp_class, grid_dim=grid_dim)
        environment.set_gamma(gamma)

        # =======================
        # == Make Abstractions ==
        # =======================
        sa_qds = get_sa(environment, indic_func=ind_funcs._q_disc_approx_indicator, epsilon=0.01)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(environment, delta=0.0001, sample_rate=15)
        sa_vi = AbstractValueIteration(ground_mdp=environment, state_abstr=sa_qds)

        # BUG FIX: converted Python 2 print statements, and replaced
        # time.clock() (removed in Python 3.8) with time.perf_counter().
        print("Running VIs.")
        start_time = time.perf_counter()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(time.perf_counter() - start_time, 2)

        start_time = time.perf_counter()
        sa_iters, sa_val = sa_vi.run_vi()
        sa_time = round(time.perf_counter() - start_time, 2)

        print("vanilla", vanilla_iters, vanilla_val, vanilla_time)
        print("sa:", sa_iters, sa_val, sa_time)

        write_datum(file_prefix + "iters/" + vanilla_file, vanilla_iters)
        write_datum(file_prefix + "iters/" + sa_file, sa_iters)
        write_datum(file_prefix + "times/" + vanilla_file, vanilla_time)
        write_datum(file_prefix + "times/" + sa_file, sa_time)
def main():
    '''Plan in a 6x6 slippery grid world with MCTS (VI plan computed for comparison).'''
    # Setup MDP and planners.
    mdp = GridWorldMDP(width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    value_iter = ValueIteration(mdp, sample_rate=5)
    mcts = MCTS(mdp, num_rollouts_per_step=50)
    # _, val = value_iter.run_vi() # Value Iteration.

    vi_action_seq, vi_state_seq = value_iter.plan(mdp.get_init_state())
    mcts_action_seq, mcts_state_seq = mcts.plan(mdp.get_init_state())

    # Print the MCTS plan step by step.
    print("Plan for", mdp)
    for step, act in enumerate(mcts_action_seq):
        print("\t", act, mcts_state_seq[step])
def get_l1_policy(oomdp=None):
    '''
    Args:
        oomdp: Optional L1 OOMDP; a default TaxiL1OOMDP is built when None.

    Returns:
        (defaultdict): Maps each state on the VI plan to its planned action.
    '''
    if oomdp is None:
        oomdp = TaxiL1OOMDP()

    planner = ValueIteration(oomdp, sample_rate=1)
    planner.run_vi()

    policy = defaultdict()
    plan_actions, plan_states = planner.plan(oomdp.init_state)

    print('Plan for {}:'.format(oomdp))
    for idx, act in enumerate(plan_actions):
        print("\tpi[{}] -> {}\n".format(plan_states[idx], act))
        policy[plan_states[idx]] = act
    return policy
def _make_mini_mdp_option_policy(mini_mdp):
    '''
    Args:
        mini_mdp (MDP)

    Returns:
        (tuple): (policy lambda for the option, the ValueIteration planner that produced it)
    '''
    # Solve the MDP defined by the terminal abstract state.
    planner = ValueIteration(mini_mdp, delta=0.005, max_iterations=500, sample_rate=20)
    planner.run_vi()

    # Freeze the VI policy into a dictionary-backed policy object.
    frozen = PolicyFromDict(make_dict_from_lambda(planner.policy, planner.get_states()))

    return frozen.get_action, planner
def make_near_optimal_phi_relative_options(mdp, state_abstr, method='optimal', num_rand_opts=0, **kwargs):
    """
    Args:
        mdp (MDP)
        state_abstr (StateAbstraction)
        method (str): 'eps-greedy' builds epsilon-greedy options (plus random
            ones); any other value builds one optimal option per abstract state.
        num_rand_opts (int): Number of extra random-policy options per abstract
            state (used only with 'eps-greedy').

    Returns:
        (tuple): (list of Options, list of "optimal" Options)
    """
    # Get the optimal Q function
    from planning.OptionsMDPValueIterationClass import OptionsMDPValueIteration
    from data_structs.OptionsMDPClass import OptionsMDP
    if isinstance(mdp, OptionsMDP):
        value_iter = OptionsMDPValueIteration(mdp, sample_rate=20)
    else:
        value_iter = ValueIteration(mdp, sample_rate=10)
    value_iter.run_vi()

    options = []
    # NOTE(review): optimal_options is never populated and is returned empty —
    # confirm the intent (possibly o_star was meant to be appended here).
    optimal_options = []
    for s_phi in state_abstr.get_abs_states():
        init_predicate = EqPredicate(y=s_phi, func=state_abstr.phi)
        term_predicate = NeqPredicate(y=s_phi, func=state_abstr.phi)
        o_star = Option(init_predicate=init_predicate,
                        term_predicate=term_predicate,
                        policy=lambda s: value_iter.policy(s))

        # BUG FIX: the original appended o_star TWICE when method == 'optimal'
        # (once in an initial `if method == 'optimal'` block, and again in the
        # `else` of the eps-greedy check). Each abstract state now contributes
        # exactly one option of the requested kind.
        if method == 'eps-greedy':
            eps = kwargs['eps']
            eps_greedy_policy = get_eps_greedy_policy(eps, value_iter.policy, mdp.get_actions())
            o_eps = Option(init_predicate=init_predicate,
                           term_predicate=term_predicate,
                           policy=eps_greedy_policy)
            for _ in range(num_rand_opts):
                # The lambda ignores its state argument and samples uniformly.
                o_rand = Option(init_predicate=init_predicate,
                                term_predicate=term_predicate,
                                policy=lambda x: random.choice(mdp.get_actions()))
                options.append(o_rand)
            options.append(o_eps)
        else:
            options.append(o_star)

    return options, optimal_options
def main():
    '''Plan in an LTL grid world for the task F (b & Fa) and print the plan.'''
    # Atomic proposition locations and the LTL task.
    ap_map = {'a': (2, 2), 'b': (6, 3), 'c': (5, 3), 'd': (4, 2)}
    ltlformula = 'F (b & Fa)'

    # Setup MDP; automaton subproblem flags (semantics defined in LTLGridWorldMDP).
    mdp = LTLGridWorldMDP(ltltask=ltlformula, ap_map=ap_map, width=6, height=6, goal_locs=[(6, 6)], slip_prob=0.2)
    mdp.automata.subproblem_flag = 0
    mdp.automata.subproblem_stay = 1
    mdp.automata.subproblem_goal = 0

    # Value Iteration.
    planner = ValueIteration(mdp, sample_rate=5)
    planner.run_vi()

    action_seq, state_seq = planner.plan(mdp.get_init_state())

    print("Plan for", mdp)
    for idx, act in enumerate(action_seq):
        print("\t", act, state_seq[idx])
def get_l1_policy(start_room=None, goal_room=None, mdp=None):
    '''
    Args:
        start_room, goal_room: Used to build a FourRoomL1MDP when mdp is None.
        mdp: Optional pre-built L1 MDP.

    Returns:
        (defaultdict): Maps each state on the VI plan to the planned action.
    '''
    if mdp is None:
        # room 2, light off =0, light on =1
        mdp = FourRoomL1MDP(start_room, goal_room, starting_items=[2, 0], goal_items=[2, 1])

    vi = ValueIteration(mdp)
    vi.run_vi()

    policy = defaultdict()
    action_seq, state_seq = vi.plan(mdp.init_state)

    # BUG FIX: converted Python 2 print statements (SyntaxError under
    # Python 3, inconsistent with print() calls elsewhere in the file).
    print('Plan for {}:'.format(mdp))
    for i in range(len(action_seq)):
        print("\tpi[{}] -> {}".format(state_seq[i], action_seq[i]))
        policy[state_seq[i]] = action_seq[i]
    return policy
def __init__(self, mdp, name='MonotoneUpperBound'):
    '''
    Args:
        mdp (MDP): MDP whose deterministic relaxation is solved with VI.
        name (str): Planner name.
            NOTE(review): the default 'MonotoneUpperBound' looks like a
            copy-paste default given this builds lower_values; confirm.
    '''
    slip_free_mdp = MonotoneLowerBound._construct_deterministic_relaxation_mdp(mdp)
    Planner.__init__(self, slip_free_mdp, name)

    # Enumerate states, build the transition matrix, then solve to
    # convergence before extracting the lower-bound values.
    self.vi = ValueIteration(slip_free_mdp)
    self.states = self.vi.get_states()
    self.vi._compute_matrix_from_trans_func()
    self.vi.run_vi()
    self.lower_values = self._construct_lower_values()
def generate_agent(mdp_class, data_loc, mdp_parameters, visualize=False):
    '''
    Args:
        mdp_class (str): Key identifying the MDP type for make_mdp.
        data_loc (str): Subdirectory of models/ used to cache the solved agent.
        mdp_parameters (dict): Parameters forwarded to make_custom_mdp.
        visualize (bool): If True, show the agent acting in the MDP.

    Summary:
        Loads a cached (mdp, VI agent) pair from disk when available;
        otherwise builds the MDP, runs value iteration, and caches the result.
    '''
    try:
        with open('models/' + data_loc + '/vi_agent.pickle', 'rb') as f:
            mdp_agent, vi_agent = pickle.load(f)
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; `except Exception` keeps the
        # best-effort cache-miss behavior without trapping exit signals.
        mdp_agent = make_mdp.make_custom_mdp(mdp_class, mdp_parameters)
        vi_agent = ValueIteration(mdp_agent, sample_rate=1)
        vi_agent.run_vi()

        with open('models/' + data_loc + '/vi_agent.pickle', 'wb') as f:
            pickle.dump((mdp_agent, vi_agent), f)

    # Visualize agent
    if visualize:
        fixed_agent = FixedPolicyAgent(vi_agent.policy)
        mdp_agent.visualize_agent(fixed_agent)
        mdp_agent.reset()  # reset the current state to the initial state
        mdp_agent.visualize_interaction()
def main():
    '''Build a grid world from CLI args and visualize it in the requested mode.'''
    args = parse_args()
    mdp = generate_MDP(args.width, args.height, args.i_loc, args.g_loc, args.l_loc, args.gamma, args.Walls, args.slip)

    ql_agent = QLearningAgent(mdp.get_actions(), epsilon=args.epsilon, alpha=args.alpha, explore=args.explore, anneal=args.anneal)
    viz = args.mode

    if viz == "value":
        # --> Color corresponds to higher value.
        # Run experiment and make plot.
        mdp.visualize_value()
    elif viz == "policy":
        # Solve with VI, then render both the policy and the value function.
        planner = ValueIteration(mdp)
        planner.run_vi()
        mdp.visualize_policy_values(
            (lambda state: planner.policy(state)),
            (lambda state: planner.value_func[state]))
    elif viz == "agent":
        # --> Press <spacebar> to advance the agent.
        # First let the agent solve the problem and then visualize the agent's resulting policy.
        print("\n", str(ql_agent), "interacting with", str(mdp))
        rand_agent = RandomAgent(actions=mdp.get_actions())
        run_agents_on_mdp([rand_agent, ql_agent], mdp, open_plot=True, episodes=60, steps=200, instances=5, success_reward=1)
        # mdp.visualize_agent(ql_agent)
    elif viz == "learning":
        # --> Press <r> to reset.
        # Show agent's interaction with the environment.
        mdp.visualize_learning(ql_agent, delay=0.005, num_ep=500, num_steps=200)
def info_sa_compare_policies(mdp, demo_policy_lambda, beta=3.0, is_deterministic_ib=False, is_agent_in_control=False):
    '''
    Args:
        mdp (simple_rl.MDP)
        demo_policy_lambda (lambda : simple_rl.State --> str)
        beta (float)
        is_deterministic_ib (bool): If True, run DIB, else IB.
        is_agent_in_control (bool): If True, runs the DIB in agent_in_control.py instead.

    Summary:
        Runs info_sa and compares the value of the found policy with the demonstrator policy.
    '''
    if is_agent_in_control:
        # Run info_sa with the agent controlling the MDP.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = agent_in_control.run_agent_in_control_info_sa(mdp, demo_policy_lambda, rounds=100, iters=500, beta=beta, is_deterministic_ib=is_deterministic_ib)
    else:
        # Run info_sa.
        pmf_s_phi, phi_pmf, abstr_policy_pmf = run_info_sa(mdp, demo_policy_lambda, iters=500, beta=beta, convergence_threshold=0.00001, is_deterministic_ib=is_deterministic_ib)

    # Make demonstrator agent and random agent.
    demo_agent = FixedPolicyAgent(demo_policy_lambda, name="$\\pi_d$")
    rand_agent = RandomAgent(mdp.get_actions(), name="$\\pi_u$")

    # Make abstract agent.
    lambda_abstr_policy = get_lambda_policy(abstr_policy_pmf)
    prob_s_phi = ProbStateAbstraction(phi_pmf)
    crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)
    abstr_agent = AbstractionWrapper(FixedPolicyAgent, state_abstr=crisp_s_phi, agent_params={"policy": lambda_abstr_policy, "name": "$\\pi_\\phi$"}, name_ext="")

    # Run.
    run_agents_on_mdp([demo_agent, abstr_agent, rand_agent], mdp, episodes=1, steps=1000)

    non_zero_abstr_states = [x for x in pmf_s_phi.values() if x > 0]

    # Print state space sizes.
    # BUG FIX: converted Python 2 print statements (SyntaxError under
    # Python 3, inconsistent with the print() calls elsewhere in the file).
    demo_vi = ValueIteration(mdp)
    print("\nState Spaces Sizes:")
    print("\t|S| =", demo_vi.get_num_states())
    print("\tH(S_\\phi) =", entropy(pmf_s_phi))
    print("\t|S_\\phi|_crisp =", crisp_s_phi.get_num_abstr_states())
    print("\tdelta_min =", min(non_zero_abstr_states))
    print("\tnum non zero states =", len(non_zero_abstr_states))
    print()
class MonotoneUpperBound(Planner):
    '''Planner whose value estimates are the constant 1 / (1 - gamma) for every state.'''

    def __init__(self, mdp, name='MonotoneUpperBound'):
        '''
        Args:
            mdp (MDP)
            name (str)
        '''
        Planner.__init__(self, mdp, name)
        # ValueIteration is used here only to enumerate the state space;
        # run_vi() is never called.
        self.vi = ValueIteration(mdp)
        self.states = self.vi.get_states()
        self.upper_values = self._construct_upper_values()

    def _construct_upper_values(self):
        '''Assign every enumerated state the constant 1 / (1 - gamma).'''
        bound = 1. / (1. - self.gamma)
        values = defaultdict()
        for state in self.states:
            values[state] = bound
        return values
class MonotoneLowerBound(Planner):
    '''Planner that initializes lower-bound values by running value iteration
    on a deterministic (slip-free) relaxation of the given MDP.'''

    def __init__(self, mdp, name='MonotoneLowerBound'):
        '''
        Args:
            mdp (MDP): MDP to bound.
            name (str): Planner name.
                BUG FIX: the default used to be 'MonotoneUpperBound' — a
                copy-paste slip from the upper-bound planner.
        '''
        relaxed_mdp = MonotoneLowerBound._construct_deterministic_relaxation_mdp(mdp)
        Planner.__init__(self, relaxed_mdp, name)
        self.vi = ValueIteration(relaxed_mdp)
        self.states = self.vi.get_states()
        # Build the transition matrix, then solve to convergence before
        # extracting per-state values.
        self.vi._compute_matrix_from_trans_func()
        self.vi.run_vi()
        self.lower_values = self._construct_lower_values()

    @staticmethod
    def _construct_deterministic_relaxation_mdp(mdp):
        '''Return a deep copy of mdp with its slip probability set to zero.'''
        relaxed_mdp = copy.deepcopy(mdp)
        relaxed_mdp.set_slip_prob(0.0)
        return relaxed_mdp

    def _construct_lower_values(self):
        '''Map each enumerated state to its converged VI value on the relaxed MDP.'''
        values = defaultdict()
        for state in self.states:
            values[state] = self.vi.get_value(state)
        return values
def __init__(self, mdp, name='MonotoneUpperBound'):
    '''
    Args:
        mdp (MDP): MDP whose state space is enumerated via ValueIteration.
        name (str): Planner name.
    '''
    Planner.__init__(self, mdp, name)
    # ValueIteration serves only as a state-space enumerator here;
    # run_vi() is never invoked.
    state_enumerator = ValueIteration(mdp)
    self.vi = state_enumerator
    self.states = state_enumerator.get_states()
    self.upper_values = self._construct_upper_values()
def draw_state(screen, cleanup_mdp, state, policy=None, action_char_dict={}, show_value=False, agent=None, draw_statics=False, agent_shape=None):
    '''
    Args:
        screen (pygame.Surface)
        grid_mdp (MDP)
        state (State)
        show_value (bool)
        agent (Agent): Used to show value, by default uses VI.
        draw_statics (bool)
        agent_shape (pygame.rect)

    Returns:
        (pygame.Shape)

    Summary:
        Renders one frame of the cleanup grid world: cell outlines, policy
        characters, value colors, walls, doors, rooms, blocks, and the agent.
    '''
    # Make value dict.
    val_text_dict = defaultdict(lambda: defaultdict(float))
    if show_value:
        if agent is not None:
            # Use agent value estimates.
            for s in agent.q_func.keys():
                val_text_dict[s.x][s.y] = agent.get_value(s)
        else:
            # Use Value Iteration to compute value.
            vi = ValueIteration(cleanup_mdp)
            vi.run_vi()
            for s in vi.get_states():
                val_text_dict[s.x][s.y] = vi.get_value(s)

    # Make policy dict.
    # NOTE(review): VI is run here purely to enumerate states; a second VI run
    # even when show_value already ran one — potential duplicated work.
    policy_dict = defaultdict(lambda: defaultdict(str))
    if policy:
        vi = ValueIteration(cleanup_mdp)
        vi.run_vi()
        for s in vi.get_states():
            policy_dict[s.x][s.y] = policy(s)

    # Prep some dimensions to make drawing easier.
    scr_width, scr_height = screen.get_width(), screen.get_height()
    width_buffer = scr_width / 10.0
    height_buffer = 30 + (scr_height / 10.0)  # Add 30 for title.
    width = cleanup_mdp.width
    height = cleanup_mdp.height
    cell_width = (scr_width - width_buffer * 2) / width
    cell_height = (scr_height - height_buffer * 2) / height
    # goal_locs = grid_mdp.get_goal_locs()
    # lava_locs = grid_mdp.get_lavacc_locs()
    font_size = int(min(cell_width, cell_height) / 4.0)
    reg_font = pygame.font.SysFont("CMU Serif", font_size)
    cc_font = pygame.font.SysFont("Courier", font_size * 2 + 2)
    # room_locs = [(x + 1, y + 1) for room in cleanup_mdp.rooms for (x, y) in room.points_in_room]
    # Door positions shifted by +1 to match the 1-based cell coordinates used below.
    door_locs = set([(door.x + 1, door.y + 1) for door in state.doors])

    # Draw the static entities.
    # print(draw_statics)
    # draw_statics = True
    # if draw_statics:
    # For each row:
    for i in range(width):
        # For each column:
        for j in range(height):
            top_left_point = width_buffer + cell_width * i, height_buffer + cell_height * j
            r = pygame.draw.rect(screen, (46, 49, 49), top_left_point + (cell_width, cell_height), 3)

            # Policy character for legal (non-wall) cells.
            # if policy and not grid_mdp.is_wall(i+1, height - j):
            if policy and (i + 1, height - j) in cleanup_mdp.legal_states:
                a = policy_dict[i + 1][height - j]
                if a not in action_char_dict:
                    text_a = a
                else:
                    text_a = action_char_dict[a]
                text_center_point = int(top_left_point[0] + cell_width / 2.0 - 10), int(top_left_point[1] + cell_height / 3.0)
                text_rendered_a = cc_font.render(text_a, True, (46, 49, 49))
                screen.blit(text_rendered_a, text_center_point)

            # if show_value and not grid_mdp.is_wall(i+1, grid_mdp.height - j):
            if show_value and (i + 1, height - j) in cleanup_mdp.legal_states:
                # Draw the value.
                val = val_text_dict[i + 1][height - j]
                color = mdpv.val_to_color(val)
                pygame.draw.rect(screen, color, top_left_point + (cell_width, cell_height), 0)
                # text_center_point = int(top_left_point[0] + cell_width/2.0 - 10), int(top_left_point[1] + cell_height/7.0)
                # text = str(round(val,2))
                # text_rendered = reg_font.render(text, True, (46, 49, 49))
                # screen.blit(text_rendered, text_center_point)

            # if grid_mdp.is_wall(i+1, grid_mdp.height - j):
            if (i + 1, height - j) not in cleanup_mdp.legal_states:
                # Draw the walls.
                top_left_point = width_buffer + cell_width * i + 5, height_buffer + cell_height * j + 5
                pygame.draw.rect(screen, (94, 99, 99), top_left_point + (cell_width - 10, cell_height - 10), 0)

            if (i + 1, height - j) in door_locs:
                # Draw door
                # door_color = (66, 83, 244)
                door_color = (0, 0, 0)
                top_left_point = width_buffer + cell_width * i + 5, height_buffer + cell_height * j + 5
                pygame.draw.rect(screen, door_color, top_left_point + (cell_width - 10, cell_height - 10), 0)
            else:
                # Fill the cell with its room color, if it belongs to a room.
                room = cleanup_mdp.check_in_room(state.rooms, i + 1 - 1, height - j - 1)  # Minus 1 for inconsistent x, y
                if room:
                    top_left_point = width_buffer + cell_width * i + 5, height_buffer + cell_height * j + 5
                    room_rgb = _get_rgb(room.color)
                    pygame.draw.rect(screen, room_rgb, top_left_point + (cell_width - 10, cell_height - 10), 0)

            # Draw a block as a colored circle centered in the cell.
            block = cleanup_mdp.find_block(state.blocks, i + 1 - 1, height - j - 1)
            # print(state)
            # print(block)
            if block:
                circle_center = int(top_left_point[0] + cell_width / 2.0), int(top_left_point[1] + cell_height / 2.0)
                block_rgb = _get_rgb(block.color)
                pygame.draw.circle(screen, block_rgb, circle_center, int(min(cell_width, cell_height) / 4.0))

            # Current state.
            if not show_value and (i + 1, height - j) == (state.x + 1, state.y + 1) and agent_shape is None:
                tri_center = int(top_left_point[0] + cell_width / 2.0), int(top_left_point[1] + cell_height / 2.0)
                agent_shape = _draw_agent(tri_center, screen, base_size=min(cell_width, cell_height) / 2.5 - 8)

    if agent_shape is not None:
        # Clear the old shape.
        pygame.draw.rect(screen, (255, 255, 255), agent_shape)
        top_left_point = width_buffer + cell_width * ((state.x + 1) - 1), height_buffer + cell_height * (height - (state.y + 1))
        tri_center = int(top_left_point[0] + cell_width / 2.0), int(top_left_point[1] + cell_height / 2.0)

        # Draw new.
        # if not show_value or policy is not None:
        agent_shape = _draw_agent(tri_center, screen, base_size=min(cell_width, cell_height) / 2.5 - 16)

    pygame.display.flip()

    return agent_shape
def visualize_options_grid(grid_mdp, action_abstr, scr_width=720, scr_height=720):
    '''
    Args:
        grid_mdp (GridWorldMDP)
        action_abstr (ActionAbstraction)

    Summary:
        Opens a pygame window showing each option's initiation/termination
        sets; left/right arrow keys cycle through options, ESC quits.
    '''
    pygame.init()
    title_font = pygame.font.SysFont("CMU Serif", 32)
    small_font = pygame.font.SysFont("CMU Serif", 22)

    # Nothing to show without at least one option.
    if len(action_abstr.get_actions()) == 0:
        print("Options Error: 0 options found. Can't visualize.")
        sys.exit(0)

    # For a distribution, pool the goal locations of all member MDPs and
    # visualize on a sampled MDP.
    if isinstance(grid_mdp, MDPDistribution):
        goal_locs = set([])
        for m in grid_mdp.get_all_mdps():
            for g in m.get_goal_locs():
                goal_locs.add(g)
        grid_mdp = grid_mdp.sample()
    else:
        goal_locs = grid_mdp.get_goal_locs()

    # Pygame init.
    # NOTE(review): pygame.init() is called a second time here (also above) —
    # harmless per pygame semantics but likely unintentional.
    screen = pygame.display.set_mode((scr_width, scr_height))
    pygame.init()
    screen.fill((255, 255, 255))
    pygame.display.update()
    mdp_visualizer._draw_title_text(grid_mdp, screen)
    option_text_point = scr_width / 2.0 - (14*7), 18*scr_height / 20.0

    # Setup states to compute option init/term funcs.
    state_dict = defaultdict(lambda : defaultdict(None))
    vi = ValueIteration(grid_mdp)
    state_space = vi.get_states()
    for s in state_space:
        state_dict[s.x][s.y] = s

    # Draw inital option.
    option_index = 0
    opt_str = "Option " + str(option_index + 1) + " of " + str(len(action_abstr.get_actions()))  # + ":" + str(next_option)
    option_text = title_font.render(opt_str, True, (46, 49, 49))
    screen.blit(option_text, option_text_point)
    next_option = action_abstr.get_actions()[option_index]
    visualize_option(screen, grid_mdp, state_dict, option=next_option)

    # Initiation rect and text.
    option_text = small_font.render("Init: ", True, (46, 49, 49))
    screen.blit(option_text, (40, option_text_point[1]))
    pygame.draw.rect(screen, colors[0], (90, option_text_point[1]) + (24, 24))

    # Terminal rect and text.
    option_text = small_font.render("Term: ", True, (46, 49, 49))
    screen.blit(option_text, (scr_width - 150, option_text_point[1]))
    pygame.draw.rect(screen, colors[1], (scr_width - 80, option_text_point[1]) + (24, 24))

    pygame.display.flip()

    # Keep updating options every space press.
    done = False
    while not done:
        # Check for key presses.
        for event in pygame.event.get():
            if event.type == QUIT or (event.type == KEYDOWN and event.key == K_ESCAPE):
                # Quit.
                pygame.quit()
                sys.exit()

            if event.type == KEYDOWN and event.key == K_RIGHT:
                # Toggle to the next option.
                option_index = (option_index + 1) % len(action_abstr.get_actions())
            elif event.type == KEYDOWN and event.key == K_LEFT:
                # Go to the previous option.
                option_index = (option_index - 1) % len(action_abstr.get_actions())
                if option_index < 0:
                    option_index = len(action_abstr.get_actions()) - 1

            # Redraw the currently selected option and refresh the label text
            # (the white rect blanks out the previous label first).
            next_option = action_abstr.get_actions()[option_index]
            visualize_option(screen, grid_mdp, state_dict, option=next_option, goal_locs=goal_locs)
            pygame.draw.rect(screen, (255, 255, 255), (130, option_text_point[1]) + (scr_width-290 , 50))
            opt_str = "Option " + str(option_index + 1) + " of " + str(len(action_abstr.get_actions()))  # + ":" + str(next_option)
            option_text = title_font.render(opt_str, True, (46, 49, 49))
            screen.blit(option_text, option_text_point)