Python FourRoomMDP.reset Exemples

Langage de programmation: Python

Espace de noms/Paquet : simple_rl.tasks

Class/Type: FourRoomMDP

Méthode/Fonction: reset

Exemples sur hotexamples.com : 2

Python FourRoomMDP.reset - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de simple_rl.tasks.FourRoomMDP.reset extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher Cacher

FourRoomMDP(19)

get_actions(8)

visualize_agent(3)

visualize_policy(3)

visualize_value(3)

reset(2)

visualize_interaction(2)

visualize_learning(2)

Méthodes fréquemment utilisées

FourRoomMDP (19)

get_actions (8)

visualize_agent (3)

visualize_policy (3)

visualize_value (3)

reset (2)

visualize_interaction (2)

visualize_learning (2)

Exemple #1

0

Afficher le fichier

Fichier : info_sa.py Projet : apragupta/IB_SA_simple_rl

def info_sa_planning_experiment(min_grid_size=5, max_grid_size=11, beta=10.0):
    '''
    Args:
        min_grid_size (int): Smallest grid side length to run (inclusive).
        max_grid_size (int): Largest grid side length to run (inclusive).
        beta (float): Hyperparameter for InfoSA.

    Summary:
        Writes num iterations and time (seconds) for planning with and
        without abstractions. One datum per grid size is appended to the
        "iters" and "times" files under results/planning-four_room.
    '''
    vanilla_file = "vi.csv"
    sa_file = "vi-$\\phi$.csv"
    file_prefix = os.path.join("results", "planning-four_room")
    clear_files(dir_name=file_prefix)

    # time.clock() was removed in Python 3.8. Prefer perf_counter when it
    # exists and only fall back to clock on interpreters that lack it
    # (Python 2). NOTE(review): perf_counter measures wall-clock time, while
    # clock measured CPU time on Unix -- confirm this is acceptable for the
    # reported timings.
    timer = getattr(time, "perf_counter", None) or time.clock

    # range (not xrange) so the loop runs on both Python 2 and Python 3.
    for grid_dim in range(min_grid_size, max_grid_size + 1):
        # ======================
        # == Make Environment ==
        # ======================
        mdp = FourRoomMDP(width=grid_dim, height=grid_dim, init_loc=(1, 1),
                          goal_locs=[(grid_dim, grid_dim)], gamma=0.9)

        # Get demo policy.
        vi = ValueIteration(mdp)
        vi.run_vi()
        demo_policy = get_lambda_policy(
            make_det_policy_eps_greedy(vi.policy, vi.get_states(),
                                       mdp.get_actions(), epsilon=0.2))

        # =======================
        # == Make Abstractions ==
        # =======================
        # Only the abstraction pmf is used below; the other two results of
        # run_info_sa are discarded.
        _, phi_pmf, _ = run_info_sa(mdp, demo_policy, iters=500, beta=beta,
                                    convergence_threshold=0.00001)
        prob_s_phi = ProbStateAbstraction(phi_pmf)
        crisp_s_phi = convert_prob_sa_to_sa(prob_s_phi)

        # ============
        # == Run VI ==
        # ============
        vanilla_vi = ValueIteration(mdp, delta=0.0001, sample_rate=25)
        sa_vi = AbstractValueIteration(ground_mdp=mdp, state_abstr=crisp_s_phi,
                                       delta=0.0001, vi_sample_rate=25,
                                       amdp_sample_rate=25)

        # ==========
        # == Plan ==
        # ==========
        print("Running VIs.")
        start_time = timer()
        vanilla_iters, vanilla_val = vanilla_vi.run_vi()
        vanilla_time = round(timer() - start_time, 2)

        mdp.reset()
        start_time = timer()
        # The value returned by the abstract VI is not reported; the policy
        # is evaluated on the ground MDP instead.
        sa_iters, _ = sa_vi.run_vi()
        sa_time = round(timer() - start_time, 2)
        sa_val = evaluate_agent(FixedPolicyAgent(sa_vi.policy), mdp, instances=25)

        # Single-argument print(...) calls behave the same on Python 2 and 3.
        print("\n" + "*" * 20)
        print("Vanilla \n\t Iters: {0} \n\t Value: {1} \n\t Time: {2}".format(
            vanilla_iters, round(vanilla_val, 4), vanilla_time))
        print("")
        print("Phi: \n\t Iters: {0} \n\t Value: {1} \n\t Time: {2}".format(
            sa_iters, round(sa_val, 4), sa_time))
        print("*" * 20 + "\n\n")

        write_datum(os.path.join(file_prefix, "iters", vanilla_file), vanilla_iters)
        write_datum(os.path.join(file_prefix, "iters", sa_file), sa_iters)
        write_datum(os.path.join(file_prefix, "times", vanilla_file), vanilla_time)
        write_datum(os.path.join(file_prefix, "times", sa_file), sa_time)

Exemple #2

0

Afficher le fichier

def branching_factor_experiment(min_options=0, max_options=20, increment=2, instances=5, epsilon=0.05):
    '''
    Args:
        min_options (int): First value of the option-count sweep.
        max_options (int): Last value of the option-count sweep (inclusive).
        increment (int): Step between consecutive option counts.
        instances (int): Number of repetitions per (steps, episodes) setting.
        epsilon (float): Passed as eps when building the phi-relative options.

    Summary:
        Runs an experiment contrasting learning performance for different
        # options, and saves the plot to "branching_factor_results.pdf".
    '''
    # Define MDP.
    grid_size = 7
    mdp = FourRoomMDP(width=grid_size, height=grid_size,
                      goal_locs=[(grid_size, grid_size)])

    # Make State Abstraction.
    states, _ = ah.compute_reachable_state_space(mdp, sample_rate=50)
    state_abstr = core.compute_phi_given_m(mdp, four_rooms_predicate_9x9,
                                           level=1, states=states)

    # Materialized as a list so it behaves the same on Python 2 and 3.
    x_axis = list(range(min_options, max_options + 1, increment))
    y_axis = defaultdict(list)
    conf_intervals = defaultdict(list)
    num_options_performance = defaultdict(lambda: defaultdict(list))

    # Choose dependent variable (either #steps per episode or #episodes).
    d_var_range = [(20, 5), (40, 250), (400, 2500)]

    # Loop-invariant, so set once up front. It is reused by the
    # state-abstraction-only baseline below, which would otherwise hit a
    # NameError whenever the option sweep is empty.
    AgentClass = RMaxAgent  # DoubleQAgent, QLearningAgent, SarsaAgent

    for steps, episodes in d_var_range:
        print("steps, episodes {0} {1}".format(steps, episodes))

        # Evaluate.
        for instance in range(instances):
            print("\tInstance {0} of {1}.".format(instance + 1, instances))

            # Make initial Options.
            for num_options in x_axis:
                # NOTE(review): num_rand_opts is -1 when num_options is 0
                # (the default min_options) -- confirm the helper accepts it.
                options, _ = make_near_optimal_phi_relative_options(
                    mdp, state_abstr, 'eps-greedy',
                    num_rand_opts=num_options - 1, eps=epsilon)
                action_abstr = ActionAbstraction(
                    options=options, prim_actions=mdp.get_actions())

                # Make agent.
                sa_aa_agent = AbstractionWrapper(
                    AgentClass,
                    agent_params={"actions": mdp.get_actions()},
                    state_abstr=state_abstr,
                    action_abstr=action_abstr,
                    name_ext="-$\\phi,O$")

                _, _, value_per_episode = run_single_agent_on_mdp(
                    sa_aa_agent, mdp, episodes=episodes, steps=steps)
                mdp.reset()

                # Record the value of the final episode only.
                num_options_performance[(steps, episodes)][num_options].append(
                    value_per_episode[-1])

    ############
    # Other types
    # The baselines below are run once, with the largest (steps, episodes)
    # setting.
    steps, episodes = d_var_range[-1]
    baseline_key = (steps, episodes)

    # Just state abstraction.
    sa_agent = AbstractionWrapper(
        AgentClass,
        agent_params={"actions": mdp.get_actions()},
        state_abstr=state_abstr,
        action_abstr=None,
        name_ext="-$\\phi$")
    _, _, value_per_episode = run_single_agent_on_mdp(
        sa_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[baseline_key]["phi"].append(value_per_episode[-1])
    y_axis["phi"] = [value_per_episode[-1]]

    # Run random options.
    options = make_fixed_random_options(mdp, state_abstr)
    action_abstr = ActionAbstraction(options=options,
                                     prim_actions=mdp.get_actions())
    AgentClass = QLearningAgent
    rand_opt_agent = AbstractionWrapper(
        AgentClass,
        agent_params={"actions": mdp.get_actions()},
        state_abstr=state_abstr,
        action_abstr=action_abstr,
        # "\\text", not "\text": an unescaped "\t" is a TAB character and
        # corrupts the LaTeX in the agent name.
        name_ext="-$\\phi,O_{\\text{random}}$")
    _, _, value_per_episode = run_single_agent_on_mdp(
        rand_opt_agent, mdp, episodes=episodes, steps=steps)
    num_options_performance[baseline_key]["random"].append(value_per_episode[-1])
    y_axis["random"] = [value_per_episode[-1]]

    # Make optimal agent.
    value_iter = ValueIteration(mdp)
    value_iter.run_vi()
    optimal_agent = FixedPolicyAgent(value_iter.policy)
    _, _, value_per_episode = run_single_agent_on_mdp(
        optimal_agent, mdp, episodes=episodes, steps=steps)
    y_axis["optimal"] = [value_per_episode[-1]]

    # Confidence intervals.
    for dependent_var in d_var_range:
        for num_options in x_axis:
            # Compute mean and the 1.96 * standard-error half-width.
            samples = num_options_performance[dependent_var][num_options]
            avg_for_n = float(sum(samples)) / instances
            std_deviation = np.std(samples)
            std_error = 1.96 * (std_deviation / math.sqrt(len(samples)))
            y_axis[dependent_var].append(avg_for_n)
            conf_intervals[dependent_var].append(std_error)

    plt.xlabel("$|O_\\phi|$")
    # NOTE(review): the limits are [1, number of sweep points] while the
    # plotted x values are the option counts themselves -- confirm intended.
    plt.xlim([1, len(x_axis)])
    # Backslashes escaped explicitly; the resulting text is unchanged.
    plt.ylabel("$V^{\\hat{\\pi}_{O_\\phi}}(s_0)$")
    plt.tight_layout()  # Keeps the spacing nice.

    # Add just state abstraction (constant line across the sweep).
    ep_val_del_q_phi = y_axis["phi"]
    label = "$O_{\\phi}$"
    plt.plot(x_axis, [ep_val_del_q_phi] * len(x_axis), marker="+",
             linestyle="--", linewidth=1.0, color=PLOT_COLORS[-1], label=label)

    # Add random options (no legend entry).
    ep_val_del_q = y_axis["random"]
    plt.plot(x_axis, [ep_val_del_q] * len(x_axis), marker="x",
             linestyle="--", linewidth=1.0, color=PLOT_COLORS[0])

    # Add optimal (no legend entry).
    ep_val_optimal = y_axis["optimal"]
    plt.plot(x_axis, [ep_val_optimal] * len(x_axis), linestyle="-",
             linewidth=1.0, color=PLOT_COLORS[1])

    for i, dependent_var in enumerate(d_var_range):
        total_steps = dependent_var[0] * dependent_var[1]
        # The exponent shown is the count of "0" digits in total_steps, which
        # equals log10 only when total_steps is an exact power of ten.
        label = "$O_{\\phi,Q_\\varepsilon^*}, N=1e" + str(
            str(total_steps).count("0")) + "$"
        plt.plot(x_axis, y_axis[dependent_var], marker="x",
                 color=PLOT_COLORS[i + 2], linewidth=1.5, label=label)

        # Confidence intervals.
        top = np.add(y_axis[dependent_var], conf_intervals[dependent_var])
        bot = np.subtract(y_axis[dependent_var], conf_intervals[dependent_var])
        plt.fill_between(x_axis, top, bot, alpha=0.25, color=PLOT_COLORS[i + 2])

    plt.legend()
    plt.savefig("branching_factor_results.pdf", format="pdf")
    plt.cla()
    plt.close()