Example #1
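# Tail of the get_gridworld() factory (the state, action, and reward setup above
# this point is not shown), followed by a __main__ block that prints the reward
# layout, pickles the environment, runs value iteration, and saves a policy diagram.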
    )  # send all states and actions to environment

    gridworld.start_state_hash = (0, 0)

    return gridworld


if __name__ == "__main__":  # pragma: no cover
    from introrl.dp_funcs.dp_value_iter import dp_value_iteration

    gridworld = get_gridworld()
    #gridworld.summ_print()
    gridworld.layout_print(vname='reward',
                           fmt='',
                           show_env_states=True,
                           none_str='*')
    gridworld.save_to_pickle_file(fname=None)

    policy, state_value = dp_value_iteration(gridworld,
                                             do_summ_print=True,
                                             max_iter=1000,
                                             err_delta=0.001,
                                             gamma=0.9)

    policy.save_diagram(gridworld,
                        inp_colorD=None,
                        save_name='sutton_5x5_gridworld',
                        show_arrows=True,
                        scale=1.0,
                        h_over_w=0.8)
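
For reference, dp_value_iteration sweeps the state values until the largest change falls below err_delta and then extracts a greedy policy. A minimal standalone sketch of that update on a small hypothetical dict-based MDP (an illustration only, not the introrl implementation):

# Hypothetical MDP: transitions[s][a] -> list of (probability, next_state, reward).
transitions = {
    'A': {'right': [(1.0, 'B', 0.0)]},
    'B': {'right': [(1.0, 'GOAL', 1.0)], 'left': [(1.0, 'A', 0.0)]},
    'GOAL': {},  # terminal state: no actions
}

def value_iteration(transitions, gamma=0.9, err_delta=0.001, max_iter=1000):
    V = {s: 0.0 for s in transitions}
    for _ in range(max_iter):
        max_change = 0.0
        for s, actions in transitions.items():
            if not actions:      # terminal states keep V = 0
                continue
            best = max(sum(p * (r + gamma * V[s2]) for p, s2, r in outcomes)
                       for outcomes in actions.values())
            max_change = max(max_change, abs(best - V[s]))
            V[s] = best
        if max_change < err_delta:   # same stopping role as err_delta above
            break
    # greedy policy with respect to the converged values
    policy = {s: max(actions,
                     key=lambda a: sum(p * (r + gamma * V[s2]) for p, s2, r in actions[a]))
              for s, actions in transitions.items() if actions}
    return policy, V

policy, V = value_iteration(transitions)
print(policy)   # {'A': 'right', 'B': 'right'}
print(V)        # roughly {'A': 0.9, 'B': 1.0, 'GOAL': 0.0}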
Example #2
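# Builds an empirical model of a black-box random-walk simulation, copies the
# collected transition data into an EnvBaseline, solves it with value iteration,
# and saves a policy diagram plus a pickled solution.  The snippet starts
# mid-script, so RW, Model, EnvBaseline, dp_value_iteration, pickle_esp, time,
# and start_time are defined in lines that are not shown.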
    get_sim = Model( RW, build_initial_model=True )

    get_sim.collect_transition_data( num_det_calls=100, num_stoic_calls=10000 )

    RW.layout.s_hash_print()

    #get_sim.num_calls_layout_print()
    #get_sim.min_num_calls_layout_print()
    
    env = EnvBaseline( s_hash_rowL=RW.s_hash_rowL, 
                       x_axis_label=RW.x_axis_label, 
                       y_axis_label=RW.y_axis_label )
                       
    get_sim.add_all_data_to_an_environment( env )

    policy, state_value = dp_value_iteration( env, do_summ_print=True, fmt_V='%.3f', fmt_R='%.1f',
                                              max_iter=1000, err_delta=0.0001, 
                                              gamma=0.9, iteration_prints=10)
                                  
    policy.save_diagram( RW, inp_colorD=None, save_name='dp_rw1000_policy',
                         show_arrows=False, scale=0.5, h_over_w=0.8,
                         show_terminal_labels=False)

    print( 'Total Time =',time.time() - start_time )

    pickle_esp.save_to_pickle_file( fname='dp_soln_to_randwalk_1000', 
                                    env=env, state_values=state_value, policy=policy)
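
The collect_transition_data / add_all_data_to_an_environment pair above builds the environment model by sampling the simulation. A rough sketch of the underlying idea, counting the outcomes of a hypothetical sample_step function and normalizing the counts into transition probabilities:

import random
from collections import defaultdict

def sample_step(s, a):
    """Hypothetical stochastic black-box step: return (next_state, reward)."""
    return (s + 1, 0.0) if random.random() < 0.5 else (s - 1, 0.0)

# count how often each (next_state, reward) outcome follows each (state, action) pair
countsD = defaultdict(lambda: defaultdict(int))
for _ in range(10000):              # comparable role to num_stoic_calls above
    s = random.randint(-5, 5)
    countsD[(s, 'step')][sample_step(s, 'step')] += 1

# normalize the counts into estimated transition probabilities
probD = {sa: {outcome: n / sum(outcomeD.values()) for outcome, n in outcomeD.items()}
         for sa, outcomeD in countsD.items()}
print(probD[(0, 'step')])           # close to {(1, 0.0): 0.5, (-1, 0.0): 0.5}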



Example #3
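# Solves the Sutton 5x5 gridworld (labeled "Figure 3.5" in the code) with value
# iteration, keeping all tied greedy actions via allow_multi_actions=True, and
# saves the resulting policy diagram.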
from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.environments.env_baseline import EnvBaseline

from introrl.mdp_data.sutton_5x5_gridworld import get_gridworld
gridworld = get_gridworld()
gridworld.name = 'Figure 3.5, 5x5 Grid Value Iteration'

policy, state_value = dp_value_iteration( gridworld, do_summ_print=True, fmt_V='%.1f',
                                          max_iter=1000, err_delta=0.001, 
                                          gamma=0.9, allow_multi_actions=True)

policy.save_diagram( gridworld, inp_colorD=None, save_name='figure_3_5_policy',
                     show_arrows=True, scale=0.8, h_over_w=0.8, do_show=True)
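
allow_multi_actions=True keeps every action tied for the best expected value instead of an arbitrary single one, which is why a cell of the saved diagram can show several arrows. A small sketch of that tie-keeping rule over a hypothetical action-value dict:

# Hypothetical action values for one state; 'up' and 'left' tie for the maximum.
qD = {'up': 0.92, 'left': 0.92, 'down': 0.41, 'right': 0.60}

tol = 1e-9
best = max(qD.values())
# single-action greedy policy keeps just one of the maximizers
single_best = max(qD, key=qD.get)
# multi-action greedy policy keeps every action within tol of the maximum
multi_best = [a for a, q in qD.items() if q >= best - tol]

print(single_best)   # 'up'
print(multi_best)    # ['up', 'left']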
Example #4
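# Compares discount factors on the fallen 3-state robot MDP, using either value
# iteration or policy iteration depending on the do_VI flag.  The snippet is cut
# off inside the dp_policy_iteration call, and the imports for dp_value_iteration,
# dp_policy_iteration, Policy, and StateValues are in lines that are not shown.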
from introrl.mdp_data.fallen_3state_robot import get_robot

robot = get_robot()

do_VI = 0
if do_VI:
    print('_____________ Value Iteration ________________')
else:
    print('_____________ Policy Iteration ________________')

for gamma in (0.001, 0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999):

    if do_VI:
        policy, sv = dp_value_iteration(robot,
                                        do_summ_print=False,
                                        fmt_V='%.1f',
                                        max_iter=1000,
                                        err_delta=0.001,
                                        gamma=gamma)
    else:

        policy = Policy(environment=robot)
        policy.set_policy_from_piD(robot.get_default_policy_desc_dict())

        sv = StateValues(robot)
        sv.init_Vs_to_zero()

        dp_policy_iteration(policy,
                            sv,
                            do_summ_print=False,
                            max_iter=1000,
                            err_delta=0.001,
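
dp_policy_iteration, used in the second branch, alternates policy evaluation and greedy policy improvement until the policy stops changing. A compact standalone sketch of that loop on a hypothetical two-state MDP (an illustration only, not the introrl implementation):

# Hypothetical MDP: transitions[s][a] -> list of (probability, next_state, reward).
transitions = {
    'STANDING': {'move': [(0.8, 'STANDING', 1.0), (0.2, 'FALLEN', -1.0)]},
    'FALLEN':   {'stand': [(1.0, 'STANDING', 0.0)],
                 'crawl': [(1.0, 'FALLEN', 0.2)]},
}

def policy_iteration(transitions, gamma=0.9, err_delta=0.001, max_iter=1000):
    V = {s: 0.0 for s in transitions}
    policy = {s: next(iter(aD)) for s, aD in transitions.items()}   # arbitrary start

    def q(s, a):
        return sum(p * (r + gamma * V[s2]) for p, s2, r in transitions[s][a])

    for _ in range(max_iter):
        # policy evaluation: sweep until V settles for the current policy
        while True:
            max_change = 0.0
            for s in transitions:
                v_new = q(s, policy[s])
                max_change = max(max_change, abs(v_new - V[s]))
                V[s] = v_new
            if max_change < err_delta:
                break
        # policy improvement: act greedily with respect to the evaluated V
        new_policy = {s: max(aD, key=lambda a: q(s, a)) for s, aD in transitions.items()}
        if new_policy == policy:        # stable policy means we are done
            break
        policy = new_policy
    return policy, V

print(policy_iteration(transitions, gamma=0.9))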
Example #5
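# Solves the gambler's problem (prob_heads=0.4) with value iteration and starts
# building data for a plot of the smallest optimal stake at each capital level;
# the snippet is cut off before the actual plotting calls.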
import matplotlib
import matplotlib.pyplot as plt

from introrl.dp_funcs.dp_value_iter import dp_value_iteration
from introrl.mdp_data.gamblers_problem import get_gambler

gambler = get_gambler(prob_heads=0.4)

policy, state_value = dp_value_iteration(gambler,
                                         allow_multi_actions=True,
                                         do_summ_print=True,
                                         fmt_V='%.4f',
                                         max_iter=1000,
                                         err_delta=0.00001,
                                         gamma=1.0)
print(gambler.get_info())

# --------------- plot logic -------------------
min_state_list = []
min_action_list = []

state_list = []
action_list = []
for i_state in range(1, 100):
    aL = policy.get_list_of_all_action_desc_prob(i_state, incl_zero_prob=False)
    min_state_list.append(i_state - 0.5)
    min_action_list.append(min([a for a, p in aL]))

    min_state_list.append(i_state + 0.5)
    min_action_list.append(min([a for a, p in aL]))
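
The snippet ends before the plot itself is drawn. A possible continuation, assuming min_state_list and min_action_list trace the smallest optimal stake at each capital level (state_list and action_list are presumably filled in lines that are cut off):

fig, ax = plt.subplots()
ax.plot(min_state_list, min_action_list, label='minimum optimal stake')
ax.set_xlabel('Capital')
ax.set_ylabel('Stake')
ax.set_title("Gambler's Problem, prob_heads = 0.4")
ax.legend()
plt.show()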