Example #1
def main():
    rTrack = raceTrack.RaceTrack()
    lTrack = raceTrack.RaceTrack()
    rTrack.createTrack('R-track.txt')
    lTrack.createTrack('L-track.txt')
    print(rTrack.printTrack())
    rValue = ValueIteration.ValueIteration(rTrack)
    policy = rValue.valueIter('Resume', 1000, 10000)
    print(rValue.timeTrial(policy, 10000, 'Resume', False, 'R-Track'))

    print(lTrack.printTrack())
    lValue = ValueIteration.ValueIteration(lTrack)
    lPolicy = lValue.valueIter('Resume', 1000, 10000)
    print(lValue.timeTrial(lPolicy, 10000, 'Resume', False, 'L-Track'))

    oTrack = raceTrack.RaceTrack()
    oTrack.createTrack('O-track.txt')
    oValue = ValueIteration.ValueIteration(oTrack)
    oPolicy = oValue.valueIter('Resume', 1000, 10000)
    print(oValue.timeTrial(oPolicy, 10000, 'Resume', False, 'O-Track'))

    rrPolicy = rValue.valueIter('Restart', 1000, 10000)
    print(rValue.timeTrial(rrPolicy, 10000, 'Restart', False, 'R-Track'))

    lrPolicy = lValue.valueIter('Restart', 1000, 10000)
    print(lValue.timeTrial(lrPolicy, 10000, 'Restart', False, 'L-Track'))

    orPolicy = oValue.valueIter('Restart', 1000, 10000)
    print(oValue.timeTrial(orPolicy, 10000, 'Restart', False, 'O-Track'))
Example #2
def train_and_save_VI_MDP(track_str, gamma, iters):
    test_track = Track(track_str + '.txt')
    test_mdp = MDP(test_track)
    ValueIteration.value_iteration(test_mdp, gamma=gamma, epsilon=1)
    with open(f'pickles/{track_str}_{gamma}_VI_pickle_Iter{iters}',
              'wb') as file:
        pickle.dump(test_mdp, file)
    print('Training Complete')
def midcardIteration(midcard, count):
    mdp = BlackjackMDP(count=0, midcards=midcard)
    alg = ValueIteration()
    alg.solve(mdp, .001)
    startState = mdp.startState()
    save_obj('Midcard {} V'.format(midcard - 12), alg.V)
    print('Saved Expected Value for midcard {}'.format(midcard))
    save_obj('Midcard {} pi'.format(midcard - 12), alg.pi)
    print('Saved policy for midcard {}'.format(midcard))
def countIteration(count, midcards):
    mdp = BlackjackMDP(count=count, midcards=12)
    alg = ValueIteration()
    alg.solve(mdp, .001)
    startState = mdp.startState()
    print('Expected Value for count {}: {}'.format(count, alg.V[startState]))
    print('Algorithm Value iteration with count {}'.format(count))
    save_obj('Count {} Policy'.format(count), alg.pi)
    print('Saved policy for count {}'.format(count))
    save_obj('Count {} V'.format(count), alg.V)
    print('Saved Expected Value for count {}'.format(count))
def valueiteratingpolicy(n):
    v = initvalact.initvalact(n)
    value = v[0]
    action = v[1]
    reward = rewardsegregation.rewardsegregation(n, p, p1, encoder, ENClast)
    print(reward)
    test = ValueIteration.valueiteration(value, reward, action)
    policy = test[1]
    print(policy)
    raw = 0
    col = 0
    gotopos.gotopos(raw, col, p, p1, n)
    # 0 = up / 1 = down / 2 = left / 3 = right
    global val1
    val1 = pinSetup.valueRead_ON()
    while val1 == 0:
        if action[raw][col] == 0:
            act.playAction(0, raw, col, n, p, p1)
            raw = raw - 1

        elif action[raw][col] == 1:
            act.playAction(1, raw, col, n, p, p1)
            raw = raw + 1

        elif action[raw][col] == 2:
            act.playAction(2, raw, col, n, p, p1)
            col = col - 1

        elif action[raw][col] == 3:
            act.playAction(3, raw, col, n, p, p1)
            col = col + 1
        val1 = pinSetup.valueRead_ON()
    if val1 == 1:
        print "Stop"
Example #6
def update_policy_displays(which="both"):  # "VI" or "QL" or "both"
    # Check the boolean variables that control policy display and,
    # for each that is true, update and (re)show the corresponding policy.
    # This should be called whenever Q values might have changed from either
    # menu; specifying which one avoids unnecessary policy extraction.
    global POLICY_from_VI, POLICY_from_QL
    if Vis.VI_POLICY_VAR.get() and which != "QL":
        POLICY_from_VI = VI.extract_policy(CLOSED, ACTIONS)
        Vis.show_policy(POLICY_from_VI)
        Vis.enable_vi_action_menu_items(True)
    else:
        if not Vis.VI_POLICY_VAR.get():
            Vis.clear_a_policy_display(0)
    if Vis.QL_POLICY_VAR.get() and which != "VI":
        POLICY_from_QL = Q_Learn.extract_policy(CLOSED, ACTIONS)
        #print("For debugging... POLICY_from_QL is: "); print(str(POLICY_from_QL))
        Vis.show_policy(POLICY_from_QL,
                        policy_number=1,
                        use_alt_segments=True,
                        color="blue")
        #QL_POLICY = pi # save policy from Value Iteration to compare with VI_POLICY
        if POLICY_from_VI: compare_policies(POLICY_from_VI, POLICY_from_QL)
        Vis.enable_compare_menu_item(True)
    else:
        if not Vis.QL_POLICY_VAR.get():
            Vis.clear_a_policy_display(1)
Example #7
def main():

    mdp = MarkovDecisionProblem.MarkovDecisionProblem()

    vi = ValueIteration.ValueIteration(mdp)
    ql = QLearning.QLearning(mdp)
    
    ql.qlearning(iterations=15, exploration=0.2)
Example #8
 def constructPoliciesTables(self):
     for i in range(len(self.preferences)):
         valueTable = {
             state: 0
             for state in self.beliefTransitionTable.keys()
         }
         valueIteration = BoltzmannValueIteration(
             self.beliefTransitionTable, self.beliefRewardTables[i],
             valueTable, self.convergenceTolerance, self.gamma, self.beta)
         _, policy = valueIteration()
         self.policies.append(policy)
Example #9
def run_Agent(param):
    '''Run the agent for several transitions, depending on
    the value of param.  It uses the policy from VI.
    Return True if more turns can still be taken.'''
    global Agent_state, Terminal_state
    for i in range(param):
        if Agent_state == Terminal_state:
            print("Terminal state reached!")
            return False
        a = VI.apply_policy(Agent_state)
        Agent_turn(a)
    return True
Example #10
def run_QL_agent(param, action=None):
    '''Return True if more turns can still be taken.'''
    global TERMINATED
    #print("In run_QL_agent, action = "+action)
    global Agent_state, Terminal_state
    for i in range(param):
        if Agent_state == Terminal_state:
            print("Terminal state reached!")
            TERMINATED = True
            return False
        if action:
            a = action
        else:
            a = VI.apply_policy(
                Agent_state)  # Should prob. use a different policy method.
        Agent_turn(a)
        #print("Need to perform a Q update here.")
    return True
Example #11
import MDP
import BlockworldMdp
import ValueIteration

russel_norvig_example = BlockworldMdp.Blockworld1(width=4,
                                                  height=3,
                                                  goal_positions=[(4, 3)],
                                                  penalty_positions=[(4, 2)],
                                                  obstacle_positions=[(2, 2)],
                                                  time_cost=0.1)

big_example = BlockworldMdp.Blockworld1(width=10,
                                        height=9,
                                        goal_positions=[(5, 6)],
                                        penalty_positions=[(7, 3), (2, 5),
                                                           (3, 6), (8, 4),
                                                           (1, 2), (9, 5)],
                                        obstacle_positions=[(5, 4), (2, 8),
                                                            (3, 8), (4, 8),
                                                            (3, 3), (7, 1),
                                                            (7, 2)],
                                        time_cost=0.5,
                                        penalty_value=-10.0)

mdp = russel_norvig_example

policy, values = ValueIteration.generate_policy(mdp, 0.9, 0.0001)

mdp.render(policy, values)
Example #12
    plt.ylabel('Value at Initial State (5,1)')
    plt.xlabel('Iteration')
    plt.title("Value Iteration with Changing Windspeed in Windy Gridworld")
    return plt


# run value iteration code here
gridworld = gw.Gridworld()
init_vals = {}
plt = initialize_plot()
gamma = 0.9
pRange = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
finalPolicyDf = pd.DataFrame(index=gridworld.states(), columns=pRange)

for p in pRange:
    value = vi.ValueIteration(gridworld, gamma)
    value.set_p(p)  #add wind (stochasticity)
    value.value_iteration()

    #create plot
    V = value.get_V()
    init_vals[p] = [V[v][(5, 1)] for v in V]
    plt.plot(init_vals[p], label=p)
    # get the final policy for each wind probability p
    finalPolicyDf[p] = pd.DataFrame.from_dict(value.get_pi(),
                                              orient='index')[0]

plt.legend()
plt.show()

print(finalPolicyDf)
def run_maze_problem(map_name, size, random_start=False):
    q_diffs_explore = np.array((1, 1))
    q_diffs_exploit = np.array((1, 1))
    diffs_value = np.array((1, 1))
    diffs_policy = np.array((1, 1))

    verbose = True
    penalty = -1000
    goal_reward = 1000
    # read map
    with open(map_name) as inf:
        data = np.array([[*map(int, s.strip().split(','))]
                         for s in inf.readlines()])
    originalmap = data.copy()  # make a copy so we can revert to the original map later
    print('This is the map of the maze!')
    printmap(data)
    print('')
    rand.seed(5)

    # Make the reward matrix
    R_f = np.zeros([size**2, 1])
    for row in range(0, data.shape[0]):
        for col in range(0, data.shape[1]):
            pos = (row, col)
            val = data[pos]
            reward = -1
            if val == 5: reward = penalty
            elif val == 3: reward = goal_reward
            s = discretize(pos, size)
            R_f[s][0] = reward

    # Q LEARNER EXPLORE
    learner = ql.QLearner(num_states=size**2, num_actions = 4, \
        alpha = 0.2, gamma = 0.9, exr = 0.9, excr = 0.998)
    episodes = 1200
    if random_start: episodes = 20000
    q_diffs_explore = train_q_learner(data,
                                      episodes,
                                      learner,
                                      verbose,
                                      size=size,
                                      run_time=10000,
                                      random_start=random_start)
    #  print(learner.q)
    Pi_star = np.zeros([size**2, 1])
    for i in range(0, Pi_star.shape[0]):
        Pi_star[i][0] = np.argmax(learner.q[i])
    print('This is the policy given by Q Learner Explorer!')
    printpolicy(data, (Pi_star.reshape(size, size)))
    print('')

    # Q LEARNER EXPLOIT
    learner = ql.QLearner(num_states=size**2, num_actions = 4, \
        alpha = 0.2, gamma = 0.9, exr = 0.6, excr = 0.99)
    episodes = 1500
    q_diffs_exploit = train_q_learner(data,
                                      episodes,
                                      learner,
                                      verbose,
                                      size=size,
                                      run_time=10000)
    Pi_star = np.zeros([size**2, 1])
    for i in range(0, Pi_star.shape[0]):
        Pi_star[i][0] = np.argmax(learner.q[i])
    print('This is the policy given by Q Learner Exploiter!')
    printpolicy(data, np.round_(Pi_star.reshape(size, size)))
    print('')

    # VALUE LEARNER
    learner = vl.ValueIteration(gamma=0.9, num_states=size**2, num_actions=4, R_f = R_f, \
         return_max_V = return_max_V_maze, env = data, size=size)
    episodes = 300
    diffs_value = np.zeros((episodes, 1))
    #for episode in range(0,episodes):
    diff = -1
    episode = 0
    while diff != 0:
        V_old = learner.V_star.copy()
        learner.iterate_V()
        V_new = learner.V_star.copy()
        diff = np.absolute((V_new - V_old)).max()
        diffs_value[episode, 0] = diff
        episode += 1
    #  print(episode)
    policy = learner.return_policy()

    # printpolicy(data, policy.reshape(size,size))
    print('This is the policy given by Value Iteration!')
    printpolicy(data, np.round_(policy.reshape(size, size)))
    print('')

    # POLICY LEARNER
    learner = pl.PolicyIteration(gamma=0.9, num_states=size**2, num_actions=4, R_f = R_f, \
         return_max_V = return_max_V_maze, env = data,size=size)
    episodes = 20
    diffs_policy = np.zeros((episodes, 1))
    for episode in range(0, episodes):
        print(episode)
        P_old = learner.Pi_star.copy()
        learner.iterate_P()
        P_new = learner.Pi_star.copy()
        diff = np.absolute((P_new - P_old)).max()
        diffs_policy[episode, 0] = diff
    print('This is the policy given by Policy Iteration!')
    printpolicy(data, (learner.Pi_star.reshape(size, size)))
    print('')

    # PLOT RESULTS
    fig, axes = plt.subplots(2, 2)
    fig.tight_layout()
    plt.subplots_adjust(hspace=0.35)

    ax = axes[1, 0]
    ax.plot(q_diffs_explore)
    ax.title.set_text('Exploring Q Learner (explore rate = 0.9)')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Q table')

    ax = axes[1, 1]
    ax.plot(q_diffs_exploit)
    ax.title.set_text('Exploiting Q Learner (explore rate = 0.6)')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Q table')

    ax = axes[0, 0]
    ax.plot(diffs_value)
    ax.title.set_text('Value Iteration')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_yscale('log')
    ax.set_ylabel('Max abs. update on Value table')

    ax = axes[0, 1]
    ax.plot(diffs_policy)
    ax.title.set_text('Policy Iteration')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Policy table')

    plt.show()
    return 0


if __name__ == '__main__':

    erro_average_ma = []
    query_average_ma = []

    erro_average_weng = []
    query_average_weng = []

    _state, _action, _d = 5, 3, 4

    for iteration in range(1):

        _Lambda_inequalities = ValueIteration.generate_inequalities(_d)
        _lambda_rand = ValueIteration.interior_easy_points(_d)

        m = Regan.my_mdp.make_simulate_mdp_Yann(_state, _action, _lambda_rand, None)
        w = ValueIteration.Weng(m, _lambda_rand, _Lambda_inequalities)
        w.setStateAction()

        m.set_Lambda(_lambda_rand)
        Uvec = m.policy_iteration()
        exact = m.initial_states_distribution().dot(Uvec)

        # output = w.value_iteration_with_advantages(_epsilon=0.001, k=100000, noise= None,
        #                                            cluster_error = 0.01, threshold = 0.0001)

        output = w.value_iteration_with_advantages(_epsilon=0.001, k=100000, noise=0.5,
                                                   cluster_error=0.01, threshold=0.0001)
Example #15
# This procedure should be responsible for initializing the value function(s),
# performing the iterations to update the value function(s), and monitoring convergence.
# Your algorithm should stop as soon as delta, the maximum absolute change in the
# value function between iteration k - 1 and iteration k, is less than 1e-6.
# At the end of this procedure, you should plot the delta value at each iteration as a
# line plot. You should also plot the final value functions as heatmaps as described in
# the previous bullet point.
# Run your algorithm by creating a python file called run_value_iteration.py
# that runs your experiment. You may use gamma = 0.95.
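
The comments above pin down the convergence loop precisely, so a compact sketch may help. This is a minimal illustration of that stopping rule, not the project's vi.ValueIteration class; the tabular interface (states, actions, and a transitions(s, a) callable returning (prob, next_state, reward) triples) is assumed for the example.

def value_iteration_sketch(states, actions, transitions, gamma=0.95, tol=1e-6):
    """Synchronous value iteration with the delta-based stopping rule described above."""
    V = {s: 0.0 for s in states}   # initialize the value function to zero
    deltas = []                    # max absolute change per iteration, for the line plot
    while True:
        delta, newV = 0.0, {}
        for s in states:
            best = max(
                sum(p * (r + gamma * V[s2]) for p, s2, r in transitions(s, a))
                for a in actions)
            newV[s] = best
            delta = max(delta, abs(best - V[s]))
        V = newV
        deltas.append(delta)
        if delta < tol:            # stop once the change drops below 1e-6
            break
    return V, deltas

The returned deltas list is what the comments ask to plot as a line plot (e.g. plt.plot(deltas)), and V holds the values for the heatmaps.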

# run value iteration code here
invaderdefender = id.InvaderDefender()
init_vals = {}
plt = initialize_plot("Invader - Defender Heatmap")
#gammaRange = [0.95]
#finalPolicyDf = pd.DataFrame(index=gridworld.states(), columns=gammaRange)

gamma = 0.95

value = vi.ValueIteration(invaderdefender, gamma)
U, pi_p, pi_q, delta = value.value_iteration()
print("defender")
heatmap("defender", U)
print("invader")
heatmap("invader", U)

plt = initialize_plot("delta")
plt.plot(delta)
plt.show()

#    return U, pi_p, pi_q
Example #16
def MDP_command(cmd, param):
    global GAMMA, ALL_STATES, CLOSED
    global ACTIONS, NOISE, LIVING_REWARD, NGOALS, SILVER_PATH, N_disks
    global V_from_VI, Q_from_VI, V_from_QL, Q_from_QL, POLICY_from_VI, POLICY_from_QL
    global Agent_state, n_iterations, NEED_Q_LEARN_SETUP, LAST_REWARD, TERMINATED, Terminal_state
    global ALPHA, EPSILON, QUIET_MODE
    #print("In MDP_command, cmd = "+cmd+"; param = "+str(param))
    if cmd == "NDISKS":
        N_disks = param
        TowersOfHanoi.N_disks = param
        try:
            Vis.unhighlight(Agent_state)
        except Exception:
            pass
        set_up_state_space()
        return
    if cmd == "noise":
        NOISE = param
    if cmd == "ngoals":
        NGOALS = param
        if NGOALS == 2: SILVER_PATH = make_solution_path(path_type="silver")
        else: SILVER_PATH = []

    if cmd == "living_reward":
        LIVING_REWARD = param
    if cmd == "set_gamma":
        GAMMA = param
        update_qlearn_params()
        return

    if cmd == "show_values":
        if param == 1:
            Vis.display_values(V_from_VI)
            #for s in V_from_VI.keys(): Vis.reshow_state(s,V_from_VI[s])

        if param == 2:
            Vis.show_q_values(Q_from_VI, CLOSED)
            #Vis.reshow_all_q_values(Q_from_VI, CLOSED)

        if param == 3:
            compute_V_from_QL()
            Vis.display_values(V_from_QL)
            #for s in V_from_QL.keys(): Vis.reshow_state(s,V_from_QL[s])

        if param == 4:
            Vis.show_q_values(Q_from_QL, CLOSED)
            #Vis.reshow_all_q_values(Q_from_QL, CLOSED)
        return
    if cmd == "Value_Iteration":
        if param == 0:  # Reset VI state values to 0.
            n_iterations = 0
            initialize_V_from_VI(0)
            init_q_values(Q_from_VI)
            if Vis.DISPLAY_VALS_VAR.get() == 1:
                Vis.display_values(V_from_VI)
            elif Vis.DISPLAY_VALS_VAR.get() == 2:
                Vis.show_q_values(Q_from_VI, CLOSED)
            Vis.enable_value_iteration(True)
            Vis.enable_vi_action_menu_items(False)
            update_policy_displays(which="VI")
            return
        if param == 1:
            (V_from_VI, max_delta) = VI.one_step_of_VI(ALL_STATES, ACTIONS, T,
                                                       R, GAMMA,
                                                       V_from_VI.copy())
            n_iterations += 1
            print("After " + str(n_iterations) + " iterations, max_delta = " +
                  str(max_delta))
            Vis.enable_policy_extraction(True)
            Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
            update_policy_displays(which="VI")
        if param > 1:
            for i in range(param):
                (V_from_VI,
                 max_delta) = VI.one_step_of_VI(ALL_STATES, ACTIONS, T, R,
                                                GAMMA, V_from_VI.copy())
                n_iterations += 1
                print("After " + str(n_iterations) +
                      " iterations, max_delta = " + str(max_delta))
                if max_delta < 0.00000001:
                    print("VI has converged after iteration " +
                          str(n_iterations) + ".")
                    break
            Vis.enable_policy_extraction(True)
            Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
            update_policy_displays(which="VI")
        # Update the display of values or q-values, whichever is enabled currently.
        mode = Vis.DISPLAY_VALS_VAR.get()
        if mode == 1:
            for s in V_from_VI.keys():
                Vis.reshow_state(s, V_from_VI[s])
        if mode == 2:
            Vis.show_q_values(Q_from_VI, CLOSED)
            return
    if cmd == "Show_Policy_from_VI":  # THIS CMD SHOULD ACTUALLY BE UNNECESSARY NOW.
        update_policy_displays(which="VI")
    if cmd == "Show_Policy_from_QL":  # THIS CMD SHOULD ACTUALLY BE UNNECESSARY NOW.
        update_policy_displays(which="QL")
    if cmd == "Agent":
        if param == 0 or Agent_state == Terminal_state:
            Vis.unhighlight(Agent_state)
            Agent_turn(ACTIONS[0], reset=True)
            initialize_episode()
        elif param == 1:
            a = VI.apply_policy(Agent_state)
            Agent_turn(a)
        else:
            Vis.TK_Canvas.after(10, lambda: run_Agent(param))
    if cmd == "QLearn":
        init_Q_Learn_if_needed()
        if param == -1 or Agent_state == Terminal_state:  # Reset the agent to s0, ready for a new episode.
            Vis.unhighlight(Agent_state)
            Agent_turn(ACTIONS[0], reset=True)
            initialize_episode()
        elif param == -2:
            # Reset all state and Q values to 0.
            init_q_values(Q_from_QL, QL=True)
            initialize_V_from_QL(0)
            #Vis.reshow_all_q_values(Q_from_QL, CLOSED)
            if Vis.DISPLAY_VALS_VAR.get() == 3:
                compute_V_from_QL()
                Vis.display_values(V_from_QL)
                return
            if Vis.DISPLAY_VALS_VAR.get() == 4:
                Vis.show_q_values(Q_from_QL, CLOSED)
            update_policy_displays(which="QL")
            return
        elif param == 0:
            user_drives_agent_via_text_input()


#    elif param==1:
#      a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD, TERMINATED)
#      Agent_turn(a)
#      increment_transition_count()
        elif param > 0:  # Perform up to n transitions of Q learning.
            for i in range(param):
                a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD,
                                               TERMINATED)
                Agent_turn(a)
                if TERMINATED:
                    # Make one more call to the Q_Learn agent so it can do a q-update based
                    # on the reward in going from a goal state to the Terminal_state.
                    # The returned "action" a should be None, but probably does not matter.
                    a = Q_Learn.choose_next_action(Agent_state, LAST_REWARD,
                                                   TERMINATED)
                    print("Sent final reward for this episode: R=" +
                          str(LAST_REWARD))
                    print("Episode ended after transition " +
                          str(get_transition_count()))
                    increment_episode_count()
                    print(
                        str(get_episode_count()) +
                        " episodes so far in this Q-learning run.")
                    TERMINATED = False  # Make it easier to start the next set of transitions.
                    break
                increment_transition_count()
            update_policy_displays(which="QL")
        elif param == -1000:
            # Do 1000 transitions as quickly as possible, using as many episodes
            # as needed.
            train_quietly(1000)
            update_policy_displays(which="QL")
            return
    if cmd == "Exploration":
        if Vis.EXPL_VAR.get():
            init_q_values(Q_from_QL)
            mode = Vis.DISPLAY_VALS_VAR.get()
            if mode == 4:
                Vis.reshow_all_q_values(Q_from_QL)

            Q_Learn.setup(ALL_STATES,
                          ACTIONS,
                          Q_from_QL,
                          update_q_value,
                          is_valid_goal_state,
                          Terminal_state,
                          use_exp_fn=True)
            update_policy_displays(which="QL")
    if cmd == "alpha":
        if param == 1:
            ALPHA = 0.1
        elif param == 2:
            ALPHA = 0.2
        elif param == 3:
            ALPHA = -1
        update_qlearn_params()
        return
    if cmd == "epsilon":
        if param == 1:
            EPSILON = 0.1
        elif param == 2:
            EPSILON = 0.2
        elif param == 3:
            EPSILON = -1
        update_qlearn_params()
        return
    if cmd == "User_chose":
        init_Q_Learn_if_needed()
        a = param
        Agent_turn(a)
        increment_transition_count()
        Q_Learn.handle_transition(a, Agent_state, LAST_REWARD)
        update_policy_displays(which="QL")
    if cmd == "Get_Q_Values":
        return (ALL_STATES, Q_VALUES)  # Needs updating to refer to one of the types of Q values.
    if cmd == "compare":
        #Compare_QLearn_to_VI.receive_globals(globals())
        #Q_from_VI = VI.return_Q_values(CLOSED, ACTIONS)
        compute_V_from_QL()
        Compare_QLearn_to_VI.full_compare()
    if cmd == "Run_script":
        script.run(globals())
        update_policy_displays(which="both")
    if cmd == "show_golden_path":
        Vis.show_golden_path()
    gamma = 0.9
    epsilon = 0.1
    segmentTotalNumber = 1000

    print('finish setting parameter', time.time() - time0)
    transitionFunction = Transition.TransitionFromStateAndAction(worldRange)
    createTransitionProbabilityDict = Transition.CreateTransitionProbabilityDict(
        transitionFunction)
    transitionFromStateAndAction = Transition.TransitionFromStateAndAction(
        worldRange)
    transitionProbabilityDict = createTransitionProbabilityDict(
        stateList, actionList)
    createRewardDict = Reward.MultiTargetsRewardDict(stateList, actionList,
                                                     targetReward)
    runValueIteration = ValueIteration.ValueIteration(stateList, actionList,
                                                      decayRate,
                                                      convergeThreshold,
                                                      maxIterationStep)
    createPolicyFromValue = ValueIteration.PolicyFromValue(
        stateList, actionList, decayRate)
    runQLearning = QLearning.QLearning(alpha, gamma, epsilon,
                                       segmentTotalNumber, stateList,
                                       actionList,
                                       transitionFromStateAndAction)

    print('finish setting function', time.time() - time0)
    trainWolfPolicy = TrainWolfPolicyValueIteration(stateList,
                                                    transitionProbabilityDict,
                                                    createRewardDict,
                                                    runValueIteration,
                                                    createPolicyFromValue)
    # trainWolfPolicy = TrainWolfPolicyQLearning(stateList, createRewardDict, runQLearning)
                ## MC approximation (a self-contained sketch follows this snippet)
                for k in range(n_samples):
                    # accumulate u[next_state] as a Monte Carlo estimate of the next-state value
                    next_state = np.random.choice(mdp.n_states, p = P_sa)
                    U_next += u[next_state%representation_size]
                U_next /= n_samples
                Q[a] = mdp.get_reward(s, a) + gamma * U_next
            V[s] = max(Q)
            policy[s] = np.argmax(Q)
            #print(V)
        cnt = np.zeros(representation_size)
        u = np.zeros(representation_size)
        # Minimise \sum_s |V(s) - u(s)|^2 with SGD
        for k in range(n_iterations):
            for s in SampledStates:
                s = np.random.choice(mdp.n_states)
                s_hat = s % representation_size
                u[s_hat] += 0.1 * (V[s] - u[s_hat])

    return policy, V

n_actions = 4
n_states = 64
n_iterations = 1000
gamma = 0.9
mdp = MDP.DiscreteMDP(n_states, n_actions)
a_policy, a_V= approximate_value_iteration(mdp, n_iterations, gamma, 32)
policy, V, Q= ValueIteration.value_iteration(mdp, n_iterations, gamma)
print(V)
print(a_V)
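
The fragment above is cut off before its function header, so here is a hedged, self-contained reconstruction of an approximate value iteration routine with the same structure: a Monte Carlo estimate of the next-state value through a compressed table u of size representation_size, followed by stochastic updates that fit u to V. Only mdp.n_states and mdp.get_reward(s, a) appear in the visible code; the outer loop, the initialisation, the n_samples/n_outer defaults, and the mdp.n_actions and mdp.get_transition_probabilities(s, a) accesses are assumptions made for illustration.

import numpy as np

def approximate_value_iteration_sketch(mdp, n_iterations, gamma, representation_size,
                                       n_samples=10, n_outer=20):
    # Compressed value table u: state s is aggregated into cell s % representation_size.
    V = np.zeros(mdp.n_states)
    policy = np.zeros(mdp.n_states, dtype=int)
    u = np.zeros(representation_size)
    for _ in range(n_outer):                     # assumed outer loop over sweeps
        for s in range(mdp.n_states):
            Q = np.zeros(mdp.n_actions)          # mdp.n_actions is assumed
            for a in range(mdp.n_actions):
                P_sa = mdp.get_transition_probabilities(s, a)  # assumed API: P(. | s, a)
                # Monte Carlo estimate of the expected next-state value under u
                U_next = 0.0
                for _ in range(n_samples):
                    next_state = np.random.choice(mdp.n_states, p=P_sa)
                    U_next += u[next_state % representation_size]
                U_next /= n_samples
                Q[a] = mdp.get_reward(s, a) + gamma * U_next
            V[s] = np.max(Q)
            policy[s] = np.argmax(Q)
        # Fit the compressed table to V: minimise sum_s |V(s) - u(s % representation_size)|^2
        u = np.zeros(representation_size)
        for _ in range(n_iterations):
            s = np.random.choice(mdp.n_states)
            s_hat = s % representation_size
            u[s_hat] += 0.1 * (V[s] - u[s_hat])
    return policy, V

Under those assumptions it could be called like the original, e.g. approximate_value_iteration_sketch(mdp, n_iterations, gamma, 32).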
Example #19
        return 5
    else:
        return 0


"""
值迭代
"""

print("\n")
print("===============================")
print("\n")

print("确定情况下的值迭代:")
print("Q值变化过程:")
vi = ValueIteration(S, A, dP, dR, gama)
vi.run()
print("结果:")
print(vi.get_h())

print("\n")
print("===============================")
print("\n")

print("随机情况下的值迭代:")
print("Q值变化过程:")
vi = ValueIteration(S, A, aP, aR, gama)
vi.run()
print("结果:")
print(vi.get_h())
"""
def run_lock_problem():
    q_diffs_explore = np.array((1, 1))
    q_diffs_exploit = np.array((1, 1))
    diffs_value = np.array((1, 1))
    diffs_policy = np.array((1, 1))

    sequence = np.array([1, 2, 1, 2])
    minor_reward = 2
    major_reward = 100
    state_count = sum(sequence) + 1

    # Q EXPLORE # FIND CORRECT SEQUENCE 90% OF TIME
    learner = ql.QLearner(num_states=state_count, num_actions = 2, \
        alpha = 0.3, gamma = 0.9, exr = 0.9998, excr = 0.9998)
    episodes = 800
    q_diffs_explore = train_lock_learner(sequence, episodes, learner)
    Pi_star = np.zeros([state_count, 1])
    for i in range(0, Pi_star.shape[0]):
        Pi_star[i][0] = np.argmax(learner.q[i])
    print(learner.q)
    print('This is the policy given by Q Learner Explorer!')
    printsequence(Pi_star)
    print('')
    #print(np.round_(Pi_star))

    # Q EXPLOIT
    learner = ql.QLearner(num_states=state_count, num_actions = 2, \
        alpha = 0.3, gamma = 0.9, exr = 0.6, excr = 0.9998)
    episodes = 800
    q_diffs_exploit = train_lock_learner(sequence, episodes, learner)
    Pi_star = np.zeros([state_count, 1])
    for i in range(0, Pi_star.shape[0]):
        Pi_star[i][0] = np.argmax(learner.q[i])
    print('This is the policy given by Q Learner Exploiter!')
    printsequence(Pi_star)
    print('')

    # VALUE LEARNER
    state_count = sum(sequence) + 2
    R_f = np.zeros([state_count, 1])
    R_f[-1, 0] = minor_reward
    R_f[-2, 0] = major_reward

    learner = vl.ValueIteration(gamma=0.9, num_states=state_count, num_actions=2, R_f = R_f, \
         return_max_V = return_max_V_lock, env = sequence)
    episodes = 20
    diffs_value = np.zeros((episodes, 1))
    #  for episode in range(0,episodes):
    diff = -1
    episode = 0
    while diff != 0:
        V_old = learner.V_star.copy()
        learner.iterate_V()
        V_new = learner.V_star.copy()
        diff = np.absolute((V_new - V_old)).max()
        diffs_value[episode, 0] = diff
        episode += 1
    policy = learner.return_policy()
    print('This is the policy given by Value Iteration!')
    printsequence(policy)
    print('')

    # POLICY LEARNER
    learner = pl.PolicyIteration(gamma=0.9, num_states=state_count, num_actions=2, R_f = R_f, \
         return_max_V = return_max_V_lock, env = sequence)
    episodes = 20
    diffs_policy = np.zeros((episodes, 1))
    for episode in range(0, episodes):
        P_old = learner.Pi_star.copy()
        learner.iterate_P()
        P_new = learner.Pi_star.copy()
        diff = np.absolute((P_new - P_old)).sum()
        diffs_policy[episode, 0] = diff
    policy = learner.Pi_star
    print('This is the policy given by Policy Iteration!')
    printsequence(policy)
    print('')

    # PLOT RESULTS
    fig, axes = plt.subplots(2, 2)
    fig.tight_layout()
    plt.subplots_adjust(hspace=0.35)

    ax = axes[1, 0]
    ax.plot(q_diffs_explore)
    ax.title.set_text('Exploring Q Learner (explore rate = 0.9998)')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Q table')

    ax = axes[1, 1]
    ax.plot(q_diffs_exploit)
    ax.title.set_text('Exploiting Q Learner (explore rate = 0.6)')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Q table')

    ax = axes[0, 0]
    ax.plot(diffs_value)
    ax.title.set_text('Value Iteration')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Value table')

    ax = axes[0, 1]
    ax.plot(diffs_policy)
    ax.title.set_text('Policy Iteration')
    ax.xaxis.set_major_formatter(FormatStrFormatter('%d'))
    ax.set_xlabel('Iteration')
    ax.set_ylabel('Max abs. update on Policy table')
    plt.show()
Example #21
 def calculate_policy(self):
     mdp = self.belief.get_MDP_sample()
     self.policy, self.V, _ = ValueIteration.value_iteration(mdp, self.n_iterations, self.discount, self.V)