Code example #1
File: rl.py Project: stella81088/aima-python
def demoQLearningAgent():
    print '--------------------'
    print 'DEMO QLearningAgent'
    print '--------------------'
    # Setup values
    policy = {
        (0, 1): (0, 1),
        (1, 2): (1, 0),
        (3, 2): None,
        (0, 0): (0, 1),
        (3, 0): (-1, 0),
        (3, 1): None,
        (2, 1): (0, 1),
        (2, 0): (0, 1),
        (2, 2): (1, 0),
        (1, 0): (1, 0),
        (0, 2): (1, 0)
    }

    time_start = time()
    trials = 100
    agent = QLearningAgent(Fig[17, 1])
    for i in range(0, trials):
        execute_trial(agent, Fig[17, 1])
    time_end = time()

    print 'Executed %i trials' % trials
    print 'Took %d seconds' % (time_end - time_start)
    print 'Utilities: %s' % {s: max(agent.Q[s].values()) for s in agent.Q}
    print '\nCorrect Utilities (estimated by value iteration):'
    print value_iteration(Fig[17, 1])
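The aima-python demos in this listing call value_iteration(Fig[17, 1]) as a black box that maps an MDP (here, presumably the 4x3 grid world of AIMA Figure 17.1) to a dictionary of state utilities. For orientation, a minimal sketch of the Bellman-update loop such a function typically runs is shown below; the name value_iteration_sketch and the generic interface (mdp.states, mdp.actions(s), mdp.R(s), mdp.T(s, a), mdp.gamma) are illustrative assumptions rather than code from any of the projects above.

def value_iteration_sketch(mdp, epsilon=0.001):
    """Return a dict mapping each state to its utility, computed by repeated
    Bellman backups until the largest per-sweep change is small enough.
    Assumes an aima-style interface: mdp.states, mdp.actions(s), mdp.R(s),
    mdp.T(s, a) -> [(prob, next_state), ...] and mdp.gamma (all assumed)."""
    U1 = {s: 0.0 for s in mdp.states}
    while True:
        U, delta = U1.copy(), 0.0
        for s in mdp.states:
            # Bellman backup: immediate reward plus the discounted expected
            # utility of the best action's successors.
            U1[s] = mdp.R(s) + mdp.gamma * max(
                sum(p * U[s1] for (p, s1) in mdp.T(s, a))
                for a in mdp.actions(s))
            delta = max(delta, abs(U1[s] - U[s]))
        # Standard stopping rule; fall back to epsilon itself when gamma == 1.
        bound = epsilon * (1 - mdp.gamma) / mdp.gamma if mdp.gamma < 1 else epsilon
        if delta <= bound:
            return U1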
Code example #2
File: main.py Project: diegotbl/Lab4-CTC17
def main():
    board = Board()
    board.print()
    robot = Robot(board)

    value_iteration(board, robot)
    determine_policy(board)
Code example #3
File: rl.py Project: yeesian/aima-python
def demoPassiveTDAgent():
    print '--------------------'
    print 'DEMO PassiveTDAgent'
    print '--------------------'
    # Setup values
    policy = {(0, 1): (0, 1),
              (1, 2): (1, 0),
              (3, 2): None,
              (0, 0): (0, 1),
              (3, 0): (-1, 0),
              (3, 1): None,
              (2, 1): (0, 1),
              (2, 0): (0, 1),
              (2, 2): (1, 0),
              (1, 0): (1, 0),
              (0, 2): (1, 0)}
    
    time_start = time()
    trials = 100
    agent = PassiveTDAgent(Fig[17,1], policy)
    for i in range (0,trials):
        execute_trial(agent,Fig[17,1])
    time_end = time()
    
    print 'Executed %i trials' % trials
    print 'Took %d seconds' % (time_end - time_start)
    print 'Utilities: %s' % agent.U
    print '\nCorrect Utilities (estimated by value iteration):'
    print value_iteration(Fig[17,1])
Code example #4
def main():
    g = Grid(4, 4)

    terminals = [{
        "x": 3,
        "y": 0,
        "reward": 1
    }, {
        "x": 1,
        "y": 3,
        "reward": 1
    }, {
        "x": 2,
        "y": 3,
        "reward": -10
    }, {
        "x": 3,
        "y": 3,
        "reward": 10
    }]
    blocks = [{"x": 1, "y": 1}]

    g.init_world(terminals, blocks)

    np.random.seed(62)

    mdp.value_iteration(g, -0.02, 0.8, 0.8)

    mdp.policy_iteration(g, -0.02, 0.8, 0.8)

    mdp.q_function(g, "s6", -0.02, 0.8, 0.1, 0.1, 0.8, 1000000)
Code example #5
File: rl.py Project: stella81088/aima-python
def demoPassiveTDAgent():
    print '--------------------'
    print 'DEMO PassiveTDAgent'
    print '--------------------'
    # Setup values
    policy = {
        (0, 1): (0, 1),
        (1, 2): (1, 0),
        (3, 2): None,
        (0, 0): (0, 1),
        (3, 0): (-1, 0),
        (3, 1): None,
        (2, 1): (0, 1),
        (2, 0): (0, 1),
        (2, 2): (1, 0),
        (1, 0): (1, 0),
        (0, 2): (1, 0)
    }

    time_start = time()
    trials = 100
    agent = PassiveTDAgent(Fig[17, 1], policy)
    for i in range(0, trials):
        execute_trial(agent, Fig[17, 1])
    time_end = time()

    print 'Executed %i trials' % trials
    print 'Took %d seconds' % (time_end - time_start)
    print 'Utilities: %s' % agent.U
    print '\nCorrect Utilities (estimated by value iteration):'
    print value_iteration(Fig[17, 1])
Code example #6
File: rl.py Project: yeesian/aima-python
def demoQLearningAgent():
    print '--------------------'
    print 'DEMO QLearningAgent'
    print '--------------------'
    # Setup values
    policy = {(0, 1): (0, 1),
              (1, 2): (1, 0),
              (3, 2): None,
              (0, 0): (0, 1),
              (3, 0): (-1, 0),
              (3, 1): None,
              (2, 1): (0, 1),
              (2, 0): (0, 1),
              (2, 2): (1, 0),
              (1, 0): (1, 0),
              (0, 2): (1, 0)}
    
    time_start = time()
    trials = 100
    agent = QLearningAgent(Fig[17,1])
    for i in range (0,trials):
        execute_trial(agent,Fig[17,1])
    time_end = time()

    print 'Executed %i trials' % trials
    print 'Took %d seconds' % (time_end - time_start)
    print 'Utilities: %s' % {s:max(agent.Q[s].values()) for s in agent.Q}
    print '\nCorrect Utilities (estimated by value iteration):'
    print value_iteration(Fig[17,1])
Code example #7
File: policy.py Project: adn5327/grid-world
def policy(mazey, terminal = True):
	#call value iteration here
	value_iteration(mazey, terminal)
	for i in range(mazey.size):
		for j in range(mazey.size):
			policy = max_of_neighbors(mazey, i, j)
			if terminal and mazey.grid[i][j].is_terminal():
				policy = 't'
			if mazey.grid[i][j].is_wall():
				policy = 'w'
			mazey.grid[i][j].policy = policy
Code example #8
File: ex2.py Project: ApplyHiTech/ml_hw2
    def __init__(self, problem, steps):
        self.original_problem = deepcopy(problem)
        start_state, special_things = checker.problem_to_state(problem)
        self.steps = steps
        self.current_state_of_board, self.current_special_things = checker.problem_to_state(
            problem)
        self.eval = checker.Evaluator(0, problem, steps)
        self.act_list = ACT_LIST
        all_states, trans_dict, rewards = self.compute_states
        print(all_states)
        print(rewards)
        mdp.MDP.__init__(self,
                         init=start_state,
                         actlist=["U", "D", "R", "L"],
                         terminals=[],
                         transitions=trans_dict,
                         states=all_states,
                         gamma=0.01)

        self.reward = rewards  # mdp rewards dictionary

        self.U = mdp.value_iteration(self)

        self.pi = mdp.best_policy(self, self.U)

        # print(mdp.best_policy(self, self.U))
        print("end of initialization\n\n\n\n")
        return
Code example #9
 def solve(self,
           episodes=200,
           iterations=200,
           reset=True,
           seed=False,
           gamma=0.95):
     mdp = EnvMDP(self.env, gamma=gamma)
     self.policy = policy_iteration(mdp)
     self.U = value_iteration(mdp, epsilon=0.000000000001)
Code example #10
File: game.py Project: davidbenhaim/pyquest
 def __init__(self, player, territories, start_state):
     self.State = make_State(territories)
     self.player = player
     self.territories = {t.name:t for t in territories}
     self.state = start_state
     self.turn = 0
     if not self.player:
         init_state = self.state
         self.comp = mdp.QuestMDP(set(self.territories.values()), init_state)
         self.comp.generate_states()
         self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
         print self.state
Code example #11
    def _value_iteration_slow(self):
        old_values = dict(self.mdp.values)
        for i in range(100):
            values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
            policy = policy_extraction(values, self.mdp)
            self.gridworldwindow.update_grid(values, policy)
            self.mdp.update_values(values)
            self.mdp.update_policies(policy)

            self.gridworldwindow.window.update()
            time.sleep(0.25)
            self.gridworldwindow.window.update()

            new_values = dict(values)
            if values_converged(new_values, old_values):
                break

            old_values = new_values
        self.gridworldwindow.show_dialog('Value Iteration has converged in {} steps!'.format(i+1))
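The loop above stops as soon as values_converged reports that two successive sweeps agree; that helper is not shown in the snippet. A plausible minimal version, with the tolerance value chosen arbitrarily here, might be:

def values_converged(new_values, old_values, tol=1e-4):
    # Hypothetical helper: True when no state's value moved by more than tol
    # between two successive value-iteration sweeps.
    return all(abs(new_values[s] - old_values.get(s, 0.0)) <= tol
               for s in new_values)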
Code example #12
File: game.py Project: davidbenhaim/pyquest
 def take_turn(self):
     if self.player:
         actions = self.actions(self.state)
         print "Actions available:"
         for i,action in enumerate(actions):
             print '%i: %s' % (i, action)
         usr_input = None
         while usr_input not in [i for i in range(len(actions))]:
             usr_input = input("Action: ")
         self.do_action(actions[usr_input])
         print actions[usr_input]
         self.print_state()
     else:
         # time.sleep(3)
         if self.state not in self.policy:
             self.comp = mdp.QuestMDP(set(self.territories.values()), self.state)
             self.comp.generate_states()
             self.policy = mdp.best_policy(self.comp, mdp.value_iteration(self.comp))
         # pdb.set_trace()
         action = self.policy[self.state]
         self.do_action(action)
         print action
Code example #13
File: ex2.py Project: ApplyHiTech/ml_hw2
 def choose_next_action(self, state):
     state_of_board, special_things = checker.problem_to_state(state)
     eval_state = checker.Evaluator(0, state, 1)
     if not "pacman" in special_things:
         # check if PACMAN is still in the game
         return "reset"
     # if pacman is still in the game, then, choose best next step.
     s = self.eval_state_to_ab_state_plus_md(eval_state)
     if s in self.pi:
         new_min_md = 0
         # check if we need to update R based on Ghost location:
         min_md = self.find_min_md_from_ghosts(eval_state)
         # we check if there any ghosts on the board, and if they are very close.
         if min_md != -100 and min_md <= 2:
             print("performing update to R")
             # start scanning for a better position
             for action in ["U", "L", "R", "D"]:
                 child_eval = deepcopy(eval_state)
                 checker.Evaluator.change_state_after_action(
                     child_eval, action)
                 temp_new_md = self.find_min_md_from_ghosts(child_eval)
                 if temp_new_md != -100 and temp_new_md > new_min_md:
                     new_min_md = temp_new_md
                     next_state_md = self.eval_state_to_ab_state_plus_md(
                         child_eval)
                     self.rewards[next_state_md] = self.rewards[
                         next_state_md] + 10 * new_min_md
             # TODO: we might be yielding a state that didn't exist before
             self.U = mdp.value_iteration(self)
             self.pi = mdp.best_policy(self, self.U)
         return self.pi[s]
     else:
         a = ["U", "D", "L", "R"]
         print("random chosen")
         # maybe here we should go into a simple dfs to find rest of the route to finish the board? @meir
         index = random.randint(0, 3)
         return a[index]
Code example #14
  if header:
    table = [header] + table
  table = [[(numfmt % x if isnumber(x) else x) for x in row]
           for row in table]
  maxlen = lambda seq: max(map(len, seq))
  sizes = map(maxlen, zip(*[map(str, row) for row in table]))
  for row in table:
    print sep.join(getattr(str(x), j)(size)
                   for (j, size, x) in zip(justs, sizes, row))

prize = 1
trap = -1
neg = -0.4

mdp1 = GridMDP([[neg, trap, prize],
                [neg, None, neg],
                [neg,  neg, neg]],
                terminals=[(1, 2), (2, 2)],
                error=.8)

print "GRID"
print
print "Value iteration"
pi = best_policy(mdp1, value_iteration(mdp1, .01))
print_table(mdp1.to_arrows(pi))

print "Policy iteration"
print_table(mdp1.to_arrows(policy_iteration(mdp1)))

print "Q Learning"
pi = best_policyQ(mdp1, qlearn(mdp1, (0, 0), 5000))
print_table(mdp1.to_arrows(pi))
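best_policy in the example above turns the utilities returned by value_iteration into a greedy policy. A minimal sketch of that extraction step, again written against a generic aima-style MDP interface (the helper name and interface are illustrative assumptions, not this project's code):

def best_policy_sketch(mdp, U):
    # For each state, pick the action whose one-step expected utility under U
    # is largest (ties broken arbitrarily by max).
    def expected_utility(a, s):
        return sum(p * U[s1] for (p, s1) in mdp.T(s, a))
    return {s: max(mdp.actions(s), key=lambda a: expected_utility(a, s))
            for s in mdp.states}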
Code example #15
File: a3.py Project: zpixley/Portfolio
def partD():
    policy = mdp.best_policy(chatbot_mdp, mdp.value_iteration(chatbot_mdp))
    print('State: choice\n-----------------------')
    for s in chatbot_mdp.states:
        print(str(s) + ': ' + str(policy[s]))
Code example #16
for mdp_grid, term_grid in unique_mdps:
    print("--"*10)
    state_features = mdp_grid
    terminals = mdp_gen.get_terminals_from_grid(term_grid)
    #print("state features\n",state_features)
    state_features = mdp_gen.categorical_to_one_hot_features(state_features, num_features)
    print('one hot features', state_features)

    world = mdp.LinearFeatureGridWorld(state_features, true_weights, initials, terminals, gamma)
    mdp_family.append(world)

#plot for visualization
all_opts = []
all_features = []
for i,mdp_env in enumerate(mdp_family):
    V = mdp.value_iteration(mdp_env, epsilon=precision)
    Qopt = mdp.compute_q_values(mdp_env, V=V, eps=precision)
    opt_policy = mdp.find_optimal_policy(mdp_env, Q = Qopt, epsilon=precision)
    print(opt_policy)
    print(mdp_env.features)
    all_opts.append(opt_policy)
    all_features.append(mdp_env.features)
    #input()
filename = "./data_analysis/figs/twoXtwo/firstthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[:3], all_features[:3], 1, 3, filename=filename)
filename = "./data_analysis/figs/twoXtwo/lastthree.png"
mdp_plot.plot_optimal_policy_vav_grid(all_opts[-3:], all_features[-3:], 1, 3, filename=filename)
#plt.show()

family_teacher = machine_teaching.MdpFamilyTeacher(mdp_family, precision, debug)
mdp_set_cover = family_teacher.get_machine_teaching_mdps()
Code example #17
File: a3.py Project: simrangidwani/CS1571
def mdpProblem(conversationLength):
    if conversationLength == 'short':
        t = {
            "D1": {
                "Respond-Resolved": [(0.30, "U1")],
                "Respond-notResolved": [(0.70, "D2")],
                "Redirect-Frustrated": [(0.20, "U2")],
                "Redirect-notFrustrated": [(0.80, "U3")]
            },
            "D2": {
                "Respond-Resolved": [(0.30, "U5")],
                "Respond-notResolved": [(0.70, "U4")],
                "Redirect-Frustrated": [(0.20, "U6")],
                "Redirect-notFrustrated": [(0.80, "U7")]
            }
        }
    elif conversationLength == 'medium':
        t = {
            "D1": {
                "Respond-Resolved": [(0.50, "U1")],
                "Respond-notResolved": [(0.50, "D2")],
                "Redirect-Frustrated": [(0.30, "U2")],
                "Redirect-notFrustrated": [(0.70, "U3")]
            },
            "D2": {
                "Respond-Resolved": [(0.50, "U5")],
                "Respond-notResolved": [(0.50, "U4")],
                "Redirect-Frustrated": [(0.30, "U6")],
                "Redirect-notFrustrated": [(0.70, "U7")]
            }
        }
    elif conversationLength == 'long':
        t = {
            "D1": {
                "Respond-Resolved": [(0.70, "U1")],
                "Respond-notResolved": [(0.30, "D2")],
                "Redirect-Frustrated": [(0.60, "U2")],
                "Redirect-notFrustrated": [(0.40, "U3")]
            },
            "D2": {
                "Respond-Resolved": [(0.70, "U5")],
                "Respond-notResolved": [(0.30, "U4")],
                "Redirect-Frustrated": [(0.60, "U6")],
                "Redirect-notFrustrated": [(0.40, "U7")]
            }
        }

    init = "D1"

    terminals = ["U1", "U2", "U3", "U4", "U5", "U6", "U7"]

    rewards = {
        "U1": 5,
        "U2": -1,
        "U3": 5,
        "U4": -3,
        "U5": 5,
        "U6": -1,
        "U7": 5,
        "D2": 0,
        "D1": 0
    }
    markov = createMDP(init, terminals, t, rewards, gamma=.9)
    solution = mdp.value_iteration(markov)
    print(solution)
Code example #18
def main():
    number_of_iterations = 10

    # expert_mdp = GridMDP([[-10, -5, 0, 0, 10],
    #         [-5, -3, 0, 0, 0],
    #         [0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0]],
    #         terminals=[(4,3)])

    # expert_mdp = GridMDP([[-10, -5, -3, -1, 0, 0, 0, 0, 0, 10],
    #         [-8, -5, -3, 0, 0, 0, 0, 0, 0, 0],
    #         [-5, -2, -1, 0, 0, 0, 0, 0, 0, 0],
    #         [-3, -1, 0, 0, 0, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
    #         terminals=[(9,4)])
    #
    # expert_mdp = GridMDP([[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #                     [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                     [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #                     [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #                     [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]],
    #                     terminals=[(9,4)])
    #
    # rewards = [[0, 0, 0, 0, -1, -1, 0, 0, 0, 10],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -5, -5, -3, 0, 0, 0],
    #            [0, 0, 0, -3, -3, -3, -3, 0, 0, 0],
    #            [0, 0, 0, 0, 0, -1, -1, 0, 0, 0]]
    #

    rewards = [[0, 0, 0, 0, -8, -8, 0, 0, 0, 10],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, -8, -10, -10, -8, 0, 0, 0],
               [0, 0, 0, 0, 0, -8, -8, 0, 0, 0]]

    # rewards = [[-6, -3, -1, 0, 0, 0, 0, 0, 0, 10],
    #             [-3, -3, -1, 0, 0, 0, 0, 0, 0, 0],
    #             [-1, -1, -1, 0, 0, 0, 0, -1, -1, -1],
    #             [0, 0, 0, 0, 0, 0, 0, -1, -3, -3],
    #             [0, 0, 0, 0, 0, 0, 0, -1, -3, -6]]
    #
    # rewards = [[0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0, 10],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, -3, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0],
    #         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -3, -3, -3, 0, 0, 0, 0, 0, 0]]



    expert_mdp = mdp.GridMDP(rewards,
                         terminals=[(9, 4)])

    expert_trace = mdp.best_policy(expert_mdp, mdp.value_iteration(expert_mdp, 0.001))
    print "Expert rewards:"
    expert_mdp.print_rewards()
    print "Expert policy:"
    utils.print_table(expert_mdp.to_arrows(expert_trace))
    print "---------------"

    expert_trace.pop((0,1))
    expert_trace.pop((0,2))
    expert_trace.pop((0,3))


    mybirl = birl.BIRL(expert_trace, expert_mdp.get_grid_size(), expert_mdp.terminals,
                partial(calculate_error_sum, expert_mdp), birl_iteration=2, step_size=1.0)
    run_multiple_birl(mybirl, expert_mdp, expert_trace, number_of_iterations)
Code example #19
File: sp.py Project: fmgc/mdp
import mdp
from sputil import *

if __name__ == "__main__":
    from model2 import *

    p = map(obs2prob, [s,v,r,g,i,o])

    A = make_actions( p[0], p[1], p[2], p[3], p[4], p[5] )
    R = make_reward(pragmatic_reward)

    #print A
    V,P,ok = mdp.value_iteration(A,R)
    D = {0:[], 1:[]}
    for (k,v) in P.items():
        D[v].append(k)

    
    for k in D.keys():
        print A[k].name
        for x in D[k]:
            print x
            #(s,v,r,g,i,o) = x
            #if r == 0:
            #    print x
Code example #20
 def _value_iteration_100_steps(self):
     values = value_iteration(self.mdp.values, self.mdp, num_iter=100)
     policy = policy_extraction(values, self.mdp)
     self.gridworld.update_grid(values, policy)
     self.mdp.update_values(values)
     self.mdp.update_policy(policy)
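Unlike the epsilon-based calls elsewhere in the listing, this codebase passes the current value table and a fixed num_iter budget to value_iteration and extracts the policy in a separate step. A rough sketch of what such a bounded variant could look like, under the same assumed aima-style interface:

def value_iteration_bounded(values, mdp, num_iter=1):
    # Hypothetical fixed-budget variant: perform num_iter synchronous Bellman
    # sweeps starting from the supplied value table and return the new values.
    V = dict(values)
    for _ in range(num_iter):
        V_prev = dict(V)
        for s in mdp.states:
            V[s] = mdp.R(s) + mdp.gamma * max(
                sum(p * V_prev[s1] for (p, s1) in mdp.T(s, a))
                for a in mdp.actions(s))
    return V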
Code example #21
    ax2.plot(S,dp_solution_q)
    ax2.legend(('SARSA-LAMBDA','Value Iteration'))
    plt.xlabel('State Index')
    plt.ylabel('Optimal Value Function V*')
    plt.title('Comparison of SARSA-LAMBDA and value iteration for gridworld ')
    plt.show()
    '''
    #--------------------------------
    # Q-LAMBDA

    lambd = 0.9
    alpha = 10**(-2)
    n_episodes = 30000
    eps = 0.1
    [Q, E] = q_lambda(gamma, lambd, alpha, eps, n_episodes, S, A, sampler)
    best_action_2, dp_solution_q = mdp.value_iteration(S, A, P, R, gamma, pi)
    #print(Q)
    print(dp_solution_q)

    #best_action_extract
    best_actions, best_valfn = get_actions(Q, P)
    print(best_valfn)

    fig2, ax2 = plt.subplots()
    ax2.plot(S, best_valfn, marker='x')
    ax2.plot(S, dp_solution_q)
    ax2.legend(('Q-LAMBDA', 'Value Iteration'))
    plt.xlabel('State Index')
    plt.ylabel('Optimal Value Function V*')
    plt.title('Comparison of Q-LAMBDA and value iteration for gridworld ')
    plt.show()
Code example #22
                if key2 == 0:
                    key3 = key1 - 4
                elif key2 == 1:
                    key3 = key1 + 1
                elif key2 == 2:
                    key3 = key1 + 4
                elif key2 == 3:
                    key3 = key1 - 1
                else:
                    key3 = key1

                P_dict[key1][key2][key3] = 1

    [S, A, P, R, gamma, pi] = mdp.create_MDP(
        S, A, P_dict, R_dict, gamma,
        pi_dict)  # Creates the relevant matrices for the gridworld

    vi = mdp.evaluate_policy(S, A, P, R, gamma, pi)
    # Evaluates Initial random policy, Pg 12 of DP lecture
    print(
        vi
    )  # Printed values are different due to round offs in lecture numbers
    best_action, vk = mdp.policy_iteration(S, A, P, R, gamma, pi)
    # Policy Iteration to find the best policy
    print(
        best_action
    )  # Acc to the code, it finds only 1 best action for a given state, in decreasing priority of
    #action vector
    best_action_2, vk = mdp.value_iteration(S, A, P, R, gamma, pi)
    print(best_action_2)
Code example #23
    print("seed", seed)
    np.random.seed(seed)
    random.seed(seed)

    #First let's generate a random MDP
    state_features = eutils.create_random_features_row_col_m(
        num_rows, num_cols, num_features)
    #print("state features\n",state_features)
    true_weights = random_weights(num_features)
    print("true weights: ", true_weights)
    true_world = mdp.LinearFeatureGridWorld(state_features, true_weights,
                                            initials, terminals, gamma)
    print("rewards")
    true_world.print_rewards()
    print("value function")
    V = mdp.value_iteration(true_world)
    true_world.print_map(V)
    print("mdp features")
    utils.display_onehot_state_features(true_world)
    #find the optimal policy under this MDP
    Qopt = mdp.compute_q_values(true_world, V=V)
    opt_policy = mdp.find_optimal_policy(true_world, Q=Qopt)
    print("optimal policy")
    true_world.print_map(true_world.to_arrows(opt_policy))
    #input()
    #now find a bunch of other optimal policies for the same MDP but with different weight vectors.
    #TODO: I wonder if there is a better way to create these eval policies?
    # Can we efficiently solve for all of them or should they all be close? (e.g. rewards sampled from gaussian centerd on true reward?)
    world = copy.deepcopy(true_world)
    eval_policies = []
    eval_Qvalues = []
Code example #24
 def _value_iteration_1_step(self):
     values = value_iteration(self.mdp.values, self.mdp, num_iter=1)
     policy = policy_extraction(values, self.mdp)
     self.gridworldwindow.update_grid(values, policy)
     self.mdp.update_values(values)
     self.mdp.update_policies(policy)