Example #1
    def __init__(self):
        super(GraphicDisplay, self).__init__()
        self.title('Value Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.util = Util()
        self.agent = ValueIteration(self.util)
        self._build_env()
Example #2
    def clear(self):
        for i in self.texts:
            self.canvas.delete(i)

        for i in self.arrows:
            self.canvas.delete(i)

        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        self.agent = ValueIteration(self.util)
Example #3
    def __init__(self):
        super(GraphicDisplay, self).__init__()
        self.title('Value Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.env = Env()
        self.agent = ValueIteration(self.env)
        self._build_env()
        self.iteration_count = 0
        self.improvement_count = 0
        self.is_moving = 0
Example #4
    def test_5x5_maze_value_iteration(self):
        env = MazeEnvSpecial5x5()
        alg = ValueIteration(env)
        alg.train()
        done_cnt = 0
        current_state = env.reset()
        while True:
            action = alg.predict(current_state)
            current_state, reward, done, _ = env.step(action)
            if done:
                break
            done_cnt += 1
        self.assertEqual(done_cnt, 15)
        self.assertEqual(reward, 1)
Example #5
    def clear(self):

        if self.is_moving == 0:
            self.iteration_count = 0
            self.improvement_count = 0
            for i in self.texts:
                self.canvas.delete(i)

            for i in self.arrows:
                self.canvas.delete(i)

            self.canvas.delete(self.rectangle)
            self.rectangle = self.canvas.create_image(
                50, 50, image=self.rectangle_image)
            self.agent = ValueIteration(self.env)
Example #6
def train(gamma, epsilon, n_samples, n_steps, n_epochs, learning_rate):
    env = gym.make('EasyFrozenLakeEnv-v0')
    value_iteration = ValueIteration(env.nS, env.nA, env.P)
    print('Env States: %i' % (env.nS))

    # preparing an expert
    # Calculate OPTIMAL POLICY
    V, policy = value_iteration(gamma, epsilon)
    # Use PI_opt to Sample
    trajectories = sample_trajectories(env, policy, n_steps, n_samples)
    # 1 if Visited, 0 if Not Visited
    # Feature = Average visited in 100 Samples of 10 Steps
    experts_feature = compute_experts_feature(env.nS, trajectories)
    print(experts_feature[:, ])

    # training
    feature_matrix = np.eye(env.nS)
    reward_function = Reward(env.nS)
    svf = StateVisitationFrequency(env.nS, env.nA, env.P)

    for i in range(n_epochs):
        # iterate: re-solve for the optimal policy under the current reward estimate
        V, policy = value_iteration(gamma, epsilon, reward_function)
        P = svf(policy, trajectories)
        grad = experts_feature - feature_matrix.T.dot(P)
        reward_function.update(learning_rate * grad)

    return reward_function(feature_matrix)
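The helper compute_experts_feature used above is not shown here. Based on the comments in Example #6 ("1 if Visited, 0 if Not Visited", averaged over the sampled trajectories), a minimal sketch could look like the following; the trajectory format (a sequence of state indices) is an assumption, and the project's actual helper may differ.

import numpy as np

def compute_experts_feature(n_states, trajectories):
    # Hypothetical reconstruction: each trajectory contributes a 0/1
    # "visited" indicator per state, and the expert feature vector is
    # the mean of those indicators over all sampled trajectories.
    feature = np.zeros(n_states)
    for trajectory in trajectories:
        visited = np.zeros(n_states)
        visited[list(trajectory)] = 1.0  # 1 if the state was visited, 0 otherwise
        feature += visited
    return feature / len(trajectories)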
Example #7
    def test_3x3_maze_value_iteration(self):
        env = MazeEnvSample3x3()
        alg = ValueIteration(env, max_iter=90)
        alg.train()
        expected_values = np.array([[2.048, 2.56, 3.2], [2.56, 3.2, 4],
                                    [3.2, 4, 5]])
        # expected values solve the Bellman equation x = 1 + 0.8 * x, giving V[2, 2] = 5, etc.
        # (see the worked check after this example)
        assert_array_almost_equal(alg.values, expected_values)
        done_cnt = 0
        current_state = env.reset()
        while True:
            action = alg.predict(current_state)
            current_state, reward, done, _ = env.step(action)
            if done:
                break
            done_cnt += 1
        self.assertEqual(done_cnt, 3)
        self.assertEqual(reward, 1)
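A quick sanity check of the Bellman comment in Example #7 (an illustrative sketch, not part of the original test suite): with a 0.8 discount and a reward of 1 at the goal, the goal value is the fixed point of x = 1 + 0.8 * x, and each step away from the goal scales that value by 0.8, which reproduces the expected_values grid.

gamma = 0.8
v = 1.0
for _ in range(100):  # iterate V <- 1 + gamma * V toward the fixed point
    v = 1 + gamma * v
print(round(v, 3))  # 5.0, the fixed point of x = 1 + 0.8 * x
print([round(v * gamma ** d, 3) for d in range(5)])  # [5.0, 4.0, 3.2, 2.56, 2.048]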
Example #8
    def human_model_goal(self, policy_index, theta, final_value_param):
        valiter = ValueIteration(self.grid_size)
        final_value = theta * final_value_param
        value, q_value, optimal_policies = valiter.value_iteration(
            final_value, self.discount)
        exp_q_vals = np.zeros(len(valiter.policies))
        for i in range(len(valiter.policies)):
            exp_q_vals[i] = np.exp(
                self.beta *
                q_value[self.robot_state[0], self.robot_state[1], i])

        sum_exp = 0
        for i in range(len(exp_q_vals)):
            if not np.isnan(exp_q_vals[i]):
                sum_exp += exp_q_vals[i]

        exp_q_vals /= sum_exp

        return exp_q_vals[policy_index]
Example #9
def main(args):
    # resolve path to world map definition
    if not args.world:
        world_map_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'world_map.txt')
    else:
        world_map_path = args.world

    print("Reading world from %s" % world_map_path)
    if not os.path.exists(world_map_path):
        raise IOError(
            "World map definition not found at its expected path: %s" %
            world_map_path)

    world = World(world_map_path)
    visualizer = Visualizer(world)

    # Value Iteration
    value_iteration = ValueIteration(world,
                                     one_step_cost_v1,
                                     discount_factor=args.gamma,
                                     eps=10e-10)
    value_iteration.execute()
    optimal_policy = value_iteration.extract_policy()

    fig_vi = plt.figure()
    visualizer.draw(fig_vi, optimal_policy, value_iteration.value_fn,
                    "Value Iteration (gamma = %.2f)" % args.gamma)

    # Policy Iteration
    policy_iteration = PolicyIteration(world,
                                       one_step_cost_v1,
                                       discount_factor=args.gamma)
    value_fn = policy_iteration.execute()

    fig_pi = plt.figure()
    visualizer.draw(fig_pi, policy_iteration.policy, value_fn,
                    "Policy Iteration (gamma = %.2f)" % args.gamma)

    plt.show()
Example #10
def execute_value_iteration_test(test, epsilon, output='console'):
    """
    Description
    -----------
    Function used to run the value_iteration for
    the given test.

    Parameters
    ----------
    test: Test() \\
        -- A Test() instance with the information
        needed to run the value_iteration.

    epsilon: float \\
        -- A floating point number, used to set
        the value_iteration's stopping decision.

    output: str \\
        -- A string that tells the function where
        its output should be sent.

    Returns
    -------
    ValueIteration() \\
        -- If no recognized output destination is given,
        returns the ValueIteration instance that was used.
    """

    value_iteration = ValueIteration(test, epsilon=epsilon)

    value_iteration.run()

    if output in ['console', 'file']:
        output_processing(output, test, value_iteration, 'ValueIteration')

    return value_iteration
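For reference, a typical call of the helper above might look like this (illustrative only; how a Test() instance is built is project-specific and assumed here):

test = Test()  # assumed to be constructed elsewhere in the project
vi = execute_value_iteration_test(test, epsilon=1e-3, output='console')
# vi is the ValueIteration instance, available for further inspection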
Example #11
def main(algorithm, track, x_start, y_start, discount, learning_rate, threshold, max_iterations, epsilon=None, reset_on_crash=False):
    """
    Program entry. Runs selected algorithm on selected track, at given coordinates, with given parameters
    :param algorithm: String
    :param track: List
    :param x_start: Int
    :param y_start: Int
    :param discount: Float
    :param learning_rate: Float
    :param threshold: Float
    :param max_iterations: Int
    :param epsilon: Float
    :param reset_on_crash: Boolean
    :return: None
    """
    with open(track) as f:
        specs = f.readline().strip().split(',')
        rows = int(specs[0])
        cols = int(specs[1])
        layout = f.read().splitlines()

        initial_state = (x_start, y_start, 0, 0)
        initial_action = (0, 0)

        agent = Car(initial_action, epsilon)
        environment = RaceTrack(rows, cols, layout, initial_state, reset_on_crash=reset_on_crash)

        if algorithm == 'value_iteration':
            value_iterator = ValueIteration(discount, threshold, max_iterations, environment, agent)
            value_iterator.run()
            path = value_iterator.extract_policy(initial_state)
            value_iterator.plot_max_diffs()
        elif algorithm == 'q_learning':
            q_learner = QLearning(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = q_learner.run()
            q_learner.plot_avg_cost()
        elif algorithm == 'sarsa':
            sarsa = Sarsa(discount, learning_rate, threshold, max_iterations, environment, agent)
            path = sarsa.run()
            sarsa.plot_avg_cost()
        else:
            print("No algorithm selected")
            return None
        draw_track(path, layout)
Example #12
def train(gamma, epsilon, n_samples, n_steps, n_epochs, learning_rate):
    env = gym.make('EasyFrozenLakeEnv-v0')
    value_iteration = ValueIteration(env.nS, env.nA, env.P)

    # preparing an expert
    V, policy = value_iteration(gamma, epsilon)
    trajectories = sample_trajectories(env, policy, n_steps, n_samples)
    experts_feature = compute_experts_feature(env.nS, trajectories)

    # training
    feature_matrix = np.eye(env.nS)
    reward_function = Reward(env.nS)
    svf = StateVisitationFrequency(env.nS, env.nA, env.P)

    for i in range(n_epochs):
        V, policy = value_iteration(gamma, epsilon, reward_function)
        P = svf(policy, trajectories)
        grad = experts_feature - feature_matrix.T.dot(P)
        reward_function.update(learning_rate * grad)

    return reward_function(feature_matrix)
Example #13
def main():
    aiType = 3
    worldSize = 6
    game = Game(aiType, worldSize)

    agent = Agent()
    pc = None
    policy = None
    if aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0],
                                                [0, 0, 100]])
    elif aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0],
                                                [0, 0, 100]])
    elif aiType == 3:
        policy = qLearningAgent()
        pc = PolicyConfiguration(inpRewards=[0, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0],
                                                [0, 0, 100]],
                                 inpFile="QLValues.p",
                                 inpTrainingLimit=1000)
    elif aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0],
                                                [0, 0, 100]],
                                 inpFile="AQLWeights.json",
                                 inpTrainingLimit=500)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()
    policy.config = pc

    agent.policy = policy

    game.agent = agent

    game.mainLoop()
Example #14
    def __init__(self, can, direction, inpAIType):
        self.flag = 1
        self.can = can
        self.direction = direction
        self.aiType = inpAIType
        self.agent = Agent()
        pc = None
        policy = None
        # inpRewards = [food reward, hazard reward, living reward, good location reward, bad location reward]
        #   the good and bad location rewards are only used for Q-learning
        #   (tried to use them to encourage graph searching; not really used and can give wonky results)
        # inpDiscounts = [gamma discount, alpha discount, epsilon explore chance]
        # inpStochastic = [forward action [forward chance, left chance, right chance],
        #                  left action [forward chance, left chance, right chance],
        #                  right action [forward chance, left chance, right chance]]
        # inpFile = file for weights or Q-values
        # (a sketch of this configuration container follows this example)
        if self.aiType == 1:
            policy = ValueIteration()
            pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                     inpDiscounts=[1, .1, .1],
                                     inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
        elif self.aiType == 2:
            policy = PolicyIteration()
            pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                     inpDiscounts=[1, .1, .1],
                                     inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
        elif self.aiType == 3:
            policy = qLearningAgent()
            # risk aversion (i.e. rarely going off the best path) seems to work best
            # this configuration also seemed to work:
            # pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1], inpDiscounts=[0.9, .2, .1],
            #                          inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
            pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, 0],
                                     inpDiscounts=[0.9, .2, .1],
                                     inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                     inpFile=None, inpTrainingLimit=20000)
        elif self.aiType == 4:
            policy = approximateQLearning()
            pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                     inpDiscounts=[0.9, .2, .1],
                                     inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                     inpFile=None, inpTrainingLimit=5000)
        else:
            policy = ValueIteration()
            pc = PolicyConfiguration()
        policy.config = pc

        self.agent.policy = policy
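PolicyConfiguration itself is not shown in these examples. A minimal sketch of such a container, inferred only from the keyword arguments and the comments above, is given below; the attribute names are assumptions and the real class in the source project may differ.

class PolicyConfiguration:
    # Hypothetical reconstruction of the configuration container used above.
    def __init__(self,
                 inpRewards=(0, -1, 0, 0, 0),       # [food, hazard, living, good location, bad location]
                 inpDiscounts=(0.9, 0.1, 0.1),      # [gamma, alpha, epsilon explore chance]
                 inpStochastic=((100, 0, 0), (0, 100, 0), (0, 0, 100)),  # per-action [forward, left, right] chances
                 inpFile=None,                      # file for persisting weights or Q-values
                 inpTrainingLimit=1000):            # assumed: cap on training episodes
        self.rewards = list(inpRewards)
        self.discounts = list(inpDiscounts)
        self.stochastic = [list(row) for row in inpStochastic]
        self.file = inpFile
        self.trainingLimit = inpTrainingLimit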
Example #15
class GraphicDisplay(tk.Tk):
    def __init__(self):
        super(GraphicDisplay, self).__init__()
        self.title('Value Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.util = Util()
        self.agent = ValueIteration(self.util)
        self._build_env()

    def _build_env(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=HEIGHT * UNIT,
                                width=WIDTH * UNIT)

        # Buttons
        iteration_button = tk.Button(self, text="Calculate", command=self.calculate_value)
        iteration_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10, window=iteration_button)

        policy_button = tk.Button(self, text="Print Policy", command=self.print_optimal_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10, window=policy_button)

        policy_button = tk.Button(self, text="Move", command=self.move_by_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10, window=policy_button)

        policy_button = tk.Button(self, text="Clear", command=self.clear)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10, window=policy_button)

        # create grids
        for c in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 80
            x0, y0, x1, y1 = c, 0, c, HEIGHT * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 80
            x0, y0, x1, y1 = 0, r, HEIGHT * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)

        # image_load
        self.up_image = ImageTk.PhotoImage(Image.open("../resources/up.png").resize((13, 13)))
        self.right_image = ImageTk.PhotoImage(Image.open("../resources/right.png").resize((13, 13)))
        self.left_image = ImageTk.PhotoImage(Image.open("../resources/left.png").resize((13, 13)))
        self.down_image = ImageTk.PhotoImage(Image.open("../resources/down.png").resize((13, 13)))
        self.rectangle_image = ImageTk.PhotoImage(
            Image.open("../resources/rectangle.png").resize((65, 65), Image.ANTIALIAS))
        self.triangle_image = ImageTk.PhotoImage(Image.open("../resources/triangle.png").resize((65, 65)))
        self.circle_image = ImageTk.PhotoImage(Image.open("../resources/circle.png").resize((65, 65)))

        # add image to canvas
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        self.hell1 = self.canvas.create_image(250, 150, image=self.triangle_image)
        self.hell2 = self.canvas.create_image(150, 250, image=self.triangle_image)
        self.circle = self.canvas.create_image(250, 250, image=self.circle_image)

        # add reward text
        self.text_reward(2, 2, "R : 1.0")
        self.text_reward(1, 2, "R : -1.0")
        self.text_reward(2, 1, "R : -1.0")

        # pack all
        self.canvas.pack()

    def clear(self):
        for i in self.texts:
            self.canvas.delete(i)

        for i in self.arrows:
            self.canvas.delete(i)

        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        self.agent = ValueIteration(self.util)

    def reset(self):
        self.update()
        time.sleep(0.5)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        # return observation
        return self.canvas.coords(self.rectangle)

    def text_value(self, row, col, contents, font='Helvetica', size=12, style='normal', anchor="nw"):
        origin_x, origin_y = 85, 70
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        return self.texts.append(self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor))

    def text_reward(self, row, col, contents, font='Helvetica', size=12, style='normal', anchor="nw"):
        origin_x, origin_y = 5, 5
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        return self.canvas.create_text(x, y, fill="black", text=contents, font=font, anchor=anchor)

    def step(self, action):
        s = self.canvas.coords(self.rectangle)

        base_action = np.array([0, 0])
        if action == 0:  # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (HEIGHT - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (WIDTH - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.rectangle, base_action[0], base_action[1])  # move agent
        s_ = self.canvas.coords(self.rectangle)  # next state
        # reward function
        if s_ == self.canvas.coords(self.circle):
            reward = 1
            done = True
        elif s_ in [self.canvas.coords(self.hell1), self.canvas.coords(self.hell2)]:
            reward = -1
            done = True
        else:
            reward = 0
            done = False

        return s_, reward, done

    def rectangle_move(self, action):

        base_action = np.array([0, 0])
        self.render()

        if action[0] == 1:  # down
            base_action[1] += UNIT
        elif action[0] == -1:  # up
            base_action[1] -= UNIT
        elif action[1] == 1:  # right
            base_action[0] += UNIT
        elif action[1] == -1:  # left
            base_action[0] -= UNIT

        self.canvas.move(self.rectangle, base_action[0], base_action[1])  # move agent

    def rectangle_location(self):
        temp = self.canvas.coords(self.rectangle)
        x = (temp[0] / 100) - 0.5
        y = (temp[1] / 100) - 0.5
        return int(y), int(x)

    def move_by_policy(self):
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
        agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]]
        while len(self.agent.get_action(agent_state, False)) != 0:
            agent_state = [self.rectangle_location()[0], self.rectangle_location()[1]]
            self.after(100, self.rectangle_move(self.agent.get_action(agent_state, True)))

    def draw_one_arrow(self, col, row, action):
        if action[0] == 1:  # down
            origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.down_image))

        elif action[0] == -1:  # up
            origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.up_image))

        elif action[1] == 1:  # right
            origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.right_image))

        elif action[1] == -1:  # left
            origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(self.canvas.create_image(origin_x, origin_y, image=self.left_image))

    def draw_from_values(self, state, action_list):

        i = state[0]
        j = state[1]

        for action in action_list:
            self.draw_one_arrow(i, j, action)

    def print_values(self, values):
        for i in range(WIDTH):
            for j in range(HEIGHT):
                self.text_value(i, j, values[i][j])

    def render(self):
        time.sleep(0.1)
        self.canvas.tag_raise(self.rectangle)
        self.update()

    def calculate_value(self):
        for i in self.texts:
            self.canvas.delete(i)
        self.agent.iteration()
        print(self.agent.get_value_table())
        self.print_values(self.agent.get_value_table())

    def print_optimal_policy(self):
        for i in self.arrows:
            self.canvas.delete(i)
        for state in self.util.get_all_states():
            action = self.agent.get_action(state, False)
            self.draw_from_values(state, action)
Example #16
def main(argv):
    # Get command line arguments
    try:
        opts, args = getopt.getopt(argv, "hpi:")
    except getopt.GetoptError:
        print("main.py [-p] [-i=n_iter]")
        sys.exit(1)
        
    # Plot switch
    plot = False
    
    # Default iterations
    n_iter = 100000
    
    # Parse command line arguments
    for opt, arg in opts:
        # Help
        if opt == "-h":
            print("main.py [-p] [-i=n_iter]")
            sys.exit()
        
        # Plot
        elif opt == "-p":
            plot = True
        
        # Number of iterations
        elif opt == "-i":
            n_iter = arg
            
    # Construct grid
    grid = construct_grid()
    
    # Find solution with value iteration
    print("Performing value iteration...")
    vi = ValueIteration(grid)
    vi = vi.solve()
    print("----------")
    print("The value for each cell is:")
    print("(x,y): value")
    for cell in grid:
        print(cell.name_x()+","+cell.name_y()+":     "+\
              str(vi[0][cell.get_name()]))
    print("----------")
    print("The policy found by value iteration is:")
    print("(x,y): action")
    for cell in grid:
        print(cell.name_x()+","+cell.name_y()+":     "+\
              str(vi[1][cell.get_name()]))
              
    # Find solution with Q-learning
    print("\n")
    print("Performing Q-learning...")
    ql = QLearning(grid, N_iter = int(n_iter))
    ql = ql.solve()
    ql_states = ql[0]
    ql_Q = ql[1]
    print("----------")
    
    print("The policy found by Q-learning is.")
    print("(x,y): action")
    
    actions = ["north","east","south","west"]
    for cell in grid:
        ind = ql_states.index(cell.get_name())
        action = actions[np.argmax(ql_Q[ind,])]
        print(cell.name_x()+","+cell.name_y()+":     "+str(action))
        
    if plot:
        # Convergence graph for q-learning: 
        # best Q-value of the best action for the START state 
        fig, ax = plt.subplots()
        ax.plot(ql[2][0:ql[3]], label = "Q-values for best action in START")
        ax.plot((0,ql[3]),(vi[0]["11"],vi[0]["11"]), label = "Values of START")
        plt.xlabel("Iterations")
        
    plt.show()
Example #17
import numpy as np
from value_iteration import ValueIteration

grid_size = [5, 5]
final_value = np.zeros((grid_size[0], grid_size[1]))
final_value[0][0] = 1
final_value[2][1] = -1
discount = 0.9

valiter = ValueIteration(grid_size)
value, q_value, optimal_policies = valiter.value_iteration(
    final_value, discount)
print(value)
print(optimal_policies)
Example #18
            soft_Q_policy[i][1] = 0
        # left
        if (i % X) == 0:
            soft_Q_policy[i][4] = soft_Q_policy[i][4] + soft_Q_policy[i][2]
            soft_Q_policy[i][2] = 0
        # right
        if (i % X) == X - 1:
            soft_Q_policy[i][4] = soft_Q_policy[i][4] + soft_Q_policy[i][3]
            soft_Q_policy[i][3] = 0

    # save the estimated reward
    np.savetxt("R_X5Y5.csv", est_reward.reshape((X, Y)), delimiter=", ")
    #print(est_reward)

    env_est = gridworld.GridWorld(grid_shape, est_reward)
    est_agent = ValueIteration(env_est, gamma)

    # compute the state values (also handles stochastic policies)
    V_est = est_agent.get_pi_value(soft_Q_policy)
    print(V_est)
    #np.savetxt("V_Pro_1.csv",V_est.reshape((5,5)),delimiter=", ")

    gap_sum_dist = []
    for q in range(len(traj)):
        pi_check = traj[q]
        # pi_check here is a trajectory (not a policy)
        # compute the gap
        q_gap_one_list = Q_seikika_gap(pi_check, X, Y, V_est, soft_Q_policy)
        q_gap_sum_list = Q_gap_sum_list(q_gap_one_list)
        #print(q_gap_sum_list[-1])
        gap_sum_dist.append(q_gap_sum_list[-1])
Example #19
    height = 8
    width = 8
    goal_co_ord = (3, 2)
    tile_size = (200, 200)

    surface, tiles = make_pygame(height, width, goal_co_ord, tile_size)

    grid = GridWorld(height, width, goal_co_ord=goal_co_ord)

    state_values = [0] * len(grid.states)
    update_tiles(state_values, tiles)

    random_policy = {action: 0.25 for action in grid.actions}

    solvers = {
        'dynamic-programming': DynamicProgramming(random_policy, grid),
        'value-iteration': ValueIteration(grid)
    }

    solver = solvers[args.solver]

    running = True
    done = False

    # while not done:
    while running:
        running = check_pygame()
        pygame.display.update()
        state_values, done = solver.forward(state_values)
        update_tiles(state_values, tiles)
Example #20
    parser.add_argument('-r',
                        '--renderEvery',
                        type=int,
                        default=0,
                        help="Render every nth episode. 0 to disable.")
    args = parser.parse_args()

    # Initialize the environment
    env = Environment(args.environment, args.numEpisodesPerEval,
                      args.renderEvery)

    if args.algorithm in ('SARSA', 'Qlearning', 'MonteCarlo', 'MinVar'):
        from value_iteration import ValueIteration
        policy = policies.DiscreteQfunction(env, args.hiddenLayers)
        algo = ValueIteration(policy,
                              gamma=args.gamma,
                              learnrate=args.learningRate,
                              estimator=args.algorithm)
    else:  # policy based methods
        # Initialize the policy
        if env.actionType == 'discrete':
            policy = policies.DiscretePolicy(env, args.hiddenLayers)
        elif env.actionType == 'continuous':
            policy = policies.GaussianPolicy(env, args.hiddenLayers,
                                             args.explorationNoise)
        else:
            raise Exception("Unreachable.")

        # Select a training algorithm.
        if args.algorithm == 'Reinforce' or args.algorithm == 'PG':
            from reinforce import Reinforce
            algo = Reinforce(policy,
Example #21
################################################################################
actions = [
    (-1,-1), (0,-1), (1,-1),
    (-1, 0), (0, 0), (1, 0),
    (-1, 1), (0, 1), (1, 1),
]

vl_opts = [0, 1,  2, 3, 4, 5, -5, -4, -3, -2, -1]

# tiny test track
################################################################################
track = dl.load_tinytrack()
simulator = TrackSimulator(track = track, min_velocity = min(vl_opts), max_velocity = max(vl_opts), crash_restart = False)

learner = ValueIteration(env = simulator, vl_opts = vl_opts, actions = actions,
                        gamma = 1.0, epsilon = 0.001)

learner = Q_SARSA_Learner(env = simulator, vl_opts = vl_opts,
                    actions = actions, alpha = 0.25, gamma = 0.9)

learner = Q_SARSA_Learner(env = simulator, vl_opts = vl_opts, actions = actions,
                    alpha = 0.25, gamma = 0.9, sarsa = True)

simulator.pretty_print()
trial_helper(simulator, learner, 50, 10, 'tinytrack', policy = None)
trial_helper(simulator, learner, 100000, 10, 'tinytrack-q', policy = None)
trial_helper(simulator, learner, 100000, 10, 'tinytrack-sarsa', policy = None)

# l-track
################################################################################
track = dl.load_l()