def execute_policy_iteration_test(test, output='console'):
    """
    Description
    -----------
    Function used to run the policy iteration for the given test.

    Parameters
    ----------
    test: Test() \\
        -- A Test() instance with the information needed to run the policy iteration.

    output: str \\
        -- A string that tells the function where its output should be written.

    Returns
    -------
    PolicyIteration() \\
        -- The PolicyIteration instance used, returned so it can still be
           inspected when no recognized output target is given.
    """
    policy_iteration = PolicyIteration(test)
    policy_iteration.run()

    # Write the results to the console or to a file when a recognized target is given.
    if output in ['console', 'file']:
        output_processing(output, test, policy_iteration, 'PolicyIteration')

    return policy_iteration
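# A minimal usage sketch for the function above. How a Test() is built is
# project-specific and not shown here, so the bare constructor call below is
# hypothetical; only execute_policy_iteration_test() itself comes from the snippet.
test = Test()   # hypothetical: construct the test case however the project expects
solver = execute_policy_iteration_test(test, output='console')
# The PolicyIteration instance is returned either way, so its results can still
# be inspected here even when no recognized output target was given.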
# Variant of clear() that also resets the evaluation/improvement counters and
# only redraws while the agent is not moving.
def clear(self):
    if self.is_moving == 0:
        self.evaluation_count = 0
        self.improvement_count = 0
        for i in self.texts:
            self.canvas.delete(i)
        for i in self.arrows:
            self.canvas.delete(i)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(
            50, 50, image=self.rectangle_image)
        self.agent = PolicyIteration(self.util)
def main(args):
    # Resolve path to the world map definition.
    if not args.world:
        world_map_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'world_map.txt')
    else:
        world_map_path = args.world

    print("Reading world from %s" % world_map_path)
    if not os.path.exists(world_map_path):
        raise IOError(
            "World map definition not found at its expected path: %s" % world_map_path)

    world = World(world_map_path)
    visualizer = Visualizer(world)

    # Value Iteration
    value_iteration = ValueIteration(world, one_step_cost_v1,
                                     discount_factor=args.gamma, eps=10e-10)
    value_iteration.execute()
    optimal_policy = value_iteration.extract_policy()

    fig_vi = plt.figure()
    visualizer.draw(fig_vi, optimal_policy, value_iteration.value_fn,
                    "Value Iteration (gamma = %.2f)" % args.gamma)

    # Policy Iteration
    policy_iteration = PolicyIteration(world, one_step_cost_v1,
                                       discount_factor=args.gamma)
    value_fn = policy_iteration.execute()

    fig_pi = plt.figure()
    visualizer.draw(fig_pi, policy_iteration.policy, value_fn,
                    "Policy Iteration (gamma = %.2f)" % args.gamma)

    plt.show()
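# Both solvers above target the same MDP, so their value functions should agree
# up to the convergence tolerance. A minimal sanity check that could be added at
# the end of main(), assuming value_iteration.value_fn and the value_fn returned
# by policy_iteration.execute() are NumPy arrays of the same shape (the snippet
# does not show their exact type):
import numpy as np

assert np.allclose(value_iteration.value_fn, value_fn, atol=1e-6), \
    "value iteration and policy iteration disagree beyond tolerance"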
def main():
    aiType = 3
    worldSize = 6
    game = Game(aiType, worldSize)
    agent = Agent()
    pc = None
    policy = None

    if aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif aiType == 3:
        policy = qLearningAgent()
        pc = PolicyConfiguration(inpRewards=[0, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile="QLValues.p",
                                 inpTrainingLimit=1000)
    elif aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile="AQLWeights.json",
                                 inpTrainingLimit=500)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()

    policy.config = pc
    agent.policy = policy
    game.agent = agent
    game.mainLoop()
def __init__(self, can, direction, inpAIType):
    self.flag = 1
    self.can = can
    self.direction = direction
    self.aiType = inpAIType
    self.agent = Agent()
    pc = None
    policy = None

    # inpRewards = [food reward, hazard reward, living reward,
    #               good location reward, bad location reward]
    #   The good/bad location rewards are only used for Q-learning; they were an
    #   attempt to encourage graph searching, are not really used, and can give
    #   wonky results.
    # inpDiscounts = [gamma discount, alpha discount, epsilon explore chance]
    # inpStochastic = [forward action [forward chance, left chance, right chance],
    #                  left action    [forward chance, left chance, right chance],
    #                  right action   [forward chance, left chance, right chance]]
    # inpFile = file for weights or Q-values
    if self.aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif self.aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif self.aiType == 3:
        policy = qLearningAgent()
        # Risk aversion (rarely going off the best path) seems to work best.
        # This configuration also seemed to work:
        # pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
        #                          inpDiscounts=[0.9, .2, .1],
        #                          inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, 0],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile=None,
                                 inpTrainingLimit=20000)
    elif self.aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile=None,
                                 inpTrainingLimit=5000)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()

    policy.config = pc
    self.agent.policy = policy
    # Fragment: the loop below sits inside create_game_env(); the earlier lines
    # that set up size, cell_matrix, env and states are not shown.
    for i in range(size):
        for j in range(size):
            cell_type = cell_matrix[j][i]
            if cell_type == CellType.WHOOPING:
                is_terminal = True
                reward = -10
            elif cell_type == CellType.KFC:
                is_terminal = True
                reward = 10
            else:
                is_terminal = False
                reward = -1
            cell = CellState((i, j), reward, cell_type, is_terminal)
            env.place_cell(i, j, cell)
            states.append(cell)
    return env, states


env, states = create_game_env()
agent = Agent("policy_eval", (0, 0))

policy_iter_algo = PolicyIteration(states)
policy = policy_iter_algo.run()

game = Game(Config, Controller, env, agent, policy)

# initiate env
game.draw_env()
pygame.display.update()
game.start()
        # Fragment: this branch fills in the "down" action inside the nested
        # loops over grid cells; the other actions are handled above (not shown).
        if i < row_num - 2:
            trans_probs[i * col_num + j][down][(i + 1) * col_num + j] = p
        else:
            trans_probs[i * col_num + j][down][i * col_num + j] = p

# Zero out all outgoing transition probabilities for states 0, 15, 16, 18 and 19.
trans_probs[0][:] = 0
trans_probs[15][:] = 0
trans_probs[16][:] = 0
trans_probs[18][:] = 0
trans_probs[19][:] = 0

# Add transitions for the extra state 17: up -> 13, left -> 12, right -> 14,
# down stays in 17; state 13's down action now leads to 17 instead of staying in 13.
trans_probs[17][up][13] = 1
trans_probs[17][left][12] = 1
trans_probs[17][left][16] = 0
trans_probs[17][right][14] = 1
trans_probs[17][right][18] = 0
trans_probs[17][down][17] = 1
trans_probs[13][down][17] = 1
trans_probs[13][down][13] = 0

policy_iteration = PolicyIteration(trans_probs, rewards, action_probs,
                                   0.00001, states_num, actions_num, 1)
policy_iteration.policy_evaluation()

# Print the evaluated state values as a grid.
for i in range(row_num):
    for j in range(col_num):
        print("%.2f" % policy_iteration.v[i * col_num + j], end=" ")
    print()
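# None of the snippets in this section show the PolicyIteration class itself.
# For reference, below is a minimal, self-contained sketch of the algorithm they
# all rely on (iterative policy evaluation followed by greedy improvement) for an
# MDP given as dense NumPy arrays. It is an illustration only and does not
# reproduce any of the project-specific constructors used above.
import numpy as np

def policy_iteration_sketch(P, R, gamma=0.9, theta=1e-8):
    """P: transition probabilities of shape [S, A, S]; R: per-state rewards of shape [S]."""
    n_states = P.shape[0]
    policy = np.zeros(n_states, dtype=int)   # arbitrary initial policy
    V = np.zeros(n_states)

    while True:
        # Policy evaluation: iterate V(s) = R(s) + gamma * sum_s' P(s, pi(s), s') * V(s')
        while True:
            V_new = R + gamma * np.array(
                [P[s, policy[s]] @ V for s in range(n_states)])
            delta = np.max(np.abs(V_new - V))
            V = V_new
            if delta < theta:
                break

        # Policy improvement: act greedily with respect to the evaluated values.
        Q = R[:, None] + gamma * np.einsum('sat,t->sa', P, V)
        new_policy = np.argmax(Q, axis=1)
        if np.array_equal(new_policy, policy):
            return policy, V
        policy = new_policy

# Tiny usage example: a two-state MDP in which action 1 always moves to the
# rewarding state, so the optimal policy is [1, 1].
P_demo = np.zeros((2, 2, 2))
P_demo[:, 0, 0] = 1.0   # action 0 always leads to state 0
P_demo[:, 1, 1] = 1.0   # action 1 always leads to state 1
R_demo = np.array([0.0, 1.0])
print(policy_iteration_sketch(P_demo, R_demo))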
from config import Config
from policy_iteration import PolicyIteration

if __name__ == "__main__":
    conf = Config()
    pi = PolicyIteration(conf, "pi", True)
# Sanity check
transitions = grid_mdp.T(grid_mdp.initial_state, action=1)
print(transitions)

# Run value iteration
# vi = ValueIteration(grid_mdp)
# print('Running value iteration')
# vi.run()
# vi.plot_learning_curve()
#
# print('\nFinal V table:')
# grid_mdp.print_values(vi.V)
#
# # Print the optimal policy
# policy = vi.get_best_policy()
# print('\nBest policy:')
# grid_mdp.print_policy(policy)

# Run policy iteration
pi = PolicyIteration(grid_mdp)
print('Running policy iteration')
pi.run()

# Print the optimal policy
print('\nBest policy:')
grid_mdp.print_policy(pi.policy)

# Save policy to file
with open('results/policy.h5', 'wb') as file:
    pickle.dump(pi.policy, file)
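# The file written above is a plain pickle despite its .h5 extension, so it can
# be read back the same way. A minimal sketch (assumes `import pickle` at the top
# of the script, which the fragment above also relies on):
with open('results/policy.h5', 'rb') as file:
    saved_policy = pickle.load(file)
grid_mdp.print_policy(saved_policy)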
    t = L2norm(muE - mu_i_1)
    print("===== t = {} =====".format(t))
    if t <= epsilon:
        break

    # R = np.zeros(x_size * y_size)
    # Compute R from the current weight vector.
    R = Rcalc(w)
    # print("===== R: =====")
    # print(R)
    # print("")
    np.savetxt('R.csv', R, delimiter=',')

    # Use reinforcement learning (policy iteration) to obtain pi_selected from R.
    state = np.zeros(11)
    for hoge in range(11):
        state[hoge] = hoge
    pi_selected = PolicyIteration(state, P, R, gamma)
    # print(pi_selected)

    # Compute mu(pi_selected).
    mu_old = Mu(pi_selected)
    mu_i_2 = mu_i_1

    # Increment i and continue the loop.
    i = i + 1

print(R)
##########################################################################
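# L2norm and Mu are project helpers not shown in this fragment. For reference, a
# minimal sketch of the distance used in the termination test above, assuming it
# is the ordinary Euclidean norm of the feature-expectation difference:
import numpy as np

def l2_norm_sketch(v):
    # Euclidean (L2) norm of a vector.
    return np.linalg.norm(v, ord=2)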
class GraphicDisplay(tk.Tk):
    def __init__(self):
        super(GraphicDisplay, self).__init__()
        self.title('Policy Iteration')
        self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
        self.texts = []
        self.arrows = []
        self.util = Util()
        self.agent = PolicyIteration(self.util)
        self._build_env()

    def _build_env(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=HEIGHT * UNIT,
                                width=WIDTH * UNIT)

        # Buttons
        iteration_button = tk.Button(self, text="Evaluation",
                                     command=self.policy_evaluation)
        iteration_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.13, (HEIGHT * UNIT) + 10,
                                  window=iteration_button)

        policy_button = tk.Button(self, text="Improvement",
                                  command=self.policy_improvement)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.37, (HEIGHT * UNIT) + 10,
                                  window=policy_button)

        policy_button = tk.Button(self, text="move",
                                  command=self.move_by_policy)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.62, (HEIGHT * UNIT) + 10,
                                  window=policy_button)

        policy_button = tk.Button(self, text="clear", command=self.clear)
        policy_button.configure(width=10, activebackground="#33B5E5")
        self.canvas.create_window(WIDTH * UNIT * 0.87, (HEIGHT * UNIT) + 10,
                                  window=policy_button)

        # create grids
        for col in range(0, WIDTH * UNIT, UNIT):  # 0~400 by 80
            x0, y0, x1, y1 = col, 0, col, HEIGHT * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for row in range(0, HEIGHT * UNIT, UNIT):  # 0~400 by 80
            x0, y0, x1, y1 = 0, row, HEIGHT * UNIT, row
            self.canvas.create_line(x0, y0, x1, y1)

        # image_load
        self.up_image = ImageTk.PhotoImage(
            Image.open("../resources/up.png").resize((13, 13)))
        self.right_image = ImageTk.PhotoImage(
            Image.open("../resources/right.png").resize((13, 13)))
        self.left_image = ImageTk.PhotoImage(
            Image.open("../resources/left.png").resize((13, 13)))
        self.down_image = ImageTk.PhotoImage(
            Image.open("../resources/down.png").resize((13, 13)))
        self.rectangle_image = ImageTk.PhotoImage(
            Image.open("../resources/rectangle.png").resize((65, 65),
                                                            Image.ANTIALIAS))
        self.triangle_image = ImageTk.PhotoImage(
            Image.open("../resources/triangle.png").resize((65, 65)))
        self.circle_image = ImageTk.PhotoImage(
            Image.open("../resources/circle.png").resize((65, 65)))

        # add image to canvas
        self.rectangle = self.canvas.create_image(50, 50,
                                                  image=self.rectangle_image)
        self.triangle1 = self.canvas.create_image(250, 150,
                                                  image=self.triangle_image)
        self.triangle2 = self.canvas.create_image(150, 250,
                                                  image=self.triangle_image)
        self.circle = self.canvas.create_image(250, 250,
                                               image=self.circle_image)

        # add reward text
        self.text_reward(2, 2, "R : 1.0")
        self.text_reward(1, 2, "R : -1.0")
        self.text_reward(2, 1, "R : -1.0")

        # pack all
        self.canvas.pack()

    def clear(self):
        for i in self.texts:
            self.canvas.delete(i)
        for i in self.arrows:
            self.canvas.delete(i)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50,
                                                  image=self.rectangle_image)
        self.agent = PolicyIteration(self.util)

    def text_value(self, row, col, contents, font='Helvetica', size=10,
                   style='normal', anchor="nw"):
        origin_x, origin_y = 85, 70
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        return self.texts.append(
            self.canvas.create_text(x, y, fill="black", text=contents,
                                    font=font, anchor=anchor))

    def text_reward(self, row, col, contents, font='Helvetica', size=10,
                    style='normal', anchor="nw"):
        origin_x, origin_y = 5, 5
        x, y = origin_y + (UNIT * col), origin_x + (UNIT * row)
        font = (font, str(size), style)
        return self.canvas.create_text(x, y, fill="black", text=contents,
                                       font=font, anchor=anchor)

    def rectangle_move(self, action):
        base_action = np.array([0, 0])
        self.render()
        if action[0] == 1:  # down
            base_action[1] += UNIT
        elif action[0] == -1:  # up
            base_action[1] -= UNIT
        elif action[1] == 1:  # right
            base_action[0] += UNIT
        elif action[1] == -1:  # left
            base_action[0] -= UNIT
        self.canvas.move(self.rectangle, base_action[0], base_action[1])  # move agent

    def rectangle_location(self):
        temp = self.canvas.coords(self.rectangle)
        x = (temp[0] / 100) - 0.5
        y = (temp[1] / 100) - 0.5
        return int(y), int(x)

    def move_by_policy(self):
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(50, 50,
                                                  image=self.rectangle_image)
        while len(self.agent.get_policy_table()[self.rectangle_location()[0]][
                self.rectangle_location()[1]]) != 0:
            self.after(
                100,
                self.rectangle_move(
                    self.agent.get_action([
                        self.rectangle_location()[0],
                        self.rectangle_location()[1]
                    ])))

    def draw_one_arrow(self, col, row, action):
        if col == 2 and row == 2:
            return
        if action[0] > 0:  # up
            origin_x, origin_y = 50 + (UNIT * row), 10 + (UNIT * col)
            self.arrows.append(
                self.canvas.create_image(origin_x, origin_y,
                                         image=self.up_image))
        if action[1] > 0:  # down
            origin_x, origin_y = 50 + (UNIT * row), 90 + (UNIT * col)
            self.arrows.append(
                self.canvas.create_image(origin_x, origin_y,
                                         image=self.down_image))
        if action[2] > 0:  # left
            origin_x, origin_y = 10 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(
                self.canvas.create_image(origin_x, origin_y,
                                         image=self.left_image))
        if action[3] > 0:  # right
            origin_x, origin_y = 90 + (UNIT * row), 50 + (UNIT * col)
            self.arrows.append(
                self.canvas.create_image(origin_x, origin_y,
                                         image=self.right_image))

    def draw_from_policy(self, policy_table):
        for i in range(HEIGHT):
            for j in range(WIDTH):
                self.draw_one_arrow(i, j, policy_table[i][j])

    def print_value_table(self, value_table):
        for i in range(WIDTH):
            for j in range(HEIGHT):
                self.text_value(i, j, value_table[i][j])

    def render(self):
        time.sleep(0.1)
        self.canvas.tag_raise(self.rectangle)
        self.update()

    def policy_evaluation(self):
        for i in self.texts:
            self.canvas.delete(i)
        self.agent.policy_evaluation()
        self.print_value_table(self.agent.get_value_table())

    def policy_improvement(self):
        for i in self.arrows:
            self.canvas.delete(i)
        self.agent.policy_improvement()
        self.draw_from_policy(self.agent.get_policy_table())
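# A minimal launch sketch for the display above (this entry point is not part of
# the snippet; it assumes the module already imports tkinter as tk and defines
# UNIT, HEIGHT, WIDTH, Util and PolicyIteration as used by the class):
if __name__ == "__main__":
    grid_world = GraphicDisplay()
    grid_world.mainloop()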