def execute_policy_iteration_test(test, output='console'):
    """
    Description
    -----------
    Function used to run the policy_iteration for the given test.

    Parameters
    ----------
    test: Test() \\
        -- A Test() instance with the information needed to run the policy_iteration.
    output: str \\
        -- A string that tells the function where its output is expected.

    Returns
    -------
    PolicyIteration() \\
        -- If no recognizable output is given, returns the instance of the policy_iteration used.
    """
    policy_iteration = PolicyIteration(test)
    policy_iteration.run()
    if output in ['console', 'file']:
        output_processing(output, test, policy_iteration, 'PolicyIteration')
    return policy_iteration
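# A hypothetical usage sketch of the helper above; the no-argument Test()
# construction and the 'console' output mode are assumptions based only on the
# signature and docstring, not on the original source.
test = Test()
result = execute_policy_iteration_test(test, output='console')

# The function always returns the PolicyIteration instance it used, so it can
# be inspected afterwards.
print(result)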
def __init__(self):
    super(GraphicDisplay, self).__init__()
    self.title('Policy Iteration')
    self.geometry('{0}x{1}'.format(HEIGHT * UNIT, HEIGHT * UNIT + 50))
    self.texts = []
    self.arrows = []
    self.util = Util()
    self.agent = PolicyIteration(self.util)
    self._build_env()
def clear(self):
    for i in self.texts:
        self.canvas.delete(i)
    for i in self.arrows:
        self.canvas.delete(i)
    self.canvas.delete(self.rectangle)
    self.rectangle = self.canvas.create_image(50, 50, image=self.rectangle_image)
    self.agent = PolicyIteration(self.util)
def clear(self):
    if self.is_moving == 0:
        self.evaluation_count = 0
        self.improvement_count = 0
        for i in self.texts:
            self.canvas.delete(i)
        for i in self.arrows:
            self.canvas.delete(i)
        self.canvas.delete(self.rectangle)
        self.rectangle = self.canvas.create_image(
            50, 50, image=self.rectangle_image)
        self.agent = PolicyIteration(self.util)
def main():
    aiType = 3
    worldSize = 6
    game = Game(aiType, worldSize)
    agent = Agent()
    pc = None
    policy = None
    if aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif aiType == 3:
        policy = qLearningAgent()
        pc = PolicyConfiguration(inpRewards=[0, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile="QLValues.p",
                                 inpTrainingLimit=1000)
    elif aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile="AQLWeights.json",
                                 inpTrainingLimit=500)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()
    policy.config = pc
    agent.policy = policy
    game.agent = agent
    game.mainLoop()
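# If this module is run as a script, main() above would typically be invoked through
# the standard entry-point guard; the guard below is an assumption and is not part of
# the original excerpt.
if __name__ == "__main__":
    main()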
def main(args):
    # resolve path to world map definition
    if not args.world:
        world_map_path = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'world_map.txt')
    else:
        world_map_path = args.world

    print("Reading world from %s" % world_map_path)
    if not os.path.exists(world_map_path):
        raise IOError(
            "World map definition not found at its expected path: %s" % world_map_path)

    world = World(world_map_path)
    visualizer = Visualizer(world)

    # Value Iteration
    value_iteration = ValueIteration(world, one_step_cost_v1,
                                     discount_factor=args.gamma, eps=10e-10)
    value_iteration.execute()
    optimal_policy = value_iteration.extract_policy()
    fig_vi = plt.figure()
    visualizer.draw(fig_vi, optimal_policy, value_iteration.value_fn,
                    "Value Iteration (gamma = %.2f)" % args.gamma)

    # Policy Iteration
    policy_iteration = PolicyIteration(world, one_step_cost_v1,
                                       discount_factor=args.gamma)
    value_fn = policy_iteration.execute()
    fig_pi = plt.figure()
    visualizer.draw(fig_pi, policy_iteration.policy, value_fn,
                    "Policy Iteration (gamma = %.2f)" % args.gamma)

    plt.show()
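# A minimal sketch of the argument parsing that main(args) above appears to expect:
# it only reads args.world and args.gamma. The flag names, defaults, and help text
# here are assumptions, not taken from the original source.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(
        description="Run value iteration and policy iteration on a grid world")
    parser.add_argument('--world', default=None,
                        help="path to the world map definition "
                             "(defaults to world_map.txt next to the script)")
    parser.add_argument('--gamma', type=float, default=0.9,
                        help="discount factor used by both solvers")
    return parser.parse_args()


if __name__ == "__main__":
    main(parse_args())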
def __init__(self, can, direction, inpAIType):
    self.flag = 1
    self.can = can
    self.direction = direction
    self.aiType = inpAIType
    self.agent = Agent()
    pc = None
    policy = None
    # inpRewards = [food reward, hazard reward, living reward, good location reward, bad location reward]
    #   good and bad location rewards are only used for Q-learning; they were tried
    #   as a way to encourage graph searching, are not really used, and can give wonky results.
    # inpDiscounts = [gamma discount, alpha discount, epsilon explore chance]
    # inpStochastic = [forward action [forward chance, left chance, right chance],
    #                  left action    [forward chance, left chance, right chance],
    #                  right action   [forward chance, left chance, right chance]]
    # inpFile = file for weights or Q-values
    if self.aiType == 1:
        policy = ValueIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif self.aiType == 2:
        policy = PolicyIteration()
        pc = PolicyConfiguration(inpRewards=[1, -1, 0, 10, -1],
                                 inpDiscounts=[1, .1, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
    elif self.aiType == 3:
        policy = qLearningAgent()
        # risk aversion (i.e. rarely going off the best path) seems to work best
        # this configuration also seemed to work:
        # pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1], inpDiscounts=[0.9, .2, .1],
        #                          inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]])
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, 0],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile=None, inpTrainingLimit=20000)
    elif self.aiType == 4:
        policy = approximateQLearning()
        pc = PolicyConfiguration(inpRewards=[2, -1, 0, 0, -1],
                                 inpDiscounts=[0.9, .2, .1],
                                 inpStochastic=[[100, 0, 0], [0, 100, 0], [0, 0, 100]],
                                 inpFile=None, inpTrainingLimit=5000)
    else:
        policy = ValueIteration()
        pc = PolicyConfiguration()
    policy.config = pc
    self.agent.policy = policy
    # (tail of create_game_env(): assign a reward and terminal flag to every cell)
    for i in range(size):
        for j in range(size):
            cell_type = cell_matrix[j][i]
            if cell_type == CellType.WHOOPING:
                is_terminal = True
                reward = -10
            elif cell_type == CellType.KFC:
                is_terminal = True
                reward = 10
            else:
                is_terminal = False
                reward = -1
            cell = CellState((i, j), reward, cell_type, is_terminal)
            env.place_cell(i, j, cell)
            states.append(cell)
    return env, states


env, states = create_game_env()
agent = Agent("policy_eval", (0, 0))
policy_iter_algo = PolicyIteration(states)
policy = policy_iter_algo.run()
game = Game(Config, Controller, env, agent, policy)

# initiate env
game.draw_env()
pygame.display.update()
game.start()
        if i < row_num - 2:
            trans_probs[i * col_num + j][down][(i + 1) * col_num + j] = p
        else:
            trans_probs[i * col_num + j][down][i * col_num + j] = p

# 0
trans_probs[0][:] = 0
# 15
trans_probs[15][:] = 0
# 16
trans_probs[16][:] = 0
# 18
trans_probs[18][:] = 0
# 19
trans_probs[19][:] = 0

trans_probs[17][up][13] = 1
trans_probs[17][left][12] = 1
trans_probs[17][left][16] = 0
trans_probs[17][right][14] = 1
trans_probs[17][right][18] = 0
trans_probs[17][down][17] = 1
trans_probs[13][down][17] = 1
trans_probs[13][down][13] = 0

policy_iteration = PolicyIteration(trans_probs, rewards, action_probs,
                                   0.00001, states_num, actions_num, 1)
policy_iteration.policy_evaluation()

for i in range(row_num):
    for j in range(col_num):
        print("%.2f" % policy_iteration.v[i * col_num + j], end=" ")
    print()
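# A standalone sketch of the kind of iterative policy evaluation that
# policy_evaluation() presumably performs, using the same array layout as above
# (trans_probs[s][a][s'], action_probs[s][a], rewards[s]). The layout, the reward
# convention, and the in-place sweep are assumptions for illustration only.
import numpy as np


def evaluate_policy(trans_probs, rewards, action_probs, theta, n_states, n_actions, gamma):
    v = np.zeros(n_states)
    while True:
        delta = 0.0
        for s in range(n_states):
            new_v = 0.0
            for a in range(n_actions):
                for s_next in range(n_states):
                    p = trans_probs[s][a][s_next]
                    # expected return: successor reward plus discounted successor value,
                    # weighted by the policy's action probability and the transition probability
                    new_v += action_probs[s][a] * p * (rewards[s_next] + gamma * v[s_next])
            delta = max(delta, abs(new_v - v[s]))
            v[s] = new_v
        if delta < theta:
            return v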
from config import Config
from policy_iteration import PolicyIteration

if __name__ == "__main__":
    conf = Config()
    pi = PolicyIteration(conf, "pi", True)
# Sanity check
transitions = grid_mdp.T(grid_mdp.initial_state, action=1)
print(transitions)

# Run value iteration
# vi = ValueIteration(grid_mdp)
# print('Running value iteration')
# vi.run()
# vi.plot_learning_curve()
#
# print('\nFinal V table:')
# grid_mdp.print_values(vi.V)
#
# # Print the optimal policy
# policy = vi.get_best_policy()
# print('\nBest policy:')
# grid_mdp.print_policy(policy)

# Run policy iteration
pi = PolicyIteration(grid_mdp)
print('Running policy iteration')
pi.run()

# Print the optimal policy
print('\nBest policy:')
grid_mdp.print_policy(pi.policy)

# Save policy to file
with open('results/policy.h5', 'wb') as file:
    pickle.dump(pi.policy, file)
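# A companion sketch for reading the pickled policy back; the path matches the save
# call above, while reusing grid_mdp.print_policy on the loaded object is an assumption.
import pickle

with open('results/policy.h5', 'rb') as file:
    saved_policy = pickle.load(file)

grid_mdp.print_policy(saved_policy)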
t = L2norm(muE - mu_i_1)
print("===== t = {} =====".format(t))
if t <= epsilon:
    break

# R = np.zeros(x_size*y_size)
# compute R
R = Rcalc(w)
# print("===== R: =====")
# print(R)
# print("")
np.savetxt('R.csv', R, delimiter=',')

# obtain pi_selected from R via reinforcement learning
state = np.zeros(11)
for hoge in range(11):
    state[hoge] = hoge
pi_selected = PolicyIteration(state, P, R, gamma)
# print(pi_selected)

# compute mu(pi_selected)
mu_old = Mu(pi_selected)
mu_i_2 = mu_i_1

# increment i and continue the loop
i = i + 1

print(R)
##########################################################################