class TestCharacter(CharacterEntity):
    def __init__(self, name, avatar, x, y):
        CharacterEntity.__init__(self, name, avatar, x, y)
        self.reward = 0
        self.q_learn = Qlearning(0)

    def do(self, wrld):
        """Pick an action with the learner, simulate it, and record the outcome."""
        # build the current state and choose an action
        state = State(wrld, (self.x, self.y), self.name)
        act = self.q_learn.step(state)
        self.act(act)
        # apply the same action to our copy in the simulated world
        sort_of_me = wrld.me(self)
        TestCharacter.act(sort_of_me, act)
        new_wrld, events = wrld.next()
        new_me = new_wrld.me(self)
        reward = -1  # new_wrld.scores[self.name] - wrld.scores[self.name]
        if new_me is not None:
            res_state = State(new_wrld, (new_me.x, new_me.y), self.name)
            event_scores = {Event.BOMB_HIT_CHARACTER: -100,
                            Event.CHARACTER_KILLED_BY_MONSTER: -100,
                            Event.CHARACTER_FOUND_EXIT: 100,
                            Event.BOMB_HIT_MONSTER: 20,
                            Event.BOMB_HIT_WALL: 5}
            for event in events:
                if event in event_scores:
                    reward += event_scores[event]
            # shaping terms: reward progress toward the exit, penalize moving
            # closer to monsters and placing a bomb
            reward += (state.len_a_star - res_state.len_a_star) * 3
            reward -= (state.dist_closest_monster - res_state.dist_closest_monster) * 4
            reward -= (res_state.bomb_placed and not state.bomb_placed) * 3
            reward += state.dis_to_exit() - res_state.dis_to_exit()
        else:
            res_state = state
        print("reward:", reward)
        self.q_learn.save_outcome(act, res_state, state, reward)

    def act(self, action):
        """action: ((dx, dy), bool) -- the move to make and whether to drop a bomb."""
        self.move(action[0][0], action[0][1])
        if action[1]:
            self.place_bomb()
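# The Qlearning class behind step()/save_outcome() is not shown in this
# excerpt. Below is a minimal dict-backed sketch of that interface, assuming
# State objects are hashable and expose a valid_actions() helper (an
# assumption for illustration, not the project's implementation).
import random
from collections import defaultdict

class Qlearning:
    def __init__(self, q0, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q = defaultdict(lambda: q0)  # Q[(state, action)] -> value
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon

    def step(self, state):
        # epsilon-greedy over the actions the state exposes
        actions = state.valid_actions()  # assumed helper
        if random.random() < self.epsilon:
            return random.choice(actions)
        return max(actions, key=lambda a: self.q[(state, a)])

    def save_outcome(self, act, res_state, state, reward):
        # one-step Q-learning backup from the observed transition
        best_next = max((self.q[(res_state, a)] for a in res_state.valid_actions()),
                        default=0)
        td = reward + self.gamma * best_next - self.q[(state, act)]
        self.q[(state, act)] += self.alpha * td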
import gym
import numpy as np


def digitize_fun(state):
    """Map a continuous CartPole observation to a single discrete state index."""
    def bins(clip_min, clip_max, num):
        return np.linspace(clip_min, clip_max, num + 1)[1:-1]

    car_pos, car_v, pole_angle, pole_v = state
    result = [
        np.digitize(car_pos, bins(-2.4, 2.4, 4)),
        np.digitize(car_v, bins(-3.0, 3.0, 4)),
        np.digitize(pole_angle, bins(-0.5, 0.5, 4)),
        np.digitize(pole_v, bins(-2.0, 2.0, 4)),
    ]
    # combine the four 4-level digits into one base-4 index
    return sum(x * (4 ** i) for i, x in enumerate(result))


q_f = Qlearning(digitize_fun, 0.2, 0.99, 0.15, [0, 1])

max_number_of_steps = 200  # maximum score per episode
goal_average_steps = 195
num_consecutive_iterations = 100
# keep only the scores of the last 100 episodes (think of it as a stack with capacity 100)
last_time_steps = np.zeros(num_consecutive_iterations)

env = gym.make('CartPole-v0')
for episode in range(5000):
    observation = env.reset()  # reset the environment for this episode
    episode_reward = 0
    for t in range(max_number_of_steps):
        action = q_f.get_actions(observation)
        # env.render()
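# Quick sanity check of the discretization above (not part of the original
# script): each component falls into one of four buckets, so the combined
# base-4 index lies in [0, 4**4 - 1] = [0, 255].
sample = (0.1, -1.2, 0.05, 1.7)  # (cart pos, cart velocity, pole angle, pole velocity)
assert 0 <= digitize_fun(sample) <= 255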
plt.xlabel('Size of dungeon')
plt.ylabel('Time of execution')
plt.savefig('exp_' + name + '.png')


if len(sys.argv) > 1 and sys.argv[1] == "mdp":
    game = MDPGame()
    name = ""
    if len(sys.argv) > 2:
        if sys.argv[2] == "v":
            game.set_solver("value_iteration")
            name = 'mdp_value_iter'
        else:
            # "p" and any other value both select policy iteration
            game.set_solver("politic_iteration")
            name = 'mdp_politic_iter'
    exps = Experience(game)
    exps.run()
    exps.crate_image(name)
elif len(sys.argv) > 1 and sys.argv[1] == "qlearning":
    # Qlearning should implement:
    #   Qlearning.init_experience(size) for initialization
    #   Qlearning.run() for execution of solving
    game = Qlearning()
    name = 'qlearning'
    exps = Experience(game)
    exps.run()
    exps.crate_image(name)
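# The comment in the qlearning branch above documents the interface Experience
# expects of its game object. A skeleton matching that contract; the bodies
# are placeholders, not the project's implementation.
class Qlearning:
    def init_experience(self, size):
        # prepare a dungeon of the given size and reset the learner
        self.size = size

    def run(self):
        # solve the dungeon with Q-learning; Experience times this call
        pass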
class Model:
    def __init__(self, episodes, steps, gamma, epsilon, alpha, n_env, n_actions):
        self.episodes = episodes
        self.steps = steps
        self.gamma = gamma
        self.epsilon = epsilon
        self.alpha = alpha
        self.n_env = n_env
        self.n_actions = n_actions
        self.qlearning = Qlearning(gamma=self.gamma, alpha=self.alpha,
                                   n_env=self.n_env, n_actions=self.n_actions)
        self.reward_visits = np.zeros((self.n_env, self.n_env), dtype=np.int32)
        self.scores = [0]
        self.grid_area = generate_grids(16)
        self.scoresmap = plt.figure(figsize=(5, 10))
        self.scoresmap1 = self.scoresmap.add_subplot(211)
        self.scoresmap2 = self.scoresmap.add_subplot(212)
        # first is top left, then in clockwise direction
        self.reward_pos = np.array([[1.5, 1.5], [1.5, 14.5], [14.5, 1.5], [14.5, 14.5]])

    def run_episode(self, i, state):
        step = 0
        done = 0
        dist = sum(abs(self.reward_pos[0] - state))
        # choose action using policy
        action = self.qlearning.choose_action(state, self.epsilon[i])
        while not done and step < self.steps:
            rospy.set_param('action', action)
            print('Action:', 'move forward' if action == 0
                  else 'turn left' if action == 1
                  else 'turn right' if action == 2 else '')
            # execute action
            action_done = 0
            rospy.set_param('action_done', action_done)
            # wait until the action is done
            while action_done == 0:
                action_done = rospy.get_param('action_done')
            state_new = np.array(rospy.get_param('position'))
            dir = rospy.get_param('direction')
            red = rospy.get_param('red')
            print('Position:', state_new[0], state_new[1])
            print('Direction:', dir)
            print('Grid cell:', self.grid_area[int(state_new[0]), int(state_new[1])], '\n')
            # update plot
            self.draw_map(i, state_new, dir)
            # get reward
            reward = 0
            if red:
                for r_idx, j in enumerate(self.reward_pos):
                    if np.linalg.norm(state_new - j) <= 0.5:
                        reward = 5
                        done = 1
                        self.reward_visits[int(j[0]), int(j[1])] += 1
                        print('Reward index:', r_idx)
                        break
            elif np.linalg.norm(state - state_new) < .1:
                reward = -1
            # SARSA: choose the next action from the new state
            action_new = self.qlearning.choose_action(state_new, self.epsilon[i])
            # update table
            td = self.qlearning.update_table_sarsa(state, state_new, action,
                                                   action_new, reward)
            self.tderrors.append(td)
            print('TD-error:', td)
            self.draw_tderror(i, step)
            # set next state
            state = state_new
            action = action_new
            step += 1
            time.sleep(.5)
        score = dist / step if done else 0
        self.scores.append(self.scores[-1] * .9 + score * .1)
        print('Path length:', step)

    def run_training(self, vc):
        for i in range(self.episodes):
            # create figures
            self.minimap = plt.figure(figsize=(5, 10))
            self.minimap1 = self.minimap.add_subplot(211)
            self.minimap2 = self.minimap.add_subplot(212)
            self.tderrormap = plt.figure(figsize=(5, 10))
            self.tderrormap1 = self.tderrormap.add_subplot(211)
            self.tderrormap2 = self.tderrormap.add_subplot(212)
            # set initial parameters
            init_position = np.array([7.5, 7.5])
            init_direction = 0
            self.tderrors = []
            # draw minimap
            rospy.set_param('i', i)
            self.draw_map(i, init_position, init_direction)
            # launch experiment
            try:
                self.sim = vc.launch_experiment('template_husky_0_0_0')
            except:
                time.sleep(1)
            time.sleep(10)
            # start the experiment
            self.sim.start()
            # start episode
            self.run_episode(i, init_position)
            # stop experiment
            self.sim.stop()
            time.sleep(10)
            # draw scores map
            self.draw_scores(i)
            pickle.dump(self.reward_visits, open('reward_visits.pkl', 'wb'))
            pickle.dump(self.tderrors, open('td_errors.pkl', 'wb'))
            pickle.dump(self.scores, open('scores.pkl', 'wb'))
            pickle.dump(self.qlearning.Q, open('Q.pkl', 'wb'))

    def draw_map(self, i, pos, dir):
        # plot robot position
        markers = ['v', '>', '^', '<']
        self.minimap1.plot(pos[1], pos[0], marker=markers[dir], markersize=3,
                           color='red')
        self.minimap1.set_xlim([0, 16])
        self.minimap1.set_ylim([0, 16])
        self.minimap1.invert_yaxis()
        ticks = np.arange(0, 16, 1)
        self.minimap1.set_xticks(ticks)
        self.minimap1.set_yticks(ticks)
        self.minimap1.grid(True)
        # plot grid cells
        self.minimap2.imshow(self.grid_area, cmap='BuGn', interpolation='nearest')
        self.minimap2.invert_yaxis()
        self.minimap2.set_xticks([])
        self.minimap2.set_yticks([])
        self.minimap2.grid(False)
        # save plot
        self.minimap.savefig('plots/plot_%d.png' % i)

    def draw_scores(self, i):
        # plot reward visits
        self.scoresmap1.invert_yaxis()
        ticks = np.arange(0, 16, 1)
        self.scoresmap1.set_xticks(ticks)
        self.scoresmap1.set_yticks(ticks)
        self.scoresmap1.imshow(self.reward_visits, interpolation='none')
        # plot scores
        self.scoresmap2.set_xlim([0, self.episodes])
        self.scoresmap2.set_ylim([0, 1])
        self.scoresmap2.plot([i, i + 1], self.scores[i: i + 2],
                             linestyle='-', color='red')
        self.scoresmap2.set_ylabel('Score')
        self.scoresmap2.grid(True)
        # save plot
        self.scoresmap.savefig('plots/scores_%d.png' % i)

    def draw_tderror(self, i, step):
        if step > 0:
            self.tderrormap1.plot([step - 1, step], self.tderrors[step - 1: step + 1],
                                  linestyle='-', color='red')
        else:
            self.tderrormap1.plot(step, self.tderrors[step], linestyle='-', color='red')
        self.tderrormap1.set_xlabel('Step')
        self.tderrormap1.set_ylim([0, 100])
        self.tderrormap1.set_ylabel('TD-error')
        self.tderrormap1.grid(True)
        self.tderrormap.savefig('plots/tderror_%d.png' % i)
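# The Qlearning class used by Model is not shown in this excerpt. Below is a
# minimal sketch of a tabular SARSA learner matching the calls above
# (choose_action, update_table_sarsa, and a Q attribute); discretizing the
# continuous (x, y) position by truncation and the epsilon-greedy policy are
# assumptions for illustration, not the original implementation.
import numpy as np

class Qlearning:
    def __init__(self, gamma, alpha, n_env, n_actions):
        self.gamma = gamma
        self.alpha = alpha
        self.n_actions = n_actions
        self.Q = np.zeros((n_env, n_env, n_actions))  # one value per cell and action

    def _cell(self, state):
        # assumed discretization: truncate the continuous position to a grid cell
        return int(state[0]), int(state[1])

    def choose_action(self, state, epsilon):
        # epsilon-greedy over the Q-values of the current cell
        if np.random.rand() < epsilon:
            return np.random.randint(self.n_actions)
        x, y = self._cell(state)
        return int(np.argmax(self.Q[x, y]))

    def update_table_sarsa(self, state, state_new, action, action_new, reward):
        # SARSA (on-policy): bootstrap from the action actually chosen in s'
        x, y = self._cell(state)
        xn, yn = self._cell(state_new)
        td = reward + self.gamma * self.Q[xn, yn, action_new] - self.Q[x, y, action]
        self.Q[x, y, action] += self.alpha * td
        return abs(td)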
print("MDP 1, Option", j, "Reward: ", R_avg / 1000) #MDP2 R_avg = 0 policy2 = generate_policy(numrows + 1, option=j) for i in range(1000): simulator = Simulator(mdp_2) (R_tot, actions_taken, states_visited) = simulator.runPolicy(policy2) R_avg += R_tot print("MDP 2, Option", j, "Reward: ", R_avg / 1000) if qlearn == 1: for eps in [.1, .25, .33, .5, .66, .75, .99]: ql = Qlearning(mdp_1, epsilon=eps) for trial in range(100): ql.learn() ra = 0 for trial in range(1000): (R_tot, actions_taken, states_visited) = ql.run_iteration() ra += R_tot ra /= 1000 print(ra) qt = ql.q_table if valiter == 1: #Read in an MDP from a file k = mdp.read_MDP_from_file('ParkingMDP1.txt') (states, actions, R, T) = mdp.process_MDP(k) t = open('output.txt', 'w+')
def main(sys):
    if len(sys.argv) == 5:
        print(sys.argv)
        dados = util_file.f_read(sys.argv[1])
        x = sys.argv[2]
        y = sys.argv[3]
        n = sys.argv[4]
        i = int(dados[0])
        j = int(dados[1])
        mapa = dados[2]
        ql = Qlearning(mapa, i, j, x, y, n)
        ql.setPiMap(ql.getOMap())
        ql.calculateInitialQMap()
        ql.executeQLearning()
        ql.fiilPiMap()
        util_file.f_write(ql.getPiMap(), "pi.txt", ql.getQMap(), "q.txt")
        print("Finish.")
    else:
        print("Invalid input. Expected format: mapa x y n")
def test(agent, env, policy):
    done = False
    obs = env.getPosition()
    total_reward = 0.0
    i = 0
    print("RewardScenario: \n{}".format(env.rewardScenario))
    print("RatScenario: \n{}".format(env.ratScenario))
    print("================================================")
    while not done:
        action = policy[obs]
        next_obs, reward, done = env.step(action)
        obs = next_obs
        total_reward += reward
        print("RewardScenario: \n{}".format(env.rewardScenario))
        print("RatScenario: \n{}".format(env.ratScenario))
        print("total reward: {}".format(total_reward))
        print("================================================")
        i += 1
    return total_reward


if __name__ == "__main__":
    env = Environment()
    agent = Qlearning(env, MAX_NUM_EPISODES, STEPS_PER_EPISODE,
                      EPSILON_MIN, EPSILON_DECAY, ALPHA, GAMMA)
    learned_policy = train(agent, env)
    # run a few rollouts with the learned policy
    env = Environment()
    for i in range(4):
        test(agent, env, learned_policy)
        env.reset()
        print('DONE \n\n')
    print("done")
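# train() is called above but not shown in this excerpt. A minimal sketch of
# what it might look like, assuming the agent exposes get_action(obs) (with
# epsilon-greedy exploration), learn(obs, action, reward, next_obs) (the
# Q-learning backup), and a Q table to read the greedy policy from; those
# member names are assumptions.
import numpy as np

def train(agent, env):
    for episode in range(MAX_NUM_EPISODES):
        env.reset()
        obs = env.getPosition()
        done = False
        while not done:
            action = agent.get_action(obs)
            next_obs, reward, done = env.step(action)
            agent.learn(obs, action, reward, next_obs)
            obs = next_obs
    # greedy policy: the best-valued action for every state
    return np.argmax(agent.Q, axis=-1)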
def main():
    # set flags
    render = 0
    render_start = 0
    # set model parameters
    episodes = 1000
    steps = 30
    gamma = 0.99
    alpha = 0.0001
    epsilon = np.linspace(0.5, 0.05, episodes)
    n_env = 17
    n_actions = 3
    # create environment
    env = Grid(n_env)
    scores_manhattan = []
    scores_done = []
    rewards_variation = []
    steps_variation = []
    # initialize the Q-learning agent
    qlearning = Qlearning(gamma, alpha, n_env, n_actions)
    # start training
    start = time.time()
    for episode in range(episodes):
        # set initial state
        state, dir_ = env.reset(f=False)
        done = 0
        step = 0
        rewards = []
        # start episode
        while not done and step < steps:
            if render_start or (render and episode > episodes - 2):
                env.render()
            # choose action and get reward
            action = qlearning.choose_action(state, dir_, epsilon[episode])
            state_new, dir_new, reward, done = env.step(action, f=False)
            rewards.append(reward)
            # update Q-table
            _ = qlearning.update_table_qlearning(state, state_new, action,
                                                 dir_, dir_new, reward)
            # set next state
            state = state_new
            dir_ = dir_new
            step += 1
        # record scores
        scores_manhattan.append(env.dist / float(step) if done else 0.0)
        scores_done.append(done)
        rewards_variation.append(sum(rewards))
        steps_variation.append(step)
        print('episode: %5d reward: %6.2f score: %6.2f step: %2d %s'
              % (episode + 1, sum(rewards), scores_manhattan[-1], step,
                 'Done' if done else ''))
    # save objects
    pickle.dump(scores_manhattan, open('results/scores_manhattan.pkl', 'wb'))
    pickle.dump(scores_done, open('results/scores_done.pkl', 'wb'))
    pickle.dump(rewards_variation, open('results/rewards_variation.pkl', 'wb'))
    pickle.dump(steps_variation, open('results/steps_variation.pkl', 'wb'))
    # print final score
    print('Time elapsed:', time.time() - start)
    print('Reward visits:', env.reward_visits)
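# update_table_qlearning above is assumed to be the standard off-policy
# Q-learning backup. A sketch of that rule as a standalone function; indexing
# the table by grid cell plus heading is an assumption read off the call
# signature (state, state_new, action, dir_, dir_new, reward).
import numpy as np

def q_learning_update(Q, gamma, alpha, state, state_new, action, dir_, dir_new, reward):
    # off-policy: bootstrap from the best next action, not the action the
    # epsilon-greedy policy will actually take (contrast with SARSA)
    x, y = int(state[0]), int(state[1])
    xn, yn = int(state_new[0]), int(state_new[1])
    td = reward + gamma * np.max(Q[xn, yn, dir_new]) - Q[x, y, dir_, action]
    Q[x, y, dir_, action] += alpha * td
    return td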
        print((i0, j0), '->', (i1, j1))
        color = 'green'
        w = 0.1
        plt.arrow(0.5 + j0, 0.5 + i0, (j1 - j0) * 0.5, (i1 - i0) * 0.5,
                  width=w, color=color)
    if save:
        plt.savefig('img/' + name + '.png')
    else:
        plt.show()
    plt.clf()


display_and_save(grille, 'grille', True, [])

### APPLYING Q-LEARNING
Q = Qlearning(states[0])
for i in range(iteration):
    Q.move()
    Q.update_Qfunction()
    Q.reset()
    if i % (iteration // 10) == 0:
        print('Progress:', i * 100 / iteration, '%')

Q.probability = 1
Q.move()
class GuiGrid:
    def __init__(self):
        self.app = QtGui.QApplication(sys.argv)
        self.window = opengl.GLViewWidget()
        self.window.setGeometry(0, 410, 800, 800)
        self.window.setCameraPosition(distance=12, azimuth=270)
        x_axis = opengl.GLGridItem()
        x_axis.setSize(x=10, y=10)
        self.grid = Cell()
        self.q = Qlearning(no_of_actions, no_of_states, state_combinations)
        self.window.addItem(x_axis)
        self.current_node = self.grid.grid_nodes[0]
        self.nodes = opengl.GLScatterPlotItem(pos=self.grid.grid_nodes,
                                              color=glColor((0, 255, 0)), size=7)
        self.goal = opengl.GLScatterPlotItem(pos=self.grid.goal_node,
                                             color=glColor((0, 0, 255)), size=15)
        self.current_node_item = opengl.GLScatterPlotItem(pos=self.current_node,
                                                          color=glColor((255, 0, 0)), size=9)
        self.blocked = opengl.GLScatterPlotItem(pos=self.grid.blocked_nodes,
                                                color=glColor((255, 255, 255)), size=13)
        self.counter = 0
        self.generation_counter = 0
        self.step_counter = 0
        self.tracker = []
        self.window.addItem(self.nodes)
        self.window.addItem(self.blocked)
        self.window.addItem(self.current_node_item)
        self.window.addItem(self.goal)
        self.window.show()

    def update(self):
        # look up the integer index of the current node
        for i, k in self.grid.indexing.items():
            if all(k == self.current_node):
                indexed_current_node = i
        # get the best next action from the Q-table
        action = self.q.get_action(indexed_current_node)
        # get all the nodes reachable from this point
        adjacent_nodes = self.grid.get_adjacent_nodes(self.current_node)
        # get the next state produced by that action
        [[next_node, reward, ac], reached_goal] = self.grid.get_next_node(action, adjacent_nodes)
        for i, k in self.grid.indexing.items():
            if all(k == next_node):
                indexed_next_node = i
        if not reached_goal:
            self.step_counter += 1
            self.q.update_qtable(reward, indexed_current_node, action, indexed_next_node)
        else:
            self.tracker.append([self.generation_counter, self.step_counter])
            print("------------------------- episode over -------------------------------")
            print("generation", self.generation_counter,
                  "number of steps taken", self.step_counter)
            self.generation_counter += 1
            next_node = self.grid.grid_nodes[0]
            self.step_counter = 0
            # decay exploration linearly over the first half of the generations
            if generations // 2 >= self.generation_counter >= 1:
                self.q.epsilon -= self.q.decay
            if self.generation_counter % 100 == 0:
                with open('track_file.txt', 'w') as track_file:
                    track_file.write(str(self.tracker))
        self.current_node_item.setData(pos=np.array(next_node))
        self.current_node = next_node
        self.counter += 1
        print(self.generation_counter, self.counter)

    def start(self):
        if (sys.flags.interactive != 1) or not hasattr(QtCore, 'PYQT_VERSION'):
            QtGui.QApplication.instance().exec_()

    def animation(self, frametime=10):
        timer = QtCore.QTimer()
        timer.timeout.connect(self.update)
        timer.start(frametime)
        self.start()
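# The schedule above subtracts q.decay from q.epsilon once per completed
# episode during the first half of the generations, i.e. a linear decay that
# freezes at the halfway point. A sketch of that arithmetic, with the starting
# epsilon and generation count as assumed example values:
n_generations = 1000  # assumed example value
start_epsilon = 1.0   # assumed example value
decay = start_epsilon / (n_generations // 2)  # 0.002 per episode here

def epsilon_at(g):
    # epsilon after g completed episodes under the schedule above
    return max(start_epsilon - decay * min(g, n_generations // 2), 0.0)

print(epsilon_at(250))  # ~0.5, halfway through the decay window
print(epsilon_at(600))  # 0.0 once the window closes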
import pygame
import keyboard

from qlearning import Qlearning
from display import MainRun

size = (305, 305)
main = MainRun(size)
main.Main()

game = Qlearning()
game.set_environment(main.iGrid)

print("Instructions:")
print("Click a white box to change its color:")
print("White = path")
print("Green = starting point (only one)")
print("Yellow = end point (if there is no yellow box, training will not start)")
print("Red = block")
print("Press the Enter key to begin")

game.saitama_training(main.iGrid)
cheese = game.find_my_cheese()
main.after_training(cheese, main.iGrid)