def jugar(self):
    cur = 1
    it = 1
    ql = Qlearning()
    while True:
        self.dist_prev = -1
        self.player.__init__()
        print("intento: ", cur)
        while not self.game_over:
            pred = ql.predecir(self.get_estado())
            if mal_direc[self.player.direcc] == pred:
                self.game_over = True
            else:
                self.player.direcc = pred
            self.player.mover()
            ql.actualizar(self.get_premio(), self.get_estado())
            if self.revisar():
                ql.actualizar(10, self.get_estado())
                self.dist_prev = -1
                it += 1
                print("iteration: ", it)
        # don't lose the progress made so far
        ql.actualizar(-10, self.get_estado())
        self.game_over = False
        # print('Game over')
        cur += 1
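The `Qlearning` object used above is not shown. A minimal sketch of the interface the loop appears to assume, a `predecir` that picks an action and an `actualizar` that applies the tabular update to the previously chosen state/action pair, could look like the following; the class name, action count, and hyperparameters here are illustrative assumptions, not the original implementation.

# Hedged sketch only: a tabular Q-learning helper matching the predecir/actualizar
# call pattern above. Names and hyperparameters are assumptions for illustration.
import random
from collections import defaultdict

class QlearningSketch:
    def __init__(self, n_actions=4, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.q = defaultdict(lambda: [0.0] * n_actions)  # Q[state][action]
        self.alpha, self.gamma, self.epsilon = alpha, gamma, epsilon
        self.n_actions = n_actions
        self.prev_state, self.prev_action = None, None

    def predecir(self, state):
        # epsilon-greedy choice; remember (state, action) for the next update
        if random.random() < self.epsilon:
            action = random.randrange(self.n_actions)
        else:
            action = max(range(self.n_actions), key=lambda a: self.q[state][a])
        self.prev_state, self.prev_action = state, action
        return action

    def actualizar(self, reward, new_state):
        # standard tabular Q-learning update on the remembered (state, action) pair
        if self.prev_state is None:
            return
        old = self.q[self.prev_state][self.prev_action]
        target = reward + self.gamma * max(self.q[new_state])
        self.q[self.prev_state][self.prev_action] = old + self.alpha * (target - old)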
def __init__(self, root):
    self.root = root
    self.root.title("My GridWorld")
    self.root.resizable(width=FALSE, height=FALSE)

    self.rightFrame = Frame(self.root)
    self.rightFrame.pack(side=RIGHT, fill=Y)
    self.leftFrame = Frame(self.root)
    self.leftFrame.pack(side=LEFT, fill=Y)

    Label(self.rightFrame, text=" ", fg='white').grid(row=0, column=2, sticky=W)
    Label(self.rightFrame, text=" ", fg='white').grid(row=0, column=0, sticky=E)

    self.grid = Canvas(self.leftFrame, width=640, height=640)
    self.gridMatrix = [[0 for row in range(self.grid_y)] for col in range(self.grid_x)]
    self.qMatrix = [[0 for row in range(self.grid_y)] for col in range(self.grid_x)]
    self.qlearner = Qlearning.Algorithm(self.gridMatrix)
    self.qMatrix_calculated = False

    # Radio buttons for the add/remove cell section
    self.cell_radio_btn_val = IntVar()
    self.cell_radio_btn_val.set(1)
    # Radio buttons for Q-learning selection policies
    self.policy_radio_btn_val = IntVar()
    self.policy_radio_btn_val.set(1)
    # Radio buttons for heat map selection
    self.heatmap_radio_btn_val = IntVar()
    self.heatmap_radio_btn_val.set(2)

    self.set_new_grid()
    self.create_left_side_elements()

    self.grid.bind('<Button-2>', self.reset_start_goal_cell)
    self.grid.bind('<Button-1>', self.add_start_goal_cell)

    self.alpha_text_var = StringVar()
    self.gamma_text_var = StringVar()
def main():
    maze = readMaze('maze2.txt')
    # printMaze(maze)
    findStartAndFinish(maze)
    maze[start[0]][start[1]] = PLAYER
    # solveAStar(maze, start, finish)
    # debugWM(maze, start, Directions.UP, 10, 10)
    # play(maze, start, Directions.UP, 5, 5)
    startDir = Directions.UP
    Qlearning(3000, 0.4, 0.1, maze, start, startDir, 5, 5)
def __init__(self):
    self.env = Qgame()
    self.env.reset()
    self.action = -1
    done = False
    Q = Qlearning.Qlearning(self)
    while True:
        # suc = self.getSuccessors(self.getStartState())
        # if len(suc) < 1:
        #     print("lose")
        #     self.env.setlose()
        #     done = True
        #     step = random.randrange(0, 3)
        #     self.env.step(step)
        # else:
        #     step = random.choice(suc)
        #     # print(step)
        #     # self.env.render()
        #     self.env.step(step[1])
        self.env.render()
        currentstate = self.getGrid().index(self.getStartState())
        food = self.getGrid().index(self.env.getFood())
        actions = Q[food][currentstate]
        action = actions.index(max(actions))
        # print(action, self.getStartState())
        previouscell = self.env.getSnake()[0]
        done = self.env.step(action)
        print(previouscell, action, self.env.getSnake()[0], self.env.getFood())
        if done:
            break
def run_q_learning():
    game_count = 1
    learner = Qlearning.Learner(DIS_WIDTH, DIS_HEIGHT, BLOCK_SIZE)
    scores_imp = []
    condition = True
    while condition:
        learner.Reset()
        if game_count > 100:
            ticks = 50
            learner.epsilon = 0
        else:
            learner.epsilon = .1
        score, reason = Q_GameLoop(learner)
        # Output results of each game to the console to monitor the agent as it trains
        print(f"Games: {game_count}; Score: {score}; Reason: {reason}")
        game_count += 1
        if game_count % 100 == 0:  # Save Q-values every qvalue_dump_n games
            print("Save Qvals")
            learner.SaveQvalues()
        if game_count > 300 and game_count < 315:
            scores_imp.append(score)
        if game_count == 316:
            condition = False
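The loop above runs exploratory games with epsilon = 0.1 for the first 100 games and then evaluates greedily with epsilon = 0. As a hedged illustration only, this is how such an epsilon value typically drives action selection inside a learner; the project's actual Learner class and its method names are not shown, so the function below is an assumption.

# Sketch of epsilon-greedy action choice; not the project's Learner code.
import random

def choose_action(q_values, epsilon):
    """Pick a random action with probability epsilon, otherwise the greedy one."""
    if random.random() < epsilon:
        return random.randrange(len(q_values))
    return max(range(len(q_values)), key=lambda a: q_values[a])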
###################
# COMPARISON BETWEEN DDMQ AND BASIC FLAT Q-LEARNING
reward_no_option = np.zeros((nbEpisodsTotal))
n_steps_no_option = np.zeros((nbEpisodsTotal))
reward_option = np.zeros((nbEpisodsTotal))
n_steps_option = np.zeros((nbEpisodsTotal))
score_concepts_50_mean = np.zeros(env.n_states)

for k in range(N_runs):
    print("BASIC Q-LEARNING")
    print('Visualise cumulated discounted reward from the first trajectory')
    Q, V, policy, traj_without_option, reward_without_option = Qlearning(
        env, eps_0, nbEpisodsTotal, Tmax, plot=False, return_trajectories=True)

    print("MACRO Q-LEARNING")
    MQL, traj_with_option, reward_with_option, score_concepts_50 = DDMQ(plot=False)

    reward_no_option += np.array(reward_without_option)
    n_steps_no_option += np.array([len(t) for t in traj_without_option])
    reward_option += np.array(reward_with_option)
    n_steps_option += np.array([len(t) for t in traj_with_option])
    score_concepts_50_mean += score_concepts_50

reward_no_option /= N_runs
reward_option /= N_runs
n_steps_no_option /= N_runs
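Once the reward curves have been averaged over the N_runs runs, the comparison is usually drawn as two curves on the same axes. A minimal sketch of that plot is below; it assumes matplotlib is available and that n_steps_option would be averaged the same way as the other arrays, neither of which is shown in the snippet above.

# Hedged sketch: comparing the averaged curves from the loop above.
import matplotlib.pyplot as plt

plt.plot(reward_no_option, label='flat Q-learning')
plt.plot(reward_option, label='DDMQ (macro Q-learning)')
plt.xlabel('Episode')
plt.ylabel('Average cumulated discounted reward')
plt.legend()
plt.show()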
def learnFrozenLake(fn, isStochastic, lrStochastic, lrDeepQ, isPrint, deviceName):

    ## 4-0. Read the map and initialise the Q-table ##
    info = mapReader.readMap(fn, False)  # read the map
    map_ = info[0]
    width = info[1]   # width of the map
    height = info[2]  # height of the map
    Qtable = mapReader.initQtable(width, height)  # initialise the Q-table

    ## 4-1. Train with Q-learning first ##
    if isPrint:
        print('\n\n---< 1. Training with Q-learning >---')
    if isStochastic:
        data = Qlearning_stochastic.learning(map_, width, height, Qtable, 1, lrStochastic, isPrint)
    else:
        data = Qlearning.learning(map_, width, height, Qtable, 0, isPrint)  # set of (S, A, R, S) tuples

    ## 4-2. Convert the result of step 1 into usable data and build the training set
    # Array holding (state, reward per action) pairs for training the neural network
    stateInfo = []  # layout: [row coordinate][column coordinate][reward per action]
    for i in range(height):
        temp = []
        for j in range(width):
            temp.append([0, 0, 0, 0])
        stateInfo.append(temp)

    # Build the training data (state, reward per action)
    states = []   # one-hot encoded
    outputs = []

    # One-hot encode state0 and state1.
    # With width=W and height=H, state = [w0, w1, ..., w(W-1), h0, h1, ..., h(H-1)];
    # if the current position is (wX, hY), then wX=1 and hY=1 and every other entry is 0.
    # One-hot encode the action:
    # initialise action = [0, 0, 0, 0]; if the action index is A, set action[A]=1.
    for i in range(len(data)):
        # state(t)
        state0 = [0]*(width+height)
        # if the coordinates of state(t) are (wX, hY),
        state0[data[i][0][1]] = 1          # set wX of state(t) to 1
        state0[width + data[i][0][0]] = 1  # set hY of state(t) to 1

        # action(t)
        action0 = [0, 0, 0, 0]
        action0[data[i][1]] = 1

        # reward(t+1)
        reward1 = data[i][2]

        # state(t+1)
        state1 = [0]*(width+height)
        # if the coordinates of state(t+1) are (wX, hY),
        state1[data[i][3][1]] = 1          # set wX of state(t+1) to 1
        state1[width + data[i][3][0]] = 1  # set hY of state(t+1) to 1

        # Update the per-action reward in the stateInfo table after every action
        stateInfo[data[i][0][0]][data[i][0][1]][data[i][1]] = reward1

        if i >= len(data)-800:  # only train on the last 800 samples
            states.append(state0)  # state part of the training data
            temp = []
            for j in range(4):
                rewardVal = stateInfo[data[i][0][0]][data[i][0][1]][j]
                rewardVal = 1 / (1 + math.exp(-rewardVal))  # apply the sigmoid function
                temp.append(rewardVal)  # per-action reward part of the training data
            outputs.append(temp)

    # Print a sample of the training data
    if isPrint:
        print('<sample of the training data>')
        for i in range(50):
            print(str(states[i]) + ': ' + str(np.array(outputs[i])))

    ## 4-3. Train with deep Q-learning ##
    if isPrint:
        print('\n\n---< 2. Training with deep learning >---')
    model = create_model([tf.keras.layers.Flatten(input_shape=(width+height,)),
                          keras.layers.Dense(64, activation='relu'),
                          keras.layers.Dense(64, activation='relu'),
                          keras.layers.Dense(4, activation='sigmoid')],
                         tf.keras.optimizers.Adam(0.001), 'mean_squared_error', isPrint)
    learning(model, [states], [outputs], 'FrozenLake', 4, deviceName)

    ## 4-4. Test ##
    if isPrint:
        print('\n\n---< 3. Test >---')

    # Load the previously trained model
    jsonFile = open('FrozenLake.json', 'r')
    loaded_model_json = jsonFile.read()
    jsonFile.close()
    newModel = tf.keras.models.model_from_json(loaded_model_json)
    newModel.load_weights('FrozenLake.h5')
    if isPrint:
        newModel.summary()
    newModel.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='mean_squared_error', metrics=['accuracy'])

    Qtable = mapReader.initQtable(width, height)  # re-initialise the Q-table
    state = [0, 0]  # initialise the state

    # Replace the Q-table values with the reward values obtained from the model
    input_ = newModel.input
    output_ = [layer.output for layer in newModel.layers]
    func = K.function([input_, K.learning_phase()], output_)

    # Take the states [i, j] as input
    testInput = []
    for i in range(height):
        for j in range(width):
            temp = [0]*(width+height)  # array to append to testInput
            temp[j] = 1
            temp[width+i] = 1
            testInput.append(temp)
    testInput = np.array(testInput)

    # For every state [i, j], store 9.9 times the output-layer value (reward per action)
    # as the corresponding Q-table entry
    result = func([testInput, 1])
    print('\n\n< deep-learned Q-table >')
    for i in range(height):
        for j in range(width):
            for k in range(4):
                Qtable[i][j][k] = result[3][i*width+j][k] * 9.9
            if isPrint:
                print('Qtable index ' + str(i) + ',' + str(j) + ' -> ' + str(result[3][i*width+j]))

    # Run and print the map
    if isStochastic:
        Qlearning_stochastic.execute(map_, width, height, Qtable, True)
    else:
        Qlearning.execute(map_, width, height, Qtable, True)
if __name__ == "__main__": # First grid: env = grid.gridenv() # Second grid: env2 = grid.gridenv() # Combined grid env3 = grid.gridenv() # Probabilistic grid env4 = grid.gridenv() # Q-learning on first grid, test on first ploc, x = env.initDeterministicgrid(1) # v = valueIteration.ValueIteration(env, x, 0.5) # v.runIterations() # print v.run_agent(ploc, env) q = Qlearning.Qlearning(env, x) rewards = q.runIterations() print "?" env2.initProbalisticgrid(1) print "done" print q.run_agent(ploc, env2) exit() # Plot for first part plt.plot(rewards) plt.ylabel('Reward') plt.xlabel('Episode') plt.title('Reward per episode for Q learning')
import Qlearning as Q
import Agent as A

# ___________ Initialisation step: get the initBoard of the game __________________________
Outputs = []  # list of outputs

# Create a random agent first to get the initial board
# method = RM.RandomMethod()
method = RM.RandomMethod()
playerAgent = A.Agent(method)
initGame = GIA.LaunchGame(playerAgent, Outputs, True)
InitBoard, initPosition = initGame.simpleBoard, initGame.initialPosition

# _____________ Board initialised: we can make the real AI play ____________________________
Outputs = []
method = Q.Qlearning()
playerAgent = A.Agent(method)  # we will be able to update the parameters of our agent easily
playerAgent.brain.initBoard_Qtable(InitBoard)
playerAgent.display()
# GIA.LaunchGame(playerAgent, Outputs)  # not trained yet: performs really badly

# _____________ Training the agent and running tests ________________________
trainIterations = 3
i = 0
epochs = 50
for j in range(trainIterations):
    while i % epochs != 0:
        etha = 5 / (j + 5)
master.bind("<Down>", call_down) master.bind("<Right>", call_right) master.bind("<Left>", call_left) me = board.create_rectangle(player[0]*Width+Width*2/10, player[1]*Width+Width*2/10, player[0]*Width+Width*8/10, player[1]*Width+Width*8/10, fill="blue", width=1, tag="me") board.grid(row=0, column=0) ''' ============== 3. Construct Environment with the data provided. ============== ''' env = Environment(n,m,transitions,rewards) ''' ============== 4. Call the Q-learning class to build Q-matrix. ============== ''' print("Building Q-Matrix...") q_matrix = [] q_1 = Qlearning.q_learning(env,unsafe_dictionary,state_goal,0.95) q_matrix.append(q_1) ''' ==== 5. Construct agent with data provided and Q-matrix (Knowledge Base). ==== ''' i_agent = Agent(state_0,env,q_matrix) ''' ===================== 6. Build path (non-deterministic). ===================== ''' discount = 0.3 steps = [] actual_steps = [] while True: next_action = i_agent.agent_function(0) next_state, new_action = i_agent.env.transition(i_agent.current_state,next_action) steps.append(str(new_action)+","+str(next_action))
I don't know why there are always bugs when running MonteCarlo in this file; I tried to fix them but failed.
The MonteCarlo test is done in the MonteCarlo_learning file instead. I spent several days trying to rewrite
this code, but it still didn't work.

running_time_M, episode_M, plot_sum_reward_M = MON.m_test(20)
episode_M_.append(episode_M)
running_time_M_.append(running_time_M)
plot_sum_reward_M_.append(plot_sum_reward_M)
'''
running_time_S, episode_S, plot_sum_reward_S = SAR.s_test(20)
episode_S_.append(episode_S)
running_time_S_.append(running_time_S)
plot_sum_reward_S_.append(plot_sum_reward_S)

running_time_Q, episode_Q, plot_sum_reward_Q = QL.q_test(20)
episode_Q_.append(episode_Q)
running_time_Q_.append(running_time_Q)
plot_sum_reward_Q_.append(plot_sum_reward_Q)

Sarsa_run_time = sum(running_time_S_) / train_volumn
average_S_episode = sum(episode_S_) / train_volumn
Q_run_time = sum(running_time_Q_) / train_volumn
average_Q_episode = sum(episode_Q_) / train_volumn

print(f'Sarsa_run_time:{Sarsa_run_time},Q_run_time:{Q_run_time}')
print(f'running_time_S_:{running_time_S_}')
print(f'running_time_Q_:{running_time_Q_}')
# print(f'sum_reward_M:{plot_sum_reward_M_},sum_reward_S:{plot_sum_reward_S_},sum_reward_Q:{plot_sum_reward_Q_}')
print(f'episode_S_:{episode_S_}')
def main(win, rows):
    grid = make_grid(win_size, rows)
    lookup = lookup_table(rows)
    q_learning = Qlearning(epsilon=0.7, alpha=0.5, gamma=0.99, episodes=10000, rows=rows)
    q_table = q_learning.q_table()
    start = None
    end = None
    run = True
    while run:
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                run = False
            if pygame.mouse.get_pressed()[0]:
                pos = pygame.mouse.get_pos()
                row, col = get_mouse_pos(pos, win_size, rows)
                spot = grid[row][col]
                if not start and spot != end:
                    start = spot
                    start.make_start()
                elif not end and spot != start:
                    end = spot
                    end.make_end()
                elif spot != start and spot != end:
                    spot.make_barrier()
            if pygame.mouse.get_pressed()[2]:
                pos = pygame.mouse.get_pos()
                row, col = get_mouse_pos(pos, win_size, rows)
                spot = grid[row][col]
                spot.reset()
                if spot == start:
                    start = None
                elif spot == end:
                    end = None
            if event.type == pygame.KEYDOWN:
                if event.key == pygame.K_SPACE:
                    gridworld = Gridworld(grid, win_size, rows)
                    s, e, b = gridworld.get_start_end_barrier()
                    print('Training the model')
                    q_table = q_learning.fit(s, e, q_table, gridworld)
                    path = q_learning.return_path(s, e, q_table, gridworld)
                    print(path)
                    for p in path:
                        if p in b:
                            print('Tune the Hyperparameters')
                    for i in path[1:-1]:
                        spot = grid[i[1]][i[0]]
                        spot.make_path()
        draw(win, rows, grid)
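The `Qlearning.fit` and `return_path` methods used above are not shown. As an illustrative sketch only, one common way a return_path-style helper works is to walk the greedy policy over the trained Q-table from the start cell to the end cell; the `next_state` callback and the dict-of-dicts Q-table layout below are assumptions, not the project's actual implementation.

# Hedged sketch: follow the greedy action at each state until the goal is reached.
def greedy_path(start, end, q_table, next_state, max_len=1000):
    path, state = [start], start
    while state != end and len(path) < max_len:
        best_action = max(q_table[state], key=q_table[state].get)  # assumes Q[state] is a dict action -> value
        state = next_state(state, best_action)
        path.append(state)
    return path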
master.bind("<Right>", call_right) master.bind("<Left>", call_left) me = board.create_rectangle(player[0] * Width + Width * 2 / 10, player[1] * Width + Width * 2 / 10, player[0] * Width + Width * 8 / 10, player[1] * Width + Width * 8 / 10, fill="blue", width=1, tag="me") board.grid(row=0, column=0) ''' ============== 3. Construct Environment with the data provided. ============== ''' env = Environment(n, m, transitions, rewards) ''' ============== 4. Call the Q-learning class to build Q-matrix. ============== ''' print("Building Q-Matrix...\n") q_matrix = [] q_1 = Qlearning.q_learning(env, unsafe_dictionary, state_goal, 0.9) env.exchange_start_goal(state_0, state_goal, unsafe_dictionary) q_2 = Qlearning.q_learning(env, punishment_dict, state_0, 0.9) env.exchange_start_goal(state_goal, state_0, unsafe_dictionary) q_matrix.append(q_1) q_matrix.append(q_2) ''' ==== 5. Construct agent with data provided and Q-matrix (Knowledge Base). ==== ''' i_agent = Agent(state_0, env, q_matrix) ''' ===================== 6. Build path (non-deterministic). ===================== ''' discount = 0.3 steps = [[], []] for i in range(len(q_matrix)): while True: next_action = i_agent.agent_function(i) next_state, new_action = i_agent.env.transition( i_agent.current_state, next_action)
def __init__(self, knowledge_filename, unjustified=0.1, *args, **kwargs):
    self.mind = Qlearning.Qlearning(MF=unjustified)
    self.knowledge_filename = knowledge_filename
    super(CompPlayer, self).__init__(*args, **kwargs)
# Environment definition and initialisation
env = GridWorld_ORIGINAL_PB
# env = GridWorld_SIMPLIFIED_PB
env.reset()

# Parameters
Tmax = 100         # time horizon
nbEpisods = 10000  # number of episodes
eps = 0.4  # 0.7   # initial exploration/exploitation trade-off

# Q-learning
print('Visualise cumulated discounted reward from the first trajectory')
Q, V, policy = Qlearning(env, eps, nbEpisods, Tmax, plot=True, return_trajectories=False)
print("final V", V)
print("final policy: ", policy)

# Visualise policy
print('Visualise policy')
gui.render_policy(env, policy)

# Trajectory visualisation
print('Visualise trajectory')
for t in range(10):
    env.render = True
    state = env.reset()
    fps = 10
    for i in range(20):  # Tmax):
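For context on the three values returned above: in tabular Q-learning, V and the greedy policy are both derived directly from Q. The short sketch below shows that relationship, assuming a NumPy array of shape (n_states, n_actions), which may differ from how the project's own Qlearning function stores its table.

# Hedged sketch: V(s) = max_a Q(s, a) and pi(s) = argmax_a Q(s, a).
import numpy as np

def greedy_from_Q(Q):
    V = Q.max(axis=1)          # state values under the greedy policy
    policy = Q.argmax(axis=1)  # greedy action per state
    return V, policy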
# str = 'we have a small house, we been livd here for 2 years, my children like to play in the back yard'

def lexical_diversity(text):
    return len(set(text)) / len(text)

# print lexical_diversity(str)
# print sentiment("what is the weather?")

reward_table = pd.read_pickle("reward2.pkl")
turn_list = ['0', '1', '2']
rate_table = pd.DataFrame([(0.02, 0.01, 0.005)], columns=turn_list)
q_learning = Qlearning.Qlearning(actions=action_list, learning_rate=rate_table)
q_table = q_learning.q_table

for state in state_list:
    q_table = q_table.append(
        pd.Series(
            [0] * len(action_list),
            index=q_table.columns,
            name=state,
        ))

value = []
for file in all_convs:
    with codecs.open("NewData2/" + file, 'r', 'utf-8') as f:
        lines = f.readlines()
        utt_list = []
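One portability note on the q_table construction above: DataFrame.append was deprecated and then removed in pandas 2.0, so on current pandas the same zero-initialised rows can be added with pd.concat. The snippet below is a sketch of that alternative, reusing the state_list and q_table names from the code above.

# Hedged sketch: pandas >= 2.0 replacement for the q_table.append loop above.
import pandas as pd

new_rows = pd.DataFrame(0, index=state_list, columns=q_table.columns)
q_table = pd.concat([q_table, new_rows])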
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Oct 30 11:09:02 2019

@author: nizar
"""
import Qlearning

print("-----------------Trial Watkins for 4 * 4----------------")
# Qlearning.Qlearning('Tabular', 'FrozenLake-v0', 10000, 100, 0.1, 0.999, 1, 0.4, 1, 1)
# reaches a 0.9 average reward over the last 1000 runs

print("-----------------Trial for 8 * 8----------------")
# Qlearning.Qlearning('Tabular', 'FrozenLake8x8-v0', 100000, 100, 0.1, 0.9, 1, 0.5, 1, 1)
# doesn't work with the same number of episodes, but if you multiply it by 10
# you can reach a 0.5 average reward over the last 1000 runs

print("-----------------Trial Zap for 4 * 4----------------")
Qlearning.Qlearning('Zap', 'CartPole-v1', 3000, 100, 0.9, 0.9, 1, 0.3, 1, 1)

print("-----------------Trial Zap for 8 * 8----------------")
# Qlearning.Qlearning('Zap', 'FrozenLake8x8-v0', 5000, 100, 0.9, 0.9, 1, 0.3, 1, 1)
from tictactoe.TicTacToeGame import TicTacToeGame, display
from tictactoe.TicTacToePlayers import *
# from gobang.GobangGame import GobangGame, display
# from gobang.GobangPlayers import *
import numpy as np
import Arena
from MCS import *
from MCTS import *
from Qlearning import *
from utils import *

"""
Use this script to play any two agents against each other, or play manually with any agent.
"""

# Define the game
# g = Connect4Game(4)
g = TicTacToeGame()
# g = GobangGame(7)

# Define the players
rp = RandomPlayer(g).play
mcsp = MCS(g, 100).play
mctsp = MCTS(g, 100).play
qlp = Qlearning(g, 100, 0.01, 0.9, 0.9).play

arena_rp_op = Arena.Arena(mcsp, rp, g, display=display)
print(arena_rp_op.playGames(10, verbose=False))

arena_rp_op = Arena.Arena(mctsp, rp, g, display=display)
print(arena_rp_op.playGames(10, verbose=False))

arena_rp_op = Arena.Arena(qlp, rp, g, display=display)
print(arena_rp_op.playGames(10, verbose=False))