import copy
import time

import numpy as np

global_step = 0
scores, episodes = [], []

for e in range(EPISODES):
    done = False
    score = 0
    # reset the environment at the start of each episode
    state = env.reset()
    state = np.reshape(state, [1, player1.state_size])

    while not done:
        global_step += 1

        # odd turn (q-learning player) - Black
        if env.get_turn() % 2 == 1:
            # choose an action for the current state
            action = player1.get_action(state)
            # advance the environment one timestep with the chosen action
            # and collect a sample
            next_state, reward, done = env.step(BLACK, action)
            '''
            print("Action : ", action)
            print("Reward : ", reward)
            print("Next State : ", next_state)
            print()
            '''
            next_state = np.reshape(next_state, [1, player1.state_size])
            next_action = player1.get_action(next_state)
            # train the model on the sample
            player1.train_model(state, action, reward, next_state,
                                next_action, done)
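# The call above passes both next_state and next_action to train_model, which
# is the SARSA update pattern (on-policy: the bootstrap uses the action the
# agent actually chose next). Below is a minimal sketch of what such an update
# could look like, assuming a small Keras Q-network; the class name, layer
# sizes, and attribute names are illustrative assumptions, not the code this
# listing's player1 actually uses.
import numpy as np
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


class DeepSarsaSketch:
    def __init__(self, state_size, action_size, discount_factor=0.99):
        self.state_size = state_size
        self.action_size = action_size
        self.discount_factor = discount_factor
        # Q-network: maps a (1, state_size) array to (1, action_size) Q-values
        self.model = Sequential([
            Dense(30, activation='relu', input_shape=(state_size,)),
            Dense(action_size, activation='linear'),
        ])
        self.model.compile(optimizer='adam', loss='mse')

    def train_model(self, state, action, reward,
                    next_state, next_action, done):
        # SARSA target: r + gamma * Q(s', a'); no bootstrap on terminal steps
        target = self.model.predict(state, verbose=0)[0]
        if done:
            target[action] = reward
        else:
            q_next = self.model.predict(next_state, verbose=0)[0][next_action]
            target[action] = reward + self.discount_factor * q_next
        # one gradient step toward the SARSA target
        self.model.fit(state, np.reshape(target, [1, self.action_size]),
                       epochs=1, verbose=0)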
            # Black's turn (Monte Carlo-style player): collect the sample
            # and log progress
            next_state, reward, done = env.step(BLACK, action)
            next_state = np.reshape(next_state, [1, player.state_size])
            if PRINT_FLAG:
                print("Action : {0} ==> {1}, {2}".format(
                    action, int(action / 10), action % 10))
                print("Reward : ", reward)
                print("Next State : ", next_state)
                print()
            player.append_sample(state, action, reward)
            score += reward
            state = copy.deepcopy(next_state)
            if PRINT_FLAG:
                # print the board
                print("Episode : {0}, Turn : {1}, PLAYER1".format(
                    e, env.get_turn()))
                env.draw_board()
                print()
                time.sleep(1)
        # even turn - White
        else:
            # get the current state
            state = env.get_state()
            state = np.reshape(state, [1, player.state_size])
            # choose an action for the current state
            action = player.get_action(state)
            # advance the environment one timestep with the chosen action and
            # collect a sample (the original stepped with BLACK here; the
            # White branch should step as WHITE)
            next_state, reward, done = env.step(WHITE, action)
            next_state = np.reshape(next_state, [1, player.state_size])
            if PRINT_FLAG:
                print("Action : {0} ==> {1}, {2}".format(
                    action, int(action / 10), action % 10))
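# Unlike the SARSA player above, this player only calls
# append_sample(state, action, reward) during the episode, which suggests a
# Monte Carlo-style agent (e.g. REINFORCE) that learns from discounted
# returns once done becomes True. The following is a sketch of that buffering
# pattern under that assumption; the class and attribute names are
# hypothetical, not the repository's actual implementation.
class EpisodeBufferSketch:
    def __init__(self, discount_factor=0.99):
        self.discount_factor = discount_factor
        self.states, self.actions, self.rewards = [], [], []

    def append_sample(self, state, action, reward):
        # store one (s, a, r) sample from the running episode
        self.states.append(state)
        self.actions.append(action)
        self.rewards.append(reward)

    def discounted_returns(self):
        # G_t = r_t + gamma * G_{t+1}, accumulated backwards over the episode
        returns, running = [0.0] * len(self.rewards), 0.0
        for t in reversed(range(len(self.rewards))):
            running = self.rewards[t] + self.discount_factor * running
            returns[t] = running
        return returns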