import random

import numpy as np

from environment import Env


def arg_max(q_list):
    # gather every index that attains the maximal Q-value,
    # then break ties at random
    max_idx_list = np.argwhere(q_list == np.amax(q_list))
    max_idx_list = max_idx_list.flatten().tolist()
    return random.choice(max_idx_list)


if __name__ == "__main__":
    env = Env()
    agent = QLearningAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        # reset the game environment and initialize the state
        state = env.reset()

        while True:
            env.render()

            # choose an action for the current state
            action = agent.get_action(str(state))
            # take the action, then receive the next state, the reward,
            # and whether the episode has ended
            next_state, reward, done = env.step(action)
            # update the Q-function with the sample <s,a,r,s'>
            agent.learn(str(state), action, reward, str(next_state))
            state = next_state
            # display all Q-values on the screen
            env.print_value_all(agent.q_table)

            if done:
                break
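The loop above keeps the agent itself out of view. A minimal sketch of what its learn method presumably does, following the standard Q-learning update Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)); the class layout and hyperparameter names below are assumptions, not the original implementation.

import random
from collections import defaultdict

# Illustrative sketch of the tabular agent assumed by the loop above;
# hyperparameter names and defaults are guesses, not the source code.
class QLearningAgent:
    def __init__(self, actions, step_size=0.01, discount_factor=0.9):
        self.actions = actions
        self.step_size = step_size
        self.discount_factor = discount_factor
        # one Q-value per action for every state key
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def learn(self, state, action, reward, next_state):
        # off-policy TD update: bootstrap from the best next action,
        # regardless of which action will actually be taken next
        q_value = self.q_table[state][action]
        td_target = reward + self.discount_factor * max(self.q_table[next_state])
        self.q_table[state][action] += self.step_size * (td_target - q_value)

Because the target takes the max over the next state's Q-values rather than the action actually taken, the update is off-policy; this is the key difference from the SARSA loop that follows.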
env = Env()
agent = SARSAgent(actions=list(range(env.n_actions)))

for episode in range(1000):
    # reset environment and initialize state
    state = env.reset()
    # get action of state from agent
    action = agent.get_action(str(state))

    while True:
        env.render()

        # take action and proceed one step in the environment
        next_state, reward, done = env.step(action)
        next_action = agent.get_action(str(next_state))

        # with sample <s,a,r,s',a'>, agent learns new q function
        agent.learn(str(state), action, reward, str(next_state), next_action)

        state = next_state
        action = next_action

        # print q function of all states on screen
        env.print_value_all(agent.q_table)

        # if episode ends, then break
        if done:
            break
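SARSA differs from Q-learning only in its target: it bootstraps from the next action the agent actually selects, which makes the update on-policy and is why the loop carries both a next_state and a next_action. A hedged sketch of the learn method the loop assumes; the class and hyperparameter names are illustrative.

from collections import defaultdict

# Illustrative sketch of the SARSAgent update; names and defaults are
# assumptions, not the original source.
class SARSAgent:
    def __init__(self, actions, step_size=0.01, discount_factor=0.9):
        self.actions = actions
        self.step_size = step_size
        self.discount_factor = discount_factor
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def learn(self, state, action, reward, next_state, next_action):
        # on-policy TD update: bootstrap from the action the agent
        # will actually take in the next state
        q_value = self.q_table[state][action]
        td_target = reward + self.discount_factor * self.q_table[next_state][next_action]
        self.q_table[state][action] += self.step_size * (td_target - q_value)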
from environment import Env
from QLearning import QLearning

if __name__ == "__main__":
    env = Env()
    QL = QLearning(list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()

        while True:
            env.render()

            # take action and proceed one step in the environment
            action = QL.get_action(str(state))
            next_state, reward, done = env.step(action)

            # with sample <s,a,r,s'>, agent learns new q function
            QL.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(QL.q_table)

            # if episode ends, then break
            if done:
                break
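Every one of these training loops delegates action selection to get_action. A common choice for tabular agents is an epsilon-greedy policy over the Q-table, and the sketch below assumes that; the function name, signature, and epsilon default are illustrative, and it reuses the random tie-breaking idea from arg_max.

import random

import numpy as np

def get_action(q_table, state, actions, epsilon=0.1):
    # hypothetical epsilon-greedy selection over a tabular Q-function
    if np.random.rand() < epsilon:
        # explore: sample a uniformly random action
        return random.choice(actions)
    # exploit: choose among the highest-valued actions, ties broken at random
    q_list = np.array(q_table[state])
    best = np.argwhere(q_list == np.amax(q_list)).flatten().tolist()
    return random.choice(best)

With epsilon = 0.1 the agent explores ten percent of the time; annealing epsilon toward zero over episodes is a common refinement.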
import random

from environment import Env


def arg_max(state_action):
    # return the index of the maximal Q-value, breaking ties randomly
    max_index_list = []
    max_value = state_action[0]
    for index, value in enumerate(state_action):
        if value > max_value:
            max_index_list.clear()
            max_value = value
        if value == max_value:
            max_index_list.append(index)
    return random.choice(max_index_list)


if __name__ == "__main__":
    env = Env()
    agent = QLearningAgent(actions=list(range(env.n_actions)))

    for episode in range(1000):
        state = env.reset()

        while True:
            env.render()

            # take action and proceed one step in the environment
            action = agent.get_action(str(state))
            agent.save_samples_for_print(state, action)
            next_state, reward, done = env.step(action)

            # with sample <s,a,r,s'>, agent learns new q function
            agent.learn(str(state), action, reward, str(next_state))

            state = next_state
            env.print_value_all(agent.q_table, agent.samples_for_print)

            # if episode ends, then break
            if done:
                agent.samples_for_print.clear()
                break
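The only new pieces in this variant are save_samples_for_print and the samples_for_print buffer, which let print_value_all overlay the episode's trajectory on the rendered Q-values before the buffer is cleared at episode end. Their implementation is not shown; in the plausible sketch below, only the names come from the loop above and the bodies are assumptions.

# Sketch of the bookkeeping the loop above relies on; the method body is
# a guess, only the attribute and method names appear in the source.
class QLearningAgent:
    def __init__(self, actions):
        self.actions = actions
        # (state, action) pairs visited during the current episode
        self.samples_for_print = []

    def save_samples_for_print(self, state, action):
        # record the pair so env.print_value_all can highlight the
        # trajectory alongside the Q-values
        self.samples_for_print.append((state, action))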
from collections import defaultdict
import time

import numpy as np

from environment import Env

env = Env()
env.reset()

# print the action space
print(env.n_actions)

# pass the agent's action to the environment and get the result back
next_state, reward, done = env.step(1)
print(next_state, reward, done)

# defaultdict that will hold the q function
q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
q_table[str([1, 0])] = [100., 100., 100., 100.]
print(q_table['blah blah'])
print(q_table)

# draw the q function on the environment
env.print_value_all(q_table)
# update the environment display
env.render()
time.sleep(3)

# let the agent act repeatedly
for i in range(100):
    action = np.random.choice(4)
    env.step(action)
    env.render()

env.reset()
env.destroy()
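One subtlety this snippet exercises: reading a missing key from a defaultdict does not just return the default value, it inserts it. That is why q_table['blah blah'] prints [0.0, 0.0, 0.0, 0.0] and the key then appears in the subsequent print(q_table). A standalone demonstration of that standard-library behavior:

from collections import defaultdict

q_table = defaultdict(lambda: [0.0, 0.0, 0.0, 0.0])
print(q_table['unseen state'])    # [0.0, 0.0, 0.0, 0.0]
print('unseen state' in q_table)  # True: the lookup itself stored the entry

This is convenient for tabular agents, since every state can be looked up without initialization, but it also means stray lookups silently grow the table.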