def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    env = env_init_1()
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    render = False  # whether to open the GUI display
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, cfg.max_episodes + 1):
        ep_reward = 0  # reward accumulated within this episode
        ep_steps = 0  # number of steps taken within this episode
        obs = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.sample(obs)  # choose an action (epsilon-greedy)
            next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
            # Q-learning update; unlike Sarsa, the next action is not needed
            agent.learn(obs, action, reward, next_obs, done)
            obs = next_obs  # the next observation becomes the current one
            ep_reward += reward
            ep_steps += 1  # count steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
              (i_episode, ep_steps, ep_reward, agent.epsilon))
        # render every 20th episode to check progress
        render = (i_episode % 20 == 0)
    print('Complete training!')
    save_model(agent, model_path=SAVED_MODEL_PATH)
    # save rewards and related results
    save_results(rewards, MA_rewards, tag='train', result_path=RESULT_PATH)
    plot(rewards)
    plot(MA_rewards, ylabel='moving_average_rewards_train')
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    agent.load_model(saved_model_path + 'checkpoint.npy')  # load the saved model
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, 10 + 1):
        ep_reward = 0  # reward accumulated within this episode
        ep_steps = 0  # number of steps taken within this episode
        obs = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.predict(obs)  # choose the greedy action
            next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
            obs = next_obs  # the next observation becomes the current one
            time.sleep(0.5)
            env.render()
            ep_reward += reward
            ep_steps += 1  # count steps
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f' %
              (i_episode, ep_steps, ep_reward))
    print('Complete evaluation!')
    # save rewards and related results
    save_results(rewards, MA_rewards, tag='eval', result_path=RESULT_PATH)
    plot(rewards)
    plot(MA_rewards, ylabel='moving_average_rewards_eval')
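# NOTE: train() and eval() above rely on save_model, save_results, plot and the
# SAVED_MODEL_PATH / RESULT_PATH constants defined elsewhere in the repo. Below
# is a minimal sketch of what they could look like, assuming the agent exposes
# its Q-table as `agent.Q`; the bodies are assumptions, not the repo's helpers.
import os
import numpy as np
import matplotlib.pyplot as plt

SAVED_MODEL_PATH = os.path.dirname(__file__) + "/saved_model/"
RESULT_PATH = os.path.dirname(__file__) + "/result/"

def save_model(agent, model_path=SAVED_MODEL_PATH):
    # persist the Q-table next to the script (assumes `agent.Q` is a NumPy array)
    os.makedirs(model_path, exist_ok=True)
    np.save(model_path + 'checkpoint.npy', agent.Q)

def save_results(rewards, MA_rewards, tag='train', result_path=RESULT_PATH):
    os.makedirs(result_path, exist_ok=True)
    np.save(result_path + 'rewards_' + tag + '.npy', rewards)
    np.save(result_path + 'MA_rewards_' + tag + '.npy', MA_rewards)

def plot(values, ylabel='rewards'):
    plt.figure()
    plt.plot(values)
    plt.xlabel('episode')
    plt.ylabel(ylabel)
    plt.show()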
def train(cfg):
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)
    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    render = False  # whether to open the GUI display
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, cfg.max_episodes + 1):
        ep_reward = 0  # reward accumulated within this episode
        ep_steps = 0  # number of steps taken within this episode
        obs = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.sample(obs)  # choose an action (epsilon-greedy)
            next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
            # Q-learning update; unlike Sarsa, the next action is not needed
            agent.learn(obs, action, reward, next_obs, done)
            obs = next_obs  # the next observation becomes the current one
            ep_reward += reward
            ep_steps += 1  # count steps
            if render:
                env.render()  # render a new frame
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f, explore = %.2f' %
              (i_episode, ep_steps, ep_reward, agent.epsilon))
        # render every 20th episode to check progress
        render = (i_episode % 20 == 0)
    agent.save()  # training finished, save the model
    output_path = os.path.dirname(__file__) + "/result/"
    # create the output folder if it does not exist
    if not os.path.exists(output_path):
        os.mkdir(output_path)
    np.save(output_path + "rewards_train.npy", rewards)
    np.save(output_path + "MA_rewards_train.npy", MA_rewards)
    np.save(output_path + "steps_train.npy", steps)
def test(cfg):
    env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(env)
    agent = QLearning(
        obs_dim=env.observation_space.n,
        action_dim=env.action_space.n,
        learning_rate=cfg.policy_lr,
        gamma=cfg.gamma,
        epsilon_start=cfg.epsilon_start,
        epsilon_end=cfg.epsilon_end,
        epsilon_decay=cfg.epsilon_decay)
    agent.load()  # load the saved model
    rewards = []  # rewards of all episodes
    MA_rewards = []  # moving-average rewards
    steps = []  # step counts of all episodes
    for i_episode in range(1, 10 + 1):
        ep_reward = 0  # reward accumulated within this episode
        ep_steps = 0  # number of steps taken within this episode
        obs = env.reset()  # reset the environment, i.e. start a new episode
        while True:
            action = agent.predict(obs)  # choose the greedy action
            next_obs, reward, done, _ = env.step(action)  # one interaction with the environment
            obs = next_obs  # the next observation becomes the current one
            time.sleep(0.5)
            env.render()
            ep_reward += reward
            ep_steps += 1  # count steps
            if done:
                break
        steps.append(ep_steps)
        rewards.append(ep_reward)
        # compute the moving average of the reward
        if i_episode == 1:
            MA_rewards.append(ep_reward)
        else:
            MA_rewards.append(0.9 * MA_rewards[-1] + 0.1 * ep_reward)
        print('Episode %s: steps = %s , reward = %.1f' %
              (i_episode, ep_steps, ep_reward))
    plt.plot(MA_rewards)
    plt.show()
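# NOTE: all four functions above assume a tabular QLearning agent exposing
# sample / predict / learn plus save / load (and a load_model alias). A minimal
# sketch of such an agent follows; the exponential epsilon-decay schedule and
# the default checkpoint file name are assumptions, not the repo's exact code.
import math
import numpy as np

class QLearning(object):
    def __init__(self, obs_dim, action_dim, learning_rate=0.1, gamma=0.9,
                 epsilon_start=0.95, epsilon_end=0.01, epsilon_decay=200):
        self.action_dim = action_dim
        self.lr = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon_start
        self.epsilon_start = epsilon_start
        self.epsilon_end = epsilon_end
        self.epsilon_decay = epsilon_decay
        self.sample_count = 0
        self.Q = np.zeros((obs_dim, action_dim))  # tabular action-value estimates

    def sample(self, obs):
        # epsilon-greedy action selection used during training
        self.sample_count += 1
        self.epsilon = self.epsilon_end + (self.epsilon_start - self.epsilon_end) * \
            math.exp(-1.0 * self.sample_count / self.epsilon_decay)
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.action_dim)
        return self.predict(obs)

    def predict(self, obs):
        # greedy action used during evaluation
        return int(np.argmax(self.Q[obs]))

    def learn(self, obs, action, reward, next_obs, done):
        # Q(s,a) <- Q(s,a) + lr * (r + gamma * max_a' Q(s',a') - Q(s,a))
        target = reward if done else reward + self.gamma * np.max(self.Q[next_obs])
        self.Q[obs][action] += self.lr * (target - self.Q[obs][action])

    def save(self, path='./checkpoint.npy'):
        np.save(path, self.Q)

    def load(self, path='./checkpoint.npy'):
        self.Q = np.load(path)

    load_model = load  # name used by the first eval() variant above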
import numpy as np
from itertools import cycle
import random
import sys

from agent import QLearning

import pygame
from pygame.locals import *

from environment import create_uniform_grid, map_position_tile

bot = QLearning(2)  # agent with two actions: flap / do nothing

FPS = 30
SCREENWIDTH = 288
SCREENHEIGHT = 512
game_grid = create_uniform_grid(SCREENHEIGHT, SCREENWIDTH)
PIPEGAPSIZE = 100  # gap between upper and lower part of pipe
BASEY = SCREENHEIGHT * 0.79

# image, sound and hitmask dicts
IMAGES, SOUNDS, HITMASKS = {}, {}, {}

# list of all possible players (tuple of 3 positions of flap)
PLAYERS_LIST = (
    # red bird
    (
        'assets/sprites/redbird-upflap.png',
        'assets/sprites/redbird-midflap.png',
        'assets/sprites/redbird-downflap.png',
    ),
    # blue bird
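# NOTE: create_uniform_grid and map_position_tile are imported from the local
# environment module above. A plausible sketch, assuming they discretize the
# continuous pixel position into uniform tiles so the Q-table stays finite;
# the tile size and return shapes here are guesses, not the module's real code.
import numpy as np

def create_uniform_grid(height, width, tile_size=20):
    # one bin edge every `tile_size` pixels along each screen axis
    rows = np.arange(0, height + tile_size, tile_size)
    cols = np.arange(0, width + tile_size, tile_size)
    return rows, cols

def map_position_tile(grid, y, x):
    # map a raw pixel position to its (row, col) tile index
    rows, cols = grid
    return int(np.digitize(y, rows)), int(np.digitize(x, cols))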
sarsa.save_q_values()
sarsa_df = pd.DataFrame(data=sarsa_scores, index=n_episodes_range, columns=['score'])

# In[10]:

plot_training_results(sarsa_df, "Sarsa",
                      f"sarsa_training_scores_alpha_{alpha}_gamma_{gamma}")

# ## Q-Learning

# In[11]:

qlearning = QLearning(action_space=env.action_space, alpha=alpha, gamma=gamma)

# In[12]:

q_learning_scores = []
for i_episode in range(1, n_episodes + 1):
    env = FlappyEnvironment()
    if i_episode % 100 == 0:
        print("\rEpisode {}/{} - Max Score {}".format(
            i_episode, n_episodes, np.array(q_learning_scores).max()), end="")
        sys.stdout.flush()
    qlearning.learn(env)
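# NOTE: plot_training_results is defined in an earlier notebook cell that is
# not shown here. A minimal stand-in, assuming it plots the per-episode scores
# with a rolling mean and saves the figure under the given file name; the
# styling and window size are assumptions.
import matplotlib.pyplot as plt

def plot_training_results(df, agent_name, filename):
    # raw episode scores plus a rolling mean to smooth the curve
    ax = df['score'].plot(alpha=0.4, label='score')
    df['score'].rolling(window=100, min_periods=1).mean().plot(
        ax=ax, label='rolling mean (100 episodes)')
    ax.set_title('{} training scores'.format(agent_name))
    ax.set_xlabel('episode')
    ax.set_ylabel('score')
    ax.legend()
    plt.savefig('{}.png'.format(filename))
    plt.show()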
            # render the GUI
            env.render()

            # get an action for the current state from the agent
            action = agent.get_action(str(state))

            # take the action, then receive the next state, the reward,
            # and whether the episode is done
            state_, reward, done = env.step(action)

            # feed S, A, R, S_ into the agent's learn function
            agent.learn(str(state), action, reward, str(state_))

            # the next state becomes the current state
            state = state_
            env.print_value_all(agent.q_table)

            # break when the episode ends
            if done:
                break

    # once every episode has finished, the game is over
    print('game over')
    # env.destroy()


if __name__ == "__main__":
    env = Env()
    agent = QLearning(actions=list(range(env.n_actions)))
    update()
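# NOTE: unlike the NumPy-table agents above, this snippet keys its Q-table by
# string states and calls learn() without a terminal flag. A minimal sketch of
# such an agent, assuming a defaultdict-backed table and a fixed epsilon; the
# hyperparameter defaults are placeholders, not the original repo's values.
import random
from collections import defaultdict

class QLearning:
    def __init__(self, actions, alpha=0.01, gamma=0.9, epsilon=0.1):
        self.actions = actions  # e.g. list(range(env.n_actions))
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.q_table = defaultdict(lambda: [0.0] * len(actions))

    def get_action(self, state):
        # epsilon-greedy over this state's Q-values
        if random.random() < self.epsilon:
            return random.choice(self.actions)
        q_values = self.q_table[state]
        max_q = max(q_values)
        best = [a for a, q in enumerate(q_values) if q == max_q]
        return random.choice(best)  # break ties randomly

    def learn(self, state, action, reward, next_state):
        # every transition is bootstrapped the same way; no done flag is used
        current = self.q_table[state][action]
        target = reward + self.gamma * max(self.q_table[next_state])
        self.q_table[state][action] += self.alpha * (target - current)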