import time

from model import DQN
from rewards import CustomReward

# CONFIG
wave = True                     # True: arlie env with a custom reward; False: gym baseline
learn_timesteps = 1 * int(1e3)

if wave:
    import arlie
    env = arlie.make("LunarLander", render_mode=False, reward=CustomReward())
else:
    import gym
    env = gym.make("LunarLander-v2")

model = DQN(env)

print("Training...")
_t = time.time()
model.learn(total_timesteps=learn_timesteps)
t = time.time() - _t
str_t = time.strftime("%H h, %M m, %S s", time.gmtime(t))
print("Trained in {} over {} timesteps".format(str_t, learn_timesteps))

model.save("{}-trained-model".format("wave" if wave else "gym"))
env.close()
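# --- Hypothetical usage sketch (not part of the original script) ---
# How the saved model might be loaded and rolled out afterwards. DQN.load and
# model.predict are assumptions here: the local `model` module above only
# demonstrably exposes DQN(env), learn, and save. The gym calls are standard API.
import gym
from model import DQN

env = gym.make("LunarLander-v2")
model = DQN(env)
model.load("gym-trained-model")          # assumed counterpart to model.save above

observation = env.reset()
done = False
total_reward = 0.0
while not done:
    action = model.predict(observation)  # assumed greedy-action method
    observation, reward, done, info = env.step(action)
    total_reward += reward
print("Episode reward:", total_reward)
env.close()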
observation_next, reward, done, info = env.step(action)  # the env takes the action and returns the next state and reward

if use_my_reward:
    # Unpack the state, which has four components
    x, x_dot, theta, theta_dot = observation_next
    # Shape the reward from the cart position (x) and pole angle (theta)
    # thresholds; gym's built-in reward also works, but this is said to be
    # more sample-efficient
    reward1 = (env.x_threshold - abs(x)) / env.x_threshold - 0.8
    reward2 = (env.theta_threshold_radians - abs(theta)) / env.theta_threshold_radians - 0.5
    reward = reward1 + reward2  # combine the two reward terms
    if done:
        reward = 0

RL.store_transition(observation, action, reward, observation_next)  # store the transition in replay memory first
ep_r += reward  # track the episode reward to check the model behaves as expected
# Why not store the failure cases too?

if steps > 500:  # don't learn at the very start; accumulate experience first
    RL.learn()

if done:
    # Episode over: print the accumulated reward so you can watch how it
    # changes per episode and judge whether this run of actions was any good
    if ep_r > 400:
        RENDER = True
    print('step:', steps, 'episode:', episode,
          'ep_r:', round(ep_r, 2), "RL's epsilon:", round(RL.epsilon, 3))
    break

observation = observation_next  # update the state
steps += 1

RL.plot_cost()  # after training ends, inspect the cost curve
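# --- Minimal sketch of the agent interface the loop above assumes ---
# The method/attribute names (store_transition, learn, epsilon, plot_cost)
# come from the snippet itself; everything inside them is hypothetical, shown
# with a plain replay buffer, a stub Q-function, and a placeholder loss so
# the structure is clear.
import random
from collections import deque

class SketchAgent:
    def __init__(self, n_actions, memory_size=2000,
                 epsilon_max=0.9, epsilon_increment=0.001):
        self.n_actions = n_actions
        self.memory = deque(maxlen=memory_size)  # replay buffer of (s, a, r, s') tuples
        self.epsilon = 0.0                       # greedy probability, annealed upward
        self.epsilon_max = epsilon_max
        self.epsilon_increment = epsilon_increment
        self.cost_history = []                   # per-update losses, for plot_cost

    def store_transition(self, s, a, r, s_next):
        self.memory.append((s, a, r, s_next))

    def q_values(self, s):
        # Stub Q-function; a real agent would evaluate its network here
        return [0.0] * self.n_actions

    def choose_action(self, s):
        # Epsilon-greedy: act greedily with probability epsilon, else randomly
        if random.random() < self.epsilon:
            q = self.q_values(s)
            return max(range(self.n_actions), key=q.__getitem__)
        return random.randrange(self.n_actions)

    def learn(self):
        # Sample a minibatch; a real agent would fit the Q-network on it
        batch = random.sample(self.memory, min(32, len(self.memory)))
        self.cost_history.append(0.0)            # placeholder for the batch loss
        self.epsilon = min(self.epsilon + self.epsilon_increment, self.epsilon_max)

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(self.cost_history)
        plt.xlabel("training step")
        plt.ylabel("cost")
        plt.show()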