while True:
    if RENDER:
        env.render()

    action = RL.choose_action(observation)
    observation_, reward, done, info = env.step(action)

    RL.store_transition(observation, action, reward)

    if done:
        ep_rs_sum = sum(RL.ep_rs)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = False  # rendering kept off in this variant; the others set True here
        print("episode:", i_episode, " reward:", int(running_reward))

        vt = RL.learn()  # train at episode end

        if i_episode == 30:
            plt.plot(vt)  # plot this episode's vt
            plt.xlabel('episode steps')
            plt.ylabel('normalized state-action value')
            plt.show()
        break

    observation = observation_
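# The vt plotted above is whatever RL.learn() returns. These snippets appear to
# follow Morvan Zhou's PolicyGradient tutorial, where learn() returns the
# episode's discounted returns normalized to zero mean and unit variance.
# A minimal sketch of that computation, assuming a gamma hyperparameter and the
# per-episode reward list ep_rs (the small epsilon guard is an added safety tweak):
import numpy as np

def discount_and_norm_rewards(ep_rs, gamma=0.99):
    """Discounted returns G_t = sum_k gamma^k * r_{t+k}, then normalized."""
    discounted = np.zeros(len(ep_rs), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(ep_rs))):
        running_add = running_add * gamma + ep_rs[t]
        discounted[t] = running_add
    discounted -= np.mean(discounted)
    discounted /= np.std(discounted) + 1e-8  # guard against zero variance
    return discounted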
episode_reward += reward

if done:
    # debug checks: RL.ep_obs.shape, np.vstack(RL.ep_obs).shape, np.array(RL.ep_as).shape
    ep_rs_sum = sum(RL.ep_rs)
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # rendering
    if running_reward > current_max:
        RL.save_model()  # checkpoint whenever the running reward hits a new high
        current_max = running_reward
    print("episode:", i_episode, " episode_reward:", episode_reward,
          " running_reward:", int(running_reward), " t:", t)

    vt = RL.learn()

    if i_episode == 0:
        plt.plot(vt)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break
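# The buffers this variant inspects (ep_obs, ep_as, ep_rs) are per-episode lists
# that store_transition appends to, cleared after each learn() call. A minimal
# sketch of that bookkeeping; the method names follow the snippets, the body is
# an assumption:
class EpisodeBuffer:
    def __init__(self):
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []

    def store_transition(self, s, a, r):
        # one (state, action, reward) triple per environment step
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)

    def clear(self):
        # called at the end of learn(): Monte Carlo PG trains once per episode
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []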
if RENDER:
    env.render()

action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)  # reward = -1 in all cases

RL.store_transition(observation, action, reward)

if done:
    # calculate running reward
    ep_rs_sum = sum(RL.ep_rs)
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # rendering
    print("episode:", i_episode, " reward:", int(running_reward))

    vt = RL.learn()  # train

    if i_episode == 30:
        plt.plot(vt)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_
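# choose_action in a policy-gradient agent samples from the policy network's
# softmax output rather than acting greedily, which is what gives exploration.
# A minimal sketch of that sampling, assuming a policy_probs(state) helper that
# returns the action-probability row (the helper name is hypothetical):
import numpy as np

def choose_action(observation, policy_probs):
    probs = policy_probs(observation[np.newaxis, :])  # shape: (1, n_actions)
    return np.random.choice(len(probs.ravel()), p=probs.ravel())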
while True:
    if RENDER:
        env.render()

    action = RL.choose_action(observation)
    observation_, reward, done, info = env.step(action)

    RL.store_transition(observation, action, reward)  # store this step's transition

    if done:
        ep_rs_sum = sum(RL.ep_rs)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = True  # decide whether to render the simulation
        print("episode:", i_episode, " reward:", int(running_reward))

        vt = RL.learn()  # learn; returns vt

        if i_episode == 0:
            plt.plot(vt)  # plot the episode vt
            plt.xlabel('episode steps')
            plt.ylabel('normalized state-action value')
            plt.show()
        break

    observation = observation_
while True:
    if RENDER:
        env.render()

    action = RL.choose_action(observation)
    observation_, reward, done, info = env.step(action)  # reward = -1 in all cases

    RL.store_transition(observation, action, reward)

    if done:
        ep_rs_sum = sum(RL.ep_rs)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
        # if running_reward > DISPLAY_REWARD_THRESHOLD: RENDER = True
        print("episode:", i_episode, ", reward:", int(running_reward))

        vt = RL.learn()  # training happens here

        if i_episode == 30:
            plt.plot(vt)
            plt.xlabel('episode steps')
            plt.ylabel('normalized state-action value')
            plt.show()
        break

    observation = observation_
print('info:', info)

RL.store_transition(observation, action, reward)  # store this step's transition

# make the next state_ the state for the next loop iteration
observation = observation_

# episode finished
if done:
    # sum of this episode's rewards
    ep_rs_sum = sum(RL.ep_rs)

    # globals() returns a dict of all global variables at the current position
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        # exponential moving average: smooths out the noisy per-episode return
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # decide whether to render the simulation
    print("episode:", i_episode, " reward:", int(running_reward))

    # normalized return for this episode
    vt = RL.learn()  # learn; returns vt, whose role is covered in the next lesson

    if i_episode == 0:
        plt.plot(vt)  # plot this episode's vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break
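# Why the 0.99 / 0.01 update: it is an exponential moving average, so a single
# unusually good or bad episode barely moves the tracked reward. A standalone
# demonstration (the episode returns below are made-up numbers):
running = None
for ep_rs_sum in [20, 25, 200, 22]:  # third episode is an outlier
    running = ep_rs_sum if running is None else running * 0.99 + ep_rs_sum * 0.01
    print(round(running, 2))  # -> 20, 20.05, 21.85, 21.85: the spike is damped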
for i_episode in range(10):
    observation = env.reset()
    while True:
        env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)

        position, velocity = observation_
        # reshape the reward: the higher (farther from -0.5) the better
        reward = abs(position - (-0.5))

        RL.store_transition(observation, action, reward, observation_)

        if total_steps > 1000:
            RL.learn()
            print('episode: ', i_episode, "step: ", RL.learn_steps,
                  'cost: ', round(RL.cost, 4), ' epsilon: ', round(RL.epsilon, 2))

        if done:
            break

        observation = observation_
        total_steps += 1

RL.plot_cost()
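# Note this snippet differs from the others: it stores (s, a, r, s_) tuples and
# calls learn() on every step past a warm-up, i.e. a step-wise value-based agent
# with an epsilon schedule rather than Monte Carlo policy gradient. The reshaped
# reward replaces MountainCar's constant -1 per step: abs(position - (-0.5))
# grows as the car gets farther from the starting region around -0.5, giving a
# denser learning signal. A quick check on a few positions (illustrative only):
for position in (-0.5, -0.2, 0.3, 0.5):
    print(position, '->', abs(position - (-0.5)))  # 0.0, 0.3, 0.8, 1.0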
)

for i_episode in range(3000):
    observation = env.reset()  # get episode i_episode's first observation

    while True:
        if RENDER:
            env.render()  # refresh the environment

        action = RL.choose_action(observation)  # pick an action
        observation_, reward, done, info = env.step(action)  # get the next state

        RL.store_transition(observation, action, reward)  # store this step's transition

        if done:  # episode over; start updating parameters
            ep_rs_sum = sum(RL.ep_rs)  # total reward for this episode
            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True  # decide whether to start rendering
            print("episode:", i_episode, " reward:", int(running_reward))

            vt = RL.learn()  # update parameters; returns vt

            if i_episode == 0:  # plot
                plt.plot(vt)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
RL.store_transition(observation, action, reward)

if done:
    print('Done', done)
    ep_rs_sum = sum(RL.ep_rs)
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # rendering
    print("episode:", i_episode, " reward:", int(running_reward))

    vt, loss = RL.learn()
    Y.append(loss)

    if i_episode == 0:
        plt.plot(vt)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_

fig = plt.figure()
X = np.array(X)
Y = np.array(Y)
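# The variant above tracks the per-episode loss in Y but stops right after
# building the arrays. Presumably the missing step draws the loss curve; a
# hedged completion, assuming X holds episode indices (inferred from the names):
plt.plot(X, Y)
plt.xlabel('episode')
plt.ylabel('loss')
plt.show()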
while True:
    if RENDER:
        env.render()

    action = RL.choose_action(observation)
    observation_, reward, done, info = env.step(action)

    RL.store_transition(observation, action, reward)

    if done:
        ep_rs_sum = sum(RL.ep_rs)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = True  # rendering
        print("episode:", i_episode, " reward:", int(running_reward))

        vt = RL.learn()

        if i_episode == 0:
            plt.plot(vt)  # plot the episode vt
            plt.xlabel('episode steps')
            plt.ylabel('normalized state-action value')
            plt.show()
        break

    observation = observation_
action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)  # reward = -1 in all cases

RL.store_transition(observation, action, reward)

if done:
    # calculate running reward
    ep_rs_sum = sum(RL.ep_rs)
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # rendering
    print("episode:", i_episode, " reward:", int(running_reward))

    vt, loss = RL.learn()  # train

    if i_episode == 5:
        plt.plot(vt)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_
if RENDER:
    env.render()

action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)  # reward = -1 in all cases

RL.store_transition(observation, action, reward)

if done:
    # calculate running reward
    ep_rs_sum = sum(RL.ep_rs)
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # rendering
    print("episode:", i_episode, " reward:", int(running_reward))

    vt = RL.learn()  # train; episodic update, run once after all steps of this game end

    if i_episode == 30:
        plt.plot(vt)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_
if RENDER:
    env.render()

action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)

RL.store_transition(observation, action, reward)

if done:  # this round of the game is over
    ep_rs_sum = sum(RL.ep_rs)  # total reward of every action taken this episode
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True  # rendering
    print("episode:", i_episode, " reward:", int(running_reward))

    vt = RL.learn()  # train the model

    if i_episode == 30:
        plt.plot(vt)  # plot the episode vt
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_
if RENDER:
    env.render()

action = RL.choose_action(observation)
observation_, reward, done, info = env.step(action)

RL.store_transition(observation, action, reward)

if done:
    # compute the running reward
    ep_rs_sum = sum(RL.ep_rs)
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True
    print("episode:", i_episode, " reward:", int(running_reward))

    vt = RL.learn()  # learn once the episode ends

    if i_episode == 30:
        plt.plot(vt)  # show vt at episode 30
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_
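# All the variants above assume the same surrounding setup: a gym environment
# (CartPole in most, MountainCar in the shaped-reward one), a PolicyGradient
# agent, and the RENDER / DISPLAY_REWARD_THRESHOLD globals. A minimal sketch of
# that scaffold; the hyperparameter values are illustrative, not necessarily
# the tutorial's exact ones:
import gym
import matplotlib.pyplot as plt

DISPLAY_REWARD_THRESHOLD = 400  # render once running reward exceeds this
RENDER = False                  # rendering slows training, so start disabled

env = gym.make('CartPole-v0')
env.seed(1)          # reproducible episodes
env = env.unwrapped  # remove the default episode step limit

RL = PolicyGradient(
    n_actions=env.action_space.n,
    n_features=env.observation_space.shape[0],
    learning_rate=0.02,
    reward_decay=0.99,  # gamma used when discounting ep_rs
)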