t = 0
track_r = []
while True:
    # if RENDER: env.render()
    env.render()

    a = actor.choose_action(s)

    s_, r, done, info = env.step(a)

    # Shaped reward: absolute distance of the cart position from -0.46
    position, velocity = s_
    r = abs(position + 0.46)

    track_r.append(r)

    # The critic evaluates the transition and returns the TD error,
    # which the actor then uses to update its policy
    td_error = critic.learn(s, r, s_)
    actor.learn(s, a, td_error)

    s = s_
    t += 1

    if done:
        ep_rs_sum = sum(track_r)
        if 'running_reward' not in globals():
            running_reward = ep_rs_sum
        else:
            running_reward = running_reward * 0.95 + ep_rs_sum * 0.05
        if running_reward > 200:
            RENDER = True
        print("episode: ", i_episode, " reward: ", int(running_reward))
        break  # end the episode; otherwise the while True loop never exits
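Both loops rely on critic.learn(state, reward, next_state) returning the one-step TD error, typically delta = r + gamma * V(s') - V(s), which the actor then uses as its update signal. The tutorial's critic is presumably a small neural network; purely as a minimal sketch, assuming a linear value function and hypothetical lr / gamma parameters, such a critic could look like this:

import numpy as np

class Critic:
    """Hypothetical TD(0) critic with a linear value function V(s) = w . s.
    Sketch only; it just illustrates where the TD error comes from."""

    def __init__(self, n_features, lr=0.01, gamma=0.9):
        self.w = np.zeros(n_features)
        self.lr = lr
        self.gamma = gamma

    def v(self, s):
        # State value under the current weights
        return float(np.dot(self.w, np.asarray(s, dtype=float)))

    def learn(self, s, r, s_):
        # TD error: delta = r + gamma * V(s') - V(s)
        td_error = r + self.gamma * self.v(s_) - self.v(s)
        # Semi-gradient update of the value weights toward the TD target
        self.w += self.lr * td_error * np.asarray(s, dtype=float)
        return td_error

    def reset(self):
        # Hypothetical per-episode hook used by the second loop;
        # a linear critic keeps no episode state, so it is a no-op here
        pass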
action = actor.choose_action(observation)
running_reward = 0
critic.reset()
count = 0
while count < Tmax:
    count += 1
    if RENDER:
        env.render()

    observation_, reward, done, info = env.step(action)  # reward = -1 in all cases
    # print(action, reward, observation_)
    running_reward += reward

    if done:
        Tmax = count
        delta = critic.learn(observation, reward, observation_)
        actor.learn(observation, action, delta)
        print(i_episode, running_reward)
        if running_reward > DISPLAY_REWARD_THRESHOLD and i_episode > 1900:
            RENDER = True  # rendering
        break
    else:
        # On-policy (SARSA-style) step: choose the next action first,
        # then update critic and actor with the TD error for this transition
        action_ = actor.choose_action(observation_)
        delta = critic.learn(observation, reward, observation_)
        actor.learn(observation, action, delta)
        observation = observation_
        action = action_
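The second loop assumes an actor exposing choose_action(state) and learn(state, action, td_error). As a sketch only, assuming a softmax policy with linear action preferences rather than the tutorial's actual network, that interface could be implemented like this:

import numpy as np

class Actor:
    """Hypothetical softmax-policy actor with linear action preferences.
    Illustrates how the critic's TD error scales the policy-gradient step."""

    def __init__(self, n_features, n_actions, lr=0.001):
        self.theta = np.zeros((n_actions, n_features))
        self.lr = lr
        self.n_actions = n_actions

    def _probs(self, s):
        prefs = self.theta @ np.asarray(s, dtype=float)
        prefs -= prefs.max()          # numerical stability
        e = np.exp(prefs)
        return e / e.sum()

    def choose_action(self, s):
        # Sample an action from the current softmax policy
        return int(np.random.choice(self.n_actions, p=self._probs(s)))

    def learn(self, s, a, td_error):
        # Policy-gradient step: grad log pi(a|s) scaled by the TD error
        s = np.asarray(s, dtype=float)
        probs = self._probs(s)
        for b in range(self.n_actions):
            grad = ((1.0 if b == a else 0.0) - probs[b]) * s
            self.theta[b] += self.lr * td_error * grad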