# Restore trained weights from a previous checkpoint.
RL.saver.restore(RL.sess, model_file)

max_reward = -200
running_reward = None  # exponential moving average of episode rewards

for i_episode in range(1000):
    observation = env.reset()
    i = 0
    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)

        i += 1
        if i % 1000 == 0:
            print("i=%d, action=%d" % (i, action))

        if done:
            ep_rs_sum = sum(RL.ep_rs)
            # Smooth the episode reward with an exponential moving average.
            # (Initializing running_reward once, outside the episode loop,
            # instead of resetting it to 0 every episode, which would have
            # collapsed the average to 0.01 * ep_rs_sum.)
            if running_reward is None:
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            break

        observation = observation_
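The snippet ends before the policy update itself is shown; in REINFORCE-style implementations like this one, the learn step typically discounts and normalizes the stored episode rewards before feeding them to the gradient update. A minimal sketch of that step, assuming a standard discounted-return computation (the function name and gamma default are ours, not confirmed by this code):

import numpy as np

def discount_and_norm_rewards(ep_rs, gamma=0.99):
    """Turn per-step rewards into discounted, normalized returns.

    Hypothetical helper illustrating what RL.learn() is assumed to do
    internally before the policy update.
    """
    discounted = np.zeros_like(ep_rs, dtype=np.float64)
    running_add = 0.0
    # Walk backwards so each step accumulates the discounted future reward.
    for t in reversed(range(len(ep_rs))):
        running_add = running_add * gamma + ep_rs[t]
        discounted[t] = running_add
    # Normalize to zero mean / unit variance to reduce gradient variance.
    discounted -= discounted.mean()
    discounted /= discounted.std() + 1e-8
    return discounted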
import time
from time import gmtime, strftime

import numpy as np
import gym_remote.client as grc  # retro-contest remote environment client

# PolicyGradient is this project's agent class, defined elsewhere.

def main():
    env = grc.RemoteEnv('tmp/sock')
    # Policy gradient has high variance; seed for reproducibility
    env.seed(1)

    RENDER_ENV = False
    rewards = []
    INITIAL_EPSILON = 0.7
    EPSILON_GREEDY_INCREMENT = 0.01

    # Load checkpoint
    load_version = "2018-06-05 18:24:13"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  # env.observation_space.shape
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON
    )

    observation = env.reset()
    # print("obs", observation)
    episode_reward = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the replacement.
    tic = time.perf_counter()

    while True:
        if RENDER_ENV:
            env.render()

        # 1. Choose an action based on observation
        observation = observation[:, 96:, :]    # make square, keep right side of image
        observation = observation[::2, ::2, :]  # downsample to [112, 112, 3]
        observation = observation / 255         # normalize
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 3. Store transition for training
        PG.store_transition(observation, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)
        toc = time.perf_counter()
        elapsed_sec = toc - tic

        # Save new observation
        observation = observation_

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            if episode_rewards_sum == 0.0:
                print("-----------------------------------")
                print("Backtrack epsilon for more exploration...")
                PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

            print("==========================================")
            print("Epsilon: ", PG.epsilon)
            print("Seconds: ", elapsed_sec)
            print("Reward: ", episode_rewards_sum)
            print("Max reward so far: ", max_reward_so_far)

            # 4. Train neural network
            tic = time.perf_counter()
            discounted_episode_rewards_norm = PG.learn()
            toc = time.perf_counter()
            elapsed_sec = toc - tic
            print("Train Seconds: ", elapsed_sec)

            observation = env.reset()
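The three slicing lines do the frame preprocessing inline; pulled out as a standalone function, the transformation is easier to test in isolation. A minimal sketch, assuming the raw Sonic frame is the 224x320x3 RGB array retro emits (the preprocess name is ours, not the project's):

import numpy as np

def preprocess(frame):
    """Crop, downsample, and normalize a raw 224x320x3 Sonic frame.

    Hypothetical helper mirroring the inline slicing in main():
    keep the rightmost 224 columns (a square crop), take every second
    pixel along both axes, and scale intensities into [0, 1].
    """
    square = frame[:, 96:, :]    # 224x320 -> 224x224, keep right side
    small = square[::2, ::2, :]  # 224x224 -> 112x112
    return small.astype(np.float32) / 255.0

# Example: a dummy frame with the expected raw shape.
dummy = np.zeros((224, 320, 3), dtype=np.uint8)
assert preprocess(dummy).shape == (112, 112, 3)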
First use env.unwrapped to strip every wrapper from env (one of them caps
each episode at 200 steps), then wrap env in the Monitor wrapper to record
the rendered episodes.
"""
env = env.unwrapped
env = wrappers.Monitor(env, "../gym-results", force=True)

for episode in range(1000):
    obs = env.reset()
    while True:
        if RENDER:
            env.render()

        action = pg.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        # print("cur_reward = ", reward)
        pg.store_transition(obs, action, reward)

        if done:
            ep_rewards_sum = sum(pg.ep_rewards)
            # Initialize the running average on the first episode, then
            # smooth with an exponential moving average (module scope only,
            # since the check inspects globals()).
            if 'running_reward' not in globals():
                running_reward = ep_rewards_sum
            else:
                running_reward = running_reward * 0.99 + ep_rewards_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            print("episode: ", episode, " reward: ", running_reward)

            vt = pg.learn()
            # if episode == 30:
            #     plt.plot(vt)
            #     plt.xlabel("episode")
            break

        obs = obs_
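In all three loops, pg.choose_action is what drives exploration: a vanilla policy gradient samples from the categorical distribution produced by the policy network rather than taking an argmax. A minimal sketch of that idea, assuming the network exposes a callable returning softmax action probabilities (run_policy is a hypothetical name, not this project's API):

import numpy as np

def choose_action(obs, run_policy):
    """Sample an action from the policy's softmax output.

    run_policy is a hypothetical callable mapping an observation to a
    1-D array of action probabilities (the network's softmax layer).
    Sampling, rather than argmax, is what lets the policy explore.
    """
    probs = run_policy(obs)
    return np.random.choice(len(probs), p=probs)

# Usage with a stand-in uniform policy over 4 actions:
action = choose_action(None, lambda _: np.full(4, 0.25))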