def main():
    model = PolicyGradient(
        gym.make('CartPole-v1'),
        n_in=4,
        hidden_net=lambda x: tf.layers.dense(x, 10, tf.nn.elu),
        n_out=2)
    model.learn()
    model.play()
def main():
    env = grc.RemoteEnv('tmp/sock')
    # Policy gradient has high variance, seed for reproducibility
    env.seed(1)

    RENDER_ENV = False
    rewards = []
    INITIAL_EPSILON = 0.7
    EPSILON_GREEDY_INCREMENT = 0.01

    # Load checkpoint
    load_version = "2018-06-05 18:24:13"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  # env.observation_space.shape
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON
    )

    observation = env.reset()
    # print("obs", observation)
    episode_reward = 0
    tic = time.perf_counter()

    while True:
        if RENDER_ENV:
            env.render()

        # 1. Choose an action based on observation
        observation = observation[:, 96:, :]    # make square, keep right side of image
        observation = observation[::2, ::2, :]  # downsample to [112, 112, 3]
        observation = observation / 255         # normalize
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 4. Store transition for training
        PG.store_transition(observation, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)
        toc = time.perf_counter()
        elapsed_sec = toc - tic

        # Save new observation
        observation = observation_

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            if episode_rewards_sum == 0.0:
                print("-----------------------------------")
                print("Backtrack epsilon for more exploration...")
                PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

            print("==========================================")
            print("Epsilon: ", PG.epsilon)
            print("Seconds: ", elapsed_sec)
            print("Reward: ", episode_rewards_sum)
            print("Max reward so far: ", max_reward_so_far)

            # 5. Train neural network
            tic = time.perf_counter()
            discounted_episode_rewards_norm = PG.learn()
            toc = time.perf_counter()
            elapsed_sec = toc - tic
            print("Train Seconds: ", elapsed_sec)

            observation = env.reset()
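A minimal sketch of the return computation that PG.learn() above presumably performs with reward_decay=0.99 before its gradient step; the helper name and the exact normalization are assumptions rather than code taken from the snippet.

import numpy as np

def discount_and_normalize_rewards(episode_rewards, gamma=0.99):
    # Discounted return G_t = r_t + gamma * G_{t+1}, computed backwards over the episode.
    discounted = np.zeros(len(episode_rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(episode_rewards))):
        running = running * gamma + episode_rewards[t]
        discounted[t] = running
    # Standardizing the returns lowers the variance of the REINFORCE update.
    discounted -= discounted.mean()
    discounted /= discounted.std() + 1e-8
    return discounted

# Example: a three-step episode with a single terminal reward.
print(discount_and_normalize_rewards([0.0, 0.0, 1.0]))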
def main():
    model = PolicyGradient(gym.make('CartPole-v1'), n_in=4, n_hidden=[10], n_out=2)
    model.learn()
    model.play()
def main():
    model = PolicyGradient()
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
if i % 1000 == 0:
    print("i=%d, action=%d" % (i, action))

if done:
    ep_rs_sum = sum(RL.ep_rs)
    # Exponential moving average of the episode return.
    if 'running_reward' not in globals():
        running_reward = ep_rs_sum
    else:
        running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
    if running_reward > DISPLAY_REWARD_THRESHOLD:
        RENDER = True
    print("episode:", i_episode, " reward:", int(running_reward))

    vt = RL.learn()

    # Checkpoint whenever the smoothed reward reaches a new best.
    if is_train and running_reward > max_reward:
        max_reward = running_reward
        RL.saver.save(RL.sess, 'ckpt/car_pole/car_pole.ckpt')

    if i_episode == 30:
        plt.plot(vt)
        plt.xlabel('episode steps')
        plt.ylabel('normalized state-action value')
        plt.show()
    break

observation = observation_
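The fragment above (and the loop two snippets below) smooths episode returns with an exponential moving average before deciding whether to render or checkpoint. A minimal standalone version of that smoothing, using the 0.99/0.01 weights from the snippet:

def update_running_reward(running_reward, episode_return, smoothing=0.99):
    # First episode: initialize the average with the raw return.
    if running_reward is None:
        return episode_return
    # Afterwards: weight the history heavily so one noisy episode barely moves it.
    return smoothing * running_reward + (1.0 - smoothing) * episode_return

# Example: a bad episode (return 20) after a smoothed estimate of 180.
print(update_running_reward(180.0, 20.0))  # 178.4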
def main():
    model = PolicyGradient(lambda x: tf.layers.dense(x, 4, tf.nn.relu))
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
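PolicyGradient above is handed a network-builder lambda; only the call site appears in the snippet, so the sketch below is an assumption about how such a callable might be consumed to produce the policy head (TF1-style API, matching tf.layers.dense in the snippets; the function and placeholder names are hypothetical).

import tensorflow as tf

def build_policy_logits(observations, hidden_net, n_actions):
    # hidden_net maps the observation batch to a hidden representation ...
    hidden = hidden_net(observations)
    # ... and a final linear layer produces one logit per discrete action.
    return tf.layers.dense(hidden, n_actions, activation=None)

obs_ph = tf.placeholder(tf.float32, [None, 4], name="observations")
logits = build_policy_logits(obs_ph, lambda x: tf.layers.dense(x, 4, tf.nn.relu), 2)
action_probs = tf.nn.softmax(logits)  # policy distribution over the two CartPole actions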
while True:
    if RENDER:
        env.render()

    action = pg.choose_action(obs)
    obs_, reward, done, info = env.step(action)
    # print("cur_reward = ", reward)

    pg.store_transition(obs, action, reward)

    if done:
        ep_rewards_sum = sum(pg.ep_rewards)
        if 'running_reward' not in globals():
            running_reward = ep_rewards_sum
        else:
            running_reward = running_reward * 0.99 + ep_rewards_sum * 0.01
        if running_reward > DISPLAY_REWARD_THRESHOLD:
            RENDER = True
        print("episode: ", episode, " reward: ", running_reward)

        vt = pg.learn()

        # if episode == 30:
        #     plt.plot(vt)
        #     plt.xlabel("episode")
        #     plt.ylabel("normalized state action value")
        #     plt.save()
        break

    obs = obs_

env.close()
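choose_action and store_transition are only called in the loops above, never defined; a minimal sketch of what they typically do in a REINFORCE-style agent is given below. The uniform action_probabilities placeholder stands in for the real policy network and is an assumption.

import numpy as np

class MinimalPolicyGradientAgent:
    def __init__(self, n_actions):
        self.n_actions = n_actions
        self.ep_obs, self.ep_actions, self.ep_rewards = [], [], []

    def action_probabilities(self, obs):
        # Placeholder: a real agent runs its policy network here.
        return np.full(self.n_actions, 1.0 / self.n_actions)

    def choose_action(self, obs):
        # Sample from the policy distribution rather than taking the argmax;
        # sampling is what keeps the policy-gradient estimator unbiased.
        return np.random.choice(self.n_actions, p=self.action_probabilities(obs))

    def store_transition(self, obs, action, reward):
        # Buffer the whole episode; the update happens only once done is True.
        self.ep_obs.append(obs)
        self.ep_actions.append(action)
        self.ep_rewards.append(reward)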
    rewards.append(episode_rewards_sum)
    max_reward_so_far = np.amax(rewards)

    if episode_rewards_sum == 0.0:
        print("-----------------------------------")
        print("Backtrack epsilon for more exploration...")
        PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

    print("==========================================")
    print("Episode: ", episode)
    print("Epsilon: ", PG.epsilon)
    print("Seconds: ", elapsed_sec)
    print("Reward: ", episode_rewards_sum)
    print("Max reward so far: ", max_reward_so_far)

    # 5. Train neural network
    tic = time.perf_counter()
    if episode_rewards_sum > MIN_REWARD_TO_LEARN:
        discounted_episode_rewards_norm = PG.learn()
    toc = time.perf_counter()
    elapsed_sec = toc - tic
    print("Train Seconds: ", elapsed_sec)

    if max_reward_so_far > RENDER_REWARD_MIN:
        RENDER_ENV = True
    break

# Save new observation
observation = observation_
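Both Sonic snippets backtrack epsilon toward its initial value after a zero-reward episode and cap it at epsilon_max; a minimal standalone version of that schedule follows. The increase() hook is an assumption about what PG.learn() does internally after each update.

class EpsilonSchedule:
    def __init__(self, initial=0.7, maximum=0.98, increment=0.01):
        self.initial = initial
        self.maximum = maximum
        self.increment = increment
        self.epsilon = initial

    def increase(self):
        # After a learning step: act greedily more often as the policy improves.
        self.epsilon = min(self.epsilon + self.increment, self.maximum)

    def backtrack(self):
        # After a zero-reward episode: drop back toward the initial value to explore more.
        self.epsilon = max(self.epsilon - self.increment, self.initial)

# Example mirroring the constants used in the snippets above.
schedule = EpsilonSchedule(initial=0.7, maximum=0.98, increment=0.01)
schedule.backtrack()
print(schedule.epsilon)  # stays at 0.7, never below the initial value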