def main():
    model = PolicyGradient(
        gym.make('CartPole-v1'),
        n_in=4,
        hidden_net=lambda x: tf.layers.dense(x, 10, tf.nn.elu),
        n_out=2)
    model.learn()
    model.play()
def main():
    env = grc.RemoteEnv('tmp/sock')
    # Policy gradient has high variance, seed for reproducibility
    env.seed(1)

    RENDER_ENV = False
    rewards = []
    INITIAL_EPSILON = 0.7
    EPSILON_GREEDY_INCREMENT = 0.01

    # Load checkpoint
    load_version = "2018-06-05 18:24:13"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  # env.observation_space.shape
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON
    )

    observation = env.reset()
    # print("obs", observation)
    episode_reward = 0
    tic = time.clock()

    while True:
        if RENDER_ENV:
            env.render()

        # 1. Choose an action based on observation
        observation = observation[:, 96:, :]     # make square, keep right side of image
        observation = observation[::2, ::2, :]   # downsample to [112, 112, 3]
        observation = observation / 255          # normalize
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 4. Store transition for training
        PG.store_transition(observation, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)
        toc = time.clock()
        elapsed_sec = toc - tic

        # Save new observation
        observation = observation_

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            if episode_rewards_sum == 0.0:
                print("-----------------------------------")
                print("Backtrack epsilon for more exploration...")
                PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

            print("==========================================")
            print("Epsilon: ", PG.epsilon)
            print("Seconds: ", elapsed_sec)
            print("Reward: ", episode_rewards_sum)
            print("Max reward so far: ", max_reward_so_far)

            # 5. Train neural network
            tic = time.clock()
            discounted_episode_rewards_norm = PG.learn()
            toc = time.clock()
            elapsed_sec = toc - tic
            print("Train Seconds: ", elapsed_sec)

            observation = env.reset()
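# Hedged sketch (an assumption, not the PolicyGradient implementation above):
# PG.learn() returns "discounted_episode_rewards_norm", and a standard
# REINFORCE-style way to compute that quantity from the stored per-step
# rewards and reward_decay (gamma) is shown below. The function name and the
# `rewards`/`gamma` parameters are illustrative.
import numpy as np

def discount_and_normalize_rewards(rewards, gamma=0.99):
    """Compute discounted returns G_t = r_t + gamma * G_{t+1}, then standardize."""
    discounted = np.zeros(len(rewards), dtype=np.float64)
    running_add = 0.0
    for t in reversed(range(len(rewards))):
        running_add = running_add * gamma + rewards[t]
        discounted[t] = running_add
    # Standardizing the returns reduces the variance of the gradient estimate.
    discounted -= discounted.mean()
    std = discounted.std()
    if std > 0:
        discounted /= std
    return discounted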
def main():
    model = PolicyGradient(gym.make('CartPole-v1'), n_in=4, n_hidden=[10], n_out=2)
    model.learn()
    model.play()
import gym
import matplotlib.pyplot as plt

DISPLAY_REWARD_THRESHOLD = -200
RENDER = False
is_train = False

env = gym.make('MountainCar-v0')
env.seed(1)
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)

RL = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.01,
                    reward_decay=0.98)

if not is_train:
    model_file = RL.restore_file
    RL.saver.restore(RL.sess, model_file)

max_reward = -200

for i_episode in range(1000):
    observation = env.reset()
    running_reward = 0
    i = 0
    while True:
        if RENDER:
            env.render()
def main():
    model = PolicyGradient()
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
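# Hedged sketch, not the actual PolicyGradient class used above: a minimal
# NumPy REINFORCE agent with the same learn(env)/play(env) call pattern and a
# linear softmax policy. All hyperparameters (n_in, n_out, lr, gamma,
# episodes) are assumptions chosen for CartPole-v0.
import numpy as np

class PolicyGradientSketch:
    def __init__(self, n_in=4, n_out=2, lr=0.01, gamma=0.99):
        self.W = np.zeros((n_in, n_out))   # linear policy weights
        self.lr, self.gamma = lr, gamma

    def _probs(self, s):
        z = np.asarray(s) @ self.W
        z -= z.max()                       # numerical stability for softmax
        e = np.exp(z)
        return e / e.sum()

    def choose_action(self, s):
        p = self._probs(s)
        return np.random.choice(len(p), p=p)

    def learn(self, env, episodes=500):
        for _ in range(episodes):
            states, actions, rewards = [], [], []
            s, done = env.reset(), False
            while not done:
                a = self.choose_action(s)
                s_, r, done, _ = env.step(a)
                states.append(np.asarray(s))
                actions.append(a)
                rewards.append(r)
                s = s_
            # Discounted, standardized returns (reduces gradient variance).
            G, running = np.zeros(len(rewards)), 0.0
            for t in reversed(range(len(rewards))):
                running = running * self.gamma + rewards[t]
                G[t] = running
            G = (G - G.mean()) / (G.std() + 1e-8)
            # REINFORCE update: grad log pi(a|s) = outer(s, onehot(a) - probs).
            for s_t, a_t, g_t in zip(states, actions, G):
                grad = -self._probs(s_t)
                grad[a_t] += 1.0
                self.W += self.lr * g_t * np.outer(s_t, grad)

    def play(self, env, episodes=5):
        for _ in range(episodes):
            s, done, total = env.reset(), False, 0.0
            while not done:
                a = int(np.argmax(self._probs(s)))
                s, r, done, _ = env.step(a)
                total += r
            print("episode reward:", total)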
            # break the while loop at the end of this episode
            if done:
                break
            step += 1

        print(totalReward)

    # end of game
    print('game over')
    env.destroy()


if __name__ == "__main__":
    env = Plane()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    RL = PolicyGradient(env.n_actions, env.n_features,
                        learning_rate=0.01,
                        reward_decay=0.9,
                        # output_graph=True
                        )
    env.after(100, update)
    env.mainloop()
    # DQN
    RL.plot_cost()
# training, testing data split
train_start = '2014-01-01'
train_test_split = '2015-01-01'
training_data = df[train_start:train_test_split]

train_test_split = '2015-01-01'
# train_test_split = '2016-01-01'
test_end = '2016-01-01'
testing_data = df[train_test_split:test_end]

# %% train
env = MarketEnv(training_data, 60)
# pg = PolicyGradient(env, gamma=0.9, file_name="pg_3.h5")
pg = PolicyGradient(env, gamma=0.9, file_name="new34.h5")
pg.train()

# %% test
env_test = MarketEnv(testing_data, 60)
pg_test = PolicyGradient(env_test, gamma=0.9, weights_path="new34.h5")
model = pg_test.model
act = [-1, 0, 1]

env_test.reset()
observation = env_test.reset()
game_over = False
inputs = []
outputs = []
predicteds = []
def main():
    model = PolicyGradient(lambda x: tf.layers.dense(x, 4, tf.nn.relu))
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt
from pg import PolicyGradient

DISPLAY_REWARD_THRESHOLD = -2000
RENDER = False

env = gym.make('MountainCar-v0')
env.seed(1)

pg = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.02,
                    reward_decay=0.995)

# First use env.unwrapped to remove every wrapper around env (one of them caps
# each episode at a maximum of 200 steps), then wrap env in the Monitor
# wrapper so it can be rendered and recorded.
env = env.unwrapped
env = wrappers.Monitor(env, "../gym-results", force=True)

for episode in range(1000):
    obs = env.reset()
    while True:
        if RENDER:
            env.render()

        action = pg.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        # print("cur_reward = ", reward)
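        # Hedged continuation (assumed, not part of the original snippet): a
        # typical way such a loop closes -- store the transition and run a
        # policy update once the episode terminates, mirroring the
        # store_transition/learn API used in the other snippets above.
        pg.store_transition(obs, action, reward)
        if done:
            pg.learn()
            break
        obs = obs_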
INITIAL_EPSILON = 0.7
EPSILON_GREEDY_INCREMENT = 0.01

if __name__ == "__main__":
    # Load checkpoint
    load_version = "2018-06-05 16:31:58"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)
    save_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(timestamp)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  # env.observation_space.shape
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        save_path=save_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON)

    for episode in range(EPISODES):
        observation = env.reset()
        # print("obs", observation)
        episode_reward = 0
        tic = time.clock()

        while True:
            if RENDER_ENV:
                env.render()