Example #1
def main():
    model = PolicyGradient(
        gym.make('CartPole-v1'),
        n_in=4,
        hidden_net=lambda x: tf.layers.dense(x, 10, tf.nn.elu),
        n_out=2)
    model.learn()
    model.play()
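These example snippets generally omit their imports. To run Example #1, a header along the following lines is needed (the policy_gradient module path is an assumption, not taken from the original source):

import gym
import tensorflow as tf  # the snippet uses the TensorFlow 1.x tf.layers API
from policy_gradient import PolicyGradient  # hypothetical module exposing the class
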
Example #2
def main():
    env = grc.RemoteEnv('tmp/sock')

    # Policy gradient has high variance; seed for reproducibility
    env.seed(1)

    RENDER_ENV = False
    rewards = []
    INITIAL_EPSILON = 0.7
    EPSILON_GREEDY_INCREMENT = 0.01

    # Load checkpoint
    load_version = "2018-06-05 18:24:13"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  # env.observation_space.shape
        n_y = env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON
    )

    observation = env.reset()
    # print("obs", observation)
    episode_reward = 0

    tic = time.perf_counter()  # time.clock() was removed in Python 3.8

    while True:
        if RENDER_ENV: env.render()

        # 1. Choose an action based on observation
        observation = observation[:,96:,:] # make square, keep right side of image
        observation = observation[::2,::2,:] # downsample to [112,112,3]
        observation = observation / 255 # normalize
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 3. Store transition for training
        PG.store_transition(observation, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)

        toc = time.perf_counter()
        elapsed_sec = toc - tic

        # Save new observation
        observation = observation_

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            if episode_rewards_sum == 0.0:
                print("-----------------------------------")
                print("Backtrack epsilon for more exploration...")
                PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

            print("==========================================")
            print("Epsilon: ", PG.epsilon)
            print("Seconds: ", elapsed_sec)
            print("Reward: ", episode_rewards_sum)
            print("Max reward so far: ", max_reward_so_far)

            # 4. Train the neural network on the stored episode
            tic = time.perf_counter()
            discounted_episode_rewards_norm = PG.learn()
            toc = time.perf_counter()
            elapsed_sec = toc - tic
            print("Train Seconds: ", elapsed_sec)

            observation = env.reset()
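
For context, PG.learn() above returns discounted_episode_rewards_norm. A common way to compute that quantity in a policy-gradient agent is to discount each episode's rewards and then normalize them, which reduces the variance of the gradient estimate. A minimal sketch, assuming the helper name and the gamma default (neither is taken from the original code):

import numpy as np

def discount_and_normalize_rewards(episode_rewards, gamma=0.99):
    # Walk the episode backwards, accumulating the discounted future reward.
    discounted = np.zeros(len(episode_rewards))
    running_sum = 0.0
    for t in reversed(range(len(episode_rewards))):
        running_sum = running_sum * gamma + episode_rewards[t]
        discounted[t] = running_sum
    # Normalize to zero mean and unit variance to stabilize the policy update.
    discounted -= discounted.mean()
    discounted /= (discounted.std() + 1e-8)
    return discounted
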
Example #3
def main():
    model = PolicyGradient(gym.make('CartPole-v1'),
                           n_in=4, n_hidden=[10], n_out=2)
    model.learn()
    model.play()
Example #4
import gym
import matplotlib.pyplot as plt
# this snippet also assumes a PolicyGradient class has been imported

DISPLAY_REWARD_THRESHOLD = -200
RENDER = False
is_train = False

env = gym.make('MountainCar-v0')
env.seed(1)
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)

RL = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.01,
                    reward_decay=0.98)

if not is_train:
    model_file = RL.restore_file
    RL.saver.restore(RL.sess, model_file)

max_reward = -200
for i_episode in range(1000):
    observation = env.reset()
    running_reward = 0
    i = 0
    while True:
        if RENDER:
            env.render()
Example #5
def main():
    model = PolicyGradient()
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
Example #6
            # break out of the while loop when this episode ends
            if done:
                break
            step += 1

        print(totalReward)

    # end of game
    print('game over')
    env.destroy()

if __name__ == "__main__":
    env = Plane()
    # RL = QLearningTable(actions=list(range(env.n_actions)))
    # Note: this DeepQNetwork instance is immediately replaced by the PolicyGradient below
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    RL = PolicyGradient(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      # output_graph=True
                      )
    env.after(100, update)
    env.mainloop()
    #DQN
    RL.plot_cost()
Example #7
# training, testing data split
train_start = '2014-01-01'
train_test_split = '2015-01-01'
training_data = df[train_start:train_test_split]

#train_test_split = '2016-01-01'
test_end = '2016-01-01'
testing_data = df[train_test_split:test_end]

#%% train
env = MarketEnv(training_data, 60)
#pg = PolicyGradient(env, gamma= 0.9,file_name = "pg_3.h5")
pg = PolicyGradient(env, gamma=0.9, file_name="new34.h5")
pg.train()

#%% test
env_test = MarketEnv(testing_data, 60)
pg_test = PolicyGradient(env_test, gamma=0.9, weights_path="new34.h5")
model = pg_test.model
act = [-1, 0, 1]

observation = env_test.reset()
game_over = False

inputs = []
outputs = []
predicteds = []
Example #8
def main():
    model = PolicyGradient(lambda x: tf.layers.dense(x, 4, tf.nn.relu))
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
Example #9
import gym
from gym import wrappers
import numpy as np
import matplotlib.pyplot as plt
from pg import PolicyGradient

DISPLAY_REWARD_THRESHOLD = -2000
RENDER = False

env = gym.make('MountainCar-v0')
env.seed(1)
pg = PolicyGradient(n_actions=env.action_space.n,
                    n_features=env.observation_space.shape[0],
                    learning_rate=0.02,
                    reward_decay=0.995)
"""
这里先用env.unwrapper来移除env上包裹的所有wrapper(其中一个让env的每个episode最大step设置为200),
然后给env包裹上Monitor Wrapper来实现渲染
"""
env = env.unwrapped
env = wrappers.Monitor(env, "../gym-results", force=True)

for episode in range(1000):
    obs = env.reset()
    while True:
        if RENDER:
            env.render()

        action = pg.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        # print("cur_reward = ", reward)
Example #10
INITIAL_EPSILON = 0.7
EPSILON_GREEDY_INCREMENT = 0.01

if __name__ == "__main__":

    # Load checkpoint
    load_version = "2018-06-05 16:31:58"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)
    save_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(timestamp)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  #env.observation_space.shape,
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        save_path=save_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON)

    for episode in range(EPISODES):

        observation = env.reset()
        # print("obs", observation)
        episode_reward = 0

        tic = time.perf_counter()

        while True:
            if RENDER_ENV: env.render()