Example #1
def main():
    model = PolicyGradient(
        gym.make('CartPole-v1'),
        n_in=4,
        hidden_net=lambda x: tf.layers.dense(x, 10, tf.nn.elu),
        n_out=2)
    model.learn()
    model.play()
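
The constructor in Example #1 takes the hidden layer as a callable. As a rough illustration only (the PolicyGradient internals are not shown on this page), a hypothetical build_policy helper could consume such a hidden_net callback like this, TF1-style:

import tensorflow as tf

def build_policy(states, hidden_net, n_out):
    # states: [batch, n_in] float placeholder
    # hidden_net: callable mapping a tensor to a hidden representation,
    #             e.g. lambda x: tf.layers.dense(x, 10, tf.nn.elu)
    hidden = hidden_net(states)
    logits = tf.layers.dense(hidden, n_out)   # unnormalized action scores
    return logits, tf.nn.softmax(logits)      # softmax gives pi(a|s)

states = tf.placeholder(tf.float32, [None, 4])
logits, probs = build_policy(states, lambda x: tf.layers.dense(x, 10, tf.nn.elu), 2)
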
Example #2
def main():
    env = grc.RemoteEnv('tmp/sock')

    # Policy gradient has high variance, seed for reproducibility
    env.seed(1)

    RENDER_ENV = False
    rewards = []
    INITIAL_EPSILON = 0.7
    EPSILON_GREEDY_INCREMENT = 0.01

    # Load checkpoint
    load_version = "2018-06-05 18:24:13"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)

    PG = PolicyGradient(
        n_x = [112,112,3], #env.observation_space.shape,
        n_y = env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon = INITIAL_EPSILON
    )

    observation = env.reset()
    # print("obs", observation)
    episode_reward = 0

    tic = time.perf_counter()  # time.clock() was removed in Python 3.8

    while True:
        if RENDER_ENV: env.render()

        # 1. Choose an action based on observation
        observation = observation[:,96:,:] # make square, keep right side of image
        observation = observation[::2,::2,:] # downsample to [112,112,3]
        observation = observation / 255 # normalize
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 4. Store transition for training
        PG.store_transition(observation, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)

        toc = time.perf_counter()
        elapsed_sec = toc - tic

        # Save new observation
        observation = observation_

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            if episode_rewards_sum == 0.0:
                print("-----------------------------------")
                print("Backtrack epsilon for more exploration...")
                PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

            print("==========================================")
            print("Epsilon: ", PG.epsilon)
            print("Seconds: ", elapsed_sec)
            print("Reward: ", episode_rewards_sum)
            print("Max reward so far: ", max_reward_so_far)

            # 5. Train neural network
            tic = time.perf_counter()
            discounted_episode_rewards_norm = PG.learn()
            toc = time.perf_counter()
            elapsed_sec = toc - tic
            print("Train Seconds: ", elapsed_sec)

            observation = env.reset()
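
The three preprocessing lines in the example above (crop, downsample, normalize) are what reduce the raw frame to the [112, 112, 3] shape declared as n_x. A standalone sketch of that step, assuming the raw gym-retro frame is 224x320x3 (typical for Genesis titles); the preprocess name is just for illustration:

import numpy as np

def preprocess(frame):
    # frame: raw RGB screen, assumed shape (224, 320, 3)
    frame = frame[:, 96:, :]     # keep the right 224 columns -> (224, 224, 3)
    frame = frame[::2, ::2, :]   # take every other row/column -> (112, 112, 3)
    return frame / 255.0         # scale pixel values to [0, 1]

assert preprocess(np.zeros((224, 320, 3), dtype=np.uint8)).shape == (112, 112, 3)
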
Example #3
def main():
    model = PolicyGradient(gym.make('CartPole-v1'),
                           n_in=4, n_hidden=[10], n_out=2)
    model.learn()
    model.play()
Example #4
def main():
    model = PolicyGradient()
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
Example #5
        if i % 1000 == 0:
            print("i=%d, action=%d" % (i, action))

        if done:
            ep_rs_sum = sum(RL.ep_rs)

            if 'running_reward' not in globals():
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01

            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True

            print("episode:", i_episode, "  reward:", int(running_reward))

            vt = RL.learn()

            if is_train and running_reward > max_reward:
                max_reward = running_reward
                RL.saver.save(RL.sess, 'ckpt/car_pole/car_pole.ckpt')

            if i_episode == 30:
                plt.plot(vt)
                plt.xlabel('episode steps')
                plt.ylabel('normalized state-action value')
                plt.show()
            break

        observation = observation_
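
The vt returned by RL.learn() and plotted after episode 30 is the per-step discounted, normalized return, which is also what discounted_episode_rewards_norm holds in the Sonic example. A minimal sketch of that computation, assuming the usual REINFORCE return with reward_decay gamma = 0.99 (the discount_and_normalize name is illustrative, not the library's):

import numpy as np

def discount_and_normalize(rewards, gamma=0.99):
    # rewards: per-step rewards collected over one episode
    returns = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running   # G_t = r_t + gamma * G_{t+1}
        returns[t] = running
    # zero-mean / unit-variance scaling reduces gradient variance
    returns -= returns.mean()
    returns /= returns.std() + 1e-8
    return returns

print(discount_and_normalize([1.0, 1.0, 1.0, 0.0]))
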
Example #6
def main():
    model = PolicyGradient(lambda x: tf.layers.dense(x, 4, tf.nn.relu))
    model.learn(gym.make("CartPole-v0"))
    model.play(gym.make("CartPole-v0"))
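
What model.learn(...) does internally is not shown on this page. The usual REINFORCE update maximizes log pi(a_t|s_t) weighted by the discounted return G_t; below is a hypothetical TF1-style sketch of that surrogate loss and optimizer for the CartPole setup, mirroring the hidden layer from Example #6 (the learning rate is an assumption, not the library's value):

import tensorflow as tf

states = tf.placeholder(tf.float32, [None, 4])   # CartPole observations
actions = tf.placeholder(tf.int32, [None])       # actions actually taken
returns = tf.placeholder(tf.float32, [None])     # discounted, normalized G_t

hidden = tf.layers.dense(states, 4, tf.nn.relu)
logits = tf.layers.dense(hidden, 2)

# -log pi(a_t|s_t) per step, weighted by that step's return
neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=actions, logits=logits)
loss = tf.reduce_mean(neg_log_prob * returns)
train_op = tf.train.AdamOptimizer(learning_rate=0.01).minimize(loss)
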
Example #7
    while True:
        if RENDER:
            env.render()

        action = pg.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        # print("cur_reward = ", reward)
        pg.store_transition(obs, action, reward)

        if done:
            ep_rewards_sum = sum(pg.ep_rewards)
            if 'running_reward' not in globals():
                running_reward = ep_rewards_sum
            else:
                running_reward = running_reward * 0.99 + ep_rewards_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            print("episode: ", episode, "   reward: ", running_reward)
            vt = pg.learn()

            # if episode == 30:
            #     plt.plot(vt)
            #     plt.xlabel("episode")
            #     plt.ylabel("normalized state action value")
            #     plt.save()
            break

        obs = obs_

env.close()
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)

                if episode_rewards_sum == 0.0:
                    print("-----------------------------------")
                    print("Backtrack epsilon for more exploration...")
                    PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT,
                                     INITIAL_EPSILON)

                print("==========================================")
                print("Episode: ", episode)
                print("Epsilon: ", PG.epsilon)
                print("Seconds: ", elapsed_sec)
                print("Reward: ", episode_rewards_sum)
                print("Max reward so far: ", max_reward_so_far)

                # 5. Train neural network
                tic = time.perf_counter()
                if episode_rewards_sum > MIN_REWARD_TO_LEARN:
                    discounted_episode_rewards_norm = PG.learn()
                toc = time.perf_counter()
                elapsed_sec = toc - tic
                print("Train Seconds: ", elapsed_sec)

                if max_reward_so_far > RENDER_REWARD_MIN: RENDER_ENV = True

                break

            # Save new observation
            observation = observation_
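
The epsilon constants in the Sonic fragments describe a "greedy increment" schedule: start at INITIAL_EPSILON, creep toward epsilon_max as training progresses, and back off by one increment whenever an episode earns zero reward. A small standalone sketch of that schedule (the step_epsilon helper is hypothetical; in the script the upward increment presumably happens inside PG.learn()):

INITIAL_EPSILON = 0.7
EPSILON_MAX = 0.98
EPSILON_GREEDY_INCREMENT = 0.01

def step_epsilon(epsilon, episode_reward):
    if episode_reward == 0.0:
        # no reward collected: back off for more exploration
        return max(epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)
    # otherwise act greedily a bit more often next episode
    return min(epsilon + EPSILON_GREEDY_INCREMENT, EPSILON_MAX)

epsilon = INITIAL_EPSILON
for reward in [0.0, 12.0, 30.0, 0.0]:
    epsilon = step_epsilon(epsilon, reward)
print(round(epsilon, 2))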