# Restore trained weights from a previous checkpoint.
RL.saver.restore(RL.sess, model_file)

max_reward = -200
running_reward = None  # exponential moving average of episode rewards

for i_episode in range(1000):
    observation = env.reset()
    i = 0
    while True:
        if RENDER:
            env.render()

        action = RL.choose_action(observation)
        observation_, reward, done, info = env.step(action)
        RL.store_transition(observation, action, reward)

        i += 1
        if i % 1000 == 0:
            print("i=%d, action=%d" % (i, action))

        if done:
            ep_rs_sum = sum(RL.ep_rs)
            # Smooth the episode reward with an exponential moving average.
            # (Initializing running_reward once, outside the episode loop,
            # instead of resetting it to 0 every episode, which would have
            # collapsed the average to 0.01 * ep_rs_sum.)
            if running_reward is None:
                running_reward = ep_rs_sum
            else:
                running_reward = running_reward * 0.99 + ep_rs_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            break

        observation = observation_
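The snippet ends before the policy update itself is shown; in REINFORCE-style implementations like this one, the learn step typically discounts and normalizes the stored episode rewards before feeding them to the gradient update. A minimal sketch of that step, assuming a standard discounted-return computation (the function name and gamma default are ours, not confirmed by this code):

import numpy as np

def discount_and_norm_rewards(ep_rs, gamma=0.99):
    """Turn per-step rewards into discounted, normalized returns.

    Hypothetical helper illustrating what RL.learn() is assumed to do
    internally before the policy update.
    """
    discounted = np.zeros_like(ep_rs, dtype=np.float64)
    running_add = 0.0
    # Walk backwards so each step accumulates the discounted future reward.
    for t in reversed(range(len(ep_rs))):
        running_add = running_add * gamma + ep_rs[t]
        discounted[t] = running_add
    # Normalize to zero mean / unit variance to reduce gradient variance.
    discounted -= discounted.mean()
    discounted /= discounted.std() + 1e-8
    return discounted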
import time
from time import gmtime, strftime

import numpy as np
import gym_remote.client as grc  # retro-contest remote environment client

# PolicyGradient is this project's agent class, defined elsewhere.

def main():
    env = grc.RemoteEnv('tmp/sock')
    # Policy gradient has high variance; seed for reproducibility
    env.seed(1)

    RENDER_ENV = False
    rewards = []
    INITIAL_EPSILON = 0.7
    EPSILON_GREEDY_INCREMENT = 0.01

    # Load checkpoint
    load_version = "2018-06-05 18:24:13"
    timestamp = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    load_path = "output/model2/{}/SonicTheHedgehog.ckpt".format(load_version)

    PG = PolicyGradient(
        n_x=[112, 112, 3],  # env.observation_space.shape
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
        load_path=load_path,
        epsilon_max=0.98,
        epsilon_greedy_increment=EPSILON_GREEDY_INCREMENT,
        initial_epsilon=INITIAL_EPSILON
    )

    observation = env.reset()
    # print("obs", observation)
    episode_reward = 0

    # time.clock() was removed in Python 3.8; perf_counter() is the replacement.
    tic = time.perf_counter()

    while True:
        if RENDER_ENV:
            env.render()

        # 1. Choose an action based on observation
        observation = observation[:, 96:, :]    # make square, keep right side of image
        observation = observation[::2, ::2, :]  # downsample to [112, 112, 3]
        observation = observation / 255         # normalize
        action = PG.choose_action(observation)

        # 2. Take action in the environment
        observation_, reward, done, info = env.step(action)

        # 3. Store transition for training
        PG.store_transition(observation, action, reward)

        episode_rewards_sum = sum(PG.episode_rewards)
        toc = time.perf_counter()
        elapsed_sec = toc - tic

        # Save new observation
        observation = observation_

        if done:
            episode_rewards_sum = sum(PG.episode_rewards)
            rewards.append(episode_rewards_sum)
            max_reward_so_far = np.amax(rewards)

            if episode_rewards_sum == 0.0:
                print("-----------------------------------")
                print("Backtrack epsilon for more exploration...")
                PG.epsilon = max(PG.epsilon - EPSILON_GREEDY_INCREMENT, INITIAL_EPSILON)

            print("==========================================")
            print("Epsilon: ", PG.epsilon)
            print("Seconds: ", elapsed_sec)
            print("Reward: ", episode_rewards_sum)
            print("Max reward so far: ", max_reward_so_far)

            # 4. Train neural network
            tic = time.perf_counter()
            discounted_episode_rewards_norm = PG.learn()
            toc = time.perf_counter()
            elapsed_sec = toc - tic
            print("Train Seconds: ", elapsed_sec)

            observation = env.reset()
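The three slicing lines do the frame preprocessing inline; pulled out as a standalone function, the transformation is easier to test in isolation. A minimal sketch, assuming the raw Sonic frame is the 224x320x3 RGB array retro emits (the preprocess name is ours, not the project's):

import numpy as np

def preprocess(frame):
    """Crop, downsample, and normalize a raw 224x320x3 Sonic frame.

    Hypothetical helper mirroring the inline slicing in main():
    keep the rightmost 224 columns (a square crop), take every second
    pixel along both axes, and scale intensities into [0, 1].
    """
    square = frame[:, 96:, :]    # 224x320 -> 224x224, keep right side
    small = square[::2, ::2, :]  # 224x224 -> 112x112
    return small.astype(np.float32) / 255.0

# Example: a dummy frame with the expected raw shape.
dummy = np.zeros((224, 320, 3), dtype=np.uint8)
assert preprocess(dummy).shape == (112, 112, 3)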
First use env.unwrapped to strip every wrapper from env (one of them caps
each episode at 200 steps), then wrap env in the Monitor wrapper to record
the rendered episodes.
"""
env = env.unwrapped
env = wrappers.Monitor(env, "../gym-results", force=True)

for episode in range(1000):
    obs = env.reset()
    while True:
        if RENDER:
            env.render()

        action = pg.choose_action(obs)
        obs_, reward, done, info = env.step(action)
        # print("cur_reward = ", reward)
        pg.store_transition(obs, action, reward)

        if done:
            ep_rewards_sum = sum(pg.ep_rewards)
            # Initialize the running average on the first episode, then
            # smooth with an exponential moving average (module scope only,
            # since the check inspects globals()).
            if 'running_reward' not in globals():
                running_reward = ep_rewards_sum
            else:
                running_reward = running_reward * 0.99 + ep_rewards_sum * 0.01
            if running_reward > DISPLAY_REWARD_THRESHOLD:
                RENDER = True
            print("episode: ", episode, " reward: ", running_reward)

            vt = pg.learn()
            # if episode == 30:
            #     plt.plot(vt)
            #     plt.xlabel("episode")
            break

        obs = obs_
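In all three loops, pg.choose_action is what drives exploration: a vanilla policy gradient samples from the categorical distribution produced by the policy network rather than taking an argmax. A minimal sketch of that idea, assuming the network exposes a callable returning softmax action probabilities (run_policy is a hypothetical name, not this project's API):

import numpy as np

def choose_action(obs, run_policy):
    """Sample an action from the policy's softmax output.

    run_policy is a hypothetical callable mapping an observation to a
    1-D array of action probabilities (the network's softmax layer).
    Sampling, rather than argmax, is what lets the policy explore.
    """
    probs = run_policy(obs)
    return np.random.choice(len(probs), p=probs)

# Usage with a stand-in uniform policy over 4 actions:
action = choose_action(None, lambda _: np.full(4, 0.25))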