Code example #1
0
def main():
    """Train a TD actor-critic agent on MountainCarContinuous-v0.

    Builds RBF features, a policy model and a value model, trains for N
    episodes with one-step TD updates, then plots the per-episode rewards,
    the running average, and the cost-to-go surface.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft)
    vmodel = ValueModel(D, ft)
    gamma = 0.99  # discount factor

    # Optionally record videos/stats when 'monitor' is passed on the CLI.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    for n in range(N):
        totalreward = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        # Report progress every episode (the original `n % 1 == 0` guard
        # was always true, and the `costs` array was never used).
        print(
            "episode:", n, "total reward: %.1f" % totalreward,
            "avg reward (last 100): %.1f" %
            totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
Code example #2
0
def main():
    """Run random-search policy optimization on MountainCarContinuous-v0.

    Creates a policy model over RBF features and a TF session, searches for
    good policy weights, then evaluates the best policy over 100 episodes
    and plots the per-episode rewards from the search.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(ft, D, [], [])
    session = tf.InteractiveSession()
    pmodel.set_session(session)
    # The model initializes its own variables; no global
    # tf.global_variables_initializer() run is needed here.
    pmodel.init_vars()
    gamma = 0.99  # discount factor

    # Optionally record videos/stats when 'monitor' is passed on the CLI.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    totalrewards, pmodel = random_search(env, pmodel, gamma)
    print("max reward:", np.max(totalrewards))

    # Play 100 episodes with the best policy and report the average.
    avg_totalrewards = play_multiple_episodes(env,
                                              100,
                                              pmodel,
                                              gamma,
                                              print_iters=True)
    print("avg reward over 100 episodes with best models:", avg_totalrewards)

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
Code example #3
0
def main():
  """Train a TD(0) actor-critic agent on MountainCarContinuous-v0.

  Builds RBF features, a policy model and a value model sharing one TF
  session, trains for N episodes, then plots the per-episode rewards,
  the running average, and the cost-to-go surface.
  """
  env = gym.make('MountainCarContinuous-v0')
  ft = FeatureTransformer(env, n_components=100)
  D = ft.dimensions
  pmodel = PolicyModel(D, ft, [])
  vmodel = ValueModel(D, ft, [])
  init = tf.compat.v1.global_variables_initializer()
  session = tf.compat.v1.InteractiveSession()
  session.run(init)
  pmodel.set_session(session)
  vmodel.set_session(session)
  gamma = 0.95  # discount factor

  # Optionally record videos/stats when 'monitor' is passed on the CLI.
  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

  N = 50
  totalrewards = np.empty(N)
  for n in range(N):
    totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)
    totalrewards[n] = totalreward
    # Report progress every episode (the original `n % 1 == 0` guard was
    # always true, and the `costs` array was never used).
    print("episode:", n, "total reward: %.1f" % totalreward, "num steps: %d" % num_steps, "avg reward (last 100): %.1f" % totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
  plot_cost_to_go(env, vmodel)
def main():
    """Run random-search policy optimization on MountainCarContinuous-v0.

    Searches for good policy weights, then evaluates the best policy over
    100 episodes and plots the per-episode rewards from the search.
    """
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env)
    D = ft.dimensions
    pmodel = PolicyModel(ft, D, [], [])
    gamma = 0.99  # discount factor

    # Optionally record videos/stats when 'monitor' is passed on the CLI.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    totalrewards, pmodel = random_search(env, pmodel, gamma)

    # Colon added for consistency with the sibling script's output.
    print("max reward:", np.max(totalrewards))

    # Play 100 episodes with the best policy and report the average.
    avg_totalrewards = play_multiple_episodes(env,
                                              100,
                                              pmodel,
                                              gamma,
                                              print_iters=True)
    # Fixed typo: "episoes" -> "episodes".
    print("avg reward over 100 episodes with best models:", avg_totalrewards)

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()
Code example #5
0
            actions.pop(0)
    else:
        while len(rewards) > 0:
            guess_rewards = rewards + [-1] * (n - len(rewards))
            G = multiplier.dot(guess_rewards)
            model.update(states[0], actions[0], G)
            rewards.pop(0)
            states.pop(0)
            actions.pop(0)

    return totalreward


# Script entry point: train a Model on MountainCar-v0 with an
# epsilon-greedy policy whose epsilon decays per episode.
# NOTE(review): the loop body may be truncated here — there is likely a
# progress print after the reward assignment; confirm against the full file.
if __name__ == '__main__':
    env = gym.make('MountainCar-v0')
    ft = FeatureTransformer(env)
    # "constant" presumably selects a learning-rate schedule — confirm in Model.
    model = Model(env, ft, "constant")
    gamma = 0.99  # discount factor

    # Optionally record videos/stats when 'monitor' is passed on the CLI.
    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 300
    totalrewards = np.empty(N)
    costs = np.empty(N)  # NOTE(review): allocated but unused in the visible code
    for n in range(N):
        # Epsilon decays geometrically: 0.1 * 0.97^n.
        eps = 0.1 * (0.97**n)
        totalreward = play_one(model, eps, gamma)
        totalrewards[n] = totalreward
Code example #6
0
# Exploratory notebook cells (exported from Jupyter; "# In[nn]:" are cell
# markers). They poke at the environment and the FeatureTransformer;
# `env`, `ft`, and `observation` are defined in cells outside this view.
observation

# In[71]:

# NOTE(review): this samples from observation_space but is passed to
# env.step() as an action below — looks like it should be
# env.action_space.sample(); confirm before relying on this cell.
actions = np.atleast_1d(env.observation_space.sample())

# In[72]:

observation, reward, done, info = env.step(actions)

# In[73]:

observation

# In[74]:

ft.transform(observations=observation)

# In[33]:

ft = FeatureTransformer(env, n_components=100)

# In[37]:

ft.transform(np.atleast_2d(env.reset()))

# In[38]:

np.atleast_2d(env.reset())
        # an object where the actual action is stored in object[0]
        observation, reward, done, info = env.step([action])
        totalreward += reward
        # Update models
        V_next = vmodel.predict(observation)
        G = reward + gamma * V_next
        advantage = G - vmodel.predict(prev_observation)
        pmodel.partial_fit(prev_observation, action, advantage)
        vmodel.partial_fit(prev_observation, G)
        iters += 1
    return totalreward, iters


# In[7]:

# Build the policy/value models over RBF features and share one TF1
# session between them; the training loop that uses them follows below.
ft = FeatureTransformer(env, n_components=100)
D = ft.dimensions
pmodel = PolicyModel(D, ft, [])
vmodel = ValueModel(D, ft, [])
init = tf.global_variables_initializer()
session = tf.InteractiveSession()
session.run(init)
pmodel.set_session(session)
vmodel.set_session(session)
gamma = 0.95  # discount factor

N = 50
totalrewards = np.empty(N)
costs = np.empty(N)  # NOTE(review): appears unused in the visible code
for n in range(N):
    totalreward, num_steps = play_one_td(env, pmodel, vmodel, gamma)