def main():
  env = gym.make('CartPole-v0')
  gamma = 0.99
  copy_period = 50

  D = len(env.observation_space.sample())
  K = env.action_space.n
  sizes = [200,200]
  model = DQN(D, K, sizes, gamma)
  tmodel = DQN(D, K, sizes, gamma)

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

  N = 500
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    eps = 1.0/np.sqrt(n+1)
    totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
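# Note: every example in this listing calls a plot_running_avg helper that is
# never shown. A minimal sketch of what it presumably does, based on how it is
# used here: smooth the per-episode rewards with a trailing 100-episode mean.
import numpy as np
import matplotlib.pyplot as plt

def plot_running_avg(totalrewards):
  N = len(totalrewards)
  running_avg = np.empty(N)
  for t in range(N):
    # Mean over the last 100 episodes (fewer at the start of training).
    running_avg[t] = totalrewards[max(0, t-100):(t+1)].mean()
  plt.plot(running_avg)
  plt.title("Running Average")
  plt.show()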
Example #2
def main():
    env = gym.make('MountainCarContinuous-v0')
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(D, ft)
    vmodel = ValueModel(D, ft)
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 50
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        totalreward = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        if n % 1 == 0:
            print(
                "episode:", n, "total reward: %.1f" % totalreward,
                "avg reward (last 100): %.1f" %
                totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
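# Note: plot_cost_to_go is not defined in this listing either. A minimal
# sketch (an assumption based on its use with MountainCar): evaluate the value
# estimator on a grid over the 2-D observation space and plot the negative
# value, i.e. the cost-to-go, as a surface.
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- registers the 3-D projection

def plot_cost_to_go(env, estimator, num_tiles=20):
    x = np.linspace(env.observation_space.low[0], env.observation_space.high[0], num=num_tiles)
    y = np.linspace(env.observation_space.low[1], env.observation_space.high[1], num=num_tiles)
    X, Y = np.meshgrid(x, y)
    # Cost-to-go is defined as the negative of the (maximal) predicted value.
    Z = np.apply_along_axis(lambda obs: -np.max(estimator.predict(obs)), 2, np.dstack([X, Y]))
    fig = plt.figure(figsize=(10, 5))
    ax = fig.add_subplot(111, projection='3d')
    ax.plot_surface(X, Y, Z)
    ax.set_xlabel('Position')
    ax.set_ylabel('Velocity')
    ax.set_zlabel('Cost-To-Go == -V(s)')
    plt.show()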
Example #3
def main():
    env = gym.make('CartPole-v0')
    D = env.observation_space.shape[0]
    K = env.action_space.n
    pmodel = PolicyModel(D, K, [])
    vmodel = ValueModel(D, [10])
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 1
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        totalreward = play_one_mc(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
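# Note: play_one_mc is not shown. A minimal sketch (an assumption from the
# call sites, with pmodel/vmodel assumed to expose sample_action, predict and
# partial_fit): run a full episode, then do Monte Carlo (REINFORCE-style)
# updates using the discounted returns as targets.
def play_one_mc(env, pmodel, vmodel, gamma):
    observation = env.reset()
    done = False
    totalreward = 0
    states, actions, rewards = [], [], []
    while not done:
        action = pmodel.sample_action(observation)
        states.append(observation)
        actions.append(action)
        observation, reward, done, _ = env.step(action)
        rewards.append(reward)
        totalreward += reward
    # Discounted returns, computed backwards through the episode.
    returns = []
    G = 0
    for r in reversed(rewards):
        G = r + gamma * G
        returns.append(G)
    returns.reverse()
    # Baseline-subtracted policy update; the value model regresses on G.
    for s, a, G in zip(states, actions, returns):
        advantage = G - vmodel.predict(s)
        pmodel.partial_fit(s, a, advantage)
        vmodel.partial_fit(s, G)
    return totalreward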
Example #4
def main():
    env = gym.make('CartPole-v0')
    D = env.observation_space.shape[0]
    K = env.action_space.n
    p_model = PolicyModel(D, K, [10])
    v_model = ValueModel(D, [5, 5])
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    p_model.set_session(session)
    v_model.set_session(session)
    gamma = 0.99

    N = 3000
    total_rewards = np.empty(N)
    for n in range(N):
        total_reward = play_one_mc(env, p_model, v_model, gamma)
        total_rewards[n] = total_reward
        if n % 100 == 0:
            print('episode:', n, 'total reward:', total_reward, 'avg reward (last 100):', total_rewards[max(0, n - 100):n+1].mean())

    print('avg reward for last 100 episodes:', total_rewards[-100:].mean())
    print('total steps:', total_rewards.sum())

    plt.plot(total_rewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(total_rewards)
Example #5
def main():
    env = gym.make('CartPole-v0').env
    gamma = 0.99
    copy_period = 50

    D = len(env.observation_space.sample())
    K = env.action_space.n

    sizes = [200, 200]
    model = DQN(D, K, sizes, gamma)
    tmodel = DQN(D, K, sizes, gamma)
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    model.set_session(session)
    tmodel.set_session(session)

    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = totalreward
        print(f'episode {n}, total reward {totalreward}')
    print(f'avg reward for last 100 episodes = {totalrewards[-100:].mean()}')
    print(f'total steps = {totalrewards.sum()}')

    plt.plot(totalrewards)
    plt.title('rewards')
    plt.show()

    plot_running_avg(totalrewards)
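# Note: the six-argument play_one used by the DQN examples is not shown. A
# minimal sketch (an assumption from the call sites; sample_action,
# add_experience, train and copy_from are assumed DQN methods): epsilon-greedy
# rollout, train the online network every step, and copy its weights into the
# target network every copy_period steps.
def play_one(env, model, tmodel, eps, gamma, copy_period):
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0
    while not done:
        action = model.sample_action(observation, eps)
        prev_observation = observation
        observation, reward, done, _ = env.step(action)
        totalreward += reward
        # Store the transition, then take one training step off the target network.
        model.add_experience(prev_observation, action, reward, observation, done)
        model.train(tmodel)
        iters += 1
        if iters % copy_period == 0:
            tmodel.copy_from(model)
    return totalreward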
Example #6
def main():
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
Example #7
def main():
  env = gym.make('CartPole-v0')
  ft = FeatureTransformer(env)
  model = Model(env, ft)
  gamma = 0.99

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)


  N = 500
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    eps = 1.0/np.sqrt(n+1)
    totalreward = play_one(env, model, eps, gamma)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
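# Note: the four-argument play_one used by the FeatureTransformer examples is
# not shown. A minimal sketch (an assumption from the call sites, with
# model.predict/update/sample_action assumed): epsilon-greedy rollout with a
# one-step Q-learning update after every transition.
def play_one(env, model, eps, gamma):
  observation = env.reset()
  done = False
  totalreward = 0
  while not done:
    action = model.sample_action(observation, eps)
    prev_observation = observation
    observation, reward, done, _ = env.step(action)
    totalreward += reward
    # One-step TD target: r + gamma * max_a' Q(s', a').
    G = reward + gamma * np.max(model.predict(observation))
    model.update(prev_observation, action, G)
  return totalreward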
Example #8
def main():
  env = gym.make('CartPole-v0')
  D = env.observation_space.shape[0]
  K = env.action_space.n
  pmodel = PolicyModel(D, K, [])
  vmodel = ValueModel(D, [10])
  init = tf.global_variables_initializer()
  session = tf.InteractiveSession()
  session.run(init)
  pmodel.set_session(session)
  vmodel.set_session(session)
  gamma = 0.99

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)

  N = 500
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    totalreward = play_one_mc(env, pmodel, vmodel, gamma)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
Example #9
def main(show_plots=True):
    env = gym.make('MountainCar-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft, 'constant')
    gamma = 0.99

    N = 300
    total_rewards = np.empty(N)
    for n in range(N):
        eps = 0.1 * (0.97**n)
        total_reward = play_one(model, eps, gamma)
        total_rewards[n] = total_reward

        if n % 10 == 0:
            print('episode:', n, 'total reward:', total_reward)

    print('avg reward for last 100 episodes:', total_rewards[-100:].mean())
    print('total steps:', -total_rewards.sum())

    if show_plots:
        plt.plot(total_rewards)
        plt.title('Rewards')
        plt.show()

        plot_running_avg(total_rewards)

        # Plot the optimal state-value function
        plot_cost_to_go(env, model)
Example #10
def main():
    env = gym.make('MountainCarContinuous-v0').env
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(ft, D, [])
    vmodel = ValueModel(ft, D, [])
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)
    gamma = 0.95

    N = 50
    totalrewards = np.empty(N)
    for n in range(N):
        totalreward, _ = play_one_td(env, pmodel, vmodel, gamma)
        totalrewards[n] = totalreward

        print(f"episode {n}, total rewards {totalreward}")

    plt.plot(totalrewards)
    plt.title('rewards')
    plt.show()

    plot_running_avg(totalrewards)
    plot_cost_to_go(env, vmodel)
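# Note: play_one_td is not shown. A minimal sketch (an assumption from the
# call sites; the two examples that call it unpack its result differently, so
# the exact signature may vary): one-step actor-critic, where the critic is
# moved toward the TD target and the actor is updated with the TD-error
# advantage.
def play_one_td(env, pmodel, vmodel, gamma):
    observation = env.reset()
    done = False
    totalreward = 0
    iters = 0
    while not done:
        action = pmodel.sample_action(observation)
        prev_observation = observation
        observation, reward, done, _ = env.step(action)
        totalreward += reward
        # One-step TD target and advantage.
        G = reward + gamma * vmodel.predict(observation)
        advantage = G - vmodel.predict(prev_observation)
        pmodel.partial_fit(prev_observation, action, advantage)
        vmodel.partial_fit(prev_observation, G)
        iters += 1
    return totalreward, iters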
Example #12
def laplace_env_main():
    session = tf.InteractiveSession()
    # train by a 200*200 grid
    laplace_env = create_LaplaceSolver(session, n=200)
    D = laplace_env.trained_NN.D
    K = 10
    pmodel = PolicyModel(D, K, [10, 5])
    vmodel = ValueModel(D, [20, 15])
    init = tf.global_variables_initializer()
    session.run(init)
    pmodel.set_session(session)
    vmodel.set_session(session)

    gamma = 0.99
    N = 60
    costs = np.empty(N)
    iters = []
    output_strings = []
    for n in range(N):
        # Keep the per-iteration errors only for the final training episode.
        if n == N - 1:
            ite, all_error = play_one_mc(laplace_env, pmodel, vmodel, gamma)
        else:
            ite, _ = play_one_mc(laplace_env, pmodel, vmodel, gamma)
        # Discard diverged episodes before updating the running statistics.
        if ite < 100000000:
            iters.append(ite)
        iters_np = np.array(iters[max(0, n - 100):(n + 1)])
        output_string = "episode: %d, iter: %d, average iters: %f \n" % (
            n, ite, iters_np.mean())
        output_strings.append(output_string)

        if len(iters) % 5 == 0:
            print(output_string)
            file_name = 'pg3.text'
            with open(file_name, "a") as text_file:
                text_file.write(output_string)

    iters_np = np.array(iters[-100:])
    print("avg iters for last 100 episodes:", iters_np.mean())
    print("total iters:", np.sum(iters))

    # After the training process, try the 500*500 grid.
    laplace_env_500 = create_LaplaceSolver(session, n=500)
    ite, errors = last_play(laplace_env_500, pmodel, vmodel)

    plt.plot(iters)
    plt.title("iters_while_training")
    plt.show()

    plot_running_avg(np.array(iters))

    plt.plot(errors)
    plt.title("grid 500 * 500 error")
    plt.show()
Example #13
def main():
    env = gym.make('Breakout-v0')
    gamma = 0.99
    copy_period = 10000

    D = len(env.observation_space.sample())
    K = env.action_space.n
    conv_sizes = [(32, 8, 4), (64, 4, 2), (64, 3, 1)]
    hidden_sizes = [512]
    # model = DQN(K, conv_sizes, hidden_sizes, gamma, scope='main')
    # tmodel = DQN(K, conv_sizes, hidden_sizes, gamma, scope='target')
    model = DQN(K, conv_sizes, hidden_sizes, gamma)
    tmodel = DQN(K, conv_sizes, hidden_sizes, gamma)
    init = tf.global_variables_initializer()
    session = tf.InteractiveSession()
    session.run(init)
    model.set_session(session)
    tmodel.set_session(session)

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 100000
    totalrewards = np.empty(N)
    costs = np.empty(N)
    n_max = 500000  # last step to decrease epsilon
    eps_step = 0.9 / n_max
    eps = 1.0
    for n in range(N):
        t0 = datetime.now()
        totalreward, eps, num_steps = play_one(env, model, tmodel, eps,
                                               eps_step, gamma, copy_period)
        totalrewards[n] = totalreward
        if n % 1 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:",
                  "%.3f" % eps, "num steps:", num_steps, "episode duration:",
                  (datetime.now() - t0), "avg reward (last 100):",
                  "%.3f" % totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
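# Note: this seven-argument play_one anneals epsilon per environment step. A
# minimal sketch (an assumption from the call site; frame preprocessing and
# the replay buffer are folded into the assumed add_experience/train methods,
# and the copy counter is kept per episode here rather than globally):
def play_one(env, model, tmodel, eps, eps_step, gamma, copy_period):
    observation = env.reset()
    done = False
    totalreward = 0
    num_steps = 0
    while not done:
        action = model.sample_action(observation, eps)
        prev_observation = observation
        observation, reward, done, _ = env.step(action)
        totalreward += reward
        model.add_experience(prev_observation, action, reward, observation, done)
        model.train(tmodel)
        num_steps += 1
        # Linear anneal from 1.0 down to 0.1 over n_max total steps.
        eps = max(eps - eps_step, 0.1)
        if num_steps % copy_period == 0:
            tmodel.copy_from(model)
    return totalreward, eps, num_steps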
Example #14
def main():
    env = gym.make('redtiebot-v0')
    gamma = 0.99
    copy_period = 50
    s_time = time.time()
    D = 5
    K = 9
    sizes = [10, 15, 20, 15, 10]
    model = DQN(D, K, sizes, gamma, env)
    tmodel = DQN(D, K, sizes, gamma, env)
    tmodel.load()
    init = tf.compat.v1.global_variables_initializer()
    session = tf.compat.v1.InteractiveSession()
    session.run(init)
    model.set_session(session)
    tmodel.set_session(session)

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 500
    model.set_max_guided_run(int(.25 * N))
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())
            print("time: " + str(time.time() - s_time))
            s_time = time.time()

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)
Example #15
def main():
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99

    if 'monitor' in sys.argv:
        filename = os.path.basename(__file__).split('.')[0]
        monitor_dir = './' + filename + '_' + str(datetime.now())
        env = wrappers.Monitor(env, monitor_dir)

    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1.0 / np.sqrt(n + 1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward
        if n % 100 == 0:
            print("episode:", n, "total reward:", totalreward, "eps:", eps,
                  "avg reward (last 100):",
                  totalrewards[max(0, n - 100):(n + 1)].mean())

    print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
    print("total steps:", totalrewards.sum())

    plt.plot(totalrewards)
    plt.title("Rewards")
    plt.show()

    plot_running_avg(totalrewards)

    # Record one final greedy run: always take the argmax action.
    env = wrappers.Monitor(env, 'cart_pole')
    observation = env.reset()
    done = False
    iters = 0
    while not done and iters < 5000:
        action = np.argmax(model.predict(observation))
        observation, reward, done, info = env.step(action)
        iters += 1
Example #16
def main():
    env = gym.make('CartPole-v0').env
    ft = FeatureTransformer(env)
    model = Model(env, ft)
    gamma = 0.99

    N = 500
    totalrewards = np.empty(N)
    costs = np.empty(N)
    for n in range(N):
        eps = 1 / np.sqrt(n + 1)
        totalreward = play_one(env, model, eps, gamma)
        totalrewards[n] = totalreward

        if n % 100 == 0:
            print(f'episode {n}, total reward {totalreward}')
    print(f'avg reward for last 100 episodes = {totalrewards[-100:].mean()}')
    print(f'total steps = {totalrewards.sum()}')

    plt.plot(totalrewards)
    plt.title('rewards')
    plt.show()

    plot_running_avg(totalrewards)
Example #17
def main():
    env = gym.make('MountainCarContinuous-v0').env
    ft = FeatureTransformer(env, n_components=100)
    D = ft.dimensions
    pmodel = PolicyModel(ft, D, [], [])
    session = tf.InteractiveSession()
    pmodel.set_session(session)
    pmodel.init_vars()
    gamma = 0.99

    
    totalreward, pmodel = random_search(
        env, pmodel, gamma
    )
    print(f'max rewards {max(totalreward)}')
    
    avg_totalrewards = play_multiple_episodes(env, 100, pmodel, gamma)
    print(f'avg reward = {avg_totalrewards}')
    
    plt.plot(totalreward)
    plt.title('rewards')
    plt.show()

    plot_running_avg(totalreward)
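# Note: random_search and play_multiple_episodes are not shown. A minimal
# sketch (an assumption; copy, perturb_params and sample_action are assumed
# PolicyModel methods): hill-climb in parameter space by perturbing the policy
# parameters and keeping any perturbation that improves the average reward.
def play_one(env, pmodel, gamma):
    observation = env.reset()
    done = False
    totalreward = 0
    while not done:
        action = pmodel.sample_action(observation)
        observation, reward, done, _ = env.step(action)
        totalreward += reward
    return totalreward

def play_multiple_episodes(env, T, pmodel, gamma):
    totalrewards = np.empty(T)
    for i in range(T):
        totalrewards[i] = play_one(env, pmodel, gamma)
    return totalrewards.mean()

def random_search(env, pmodel, gamma):
    totalrewards = []
    best_avg_totalreward = float('-inf')
    best_pmodel = pmodel
    for t in range(100):
        tmp_pmodel = best_pmodel.copy()
        tmp_pmodel.perturb_params()
        avg_totalrewards = play_multiple_episodes(env, 3, tmp_pmodel, gamma)
        totalrewards.append(avg_totalrewards)
        if avg_totalrewards > best_avg_totalreward:
            best_pmodel = tmp_pmodel
            best_avg_totalreward = avg_totalrewards
    return totalrewards, best_pmodel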
Example #18
def main():
    env = gym.make('CartPole-v0')
    ft = FeatureTransformer()
    model = Model(env, ft)
    N = 500
    total_rewards = np.empty(N)

    for n in range(N):
        eps = 1 / np.sqrt(n + 1)
        total_reward = play_one(model, eps)
        total_rewards[n] = total_reward

        if (n + 1) % 100 == 0:
            print('episode:', (n + 1), 'total reward:', total_reward, 'eps:', eps, 'avg_reward (last 100):', total_rewards[max(0, n - 100):n+1].mean())

    print('avg reward for the last 100 episodes:', total_rewards[-100:].mean())

    plt.plot(total_rewards)
    plt.title('Total Rewards')
    plt.show()

    plot_running_avg(total_rewards)

#main()
Example #19
def main():
  # NOTE: the opening of this example is missing from the listing; the setup
  # below is an assumed reconstruction following the pattern of the other
  # CartPole examples here.
  env = gym.make('CartPole-v0')
  ft = FeatureTransformer(env)
  model = Model(env, ft)
  gamma = 0.99
  lambda_ = 0.7

  if 'monitor' in sys.argv:
    filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)


  N = 500
  totalrewards = np.empty(N)
  # costs = np.empty(N)
  for n in range(N):
    # eps = 1.0/(0.1*n+1)
    # eps = 0.1*(0.97**n)
    eps = 1.0/np.sqrt(n+1)
    # eps = 0.1
    states_actions_rewards, totalreward = play_one(model, env, eps, gamma, lambda_)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())
  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)
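# Note: the TD(lambda) play_one above is not shown. A minimal sketch (an
# assumption from the call site; the model is assumed to keep eligibility
# traces internally and to accept gamma and lambda_ in its update):
def play_one(model, env, eps, gamma, lambda_):
  observation = env.reset()
  done = False
  totalreward = 0
  states_actions_rewards = []
  while not done:
    action = model.sample_action(observation, eps)
    prev_observation = observation
    observation, reward, done, _ = env.step(action)
    totalreward += reward
    # TD(lambda) target; the model decays its traces by gamma * lambda_.
    G = reward + gamma * np.max(model.predict(observation))
    model.update(prev_observation, action, G, gamma, lambda_)
    states_actions_rewards.append((prev_observation, action, reward))
  return states_actions_rewards, totalreward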


Example #20
def main():
  # NOTE: the opening of this example is missing from the listing; the setup
  # below is an assumed reconstruction following the DQN examples above.
  env = gym.make('CartPole-v0')
  gamma = 0.99
  copy_period = 50

  D = len(env.observation_space.sample())
  K = env.action_space.n
  sizes = [200, 200]
  model = DQN(D, K, sizes, gamma)
  tmodel = DQN(D, K, sizes, gamma)

  N = 500
  totalrewards = np.empty(N)
  costs = np.empty(N)
  for n in range(N):
    eps = 1.0/np.sqrt(n+1)
    totalreward = play_one(env, model, tmodel, eps, gamma, copy_period)
    totalrewards[n] = totalreward
    if n % 100 == 0:
      print("episode:", n, "total reward:", totalreward, "eps:", eps, "avg reward (last 100):", totalrewards[max(0, n-100):(n+1)].mean())

  print("avg reward for last 100 episodes:", totalrewards[-100:].mean())
  print("total steps:", totalrewards.sum())

  plt.plot(totalrewards)
  plt.title("Rewards")
  plt.show()

  plot_running_avg(totalrewards)


if __name__ == '__main__':
  main()