Example 1
def main(args, reward_result):

    with tf.Session() as sess:

        # Initialize environment and seed
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env = allCars()      # project-specific driving environment
        state_dim = 6        # size of the observation vector
        action_dim = 1       # single continuous action
        action_bound = 5     # maximum allowed action magnitude

        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))

        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())

        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        [summary_ops, summary_vars,
         paths] = train(sess, env, args, actor, critic, actor_noise,
                        reward_result)

        return [summary_ops, summary_vars, paths]
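The main() above draws exploration noise from OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim)), a class the listing does not include. The sketch below is only the usual Ornstein-Uhlenbeck process used for DDPG exploration; theta, sigma and dt are assumed defaults, not values taken from this project.

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated exploration noise (minimal sketch, not the project's code)."""

    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        # x_{t+1} = x_t + theta * (mu - x_t) * dt + sigma * sqrt(dt) * N(0, 1)
        x = (self.x_prev
             + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)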
Example 2
def main(args, reward_result):

    with tf.Session() as sess:

        # Initialize environment and seed
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env = allCars()
        state_dim = 6
        action_dim = 1
        action_bound = 5

        learner = Learner(sess, env, state_dim, action_dim, action_bound,
                          float(args['actor_lr']), float(args['critic_lr']),
                          float(args['tau']), float(args['gamma']),
                          int(args['minibatch_size']))

        #[summary_ops, summary_vars, paths, reward_result] = learner.train(replay_buffer, minibatch_size)
        [summary_ops, summary_vars, paths,
         reward_result] = learner.train_rollout(args, reward_result)

        return [summary_ops, summary_vars, paths]
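Both variants of main() read the same hyperparameter keys out of args and fill a reward_result buffer. The driver below is only a hypothetical illustration of the expected call shape; the numeric values and the buffer length are placeholders, not the project's settings.

import numpy as np

# Hypothetical settings; keys mirror those read inside main(), values are placeholders.
args = {
    'random_seed': 1234,
    'actor_lr': 1e-4,
    'critic_lr': 1e-3,
    'tau': 1e-3,
    'gamma': 0.99,
    'minibatch_size': 64,
}

reward_result = np.zeros(1000)  # assumed per-episode reward buffer
summary_ops, summary_vars, paths = main(args, reward_result)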
Example 3
    # Environment selection (mostly continuous-control tasks)
    ENVIRONMENT = 'Car-Data'
    # ENVIRONMENT = 'Pendulum-v0'
    # ENVIRONMENT = 'CartPole-v1'
    # ENVIRONMENT = 'MountainCarContinuous-v0'
    # ENVIRONMENT = 'LunarLanderContinuous-v2'
    # ENVIRONMENT = 'BipedalWalker-v2'
    # ENVIRONMENT = 'BipedalWalkerHardcore-v2'
    # ENVIRONMENT = 'CarRacing-v0'

    TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
    SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "PPO", ENVIRONMENT,
                               TIMESTAMP)

    #env = gym.make(ENVIRONMENT)
    env = allCars()
    #env = wrappers.Monitor(env, os.path.join(SUMMARY_DIR, ENVIRONMENT), video_callable=None)
    ppo = PPO(env, SUMMARY_DIR, gpu=True)

    if MODEL_RESTORE_PATH is not None:
        ppo.restore_model(MODEL_RESTORE_PATH)

    t, terminal = 0, False
    buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
    rolling_r = RunningStats()

    # Get prior and set tuning parameters for adaptive regularization weight
    prior = BasePrior()
    lambda_store = np.zeros(BATCH + 1)  # regularization weight at each step of a batch
    lambda_all = np.zeros(EP_MAX + 1)   # regularization weight recorded per episode
    lambda_max = 8                      # upper bound on the adaptive weight
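Example 3 keeps a RunningStats() object (rolling_r), which PPO implementations commonly use to track a running mean and standard deviation for reward normalization. Its implementation is not shown in the listing; the Welford-style sketch below is an assumption, not the project's code.

import numpy as np

class RunningStats:
    """Running mean/standard deviation via Welford's algorithm (sketch only)."""

    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.m2 = 0.0

    def update(self, xs):
        # Fold a scalar or an array of new samples into the running statistics.
        for x in np.ravel(xs):
            self.n += 1
            delta = x - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / self.n) if self.n > 1 else 1.0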