def main(args, reward_result):
    with tf.Session() as sess:
        # Initialize environment and seed
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env = allCars()

        # Dimensions of the car environment
        state_dim = 6
        action_dim = 1
        action_bound = 5

        # Actor-critic networks and exploration noise
        actor = ActorNetwork(sess, state_dim, action_dim, action_bound,
                             float(args['actor_lr']), float(args['tau']),
                             int(args['minibatch_size']))
        critic = CriticNetwork(sess, state_dim, action_dim,
                               float(args['critic_lr']), float(args['tau']),
                               float(args['gamma']),
                               actor.get_num_trainable_vars())
        actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

        [summary_ops, summary_vars, paths] = train(sess, env, args, actor, critic,
                                                   actor_noise, reward_result)

    return [summary_ops, summary_vars, paths]
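For reference, a minimal sketch of what the OrnsteinUhlenbeckActionNoise class used above typically looks like: it generates temporally correlated exploration noise via an Euler-Maruyama discretization of the OU process. The sigma, theta, and dt defaults are illustrative assumptions, not values taken from this code base.

import numpy as np

class OrnsteinUhlenbeckActionNoise:
    """Temporally correlated noise for exploration in continuous action spaces."""
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta, self.dt, self.x0 = mu, sigma, theta, dt, x0
        self.reset()

    def __call__(self):
        # Discretization of dx = theta * (mu - x) * dt + sigma * dW
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)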
def main(args, reward_result):
    with tf.Session() as sess:
        # Initialize environment and seed
        np.random.seed(int(args['random_seed']))
        tf.set_random_seed(int(args['random_seed']))
        env = allCars()

        # Dimensions of the car environment
        state_dim = 6
        action_dim = 1
        action_bound = 5

        # Learner encapsulates the actor-critic networks and the training loop
        learner = Learner(sess, env, state_dim, action_dim, action_bound,
                          float(args['actor_lr']), float(args['critic_lr']),
                          float(args['tau']), float(args['gamma']),
                          int(args['minibatch_size']))

        # [summary_ops, summary_vars, paths, reward_result] = learner.train(replay_buffer, minibatch_size)
        [summary_ops, summary_vars, paths, reward_result] = learner.train_rollout(args, reward_result)

    return [summary_ops, summary_vars, paths]
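Both versions of main expect args to be a dictionary of hyperparameters. A hypothetical driver is sketched below; the keys match those read inside main, but the numeric values and the size of the reward_result buffer are illustrative assumptions only.

import numpy as np

if __name__ == '__main__':
    # Hypothetical hyperparameters; keys match those used in main(),
    # values are illustrative defaults, not taken from this code base.
    args = {
        'random_seed': 1234,
        'actor_lr': 1e-4,
        'critic_lr': 1e-3,
        'tau': 1e-3,
        'gamma': 0.99,
        'minibatch_size': 64,
    }
    reward_result = np.zeros(2500)  # assumed per-episode return buffer
    summary_ops, summary_vars, paths = main(args, reward_result)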
# Continuous environments
ENVIRONMENT = 'Car-Data'
# ENVIRONMENT = 'Pendulum-v0'
# ENVIRONMENT = 'CartPole-v1'
# ENVIRONMENT = 'MountainCarContinuous-v0'
# ENVIRONMENT = 'LunarLanderContinuous-v2'
# ENVIRONMENT = 'BipedalWalker-v2'
# ENVIRONMENT = 'BipedalWalkerHardcore-v2'
# ENVIRONMENT = 'CarRacing-v0'

TIMESTAMP = datetime.now().strftime("%Y%m%d-%H%M%S")
SUMMARY_DIR = os.path.join(OUTPUT_RESULTS_DIR, "PPO", ENVIRONMENT, TIMESTAMP)

# env = gym.make(ENVIRONMENT)
env = allCars()
# env = wrappers.Monitor(env, os.path.join(SUMMARY_DIR, ENVIRONMENT), video_callable=None)

ppo = PPO(env, SUMMARY_DIR, gpu=True)
if MODEL_RESTORE_PATH is not None:
    ppo.restore_model(MODEL_RESTORE_PATH)

t, terminal = 0, False
buffer_s, buffer_a, buffer_r, buffer_v, buffer_terminal = [], [], [], [], []
rolling_r = RunningStats()

# Get prior and set tuning parameters for adaptive regularization weight
prior = BasePrior()
lambda_store = np.zeros(BATCH + 1)
lambda_all = np.zeros(EP_MAX + 1)
lambda_max = 8
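RunningStats (rolling_r above) keeps running statistics, presumably of the rewards, so they can be rescaled during PPO training. A minimal sketch based on Welford's online algorithm is shown below; the update/mean/std interface is an assumption, and the actual class in this code base may differ.

import numpy as np

class RunningStats:
    """Online mean/variance tracker (Welford's algorithm)."""
    def __init__(self):
        self.n = 0
        self.mean = 0.0
        self.m2 = 0.0

    def update(self, x):
        # Accept a scalar or an array of new samples.
        for v in np.ravel(x):
            self.n += 1
            delta = v - self.mean
            self.mean += delta / self.n
            self.m2 += delta * (v - self.mean)

    @property
    def var(self):
        return self.m2 / (self.n - 1) if self.n > 1 else 1.0

    @property
    def std(self):
        return np.sqrt(self.var)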