def test_ppo_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30

    def uncertainty_estimator_fn(observation_space, action_space):
        counter = DiscreteCounter(observation_space, action_space,
                                  n_bins_obs=20)
        return counter

    agent = PPOAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     eps_clip=0.2,
                     k_epochs=4,
                     use_bonus=True,
                     uncertainty_estimator_kwargs=dict(
                         uncertainty_estimator_fn=uncertainty_estimator_fn,
                         bonus_scale_factor=1))
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_adaptive_ql():
    env = get_benchmark_env(level=2)
    agent = AdaptiveQLAgent(env, n_episodes=50, horizon=30)
    agent.fit()
    agent.policy(env.observation_space.sample())
    agent.Qtree.plot(0, 20)
    plt.clf()
def run_experiment(params, optimize_hyperparams):
    """Main experiment function."""
    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2)
    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2)
    agent_stats_list = stats.values()

    # Optimize hyperparameters (timeout after 10 seconds)
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10)

    # Learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # Compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
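# Usage sketch (not part of the original file): run_experiment itself only
# reads the 'horizon' key from each agent's kwargs; the remaining entries
# below are assumed examples of valid PPOAgent/A2CAgent init parameters,
# mirroring the values used in the tests above.
if __name__ == '__main__':
    params = {
        'ppo': dict(n_episodes=100, horizon=30, gamma=0.99,
                    learning_rate=0.001),
        'a2c': dict(n_episodes=100, horizon=30, gamma=0.99,
                    learning_rate=0.001),
    }
    run_experiment(params, optimize_hyperparams=False)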
def test_reinforce_agent():
    _env = get_benchmark_env(level=1)
    n_episodes = 50
    horizon = 30

    def uncertainty_estimator_fn(observation_space, action_space):
        counter = DiscreteCounter(observation_space, action_space,
                                  n_bins_obs=20)
        return counter

    env = UncertaintyEstimatorWrapper(_env,
                                      uncertainty_estimator_fn,
                                      bonus_scale_factor=1.0)

    agent = REINFORCEAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.001,
        use_bonus_if_available=True,
    )
    agent.fit(budget=n_episodes)
    agent.policy(env.observation_space.sample())
def test_rs_ucbvi_reward_free():
    env = get_benchmark_env(level=1)
    agent = RSUCBVIAgent(env,
                         gamma=0.99,
                         horizon=30,
                         bonus_scale_factor=0.1,
                         reward_free=True)
    agent.fit(budget=5)
    agent.policy(env.observation_space.sample())
    assert agent.R_hat.sum() == 0.0
def test_rs_ucbvi():
    env = get_benchmark_env(level=1)
    agent = RSUCBVIAgent(env,
                         n_episodes=5,
                         gamma=0.99,
                         horizon=30,
                         bonus_scale_factor=0.1)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
def run_experiment(params, optimize_hyperparams, rlberry_seed):
    """Main experiment function."""
    seeding.set_global_seed(rlberry_seed)

    # Choose environment
    env = get_benchmark_env(level=1)

    # Initialize AgentStats
    stats = {}
    stats['ppo'] = AgentStats(PPOAgent,
                              env,
                              init_kwargs=params['ppo'],
                              eval_horizon=params['ppo']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 2nd PPO thread
    # stats['ppo'].set_writer(1, None)
    stats['a2c'] = AgentStats(A2CAgent,
                              env,
                              init_kwargs=params['a2c'],
                              eval_horizon=params['a2c']['horizon'],
                              n_fit=2,
                              output_dir=fs_observer.dir)
    # uncomment to disable writer of the 1st A2C thread
    # stats['a2c'].set_writer(0, None)
    agent_stats_list = stats.values()

    # Optimize hyperparameters (timeout after 10 seconds)
    if optimize_hyperparams:
        for agent_stats in agent_stats_list:
            agent_stats.optimize_hyperparams(n_trials=50, timeout=10, n_fit=2)

    # Fit with best hyperparameters and save results
    for agent_stats in agent_stats_list:
        agent_stats.fit()
        agent_stats.save_results()

    # Learning curves
    plot_episode_rewards(agent_stats_list, cumulative=True, show=False)

    # Compare final policies
    output = compare_policies(agent_stats_list, n_sim=10)
    print(output)
def test_reinforce_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30
    agent = REINFORCEAgent(env,
                           n_episodes=n_episodes,
                           horizon=horizon,
                           gamma=0.99,
                           learning_rate=0.001)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_a2c_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30
    agent = A2CAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     k_epochs=4)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_rs_kernel_ucbvi(kernel_type):
    for horizon in [None, 30]:
        env = get_benchmark_env(level=1)
        agent = RSKernelUCBVIAgent(env,
                                   n_episodes=5,
                                   gamma=0.95,
                                   horizon=horizon,
                                   bonus_scale_factor=0.01,
                                   min_dist=0.2,
                                   bandwidth=0.05,
                                   beta=1.0,
                                   kernel_type=kernel_type)
        agent._log_interval = 0
        agent.fit()
        agent.policy(env.observation_space.sample())
def test_avec_ppo_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    horizon = 30
    agent = AVECPPOAgent(env,
                         n_episodes=n_episodes,
                         horizon=horizon,
                         gamma=0.99,
                         learning_rate=0.001,
                         eps_clip=0.2,
                         k_epochs=4,
                         batch_size=1)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_cem_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5
    batch_size = 100
    horizon = 30
    gamma = 0.99
    agent = CEMAgent(env,
                     n_episodes,
                     horizon=horizon,
                     gamma=gamma,
                     batch_size=batch_size,
                     percentile=70,
                     learning_rate=0.01)
    agent._log_interval = 0
    agent.fit()
    agent.policy(env.observation_space.sample())
def test_reinforce_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30
    agent = REINFORCEAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.001,
        use_bonus_if_available=False,
    )
    agent.fit(budget=n_episodes // 2)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.fit(budget=n_episodes // 2)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
def test_sac_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    agent = SACAgent(
        env,
        gamma=0.99,
        learning_rate=0.001,
        k_epochs=4,
        use_bonus=False,
        device="cpu",
    )
    agent.fit(budget=n_episodes // 2)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.fit(budget=n_episodes // 2)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
def test_a2c_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30
    agent = A2CAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     k_epochs=4,
                     use_bonus=False)
    agent._log_interval = 0
    agent.partial_fit(0.5)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.partial_fit(0.5)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
def test_ppo_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30
    agent = PPOAgent(
        env,
        horizon=horizon,
        gamma=0.99,
        learning_rate=0.001,
        eps_clip=0.2,
        k_epochs=4,
        use_bonus=False,
    )
    agent.fit(budget=n_episodes // 2)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.fit(budget=n_episodes // 2)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
def test_ppo_agent_partial_fit():
    env = get_benchmark_env(level=1)
    n_episodes = 10
    horizon = 30
    agent = PPOAgent(env,
                     n_episodes=n_episodes,
                     horizon=horizon,
                     gamma=0.99,
                     learning_rate=0.001,
                     eps_clip=0.2,
                     k_epochs=4,
                     batch_size=1)
    agent._log_interval = 0
    agent.partial_fit(0.5)
    agent.policy(env.observation_space.sample())
    assert agent.episode == 5
    agent.partial_fit(0.5)
    assert agent.episode == 10
    agent.policy(env.observation_space.sample())
def test_rnd():
    # Environment
    env = get_benchmark_env(level=1)

    # RND
    rnd = RandomNetworkDistillation(env.observation_space,
                                    env.action_space,
                                    learning_rate=0.1,
                                    update_period=100,
                                    embedding_dim=2)

    # Test: take random actions and update the RND networks online
    state = env.reset()
    for ii in range(1000):
        action = env.action_space.sample()
        next_s, reward, _, _ = env.step(action)
        rnd.update(state, action, next_s, reward)
        state = next_s

    # measure uncertainty
    _ = rnd.measure(state, action)
def test_sac_agent():
    env = get_benchmark_env(level=1)
    n_episodes = 5

    def uncertainty_estimator_fn(observation_space, action_space):
        counter = DiscreteCounter(observation_space, action_space,
                                  n_bins_obs=20)
        return counter

    agent = SACAgent(
        env,
        gamma=0.99,
        learning_rate=0.001,
        k_epochs=4,
        use_bonus=True,
        uncertainty_estimator_kwargs=dict(
            uncertainty_estimator_fn=uncertainty_estimator_fn,
            bonus_scale_factor=1.0),
        device="cpu",
    )
    agent.fit(budget=n_episodes)
    agent.policy(env.observation_space.sample())
import rlberry.seeding as seeding
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
from rlberry.agents import MBQVIAgent
from rlberry.agents.ppo import PPOAgent
from rlberry.utils.logging import configure_logging
from rlberry.wrappers import DiscretizeStateWrapper
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies

configure_logging("DEBUG")

# global seed
seeding.set_global_seed(1234)

# --------------------------------
# Define train and evaluation envs
# --------------------------------
train_env = get_benchmark_env(level=5)
d_train_env = DiscretizeStateWrapper(train_env, 20)

# -----------------------------
# Parameters
# -----------------------------
N_EPISODES = 500
GAMMA = 0.99
HORIZON = 50
params_oracle = {
    "n_samples": 20,  # samples per state-action
    "gamma": GAMMA,
    "horizon": HORIZON
}
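# Sketch of an assumed continuation (not in the original snippet): fit the
# "oracle" model-based Q-value-iteration agent on the discretized env using
# params_oracle defined above; usage is inferred from the imports.
oracle_agent = MBQVIAgent(d_train_env, **params_oracle)
oracle_agent.fit()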
def test_ball2d_benchmark_instantiation():
    for level in [1, 2, 3, 4, 5]:
        env = get_benchmark_env(level)
        for aa in range(env.action_space.n):
            env.step(aa)
            env.sample(env.observation_space.sample(), aa)
def test_rs_ucbvi():
    env = get_benchmark_env(level=1)
    agent = RSUCBVIAgent(env,
                         gamma=0.99,
                         horizon=30,
                         bonus_scale_factor=0.1)
    agent.fit(budget=5)
    agent.policy(env.observation_space.sample())
from rlberry.agents.cem import CEMAgent
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
import rlberry.seeding as seeding

seeding.set_global_seed(1234)

env = get_benchmark_env(level=1)

n_episodes = 1000
horizon = 30
gamma = 0.99

params = {
    'n_episodes': n_episodes,
    'horizon': horizon,
    'gamma': gamma,
    'batch_size': 20,
    'percentile': 70,
    'learning_rate': 0.01
}

agent = CEMAgent(env, **params)
agent.fit()

env.enable_rendering()
state = env.reset()
for tt in range(4 * horizon):
    action = agent.policy(state)
    next_state, reward, done, _ = env.step(action)
    state = next_state
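# Assumed final step (not in the original snippet): with rendering enabled,
# rlberry envs record frames during the rollout; env.render() then shows them.
env.render()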
from copy import deepcopy

import rlberry.seeding as seeding
from rlberry.envs.benchmarks.ball_exploration.ball2d import get_benchmark_env
from rlberry.agents.ppo import PPOAgent
from rlberry.stats import AgentStats, plot_episode_rewards, compare_policies
from rlberry.exploration_tools.online_discretization_counter import OnlineDiscretizationCounter
from rlberry.exploration_tools.discrete_counter import DiscreteCounter
from rlberry.wrappers.uncertainty_estimator_wrapper import UncertaintyEstimatorWrapper

# global seed
seeding.set_global_seed(12345)

# --------------------------------
# Define train env
# --------------------------------
env = get_benchmark_env(level=4)
eval_env = get_benchmark_env(level=4)


def uncertainty_estimator_fn(obs_space, act_space):
    counter = DiscreteCounter(obs_space, act_space, n_bins_obs=20)
    return counter


# -----------------------------
# Parameters
# -----------------------------
N_EPISODES = 2000
GAMMA = 0.99
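# Sketch of an assumed continuation: wrap the training env so that agents
# receive exploration bonuses from the counter defined above, mirroring the
# UncertaintyEstimatorWrapper usage in test_reinforce_agent.
env = UncertaintyEstimatorWrapper(env,
                                  uncertainty_estimator_fn,
                                  bonus_scale_factor=1.0)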