def test_rescale_reward():
    # tolerance
    tol = 1e-14

    rng = seeding.get_rng()

    for _ in range(10):
        # generate random MDP
        S, A = 5, 2
        R = rng.uniform(0.0, 1.0, (S, A))
        P = rng.uniform(0.0, 1.0, (S, A, S))
        for ss in range(S):
            for aa in range(A):
                P[ss, aa, :] /= P[ss, aa, :].sum()
        env = FiniteMDP(R, P)

        # test
        wrapped = RescaleRewardWrapper(env, (-10, 10))

        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.sample(
                wrapped.observation_space.sample(),
                wrapped.action_space.sample())
            assert -10 - tol <= reward <= 10 + tol

        _ = wrapped.reset()
        for _ in range(100):
            _, reward, _, _ = wrapped.step(wrapped.action_space.sample())
            assert -10 - tol <= reward <= 10 + tol
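
# A minimal sketch of an affine reward rescaling consistent with the test
# above (an illustration, not necessarily rlberry's RescaleRewardWrapper):
# map a reward from a known range [rmin, rmax] to a target interval [a, b].
def rescale_reward_sketch(reward, rmin, rmax, a, b):
    # linear map sending rmin -> a and rmax -> b
    return a + (b - a) * (reward - rmin) / (rmax - rmin)

# e.g. rescale_reward_sketch(0.5, 0.0, 1.0, -10, 10) == 0.0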
def test_bellman_operator_monotonicity_and_contraction(gamma, S, A):
    rng = seeding.get_rng()
    vmax = 1.0 / (1.0 - gamma)
    for _ in range(10):
        # generate random MDP
        R, P = get_random_mdp(S, A)

        # generate random Q functions
        Q0 = rng.uniform(-vmax, vmax, (S, A))
        Q1 = rng.uniform(-vmax, vmax, (S, A))

        # apply Bellman operator
        TQ0 = bellman_operator(Q0, R, P, gamma)
        TQ1 = bellman_operator(Q1, R, P, gamma)

        # test contraction: ||TQ1 - TQ0||_inf <= gamma * ||Q1 - Q0||_inf
        norm_tq = np.abs(TQ1 - TQ0).max()
        norm_q = np.abs(Q1 - Q0).max()
        assert norm_tq <= gamma * norm_q

        # test monotonicity: Q2 <= Q3 elementwise must imply TQ2 <= TQ3
        Q2 = rng.uniform(-vmax / 2, vmax / 2, (S, A))
        Q3 = Q2 + rng.uniform(0.0, vmax / 2, (S, A))
        TQ2 = bellman_operator(Q2, R, P, gamma)
        TQ3 = bellman_operator(Q3, R, P, gamma)
        assert np.greater(TQ2, TQ3).sum() == 0
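
# A minimal sketch of the optimal Bellman operator assumed by the test above
# (illustrative, not rlberry's implementation):
# (TQ)(s, a) = R(s, a) + gamma * sum_s' P(s, a, s') * max_a' Q(s', a').
# Both the gamma-contraction in sup norm and the monotonicity checked above
# follow from this form.
import numpy as np

def bellman_operator_sketch(Q, R, P, gamma):
    # Q.max(axis=1) is the greedy value of each next state; the einsum
    # takes its expectation under P, which has shape (S, A, S).
    return R + gamma * np.einsum('sat,t->sa', P, Q.max(axis=1))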
def test_random_numbers():
    # draw from three generators, seeded with 43, 44 and 44 again
    seed = 43
    seeding.set_global_seed(seed)
    rng1 = seeding.get_rng()
    data1 = rng1.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng2 = seeding.get_rng()
    data2 = rng2.integers(100, size=1000)

    seed = 44
    seeding.set_global_seed(seed)
    rng3 = seeding.get_rng()
    data3 = rng3.integers(100, size=1000)

    # different seeds must give different streams;
    # re-setting the same seed must reproduce the stream exactly
    assert (data1 != data2).sum() > 5
    assert (data2 != data3).sum() == 0
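
# The contract tested above, sketched with numpy directly (an analogy for
# illustration, not rlberry internals): equal seeds yield identical streams.
import numpy as np

assert np.array_equal(
    np.random.default_rng(44).integers(100, size=10),
    np.random.default_rng(44).integers(100, size=10))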
def reseed(self):
    self.rng = seeding.get_rng()
    # seed gym.Env that is not a rlberry Model
    if not isinstance(self.env, Model):
        # get a seed for gym environment
        seeding.safe_reseed(self.env)
        seeding.safe_reseed(self.observation_space)
        seeding.safe_reseed(self.action_space)
    # seed rlberry Model
    else:
        self.env.reseed()
        self.observation_space.rng = self.env.rng
        self.action_space.rng = self.env.rng
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    #
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 1

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    #
    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2
def test_seeding():
    seed = 123
    seeding.set_global_seed(seed)

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    #
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 2  # counting the global rng generated automatically

    # check that reimports do not cause problems
    import rlberry
    import rlberry.seeding
    assert seeding._GLOBAL_SEED_SEQ.entropy == seed
    #
    _ = seeding.get_rng()
    assert seeding._GLOBAL_SEED_SEQ.n_children_spawned == 3
def __init__(self, env,
             n_episodes=1000,
             horizon=100,
             gamma=0.99,
             batch_size=16,
             percentile=70,
             learning_rate=0.01,
             optimizer_type='ADAM',
             policy_net_fn=None,
             **kwargs):
    Agent.__init__(self, env, **kwargs)

    # check environment
    assert isinstance(self.env.observation_space, spaces.Box)
    assert isinstance(self.env.action_space, spaces.Discrete)

    # parameters
    self.gamma = gamma
    self.batch_size = batch_size
    self.n_episodes = n_episodes
    self.percentile = percentile
    self.learning_rate = learning_rate
    self.horizon = horizon

    # random number generator
    self.rng = seeding.get_rng()

    #
    self.policy_net_fn = policy_net_fn \
        or (lambda: default_policy_net_fn(self.env))

    self.optimizer_kwargs = {'optimizer_type': optimizer_type,
                             'lr': learning_rate}

    # policy net
    self.policy_net = self.policy_net_fn().to(device)

    # loss function and optimizer
    self.loss_fn = nn.CrossEntropyLoss()
    self.optimizer = optimizer_factory(
        self.policy_net.parameters(),
        **self.optimizer_kwargs)

    # memory
    self.memory = CEMMemory(self.batch_size)

    # default writer
    self.writer = PeriodicWriter(self.name,
                                 log_every=5 * logger.getEffectiveLevel())
def reseed(self):
    self.rng = seeding.get_rng()
    # seed gym.Env that is not a rlberry Model
    if isinstance(self.env, gym.Env) \
            and not isinstance(self.env, Model):
        # get a seed for gym environment
        seed = self.rng.integers(2**16).item()
        self.env.seed(seed)
        self.observation_space.seed(seed)
        self.action_space.seed(seed)
    # seed rlberry Model
    else:
        self.env.reseed()
        self.observation_space.rng = self.env.rng
        self.action_space.rng = self.env.rng
def test_mbqvi(S, A):
    rng = seeding.get_rng()

    for sim in range(5):
        # generate random MDP with deterministic transitions
        R = rng.uniform(0.0, 1.0, (S, A))
        P = np.zeros((S, A, S))
        for ss in range(S):
            for aa in range(A):
                ns = rng.integers(0, S)
                P[ss, aa, ns] = 1

        # run MBQVI and check exactness of estimators: since transitions
        # are deterministic and rewards depend only on (s, a), a single
        # sample per state-action pair identifies the true model, so
        # n_samples=1 must recover R and P exactly.
        env = FiniteMDP(R, P)
        agent = MBQVIAgent(env, n_samples=1)
        agent.fit()
        assert np.abs(R - agent.R_hat).max() < 1e-16
        assert np.abs(P - agent.P_hat).max() < 1e-16
def __init__(self, env, policy,
             learning_rate=7e-4,
             n_steps: int = 5,
             gamma: float = 0.99,
             gae_lambda: float = 1.0,
             ent_coef: float = 0.0,
             vf_coef: float = 0.5,
             max_grad_norm: float = 0.5,
             rms_prop_eps: float = 1e-5,
             use_rms_prop: bool = True,
             use_sde: bool = False,
             sde_sample_freq: int = -1,
             normalize_advantage: bool = False,
             tensorboard_log=None,
             create_eval_env=False,
             policy_kwargs=None,
             verbose: int = 0,
             seed=None,
             device="auto",
             _init_setup_model: bool = True,
             **kwargs):
    # Generate seed for A2CStableBaselines using rlberry seeding
    self.rng = seeding.get_rng()
    seed = self.rng.integers(2**32).item()

    # init stable baselines class
    self.wrapped = A2CStableBaselines(
        policy,
        env,
        learning_rate,
        n_steps,
        gamma,
        gae_lambda,
        ent_coef,
        vf_coef,
        max_grad_norm,
        rms_prop_eps,
        use_rms_prop,
        use_sde,
        sde_sample_freq,
        normalize_advantage,
        tensorboard_log,
        create_eval_env,
        policy_kwargs,
        verbose,
        seed,
        device,
        _init_setup_model)

    # init rlberry base class
    Agent.__init__(self, env, **kwargs)
def test_mock_args(monkeypatch):
    monkeypatch.setattr(
        "sys.argv",
        ['', 'rlberry/experiment/tests/params_experiment.yaml'])

    random_numbers = []
    for agent_stats in experiment_generator():
        rng = sd.get_rng()
        random_numbers.append(rng.uniform(size=10))

        assert agent_stats.agent_class is RSUCBVIAgent
        assert agent_stats.init_kwargs['n_episodes'] == 100
        assert agent_stats.init_kwargs['horizon'] == 50
        assert agent_stats.init_kwargs['lp_metric'] == 2
        assert agent_stats.init_kwargs['min_dist'] == 0.0
        assert agent_stats.init_kwargs['max_repr'] == 800
        assert agent_stats.init_kwargs['bonus_scale_factor'] == 1.0
        assert agent_stats.init_kwargs['reward_free'] is True
        assert agent_stats.eval_horizon == 51

        train_env = agent_stats.train_env[0](**agent_stats.train_env[1])
        assert train_env.reward_free is False
        assert train_env.array_observation is True

        if agent_stats.agent_name == 'rsucbvi':
            assert agent_stats.init_kwargs['gamma'] == 1.0
        elif agent_stats.agent_name == 'rsucbvi_alternative':
            assert agent_stats.init_kwargs['gamma'] == 0.9
        else:
            raise ValueError()

    # check that seeding is the same for each AgentStats instance
    for ii in range(1, len(random_numbers)):
        assert np.array_equal(random_numbers[ii - 1], random_numbers[ii])
def __init__(self,
             agent_class,
             train_env,
             eval_env=None,
             eval_horizon=None,
             init_kwargs=None,
             fit_kwargs=None,
             policy_kwargs=None,
             agent_name=None,
             n_fit=4,
             n_jobs=4,
             output_dir='stats_data'):
    # agent_class should only be None when the constructor is called
    # by the class method AgentStats.load(), since the agent class
    # will be loaded.
    if agent_class is not None:
        self.agent_name = agent_name
        if agent_name is None:
            self.agent_name = agent_class.name

        # create object identifier
        timestamp = datetime.timestamp(datetime.now())
        self.identifier = 'stats_{}_{}'.format(self.agent_name,
                                               str(int(timestamp)))

        self.fit_info = agent_class.fit_info
        self.agent_class = agent_class
        self.train_env = train_env
        if eval_env is None:
            self.eval_env = deepcopy(train_env)
            self.eval_env.reseed()
        else:
            self.eval_env = deepcopy(eval_env)
            self.eval_env.reseed()
        self.eval_horizon = eval_horizon
        # init and fit kwargs are deep copied in fit()
        self.init_kwargs = deepcopy(init_kwargs)
        self.fit_kwargs = fit_kwargs
        self.policy_kwargs = deepcopy(policy_kwargs)
        self.n_fit = n_fit
        self.n_jobs = n_jobs
        self.output_dir = output_dir

        if init_kwargs is None:
            self.init_kwargs = {}
        if fit_kwargs is None:
            self.fit_kwargs = {}
        if policy_kwargs is None:
            self.policy_kwargs = {}

        # Create environment copies for training
        self.train_env_set = []
        for _ in range(n_fit):
            _env = deepcopy(train_env)
            _env.reseed()
            self.train_env_set.append(_env)

        #
        self.fitted_agents = None
        self.fit_kwargs_list = None  # keep in memory for partial_fit()
        self.fit_statistics = {}

        #
        self.rng = seeding.get_rng()

        # optuna study
        self.study = None

        # default filename to save data
        self.default_filename = os.path.join(self.output_dir,
                                             self.identifier)
def compare_policies(agent_stats_list,
                     eval_env=None,
                     eval_horizon=None,
                     stationary_policy=True,
                     n_sim=10,
                     fignum=None,
                     show=True,
                     plot=True,
                     **kwargs):
    """
    Compare the policies of each of the agents in agent_stats_list.
    Each element of agent_stats_list contains a list of fitted agents.
    To evaluate a policy, we repeat n_sim times:
        * choose one of the fitted agents uniformly at random
        * run its policy in eval_env for eval_horizon time steps

    Parameters
    ----------
    agent_stats_list : list of AgentStats objects.
    eval_env : Model
        Environment where to evaluate the policies.
        If None, it is taken from AgentStats.
    eval_horizon : int
        Number of time steps for policy evaluation.
        If None, it is taken from AgentStats.
    stationary_policy : bool
        If False, the time step h (0 <= h <= eval_horizon) is sent as
        argument to agent.policy() for policy evaluation.
    n_sim : int
        Number of simulations to evaluate each policy.
    fignum : str or int
        Identifier of plot figure.
    show : bool
        If True, calls plt.show().
    plot : bool
        If False, do not plot.
    kwargs :
        Extra parameters for sns.boxplot.
    """
    #
    # evaluation
    #
    use_eval_from_agent_stats = (eval_env is None)
    use_horizon_from_agent_stats = (eval_horizon is None)

    rng = seeding.get_rng()
    agents_rewards = []
    for agent_stats in agent_stats_list:
        # train agents if they are not already trained
        if agent_stats.fitted_agents is None:
            agent_stats.fit()

        # eval env and horizon
        if use_eval_from_agent_stats:
            eval_env = agent_stats.eval_env
            assert eval_env is not None, \
                "eval_env not in AgentStats %s" % agent_stats.agent_name
        if use_horizon_from_agent_stats:
            eval_horizon = agent_stats.eval_horizon
            assert eval_horizon is not None, \
                "eval_horizon not in AgentStats %s" % agent_stats.agent_name

        # evaluate agent
        episode_rewards = np.zeros(n_sim)
        for sim in range(n_sim):
            # choose one of the fitted agents randomly
            agent_idx = rng.integers(len(agent_stats.fitted_agents))
            agent = agent_stats.fitted_agents[agent_idx]
            # evaluate agent
            observation = eval_env.reset()
            for hh in range(eval_horizon):
                if stationary_policy:
                    action = agent.policy(observation,
                                          **agent_stats.policy_kwargs)
                else:
                    action = agent.policy(observation, hh,
                                          **agent_stats.policy_kwargs)
                observation, reward, done, _ = eval_env.step(action)
                episode_rewards[sim] += reward
                if done:
                    break
        # store rewards
        agents_rewards.append(episode_rewards)

    #
    # plot
    #

    # build unique agent IDs (in case there are two agents with the same ID)
    unique_ids = []
    id_count = {}
    for agent_stats in agent_stats_list:
        name = agent_stats.agent_name
        if name not in id_count:
            id_count[name] = 1
        else:
            id_count[name] += 1
        unique_ids.append(name + "*" * (id_count[name] - 1))

    # convert output to DataFrame
    data = {}
    for agent_id, agent_rewards in zip(unique_ids, agents_rewards):
        data[agent_id] = agent_rewards
    output = pd.DataFrame(data)

    # plot
    if plot:
        plt.figure(fignum)
        with sns.axes_style("whitegrid"):
            ax = sns.boxplot(data=output, **kwargs)
            ax.set_xlabel("agent")
            ax.set_ylabel("rewards in one episode")
            plt.title("Environment = %s"
                      % getattr(eval_env.unwrapped, "name",
                                eval_env.unwrapped.__class__.__name__))
            if show:
                plt.show()

    return output
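
# Hypothetical usage of compare_policies (stats_a and stats_b are
# placeholders for fitted AgentStats instances): the returned pandas
# DataFrame holds one column of per-episode rewards per agent.
#
# output = compare_policies([stats_a, stats_b], n_sim=20, show=False)
# print(output.describe())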
def mc_policy_evaluation(agent,
                         eval_env,
                         eval_horizon=10**5,
                         n_sim=10,
                         gamma=1.0,
                         policy_kwargs=None,
                         stationary_policy=True):
    """
    Monte-Carlo policy evaluation [1]_ of an agent to estimate the value
    at the initial state.

    If a list of agents is provided as input, for each evaluation, one of
    the agents is sampled uniformly at random.

    Parameters
    ----------
    agent : Agent or list of agents.
        Trained agent(s).
    eval_env : Env
        Evaluation environment.
    eval_horizon : int, default: 10**5
        Horizon, maximum episode length.
    n_sim : int, default: 10
        Number of Monte Carlo simulations.
    gamma : double, default: 1.0
        Discount factor.
    policy_kwargs : dict or None
        Optional kwargs for agent.policy() method.
    stationary_policy : bool, default: True
        If False, the time step h (0 <= h <= eval_horizon) is sent as
        argument to agent.policy() for policy evaluation.

    Returns
    -------
    Numpy array of shape (n_sim, ) containing the sum of discounted
    rewards in each simulation.

    References
    ----------
    .. [1] http://incompleteideas.net/book/first/ebook/node50.html
    """
    rng = seeding.get_rng()

    if not isinstance(agent, list):
        agents = [agent]
    else:
        agents = agent

    policy_kwargs = policy_kwargs or {}

    episode_rewards = np.zeros(n_sim)
    for sim in range(n_sim):
        idx = rng.integers(len(agents))
        observation = eval_env.reset()
        for hh in range(eval_horizon):
            if stationary_policy:
                action = agents[idx].policy(observation, **policy_kwargs)
            else:
                action = agents[idx].policy(observation, hh, **policy_kwargs)
            observation, reward, done, _ = eval_env.step(action)
            episode_rewards[sim] += reward * np.power(gamma, hh)
            if done:
                break

    return episode_rewards
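
# A self-contained check of the discounted-return accumulation used in
# mc_policy_evaluation above (the helper name is illustrative):
# value = sum_h gamma^h * r_h.
def discounted_return_sketch(rewards, gamma):
    return sum(r * gamma ** h for h, r in enumerate(rewards))

assert discounted_return_sketch([1.0, 1.0, 1.0], 0.5) == 1.75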
import numpy as np
import pytest

import rlberry.seeding as seeding
from rlberry.agents.dynprog import ValueIterationAgent
from rlberry.agents.dynprog.utils import backward_induction
from rlberry.agents.dynprog.utils import backward_induction_in_place
from rlberry.agents.dynprog.utils import backward_induction_sd
from rlberry.agents.dynprog.utils import bellman_operator
from rlberry.agents.dynprog.utils import value_iteration
from rlberry.envs.finite import FiniteMDP

_rng = seeding.get_rng()


def get_random_mdp(S, A):
    R = _rng.uniform(0.0, 1.0, (S, A))
    P = _rng.uniform(0.0, 1.0, (S, A, S))
    for ss in range(S):
        for aa in range(A):
            P[ss, aa, :] /= P[ss, aa, :].sum()
    return R, P


@pytest.mark.parametrize("gamma, S, A",
                         [(0.001, 2, 1),
                          (0.25, 2, 1),
                          (0.5, 2, 1),
                          (0.75, 2, 1),
                          (0.999, 2, 1),
                          (0.001, 4, 2),
                          (0.25, 4, 2),
                          (0.5, 4, 2),
                          (0.75, 4, 2),
                          (0.999, 4, 2),
                          (0.001, 20, 4),
                          (0.25, 20, 4),
                          (0.5, 20, 4),
                          (0.75, 20, 4),