def test_polynomial_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
    expected = [
        0.5 + (2.0 - 0.5) * (1.0 - min(t, 100) / 100)**2 for t in ts
    ]
    config = dict(
        type="ray.rllib.utils.schedules.polynomial_schedule."
        "PolynomialSchedule",
        schedule_timesteps=100,
        initial_p=2.0,
        final_p=0.5,
        power=2.0,
    )

    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        polynomial = from_config(config, framework=fw)
        for t, e in zip(ts, expected):
            out = polynomial(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = polynomial(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)

def test_pg_fake_multi_gpu_learning(self):
    """Test whether PGTrainer can learn CartPole w/ faked multi-GPU."""
    config = copy.deepcopy(pg.DEFAULT_CONFIG)
    # Fake GPU setup.
    config["num_gpus"] = 2
    config["_fake_gpus"] = True
    # Mimic tuned_example for PG CartPole.
    config["model"]["fcnet_hiddens"] = [64]
    config["model"]["fcnet_activation"] = "linear"

    for _ in framework_iterator(config, frameworks=("tf", "torch")):
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        num_iterations = 300
        learnt = False
        for i in range(num_iterations):
            results = trainer.train()
            print("reward={}".format(results["episode_reward_mean"]))
            # Make this test quite short: stop at 65.0, well below
            # CartPole-v0's official "solved" threshold of 195.0.
            if results["episode_reward_mean"] > 65.0:
                learnt = True
                break
        assert learnt, \
            "PG multi-GPU (with fake-GPUs) did not learn CartPole!"
        trainer.stop()

def do_test_explorations(
        run, env, config, dummy_obs, prev_a=None, expected_mean_action=None):
    """Calls an Agent's `compute_single_action` with different `explore` options."""
    core_config = config.copy()
    if run not in [a3c.A3CTrainer]:
        core_config["num_workers"] = 0

    # Test all frameworks.
    for _ in framework_iterator(core_config):
        print("Agent={}".format(run))

        # Test for both the default Agent's exploration AND the `Random`
        # exploration class.
        for exploration in [None, "Random"]:
            local_config = core_config.copy()
            if exploration == "Random":
                # TODO(sven): Random doesn't work for IMPALA yet.
                if run is impala.ImpalaTrainer:
                    continue
                local_config["exploration_config"] = {"type": "Random"}
            print("exploration={}".format(exploration or "default"))

            trainer = run(config=local_config, env=env)

            # Make sure all actions drawn are the same, given same
            # observations.
            actions = []
            for _ in range(25):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=False,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    ))
                check(actions[-1], actions[0])

            # Make sure actions drawn are different
            # (around some mean value), given constant observations.
            actions = []
            for _ in range(500):
                actions.append(
                    trainer.compute_single_action(
                        observation=dummy_obs,
                        explore=True,
                        prev_action=prev_a,
                        prev_reward=1.0 if prev_a is not None else None,
                    ))
            check(
                np.mean(actions),
                expected_mean_action
                if expected_mean_action is not None else 0.5,
                atol=0.4,
            )
            # Check that the stddev is not 0.0 (values differ).
            check(np.std(actions), 0.0, false=True)

def test_linear_schedule(self):
    ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
    config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        linear = from_config(LinearSchedule, config, framework=fw_)
        for t in ts:
            out = linear(t)
            check(out, 2.1 - (min(t, 100) / 100) * (2.1 - 0.6), decimals=4)

def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with all frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    num_iterations = 2
    for _ in framework_iterator(config):
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            trainer.train()

def test_exponential_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    config = dict(initial_p=2.0, decay_rate=0.99, schedule_timesteps=100)
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        exponential = from_config(
            ExponentialSchedule, config, framework=fw_)
        for t in ts:
            out = exponential(t)
            check(out, 2.0 * 0.99**(t / 100), decimals=4)

def test_constant_schedule(self):
    value = 2.3
    ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]
    config = {"value": value}
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        constant = from_config(ConstantSchedule, config, framework=fw_)
        for t in ts:
            out = constant(t)
            check(out, value)

def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with all frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    num_iterations = 2
    for _ in framework_iterator(config):
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            print(trainer.train())
        check_compute_single_action(
            trainer, include_prev_action_reward=True)

def test_piecewise_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
    expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
    config = dict(
        endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
        outside_value=14.5)
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        piecewise = from_config(PiecewiseSchedule, config, framework=fw_)
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)

def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with all frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 1
    config["rollout_fragment_length"] = 500
    num_iterations = 1
    for _ in framework_iterator(config):
        for env in ["FrozenLake-v0", "CartPole-v0"]:
            trainer = pg.PGTrainer(config=config, env=env)
            for i in range(num_iterations):
                print(trainer.train())
            check_compute_single_action(
                trainer, include_prev_action_reward=True)

def test_pg_compilation(self):
    """Test whether a PGTrainer can be built with all frameworks."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0
    num_iterations = 2
    for fw in framework_iterator(config):
        # For tf, build with fake-GPUs.
        config["_fake_gpus"] = fw == "tf"
        config["num_gpus"] = 2 if fw == "tf" else 0
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        for i in range(num_iterations):
            print(trainer.train())
        check_compute_single_action(
            trainer, include_prev_action_reward=True)

def test_polynomial_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23, 1000]
    config = dict(
        type="ray.rllib.utils.schedules.polynomial_schedule."
        "PolynomialSchedule",
        schedule_timesteps=100,
        initial_p=2.0,
        final_p=0.5,
        power=2.0)
    for fw in framework_iterator(frameworks=["tf", "tfe", "torch", None]):
        fw_ = fw if fw != "tfe" else "tf"
        polynomial = from_config(config, framework=fw_)
        for t in ts:
            out = polynomial(t)
            t = min(t, 100)
            check(out, 0.5 + (2.0 - 0.5) * (1.0 - t / 100)**2, decimals=4)

def test_linear_schedule(self):
    ts = [0, 50, 10, 100, 90, 2, 1, 99, 23, 1000]
    expected = [2.1 - (min(t, 100) / 100) * (2.1 - 0.6) for t in ts]
    config = {"schedule_timesteps": 100, "initial_p": 2.1, "final_p": 0.6}

    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        linear = from_config(LinearSchedule, config, framework=fw)
        for t, e in zip(ts, expected):
            out = linear(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = linear(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)

def test_piecewise_schedule(self):
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 27]
    expected = [50.0, 60.0, 70.0, 14.5, 14.5, 54.0, 52.0, 14.5, 140.0]
    config = dict(
        endpoints=[(0, 50.0), (25, 100.0), (30, 200.0)],
        outside_value=14.5)

    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        piecewise = from_config(PiecewiseSchedule, config, framework=fw)
        for t, e in zip(ts, expected):
            out = piecewise(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = piecewise(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)

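# For reference: the hardcoded `expected` values in the piecewise tests above
# follow from linear interpolation between the configured `endpoints`, with
# `outside_value` returned for any t outside the defined range. The helper
# below is only a hypothetical illustration of that rule (it is not part of
# RLlib and not used by the tests).
def _piecewise_expected(t, endpoints, outside_value):
    for (t0, v0), (t1, v1) in zip(endpoints[:-1], endpoints[1:]):
        if t0 <= t < t1:
            # Linearly interpolate within the segment [t0, t1).
            return v0 + (v1 - v0) * (t - t0) / (t1 - t0)
    # t lies outside all segments -> constant fallback value.
    return outside_value

# E.g. _piecewise_expected(27, [(0, 50.0), (25, 100.0), (30, 200.0)], 14.5)
# -> 100.0 + (200.0 - 100.0) * (27 - 25) / (30 - 25) = 140.0
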
def test_constant_schedule(self):
    value = 2.3
    ts = [100, 0, 10, 2, 3, 4, 99, 56, 10000, 23, 234, 56]
    config = {"value": value}

    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        constant = from_config(ConstantSchedule, config, framework=fw)
        for t in ts:
            out = constant(t)
            check(out, value)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t in ts_as_tensors:
            out = constant(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, value, decimals=4)

def test_exponential_schedule(self):
    decay_rate = 0.2
    ts = [0, 5, 10, 100, 90, 2, 1, 99, 23]
    expected = [2.0 * decay_rate**(t / 100) for t in ts]
    config = dict(
        initial_p=2.0, decay_rate=decay_rate, schedule_timesteps=100)

    for fw in framework_iterator(
            frameworks=["tf2", "tf", "tfe", "torch", None]):
        exponential = from_config(
            ExponentialSchedule, config, framework=fw)
        for t, e in zip(ts, expected):
            out = exponential(t)
            check(out, e, decimals=4)

        ts_as_tensors = self._get_framework_tensors(ts, fw)
        for t, e in zip(ts_as_tensors, expected):
            out = exponential(t)
            assert fw != "tf" or isinstance(out, tf.Tensor)
            check(out, e, decimals=4)

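# Note: the schedule tests above rely on a `self._get_framework_tensors(ts, fw)`
# helper that is not included in this excerpt. The sketch below is only an
# assumption of what such a helper could look like (wrapping the python ints
# into framework-native tensors); it presumes `tf` and `torch` are importable
# and is not the actual RLlib test helper.
def _get_framework_tensors(self, ts, fw):
    if fw == "torch":
        # 0-dim int32 torch tensors, one per timestep.
        return [torch.tensor(t, dtype=torch.int32) for t in ts]
    elif fw in ["tf", "tf2", "tfe"]:
        # Constant int32 TF tensors (graph or eager mode).
        return [tf.constant(t, dtype=tf.int32) for t in ts]
    # framework=None: keep plain python ints.
    return ts
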
def test_pg_loss_functions(self):
    """Tests the PG loss function math."""
    config = pg.DEFAULT_CONFIG.copy()
    config["num_workers"] = 0  # Run locally.
    config["gamma"] = 0.99
    config["model"]["fcnet_hiddens"] = [10]
    config["model"]["fcnet_activation"] = "linear"

    # Fake CartPole episode of n time steps.
    train_batch = SampleBatch({
        SampleBatch.OBS: np.array([[0.1, 0.2, 0.3, 0.4],
                                   [0.5, 0.6, 0.7, 0.8],
                                   [0.9, 1.0, 1.1, 1.2]]),
        SampleBatch.ACTIONS: np.array([0, 1, 1]),
        SampleBatch.REWARDS: np.array([1.0, 1.0, 1.0]),
        SampleBatch.DONES: np.array([False, False, True]),
        SampleBatch.EPS_ID: np.array([1234, 1234, 1234]),
        SampleBatch.AGENT_INDEX: np.array([0, 0, 0]),
    })

    for fw, sess in framework_iterator(config, session=True):
        dist_cls = Categorical if fw != "torch" else TorchCategorical
        trainer = pg.PGTrainer(config=config, env="CartPole-v0")
        policy = trainer.get_policy()
        vars = policy.model.trainable_variables()
        if sess:
            vars = policy.get_session().run(vars)

        # Post-process (calculate simple (non-GAE) advantages) and attach
        # to train_batch dict.
        # A = [0.99^2 * 1.0 + 0.99 * 1.0 + 1.0, 0.99 * 1.0 + 1.0, 1.0] =
        # [2.9701, 1.99, 1.0]
        train_batch_ = pg.post_process_advantages(policy, train_batch.copy())
        if fw == "torch":
            train_batch_ = policy._lazy_tensor_dict(train_batch_)

        # Check Advantage values.
        check(train_batch_[Postprocessing.ADVANTAGES], [2.9701, 1.99, 1.0])

        # Actual loss results.
        if sess:
            results = policy.get_session().run(
                policy._loss,
                feed_dict=policy._get_loss_inputs_dict(
                    train_batch_, shuffle=False))
        else:
            results = (pg.pg_tf_loss
                       if fw in ["tf2", "tfe"] else pg.pg_torch_loss)(
                           policy,
                           policy.model,
                           dist_class=dist_cls,
                           train_batch=train_batch_)

        # Calculate expected results.
        if fw != "torch":
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[0],
                   vars[1],
                   framework=fw),
                vars[2],
                vars[3],
                framework=fw)
        else:
            expected_logits = fc(
                fc(train_batch_[SampleBatch.OBS],
                   vars[2],
                   vars[3],
                   framework=fw),
                vars[0],
                vars[1],
                framework=fw)
        expected_logp = dist_cls(expected_logits, policy.model).logp(
            train_batch_[SampleBatch.ACTIONS])
        adv = train_batch_[Postprocessing.ADVANTAGES]
        if sess:
            expected_logp = sess.run(expected_logp)
        elif fw == "torch":
            expected_logp = expected_logp.detach().cpu().numpy()
            adv = adv.detach().cpu().numpy()
        else:
            expected_logp = expected_logp.numpy()
        expected_loss = -np.mean(expected_logp * adv)
        check(results, expected_loss, decimals=4)