def test_dqn_compilation(self):
    """Test whether DQN can be built on all frameworks.

    For every framework yielded by `framework_iterator`, two DQN variants
    are built on "CartPole-v0" from deep copies of the same base config:
    the unmodified config and a "Rainbow" config (`num_atoms=10`, noisy,
    double-Q, dueling, `n_step=5`). Each variant is trained for
    `num_iterations` iterations, validated with `check_train_results` and
    `check_compute_single_action`, and then stopped.
    """
    num_iterations = 1
    config = dqn.dqn.DQNConfig().rollouts(num_rollout_workers=2)

    def build_train_and_check(label, variant_config):
        """Build a DQN from `variant_config`, train it and validate it."""
        print(label)
        trainer = dqn.DQN(config=variant_config, env="CartPole-v0")
        try:
            for _ in range(num_iterations):
                results = trainer.train()
                check_train_results(results)
                print(results)
            check_compute_single_action(trainer)
        finally:
            # Always release the trainer, even when a check above fails,
            # so a failure here cannot leak it into later tests.
            trainer.stop()

    for _ in framework_iterator(config, with_eager_tracing=True):
        # Double-dueling DQN.
        build_train_and_check("Double-dueling", deepcopy(config))

        # Rainbow.
        build_train_and_check(
            "Rainbow",
            deepcopy(config).training(
                num_atoms=10, noisy=True, double_q=True, dueling=True, n_step=5
            ),
        )
def test_on_sub_environment_created(self):
    """Check the per-worker sub-environment vector-index sums.

    Runs once with `OnSubEnvironmentCreatedCallback` directly and once
    wrapped in `MultiCallbacks`, for the "tf" and "torch" frameworks. Each
    run builds a DQN with 2 workers and 4 sub-environments per worker, then
    reads `sum_sub_env_vector_indices` from every worker and compares it
    against the expected values.
    """
    base_config = {
        "env": "CartPole-v1",
        # Create 4 sub-environments per remote worker.
        "num_envs_per_worker": 4,
        # Create 2 remote workers.
        "num_workers": 2,
    }

    for callbacks in (
        OnSubEnvironmentCreatedCallback,
        MultiCallbacks([OnSubEnvironmentCreatedCallback]),
    ):
        config = dict(base_config, callbacks=callbacks)

        for _ in framework_iterator(config, frameworks=("tf", "torch")):
            trainer = dqn.DQN(config=config)
            try:
                # Fake the counter on the local worker (doesn't have an env)
                # and set it to -1 so the below `foreach_worker()` won't fail.
                trainer.workers.local_worker().sum_sub_env_vector_indices = -1

                # Get sub-env vector index sums from the 2 remote workers:
                sum_sub_env_vector_indices = trainer.workers.foreach_worker(
                    lambda w: w.sum_sub_env_vector_indices
                )

                # Local worker has no environments -> Expect the -1 special
                # value returned by the above lambda.
                self.assertEqual(sum_sub_env_vector_indices[0], -1)
                # Both remote workers (index 1 and 2) have a vector index
                # counter of 6 (sum of vector indices: 0 + 1 + 2 + 3).
                self.assertEqual(sum_sub_env_vector_indices[1], 6)
                self.assertEqual(sum_sub_env_vector_indices[2], 6)
            finally:
                # Stop the trainer even when an assertion above fails.
                trainer.stop()
def test_leaky_policy(self):
    """Tests, whether our diagnostics tools can detect leaks in a policy.

    Builds a DQN on "CartPole-v0" whose "default_policy" uses
    `MemoryLeakingPolicy`, runs `check_memory_leaks` restricted to the
    policy, and asserts that the "policy" entry of the result is truthy.
    """
    config = dqn.DEFAULT_CONFIG.copy()
    # Make sure we have an env to test on the local worker.
    # Otherwise, `check_memory_leaks` will complain.
    config["create_env_on_driver"] = True
    config["env"] = "CartPole-v0"
    # Replace the nested "multiagent" mapping with a fresh dict instead of
    # writing into it: if `.copy()` above is shallow (as `dict.copy` is),
    # the nested mapping is still shared with `dqn.DEFAULT_CONFIG`, and an
    # in-place write would leak the leaking policy into the module default.
    config["multiagent"] = dict(
        config["multiagent"],
        policies={
            "default_policy": PolicySpec(policy_class=MemoryLeakingPolicy),
        },
    )

    trainer = dqn.DQN(config=config)
    try:
        results = check_memory_leaks(trainer, to_check={"policy"}, repeats=300)
        assert results["policy"]
    finally:
        # Stop the trainer even when the check or assertion fails.
        trainer.stop()
def train_rllib_policy(config):
    """Build a DQN from `config`, train it, and save it.

    The number of training iterations is read from `args.train_iters`
    (`args` is not a parameter; it is resolved from the enclosing scope).
    The result of every iteration is printed.

    Args:
        config: Passed unchanged to `dqn.DQN(config=...)`.

    Returns:
        The value returned by the trainer's `save()` call, i.e. the saved
        checkpoint to restore the trainer from.
    """
    algo = dqn.DQN(config=config)

    # One `train()` call per requested iteration; print each result.
    for _ in range(args.train_iters):
        iteration_result = algo.train()
        print(iteration_result)

    checkpoint = algo.save()
    return checkpoint
def test_traj_view_normal_case(self):
    """Tests, whether Model and Policy return the correct ViewRequirements.

    For each framework yielded by `framework_iterator`, a DQN is built on
    the `DebugCounterEnv`, the view requirements of its policy and model
    are checked, and one batch sampled from the local worker is checked to
    contain `num_envs_per_worker * rollout_fragment_length` entries in
    every column.
    """
    config = dqn.DEFAULT_CONFIG.copy()
    config["num_envs_per_worker"] = 10
    config["rollout_fragment_length"] = 4

    for fw in framework_iterator(config):
        algo = dqn.DQN(
            config,
            env="ray.rllib.examples.env.debug_counter_env.DebugCounterEnv",
        )
        try:
            policy = algo.get_policy()
            view_req_model = policy.model.view_requirements
            view_req_policy = policy.view_requirements
            print(fw)
            print(view_req_policy)
            print(view_req_model)
            assert len(view_req_model) == 1, view_req_model
            assert len(view_req_policy) == 11, view_req_policy

            for key in [
                SampleBatch.OBS,
                SampleBatch.ACTIONS,
                SampleBatch.REWARDS,
                SampleBatch.DONES,
                SampleBatch.NEXT_OBS,
                SampleBatch.EPS_ID,
                SampleBatch.AGENT_INDEX,
                "weights",
            ]:
                assert key in view_req_policy
                # None of the view cols has a special underlying data_col,
                # except next-obs.
                if key != SampleBatch.NEXT_OBS:
                    assert view_req_policy[key].data_col is None
                else:
                    assert view_req_policy[key].data_col == SampleBatch.OBS
                    assert view_req_policy[key].shift == 1

            rollout_worker = algo.workers.local_worker()
            sample_batch = rollout_worker.sample()
            expected_count = (
                config["num_envs_per_worker"] * config["rollout_fragment_length"]
            )
            assert sample_batch.count == expected_count
            for v in sample_batch.values():
                assert len(v) == expected_count
        finally:
            # Stop the algorithm even when an assertion above fails.
            algo.stop()
def __init__(self, config, checkpoint_path):
    """Create a DQN trainer and restore its state from a checkpoint.

    Args:
        config: Passed unchanged to `dqn.DQN(config=...)`.
        checkpoint_path: Passed to the trainer's `restore()` call.
    """
    # Build the trainer and keep it on the instance ...
    self.trainer = trainer = dqn.DQN(config=config)
    # ... then load the previously saved state into it.
    trainer.restore(checkpoint_path)
def main():
    """Run a DQN or SlateQ experiment on a RecSim env.

    Steps, all driven by the parsed command-line arguments:
      1. Initialize Ray.
      2. Assemble the env config and the algorithm config.
      3. Optionally roll out random-action episodes and print their mean
         return and standard error as a baseline.
      4. Either launch `tune.run` (optionally checking that the stop reward
         was reached), or build the trainer directly and run 10 training
         iterations, printing each result.
      5. Shut Ray down.
    """
    args = parser.parse_args()
    ray.init(num_cpus=args.num_cpus or None, local_mode=args.local_mode)

    # Resolve the env class from its command-line name; any other value
    # falls through to the long-term-satisfaction env.
    if args.env == "interest-evolution":
        env_cls = InterestEvolutionRecSimEnv
    elif args.env == "interest-exploration":
        env_cls = InterestExplorationRecSimEnv
    else:
        env_cls = LongTermSatisfactionRecSimEnv

    env_config = {
        "num_candidates": args.env_num_candidates,
        "resample_documents": not args.env_dont_resample_documents,
        "slate_size": args.env_slate_size,
        "seed": args.env_seed,
        "convert_to_discrete_action_space": args.run == "DQN",
    }

    config = {
        "env": env_cls,
        "framework": args.framework,
        "num_gpus": args.num_gpus,
        "num_workers": args.num_workers,
        "env_config": env_config,
        "replay_buffer_config": {
            "learning_starts": args.learning_starts,
        },
    }

    # Perform a test run on the env with a random agent to see, what
    # the random baseline reward is.
    num_random_episodes = args.random_test_episodes
    if num_random_episodes:
        print(
            f"Running {num_random_episodes} episodes to get a random "
            "agent's baseline reward ..."
        )
        env = env_cls(config=env_config)
        env.reset()

        episode_rewards = []
        running_return = 0.0
        # One list entry per finished episode, so its length is the
        # number of completed episodes.
        while len(episode_rewards) < num_random_episodes:
            _, reward, done, _ = env.step(env.action_space.sample())
            running_return += reward
            if not done:
                continue
            episode_rewards.append(running_return)
            running_return = 0.0
            env.reset()

        print(
            f"Ran {num_random_episodes} episodes with a random agent "
            "reaching a mean episode return of "
            f"{np.mean(episode_rewards)}+/-{sem(episode_rewards)}."
        )

    if args.use_tune:
        results = tune.run(
            args.run,
            stop={
                "training_iteration": args.stop_iters,
                "timesteps_total": args.stop_timesteps,
                "episode_reward_mean": args.stop_reward,
            },
            config=config,
            num_samples=args.tune_num_samples,
            verbose=2,
        )
        if args.as_test:
            check_learning_achieved(results, args.stop_reward)
    else:
        # Directly run using the trainer interface (good for debugging).
        trainer_cls = dqn.DQN if args.run == "DQN" else slateq.SlateQTrainer
        trainer = trainer_cls(config=config)
        for _ in range(10):
            print(pretty_print(trainer.train()))

    ray.shutdown()
def test_dqn_exploration_and_soft_q_config(self):
    """Tests, whether a DQN Agent outputs exploration/softmaxed actions.

    For each framework yielded by `framework_iterator`, four trainers are
    built on "FrozenLake-v1" in sequence and queried on a fixed observation:
      1. the base config: `explore=False` must repeat one action, default
         exploration must produce varying actions;
      2. SoftQ with a very low temperature: actions must not vary;
      3. SoftQ with temperature 1.0: `explore=False` must repeat one
         action, default exploration must produce varying actions;
      4. Random exploration: actions must vary.
    """
    from copy import deepcopy

    config = (
        dqn.dqn.DQNConfig()
        .rollouts(num_rollout_workers=0)
        .environment(env_config={"is_slippery": False, "map_name": "4x4"})
    )
    obs = np.array(0)

    def sample_actions(trainer, count, **kwargs):
        """Compute `count` single actions for `obs` with `trainer`."""
        return [
            trainer.compute_single_action(obs, **kwargs) for _ in range(count)
        ]

    def check_same_action_without_exploration(trainer):
        """With explore=False, every action must equal the first one."""
        reference = trainer.compute_single_action(obs, explore=False)
        for action in sample_actions(trainer, 50, explore=False):
            check(action, reference)

    # Test against all frameworks.
    for _ in framework_iterator(config):
        # Mutate a per-framework copy only. Changing the exploration
        # settings on `config` itself would carry the last setup (Random)
        # over into the next framework's "default" section.
        fw_config = deepcopy(config)

        # Default EpsilonGreedy setup.
        trainer = dqn.DQN(config=fw_config, env="FrozenLake-v1")
        try:
            # Setting explore=False should always return the same action.
            check_same_action_without_exploration(trainer)
            # explore=None (default: explore) should return different
            # actions.
            actions = sample_actions(trainer, 50)
            check(np.std(actions), 0.0, false=True)
        finally:
            trainer.stop()

        # Low softmax temperature. Behaves like argmax
        # (but no epsilon exploration).
        fw_config.exploration(
            exploration_config={"type": "SoftQ", "temperature": 0.000001}
        )
        trainer = dqn.DQN(config=fw_config, env="FrozenLake-v1")
        try:
            # Due to the low temp, always expect the same action.
            actions = sample_actions(trainer, 51)
            check(np.std(actions), 0.0, decimals=3)
        finally:
            trainer.stop()

        # Higher softmax temperature.
        fw_config.exploration_config["temperature"] = 1.0
        trainer = dqn.DQN(config=fw_config, env="FrozenLake-v1")
        try:
            # Even with the higher temperature, if we set explore=False, we
            # should expect the same actions always.
            check_same_action_without_exploration(trainer)
            # Due to the higher temp, expect different actions avg'ing
            # around 1.5.
            actions = sample_actions(trainer, 300)
            check(np.std(actions), 0.0, false=True)
        finally:
            trainer.stop()

        # With Random exploration.
        fw_config.exploration(
            exploration_config={"type": "Random"}, explore=True
        )
        trainer = dqn.DQN(config=fw_config, env="FrozenLake-v1")
        try:
            actions = sample_actions(trainer, 300)
            check(np.std(actions), 0.0, false=True)
        finally:
            trainer.stop()