def test_nested_action_spaces(self):
    config = DEFAULT_CONFIG.copy()
    config["env"] = RandomEnv
    # Write output to check whether actions are written correctly.
    tmp_dir = os.popen("mktemp -d").read()[:-1]
    if not os.path.exists(tmp_dir):
        # Last resort: Resolve via the underlying tempdir (and cut the
        # leading "/tmp").
        tmp_dir = ray._private.utils.tempfile.gettempdir() + tmp_dir[4:]
        assert os.path.exists(tmp_dir), f"'{tmp_dir}' not found!"
    config["output"] = tmp_dir
    # Switch off OPE as we don't write action-probs.
    # TODO: We should probably always write those if `output` is given.
    config["input_evaluation"] = []
    # Pretend actions in offline files are already normalized.
    config["actions_in_input_normalized"] = True

    for _ in framework_iterator(config):
        for name, action_space in SPACES.items():
            config["env_config"] = {
                "action_space": action_space,
            }
            for flatten in [False, True]:
                print(f"A={action_space} flatten={flatten}")
                shutil.rmtree(config["output"])
                config["_disable_action_flattening"] = not flatten
                trainer = PGTrainer(config)
                trainer.train()
                trainer.stop()
                # Check actions in the output file (whether properly
                # flattened or not).
                reader = JsonReader(
                    inputs=config["output"],
                    ioctx=trainer.workers.local_worker().io_context,
                )
                sample_batch = reader.next()
                if flatten:
                    assert isinstance(sample_batch["actions"], np.ndarray)
                    assert len(sample_batch["actions"].shape) == 2
                    assert sample_batch["actions"].shape[0] == len(sample_batch)
                else:
                    tree.assert_same_structure(
                        trainer.get_policy().action_space_struct,
                        sample_batch["actions"],
                    )
                # Test whether the offline data can be properly read by a
                # BCTrainer configured accordingly.
                config["input"] = config["output"]
                del config["output"]
                bc_trainer = BCTrainer(config=config)
                bc_trainer.train()
                bc_trainer.stop()
                config["output"] = tmp_dir
                config["input"] = "sampler"
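# The `SPACES` mapping iterated over above is not part of this excerpt. A
# minimal sketch of what such a name -> (possibly nested) action-space
# mapping could look like; these concrete entries are illustrative
# assumptions, not the original test's definitions.
from gym.spaces import Box, Dict, Discrete, MultiDiscrete, Tuple

SPACES = {
    "dict": Dict({
        "a": Box(-1.0, 1.0, shape=(2,)),
        "b": Discrete(3),
    }),
    "tuple": Tuple([Box(-1.0, 1.0, shape=(2,)), Discrete(3)]),
    "multidiscrete": MultiDiscrete([2, 3]),
    "box": Box(-1.0, 1.0, shape=(4,)),
}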
def test_itr_batches(self):
    """Test that the json reader iterates over batches of rows correctly."""
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "rllib/tests/data/pendulum/large.json")
    print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))

    ioctx = IOContext(config={"train_batch_size": 1200}, worker_index=0)
    reader = JsonReader([data_file], ioctx)
    assert len(reader.next()) == 1200
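# A minimal standalone sketch of the same pattern outside a test: point a
# JsonReader at previously written offline data and pull batches sized by
# the IOContext's train_batch_size. The "/tmp/cartpole-out" path is an
# assumption; any directory or JSON file produced by RLlib's output writer
# works.
from ray.rllib.offline import IOContext, JsonReader

ioctx = IOContext(config={"train_batch_size": 256}, worker_index=0)
reader = JsonReader("/tmp/cartpole-out", ioctx)
for _ in range(3):
    batch = reader.next()  # A SampleBatch of roughly train_batch_size rows.
    print(batch.count)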
def _init(self, config, env_name):
    self._policy_graph = self.config["multiagent"]["policy_graphs"]

    self.local_evaluator = self.make_local_evaluator(
        env_name, self._policy_graph, self.config)
    self.remote_evaluators = self.make_remote_evaluators(
        env_name, self._policy_graph, self.config["num_workers"])

    self.train_batch_size = self.config["train_batch_size"]
    self.num_sgd_iter = self.config["num_sgd_iter"]
    self.num_train = self.config["num_train"]
    self.expert_path = self.config["expert_path"]
    self.theta_lr = self.config["theta_lr"]

    expert_reader = JsonReader(self.expert_path)
    self.expert_samples = expert_reader.next()
    self.expert_features = self.calculate_expected_feature(self.expert_samples)
    self.theta = np.random.uniform(size=self.expert_features.shape)
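# `calculate_expected_feature` is called above but not shown in this
# excerpt. A plausible sketch, assuming the feature vector is the raw
# observation, the expert feature expectation is its mean over the expert
# SampleBatch, and observations are stored as plain arrays (assumptions,
# not necessarily the original implementation):
import numpy as np

def calculate_expected_feature(self, samples):
    # samples["obs"] has shape (num_timesteps, obs_dim).
    return np.mean(samples["obs"], axis=0)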
def offline_evaluation(self, iteration):
    self._agent.eval_mode = True
    validation_dataset = [
        os.path.join(self.dataset_path, f)
        for f in os.listdir(self.dataset_path)
        if os.path.isfile(os.path.join(self.dataset_path, f))
    ]
    validation_dataset = sorted(validation_dataset)

    # First pass: collect all rewards to derive the reward shift passed to
    # the importance-sampling estimator.
    rewards = []
    for n_eps in range(len(validation_dataset)):
        reader = JsonReader(validation_dataset[n_eps])
        with open(validation_dataset[n_eps], "r") as f:
            sb = f.readlines()
        # Each line of the JSON file holds one batch.
        for _ in range(len(sb)):
            batch = reader.next()
            for episode in batch.split_by_episode():
                for r in episode["rewards"]:
                    rewards.append(r)
    rewards_shift = (round(min(rewards), 5) * -1
                     if round(min(rewards), 5) < 0
                     else round(min(rewards), 5))

    actions = []
    estimation = {
        "dm/score": [],
        "dm/pred_reward_mean": [],
        "dm/pred_reward_total": [],
        "is/V_prev": [],
        "is/V_step_IS": [],
        "is/V_gain_est": [],
    }
    for n_eps in range(len(validation_dataset)):
        reader = JsonReader(validation_dataset[n_eps])
        batch = reader.next()
        for episode in batch.split_by_episode():
            action = []
            selected_action_prob = []
            all_actions_prob = []
            for i in range(len(episode["eps_id"])):
                _action, _action_prob = self._agent.step(
                    episode["rewards"][i], episode["obs"][i])
                action.append(_action)
                selected_action_prob.append(_action_prob[_action])
                all_actions_prob.append(_action_prob)
            is_estimation = self.is_estimator.estimate(
                episode, all_actions_prob, rewards_shift)
            actions.extend(action)

            action = np.array([action])
            action_prob = np.array([selected_action_prob])
            # Concatenate observations and actions as predictor input. obs is
            # usually [[obs1], [obs2], [obs3]] and actions are usually
            # [1, 0, 1, 0], so the actions are reshaped to [[1], [0], [1]].
            obs = torch.Tensor(
                np.concatenate(
                    (episode["obs"],
                     np.reshape(action, (action[0].shape[0], 1))),
                    axis=1,
                ))
            scores_raw = self.predictor.predict(obs).detach().numpy()
            scores = {}
            scores["score"] = (scores_raw * action_prob).mean()
            scores["pred_reward_mean"] = scores_raw.mean()
            scores["pred_reward_total"] = scores_raw.sum()

            # DM estimation.
            estimation["dm/score"].append(scores["score"])
            estimation["dm/pred_reward_mean"].append(
                scores["pred_reward_mean"])
            estimation["dm/pred_reward_total"].append(
                scores["pred_reward_total"])
            # IS estimation.
            estimation["is/V_prev"].append(is_estimation["V_prev"])
            estimation["is/V_step_IS"].append(is_estimation["V_step_IS"])
            estimation["is/V_gain_est"].append(is_estimation["V_gain_est"])

    est_mean = pd.DataFrame.from_dict(estimation).mean(axis=0)
    summary = tf.Summary(value=[
        tf.Summary.Value(tag="Eval/DM/score",
                         simple_value=est_mean["dm/score"]),
        tf.Summary.Value(tag="Eval/DM/pred_reward_mean",
                         simple_value=est_mean["dm/pred_reward_mean"]),
        tf.Summary.Value(tag="Eval/DM/pred_reward_total",
                         simple_value=est_mean["dm/pred_reward_total"]),
        tf.Summary.Value(tag="Eval/is/V_prev",
                         simple_value=est_mean["is/V_prev"]),
        tf.Summary.Value(tag="Eval/is/V_step_IS",
                         simple_value=est_mean["is/V_step_IS"]),
        tf.Summary.Value(tag="Eval/is/V_gain_est",
                         simple_value=est_mean["is/V_gain_est"]),
        tf.Summary.Value(tag="Eval/actions_prob",
                         simple_value=float(actions.count(1)) / len(actions)),
    ])
    self._summary_writer.add_summary(summary, iteration)
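# A hedged sketch of what a step-wise importance-sampling estimate over one
# episode typically computes; the key names mirror the ones used above, but
# the exact formulas inside `self.is_estimator` (including how it applies
# `rewards_shift`) may differ. `new_probs` are the evaluated policy's
# probabilities for the logged actions, `old_probs` the behavior policy's.
import numpy as np

def step_wise_is(rewards, new_probs, old_probs, gamma=0.99):
    rewards = np.asarray(rewards, dtype=np.float64)
    # Cumulative importance ratios rho_{0:t} = prod_{k<=t} pi(a_k|s_k) / mu(a_k|s_k).
    p = np.cumprod(np.asarray(new_probs) / np.asarray(old_probs))
    discounts = gamma ** np.arange(len(rewards))
    v_prev = float(np.sum(discounts * rewards))          # Behavior-policy return.
    v_step_is = float(np.sum(discounts * p * rewards))   # Per-decision IS return.
    return {
        "V_prev": v_prev,
        "V_step_IS": v_step_is,
        "V_gain_est": v_step_is / max(v_prev, 1e-8),
    }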
def setUpClass(cls):
    ray.init()
    rllib_dir = Path(__file__).parent.parent.parent.parent
    train_data = os.path.join(rllib_dir, "tests/data/cartpole/large.json")
    eval_data = train_data

    env_name = "CartPole-v0"
    cls.gamma = 0.99
    n_episodes = 40
    cls.q_model_config = {"n_iters": 600}

    config = (
        DQNConfig()
        .environment(env=env_name)
        .training(gamma=cls.gamma)
        .rollouts(num_rollout_workers=3, batch_mode="complete_episodes")
        .framework("torch")
        .resources(num_gpus=int(os.environ.get("RLLIB_NUM_GPUS", 0)))
        .offline_data(input_=train_data)
        .evaluation(
            evaluation_interval=None,
            evaluation_duration=n_episodes,
            evaluation_num_workers=1,
            evaluation_duration_unit="episodes",
            evaluation_config={"input": eval_data},
            off_policy_estimation_methods={
                "is": {"type": ImportanceSampling},
                "wis": {"type": WeightedImportanceSampling},
                "dm_fqe": {
                    "type": DirectMethod,
                    "q_model_config": {"type": FQETorchModel},
                },
                "dr_fqe": {
                    "type": DoublyRobust,
                    "q_model_config": {"type": FQETorchModel},
                },
            },
        )
    )
    cls.algo = config.build()
    # Train DQN for evaluation policy.
    for _ in range(n_episodes):
        cls.algo.train()

    # Read n_episodes of data, assuming that one line is one episode.
    reader = JsonReader(eval_data)
    cls.batch = reader.next()
    for _ in range(n_episodes - 1):
        cls.batch = concat_samples([cls.batch, reader.next()])
    cls.n_episodes = len(cls.batch.split_by_episode())
    print("Episodes:", cls.n_episodes, "Steps:", cls.batch.count)

    cls.mean_ret = {}
    cls.std_ret = {}
    cls.losses = {}

    # Simulate Monte-Carlo rollouts.
    mc_ret = []
    env = gym.make(env_name)
    for _ in range(n_episodes):
        obs = env.reset()
        done = False
        rewards = []
        while not done:
            act = cls.algo.compute_single_action(obs)
            obs, reward, done, _ = env.step(act)
            rewards.append(reward)
        ret = 0
        for r in reversed(rewards):
            ret = r + cls.gamma * ret
        mc_ret.append(ret)
    cls.mean_ret["simulation"] = np.mean(mc_ret)
    cls.std_ret["simulation"] = np.std(mc_ret)
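# A hedged sketch of how the estimators configured above are typically
# queried after this setup: run an evaluation and look under the
# "off_policy_estimator" sub-dict of the evaluation results. The exact
# metric keys inside each estimator's dict (e.g. "v_behavior", "v_target",
# "v_gain") depend on the RLlib version.
results = cls.algo.evaluate()
ope_results = results["evaluation"]["off_policy_estimator"]
for name in ("is", "wis", "dm_fqe", "dr_fqe"):
    print(name, ope_results[name])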
def setUpClass(cls):
    ray.init(ignore_reinit_error=True)
    rllib_dir = Path(__file__).parent.parent.parent.parent
    print("rllib dir={}".format(rllib_dir))
    data_file = os.path.join(rllib_dir, "tests/data/cartpole/large.json")
    print("data_file={} exists={}".format(data_file, os.path.isfile(data_file)))

    env_name = "CartPole-v0"
    cls.gamma = 0.99
    train_steps = 20000
    n_batches = 20  # Approx. equal to n_episodes
    n_eval_episodes = 100

    config = (
        DQNConfig()
        .environment(env=env_name)
        .training(gamma=cls.gamma)
        .rollouts(num_rollout_workers=3, batch_mode="complete_episodes")
        .exploration(
            explore=True,
            exploration_config={
                "type": "SoftQ",
                "temperature": 1.0,
            },
        )
        .framework("torch")
    )
    cls.trainer = config.build()

    # Train DQN for evaluation policy.
    tune.run(
        "DQN",
        config=config.to_dict(),
        stop={"timesteps_total": train_steps},
        verbose=0,
    )

    # Read n_batches of data.
    reader = JsonReader(data_file)
    cls.batch = reader.next()
    for _ in range(n_batches - 1):
        cls.batch = cls.batch.concat(reader.next())
    cls.n_episodes = len(cls.batch.split_by_episode())
    print("Episodes:", cls.n_episodes, "Steps:", cls.batch.count)

    cls.mean_ret = {}
    cls.std_ret = {}

    # Simulate Monte-Carlo rollouts.
    mc_ret = []
    env = gym.make(env_name)
    for _ in range(n_eval_episodes):
        obs = env.reset()
        done = False
        rewards = []
        while not done:
            act = cls.trainer.compute_single_action(obs)
            obs, reward, done, _ = env.step(act)
            rewards.append(reward)
        ret = 0
        for r in reversed(rewards):
            ret = r + cls.gamma * ret
        mc_ret.append(ret)
    cls.mean_ret["simulation"] = np.mean(mc_ret)
    cls.std_ret["simulation"] = np.std(mc_ret)

    # Optional configs for the model-based estimators.
    cls.model_config = {"k": 2, "n_iters": 10}

    ray.shutdown()