def test_non_sufficient_timestep_operation(self):
    """
    * all users with 1 timestep
    """
    mock_timestep = [
        {"env_id": 3, "ts_id": 0, "obs": 5, "action": 5, "reward": 0.5,
         "step_type": 0, "discount": 0.5},
        {"env_id": 1, "ts_id": 1, "obs": 1, "action": 1, "reward": 0.1,
         "step_type": 1, "discount": 0.1},
        {"env_id": 2, "ts_id": 1, "obs": 4, "action": 4, "reward": 0.4,
         "step_type": 1, "discount": 0.4},
    ]
    env_id_cols = ["env_id"]
    ts_id_col = "ts_id"
    obs_cols = ["obs"]
    n_step = 1
    tb = TrajectoryBuilder(obs_cols=obs_cols,
                           env_id_cols=env_id_cols,
                           ts_id_col=ts_id_col,
                           n_step=n_step)
    res = tb.run(mock_timestep)
    # no env has the n_step + 1 = 2 timesteps needed for a trajectory
    expected_result = None
    self.assertEqual(res, expected_result)
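The None result follows from the windowing contract these tests assume: with n_step = 1, each trajectory spans two consecutive timesteps of the same env, so an env with a single timestep contributes nothing. A minimal sketch of that grouping logic, assuming it matches what TrajectoryBuilder does internally (illustrative only, not the actual implementation):

from itertools import groupby

def build_windows(timesteps, env_id_cols, ts_id_col, n_step):
    # group timesteps by env id, ordered by timestep id within each env
    env_key = lambda ts: tuple(ts[c] for c in env_id_cols)
    ordered = sorted(timesteps, key=lambda ts: (env_key(ts), ts[ts_id_col]))
    windows = []
    for _, group in groupby(ordered, key=env_key):
        steps = list(group)
        # each window covers n_step + 1 consecutive timesteps of one env
        for start in range(len(steps) - n_step):
            windows.append(steps[start:start + n_step + 1])
    # mirror run()'s behavior in the test above: no windows -> None
    return windows or None

Applied to the mock data above, every env group has length 1 < n_step + 1 = 2, so the function returns None; on the six-timestep fixture used later it yields the three windows the expected results are built from.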
class TrainAgentOperation(BaseOperation):
    TRAINING_GLOBAL_STEP = "training_global_step"

    @classmethod
    def output_dataname(cls):
        return DATANAME.MODEL

    @classmethod
    def data_dependencies(cls, engine_config):
        return {
            "agent": (DATANAME.MODEL, -1),
            "timestep_df": (DATANAME.TIMESTEP, -engine_config.training_timestep_lag),
        }

    @classmethod
    def optional_data_dependencies(cls, engine_config):
        # get the window size from which trajectories will be loaded
        trajectory_training_window = engine_config.trajectory_training_window
        # compute the number of training runs to use, excluding the current one
        num_previous_training_runs = trajectory_training_window - 1
        # retrieve previous trajectories other than the one from the current timestep.
        # These trajectories must be in the time window specified in the app.
        result = {}
        for i in range(1, num_previous_training_runs + 1):
            offset = -i
            result["timestep_offset_%s_df" % str(offset)] = (
                DATANAME.TIMESTEP, offset - engine_config.training_timestep_lag)
        return result

    @classmethod
    def tracked_run_state(cls):
        """Returns a dict mapping each metric that the operator needs to track
        across runs to its initial value."""
        return {TrainAgentOperation.TRAINING_GLOBAL_STEP: 0}

    def __init__(self, rl_app, engine_config, dm):
        super().__init__(rl_app, engine_config, dm)
        # init a trajectory builder
        env_id_cols = rl_app.env_id_cols
        ts_id_col = rl_app.ts_id_col
        obs_cols = rl_app.obs_cols
        n_step = rl_app.training_config.get("n_step", 1)
        self._trajectory_builder = TrajectoryBuilder(obs_cols, env_id_cols,
                                                     ts_id_col, n_step)
        self._replay_buffer = rl_app.init_replay_buffer()

    def _get_tensorboard_counter(self, run_id):
        run_state = self.get_run_state(run_id, [self.TRAINING_GLOBAL_STEP])
        training_global_step = run_state[self.TRAINING_GLOBAL_STEP]
        return training_global_step

    def _run(self, run_id, rl_app, engine_config, agent, timestep_df,
             **previous_timestep_dict):
        logger.info("Starting training for run_id %s" % str(run_id))
        # build a trajectory based on the provided timesteps
        for _, ts_df in previous_timestep_dict.items():
            timestep_df = timestep_df.union(ts_df)
        # TODO: move this ordering and trajectory building to its own operation so
        # that training can run on GPU while trajectory building runs on Spark
        timestep_df = timestep_df.orderBy(*rl_app.env_id_cols, rl_app.ts_id_col)
        traj_dict = self._trajectory_builder.run(timestep_df.collect())
        # set up the replay buffer
        self._replay_buffer.add_batch(traj_dict)
        # train the agent
        global_step = tf.compat.v1.train.get_or_create_global_step()
        summary_interval = rl_app.training_config.get(
            "summary_interval", rl_app.training_config.get("log_interval", 200))
        with tf.compat.v2.summary.record_if(
                lambda: tf.math.equal(global_step % summary_interval, 0)):
            num_iterations = rl_app.training_config["num_iterations"]
            mini_batch_size = rl_app.training_config["mini_batch_size"]
            self._train(agent, self._replay_buffer, num_iterations, mini_batch_size)
        # update the global step in run_state so that it can be used in the next run
        run_state = self.get_run_state(run_id, [self.TRAINING_GLOBAL_STEP])
        training_global_step = run_state[self.TRAINING_GLOBAL_STEP]
        run_state[self.TRAINING_GLOBAL_STEP] = training_global_step + num_iterations
        self.update_run_state(run_id, run_state)
        return agent

    @staticmethod
    def _train(agent, replay_buffer, num_iterations, mini_batch_size):
        """
        Train the agent on trajectories sampled from the replay buffer.

        Params:
            agent: the agent to train
            replay_buffer: replay buffer providing training trajectories
            num_iterations: number of training iterations to run
            mini_batch_size: number of trajectories per mini-batch
        """
        start_time = time.time()
        logger.info("Starting RL agent training. time: %s" % str(start_time))
        # train the agent, logging whenever the integer progress percentage advances
        training_progress = None
        for i in range(num_iterations):
            replay_buffer.pre_process(i)
            traj, traj_meta = replay_buffer.get_batch(mini_batch_size)
            weights = traj_meta.probabilities if traj_meta else None
            loss_info = agent.train(experience=traj, weights=weights)
            replay_buffer.post_process(traj_meta, loss_info, i)
            curr_progress = int(i * 100 / num_iterations)
            # compare against None explicitly: a progress of 0 is falsy and would
            # otherwise re-trigger logging on every iteration
            if training_progress is None or curr_progress > training_progress:
                training_progress = curr_progress
                logger.info("Current progress: %s percent. Loss: %s"
                            % (str(curr_progress), str(loss_info.loss)))
        end_time = time.time()
        total_time_second = int(end_time - start_time)
        logger.info("Agent training is complete. Total training time: %s minutes"
                    % str(total_time_second / 60))
class TrainAgentOperation(BaseOperation):
    TRAINING_GLOBAL_STEP = "training_global_step"

    @classmethod
    def output_dataname(cls):
        return DATANAME.MODEL

    @classmethod
    def data_dependencies(cls, timing_data):
        return {
            "agent": (DATANAME.MODEL, -1),
            "timestep_df": (DATANAME.TIMESTEP, -timing_data.training_timestep_lag),
        }

    @classmethod
    def optional_data_dependencies(cls, timing_data):
        # get the window size from which trajectories will be loaded
        trajectory_training_window = timing_data.trajectory_training_window
        # compute the number of training runs to use, excluding the current one
        num_previous_training_runs = trajectory_training_window - 1
        # retrieve previous trajectories other than the one from the current timestep.
        # These trajectories must be in the time window specified in the app.
        result = {}
        for i in range(1, num_previous_training_runs + 1):
            offset = -i
            result["timestep_offset_%s_df" % str(offset)] = (
                DATANAME.TIMESTEP, offset - timing_data.training_timestep_lag)
        return result

    @classmethod
    def tracked_run_state(cls):
        """Returns a dict mapping each metric that the operator needs to track
        across runs to its initial value."""
        return {TrainAgentOperation.TRAINING_GLOBAL_STEP: 0}

    def __init__(self, application, dm):
        super().__init__(application, dm)
        # init a trajectory builder
        env_id_cols = application.env.env_id_cols
        ts_id_col = application.env.ts_id_col
        obs_cols = application.env.obs_cols
        n_step = application.config.trajectory.n_step
        self._trajectory_builder = TrajectoryBuilder(obs_cols, env_id_cols,
                                                     ts_id_col, n_step)
        self._replay_buffer = self._application.init_replay_buffer()

    def _get_tensorboard_counter(self, run_id):
        run_state = self.get_run_state(run_id, [self.TRAINING_GLOBAL_STEP])
        training_global_step = run_state[self.TRAINING_GLOBAL_STEP]
        return training_global_step

    def _run(self, run_id, agent, timestep_df, **previous_timestep_dict):
        """
        Update the given agent with the latest timesteps.

        :param run_id: the run id to train for
        :param agent: the agent to update
        :param timestep_df: a Spark dataframe of the latest timesteps
        :param previous_timestep_dict: any previously available timesteps
            (these will be added to the latest)
        :return: the given agent, updated
        """
        logger.info("Starting training for run_id %s" % str(run_id))
        # build a trajectory based on the provided timesteps
        for _, ts_df in previous_timestep_dict.items():
            timestep_df = timestep_df.union(ts_df)
        # TODO: move this ordering and trajectory building to its own operation so
        # that training can run on GPU while trajectory building runs on Spark
        timestep_df = timestep_df.orderBy(*self._application.env.env_id_cols,
                                          self._application.env.ts_id_col)
        traj_dict = self._trajectory_builder.run(timestep_df.collect())
        # set up the replay buffer
        self._replay_buffer.add_batch(traj_dict)
        # train the agent
        global_step = tf.compat.v1.train.get_or_create_global_step()
        summary_interval = self._application.config.project.summary_interval
        with tf.compat.v2.summary.record_if(
                lambda: tf.math.equal(global_step % summary_interval, 0)):
            num_iterations = self._application.config.training.num_iterations
            mini_batch_size = self._application.config.training.batch_size
            self._train(agent, self._replay_buffer, num_iterations, mini_batch_size)
        # update the global step in run_state so that it can be used in the next run
        run_state = self.get_run_state(run_id, [self.TRAINING_GLOBAL_STEP])
        training_global_step = run_state[self.TRAINING_GLOBAL_STEP]
        run_state[self.TRAINING_GLOBAL_STEP] = training_global_step + num_iterations
        self.update_run_state(run_id, run_state)
        return agent

    @staticmethod
    def _train(agent, replay_buffer, num_iterations, mini_batch_size):
        """
        Go through an update cycle on an agent. The given agent's parameters
        are updated in place.

        :param agent: the agent to update
        :param replay_buffer: a replay buffer to get trajectories from
        :param num_iterations: the number of training iterations to run
        :param mini_batch_size: the number of trajectories to put in each mini-batch
        """
        start_time = time.time()
        logger.info("Starting RL agent training. time: %s" % str(start_time))
        # train the agent, logging whenever the integer progress percentage advances
        training_progress = None
        for i in range(num_iterations):
            replay_buffer.pre_process(i)
            traj, traj_meta = replay_buffer.get_batch(mini_batch_size)
            weights = traj_meta.probabilities if traj_meta else None
            loss_info = agent.train(experience=traj, weights=weights)
            replay_buffer.post_process(traj_meta, loss_info, i)
            curr_progress = int(i * 100 / num_iterations)
            # compare against None explicitly: a progress of 0 is falsy and would
            # otherwise re-trigger logging on every iteration
            if training_progress is None or curr_progress > training_progress:
                training_progress = curr_progress
                logger.info("Current progress: %s percent. Loss: %s"
                            % (str(curr_progress), str(loss_info.loss)))
        end_time = time.time()
        total_time_second = int(end_time - start_time)
        logger.info("Agent training is complete. Total training time: %s minutes"
                    % str(total_time_second / 60))
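_train relies on exactly four methods of the buffer returned by init_replay_buffer(), whose implementation is not shown here: add_batch, pre_process, get_batch, and post_process. A minimal uniform-sampling sketch of that assumed interface follows; the class name is hypothetical, and a prioritized buffer would instead return metadata exposing a probabilities attribute from get_batch:

import random

class UniformReplayBuffer:
    """Illustrative stand-in for the buffer interface _train expects."""

    def __init__(self):
        self._batches = []

    def add_batch(self, traj_dict):
        # store a batched trajectory dict produced by TrajectoryBuilder.run
        self._batches.append(traj_dict)

    def pre_process(self, iteration):
        # per-iteration hook (e.g. annealing); no-op for uniform sampling
        pass

    def get_batch(self, mini_batch_size):
        # return (trajectories, metadata); None metadata means uniform weights,
        # so agent.train receives weights=None (sampling is simplified here)
        return random.choice(self._batches), None

    def post_process(self, traj_meta, loss_info, iteration):
        # hook for priority updates after each train step; no-op here
        pass

Keeping these hooks on the buffer lets prioritized replay plug into the same _train loop: get_batch returns metadata whose probabilities become importance weights, and post_process uses loss_info to update priorities.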
def test_timestep_operation(self):
    """
    * 1 user with 3 timesteps
    * 1 user with 2 timesteps
    * 1 user with 1 timestep
    """
    mock_timestep = [
        {"env_id": 1, "ts_id": 0, "obs": 0, "action": 0, "reward": 0.0,
         "step_type": 0, "discount": 0.0},
        {"env_id": 1, "ts_id": 1, "obs": 1, "action": 1, "reward": 0.1,
         "step_type": 1, "discount": 0.1},
        {"env_id": 1, "ts_id": 2, "obs": 2, "action": 2, "reward": 0.2,
         "step_type": 2, "discount": 0.2},
        {"env_id": 2, "ts_id": 0, "obs": 3, "action": 3, "reward": 0.3,
         "step_type": 0, "discount": 0.3},
        {"env_id": 2, "ts_id": 1, "obs": 4, "action": 4, "reward": 0.4,
         "step_type": 1, "discount": 0.4},
        {"env_id": 3, "ts_id": 0, "obs": 5, "action": 5, "reward": 0.5,
         "step_type": 0, "discount": 0.5},
    ]
    env_id_cols = ["env_id"]
    ts_id_col = "ts_id"
    obs_cols = ["obs"]
    n_step = 1
    tb = TrajectoryBuilder(obs_cols=obs_cols,
                           env_id_cols=env_id_cols,
                           ts_id_col=ts_id_col,
                           n_step=n_step)
    res = tb.run(mock_timestep)
    # three 2-step windows: env 1 (ts 0-1), env 1 (ts 1-2), env 2 (ts 0-1);
    # env 3 has a single timestep and contributes nothing
    expected_result = {
        "observation": [[[0], [1]], [[1], [2]], [[3], [4]]],
        "step_type": [[0, 1], [1, 2], [0, 1]],
        "action": [[0, 1], [1, 2], [3, 4]],
        "reward": [[0.1, 0.1], [0.2, 0.2], [0.4, 0.4]],
        "discount": [[0.1, 0.1], [0.2, 0.2], [0.4, 0.4]],
        "next_step_type": [[1, 1], [2, 2], [1, 1]],
        "policy_info": ()
    }
    self.assertEqual(res, expected_result)
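The keys of expected_result mirror the fields of a TF-Agents Trajectory, which the tf.compat usage in TrainAgentOperation suggests is the downstream consumer, though the conversion itself is not shown in the source. Under that assumption, a sketch of wrapping the builder's output:

import tensorflow as tf
from tf_agents.trajectories.trajectory import Trajectory

# res is the dict returned by TrajectoryBuilder.run, e.g. expected_result above
traj = Trajectory(
    step_type=tf.constant(res["step_type"]),
    observation=tf.constant(res["observation"]),
    action=tf.constant(res["action"]),
    policy_info=res["policy_info"],
    next_step_type=tf.constant(res["next_step_type"]),
    reward=tf.constant(res["reward"]),
    discount=tf.constant(res["discount"]),
)

Each tensor then has shape [batch, n_step + 1, ...]: three windows of two steps each for this fixture.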
def test_timestep_operation_with_policy_info(self):
    """
    * 1 user with 3 timesteps
    * 1 user with 2 timesteps
    * 1 user with 1 timestep
    """
    meta_value_1 = [0.123, 0.876]
    meta_value_2 = 0.4
    meta_value_3 = [0.3, 0.7]
    meta_value_4 = 0.6
    policy_info_dict_1 = {"meta_1": meta_value_1, "meta_2": meta_value_2}
    policy_info_dict_2 = {"meta_1": meta_value_3, "meta_2": meta_value_4}
    mock_timestep = [
        {"env_id": 1, "ts_id": 0, "obs": 0, "action": 0, "reward": 0.0,
         "step_type": 0, "discount": 0.0, "policy_info": policy_info_dict_1},
        {"env_id": 1, "ts_id": 1, "obs": 1, "action": 1, "reward": 0.1,
         "step_type": 1, "discount": 0.1, "policy_info": policy_info_dict_2},
        {"env_id": 1, "ts_id": 2, "obs": 2, "action": 2, "reward": 0.2,
         "step_type": 2, "discount": 0.2, "policy_info": policy_info_dict_1},
        {"env_id": 2, "ts_id": 0, "obs": 3, "action": 3, "reward": 0.3,
         "step_type": 0, "discount": 0.3, "policy_info": policy_info_dict_2},
        {"env_id": 2, "ts_id": 1, "obs": 4, "action": 4, "reward": 0.4,
         "step_type": 1, "discount": 0.4, "policy_info": policy_info_dict_1},
        {"env_id": 3, "ts_id": 0, "obs": 5, "action": 5, "reward": 0.5,
         "step_type": 0, "discount": 0.5, "policy_info": policy_info_dict_1},
    ]
    env_id_cols = ["env_id"]
    ts_id_col = "ts_id"
    obs_cols = ["obs"]
    n_step = 1
    tb = TrajectoryBuilder(obs_cols=obs_cols,
                           env_id_cols=env_id_cols,
                           ts_id_col=ts_id_col,
                           n_step=n_step)
    res = tb.run(mock_timestep)
    expected_result = {
        "observation": [[[0], [1]], [[1], [2]], [[3], [4]]],
        "step_type": [[0, 1], [1, 2], [0, 1]],
        "action": [[0, 1], [1, 2], [3, 4]],
        "reward": [[0.1, 0.1], [0.2, 0.2], [0.4, 0.4]],
        "discount": [[0.1, 0.1], [0.2, 0.2], [0.4, 0.4]],
        "next_step_type": [[1, 1], [2, 2], [1, 1]],
        "policy_info": {
            "meta_1": [[meta_value_1, meta_value_3],
                       [meta_value_3, meta_value_1],
                       [meta_value_3, meta_value_1]],
            "meta_2": [[meta_value_2, meta_value_4],
                       [meta_value_4, meta_value_2],
                       [meta_value_4, meta_value_2]]
        }
    }
    self.assertEqual(res, expected_result)
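The nested policy_info expectation makes the batching rule visible: within each window, the per-timestep policy_info dicts are transposed into one dict of per-key lists. A hypothetical helper (not the source's implementation) demonstrating the rule on the first window of env 1:

def batch_policy_info(window):
    # transpose a window's per-timestep policy_info dicts into per-key lists
    keys = window[0]["policy_info"].keys()
    return {k: [ts["policy_info"][k] for ts in window] for k in keys}

# env 1, ts 0 carries policy_info_dict_1 and ts 1 carries policy_info_dict_2,
# giving the first entries of the expected "meta_1" and "meta_2" batches
window = [
    {"policy_info": {"meta_1": [0.123, 0.876], "meta_2": 0.4}},
    {"policy_info": {"meta_1": [0.3, 0.7], "meta_2": 0.6}},
]
assert batch_policy_info(window) == {
    "meta_1": [[0.123, 0.876], [0.3, 0.7]],
    "meta_2": [0.4, 0.6],
}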