def load(self, path):
    """
    Part of the l2rpn_baselines interface, this function allows to read back a trained model, to continue
    the training or to evaluate its performance for example.

    **NB** To reload an agent, it must have exactly the same name and have been saved at the right location.

    Parameters
    ----------
    path: ``str``
        The path where the agent has previously been saved.

    """
    # not modified compared to the original implementation
    tmp_me = os.path.join(path, self.name)
    if not os.path.exists(tmp_me):
        raise RuntimeError("The model should be stored in \"{}\". But this appears to be empty".format(tmp_me))
    self._load_action_space(tmp_me)

    # TODO handle the case where the training param class has been overridden
    self._training_param = TrainingParam.from_json(os.path.join(tmp_me, "training_params.json"))
    self.deep_q = self._nn_archi.make_nn(self._training_param)
    try:
        self.deep_q.load_network(tmp_me, name=self.name)
    except Exception as e:
        raise RuntimeError("Impossible to load the model located at \"{}\" with error \n{}".format(path, e))

    for nm_attr in ["_time_step_lived", "_nb_chosen", "_proba"]:
        conv_path = os.path.join(tmp_me, "{}.npy".format(nm_attr))
        if os.path.exists(conv_path):
            setattr(self, nm_attr, np.load(file=conv_path))
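
# A minimal usage sketch for ``load`` (the agent name, constructor arguments and path below are
# hypothetical; what matters is that the name and save location match the ones used at save time):
#
#   agent = DeepQAgent(env.action_space, name="my_agent")  # name must match the saved agent
#   agent.load("./saved_models")                           # reads back weights and training params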
def __init__(self, nn_params, training_param=None):
    self._action_size = nn_params.action_size
    self._observation_size = nn_params.observation_size
    self._nn_archi = nn_params

    if training_param is None:
        self._training_param = TrainingParam()
    else:
        self._training_param = training_param

    # read the learning rate schedule from the (possibly default) training parameters
    self._lr = self._training_param.lr
    self._lr_decay_steps = self._training_param.lr_decay_steps
    self._lr_decay_rate = self._training_param.lr_decay_rate

    self._model = None
    self._target_model = None
    self._schedule_model = None
    self._optimizer_model = None
    self._custom_objects = None  # to be able to load other keras layer types
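
# A sketch of how these learning rate fields are typically wired into an optimizer via an
# exponential decay schedule (illustrative only, not necessarily how ``make_nn`` builds it):
#
#   schedule = tf.keras.optimizers.schedules.ExponentialDecay(
#       initial_learning_rate=self._lr,
#       decay_steps=self._lr_decay_steps,
#       decay_rate=self._lr_decay_rate)
#   optimizer = tf.keras.optimizers.Adam(learning_rate=schedule)
#
# which yields lr_t = lr * decay_rate ** (step / decay_steps) at training step ``step``.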
def __init__(self, action_size, observation_size, lr=1e-5,
             learning_rate_decay_steps=1000,
             learning_rate_decay_rate=0.95,
             training_param=None):
    # TODO add more flexibility when building the deep Q networks, with a "NNParam" for example.
    self.action_size = action_size
    self.observation_size = observation_size
    self.lr = lr
    self.lr_decay_steps = learning_rate_decay_steps
    self.lr_decay_rate = learning_rate_decay_rate
    self.qvalue_evolution = np.zeros((0,))
    # avoid the mutable default argument pitfall: build the default instance here
    self.training_param = TrainingParam() if training_param is None else training_param
    self.model = None
    self.target_model = None
    self.schedule_model = None
    self.optimizer_model = None
    self.custom_objects = None  # to be able to load other keras layer types
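
# A minimal sketch of the kind of Q-network that later fills the ``self.model`` and
# ``self.target_model`` placeholders (purely illustrative; the actual architecture is
# built elsewhere in the class):
#
#   from tensorflow.keras.models import Sequential
#   from tensorflow.keras.layers import Dense
#
#   model = Sequential([
#       Dense(100, activation="relu", input_shape=(self.observation_size,)),
#       Dense(100, activation="relu"),
#       Dense(self.action_size)  # one Q-value per action
#   ])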
def train(self, env, iterations, save_path, logdir, training_param=None):
    if training_param is None:
        training_param = TrainingParam()
    self.training_param = training_param
    self._init_replay_buffer()

    # efficient reading of the data (read them by chunks of roughly 1 day)
    nb_ts_one_day = 24 * 60 // 5  # number of time steps per day (one step every 5 minutes)
    self.set_chunk(env, nb_ts_one_day)

    # Create file system related vars
    if save_path is not None:
        save_path = os.path.abspath(save_path)
        os.makedirs(save_path, exist_ok=True)

    if logdir is not None:
        logpath = os.path.join(logdir, self.name)
        self.tf_writer = tf.summary.create_file_writer(logpath, name=self.name)
    else:
        logpath = None
        self.tf_writer = None

    UPDATE_FREQ = 100  # update tensorboard every "UPDATE_FREQ" steps
    SAVING_NUM = 1000

    training_step = 0

    # some parameters have been moved to a class named "training_param" for convenience
    self.epsilon = training_param.INITIAL_EPSILON

    # the number of alive frames and the total reward now depend on the underlying environment:
    # they are vectors instead of scalars
    alive_frame, total_reward = self._init_global_train_loop()
    reward, done = self._init_local_train_loop()
    epoch_num = 0
    self.losses = np.zeros(iterations)
    alive_frames = np.zeros(iterations)
    total_rewards = np.zeros(iterations)
    new_state = None
    self.reset_num = 0
    with tqdm(total=iterations) as pbar:
        while training_step < iterations:
            # reset or build the environment
            initial_state = self._need_reset(env, training_step, epoch_num, done, new_state)

            # Slowly decay the exploration parameter epsilon
            self.epsilon = training_param.get_next_epsilon(current_step=training_step)

            if training_step == 0:
                # we initialize the NN with the proper shape
                self.init_deep_q(initial_state)

            # then we need to predict the next moves. Agents have been adapted to predict a batch of data
            pm_i, pq_v, act = self._next_move(initial_state, self.epsilon)

            # TODO store the illegal / ambiguous / ... actions
            reward, done = self._init_local_train_loop()
            if self.__nb_env == 1:
                # still the "hack" to have the same interface between multi env and env...
                # yeah it's a pain
                act = act[0]
            temp_observation_obj, temp_reward, temp_done, info = env.step(act)
            if self.__nb_env == 1:
                # dirty hack to wrap them into lists
                temp_observation_obj = [temp_observation_obj]
                temp_reward = np.array([temp_reward], dtype=np.float32)
                temp_done = np.array([temp_done], dtype=bool)
                info = [info]
            new_state = self.convert_obs_train(temp_observation_obj)
            self._updage_illegal_ambiguous(training_step, info)
            done, reward, total_reward, alive_frame, epoch_num \
                = self._update_loop(done, temp_reward, temp_done, alive_frame, total_reward, reward, epoch_num)

            # update the replay buffer
            self._store_new_state(initial_state, pm_i, reward, done, new_state)

            # now train the model
            if not self._train_model(training_param, training_step):
                # infinite loss in this case
                print("ERROR INFINITE LOSS")
                break

            # save the network every SAVING_NUM iterations
            if training_step % SAVING_NUM == 0 or training_step == iterations - 1:
                self.save(save_path)

            # save some information to tensorboard
            alive_frames[epoch_num] = np.mean(alive_frame)
            total_rewards[epoch_num] = np.mean(total_reward)
            self._store_action_played_train(training_step, pm_i)
            self._save_tensorboard(training_step, epoch_num, UPDATE_FREQ, total_rewards, alive_frames)
            training_step += 1
            pbar.update(1)
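
# ``get_next_epsilon`` is assumed here to implement a standard exponential decay of the
# exploration rate from INITIAL_EPSILON towards FINAL_EPSILON, e.g. something along these
# lines (illustrative sketch only; DECAY_EPSILON is a hypothetical time constant):
#
#   def get_next_epsilon(self, current_step):
#       return self.FINAL_EPSILON + \
#              (self.INITIAL_EPSILON - self.FINAL_EPSILON) * np.exp(-current_step / self.DECAY_EPSILON)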
def train(self, env, iterations, save_path, logdir, training_param=None):
    """
    Part of the public l2rpn-baselines interface, this function allows to train the baseline.

    If `save_path` is not None, then the model is saved regularly, and also at the end of training.

    TODO explain a bit more how you can train it.

    Parameters
    ----------
    env: :class:`grid2op.Environment.Environment` or :class:`grid2op.Environment.MultiEnvironment`
        The environment used to train your model.

    iterations: ``int``
        The number of training iterations. NB when reloading a model, this is **NOT** the number of
        training steps that will be used when re-training. Indeed, if `iterations` is 1000 and the
        model was already trained for 750 time steps, then when reloaded, the training will occur on
        250 (=1000 - 750) time steps only.

    save_path: ``str``
        Location at which to save the model.

    logdir: ``str``
        Location at which tensorboard related information will be kept.

    training_param: :class:`l2rpn_baselines.utils.TrainingParam`
        The meta parameters for the training procedure. This is currently ignored if the model is
        reloaded (in that case, the parameters used when the model was first created will be used).

    """
    if training_param is None:
        training_param = TrainingParam()
    self._train_lr = training_param.lr

    if self._training_param is None:
        self._training_param = training_param
    else:
        training_param = self._training_param
    self._init_deep_q(self._training_param, env)
    self._fill_vectors(self._training_param)

    self._init_replay_buffer()

    # efficient reading of the data (read them by chunks of roughly 1 day)
    nb_ts_one_day = 24 * 60 // 5  # number of time steps per day (one step every 5 minutes)
    self._set_chunk(env, nb_ts_one_day)

    # Create file system related vars
    if save_path is not None:
        save_path = os.path.abspath(save_path)
        os.makedirs(save_path, exist_ok=True)

    if logdir is not None:
        logpath = os.path.join(logdir, self.name)
        self._tf_writer = tf.summary.create_file_writer(logpath, name=self.name)
    else:
        logpath = None
        self._tf_writer = None

    UPDATE_FREQ = training_param.update_tensorboard_freq  # update tensorboard every "UPDATE_FREQ" steps
    SAVING_NUM = training_param.save_model_each

    if hasattr(env, "nb_env"):
        nb_env = env.nb_env
        warnings.warn("Training using {} environments".format(nb_env))
        self.__nb_env = nb_env
    else:
        self.__nb_env = 1

    self.init_obs_extraction(env.observation_space)

    training_step = self._training_param.last_step

    # some parameters have been moved to a class named "training_param" for convenience
    self.epsilon = self._training_param.initial_epsilon

    # the number of alive frames and the total reward now depend on the underlying environment:
    # they are vectors instead of scalars
    alive_frame, total_reward = self._init_global_train_loop()
    reward, done = self._init_local_train_loop()
    epoch_num = 0
    self._losses = np.zeros(iterations)
    alive_frames = np.zeros(iterations)
    total_rewards = np.zeros(iterations)
    new_state = None
    self._reset_num = 0
    self._curr_iter_env = 0
    self._max_reward = env.reward_range[1]

    # action types
    # injection, voltage, topology, line, redispatching = action.get_types()
    self.nb_injection = 0
    self.nb_voltage = 0
    self.nb_topology = 0
    self.nb_line = 0
    self.nb_redispatching = 0
    self.nb_do_nothing = 0

    # for non uniform random sampling of the scenarios
    th_size = None
    self._prev_obs_num = 0
    if self.__nb_env == 1:
        # TODO make this available for multi env too
        if _CACHE_AVAILABLE_DEEPQAGENT:
            if isinstance(env.chronics_handler.real_data, MultifolderWithCache):
                th_size = env.chronics_handler.real_data.cache_size
        if th_size is None:
            th_size = len(env.chronics_handler.real_data.subpaths)

        # number of time steps lived per possible scenario
        if self._time_step_lived is None or self._time_step_lived.shape[0] != th_size:
            self._time_step_lived = np.zeros(th_size, dtype=np.uint64)
        # number of times a given scenario has been played
        if self._nb_chosen is None or self._nb_chosen.shape[0] != th_size:
            self._nb_chosen = np.zeros(th_size, dtype=np.uint)
        # probability of selecting each scenario
        if self._proba is None or self._proba.shape[0] != th_size:
            self._proba = np.ones(th_size, dtype=np.float64)
    self._prev_id = 0

    # this is for the "limit the episode length" depending on your previous successes
    self._total_sucesses = 0

    with tqdm(total=iterations - training_step, disable=not self.verbose) as pbar:
        while training_step < iterations:
            # reset or build the environment
            initial_state = self._need_reset(env, training_step, epoch_num, done, new_state)

            # Slowly decay the exploration parameter epsilon
            self.epsilon = self._training_param.get_next_epsilon(current_step=training_step)

            # then we need to predict the next moves. Agents have been adapted to predict a batch of data
            pm_i, pq_v, act = self._next_move(initial_state, self.epsilon, training_step)

            # TODO store the illegal / ambiguous / ... actions
            reward, done = self._init_local_train_loop()
            if self.__nb_env == 1:
                # still the "hack" to have the same interface between multi env and env...
                # yeah it's a pain
                act = act[0]
            temp_observation_obj, temp_reward, temp_done, info = env.step(act)
            if self.__nb_env == 1:
                # dirty hack to wrap them into lists
                temp_observation_obj = [temp_observation_obj]
                temp_reward = np.array([temp_reward], dtype=np.float32)
                temp_done = np.array([temp_done], dtype=bool)
                info = [info]
            new_state = self._convert_obs_train(temp_observation_obj)
            self._updage_illegal_ambiguous(training_step, info)
            done, reward, total_reward, alive_frame, epoch_num \
                = self._update_loop(done, temp_reward, temp_done, alive_frame, total_reward, reward, epoch_num)

            # update the replay buffer
            self._store_new_state(initial_state, pm_i, reward, done, new_state)

            # now train the model
            if not self._train_model(training_step):
                # infinite loss in this case
                raise RuntimeError("ERROR INFINITE LOSS")

            # save the network every SAVING_NUM iterations
            if training_step % SAVING_NUM == 0 or training_step == iterations - 1:
                self.save(save_path)

            # save some information to tensorboard
            alive_frames[epoch_num] = np.mean(alive_frame)
            total_rewards[epoch_num] = np.mean(total_reward)
            self._store_action_played_train(training_step, pm_i)
            self._save_tensorboard(training_step, epoch_num, UPDATE_FREQ, total_rewards, alive_frames)
            training_step += 1
            pbar.update(1)

    self.save(save_path)
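
# A minimal usage sketch for ``train`` (the environment name and paths are illustrative, and
# ``DeepQAgent`` stands for whichever concrete baseline exposes this method; its constructor
# arguments are hypothetical):
#
#   import grid2op
#   from l2rpn_baselines.utils import TrainingParam
#
#   env = grid2op.make("l2rpn_case14_sandbox")
#   agent = DeepQAgent(env.action_space, name="my_agent")
#   agent.train(env,
#               iterations=10000,
#               save_path="./saved_models",
#               logdir="./logs",
#               training_param=TrainingParam())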