def test(self) -> None:
    """ Tests the agent on a full episode, without noise """
    start_test_time = time()
    env = make_env(self._config)
    obs, done = env.reset(), False
    rews = []
    nb_step = 0
    while not done:
        act = self._agent(obs=obs)
        obs, rew, done, _ = env.step(act)
        rews.append(rew)
        nb_step += 1
    rew_mean = np.mean(rews)

    # logging
    self._logger.add_scalar(label="test/reward",
                            value=sum(rews),
                            step=self._global_update_step.val())
    self._logger.add_scalar(label="test/nb_step",
                            value=nb_step,
                            step=self._global_update_step.val())
    self._logger.add_scalar(label="test/reward_mean",
                            value=float(rew_mean),
                            step=self._global_update_step.val())
    self._logger.add_scalar(label="test/reward_var",
                            value=float(np.var(rews)),
                            step=self._global_update_step.val())
    if self._best_test_reward < int(rew_mean):
        # Record the new best mean reward, otherwise every subsequent
        # test would overwrite the checkpoint
        self._best_test_reward = int(rew_mean)
        self._agent.save(episode=self._global_episode.val(),
                         update_step=self._global_update_step.val(),
                         test_reward=int(sum(rews)))
    self._logger.add_scalar(label="test/test_speed",
                            value=time() - start_test_time,
                            step=self._global_update_step.val())
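`test` only touches the shared `_global_update_step` and `_global_episode` through `val()` and (elsewhere) `inc()`; the counter class itself is not shown. A minimal sketch of such a thread-safe counter, assuming nothing beyond that interface:

import threading

class Counter:
    """ Lock-protected integer counter shared between Player and Trainer
    threads (hypothetical sketch; only val()/inc() are taken from the
    code above) """

    def __init__(self, start: int = 0) -> None:
        self._value = start
        self._lock = threading.Lock()

    def val(self) -> int:
        """ Returns the current value """
        with self._lock:
            return self._value

    def inc(self) -> None:
        """ Atomically increments the counter """
        with self._lock:
            self._value += 1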
def __init__(self, config: Dict, agent: Agent, id_worker: int,
             worker_seed: int, global_episode: Counter,
             transitions_queue: Queue, global_update_step: Counter,
             epsilon: Union[ExponentialEpsilon, SinusoidalEpsilone],
             logger: Logger) -> None:
    """ Thread responsible for collecting transitions by interacting with
    the environment using the latest version of the agent

    :param config: Configuration dictionary of the experiment
    :param agent: Agent used to choose the actions
    :param id_worker: Id of the worker
    :param worker_seed: Random seed to use
    :param global_episode: Counter of finished episodes (shared between threads)
    :param transitions_queue: Queue through which the Player threads send their transitions to the Trainer thread
    :param global_update_step: Counter of performed updates (shared between threads)
    :param epsilon: Epsilon process used for the noise added to the agent's actions
    :param logger: Logger used during the experiment
    """
    super().__init__()
    self._config = config
    self._agent = agent
    self._id = id_worker
    self._global_episode = global_episode
    self._transitions_queue = transitions_queue
    self._global_update_step = global_update_step
    self._logger = logger
    self._seed = worker_seed
    plant_seed(self._seed)
    self._env = make_env(self._config)
    self._nb_action = self._env.action_space.shape[0] \
        if self._env.action_space.shape else 1
    self._random_noise_process = RandomNoise(config, self._nb_action, epsilon)
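The epsilon process and `RandomNoise` are only used through `sample()` (in the Player) and `step()` (in the Trainer). A plausible sketch, assuming the epsilon exposes its current scale via a `val()` method and the noise is Gaussian scaled by that value; the real `ExponentialEpsilon`, `SinusoidalEpsilone` and `RandomNoise` may differ:

import numpy as np

class ExponentialEpsilon:
    """ Exponentially decaying noise scale (hypothetical sketch) """

    def __init__(self, start: float, end: float, decay: float) -> None:
        self._start, self._end, self._decay = start, end, decay
        self._n = 0

    def step(self) -> None:
        """ Advances the schedule by one update step """
        self._n += 1

    def val(self) -> float:
        """ Current noise scale """
        return self._end + (self._start - self._end) * np.exp(-self._n / self._decay)

class RandomNoise:
    """ Gaussian action noise scaled by the epsilon process (sketch) """

    def __init__(self, config, nb_action: int, epsilon) -> None:
        self._nb_action = nb_action
        self._epsilon = epsilon

    def sample(self) -> np.ndarray:
        return self._epsilon.val() * np.random.randn(self._nb_action)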
def run(self) -> None:
    """ Runs the thread """
    episode = 1
    step = 0
    self._start_player_time = time()
    while True:
        start_loop_time = time()
        if self._should_stop():
            break
        # Avoids memory leaks in some environments (rip opensim) by
        # recreating the environment every `invalidate_env_time` episodes
        if self._config["invalidate_env_time"] \
                and episode % self._config["invalidate_env_time"] == 0:
            self._env = make_env(self._config)
        cur_episode = self._global_episode.val()
        self._global_episode.inc()
        episode_time = 0
        obs, done = self._env.reset(), False
        rew_sum = 0
        n_steps = 0
        while not done:
            start_step_time = time()
            noise = self._random_noise_process.sample()
            act = self._agent(obs=obs, noise=noise)
            next_obs, rew, done, _ = self._env.step(act)
            rew_sum += rew
            n_steps += 1
            transition = Transition(observation=obs,
                                    action=act,
                                    new_observation=next_obs,
                                    reward=rew,
                                    done=done)
            obs = next_obs
            episode_time += time() - start_step_time
            # Puts the transition in the queue, retrying while it is full
            while True:
                if self._should_stop():
                    break
                try:
                    self._transitions_queue.put_nowait(transition)
                    break
                except Full:
                    sleep(0.01)

        # logging
        if step % self._config["players_config"]["log_freq"] == 0:
            self._logger.add_scalar(
                label=f"players/reward_per_episode_{self._id}",
                value=rew_sum,
                step=cur_episode)
            self._logger.add_scalar(
                label=f"players/noise_abs_mean_{self._id}",
                value=np.abs(noise).mean(),
                step=cur_episode)
            self._logger.add_scalar(
                label=f"players/step_per_episode_{self._id}",
                value=n_steps,
                step=cur_episode)
            self._logger.add_scalar(label=f"players/step_{self._id}",
                                    value=step,
                                    step=cur_episode)
            self._logger.add_scalar(label=f"players/epoch_speed_{self._id}",
                                    value=episode_time,
                                    step=cur_episode)
            self._logger.add_scalar(label=f"players/idle_{self._id}",
                                    value=time() - start_loop_time - episode_time,
                                    step=cur_episode)
        episode += 1  # local counter driving the periodic env invalidation above
        step += 1
    print(f"player {self._id} end")
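`Transition` is always constructed by keyword with the same five fields. A `NamedTuple` with those fields is enough to make the sketches in this section self-contained; the real type may well be a dataclass:

from typing import Any, NamedTuple

class Transition(NamedTuple):
    """ One environment step, as produced by the Player threads (sketch) """
    observation: Any
    action: Any
    new_observation: Any
    reward: float
    done: bool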
def run(self) -> None:
    """ Runs the thread """
    self._start_training_time = time()
    # Fills the replay buffer with the random policy until it contains
    # min_replay_size transitions
    p_bar = tqdm(total=self._config["agent_config"]["min_replay_size"])
    env = make_env(self._config)
    while len(self._replay_buffer) < self._config["agent_config"]["min_replay_size"]:
        obs, done = env.reset(), False
        while not done:
            act = env.action_space.sample()
            if len(act.shape) < 1:
                # wraps scalar actions so env.step always receives a sequence
                act = [act]
            next_obs, rew, done, _ = env.step(act)
            transition = Transition(observation=obs,
                                    action=act,
                                    new_observation=next_obs,
                                    reward=rew,
                                    done=done)
            obs = next_obs
            self._replay_buffer.add(transition)
            p_bar.update(1)
    print("buffer initialization done")

    while True:
        start_get_replays_time = time()
        # Gets a transition from the queue and puts it in the replay buffer
        while True:
            if self._should_stop():
                break
            try:
                transition = self._episode_queue.get_nowait()
                self._replay_buffer.add(transition)
                break
            except Empty:
                sleep(0.01)
        if self._should_stop():
            break
        end_get_replays_time = time()

        start_update_time = time()
        indexes_b, transition_b, weights_b = self._replay_buffer.sample(
            self._config["agent_config"]["batch_size"])
        if self._global_update_step.val() \
                % self._config["trainer_config"]["log_freq"] == 0:
            update_step = self._global_update_step.val()
        else:
            update_step = None
        td_error = self._agent.update(transition_b, weights_b, update_step)
        # Updates the replay buffer priorities with the critic's error
        new_priorities = np.abs(td_error) + 1e-16  # priorities must always be strictly positive
        self._replay_buffer.update(indexes_b, new_priorities)

        # logging (`is not None`: step 0 is a valid logging step)
        if update_step is not None:
            self._logger.add_scalar(label="multithreading/queue_size",
                                    value=self._episode_queue.qsize(),
                                    step=update_step)
            self._logger.add_scalar(label="trainer/buffer_size",
                                    value=len(self._replay_buffer),
                                    step=update_step)
            self._logger.add_scalar(label="trainer/priority_mean",
                                    value=weights_b.mean(),
                                    step=update_step)
            self._logger.add_scalar(label="trainer/priority_var",
                                    value=weights_b.var(),
                                    step=update_step)
            self._logger.add_scalar(label="trainer/td_error_mean",
                                    value=np.abs(td_error).mean(),
                                    step=update_step)
            self._logger.add_scalar(label="trainer/episode",
                                    value=self._global_episode.val(),
                                    step=update_step)
            self._logger.add_scalar(label="trainer/update_time",
                                    value=time() - start_update_time,
                                    step=update_step)
            self._logger.add_scalar(label="trainer/idle_time",
                                    value=end_get_replays_time - start_get_replays_time,
                                    step=update_step)
        self._global_update_step.inc()
        self._epsilone.step()
        if self._global_update_step.val() \
                % self._config["trainer_config"]["test_freq"] == 0:
            self.test()
    self.test()
    print("trainer end")
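The trainer relies on exactly three methods of the replay buffer: `add`, `sample` (returning indexes, a batch and importance-sampling weights) and `update`. A simplified proportional prioritized buffer honoring that interface, without the sum-tree a production implementation would likely use:

import numpy as np

class PrioritizedReplayBuffer:
    """ Simplified proportional PER (sketch): O(n) sampling, ring-buffer storage """

    def __init__(self, capacity: int, alpha: float = 0.6, beta: float = 0.4) -> None:
        self._capacity = capacity
        self._alpha, self._beta = alpha, beta
        self._data, self._priorities = [], []
        self._next = 0  # slot overwritten once the buffer is full

    def __len__(self) -> int:
        return len(self._data)

    def add(self, transition) -> None:
        # new transitions get the current max priority so they are sampled at least once
        prio = max(self._priorities, default=1.0)
        if len(self._data) < self._capacity:
            self._data.append(transition)
            self._priorities.append(prio)
        else:
            self._data[self._next] = transition
            self._priorities[self._next] = prio
            self._next = (self._next + 1) % self._capacity

    def sample(self, batch_size: int):
        probs = np.asarray(self._priorities) ** self._alpha
        probs /= probs.sum()
        indexes = np.random.choice(len(self._data), batch_size, p=probs)
        # importance-sampling weights correct the bias of non-uniform sampling
        weights = (len(self._data) * probs[indexes]) ** -self._beta
        weights /= weights.max()
        batch = [self._data[i] for i in indexes]
        return indexes, batch, weights

    def update(self, indexes, priorities) -> None:
        for i, p in zip(indexes, priorities):
            self._priorities[i] = float(p)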
def __init__(self, config: Dict, logger: Logger, force_cpu: bool = False) -> None:
    """ The Agent class groups an Actor and a Critic neural network and
    corresponds to an RL agent able to make decisions and to update the
    weights of its networks

    :param config: Configuration dictionary of the experiment
    :param logger: Logger to use during the training of the agent
    :param force_cpu: forces the use of the CPU even if a GPU is detected
    """
    self._lock = threading.Lock()
    self._config = config
    source_env = make_env(self._config)
    if force_cpu:
        self._device = torch.device("cpu")
    else:
        self._device = torch.device("cuda") \
            if torch.cuda.is_available() else torch.device("cpu")
    self._n_action = source_env.action_space.shape[0] \
        if source_env.action_space.shape else 1
    self._n_observation = source_env.observation_space.shape[0] \
        if source_env.observation_space.shape else 1
    self._actor = Actor(self._config, self._n_observation,
                        self._n_action).to(self._device)
    self._critic = Critic(self._config, self._n_observation,
                          self._n_action).to(self._device)
    if self._config["agent_config"]["load_from_ckpt"] is not None:
        ckpt = torch.load(self._config["agent_config"]["load_from_ckpt"],
                          map_location=self._device)
        self._actor.load_state_dict(ckpt["actor"])
        self._critic.load_state_dict(ckpt["critic"])
    # Target networks start as exact copies of the online networks
    self._target_actor = Actor(self._config, self._n_observation,
                               self._n_action).to(self._device)
    self._target_critic = Critic(self._config, self._n_observation,
                                 self._n_action).to(self._device)
    self._hard_update(self._target_actor, self._actor)
    self._hard_update(self._target_critic, self._critic)
    self._logger = logger
    self._loss_fn = WeightedMSELoss()
    self._actor_optim = torch.optim.Adam(
        self._actor.parameters(),
        lr=self._config["agent_config"]["lr_actor"],
        weight_decay=self._config["agent_config"]["weight_decay_actor"])
    self._critic_optim = torch.optim.Adam(
        self._critic.parameters(),
        lr=self._config["agent_config"]["lr_critic"],
        weight_decay=self._config["agent_config"]["weight_decay_critic"])
    if self._config["agent_config"]["load_critic_from"] is not None:
        # map_location keeps a GPU-trained checkpoint loadable on CPU
        self._critic.load_state_dict(
            torch.load(self._config["agent_config"]["load_critic_from"],
                       map_location=self._device))
        print(f"load critic from {self._config['agent_config']['load_critic_from']}")
    elif self._config["agent_config"]["warmstart_critic"]:
        self._pretrain_critic()
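`WeightedMSELoss` pairs with the importance-sampling weights produced by the prioritized buffer. A plausible definition, assuming each sample's squared error is scaled by its weight before reduction (the repository's own loss may use a different reduction):

import torch

class WeightedMSELoss(torch.nn.Module):
    """ MSE with a per-sample importance weight (sketch) """

    def forward(self, pred: torch.Tensor, target: torch.Tensor,
                weights: torch.Tensor) -> torch.Tensor:
        return (weights * (pred - target) ** 2).mean()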
def _pretrain_critic(self) -> None:
    """ Initializes the critic by training it on the random policy """
    # generation of the random transitions
    env = make_env(self._config)
    p_bar = tqdm(total=self._config["agent_config"]["warmstart_size"])
    samples = []
    while len(samples) < self._config["agent_config"]["warmstart_size"]:
        obs, done = env.reset(), False
        while not done:
            act = env.action_space.sample()
            next_obs, rew, done, _ = env.step(act)
            samples.append(
                Transition(observation=obs,
                           action=act,
                           new_observation=next_obs,
                           reward=rew,
                           done=done))
            obs = next_obs
            p_bar.update(1)
    # dtype=object keeps the Transition tuples intact instead of letting
    # numpy try to broadcast their fields into a 2D array
    samples = np.array(samples, dtype=object)
    patience = 0
    batch_size = self._config["agent_config"]["batch_size"]
    best_td_error_mean = float('inf')
    epoch = 0
    # TODO: move the warmstart patience setting up into the experiment config
    while patience < 5:
        np.random.shuffle(samples)
        train_td_error = []
        # train one epoch
        for i in range(len(samples) // batch_size):
            batch = samples[i * batch_size:(i + 1) * batch_size]
            observations_t, actions_t, rewards_t, next_observations_t, dones_t = unpack_batch(
                batch, self._device)
            # uniform weights: no prioritized sampling during the warmstart
            weight_t = torch.ones(len(observations_t), device=self._device)
            td_error = self._update_critic(observations_t, actions_t,
                                           rewards_t, next_observations_t,
                                           dones_t, weight_t, None)
            # Target soft update
            self._soft_update(self._target_critic, self._critic)
            train_td_error.append(np.abs(td_error).mean())
        train_td_error_mean = np.mean(train_td_error)
        self._logger.add_scalar(label="pretrain/td_error_mean",
                                value=float(train_td_error_mean),
                                step=epoch)
        epoch += 1
        if train_td_error_mean < best_td_error_mean:
            best_td_error_mean = train_td_error_mean
            patience = 0
        else:
            patience += 1
    save_path = os.path.join(self._config["log_dir"], "critic_warmstart.pkl")
    torch.save(self._critic.state_dict(), save_path)
    print(f"saving critic to {save_path}")
    print("warmstart done")
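`_hard_update` and `_soft_update` are called but never shown. They are the standard DDPG target-network updates; a sketch matching the `(target, source)` call sites above, assuming a `tau` coefficient stored under a hypothetical `agent_config["tau"]` key:

import torch

def _hard_update(self, target: torch.nn.Module, source: torch.nn.Module) -> None:
    """ Copies the source weights into the target network """
    target.load_state_dict(source.state_dict())

def _soft_update(self, target: torch.nn.Module, source: torch.nn.Module) -> None:
    """ Polyak averaging: target <- tau * source + (1 - tau) * target """
    tau = self._config["agent_config"]["tau"]  # assumed config key
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)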
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="Script for testing an agent")
    parser.add_argument("-l", "--logdir", type=str)
    parser.add_argument('--last', dest='last_ckpt', action='store_true')
    parser.set_defaults(last_ckpt=False)
    args = parser.parse_args()
    with open(os.path.join(args.logdir, "config.json")) as f:
        config = json.load(f)
    n_skip = config["skip_frame"]
    config["skip_frame"] = 1
    with make_env(config) as env:
        while True:
            best = float('-inf')
            best_ckpt_file = ""
            for ckpt in os.listdir(os.path.join(args.logdir, "checkpoints")):
                if ckpt.startswith('.'):
                    continue
                if args.last_ckpt:
                    # most recent checkpoint: ranked by episode number
                    n = int(ckpt.split('e')[1].split('_')[0])
                else:
                    # best checkpoint: ranked by test reward
                    n = int(ckpt.split('r')[-1].split('.')[0])
                if n > best:
                    best = n
                    best_ckpt_file = os.path.join(args.logdir, "checkpoints", ckpt)
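The `split` calls imply a checkpoint filename scheme that is never spelled out. Given `Agent.save(episode=..., update_step=..., test_reward=...)` above, names like `e{episode}_u{update_step}_r{test_reward}.pkl` would parse correctly; this is an inference from the parsing code, not a documented format:

# Assumed checkpoint filename format: e{episode}_u{update_step}_r{test_reward}.pkl
ckpt = "e120_u5000_r321.pkl"          # hypothetical example
episode = int(ckpt.split('e')[1].split('_')[0])   # -> 120
reward = int(ckpt.split('r')[-1].split('.')[0])   # -> 321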