def learn(self,
          total_timesteps: int,
          callback: MaybeCallback = None,
          log_interval: int = 4,
          eval_env: Optional[GymEnv] = None,
          eval_freq: int = -1,
          n_eval_episodes: int = 5,
          tb_log_name: str = "SAC",
          eval_log_path: Optional[str] = None,
          reset_num_timesteps: bool = True) -> OffPolicyRLModel:

    callback = self._setup_learn(eval_env, callback, eval_freq,
                                 n_eval_episodes, eval_log_path, reset_num_timesteps)
    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(self.env,
                                        n_episodes=self.n_episodes_rollout,
                                        n_steps=self.train_freq,
                                        action_noise=self.action_noise,
                                        callback=callback,
                                        learning_starts=self.learning_starts,
                                        replay_buffer=self.replay_buffer,
                                        log_interval=log_interval)
        if rollout.continue_training is False:
            break

        self._update_current_progress(self.num_timesteps, total_timesteps)

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified, do as many gradient steps
            # as steps performed during the rollout
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train(gradient_steps, batch_size=self.batch_size)

    callback.on_training_end()

    return self
def _init_callback(
    self,
    callback: MaybeCallback,
    eval_env: Optional[VecEnv] = None,
    eval_freq: int = 10000,
    n_eval_episodes: int = 5,
    log_path: Optional[str] = None,
) -> BaseCallback:
    """
    :param callback: Callback(s) called at every step with state of the algorithm.
    :param eval_env: Environment to use for evaluation; if None, do not evaluate.
    :param eval_freq: How many steps between evaluations.
    :param n_eval_episodes: Number of episodes to rollout during evaluation.
    :param log_path: Path to a folder where the evaluations will be saved.
    :return: A hybrid callback calling `callback` and performing evaluation.
    """
    # Convert a list of callbacks into a callback
    if isinstance(callback, list):
        callback = CallbackList(callback)

    # Convert functional callback to object
    if not isinstance(callback, BaseCallback):
        callback = ConvertCallback(callback)

    # Create eval callback in charge of the evaluation
    if eval_env is not None:
        eval_callback = EvalCallback(
            eval_env,
            best_model_save_path=log_path,
            log_path=log_path,
            eval_freq=eval_freq,
            n_eval_episodes=n_eval_episodes,
        )
        callback = CallbackList([callback, eval_callback])

    callback.init_callback(self)
    return callback
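# For illustration: a minimal usage sketch of `_init_callback`, assuming the
# standard SB3 stack around this method (PPO, DummyVecEnv, and the callback
# classes from stable_baselines3.common.callbacks); not part of the original file.
import gym
from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv

model = PPO("MlpPolicy", "CartPole-v1")
eval_env = DummyVecEnv([lambda: gym.make("CartPole-v1")])

# A bare function is wrapped in ConvertCallback; together with the EvalCallback
# built from `eval_env`, it is merged into a single CallbackList.
callback = model._init_callback(
    lambda locals_, globals_: True,  # returning True keeps training going
    eval_env=eval_env,
    eval_freq=5000,
)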
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # debug (`mode` is assumed to be a module-level debug flag) ===============
    if mode == 'debug':
        print(['OPA.learn started, ready to loop (OPA.collect_rollouts + OPA.train)'])

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)
        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # debug ===============================================================
        if mode == 'debug':
            print(['OPA.learn', 'num_timesteps:', self.num_timesteps,
                   'total_timesteps:', total_timesteps])

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # debug ===============================================================
        if mode == 'debug':
            print(['OPA.learn finished, ready to OPA.train'])

        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)
        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            self.fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_reward_mean",
                              safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
            if len(self.specific_reward_info_buffer) > 0 and len(self.specific_reward_info_buffer[0]) > 0:
                # Log each component of the mimic reward
                # (`mimic_ee_reward` is intentionally left out of this list)
                for key in ('mimic_qpos_reward', 'mimic_qvel_reward',
                            'mimic_body_orientation_reward', 'mimic_body_reward',
                            'mimic_body_vel_reward', 'mimic_contact_reward'):
                    logger.record(f'rollout/{key}',
                                  safe_mean([info[key] for info in self.specific_reward_info_buffer]))
            logger.record("time/fps", self.fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 100,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    use_trajectory_buffer: bool = False,
) -> "OffPolicyAlgorithm":

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # Select the buffer once; the choice does not change during training
    buffer = self.trajectory_buffer if use_trajectory_buffer else self.replay_buffer

    while self.num_timesteps < total_timesteps:
        # ms = get_ms()  # timing probe (see the `get_ms` sketch below)
        rollout = self.collect_rollouts(
            self.env,
            train_freq=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            buffer=buffer,
            log_interval=log_interval,
        )
        # print("collect_time: ", get_ms() - ms)

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # ms = get_ms()
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
            # print('train_time: ', get_ms() - ms)
            # exit()

    callback.on_training_end()

    return self
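# `get_ms` is used above as a timing probe but is not defined in this snippet.
# A minimal sketch of such a helper (an assumption, not the original):
import time

def get_ms() -> float:
    """Current wall-clock time in milliseconds."""
    return time.time() * 1000.0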
def learn(self,
          total_timesteps: int,
          callback: MaybeCallback = None,
          log_interval: int = 1,
          eval_env: Optional[GymEnv] = None,
          eval_freq: int = -1,
          n_eval_episodes: int = 5,
          tb_log_name: str = "PPO",
          eval_log_path: Optional[str] = None,
          reset_num_timesteps: bool = True) -> 'PPO':

    iteration = 0
    callback = self._setup_learn(eval_env, callback, eval_freq,
                                 n_eval_episodes, eval_log_path, reset_num_timesteps)

    # if self.tensorboard_log is not None and SummaryWriter is not None:
    #     self.tb_writer = SummaryWriter(log_dir=os.path.join(self.tensorboard_log, tb_log_name))

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)
        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress(self.num_timesteps, total_timesteps)

        # Display training infos
        if self.verbose >= 1 and log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.logkv("iterations", iteration)
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.logkv('ep_rew_mean', self.safe_mean([ep_info['r'] for ep_info in self.ep_info_buffer]))
                logger.logkv('ep_len_mean', self.safe_mean([ep_info['l'] for ep_info in self.ep_info_buffer]))
            logger.logkv("fps", fps)
            logger.logkv('time_elapsed', int(time.time() - self.start_time))
            logger.logkv("total timesteps", self.num_timesteps)
            logger.dumpkvs()

        self.train(self.n_epochs, batch_size=self.batch_size)

        # For tensorboard integration
        # if self.tb_writer is not None:
        #     self.tb_writer.add_scalar('Eval/reward', mean_reward, self.num_timesteps)

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "HER",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> BaseAlgorithm:

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    # Keep the wrapped model in sync with the HER wrapper's state
    self.model.start_time = self.start_time
    self.model.ep_info_buffer = self.ep_info_buffer
    self.model.ep_success_buffer = self.ep_success_buffer
    self.model.num_timesteps = self.num_timesteps
    self.model._episode_num = self._episode_num
    self.model._last_obs = self._last_obs
    self.model._total_timesteps = self._total_timesteps

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            n_episodes=self.n_episodes_rollout,
            n_steps=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts and self.replay_buffer.size() > 0:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OffPolicyAlgorithm":

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name,
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            train_freq=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            gradient_steps = self.gradient_steps if self.gradient_steps >= 0 else rollout.episode_timesteps
            # Special case when the user passes `gradient_steps=0`
            if gradient_steps > 0:
                self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)

    callback.on_training_end()

    return self
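# For illustration: in this variant, `gradient_steps=0` collects experience
# without ever training (the `if gradient_steps > 0` guard), while a negative
# value does one gradient step per environment step collected. A hedged usage
# sketch, assuming the standard SB3 SAC constructor around this method:
from stable_baselines3 import SAC

model = SAC("MlpPolicy", "Pendulum-v1", gradient_steps=0)  # pure data collection
model.learn(total_timesteps=10_000)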
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OffPolicyAlgorithm":

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # AF (could put this in callback.on_training_start)
    self.episode_forecast = []
    self.episode_forecasts = []
    self.plan, self.forecasts = self._replan(self._last_obs[0], self.empty_plan(),
                                             self.zero_forecasts)  # VecEnv resets automatically
    # AF END

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            train_freq=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OffPolicyAlgorithm":

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    last_tested = 0

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            train_freq=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        # Expose the latest training return to the wrapped environments
        for e in self.env.envs:
            e.env.train_return = rollout.episode_reward

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            last_tested += 1
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train(batch_size=self.batch_size, gradient_steps=gradient_steps)
            # Run a short evaluation once `last_tested` exceeds 5 training phases
            if last_tested > 5:
                last_tested = 0
                test_return = self.test(num_episodes=3)
                logger.record("rollout/test_rew_mean", test_return)

    callback.on_training_end()

    return self
def learn(self,
          total_timesteps: int,
          callback: MaybeCallback = None,
          log_interval: int = 4,
          eval_env: Optional[GymEnv] = None,
          eval_freq: int = -1,
          n_eval_episodes: int = 5,
          tb_log_name: str = "AWAC",
          eval_log_path: Optional[str] = None,
          reset_num_timesteps: bool = True) -> OffPolicyAlgorithm:

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # Behavior-cloning pretraining, then copy the BC data into the replay buffer
    self.pretrain_bc(int(1e3), batch_size=self.batch_size)
    observations = self.bc_buffer.observations
    actions = self.bc_buffer.actions
    next_observations = self.bc_buffer.next_observations
    rewards = self.bc_buffer.rewards
    dones = self.bc_buffer.dones
    # The zip order matches `ReplayBuffer.add(obs, next_obs, action, reward, done)`
    for data in zip(observations, next_observations, actions, rewards, dones):
        self.replay_buffer.add(*data)
    self.pretrain_rl(int(1e4), batch_size=self.batch_size)

    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            n_episodes=self.n_episodes_rollout,
            n_steps=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train(gradient_steps, batch_size=self.batch_size)

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)
        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
                # Log any extra episode-info keys besides the standard
                # reward ("r"), length ("l") and time ("t") entries
                for k in self.ep_info_buffer[0].keys():
                    if k not in ("l", "r", "t"):
                        logger.record(f"progress/{k}",
                                      safe_mean([ep_info[k] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # Save parameters every 10 log steps
        if iteration % (log_interval * 10) == 0:
            self.save('./interim_trained_models/')

        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 4,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "run",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OffPolicyAlgorithm":

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # Stage 1: train the VAE
    print("Train VAE...")
    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            train_freq=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # If no `gradient_steps` is specified,
            # do as many gradient steps as steps performed during the rollout
            print("T VAE")
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train_vae(batch_size=self.batch_size, gradient_steps=gradient_steps)

    # Stage 2: train the MDN-RNN on a fresh buffer
    print("Train MDNRNN...")
    self.replay_buffer = ReplayBufferAD(
        self.buffer_size,
        self.observation_space,
        self.action_space,
        self.device,
        optimize_memory_usage=self.optimize_memory_usage,
    )
    total_timesteps = 30
    while self.num_timesteps < total_timesteps:
        rollout = self.collect_rollouts(
            self.env,
            train_freq=self.train_freq,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            print("T MDNRNN")
            gradient_steps = self.gradient_steps if self.gradient_steps > 0 else rollout.episode_timesteps
            self.train_mdnrnn(batch_size=self.batch_size, gradient_steps=gradient_steps)

    # Stage 3: train the controller with CMA-ES, evaluated by worker processes
    print("Train Controller...")
    p_queue = Queue()
    r_queue = Queue()
    e_queue = Queue()

    num_workers = 16
    for p_index in range(num_workers):
        Process(target=self.slave_routine, args=(p_queue, r_queue, e_queue, p_index)).start()

    cur_best = None
    parameters = self.controller.parameters()
    es = cma.CMAEvolutionStrategy(flatten_parameters(parameters), 0.1, {'popsize': 4})

    epoch = 0
    log_step = 3
    while not es.stop():
        if cur_best is not None and -cur_best > 950:
            print("Already better than target, breaking...")
            break

        r_list = [0] * 4  # result list (one slot per candidate solution)
        solutions = es.ask()

        # Push parameters to the queue: 4 rollouts per candidate
        i = 0
        for s_id, s in enumerate(solutions):
            for _ in range(4):
                i += 1
                p_queue.put((s_id, s))

        # Retrieve results (popsize 4 * 4 rollouts = 16)
        for _ in range(16):
            while r_queue.empty():
                sleep(.1)
            r_s_id, r = r_queue.get()
            r_list[r_s_id] += r / 4

        es.tell(solutions, r_list)
        es.disp()

        # Evaluation and saving
        if epoch % log_step == log_step - 1:
            best_params, best, std_best = self.evaluate(p_queue, r_queue, solutions, r_list)
            print("Current evaluation: {}".format(best))
            if not cur_best or cur_best > best:
                cur_best = best
                print("Saving new best with value {}+-{}...".format(-cur_best, std_best))
                load_parameters(best_params, self.controller)
            if -best > 950:
                print("Terminating controller training with value {}...".format(best))
                break

        epoch += 1

    es.result_pretty()
    e_queue.put('EOP')

    callback.on_training_end()

    return self
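# The CMA-ES loop above relies on `flatten_parameters` / `load_parameters`
# helpers that are imported elsewhere. A minimal sketch of what such helpers
# typically do in world-models-style code (an assumption, not necessarily
# this repo's exact versions):
import torch

def flatten_parameters(params):
    """Concatenate an iterable of parameter tensors into one flat numpy vector."""
    return torch.cat([p.detach().view(-1) for p in params], dim=0).cpu().numpy()

def load_parameters(flat, module):
    """Copy a flat vector back into a module's parameters, slice by slice."""
    flat = torch.as_tensor(flat, dtype=torch.float32)
    idx = 0
    for p in module.parameters():
        n = p.numel()
        p.data.copy_(flat[idx:idx + n].view_as(p))
        idx += n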
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "PPO",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        # Replay buffer size:
        # No need to use a larger buffer, because that doesn't solve the
        # catastrophic forgetting problem. For this experiment, counting the
        # best score is enough.
        # Determine the buffer size using safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]);
        # it should be stable once walking has been learned.
        # Start with a small buffer, then switch once episodes are long enough:
        # ep_len_mean = safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer])
        # if ep_len_mean >= 1000:
        #     self.use_small_buffer = False

        # `args` and `output` are assumed to be module-level (CLI args and a logging helper)
        if not args.single and self.use_small_buffer:
            output(f"Collect rollouts for {self.n_steps // self.env.num_envs} steps.", 2)
            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer_small,
                                                      n_rollout_steps=self.n_steps // self.env.num_envs)
        else:
            output(f"Collect rollouts for {self.n_steps} steps.", 2)
            continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                      n_rollout_steps=self.n_steps)

        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        continue_training = True
        for partner_idx in range(self.policy.num_partners):
            try:
                self.env.envs[0].switch_to_env(partner_idx)
            except Exception:
                # The environment may not support partner switching
                pass
            continue_training = self.collect_rollouts(self.env, callback,
                                                      self.rollout_buffer[partner_idx],
                                                      n_rollout_steps=self.n_steps,
                                                      partner_idx=partner_idx)
            if continue_training is False:
                break

        # Propagate an early stop out of the partner loop as well
        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    print('setup training')
    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())
    print(f'start training, total timesteps is {total_timesteps}')

    while self.num_timesteps < total_timesteps:
        print(f'num timesteps: {self.num_timesteps}/{total_timesteps}')
        print(f'collect rollouts, rollout steps = {self.n_steps}')
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)

        if continue_training is False:
            print('stop training (only happens if callback on_step returns false)')
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        print('display training infos')
        # print(f'len(self.ep_info_buffer)={len(self.ep_info_buffer)}, len(self.ep_info_buffer[0])={len(self.ep_info_buffer[0])}')
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        print('train')
        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        # Collect n_steps (e.g. 512) steps per env; total timesteps = n_steps * num_envs
        # (e.g. 512 * 8 = 4096), so each rollout has a total of 4096 timesteps
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)

        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos (reward/length/fps logging is currently disabled)
        if log_interval is not None and iteration % log_interval == 0:
            # fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            # logger.record("rollout/ep_rew_mean", safe_mean([goal_diff for goal_diff in self.ep_info_buffer]))
            # if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
            #     logger.record("rollout/ep_rew_mean", safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
            #     logger.record("rollout/ep_len_mean", safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            # logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # Save model every 50 iterations
        if iteration > 0 and iteration % 50 == 0:
            # Save PyTorch model locally
            if self.model_checkpoints_path is not None:
                th.save(self.policy.state_dict(),
                        self.model_checkpoints_path + f"/model_v{iteration}")
            # Save PyTorch model to the wandb local dir and upload it to the wandb cloud dashboard
            if self.log_handler is not None:
                self.log_handler.save(self.model_checkpoints_path + f"/model_v{iteration}",
                                      base_path=self.model_checkpoints_path)

        # Save the best model when a new high score is achieved
        if self.save_best_model:
            print(f"Model achieved best score: {self.best_score} at iteration {iteration}")
            # Save PyTorch model locally
            if self.model_checkpoints_path is not None:
                th.save(self.policy.state_dict(),
                        self.model_checkpoints_path + "/model_bestscore")
            # Save PyTorch model to the wandb local dir and upload it to the wandb cloud dashboard
            if self.log_handler is not None:
                self.log_handler.save(self.model_checkpoints_path + "/model_bestscore",
                                      base_path=self.model_checkpoints_path)
            self.save_best_model = False

        # PPO training
        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # Local imports
    from stable_baselines3.common.utils import safe_mean
    import time

    while self.num_timesteps < total_timesteps:
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps)

        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int((self.num_timesteps - self._num_timesteps_at_start) / (time.time() - self.start_time))
            self.logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                self.logger.record("rollout/ep_rew_mean",
                                   safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                self.logger.record("rollout/ep_len_mean",
                                   safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            self.logger.record("time/fps", fps)
            self.logger.record("time/time_elapsed", int(time.time() - self.start_time),
                               exclude="tensorboard")
            self.logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            # [RLA] set timesteps (`time_step_holder` is assumed to be a module-level RLA object)
            time_step_holder.set_time(self.num_timesteps)
            self.logger.dump()

        self.train()

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        # Collect a single environment step per iteration
        rollout = self.collect_rollouts(
            self.env,
            n_episodes=-1,
            n_steps=1,
            action_noise=self.action_noise,
            callback=callback,
            learning_starts=self.learning_starts,
            replay_buffer=self.replay_buffer,
            log_interval=log_interval,
        )

        if rollout.continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        if self.num_timesteps > 0 and self.num_timesteps > self.learning_starts:
            # One gradient step per collected environment step
            self.train(gradient_steps=1, batch_size=self.batch_size)

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    parameter_noise: bool = False,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    # Initial value of the noise std
    current_sigma = 1.0

    while self.num_timesteps < total_timesteps:
        # Use the adapted noise std for the rollouts
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps,
                                                  parameter_noise=parameter_noise,
                                                  sigma=current_sigma)

        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        self.train()

        if parameter_noise:
            # Adapt sigma: shrink it when noisy actions drift too far from
            # unnoisy ones, grow it otherwise
            states = th.tensor(self.rollout_buffer.observations)
            actions_unnoisy, values_unnoisy, log_prob_unnoisy = self.policy(states, parameter_noise=False)
            actions_noisy, values_noisy, log_prob_noisy = self.policy(states, parameter_noise=True,
                                                                      sigma=current_sigma)
            distance = th.sum((actions_unnoisy - actions_noisy) ** 2) ** 0.5
            distance_threshold = 1
            sigma_scalefactor = 1.01
            if distance > distance_threshold:
                current_sigma /= sigma_scalefactor
            else:
                current_sigma *= sigma_scalefactor

    callback.on_training_end()

    return self
def learn(
    self,
    total_timesteps: int,
    callback: MaybeCallback = None,
    log_interval: int = 1,
    eval_env: Optional[GymEnv] = None,
    eval_freq: int = -1,
    n_eval_episodes: int = 5,
    tb_log_name: str = "OnPolicyAlgorithm",
    eval_log_path: Optional[str] = None,
    reset_num_timesteps: bool = True,
    param_noise: bool = False,
    sigma: float = 0.1,
) -> "OnPolicyAlgorithm":
    iteration = 0

    total_timesteps, callback = self._setup_learn(
        total_timesteps, eval_env, callback, eval_freq, n_eval_episodes,
        eval_log_path, reset_num_timesteps, tb_log_name
    )

    callback.on_training_start(locals(), globals())

    while self.num_timesteps < total_timesteps:
        # During the rollout we collect batches of states and rewards
        continue_training = self.collect_rollouts(self.env, callback, self.rollout_buffer,
                                                  n_rollout_steps=self.n_steps,
                                                  param_noise=param_noise,
                                                  sigma=sigma)

        if continue_training is False:
            break

        iteration += 1
        self._update_current_progress_remaining(self.num_timesteps, total_timesteps)

        # Display training infos
        if log_interval is not None and iteration % log_interval == 0:
            fps = int(self.num_timesteps / (time.time() - self.start_time))
            logger.record("time/iterations", iteration, exclude="tensorboard")
            if len(self.ep_info_buffer) > 0 and len(self.ep_info_buffer[0]) > 0:
                logger.record("rollout/ep_rew_mean",
                              safe_mean([ep_info["r"] for ep_info in self.ep_info_buffer]))
                logger.record("rollout/ep_len_mean",
                              safe_mean([ep_info["l"] for ep_info in self.ep_info_buffer]))
            logger.record("time/fps", fps)
            logger.record("time/time_elapsed", int(time.time() - self.start_time),
                          exclude="tensorboard")
            logger.record("time/total_timesteps", self.num_timesteps, exclude="tensorboard")
            logger.dump(step=self.num_timesteps)

        # During training, gradient descent is done
        self.train(param_noise, sigma)

        if param_noise:
            sigma = self.update_sigma(sigma)

    callback.on_training_end()

    return self
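# `update_sigma` is called above but not defined in this snippet. A minimal
# sketch of one plausible implementation, assuming the same adaptive rule as
# the `parameter_noise` variant above (shrink sigma when noisy and unnoisy
# actions drift apart, grow it otherwise); the policy kwargs are assumptions:
def update_sigma(self, sigma: float,
                 distance_threshold: float = 1.0,
                 scale_factor: float = 1.01) -> float:
    states = th.tensor(self.rollout_buffer.observations)
    actions_plain, _, _ = self.policy(states, param_noise=False)
    actions_noisy, _, _ = self.policy(states, param_noise=True, sigma=sigma)
    # L2 distance between noisy and unnoisy actions over the last rollout
    distance = th.sum((actions_plain - actions_noisy) ** 2) ** 0.5
    return sigma / scale_factor if distance > distance_threshold else sigma * scale_factor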